#! /usr/bin/perl # # $Id: collateindex.pl,v 1.14 2000/07/19 19:05:03 nwalsh Exp $ use Getopt::Std; $usage = "Usage: $0 file Where are: -p Link to points in the document. The default is to link to the closest containing section. -g Group terms with IndexDiv based on the first letter of the term (or its sortas attribute). (This probably doesn't handle i10n particularly well) -s name Name the IndexDiv that contains symbols. The default is 'Symbols'. Meaningless if -g is not used. -t name Title for the index. -P file Read a preamble from file. The content of file will be inserted before the tag. -i id The ID for the tag. -o file Output to file. Defaults to stdout. -S scope Scope of the index, must be 'all', 'local', or 'global'. If unspecified, 'all' is assumed. -I scope The implied scope, must be 'all', 'local', or 'global'. IndexTerms which do not specify a scope will have the implied scope. If unspecified, 'all' is assumed. -x Make a SetIndex. -f Force the output file to be written, even if it appears to have been edited by hand. -N New index (generates an empty index file). file The file containing index data generated by Jade with the DocBook HTML Stylesheet.\n"; die $usage if ! getopts('Dfgi:NpP:s:o:S:I:t:x'); $linkpoints = $opt_p; $lettergroups = $opt_g; $symbolsname = $opt_s || "Symbols"; $title = $opt_t; $preamble = $opt_P; $outfile = $opt_o || '-'; $indexid = $opt_i; $scope = uc($opt_S) || 'ALL'; $impliedscope = uc($opt_I) || 'ALL'; $setindex = $opt_x; $forceoutput = $opt_f; $newindex = $opt_N; $debug = $opt_D; $indextag = $setindex ? 'setindex' : 'index'; if ($newindex) { safe_open(*OUT, $outfile); if ($indexid) { print OUT "<$indextag id='$indexid'>\n\n"; } else { print OUT "<$indextag>\n\n"; } print OUT "\n"; print OUT "\n"; print OUT "\n"; exit 0; } $dat = shift @ARGV || die $usage; die "$0: cannot find $dat.\n" if ! -f $dat; %legal_scopes = ('ALL' => 1, 'LOCAL' => 1, 'GLOBAL' => 1); if ($scope && !$legal_scopes{$scope}) { die "Invalid scope.\n$usage\n"; } if ($impliedscope && !$legal_scopes{$impliedscope}) { die "Invalid implied scope.\n$usage\n"; } @term = (); %id = (); $termcount = 0; print STDERR "Processing $dat...\n"; # Read the index file, creating an array of objects. Each object # represents and indexterm and has fields for the content of the # indexterm open (F, $dat); while () { chop; if (/^\/indexterm/i) { push (@term, $idx); next; } if (/^indexterm (.*)$/i) { $termcount++; $idx = {}; $idx->{'zone'} = {}; $idx->{'href'} = $1; $idx->{'count'} = $termcount; $idx->{'scope'} = $impliedscope; next; } if (/^indexpoint (.*)$/i) { $idx->{'hrefpoint'} = $1; next; } if (/^title (.*)$/i) { $idx->{'title'} = $1; next; } if (/^primary[\[ ](.*)$/i) { if (/^primary\[(.*?)\] (.*)$/i) { $idx->{'psortas'} = $1; $idx->{'primary'} = $2; } else { $idx->{'psortas'} = $1; $idx->{'primary'} = $1; } next; } if (/^secondary[\[ ](.*)$/i) { if (/^secondary\[(.*?)\] (.*)$/i) { $idx->{'ssortas'} = $1; $idx->{'secondary'} = $2; } else { $idx->{'ssortas'} = $1; $idx->{'secondary'} = $1; } next; } if (/^tertiary[\[ ](.*)$/i) { if (/^tertiary\[(.*?)\] (.*)$/i) { $idx->{'tsortas'} = $1; $idx->{'tertiary'} = $2; } else { $idx->{'tsortas'} = $1; $idx->{'tertiary'} = $1; } next; } if (/^see (.*)$/i) { $idx->{'see'} = $1; next; } if (/^seealso (.*)$/i) { $idx->{'seealso'} = $1; next; } if (/^significance (.*)$/i) { $idx->{'significance'} = $1; next; } if (/^class (.*)$/i) { $idx->{'class'} = $1; next; } if (/^scope (.*)$/i) { $idx->{'scope'} = uc($1); next; } if (/^startref (.*)$/i) { $idx->{'startref'} = $1; next; } if (/^id (.*)$/i) { $idx->{'id'} = $1; $id{$1} = $idx; next; } if (/^zone (.*)$/i) { my($href) = $1; $_ = scalar(); chop; die "Bad zone: $_\n" if !/^title (.*)$/i; $idx->{'zone'}->{$href} = $1; next; } die "Unrecognized: $_\n"; } close (F); print STDERR "$termcount entries loaded...\n"; # Fixup the startrefs... # In DocBook, STARTREF is a #CONREF attribute; support this by copying # all of the fields from the indexterm with the id specified by STARTREF # to the indexterm that has the STARTREF. foreach $idx (@term) { my($ididx, $field); if ($idx->{'startref'}) { $ididx = $id{$idx->{'startref'}}; foreach $field ('primary', 'secondary', 'tertiary', 'see', 'seealso', 'psortas', 'ssortas', 'tsortas', 'significance', 'class', 'scope') { $idx->{$field} = $ididx->{$field}; } } } # Sort the index terms @term = sort termsort @term; # Move all of the non-alphabetic entries to the front of the index. @term = sortsymbols(@term); safe_open(*OUT, $outfile); # Write the index... if ($indexid) { print OUT "<$indextag id='$indexid'>\n\n"; } else { print OUT "<$indextag>\n\n"; } print OUT "\n"; print OUT "\n"; print OUT "\n\n"; print OUT "$title\n\n" if $title; $last = {}; # the last indexterm we processed $first = 1; # this is the first one $group = ""; # we're not in a group yet $lastout = ""; # we've not put anything out yet foreach $idx (@term) { next if $idx->{'startref'}; # no way to represent spans... next if ($idx->{'scope'} eq 'LOCAL') && ($scope eq 'GLOBAL'); next if ($idx->{'scope'} eq 'GLOBAL') && ($scope eq 'LOCAL'); next if &same($idx, $last); # suppress duplicates $termcount--; # If primary changes, output a whole new index term, otherwise just # output another secondary or tertiary, as appropriate. We know from # sorting that the terms will always be in the right order. if (!&tsame($last, $idx, 'primary')) { print "DIFF PRIM\n" if $debug; &end_entry() if not $first; if ($lettergroups) { # If we're grouping, make the right indexdivs $letter = $idx->{'psortas'}; $letter = $idx->{'primary'} if !$letter; $letter = uc(substr($letter, 0, 1)); # symbols are a special case if (($letter lt 'A') || ($letter gt 'Z')) { if (($group eq '') || (($group ge 'A') && ($group le 'Z'))) { print OUT "\n" if !$first; print OUT "$symbolsname\n\n"; $group = $letter; } } elsif (($group eq '') || ($group ne $letter)) { print OUT "\n" if !$first; print OUT "$letter\n\n"; $group = $letter; } } $first = 0; # there can only be on first ;-) print OUT "\n"; print OUT " ", $idx->{'primary'}; $lastout = "primaryie"; if ($idx->{'secondary'}) { print OUT "\n \n"; print OUT " ", $idx->{'secondary'}; $lastout = "secondaryie"; }; if ($idx->{'tertiary'}) { print OUT "\n \n"; print OUT " ", $idx->{'tertiary'}; $lastout = "tertiaryie"; } } elsif (!&tsame($last, $idx, 'secondary')) { print "DIFF SEC\n" if $debug; print OUT "\n \n" if $lastout; print OUT " ", $idx->{'secondary'}; $lastout = "secondaryie"; if ($idx->{'tertiary'}) { print OUT "\n \n"; print OUT " ", $idx->{'tertiary'}; $lastout = "tertiaryie"; } } elsif (!&tsame($last, $idx, 'tertiary')) { print "DIFF TERT\n" if $debug; print OUT "\n \n" if $lastout; if ($idx->{'tertiary'}) { print OUT " ", $idx->{'tertiary'}; $lastout = "tertiaryie"; } } &print_term($idx); $last = $idx; } # Termcount is > 0 iff some entries were skipped. print STDERR "$termcount entries ignored...\n"; &end_entry(); print OUT "\n" if $lettergroups; print OUT "\n"; close (OUT); print STDERR "Done.\n"; sub same { my($a) = shift; my($b) = shift; my($aP) = $a->{'psortas'} || $a->{'primary'}; my($aS) = $a->{'ssortas'} || $a->{'secondary'}; my($aT) = $a->{'tsortas'} || $a->{'tertiary'}; my($bP) = $b->{'psortas'} || $b->{'primary'}; my($bS) = $b->{'ssortas'} || $b->{'secondary'}; my($bT) = $b->{'tsortas'} || $b->{'tertiary'}; my($same); $aP =~ s/^\s*//; $aP =~ s/\s*$//; $aP = uc($aP); $aS =~ s/^\s*//; $aS =~ s/\s*$//; $aS = uc($aS); $aT =~ s/^\s*//; $aT =~ s/\s*$//; $aT = uc($aT); $bP =~ s/^\s*//; $bP =~ s/\s*$//; $bP = uc($bP); $bS =~ s/^\s*//; $bS =~ s/\s*$//; $bS = uc($bS); $bT =~ s/^\s*//; $bT =~ s/\s*$//; $bT = uc($bT); # print "[$aP]=[$bP]\n"; # print "[$aS]=[$bS]\n"; # print "[$aT]=[$bT]\n"; # Two index terms are the same if: # 1. the primary, secondary, and tertiary entries are the same # (or have the same SORTAS) # AND # 2. They occur in the same titled section # AND # 3. They point to the same place # # Notes: Scope is used to suppress some entries, but can't be used # for comparing duplicates. # Interpretation of "the same place" depends on whether or # not $linkpoints is true. $same = (($aP eq $bP) && ($aS eq $bS) && ($aT eq $bT) && ($a->{'title'} eq $b->{'title'}) && ($a->{'href'} eq $b->{'href'})); # If we're linking to points, they're only the same if they link # to exactly the same spot. (surely this is redundant?) $same = $same && ($a->{'hrefpoint'} eq $b->{'hrefpoint'}) if $linkpoints; $same; } sub tsame { # Unlike same(), tsame only compares a single term my($a) = shift; my($b) = shift; my($term) = shift; my($sterm) = substr($term, 0, 1) . "sortas"; my($A, $B); $A = $a->{$sterm} || $a->{$term}; $B = $b->{$sterm} || $b->{$term}; $A =~ s/^\s*//; $A =~ s/\s*$//; $A = uc($A); $B =~ s/^\s*//; $B =~ s/\s*$//; $B = uc($B); return $A eq $B; } sub end_entry { # End any open elements... print OUT "\n \n" if $lastout; print OUT "\n\n"; $lastout = ""; } sub print_term { # Print out the links for an indexterm. There can be more than # one if the term has a ZONE that points to more than one place. # (do we do the right thing in that case?) my($idx) = shift; my($key, $indent, @hrefs); my(%href) = (); my(%phref) = (); $indent = " "; if ($idx->{'see'}) { # it'd be nice to make this a link... if ($lastout) { print OUT "\n \n"; $lastout = ""; } print OUT $indent, "", &escape($idx->{'see'}), "\n"; return; } if ($idx->{'seealso'}) { # it'd be nice to make this a link... if ($lastout) { print OUT "\n \n"; $lastout = ""; } print OUT $indent, "", &escape($idx->{'seealso'}), "\n"; return; } if (keys %{$idx->{'zone'}}) { foreach $key (keys %{$idx->{'zone'}}) { $href{$key} = $idx->{'zone'}->{$key}; $phref{$key} = $idx->{'zone'}->{$key}; } } else { $href{$idx->{'href'}} = $idx->{'title'}; $phref{$idx->{'href'}} = $idx->{'hrefpoint'}; } # We can't use because we don't know the ID of the term in the # original source (and, in fact, it might not have one). print OUT ",\n"; @hrefs = keys %href; while (@hrefs) { my($linkend) = ""; my($role) = ""; $key = shift @hrefs; if ($linkpoints) { $linkend = $phref{$key}; } else { $linkend = $key; } $role = $phref{$key}; $role = $1 if $role =~ /\#(.*)$/; print OUT $indent; print OUT ""; print OUT "" if ($idx->{'significance'} eq 'PREFERRED'); print OUT &escape($href{$key}); print OUT "" if ($idx->{'significance'} eq 'PREFERRED'); print OUT ""; } } sub termsort { my($aP) = $a->{'psortas'} || $a->{'primary'}; my($aS) = $a->{'ssortas'} || $a->{'secondary'}; my($aT) = $a->{'tsortas'} || $a->{'tertiary'}; my($ap) = $a->{'count'}; my($bP) = $b->{'psortas'} || $b->{'primary'}; my($bS) = $b->{'ssortas'} || $b->{'secondary'}; my($bT) = $b->{'tsortas'} || $b->{'tertiary'}; my($bp) = $b->{'count'}; $aP =~ s/^\s*//; $aP =~ s/\s*$//; $aP = uc($aP); $aS =~ s/^\s*//; $aS =~ s/\s*$//; $aS = uc($aS); $aT =~ s/^\s*//; $aT =~ s/\s*$//; $aT = uc($aT); $bP =~ s/^\s*//; $bP =~ s/\s*$//; $bP = uc($bP); $bS =~ s/^\s*//; $bS =~ s/\s*$//; $bS = uc($bS); $bT =~ s/^\s*//; $bT =~ s/\s*$//; $bT = uc($bT); if ($aP eq $bP) { if ($aS eq $bS) { if ($aT eq $bT) { # make sure seealso's always sort to the bottom return 1 if ($a->{'seealso'}); return -1 if ($b->{'seealso'}); # if everything else is the same, keep these elements # in document order (so the index links are in the right # order) return $ap <=> $bp; } else { return $aT cmp $bT; } } else { return $aS cmp $bS; } } else { return $aP cmp $bP; } } sub sortsymbols { my(@term) = @_; my(@new) = (); my(@sym) = (); my($letter); my($idx); # Move the non-letter things to the front. Should digits be thier # own group? Maybe... foreach $idx (@term) { $letter = $idx->{'psortas'}; $letter = $idx->{'primary'} if !$letter; $letter = uc(substr($letter, 0, 1)); if (($letter lt 'A') || ($letter gt 'Z')) { push (@sym, $idx); } else { push (@new, $idx); } } return (@sym, @new); } sub safe_open { local(*OUT) = shift; local(*F, $_); if (($outfile ne '-') && (!$forceoutput)) { my($handedit) = 1; if (open (OUT, $outfile)) { while () { if (//){ $handedit = 0; last; } } close (OUT); } else { $handedit = 0; } if ($handedit) { print "\n$outfile appears to have been edited by hand; use -f or\n"; print " change the output file.\n"; exit 1; } } open (OUT, ">$outfile") || die "$usage\nCannot write to $outfile.\n"; if ($preamble) { # Copy the preamble if (open(F, $preamble)) { while () { print OUT $_; } close(F); } else { warn "$0: cannot open preamble $preamble.\n"; } } } sub escape { # make sure & and < don't show up in the index local $_ = shift; s/&/&/sg; s//>/sg; # what the heck return $_; }