#!/usr/bin/perl
#
# bib2html - make an html file that is equivalent to the bib files,
#   using one of the html bib styles
#
# usage: see $usage below.
# Output is to bib.html unless -o <outfile> is given.
#   alpha makes a bibliography like bibstyle{alpha}
#   long  makes a bibliography like bibstyle{abstract}
#   short makes a bibliography like bibstyle{alpha}, but with citekey for tags
#   cv    lists title first, does not print key, does not sort
#         (useful for listing your papers on your home page)
#
# David Kotz 7/94
# dfk@cs.dartmouth.edu
# updated Dan Gildea 1/2003:
#  . now handles nested \bf \em etc correctly
#  . added some more accents
#  . new -a file.aux usage to use existing .aux file, useful to
#    list only papers cited by a given latex document
# http://www.cs.rochester.edu/~gildea/
# updated 2/2004, thanks to Juergen Vollmer
#  a) allowing \emph{..}, \texttt{...} and \textbf{..}
#  b) setting the environment variable BSTINPUTS to directory from $0
#  c) if run on existing .aux file, bibstyle is now optional on the
#     command line.  If no bib style is specified on the command line,
#     uses the html version of the bib style from the original document.

use strict;
use warnings;

use File::Basename;
use File::Spec::Functions;

my $styles = '{alpha|index|long|longp|long-pario|short|short-pario|cv}';
my $usage  = "usage: $0 $styles [-o <outfile>] file.bib...\n"
           . "  OR\n"
           . " $0 [ $styles ] [-o <outfile>] -a file.aux";

# Base name (no extension) of the temporary .aux/.bbl/.blg files.
my $tmp = "/tmp/bib2html$$";

# Characters that LaTeX requires to be backslash-escaped; used as a regex
# character class when un-escaping them at the end of the conversion.
my $escaped_chars = quotemeta('#$%&_{}');

# Stack of formatted text types that were started and did not finish on the
# same line.  Elements are em, tt, b or i (or undef for a brace group with
# no formatting); when a candidate for a format-closing brace is found this
# stack is popped to see which format we have to close.
my @formats_to_close;

## -------------------------------------------------------------------
## locate the directory holding this script (and, presumably, the html-*.bst
## styles) and add it to bibtex's style search path

my $script_dir  = dirname($0);
my $link_target = readlink $0;      # undef when $0 is not a symlink
my $cmd_dir     = $script_dir;
if ( defined $link_target ) {
    # A relative link target is relative to the directory holding the link;
    # an absolute one stands on its own.
    my $target_dir = dirname($link_target);
    $cmd_dir = file_name_is_absolute($target_dir)
             ? $target_dir
             : catdir( $script_dir, $target_dir );
}
$ENV{BSTINPUTS} .= ":$cmd_dir";

## -------------------------------------------------------------------
## pick up the args

die "$usage\n" if @ARGV < 2;

my %opts;                           # will hold the option values
$opts{style} = shift @ARGV if $ARGV[0] !~ /^-/;

while ( @ARGV && $ARGV[0] =~ /^-/ ) {
    my $flag = shift @ARGV;
    if ( $flag eq '-o' ) {
        die "$usage\n" if !@ARGV;
        $opts{outfile} = shift @ARGV;
    }
    elsif ( $flag eq '-a' ) {
        die "$usage\n" if !@ARGV;
        $opts{auxfile} = shift @ARGV;
    }
    else {
        warn "ignoring unknown option $flag\n";
    }
}
$opts{outfile} = 'bib.html' if !defined $opts{outfile};

# On interrupt, remove the temporary files and stop.
$SIG{INT} = sub { cleanup(); exit 1; };

## -------------------------------------------------------------------
## build the temporary .aux file that drives bibtex

if ( $opts{auxfile} ) {
    print "Creating $tmp.aux for $opts{auxfile}\n";

    open my $aux_in, '<', $opts{auxfile}
        or die "could not open $opts{auxfile}: $!\n";
    open my $aux_out, '>', "$tmp.aux"
        or die "could not create $tmp.aux: $!\n";

    while ( my $line = <$aux_in> ) {
        # if style not specified on command line, use html version
        # of style specified in .aux file
        $line =~ s/^\\bibstyle\{([^\}]*)\}/"\\bibstyle{html-".($opts{style}||$1)."}"/e;
        print {$aux_out} $line;
    }

    close $aux_in;
    close $aux_out or fail("could not write $tmp.aux: $!");
}
else {
    die "bib style not defined\n" if !defined $opts{style};
    die "$usage\n"                if !@ARGV;

    ## get all the remaining file names;
    ## the list must be comma delimited, bibtex needs it
    my $files = join ',', map { killSuffix($_) } @ARGV;

    print "Creating $tmp.aux for $files\n";
    open my $aux_out, '>', "$tmp.aux"
        or die "could not create $tmp.aux: $!\n";

    ## need double slashes here to go around slash interpretation
    ## only one is printed
    print {$aux_out} <<"EOF";
\\relax
\\citation{*}
\\bibstyle{html-$opts{style}}
\\bibdata{$files}
EOF

    close $aux_out or fail("could not write $tmp.aux: $!");
}

## -------------------------------------------------------------------
## run bibtex

unlink glob("$tmp.{bbl,blg}");      # drop stale output from an earlier run

if ( -r 'html-split.bst.gz' ) {
    system( 'gunzip', 'html-split.bst.gz' );
}

print "bibtex $tmp\n";
system( 'bibtex', $tmp );           # list form: no shell involved
warn "could not run bibtex: $!\n" if $? == -1;
print "Bibtex done\n\n";

## -------------------------------------------------------------------
## the cleanup operation: turn the LaTeX in the .bbl file into HTML

open my $bbl, '<', "$tmp.bbl"
    or fail("could not read $tmp.bbl: $!");
open my $out, '>', $opts{outfile}
    or fail("could not write $opts{outfile}: $!");

LINE:
while (<$bbl>) {
    $_ = convert_symbols($_);

    # a few rules are needed to compensate for BibTeX's way of splitting
    # long words over two lines by sticking a % (TeX comment character) at
    # the end of the line.  This works when one word (usually a URL) is
    # split over more than one line.
    ## if we have an unescaped % at the end of the line, remove it and the
    ## newline and join the next line on
    ## example straight out of the camel book, pg 204. amazing
    my $next_line;
    if ( s/ ([^\\]) \% \n$ /$1/x and defined( $next_line = <$bbl> ) ) {
        $_ .= $next_line;
        redo LINE;                  # back to the top, without reading a line
    }

    $_ = convert_markup($_);
    print {$out} $_;
}

close $bbl;
close $out or fail("could not write $opts{outfile}: $!");

print "\n";
print "\noutput is in $opts{outfile}\n";

cleanup();

## -------------------------------------------------------------------
## subroutines

# convert_symbols($text)
#   Returns $text with LaTeX accents, special symbols, greek letters,
#   \cite commands and a few private macros replaced by HTML.
#   These rules run before continuation lines are joined, and run again on
#   the joined line.
sub convert_symbols {
    my ($text) = @_;

    for ($text) {
        ## accent handlers
        ## these are quite definite about their replacements and so should
        ## come before the more generic operations later
        ## all braces in the latex constructs are optional
        ## info on the html codes can be found at
        ## http://www.w3.org/TR/REC-html40/
        ## \\? is to handle latex \'\i which you do to put acute on i
        ## without dot
        s/ \\ \` (?: \{ )? \\? ([aeiouAEIOU])   (?: \} )? /&${1}grave;/gx;  # grave accent
        s/ \\ \' (?: \{ )? \\? ([aeiouAEIOU])   (?: \} )? /&${1}acute;/gx;  # 'acute accent
        s/ \\ \^ (?: \{ )? \\? ([aeiouAEIOU])   (?: \} )? /&${1}circ;/gx;   # circumflex
        s/ \\ \" (?: \{ )? \\? ([aeiouyAEIOUY]) (?: \} )? /&${1}uml;/gx;    # "umlaut
        s/ \\ \~ (?: \{ )? \\? ([anoANO])       (?: \} )? /&${1}tilde;/gx;  # tilde
        s/ \\ c  (?: \{ )?     ([cC])           (?: \} )? /&${1}cedil;/gx;  # cedilla
        s/ \\ [v\'] (?: \{ )?  ([cC])           (?: \} )? /$1/gx;   # no html code for czech hook

        # german Scharf-S, written as \ss or \"s.  Entities are emitted
        # rather than raw characters so the output does not depend on the
        # encoding of this script.  (?![A-Za-z]) keeps \ss from matching
        # the start of a longer command name.
        s/ \{? \\ (?: ss (?![A-Za-z]) | \"s ) \}? /&szlig;/gx;

        ## several weird symbols
        s/ \\copyright (?![A-Za-z]) /&copy;/gx;
        s/ \\pounds    (?![A-Za-z]) /&pound;/gx;

        ## weirder symbols
        ## the lookahead stops \o from eating the front of \omega etc.
        s/ \\ (ae|AE) (?![A-Za-z]) /&${1}lig;/gx;
        s/ \\ (o|O)   (?![A-Za-z]) /&${1}slash;/gx;

        ## greek letters, case insensitive matching, but upper case in
        ## latex and html have the first letter of the english word
        ## capitalized
        s/ \\ (?: var )? (alpha|beta|gamma|delta|epsilon|theta|lambda|pi|rho|sigma|omega) (?![A-Za-z]) /&$1;/gxi;

        ## remove any \/ space-increasing symbols
        s{ ([^\\]) \\ / }{$1}gx;
        s/\\ / /g;                  # "\ " (control space) becomes a space

        ## deal with \cite stuff, change it to a link to a record in the
        ## same html file
        s+ ([^\\]) \\cite\{ (.*?) \} +$1<a href="#$2">$2</a>+xg;

        ## These rules are to deal with my (DFK) macros
        s/\\ie/i.e./g;
        s/\\eg/e.g./g;
        s/\\etc/etc./g;
        s+\\vs\\+<EM>vs.</EM>+g;
        s/\\usec/usec/g;
        s/\\mbox //g;
        s/\\par / <P> /g;
        s/\\par$/ <P>/g;

        s/\\&/&amp;/g;              # ampersand
        s/-{2,3}/-/g;               # multiple dashes
    }

    return $text;
}

# convert_markup($text)
#   Returns $text with hyphenation hints removed, tildes resolved, brace
#   groups and font commands turned into HTML tags, and backslash-escaped
#   characters un-escaped.  Formats still open at the end of $text stay on
#   @formats_to_close so a later line can close them.
sub convert_markup {
    my ($text) = @_;

    for ($text) {
        # hyphenation characters should be removed
        s/\\-//g;

        # tildes -
        #   tilde not preceded by \ or / is a nbsp
        #   \~{} is ~ (likely in a URL)
        #   all other tildes left alone, notably /~ (URL)
        s{ ([^\\/]) ~ }{$1&nbsp;}xg;    # normal standalone tilde - nbsp
        s/ \\~ \{\} /~/xg;              # \~{} to ~ - do before removing braces below

        ## --------------------------------------------------------
        ## deal with em and tt and bf text surrounded by braces
        ## deal with \emph{..}, \texttt{...} and \textbf{..}
        ## the fancy groupings around the em etc are because in html bold
        ## font is not 'bf' but just b and italic is i not 'it' so we have
        ## to pick out only a part of those latex tags
        ##
        ## Each pass handles the leftmost brace (with its command, if any)
        ## and removes it, so the loop ends once no braces are left.  The
        ## backslash is part of the match so that a bare "textbf{" is
        ## treated as a plain brace rather than matching forever.
        while ( / (?: \\ ( text(b)f | text(tt) | (em)ph ) )? ( [{}] ) /x ) {
            my $format;
            if ( defined $1 ) {
                ## \textbf{, \texttt{ or \emph{
                my $cmd = $1;
                $format = $2 || $3 || $4;
                s/\\\Q$cmd\E\{/<$format>/;
                push @formats_to_close, $format;
            }
            elsif ( $5 eq '{' ) {
                ## beginning of format
                ## final |\w+ is to skip over \rm or any other commands we
                ## don't handle; also removes braces with no command
                s/ \{ (?: \\ (?: (em)|(b)f|(tt)|(i)t|\w+ ) )? \s* / ( $format = $1 || $2 || $3 || $4 ) ? "<$format>" : '' /ex ;
                ## push the format to be closed onto the stack
                ## (may be nothing)
                push @formats_to_close, $format;
            }
            else {
                ## pop format to close from stack
                s+ \s*\} + ( $format = pop (@formats_to_close) ) ? "</$format>" : '' +ex ;
            }
        }

        ## --------------------------------------------------------
        ## retrieve symbols escaped by backslashes
        s/ ([^\\]) \\ ([$escaped_chars]) /$1$2/gx;
    }

    return $text;
}

# killSuffix($file)
#   Returns $file, directory part included, with its last ".suffix" removed.
sub killSuffix {
    my ($file) = @_;

    # the pattern indicates what a suffix looks like
    my ( $name, $path ) = fileparse( $file, '\.[^.]*$' );
    return $path . $name;
}

# cleanup()
#   Removes the temporary .aux, .bbl and .blg files.
sub cleanup {
    unlink glob("$tmp.{aux,bbl,blg}");
    return;
}

# fail($message)
#   Removes the temporary files, then dies with $message.
sub fail {
    my ($message) = @_;
    cleanup();
    die "$message\n";
}