#!/usr/bin/perl
#
# bib2html - make an html file that is equivalent to the bib files,
#   using one of the html bib styles
#
# usage: the two accepted command-line forms, one per line; this text is
# what the script dies with when the command line is rejected
$usage = join( "\n",
    "usage: $0 {alpha|index|long|longp|long-pario|short|short-pario|cv} [-o <outfile>] file.bib...",
    "OR ",
    "$0 [ {alpha|index|long|longp|long-pario|short|short-pario|cv} ] [-o <outfile>] -a file.aux" );
# Output is to bib.html unless -o <outfile> is given.
#    alpha makes a bibliography like bibstyle{alpha}
#    long makes a bibliography like bibstyle{abstract}
#    short makes a bibliography like bibstyle{alpha}, but with citekey for tags
#    cv lists title first, does not print key, does not sort
#	(useful for listing your papers on your home page)
#
# David Kotz 7/94
# dfk@cs.dartmouth.edu

# updated Dan Gildea 1/2003:
#  . now handles nested \bf \em etc correctly
#  . added some more accents
#  . new -a file.aux usage to use existing .aux file, useful to 
#	list only papers cited by a given latex document
# http://www.cs.rochester.edu/~gildea/

# updated 2/2004, thanks to Juergen Vollmer
#  a) allowing \emph{..}, \texttt{...} and \textbf{..}
#  b) setting the environment variable BSTINPUTS to directory from $0
#  c) if run on existing .aux file, bibstyle is now optional on the command line.
#	if no bib style is specified on the command line, uses the html
#	version of the bib style named in the original document's .aux file.

use File::Basename;
use File::Spec::Functions;

## prefix of the temporary .aux/.bbl/.blg files ($$ is our process id)
$tmp = "/tmp/bib2html$$";

## Directory this script lives in, following one level of symlink.
##  - not a symlink:        the directory part of $0
##  - absolute link target: the directory part of the target
##  - relative link target: resolved against the directory of the link
## (the old code always glued dirname($0) in front, which doubled the
## directory for a plain file and mangled absolute link targets)
my $link_target = readlink($0);
my $cmd_dir;
if ( !defined $link_target ) {
    $cmd_dir = dirname($0);
} elsif ( file_name_is_absolute($link_target) ) {
    $cmd_dir = dirname($link_target);
} else {
    $cmd_dir = catdir( dirname($0), dirname($link_target) );
}

## let bibtex search the script's directory for style files as well;
## appended, so whatever BSTINPUTS already held keeps priority
$ENV{BSTINPUTS} .= ":$cmd_dir";

## the shortest valid command lines have two arguments
## ("<style> file.bib" or "-a file.aux")
if ( scalar(@ARGV) < 2 ) {
    die "$usage\n";
}


## pick up the args
##   $Opts{style}    optional leading bib style name (anything not starting with -)
##   $Opts{outfile}  value of -o, defaults to bib.html
##   $Opts{auxfile}  value of -a, an existing .aux file to start from
## whatever is left in @ARGV afterwards is the list of .bib files
my %Opts;			# will hold the option values

if ( @ARGV and $ARGV[0] !~ /^-/ ) {
    $Opts{style} = shift @ARGV;
}

while ( @ARGV and $ARGV[0] =~ /^-/ ) {
    my $flag = shift @ARGV;

    ## map the flag onto the option it fills; anything else is a typo
    ## and must not be dropped silently
    my $slot = $flag eq "-o" ? 'outfile'
	     : $flag eq "-a" ? 'auxfile'
	     :                 undef;
    die "unknown option $flag\n$usage\n" unless defined $slot;

    ## both flags take a value
    die "option $flag needs an argument\n$usage\n" unless @ARGV;
    $Opts{$slot} = shift @ARGV;
}

$Opts{outfile} = "bib.html" if ( !defined ($Opts{outfile}) );

## On Ctrl-C remove the temporary files and stop; a handler that only
## cleaned up would let the script carry on after the interrupt.
$SIG{INT} = sub { cleanup(); exit 1; };

## Write $tmp.aux, the file bibtex is run on.
if ($Opts{auxfile}) {
    ## -a mode: copy the given .aux file, rewriting its \bibstyle line.
    ## Open the input first so a bad name fails before anything is created.
    open( my $aux_in, '<', $Opts{auxfile} )
	or die "could not open $Opts{auxfile}: $!\n";

    print "Creating $tmp.aux from $Opts{auxfile}\n";
    open( my $aux_out, '>', "$tmp.aux" )
	or die "could not create $tmp.aux: $!\n";

    while (<$aux_in>) {
	# if style not specified on command line, use html version
	# of style specified in .aux file
	s/^\\bibstyle\{([^\}]*)\}/"\\bibstyle{html-".($Opts{style}||$1)."}"/e;
	print {$aux_out} $_;
    }
    close $aux_in;
    close($aux_out) or die "error writing $tmp.aux: $!\n";
} else {
    ## .bib mode: build a minimal .aux that cites every entry
    if (!defined $Opts{style}) {
	die "bib style not defined";
    }
    die "$usage\n" unless @ARGV;	# need at least one .bib file

    ## all the remaining args are file names; bibtex wants them without
    ## suffix and comma delimited
    my $files = join( ',', map { killSuffix($_) } @ARGV );

    print "Creating $tmp.aux for $files\n";
    open( my $aux_out, '>', "$tmp.aux" )
	or die "could not create $tmp.aux: $!\n";

## need double slashes here to go around slash interpretation
## only one is printed
    print {$aux_out} <<EOF;
\\relax 
\\citation{*}
\\bibstyle{html-$Opts{style}}
\\bibdata{$files}
EOF
    close($aux_out) or die "error writing $tmp.aux: $!\n";
}
    
## throw away any .bbl/.blg carrying our temporary name before bibtex runs
unlink( glob("$tmp.{bbl,blg}") );

## if a readable html-split.bst.gz sits in the current directory, unpack it
system("gunzip html-split.bst.gz") if -r "html-split.bst.gz";

## run bibtex on the temporary .aux file, announcing it before and after
print "bibtex $tmp\n";
system("bibtex $tmp");
print "Bibtex done\n\n";


## -------------------------------------------------------------------
## the cleanup operation: read the .bbl file bibtex produced, translate
## the LaTeX markup left in it to HTML one line at a time, and write the
## result to the output file
open( my $bbl, '<', "$tmp.bbl" )
    or do { my $err = $!; cleanup();
	    die "could not open $tmp.bbl (did bibtex fail?): $err\n"; };
open( my $out, '>', $Opts{outfile} )
    or do { my $err = $!; cleanup();
	    die "could not create $Opts{outfile}: $err\n"; };

## stack of formatted text types that were started and did not finish on the same line
## elements are em, tt or b or i and when a candidate for a format-closing brace is found
## this stack will be popped to see which format we have to close
my @formatsToClose;

## characters that lose their escaping backslash at the very end;
## the same for every line, so computed once
my $escapedChars = quotemeta ( '#$%&_{}' );

while ( <$bbl> ) {

    ## accent handlers
    ## these are quite definite about their replacements and so should
    ## come before the more generic operations later
    ## all braces in the latex constructs are optional
    ## info on the html codes can be found at http://www.w3.org/TR/REC-html40/
    ## \\? is to handle latex \'\i which you do to put acute on i without dot
    s/\\ \` (?: \{ )? \\? ([aeiouAEIOU]) (?: \} )?
          /&$1grave;/gx;	# grave accent
    s/ \\ \' (?: \{ )? \\? ([aeiouAEIOU]) (?: \} )?
          /&$1acute;/gx;	# 'acute accent
    s/ \\ \^ (?: \{ )? \\? ([aeiouAEIOU]) (?: \} )?
          /&$1circ;/gx;		# circumflex
    s/ \\ \" (?: \{ )? \\? ([aeiouyAEIOUY]) (?: \} )?
          /&$1uml;/gx;		# "umlaut
    s/ \\ \~ (?: \{ )? \\? ([anoANO]) (?: \} )?
          /&$1tilde;/gx;	# tilde
    s/ \\ c (?: \{ )? ([cC]) (?: \} )?
          /&$1cedil;/gx;	# cedilla
    s/ \\ [v\'] (?: \{ )? ([cC]) (?: \} )?
          /$1/gx;		# no html code for czech hook
    s/ \{? \\[s\"]s \}?
          /&szlig;/gx;          # german Scharf-S

    ## several weird symbols
    s/ \\copyright
        /&copy;/gx;
    s/ \\pounds
        /&pound;/gx;

    ## weirder symbols
    ## (?![A-Za-z]) makes sure the whole command name was matched: without
    ## it \o would also fire on the front of \omega and give &oslash;mega
    s/ \\ (ae|AE) (?![A-Za-z])
        /&$1lig;/gx;
    s/ \\ (o|O) (?![A-Za-z])
        /&$1slash;/gx;
    s/ \\ss
        /&szlig;/gx;

    ## greek letters, case insensitive matching, but upper case in latex and
    ## html have the first letter of the english word capitalized
    ## (same whole-name guard as above, so \pi does not eat into a longer name)
    s/ \\ (?: var )? (alpha|beta|gamma|delta|epsilon|theta|lambda|pi|rho|sigma|omega) (?![A-Za-z])
        /&$1;/gxi;

    ## remove any \/ space-increasing symbols
    s+ ([^\\]) \\ \/
        +$1+gx;

    ## backslash-space becomes a plain space
    s+ \\ \ + +gx;

    ## deal with \cite stuff, change it to a link to a record the same html file
    s+ ([^\\]) \\cite\{ (.*?) \}
        +$1<a href="#$2">$2</a>+xg;

    ## These rules are to deal with my (DFK) macros
    s/\\ie/i.e./g;
    s/\\eg/e.g./g;
    s/\\etc/etc./g;
    s+\\vs\\+<EM>vs.</EM>+g;
    s/\\usec/usec/g;
    s/\\mbox //g;
    s/\\par / <P> /g;
    s/\\par$/ <P>/g;


    s/\\\&/\&amp;/g;		# ampersand

    s/-{2,3}/-/g;		# multiple dashes

# a few rules are needed to compensate for BibTeXs way of splitting
# long words over two lines by sticking a % (TeX comment character) at
# the end of the line.  This works when one word (usually a URL) is
# split over more than one line.
    ## if we have an unescaped % at the end of the line, remove it and the newline and
    ## join the next line on
    ## example straight out of the camel book, pg 204. amazing
    if ( s/ ([^\\]) \% \n$ /$1/x and defined( my $nextline = <$bbl> ) ) {
        $_ .= $nextline;
        redo;			# back to the top, without reading another line
    }

# hyphenation characters should be removed
    s+\\-++g;

# tildes -
#         tilde not preceded by \ or / is a nbsp
#         \~{} is ~ (likely in a URL)
#         all other tildes left alone, notably /~ (URL)
    s+ ([^\\/]) ~ +$1&nbsp;+xg;	# normal standalone tilde - nbsp
    s/ \\~ \{\} /~/xg;		# \~{} to ~ - do before removing braces below

    ## --------------------------------------------------------
    ## deal with em and tt and bf text surrounded by braces
    ## deal with \emph{..}, \texttt{...} and \textbf{..}

    ## the fancy groupings around the em etc are because in html bold font is not 'bf'
    ## but just b and italic is i not 'it' so we have to pick out only a part of those
    ## latex tags
    ## final |\w+ is to skip over \rm or any other commands we don't handle
    ## also removes braces with no command

    ## Each pass handles the leftmost brace still in the line.  The
    ## backslash is part of the match so that a bare "textbf{" in the text
    ## counts as a plain brace; matching it as a command would leave the
    ## substitution below with nothing to replace and the loop would
    ## never end.
    while ( /(?:\\(text(b)f|text(tt)|(em)ph))?([\{\}])/ ) {
        if (defined $1) {
            ## \textbf{ , \texttt{ or \emph{ : open the html tag
            my $cmd = $1;
            my $fmt = $2 || $3 || $4;
            s/\\$cmd\{/<$fmt>/;
            push ( @formatsToClose, $fmt );
        } elsif ($5 eq '{') {
            ## beginning of format
            ## || rather than "or": "or" binds looser than "=", which left
            ## $format holding only $1 and turned {\bf ..} into "<>"
            my $format;
            s/ \{ (?: \\ (?: (em)|(b)f|(tt)|(i)t|\w+ ) )? \s*
                / ( $format = $1 || $2 || $3 || $4 ) ? "<$format>" : '' /ex ;
            ## push the format to be closed onto the stack (may be nothing)
            push ( @formatsToClose, $format );
        } else {
            ## pop format to close from stack
            my $format = pop (@formatsToClose);
            s+ \s*\}
            + $format ? "</$format>" : '' +ex  ;
        }
    }

    ## --------------------------------------------------------

    ##retrieve symbols escaped by backslashes
    s/ ([^\\]) \\ ([$escapedChars])
        /$1$2/gx;

    print {$out} $_;
}

close $bbl;

## buffered write errors only show up at close; remember the outcome,
## remove the temporary files either way, then report
my $write_ok  = close($out);
my $write_err = $!;

cleanup();

die "error writing $Opts{outfile}: $write_err\n" unless $write_ok;

print "\n";
print "\noutput is in $Opts{outfile}\n";

## killSuffix FILE
##   Return FILE with its last suffix (the final "." and whatever follows
##   it) removed.  The directory part is kept as fileparse reports it, so
##   a bare name comes back with "./" in front: "refs.bib" -> "./refs".
##   Works on lexicals only, so the caller's $file/$name/$path globals
##   (and its foreach variable) are left alone.
sub killSuffix {
    my $file = shift;
    # the pattern indicates what a suffix looks like
    my ( $name, $path ) = fileparse ( $file, '\.[^.]*$' );
    return $path . $name;
}

## cleanup
##   Delete the temporary .aux, .bbl and .blg files named after $tmp.
##   Files that do not exist are simply skipped by unlink.
sub cleanup {
    my @leftovers = map { "$tmp.$_" } qw(aux bbl blg);
    unlink @leftovers;
}