#!/usr/bin/perl -w
##
# Import bibtex data (files/stdin), breaking it on `^\s*@'
# into separate files, making some standardisations to
# formatting, then writing each entry to a file in the pwd,
# named in Author-Year 'NamNN' format, with suffixed number
# if the chosen filename already exists.  A 'key' field can 
# override the chosen NamNN if either Nam or NN has failed.
#
# The parsing is NOT proper bibtex: bibtex comments are any
# text outside an entry, an entry being defined as @\w+{...}
# where `...' may contain newlines and matching pairs of
# braces.  For simplicity, no brace matching is done here,
# but comments may be shown by lines starting with `%'.
#
# `Bibtexing' (1988), from the tetex distribution, has been
# used as the source of some details of comments, name formats
# and required/optional fields.
#
# Nathaniel,
#    started 2008-10-03.
#    working 'enough' 2008-12-23.
#    working 'quite neatly' 2008-12-25.
#    considered finished 2008-12-28.
#

# options
$month_full = 1;  # true value ==>  e.g. 'January' rather than 'jan'


# hope to avoid problems of non-ascii names (e.g. author),
# by using e.g. \w rather than [a-zA-Z] and having a utf-8 locale;
# the actual language/country makes little/no difference
use POSIX 'locale_h';
setlocale(LC_ALL, 'en_GB.UTF-8');

## Get non-comment input into a single line, then split into bibtex entries.
$in = '';
while (<>) {
    if (!m/^[\t ]*$/ && !m/^[\t ]*[%#]/) {
        s/^[\t ]*@/REAL@/g;
        $in .= $_;
    }
}
$in =~ s/\s+/ /g;
@entries = split('REAL@', $in);
shift(@entries);

## Entry by entry, do processing: separate fields, do `correcting', write bibtex file.
$nent = 0;
for (@entries) {

    %fields = ();
    printf "\n## Processing Bibtex entry %d\n", ++$nent;

    # remove top (e.g. '@Article{lab:el,') and bottom (e.g. '}')
    s/ *(\w+) *{ *([^,]+?),//;
    $type = "\L$1";
    $label_orig = $2;
    s/}[ ,]*$//;

    # split into fields (e.g. 'title = {The Title},' or 'title="The Title",')
    s/ *(\w+) *= *([{"])/\n$1 = $2/g;
    @lines = split('\n');
    shift(@lines);
    for (@lines) {
        if (m/^ *(\w+) *= *[{"] *(.*?) *[}"][, ]*$/) {
            $fieldname = "\L$1";
            $content = $2;
            $fields{$fieldname} = $content;
        } else {
            print "couldn't match field with \"name = {value}\" pattern\n";
            print "$_\n";
        }
    }

    # split authors or editors (on ' and ', but some sources may use ';')
    # and try to format names in 'Surname, Rest' format
    $label = 'Nnn';
    for $authedit ( ('author', 'editor') ) {
      if (exists($fields{$authedit})) {
        $fields{$authedit} =~ s/( *; *| +and +)/\n/g;
        @people = split( '\n', $fields{$authedit} );
        for $person (@people) {
            if ( !($person =~ m/,/) ) { # 'First Last' -> 'Last, First'
                $person =~ s/^(.*?) (([[:lower:]]+ )?\w+)\W*$/$2, $1/;
            }
            $person =~ s/\.([^ ,])/. $1/g; # space the initials
            if ($person && $label=~m/^Nnn$/) {
                $_ = $person;
                # the following is a bodge to avoid (some) non-ascii troubles
                s/Ä/A/g;s/Æ/A/g;s/Ö/O/g;s/Ø/O/g;s/Å/A/g;s/Ü/U/g;s/É/E/g;
                s/ä/a/g;s/æ/a/g;s/ö/o/g;s/ø/o/g;s/å/a/g;s/ü/u/g;s/é/e/g;
                # clean up before parsing
                s/-/ /g; # treat e.g. 'Espino-Cortes' similarly to 'Van Brunt', for label
                s/[^\w ,.]//g; # prevent e.g. 'G\{ae}..' from giving a label of 'G\{'
                if (m/^ *([[:upper:]])\w+ +([[:upper:]])\w+ *,/) { 
                    # name is e.g. 'Van Brunt' or 'Espino-Cortes': take VB or EC
                    $label = $1 . $2;
                } elsif (m/^[[:lower:] ]*([[:upper:]]\w{0,2})\w* *,/) { 
                    # name is e.g. 'Smith' or 'van der Laan': take Smi or Laa
                    $label = $1;
                }
            }
        }
        $fields{$authedit} = join(' and ', @people);
      }
    }

    # check for presence of a year
    $label .= 'XX';
    if (!exists($fields{'year'})) {
        print "missing year\n";
    } else {
        if ($fields{'year'} =~ m/\d\d(\d\d)/) {
                $y2d = $1;
                $label =~ s/XX/$y2d/;
        } else {
            print "year doesn't contain a numeric year:  " .
                "\"$fields{'year'}\"\n";
        }
    }

    # let the optional `key' field override automatic label iff full person/year not found
    if ($fields{'key'} && $label=~m/(^Nnn|XX$)/) {
        printf "using supplied 'key' field '$fields{'key'}' instead of automatic '$label'\n";
	$label = $fields{'key'};
    }
    #
    printf "%d: '$label' replaces original label '$label_orig'\n", $nent;

    # make months into a standard format (I've chosen full name)
    if (exists($fields{'month'}) && $fields{'month'} =~ m/\W*([A-Za-z]{3}).*/) {
        $fields{'month'} = "\L$1";
        if ($fields{'month'} =~ /[a-z]{3}/) {
            if ($month_full) {
                %months = ( qw(  jan January    feb February   mar March
                                 apr April      may May        jun June
                                 jul July       aug August     sep September
                                 oct October    nov November  dec December ) );
                $fields{'month'} = $months{$fields{'month'}};
            }
	} else {
	    printf "month '%s' doesn't match expected format\n", $fields{'month'};
        }
    }

    # clean page-numbering: hope to keep 'n,n-n,n' ranges, but
    # remove spaces and the frequent ieeexploit 'vol.N', etc.
    if (exists($fields{'pages'})) {
	if ($fields{'pages'} =~ m/^(\d+) *-+ *(\d+)$/) {
		$fields{'pages'} = "$1--$2";
	} elsif ($fields{'pages'} =~ m/^(\d+) *-+ *(\d+) *vol(.|ume) *(\d+)$/) {
                $fields{'pages'} = "$1--$2";
		if ($fields{'volume'}) {
			print "'volume' and 'pages' fields specify a volume\n";
		} else {
			$fields{'volume'} = $4;
		}
	} else {
		printf "page description '%s' isn't the neat n-m format: beware\n", 
			$fields{'pages'}; 
	}
    }

    # protect likely desirable capitals in titles (e.g. 'PDs'->'{PDs}', 'SiC'->'{SiC}')
    if (exists($fields{'title'})) {
        $fields{'title'} =~ s/(^|[^\w{])([[:upper:]][[:alpha:]]*[[:upper:]]'?\w?)($|[^\w}])/$1\{$2\}$3/g;
    }

    # ieeexplore often bodges conference papers into journal-article entries; correct this
    if (exists($fields{'journal'})) {
        if ($fields{'journal'} =~ m/(Conference|Symposium|Colloquium|Seminar)/) {
            printf "changing reference type from $type to InProceedings;\n" .
                   "and moving fieldname 'journal' to 'booktitle'\n";
            $type = 'inproceedings';
            $fields{'booktitle'} = $fields{'journal'};
            delete $fields{'journal'};
        }
    }

    # my colleagues don't like permutated titles such as 'Things and Stuff. Conference on':
    # so, try making them be in straight form; also, remove excessive extra information;
    # this is *very* hard to do, given the range of input from different sources --
    # better, for known publications, is to replace the entire field when recognising 
    # a particular string
    for $source ( ('journal', 'booktitle') ) {
        if ($fields{$source}) {
            $_ = $fields{$source};
            s/\s+/ /g;
            # convert recognised sources to a standard form
            if (m/iee[^e]/i && m/proceedings/i) {
                if (m/science, measurement/i) {
                    $_ = '{IEE} Proceedings -- Science, Measurement and Technology';
                }
            } elsif (m/iet\W/i && m/science, measurement/i) {
                    $_ = '{IET} Science, Measurement and Technology';
            } elsif (m/ieee/i && m/transactions/i) {
                if (m/energy conversion/i) {
                    $_ = '{IEEE} Transactions on Energy Conversion';
                } elsif (m/industry applications/i) {
                    $_ = '{IEEE} Transactions on Industry Applications';
                } elsif (m/on electrical insulation/i) {
                    $_ = '{IEEE} Transactions on Electrical Insulation';
                } elsif (m/dielectrics and electrical insulation/i) {
                    $_ = '{IEEE} Transactions on Dielectrics and Electrical Insulation';
                }
            } elsif (m/international symposium/i) {
                if (m/electrical insulating materials/i) {
                    $_ = 'International Symposium on Electrical Insulating Materials (ISEM)';
                } elsif (m/electrical insulation/i) {
                    $_ = 'International Symposium on Electrical Insulation (ISEI)';
                } elsif (m/high voltage engineering/i) {
                    $_ = 'International Symposium on High Voltage Engineering (ISH)';
                }
            } elsif (m/international conference/i) {
                if (m/applications/i && m/dielectric materials/i) {
                    $_ = 'International Conference on Properties and Applications of Dielectric Materials';
                } elsif (m/conduction and breakdown/i && m/solid dielectrics/i) {
                    $_ = 'International Conference on Conduction and Breakdown in Solid Dielectrics (ICSD)';
                } elsif (m/solid dielectrics/i) {
                    $_ = 'International Conference on Solid Dielectrics (ICSD)';
                } elsif (m/coil winding tech/i) {
                    $_ = 'Electrical Insulation Conference and ' . 
                         'Electrical Manufacturing \& Coil Winding Technology Conference';
                }
            } elsif (m/conference/i) {
                if (m/insulation/i && m/dielectric phenomena/i) { 
                    $_ = 'Conference on Electrical Insulation and Dielectric Phenomena (CEIDP)';
                }
            }
            # basic cleaning up (ideally *not* done to already substituted names from the
            # above, but the nested 'if' structure there doesn't easily permit making 
            # the following chunk happen iff no title matched
            if ($_ eq $fields{$source}) {
                s/\([^)]+\.[^)]*\)//g; # remove whole sentences in parentheses
                s/^(.*)[,.] ([[:upper:]][^,.]*) *$/$2 $1/; # try `depermutation' (see above comment)
                s/\[[^\]]*\]//g; # remove anything in [...]
                s/([^\\])&/$1\\\&/g; # protect bare ampersands
                s/ +/ /g; s/^ //; s/ $//;  # clean spaces
            }
            # replace field if it was changed
            if ($_ ne $fields{$source}) {
                printf "re-writing '$source' field:\n";
                printf " < %s\n", $fields{$source};
                printf " > %s\n", $_;
                $fields{$source} = $_;
            }
        }
    }

    # check presence of fields; set order of writing fields
    @fieldlist_orig = sort keys %fields;
    @putafterreq = qw( month key );
    @putatend = qw( note isbn issn doi keywords abstract );
    # the various bibtex entry types, with required fields (`bibtexing', 1988)
    # keys are lowercased names; arrays are 'neat' names followed by required
    # fields; alternative required fields can be shown with a `|'
    %types = (
      article       => [ 'Article', 'author', 'title', 'journal', 'year' ],
      book          => [ 'Book', 'author|editor', 'title', 'publisher', 'year' ],
      booklet       => [ 'Booklet', 'title' ],
      conference    => [ 'InProceedings', 'author', 'title', 'booktitle', 'year' ],
      inproceedings => [ 'InProceedings', 'author', 'title', 'booktitle', 'year' ],
      inbook        => [ 'InBook', 'author|editor', 'title', 'chapter|pages', 'publisher', 'year' ],
      incollection  => [ 'InBook', 'author', 'title', 'booktitle', 'publisher', 'year' ],
      manual        => [ 'Manual', 'title' ],
      mastersthesis => [ 'MastersThesis', 'author', 'title', 'school', 'year' ],
      misc          => [ 'Misc' ],
      phdthesis     => [ 'PhdThesis', 'author', 'title', 'school', 'year' ],
      proceedings   => [ 'Proceedings', 'title', 'year' ],
      techreport    => [ 'TechReport', 'author', 'title', 'institution', 'year' ],
      unpublished   => [ 'Unpublished', 'author', 'title', 'note']
    );
    # if entry type is recognised, check for required fields
    if (!$types{$type}) {
        printf "reference type '$type' is unrecognised:\n" .
               "no checking of fields will be done\n";
        @fieldlist = @putafterreq;
	push @fieldlist, @fieldlist_orig;
    } else {
        @reqfields = @{$types{$type}};
        $type = shift @reqfields;
        @fieldlist = ();
        for (@reqfields) {
            $alt = '';
            if (m/\|/) {
                @reqalt = split('\|');
                $alt = ' alternative';
            } else {
                @reqalt = ($_);
            }
            $foundreq = '';
            for (@reqalt) {
                if ($fields{$_}) {
                    if ($foundreq) {
                        print "$type: found '$_' but ALREADY had alternative field '$foundreq'\n";
                    } else {
                        push @fieldlist, $_;
                        $foundreq = $_;
                    }
                }
            }
            if (!$foundreq) {
                print "$type: REQUIRED$alt field '[not?]$_' not found\n";
            }
        }
        push @fieldlist, @putafterreq;
	push @fieldlist, @fieldlist_orig;
     }
    # make 'required' then 'putafterreq' fields come first
    @fieldlist_orig = @fieldlist;
    @fieldlist = (); %tmp = ();
    for (@fieldlist_orig) {
        if (!exists($tmp{$_})) {
            $tmp{$_} = 'exists';
            push @fieldlist, $_;
        }
    }
    # make 'putatend' fields come at the end
    push @fieldlist, @putatend;
    @fieldlist_orig = reverse @fieldlist;
    @fieldlist = (); %tmp = ();
    for (@fieldlist_orig) {
        if (!exists($tmp{$_})) {
            $tmp{$_} = 'exists';
            push @fieldlist, $_;
        }
    }
    @fieldlist = reverse @fieldlist;


    # find a filename that doesn't yet exist: start by trying NnnXX-format label
    $file = $label;
    while (-e $file) {
        $file_old = $file;
        if ($file =~ m/^(.*)\.(\d{3})/) {
            $file = sprintf "%s.%03d", $1, $2+1;
        } else {
            $file .= '.001';
        }
        print "output file '$file_old' exists: trying '$file'\n";
    }
    # write the record to its own file
    if ( !open(SEPF, ">$file") ) {
        print "error opening $file for writing: giving up on reference $label\n";
        next;
    }
    print SEPF "\@$type\{$label,\n";
    for $key (@fieldlist) {
        if ($fields{$key}) {
            print SEPF "$key = {$fields{$key}},\n";
        }
    }
    print SEPF "}\n\n";
    close(SEPF);

}

printf "\n\n";