#!/usr/bin/perl -w ## # Import bibtex data (files/stdin), breaking it on `^\s*@' # into separate files, making some standardisations to # formatting, then writing each entry to a file in the pwd, # named in Author-Year 'NamNN' format, with suffixed number # if the chosen filename already exists. A 'key' field can # override the chosen NamNN if either Nam or NN has failed. # # The parsing is NOT proper bibtex: bibtex comments are any # text outside an entry, an entry being defined as @\w+{...} # where `...' may contain newlines and matching pairs of # braces. For simplicity, no brace matching is done here, # but comments may be shown by lines starting with `%'. # # `Bibtexing' (1988), from the tetex distribution, has been # used as the source of some details of comments, name formats # and required/optional fields. # # Nathaniel, # started 2008-10-03. # working 'enough' 2008-12-23. # working 'quite neatly' 2008-12-25. # considered finished 2008-12-28. # # options $month_full = 1; # true value ==> e.g. 'January' rather than 'jan' # hope to avoid problems of non-ascii names (e.g. author), # by using e.g. \w rather than [a-zA-Z] and having a utf-8 locale; # the actual language/country makes little/no difference use POSIX 'locale_h'; setlocale(LC_ALL, 'en_GB.UTF-8'); ## Get non-comment input into a single line, then split into bibtex entries. $in = ''; while (<>) { if (!m/^[\t ]*$/ && !m/^[\t ]*[%#]/) { s/^[\t ]*@/REAL@/g; $in .= $_; } } $in =~ s/\s+/ /g; @entries = split('REAL@', $in); shift(@entries); ## Entry by entry, do processing: separate fields, do `correcting', write bibtex file. $nent = 0; for (@entries) { %fields = (); printf "\n## Processing Bibtex entry %d\n", ++$nent; # remove top (e.g. '@Article{lab:el,') and bottom (e.g. '}') s/ *(\w+) *{ *([^,]+?),//; $type = "\L$1"; $label_orig = $2; s/}[ ,]*$//; # split into fields (e.g. 'title = {The Title},' or 'title="The Title",') s/ *(\w+) *= *([{"])/\n$1 = $2/g; @lines = split('\n'); shift(@lines); for (@lines) { if (m/^ *(\w+) *= *[{"] *(.*?) *[}"][, ]*$/) { $fieldname = "\L$1"; $content = $2; $fields{$fieldname} = $content; } else { print "couldn't match field with \"name = {value}\" pattern\n"; print "$_\n"; } } # split authors or editors (on ' and ', but some sources may use ';') # and try to format names in 'Surname, Rest' format $label = 'Nnn'; for $authedit ( ('author', 'editor') ) { if (exists($fields{$authedit})) { $fields{$authedit} =~ s/( *; *| +and +)/\n/g; @people = split( '\n', $fields{$authedit} ); for $person (@people) { if ( !($person =~ m/,/) ) { # 'First Last' -> 'Last, First' $person =~ s/^(.*?) (([[:lower:]]+ )?\w+)\W*$/$2, $1/; } $person =~ s/\.([^ ,])/. $1/g; # space the initials if ($person && $label=~m/^Nnn$/) { $_ = $person; # the following is a bodge to avoid (some) non-ascii troubles s/Ä/A/g;s/Æ/A/g;s/Ö/O/g;s/Ø/O/g;s/Å/A/g;s/Ü/U/g;s/É/E/g; s/ä/a/g;s/æ/a/g;s/ö/o/g;s/ø/o/g;s/å/a/g;s/ü/u/g;s/é/e/g; # clean up before parsing s/-/ /g; # treat e.g. 'Espino-Cortes' similarly to 'Van Brunt', for label s/[^\w ,.]//g; # prevent e.g. 'G\{ae}..' from giving a label of 'G\{' if (m/^ *([[:upper:]])\w+ +([[:upper:]])\w+ *,/) { # name is e.g. 'Van Brunt' or 'Espino-Cortes': take VB or EC $label = $1 . $2; } elsif (m/^[[:lower:] ]*([[:upper:]]\w{0,2})\w* *,/) { # name is e.g. 'Smith' or 'van der Laan': take Smi or Laa $label = $1; } } } $fields{$authedit} = join(' and ', @people); } } # check for presence of a year $label .= 'XX'; if (!exists($fields{'year'})) { print "missing year\n"; } else { if ($fields{'year'} =~ m/\d\d(\d\d)/) { $y2d = $1; $label =~ s/XX/$y2d/; } else { print "year doesn't contain a numeric year: " . "\"$fields{'year'}\"\n"; } } # let the optional `key' field override automatic label iff full person/year not found if ($fields{'key'} && $label=~m/(^Nnn|XX$)/) { printf "using supplied 'key' field '$fields{'key'}' instead of automatic '$label'\n"; $label = $fields{'key'}; } # printf "%d: '$label' replaces original label '$label_orig'\n", $nent; # make months into a standard format (I've chosen full name) if (exists($fields{'month'}) && $fields{'month'} =~ m/\W*([A-Za-z]{3}).*/) { $fields{'month'} = "\L$1"; if ($fields{'month'} =~ /[a-z]{3}/) { if ($month_full) { %months = ( qw( jan January feb February mar March apr April may May jun June jul July aug August sep September oct October nov November dec December ) ); $fields{'month'} = $months{$fields{'month'}}; } } else { printf "month '%s' doesn't match expected format\n", $fields{'month'}; } } # clean page-numbering: hope to keep 'n,n-n,n' ranges, but # remove spaces and the frequent ieeexploit 'vol.N', etc. if (exists($fields{'pages'})) { if ($fields{'pages'} =~ m/^(\d+) *-+ *(\d+)$/) { $fields{'pages'} = "$1--$2"; } elsif ($fields{'pages'} =~ m/^(\d+) *-+ *(\d+) *vol(.|ume) *(\d+)$/) { $fields{'pages'} = "$1--$2"; if ($fields{'volume'}) { print "'volume' and 'pages' fields specify a volume\n"; } else { $fields{'volume'} = $4; } } else { printf "page description '%s' isn't the neat n-m format: beware\n", $fields{'pages'}; } } # protect likely desirable capitals in titles (e.g. 'PDs'->'{PDs}', 'SiC'->'{SiC}') if (exists($fields{'title'})) { $fields{'title'} =~ s/(^|[^\w{])([[:upper:]][[:alpha:]]*[[:upper:]]'?\w?)($|[^\w}])/$1\{$2\}$3/g; } # ieeexplore often bodges conference papers into journal-article entries; correct this if (exists($fields{'journal'})) { if ($fields{'journal'} =~ m/(Conference|Symposium|Colloquium|Seminar)/) { printf "changing reference type from $type to InProceedings;\n" . "and moving fieldname 'journal' to 'booktitle'\n"; $type = 'inproceedings'; $fields{'booktitle'} = $fields{'journal'}; delete $fields{'journal'}; } } # my colleagues don't like permutated titles such as 'Things and Stuff. Conference on': # so, try making them be in straight form; also, remove excessive extra information; # this is *very* hard to do, given the range of input from different sources -- # better, for known publications, is to replace the entire field when recognising # a particular string for $source ( ('journal', 'booktitle') ) { if ($fields{$source}) { $_ = $fields{$source}; s/\s+/ /g; # convert recognised sources to a standard form if (m/iee[^e]/i && m/proceedings/i) { if (m/science, measurement/i) { $_ = '{IEE} Proceedings -- Science, Measurement and Technology'; } } elsif (m/iet\W/i && m/science, measurement/i) { $_ = '{IET} Science, Measurement and Technology'; } elsif (m/ieee/i && m/transactions/i) { if (m/energy conversion/i) { $_ = '{IEEE} Transactions on Energy Conversion'; } elsif (m/industry applications/i) { $_ = '{IEEE} Transactions on Industry Applications'; } elsif (m/on electrical insulation/i) { $_ = '{IEEE} Transactions on Electrical Insulation'; } elsif (m/dielectrics and electrical insulation/i) { $_ = '{IEEE} Transactions on Dielectrics and Electrical Insulation'; } } elsif (m/international symposium/i) { if (m/electrical insulating materials/i) { $_ = 'International Symposium on Electrical Insulating Materials (ISEM)'; } elsif (m/electrical insulation/i) { $_ = 'International Symposium on Electrical Insulation (ISEI)'; } elsif (m/high voltage engineering/i) { $_ = 'International Symposium on High Voltage Engineering (ISH)'; } } elsif (m/international conference/i) { if (m/applications/i && m/dielectric materials/i) { $_ = 'International Conference on Properties and Applications of Dielectric Materials'; } elsif (m/conduction and breakdown/i && m/solid dielectrics/i) { $_ = 'International Conference on Conduction and Breakdown in Solid Dielectrics (ICSD)'; } elsif (m/solid dielectrics/i) { $_ = 'International Conference on Solid Dielectrics (ICSD)'; } elsif (m/coil winding tech/i) { $_ = 'Electrical Insulation Conference and ' . 'Electrical Manufacturing \& Coil Winding Technology Conference'; } } elsif (m/conference/i) { if (m/insulation/i && m/dielectric phenomena/i) { $_ = 'Conference on Electrical Insulation and Dielectric Phenomena (CEIDP)'; } } # basic cleaning up (ideally *not* done to already substituted names from the # above, but the nested 'if' structure there doesn't easily permit making # the following chunk happen iff no title matched if ($_ eq $fields{$source}) { s/\([^)]+\.[^)]*\)//g; # remove whole sentences in parentheses s/^(.*)[,.] ([[:upper:]][^,.]*) *$/$2 $1/; # try `depermutation' (see above comment) s/\[[^\]]*\]//g; # remove anything in [...] s/([^\\])&/$1\\\&/g; # protect bare ampersands s/ +/ /g; s/^ //; s/ $//; # clean spaces } # replace field if it was changed if ($_ ne $fields{$source}) { printf "re-writing '$source' field:\n"; printf " < %s\n", $fields{$source}; printf " > %s\n", $_; $fields{$source} = $_; } } } # check presence of fields; set order of writing fields @fieldlist_orig = sort keys %fields; @putafterreq = qw( month key ); @putatend = qw( note isbn issn doi keywords abstract ); # the various bibtex entry types, with required fields (`bibtexing', 1988) # keys are lowercased names; arrays are 'neat' names followed by required # fields; alternative required fields can be shown with a `|' %types = ( article => [ 'Article', 'author', 'title', 'journal', 'year' ], book => [ 'Book', 'author|editor', 'title', 'publisher', 'year' ], booklet => [ 'Booklet', 'title' ], conference => [ 'InProceedings', 'author', 'title', 'booktitle', 'year' ], inproceedings => [ 'InProceedings', 'author', 'title', 'booktitle', 'year' ], inbook => [ 'InBook', 'author|editor', 'title', 'chapter|pages', 'publisher', 'year' ], incollection => [ 'InBook', 'author', 'title', 'booktitle', 'publisher', 'year' ], manual => [ 'Manual', 'title' ], mastersthesis => [ 'MastersThesis', 'author', 'title', 'school', 'year' ], misc => [ 'Misc' ], phdthesis => [ 'PhdThesis', 'author', 'title', 'school', 'year' ], proceedings => [ 'Proceedings', 'title', 'year' ], techreport => [ 'TechReport', 'author', 'title', 'institution', 'year' ], unpublished => [ 'Unpublished', 'author', 'title', 'note'] ); # if entry type is recognised, check for required fields if (!$types{$type}) { printf "reference type '$type' is unrecognised:\n" . "no checking of fields will be done\n"; @fieldlist = @putafterreq; push @fieldlist, @fieldlist_orig; } else { @reqfields = @{$types{$type}}; $type = shift @reqfields; @fieldlist = (); for (@reqfields) { $alt = ''; if (m/\|/) { @reqalt = split('\|'); $alt = ' alternative'; } else { @reqalt = ($_); } $foundreq = ''; for (@reqalt) { if ($fields{$_}) { if ($foundreq) { print "$type: found '$_' but ALREADY had alternative field '$foundreq'\n"; } else { push @fieldlist, $_; $foundreq = $_; } } } if (!$foundreq) { print "$type: REQUIRED$alt field '[not?]$_' not found\n"; } } push @fieldlist, @putafterreq; push @fieldlist, @fieldlist_orig; } # make 'required' then 'putafterreq' fields come first @fieldlist_orig = @fieldlist; @fieldlist = (); %tmp = (); for (@fieldlist_orig) { if (!exists($tmp{$_})) { $tmp{$_} = 'exists'; push @fieldlist, $_; } } # make 'putatend' fields come at the end push @fieldlist, @putatend; @fieldlist_orig = reverse @fieldlist; @fieldlist = (); %tmp = (); for (@fieldlist_orig) { if (!exists($tmp{$_})) { $tmp{$_} = 'exists'; push @fieldlist, $_; } } @fieldlist = reverse @fieldlist; # find a filename that doesn't yet exist: start by trying NnnXX-format label $file = $label; while (-e $file) { $file_old = $file; if ($file =~ m/^(.*)\.(\d{3})/) { $file = sprintf "%s.%03d", $1, $2+1; } else { $file .= '.001'; } print "output file '$file_old' exists: trying '$file'\n"; } # write the record to its own file if ( !open(SEPF, ">$file") ) { print "error opening $file for writing: giving up on reference $label\n"; next; } print SEPF "\@$type\{$label,\n"; for $key (@fieldlist) { if ($fields{$key}) { print SEPF "$key = {$fields{$key}},\n"; } } print SEPF "}\n\n"; close(SEPF); } printf "\n\n";