Here is a perl program that queries the PubMed site and adds it to a BibTeX database. It would be simple enough to add a DBI backend. I also found the source code above turgid. Using better data structures, the following program I wrote several years ago weighs in at 427 lines. If I was rewriting it these days, I would probably add a Perl/Tk interface.
#!/usr/bin/perl -w
#
# mk_pm2bib.pl - save pubmed searches to a bibtex database
#
# This is a heavy reworking of a script originally written by Dave
# Blake and modified by Katrin Schenk.
use strict;
use Getopt::Std;
use Text::Wrap;
use vars qw|$opt_u $opt_a $store_abstract $ans $made_backup|;
getopts('au') or die "bad options!";
# -u - display usage
# -a - store abstract
usage() if $opt_u;
$store_abstract = 1 if $opt_a;
$made_backup = 0;
$Text::Wrap::columns = 74;
my $bib_entries = {};
my $bibtex_file = "";
if (($ans = get_action()) eq 'o') { # append to an existing database
print "Which bibtex file would you like to add entries to? ";
$bibtex_file = <STDIN>;
chomp $bibtex_file;
print "Reading in file $bibtex_file...";
$bib_entries = parse_bibtex( $bibtex_file);
print "done.\n";
}
elsif ($ans eq 'c') { # create a new database
print "Name of the new bibtex file to add entries to? ";
$bibtex_file = <STDIN>;
chomp $bibtex_file;
}
else { # quit
exit(0);
}
# main event loop: query, convert, add, store
while (1) {
my ($q_response, $data) = query_pubmed();
last if $q_response =~ /q/i;
my ($c_response, $pubmed_entries) = convert_pubmed( $data);
if (scalar keys %$pubmed_entries) {
print "Merging chosen entries into database...";
$bib_entries = add_entries( $bib_entries, $pubmed_entries);
print "done.\nSaving database to $bibtex_file...";
write_bibtex( $bibtex_file, $bib_entries);
print "done.\n";
}
last if $c_response =~ /q/i;
}
exit(0);
### Subroutines
sub usage
{
print STDOUT<<"EoF";
usage: mk_pm2bib.pl [-u] [-a]
-u display usage
-a store abtract, if any, in bibtex file
This script allows users to search the PubMed database, screen the
results, and add desired entries into a new or pre-existing bibtex
file. When choosing to add entries to a pre-existing bibtex file this
script reads in this file and parses it such that duplicates are not
allowed and bibtex handles are unique.
EoF
exit(0);
}
# Prompt user for mode of use
sub get_action
{
my $append;
do {
print STDOUT <<"EoF";
What would you like to do?
(o) Open an existing bibtex file.
(c) Create a new bibtex file.
(q))Quit.
EoF
print "Choice: ";
$append = <STDIN>;
} while ($append !~ /([ocq])/);
return $1;
}
# Read in and parse bibtex file. This parser would fail on a general
# bibtex file, but is good enough for our machine generated entries.
# For entries with identical keys, only last one seen is saved.
sub parse_bibtex
{
my $bibtex_file = shift;
my $parse = {};
my $key = "initial";
open( BIBTEX, "<$bibtex_file")
or die "Can't open $bibtex_file for reading: $!\n";
while (<BIBTEX>) {
next if /^\s*$/;
if (/^\@preamble\s*([({]?)(.*?)([)}]?)\s*$/i) { # @preamble
my $open = $1;
my $value = defined $2 ? $2 : "";
my $close = defined $3 ? $3 : "";
$parse->{preamble} = $value;
until ( ($open eq '(' && $close eq ')')
|| ($open eq '{' && $close eq '}'))
{
my $line = <BIBTEX>;
die "Syntax error, runaway preamble: $parse->{preamble}"
if $line =~ /^\@/;
$line =~ /^(.*?)([)}]?)\s*$/;
$value = defined $1 ? $1 : "";
$close = defined $2 ? $2 : "";
$parse->{preamble} .= "\n$value";
}
}
elsif (/^\@string\s*([({])\s*(\w+)\s*=\s*(["{]?)(.*?)(["}]?)([)}
+])\s*$/i)
{ # assume a single line string entry
my $field = $2;
my $value = defined $4 ? $4 : "";
$parse->{strings}{$field} = $value;
}
elsif (/^\@comment\s*\{(.*?)(\}?)\s*$/i) { # @comment{...}
my $value = defined $1 ? $1 : "";
my $close = defined $2 ? $2 : "";
$parse->{comment}{$key} = $value;
until ($close eq '}') {
my $line = <BIBTEX>;
die "Syntax error, runaway comment: $parse->{comment}{$key
+}"
if $line =~ /^\@/;
$line =~ /^(.*?)(\}?)\s*$/;
$value = defined $1 ? $1 : "";
$close = defined $2 ? $2 : "";
$parse->{comment}{$key} .= "\n$value";
}
}
elsif (/^\@(\w+)\s*\{\s*(.*),$/) { # start of regular entry
$key = $2;
delete $parse->{$key} if $parse->{$key}{entry_type};
$parse->{$key}{entry_type} = $1;
}
elsif (/^\s*(\w+)\s*=\s*(["{]?)(.*?)(["}]?,?)\s*$/) { # field =
+value
my $field = $1;
my $open = defined $2 ? $2 : "";
my $value = defined $3 ? $3 : "";
my $close = defined $4 ? $4 : "";
$parse->{$key}{$field} = $value;
until ( ($open eq '"' && $close =~ /^",?$/)
|| ($open eq '{' && $close =~ /^\},?$/)
|| ($open eq '' && $close =~ /^,?$/) )
{
my $line = <BIBTEX>;
die "Syntax error, runaway field: $field = $parse->{$key}{
+$field}"
if $line =~ /^\@/;
$line =~ /^\s*(.*?)(["}]?,?)\s*$/;
$value = defined $1 ? $1 : "";
$close = defined $2 ? $2 : "";
$parse->{$key}{$field} .= " $value";
}
$parse->{$key}{$field} =~ s/^\s*(.+?)\s*$/$1/s;
$parse->{$key}{$field} =~ s/\s+/ /gs;
}
elsif (/^\s*\}\s*$/) { # end of entry
# do nothing
}
else { # safety check
die "Could not parse the line\n $_";
}
}
close BIBTEX;
return $parse;
}
# query Pubmed and return raw output.
sub query_pubmed
{
my $search_string = 'wget -O - "http://www.ncbi.nlm.nih.gov:80/entr
+ez/' .
'query.fcgi?cmd=Search&db=PubMed&dispmax=100&doptcmdl=MEDLINE&te
+rm=';
my $terms = "";
print STDOUT<<"EoF";
ENTER SEARCH CRITERIA: A search string is just search terms separated
by spaces. Authors are formatted like "doupe aj". Keyword search is by
default in the whole entry. To search for a keyword in just a title,
qualify it with the location in brackets, e.g., spectrum[TITLE]. Note
that "and" is implictly assumed between search terms.
Example 1: Search for papers by both A. J. Doupe and T. Troyer
Enter: doupe aj troyer t
Example 2: Search for papers by both A. J. Doupe and T. Troyer wit
+h
the "birdsong" somewhere in the abstract and/or title.
Enter: doupe aj troyer t birdsong
Example 3: Search for papers with "zebra" in the title only.
Enter: zebra[TI]
You may also quit by entering a single 'q' and carraige return.
EoF
# Read in search terms and convert spaces to URL encoding
do {
print "Search: ";
$terms = <STDIN>;
} while ($terms =~ /^\s*$/);
return ( "q", "") if $terms =~ /^q$/i;
$terms =~ s/^\s*(.+?)\s*$/$1/s; # trim space
$terms =~ s/\s+/\%20/g; # encode spaces
# Add terms to search string, ignore stderr output
$search_string .= "$terms \" 2>/dev/null";
# Execute wget command and capture output
print "Getting info from Pubmed...";
my @output = `$search_string`;
print "done.\n";
return ( "yes", \@output);
}
# convert raw pubmed output to bibtex format. %map shows what to extra
+ct.
sub convert_pubmed
{
my $data = shift;
my $entries = {};
my $i = 0; # line index
my $pm = {}; # pubmed entry info
my %map = (AB => "abstract", TI => "title", AU => "author", TA => "
+journal",
VI => "volume", IP => "number", DP => "year", PG => "pag
+es");
my $choices = join '|', keys %map;
my $answer = "";
my $key;
# extract entries from raw text
MAINLOOP:
while ($i < @$data) {
next unless $data->[$i++] =~ /^PMID-\s*(\d+)\s*$/; # start of en
+try
my $pmid = $1;
while ($i < @$data) {
if ($data->[$i] =~ /^($choices)\s*-\s*(.+?)\s*$/) {
my $type = $map{$1};
if ($type eq "author") { # could have multiple authors
push @{$pm->{$pmid}{author}}, $2;
$i++;
}
else { # single field, possibly multiple lines
$pm->{$pmid}{$type} = $2;
while ($data->[++$i] =~ /^\s+( .+)$/) {
$pm->{$pmid}{$type} .= $1;
}
}
}
elsif ($data->[$i] =~ /^SO/) { # end of entry
$answer = entry_wanted( $pm, $pmid);
delete $pm->{$pmid}, last MAINLOOP if $answer =~ /q|n/;
delete $pm->{$pmid} unless $answer == 1;
$i++;
last;
}
else {
$i++;
}
}
}
# convert the selected entries to bibtex format
foreach my $pmid (keys %$pm) {
# create a bibtex key
my $auth = $pm->{$pmid}{author};
$pm->{$pmid}{year} =~ /((19|20)\d\d)/;
my $year = $1;
if ( @{$auth} == 1) { # author0:year
$auth->[0] =~ /^(\w+)/;
$key = lc "$1:$year";
}
elsif ( @{$auth} == 2) { # author0_author1:year
$auth->[0] =~ /^(\w+)/;
$key = lc "$1_";
$auth->[1] =~ /^(\w+)/;
$key .= lc "$1:$year";
}
else { # author0_etal:year
$auth->[0] =~ /^(\w+)/;
$key = lc "$1_etal:$year";
}
# populate bibtex entry - assume an article XXX
$entries->{$key}{entry_type} = "article";
@$auth = map {my ($sur, $ini) = /^(.+?)\s+(\w+)$/;
$ini =~ s/(\w)/$1./g; "$ini $sur"; } @$auth;
$entries->{$key}{author} = join " and ", @$auth;
$entries->{$key}{author} =~ s/^\s*(.+?)\s*$/$1/s;
$entries->{$key}{author} =~ s/\s+/ /gs;
foreach my $field (values %map) {
next if $field eq "author";
next unless exists $pm->{$pmid}{$field};
my $value = $pm->{$pmid}{$field};
$value =~ s/"/"/g; # decode HTML escapes
$value =~ s/>/>/g;
$value =~ s/</</g;
$value =~ s/^\s*(.+?)\s*$/$1/s; # remove extra space
$value =~ s/\s+/ /gs;
$value =~ s/\.$//s if $field eq "title";
$entries->{$key}{$field} = $value;
}
}
return ($answer, $entries);
}
# Does user want to add entry, skip, new search, quit?
sub entry_wanted
{
my ($pm, $pmid) = @_;
my $authors = join " ", @{$pm->{$pmid}{author}};
my $answer = "";
do {
print "\n";
print wrap( "", " " x 10, "Authors: $authors ($pm->{$pmid}{year}
+).\n");
print wrap( "", " " x 8, "Title: $pm->{$pmid}{title}\n");
print wrap( "", " " x 12, "Reference: $pm->{$pmid}{journal} ($p
+m->{$pmid}
{volume}):$pm->{$pmid}{pages}.\n");
print wrap( "", " " x 10, "Abstract: $pm->{$pmid}{abstract}\n")
if exists $pm->{$pmid}{abstract};
print STDOUT<<"EoF2";
Now what?
(a) Aadd entry to bibtex database
(s) Skip entry.
(n) New search.
(q) Quit.
EoF2
print "Choice: ";
$answer = <STDIN>;
} while ($answer !~ /[asqn]/i);
return $answer if $answer =~ /q|n/i;
return 1 if $answer =~ /a/i;
return 0;
}
# merge new entries into existing bib database
sub add_entries
{
my ($old, $new) = @_;
new_key:
foreach my $new_key (keys %$new) {
my @close_keys = grep {/^$new_key[a-z]?$/} keys %$old;
unless (@close_keys) { # no collision, just add it
$old->{$new_key} = $new->{$new_key};
}
else { # collision, see if it is a duplicate
close_key:
foreach my $c_key ( @close_keys) {
foreach my $field (keys %{$old->{$c_key}}) {
next close_key unless exists $new->{$new_key}{$field}
&& $new->{$new_key}{$field} eq $old->{$c_key}{$field
+};
}
next new_key; # found a duplicate, skip it
} # all @close_keys entries are different, add new key
my $uniq_key = $new_key . chr(ord('a') + @close_keys - 1);
$old->{$uniq_key} = $new->{$new_key};
}
}
return $old;
}
# Parse bibtex format and write out to file.
sub write_bibtex
{
my ($bibtex_file, $bib) = @_;
my @format = qw|author title journal volume number pages year|;
push @format, "abstract" if $store_abstract;
rename $bibtex_file, "$bibtex_file.bak" unless $made_backup;
$made_backup = 1;
unless (open BIBOUT, ">$bibtex_file") {
use File::Copy; # be conservative
copy( "$bibtex_file.bak", $bibtex_file);
die "Could not open $bibtex_file for writing: $!";
}
print BIBOUT "\@comment\{$bib->{comment}{initial}\}\n\n"
if exists $bib->{comment}{initial};
print BIBOUT "\@preamble\{$bib->{preamble}\}\n\n"
if exists $bib->{preamble};
if (exists $bib->{strings}) {
foreach my $field (keys %{$bib->{strings}}) {
print BIBOUT "\@string\{$field = \"$bib->{strings}{$field}\"\
+}\n\n";
}
}
foreach my $key (sort keys %$bib) {
next if $key =~ /preamble|strings|comment/; # treated elsewhere
print BIBOUT "\@$bib->{$key}{entry_type}\{ $key,\n";
foreach my $field (@format) {
next unless exists $bib->{$key}{$field};
my $text = $bib->{$key}{$field};
$text = "$field = " .($text =~ /^\d+$/ ? "$text," : "\"$text\
+",");
print BIBOUT wrap( " ", " " x (7 + length $field), "$text\n
+");
}
print BIBOUT "\}\n\n";
print BIBOUT "\@comment\{$bib->{comment}{$key}\}\n\n"
if exists $bib->{comment}{$key};
}
close BIBOUT;
}