Recently, I wrote a short piece on compulsive card cataloging that, I must now admit, contained code that was flawed -- or at least didn't entirely please me. The reason was tied to the Library of Congress and to how easy (or not) it is to get data from it. I have since learned that there is a better way. In a recent blog entry by Jon Udell (see: http://weblog.infoworld.com/udell/2003/05/22.html), I noticed
After I get some details ironed out, I plan on updating the code I posted previously. In the meantime, for the curious, here is a test script:
#!/perl/bin/perl
#
# newISBN.pl -- ISBN to BibTex via webscraping...
use strict;
use warnings;
use diagnostics;
use LWP::Simple;
# One bibliographic record.  scrapeLOC() resets every value to '?' before
# filling in what it finds, so the same hash is reused for each ISBN.
my %bib = (
    author    => '?',
    title     => '?',
    publisher => '?',
    address   => '?',
    edition   => '?',
    year      => '?',
    ISBN      => '?',
);

# Library of Congress zclient query; the ISBN is appended as the search term.
# The URL must be a single unbroken string -- a line break (or a wrap marker
# such as a leading '+') inside the literal would be sent as part of the
# request.
my $LOC_QUERY =
      'http://lcweb.loc.gov/cgi-bin/zclient?host=z3950.loc.gov'
    . '&port=7090&attrset=BIB1&rtype=USMARC&DisplayRecordSyntax=HTML'
    . '&ESN=F&startrec=1&maxrecords=10&dbname=Voyager'
    . '&srchtype=1,7,2,3,3,1,4,1,5,1,6,1&term_term_1=';

# Read the EAN-13 bar codes listed after __DATA__ and convert each to an
# ISBN-10.  Lines that are not a 13-digit "978" EAN (blank lines, trailing
# text) are skipped, because only the 978 prefix has an ISBN-10 equivalent.
my @list;
while ( my $line = <DATA> ) {
    $line =~ s/\s+\z//;
    next unless $line =~ /\A978\d{10}\z/;
    push @list, ean2isbn($line);
}

# Fields are printed in this order, one per line, inside each @book entry.
my @bibtex_fields = qw(author title edition publisher address year ISBN);

for my $isbn (@list) {

    # get() returns undef on failure; scrapeLOC() copes with that and
    # leaves every field as '?'.
    my $webpage = get( $LOC_QUERY . $isbn );
    scrapeLOC( $webpage, \%bib );

    print "\@book{,\n";
    for my $field (@bibtex_fields) {

        # BibTeX requires a comma between fields (a trailing one is fine).
        print ' ', $field, '={', $bib{$field}, "},\n";
    }
    print "}\n\n";
}
# parsespan($begin, $end, $s) -- extract the text between two markers.
#
#   $begin, $end  markers delimiting the span.  Both are interpolated into
#                 the pattern as regular expressions, not literal strings,
#                 so callers must escape any metacharacters themselves.
#   $s            text to search (may span several lines)
#
# Returns the text found between $begin and $end, with newlines turned into
# spaces and runs of whitespace collapsed to a single space.  Leading and
# trailing single spaces are kept.  Returns '' when $s is undefined or the
# markers are not found.
sub parsespan {
    my ( $begin, $end, $s ) = @_;
    return '' unless defined $s;

    # Capture via list assignment so a failed match yields undef instead of
    # whatever an earlier, unrelated match left behind in $1.
    # NOTE(review): the quantifier is greedy, so when $end occurs more than
    # once the span runs to the last occurrence that is not preceded by
    # another $begin -- confirm that is intended.
    my ($span) = $s =~ /$begin((?:(?!$begin).)*)$end/ms;
    return '' unless defined $span;

    $span =~ s/\n/ /g;
    $span =~ s/\s\s+/ /g;
    return $span;
}
# scrapeLOC($webpage, $bib) -- fill %$bib from a Library of Congress result
# page.
#
#   $webpage  page text as returned by get(); may be undef or empty when the
#             fetch failed
#   $bib      hash reference.  Every existing key is first reset to '?';
#             then author, title, edition, ISBN, publisher, address and year
#             are overwritten with whatever could be extracted.
#
# The result is delivered through %$bib; nothing useful is returned.
#
# NOTE(review): the parsing assumes each record field starts a line as
# "Label:" and that the record ends at </PRE> -- inferred from the patterns
# used here; confirm against a live page.
sub scrapeLOC {
    my ( $webpage, $bib ) = @_;

    $bib->{$_} = '?' for keys %$bib;
    return unless $webpage;

    # Every "start of line up to the first colon" is treated as a label.
    my @labels = $webpage =~ /^(.*?:)/mg;
    return unless @labels;

    # Map each label to the label that follows it; the text of a field is
    # whatever lies between the two.  The last label runs to the end of the
    # <PRE> block.
    my %next_label;
    $next_label{ $labels[$_] } = $labels[ $_ + 1 ] for 0 .. $#labels - 1;
    $next_label{ $labels[-1] } = '</PRE>';

    # Return the raw text of one field, or undef if the label is absent.
    # The end marker comes from the scraped page, so it is escaped before
    # parsespan() interpolates it into a regular expression.
    my $span_after = sub {
        my ($label) = @_;
        return unless exists $next_label{$label};
        return parsespan( $label, quotemeta( $next_label{$label} ), $webpage );
    };

    my $author = $span_after->('Author:');
    if ( defined $author && $author =~ /\S/ ) {
        $bib->{'author'} = alltrim($author);
    }

    # The title line has the form "title / statement of responsibility".
    # When the second part is present it replaces the Author: field, with
    # commas turned into " and" for BibTeX.
    my $title_line = $span_after->('Title:');
    if ( defined $title_line ) {
        my ( $title, $responsible ) =
            map { alltrim($_) } split m{/}, $title_line;
        $bib->{'title'} = $title if defined $title && length $title;
        if ( defined $responsible && length $responsible ) {
            $responsible =~ s/,/ and/g;
            $responsible =~ s/\.$//;
            $bib->{'author'} = $responsible;
        }
    }

    # Keep only the first word of the edition statement (e.g. "2nd").
    # Trim first: the span normally starts with a space, which would
    # otherwise make the first word empty.
    my $edition = $span_after->('Edition:');
    if ( defined $edition ) {
        my ($ordinal) = alltrim($edition) =~ /^(\S+)/;
        $bib->{'edition'} = $ordinal if defined $ordinal;
    }

    # Nine digits followed by a check character that is a digit or X.
    # Fall back to the trimmed raw text when no such run is present.
    my $isbn = $span_after->('ISBN:');
    if ( defined $isbn ) {
        if ( $isbn =~ /(\d{9}[\dX])/ ) {
            $bib->{'ISBN'} = $1;
        }
        elsif ( $isbn =~ /\S/ ) {
            $bib->{'ISBN'} = alltrim($isbn);
        }
    }

    # Expected shape: "Place : Publisher, ... 1999".
    my $published = $span_after->('Published:');
    if ( defined $published ) {
        my ( $address, $publisher, $year ) =
            $published =~ /(.*?)\s:\s(.*?),.*?(\d{4})/;
        if ( defined $year ) {    # the match succeeded
            $address = alltrim($address);
            $bib->{'publisher'} = $publisher if length $publisher;
            $bib->{'address'}   = $address   if length $address;
            $bib->{'year'}      = $year;
        }
    }

    return;
}
# alltrim($text) -- return a copy of $text without leading or trailing
# whitespace.  The caller's variable is not modified.
sub alltrim {
    my ($text) = @_;

    # Alias $_ to the local copy so both substitutions act on it.
    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }
    return $text;
}
# ean2isbn($ean) -- convert an EAN-13 bar code to an ISBN-10.
#
# Drops the first three characters of the EAN, keeps the next nine as the
# ISBN body, and appends the check character computed by checkDigit().
sub ean2isbn {
    my ($ean) = @_;

    # Ten characters after the prefix: nine ISBN digits plus the EAN's own
    # check digit, which checkDigit() ignores.
    my $tail = substr( $ean, 3, 10 );
    my $body = substr( $tail, 0, 9 );
    return $body . checkDigit($tail);
}
# checkISBN($isbn) -- validate a 10-character ISBN.
#
# Returns a two-element list:
#   ( 0, '-' )     the string is shorter than 10 characters
#   ( 0, '+' )     the string is longer than 10 characters
#   ( $ok, $cd )   otherwise: $ok is 1 when the last character equals the
#                  check character $cd computed by checkDigit(), else 0
#
# The comparison ignores case, so a trailing lowercase "x" is accepted --
# consistent with checkDigit(), which upper-cases its input.
sub checkISBN {
    my $isbn = shift;
    $isbn = '' unless defined $isbn;

    my $n = length($isbn);
    if ( $n != 10 ) {
        return ( 0, ( $n < 10 ? '-' : '+' ) );
    }

    my $cd = checkDigit($isbn);
    my $last = uc( substr( $isbn, -1, 1 ) );
    return ( ( $cd eq $last ? 1 : 0 ), $cd );
}
# checkDigit($isbn) -- compute the ISBN-10 check character.
#
# Every character except the last is multiplied by a weight that starts at
# 10 and drops by one per position; the sum modulo 11 selects the check
# character from a lookup table ('X' stands for ten).  The last character of
# the input is ignored, so a full ISBN may be passed in.
sub checkDigit {
    my ($isbn) = @_;

    my @chars = split //, uc $isbn;
    pop @chars;    # the final character does not take part in the sum

    my $weight = 10;
    my $total  = 0;
    $total += $_ * $weight-- for @chars;

    # Indexed by $total % 11.
    my @check_for = qw(0 X 9 8 7 6 5 4 3 2 1);
    return $check_for[ $total % 11 ];
}
__DATA__
9780451458711
9780201185379
9780201489460
9780764545696
9780138482763
--hsm
"Never try to teach a pig to sing...it wastes your time and it annoys the pig."