Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

CiteULike tools

by eweaverp (Scribe)
on Dec 08, 2005 at 08:46 UTC ( #515163=sourcecode: print w/replies, xml ) Need Help??
Category: Web Stuff
Author/Contact Info
Description:

bib_canonicalizer.pl - takes a citeulike url or a .bib file and searches it against the Collection of Computer Science Bibliographies, outputting a canonicalized file with all available fields filled. It does not clobber your citeulike URLs or tags.

citeulike_pdf_grabber.pl - downloads and caches all .pdfs it can find from your citeulike account (including private) and spools them to a printer. Will not print duplicates even over multiple runs as long as you don't delete the cache folder. It's good for printing only the things you have most recently added. Outputs a "missing.html" file with links to the citeulike articles it could not find a .pdf for. You will probably have to customize some of the regexes for the databases you use the most.

################################### bib_canonicalizer.pl
#!/alruccabah/local/usr/bin/perl
# bib_canonicalizer.pl -- canonicalize a .bib file (or a citeulike BibTeX
# export) against the Collection of Computer Science Bibliographies.

use strict;
use warnings;
#use diagnostics;

# citeulike account name; an input argument beginning with this string is
# treated as a citeulike user/tag path rather than a local file (see the
# dispatch below).
my $username = "whatever";

# Search endpoint for the Collection of Computer Science Bibliographies.
my $base = "http://liinwww.ira.uka.de";
my $url = "$base/csbib";

# Base URL of citeulike's per-user BibTeX export.
my $citeulike = "http://www.citeulike.org/bibtex/user/";

# Usage: bib_canonicalizer.pl <citeulike-path-or-.bib-file> [output.bib]
my $input = $ARGV[0];
my $output = $ARGV[1];

die("Specify your citeulike username/tag path or an input .bib file.\n") unless ($input);

use Text::BibTeX qw(:macrosubs);
use LWP;
use HTML::Strip;
use URI::Escape;
use String::Compare;

my $agent = LWP::UserAgent->new();
my $stripper = HTML::Strip->new();

# Register the three-letter month abbreviations in Text::BibTeX's macro
# table so entries using e.g. "month = jan" expand to the full month name.
my %month = (jan => "January", feb => "February", mar => "March",
     apr => "April", may => "May", jun => "June", jul => "July", 
     aug => "August", sep => "September", oct => "October",
     nov => "November", dec => "December");
my ($macro, $value);
add_macro_text($macro, $value) while (($macro, $value) = each %month);

#my $Input = new Text::BibTeX::File($input);

# Raw entry texts, one element per BibTeX entry (split on "\n@" below).
my @Input;

# Acquire the list of raw BibTeX entry texts, either from citeulike over
# HTTP or from a local .bib file named on the command line.
if ($input =~ m/^$username/) {
  # Input names a citeulike user/tag path: fetch the BibTeX export.
  print "Trying to retrieve from $citeulike$input.\n";
  # NOTE(review): LWP::UserAgent::get() takes header key/value pairs after
  # the URL, not an arrayref of query parameters -- these two options are
  # most likely ignored by LWP; confirm against the citeulike export URL.
  my $response = $agent->get($citeulike . $input,
      [do_username_prefix => 0,
       key_type => 0]
  );

  # Keep a local copy of the raw download for debugging/re-runs.
  open(TMP, ">", "tmp.bib") or die "Cannot write tmp.bib: $!\n";
  print TMP $response->content;
  close(TMP);

  # One element per entry; entries start with "@" at the beginning of a line.
  @Input = split(/\n\@/, $response->content);

  # Derive an output name from the last path component if none was given.
  if (!$output) {
    ($output) = $input =~ m/.*\/(.*)/;
    $output .= ".bib";
  }
} else {
  # Input is a local .bib file; an explicit output name is then required.
  die "Specify an output file when reading a local .bib file.\n" unless ($output);

  local $/ = undef;  # slurp mode: read the whole file in one go
  open(INPUT, "<", $input) or die "Cannot read $input: $!\n";
  @Input = split(/\n\@/, <INPUT>);
  close(INPUT);
}

# Whatever precedes the first "@" is preamble, not an entry.
shift(@Input);

# Truncate the output file up front; the main loop reopens it in append
# mode for each entry it writes.
open(OUT, ">", $output) or die "Cannot write $output: $!\n";

#my $Output = new Text::BibTeX::File(">$output");

my $entry_text;
my $pass = 0;
my $entry;
my ($failed, $total) = (0, 0);

# For each entry: try up to three lookup passes against the bibliography
# search, accept a hit only when the returned title is similar enough,
# then merge in any fields the original entry had that the hit lacks.
foreach $entry_text (@Input) {
  #print "$entry_text\n";
    #exit();

     # Re-prefix the "@" that split() consumed so the entry parses.
     $entry = new Text::BibTeX::Entry("@" . $entry_text);
    next unless $entry->parse_ok; 
    
    # NOTE(review): OUT is closed here and reopened in append mode at the
    # bottom of the loop -- presumably so partial output survives a crash
    # mid-run; both the close and the reopen are unchecked.
    close(OUT);

    # Build a space-joined string of all author last names.
    my @names = $entry->names("author");
    my $authors = "";

    foreach (@names) {
      my ($tmp_author) = join(" ", $_->part("last"));
        $authors .= " " . $tmp_author;
    }

    my ($title) = $entry->get("title");
    my ($year) = $entry->get("year");

    my $new_entry;
    
    # Set once a lookup result's title matches the original closely enough.
    my $continue = 0;

    # Three passes with progressively different query terms:
    # 0: title+authors, 1: title+year, 2: authors+year.
    do {
    
        if ($pass == 0) {
          $new_entry = lookup_single($title, $authors);
        } elsif ($pass == 1) {
          $new_entry = lookup_single($title, $year);
        } elsif ($pass == 2) {
          $new_entry = lookup_single($authors, $year);
        }

        $pass++;

        print "  Pass $pass.\n";
        
        if ($new_entry) {
        
          # lookup_single() returns raw BibTeX text; parse it.
          $new_entry = new Text::BibTeX::Entry($new_entry);

            # Drop crossref so the entry stands alone in the output file.
            $new_entry->delete(["crossref"]);

            # Accept the hit only if the titles are reasonably similar
            # (String::Compare score above 0.5).
            my $new_title = $new_entry->get("title");
            if (compare($title, $new_title) > .5) {
              $continue = 1;
            }
    }
    } while ($pass < 3 && !$continue);
    
    # No acceptable match after three passes: keep the original entry.
    if (!$continue) {
        print "  Match fail.\n";
           $new_entry = $entry;
            $failed++;
    }
    
    # Record which fields the (possibly new) entry already has...
    my %new_fieldlist;
    my @field_array = $new_entry->fieldlist;
    foreach (@field_array) {
      $new_fieldlist{$_} = 1;
    }
    
    # ...and copy over any fields only the original had (this is what
    # preserves citeulike URLs and tags, per the description above).
    foreach ($entry->fieldlist) {
      if (!exists($new_fieldlist{$_})) { # && !($_ =~ m/url/)
          $new_entry->set($_, $entry->get($_));
      }
    }
    
    # Append the finished entry to the output file (unchecked open).
    open(OUT, ">>$output");
    print OUT $new_entry->print_s;
    print OUT "\n\n";
    
    print "Done with $title, $authors.\n\n";

    # Reset the pass counter for the next entry.
    $pass = 0;
    $total++;

}

close (OUT);

print "Ran $total entries; $failed failed.\n";

# lookup_single: query the bibliography search with two strings and return
# the BibTeX text of the top-ranked hit, or "" when nothing usable came back.
# If the second string is a bare four-digit number it is sent as a year
# filter instead of being folded into the query text.
# Uses the file-level globals $agent, $url, $base and $stripper.
sub lookup_single {
    my ($string_left, $string_right) = @_;

    $string_right = "" unless defined $string_right;

    # Decide whether the right-hand term is a year filter or query text.
    my ($query, $year);
    if ($string_right =~ m/^\d\d\d\d$/) {
        ($query, $year) = ($string_left, $string_right);
    } else {
        ($query, $year) = ("$string_left $string_right", "");
    }

    # Clean the query: drop BibTeX braces and any trailing period.
    $query =~ s/\{|\}//mg;
    $query =~ s/\.$//mg;

    my $response = $agent->post($url,
        [ maxnum  => 10,
          query   => $query,
          results => "citation",
          sort    => "score",
          year    => $year
        ]
    );

    # A result page always carries accesskey attributes; no accesskey
    # means the search found nothing.
    unless ($response->content() =~ m/accesskey\=/) {
        print "No accesskey for $string_left, $string_right.\n";
        return "";
    }

    # Pull the bibshow link of the first (accesskey="1") result.
    my ($bibtex_url) = $response->content() =~ m/accesskey\=\"1\".*?\"biblinks\".*?\"(\/cgi-bin\/bibshow.*?)\"/sg;

    if (!defined($bibtex_url)) {
        print "Bibtex URL error on $string_left, $string_right.\n";
        return "";
    }

    $bibtex_url =~ s/\&amp\;/&/gmi; #stupid html escaping

    # Fetch the detail page and cut the BibTeX record out of its <pre> tag.
    $response = $agent->get(uri_unescape("$base$bibtex_url"));
    my ($html) = $response->content() =~ m/.*?<pre class=\"bibtex\">(.*?)<\/pre>/sg;

    if (!defined($html)) {
        print "<pre> tag parse error on $string_left, $string_right.\n";
        return "";
    }

    # Strip markup, leaving plain BibTeX text for the caller to parse.
    return $stripper->parse($html);
}


#sub value_to_string {
#  my $string = "";
#    my $value = shift;
#    my @all_values = $value->values;
#    my $simpleval;
#    foreach $simpleval (@all_values) {
#      $string .= $simpleval->text . " ";
#    }
#}



################################### citeulike_pdf_grabber.pl
#!/alruccabah/local/usr/bin/perl
# citeulike_pdf_grabber.pl -- download/cache the .pdfs linked from a
# citeulike account and optionally spool new ones to a printer.

use warnings;
use strict;

use WWW::Mechanize;

print "Starting up.\n";

# Account and print-queue configuration.
my $username = "eweaver";
my $printer = "cis5";

# First argument is the citeulike password (required).
my $password;
if (defined($ARGV[0])) {
  $password = $ARGV[0];
} else {
  die ("You need to supply a password, and optionally, the --print flag.\n");
}

# Downloads are saved as "*.pdf~" by default; with --print they are saved
# as bare ".pdf", which is what marks a file as already printed on later runs.
my $extension = "pdf~";
my $print = 0;
if (defined($ARGV[1]) && $ARGV[1] eq "--print") {
  print "Printing enabled!\n";
    sleep(5);  # grace period to abort before paper starts flowing
  $print = 1;
    $extension = "pdf";
}


my $home = "http://www.citeulike.org/";
my $base = "$home/user/$username/";
my $mech = WWW::Mechanize->new(autocheck => 1);
my $print_cache = "/usa/$username/tmp/pdf_cache/";

# Create the cache directory BEFORE chdir'ing into it. The original code
# chdir'ed first, so on a fresh machine the (silently failing) chdir left
# the script writing missing.html and the .pdfs into the wrong directory.
print "Creating print cache...\n";
if (! -e $print_cache) {
    mkdir($print_cache) or die "Cannot create $print_cache: $!\n";
} else {
  print "Exists.\n";
}
chdir($print_cache) or die "Cannot chdir to $print_cache: $!\n";

# Truncate the report of articles we fail to find a .pdf for; the main
# loop below reopens it in append mode per miss.
my $missing = "missing.html";
open(MISSING, ">", $missing) or die "Cannot write $missing: $!\n";
close(MISSING);

#$mech->cookie_jar(HTTP::Cookies->new);

# Log in: open the user page, follow "Log in", and fill the two visible
# fields of the form named "frm" with username and password.
$mech->get($base);
$mech->follow_link( text => "Log in");

$mech->form_name("frm");
$mech->set_visible($username, $password);
$mech->submit();

print "We should be authenticated now.\n";

# Back to the user's own page (link text is the bare username).
$mech->follow_link( text => "$username");

print "At the base page.\n";

my @tag_links_all = $mech->links();
my @tag_links;

print "Searching for tags.\n";

# Collect the tag links. The tag list appears after the "RSS" link on the
# page, so skip everything until that link is seen, then keep every link
# whose URL contains "/tag/".
# NOTE(review): $_->text() can be undef for image-only links, which would
# warn on the eq comparison -- confirm against the live page markup.
my $found_rss = 0;
foreach (@tag_links_all) {
  if ($found_rss) {
      if ($_->url() =~ m/\/tag\//) {
          push (@tag_links, $_);
            print "  Found tag " . $_->text() . ".\n";
        }
    } elsif ($_->text() eq "RSS") {
      $found_rss = 1;
    }
}
#my @tag_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/tag\//);

print "Found " . scalar(@tag_links) . " tags.\n";

my $tag_url;

# Count of articles for which no .pdf could be located.
my $miss = 0;

# For every tag: list its articles, and for each article not already in
# the cache try (1) a personal .pdf uploaded to citeulike, then (2) up to
# six hops through the external publisher site looking for a .pdf link.
foreach $tag_url (@tag_links) {
  # Tag name is the last path component of the tag URL.
  my ($tag) = $tag_url->url() =~ m/.*\/(.+)$/;
    
  print "  Fetching $tag.\n";
    
    $mech->get($base);
    $mech->get($tag_url);
    
    # Article links look like .../$username/article/<digits>.
    my @cite_links = $mech->find_all_links( url_abs_regex => qr/\/$username\/article\/\d+$/ );
  print "  Found " . scalar(@cite_links) . " citations.\n";
        
  my $cite_url;
    
    foreach $cite_url (@cite_links) {
      print "    Looking for article at: " . $cite_url->url() . "\n";
      $mech->get($base);
        $mech->get($cite_url);
        if (!$mech->success()) {
          print "    Not found.\n\n";
            next;
        }
        
        # Page title doubles as the cache filename, minus the site prefix.
        my $title = $mech->title();
        $title =~ s/CiteULike\: //;
        
        # Number of "promising" links followed without landing on a .pdf;
        # reported in missing.html so the user can check by hand.
        my $partial_hit_flag = 0;
        
        print "    Fetched article $title.\n";
        # Cached under either extension (printed ".pdf" or unprinted ".pdf~")
        # counts as already downloaded.
        if (-e "$title.pdf" || -e "$title.$extension") {
          print "    Article exists in print cache.\n";
        } else {
          print "    Retrieving article.\n";
            # First choice: a .pdf the user uploaded to citeulike directly.
            my $link = $mech->find_link( url_abs_regex => qr/\/pdf\/user\/$username\/.*\.pdf/i);
            if ($link) {
              $mech->get($link);
              print "      Found a personal .pdf at " . $link->url() . ".\n";
                $mech->save_content("$title.$extension");
                print "      Saved it.\n";
          } else {
              print "      Trying to follow an external link.\n";
#              $mech->get($cite_url); # reset success() flag

                # Descend through at most six pages looking for a .pdf link.
                for (my $i = 0; $i < 6; $i++) {
                    $link = $mech->find_link( url_regex => qr/\.pdf\s*$/ );
                    if ($link) {
                      print "        I found a direct link.\n";
                        $mech->get($link);
                        $mech->save_content("$title.$extension");
                      print "        Saved it.\n";
                        last;
                    } else {
                      print "        Descending URL tree ($i).\n";
                        
                        # Link texts that typically lead toward the full
                        # text, in order of preference; customize per the
                        # databases you use (see description above).
                        my @regexs = ( "View article online",
                                                     "PDF",
                                                     "Full text",
                                                     "here" );
                        
                      #my $link = $mech->find_link( text => "View article online" );
                        #if (!$link) {
                          for (my $j = 0; $j < scalar(@regexs); $j++) {
                              $link = $mech->find_link(  text_regex => qr/$regexs[$j]/i);
                                if ($link) {
                                  print "        Found a \"" . $regexs[$j] . "\" link.\n";
                                  $mech->get($link);
                                    $partial_hit_flag++;
                                    last;
                                }
                          }
                        #} else {
                        #  print "        Found \"View article online\" link.\n";
                        #    $mech->get($link);
                        #}
                    }
                } #while ($link);
            }
            # Still nothing on disk: log the article to missing.html.
            if (!-e "$title.pdf" && !-e "$title.$extension")    {
              print  "  Couldn't find any match for:\n    $title\n    " . $cite_url->url() . "    \n";
              $miss++;
                open(MISSING, ">>$missing");
                print MISSING "<p>";
                if ($partial_hit_flag > 0) {
                    print MISSING "<b>Please check following (partial hits $partial_hit_flag):</b><br>\n";
                }
                print MISSING "<a href=\"" . $home . $cite_url->url() . "\">$title</a></p>\n";
                close(MISSING);
            } else {
              if ($print) {
                  print "  Printing $title.\n";
                    # NOTE(review): $title comes from a remote page title and
                    # is interpolated into shell commands below -- a title
                    # containing quotes/metacharacters is a shell-injection
                    # risk; list-form system() would avoid the shell.
                    if (-e "tmp.ps") {
                        system("rm \"tmp.ps\"");
                    }
                    # Replace a stale unprinted copy with the printable one.
                    if (-e "$title.pdf~") {
                      print "  Removed non-printed cache file.\n";
                        system("rm \"$title.pdf~\"");
                    }
                    # Convert pages 1-40 to PostScript, then spool to $printer.
                    system("acroread -toPostScript -start 1 -end 40 -pairs \"$title.pdf\" \"tmp.ps\"");
                    print "  Converted to .ps.\n";
                    sleep(3);
                    system("lpr -P$printer \"tmp.ps\"");
                }
            }
        }
    print "\n";
    }
}                
            
print "Done. There were $miss articles I couldn't find.\n";



Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://515163]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others musing on the Monastery: (7)
As of 2019-12-11 12:08 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found

    Notices?