Category: Web Stuff
Author/Contact Info: Bernt 'Bug' Budde
Description: Screen scrape kuro5hin.org. Downloads all your posted
comments to a directory. Generates a yaml file with definitions of all
the comments, suitable for other scripts to use for parsing the data
(two sketches follow the listing). Can be the first of a number of
scripts to, e.g., find mod-bombers. Tab size 4.
#!/usr/bin/perl
# k5download.pl
# This has the same license as Perl.

use strict;
use warnings;

# You probably need to install these modules.
# See the cpan command or your local variant, like apt-get for
# Debian based systems and PPM for Win. All of these should be
# in Debian Testing/Ubuntu.
use HTML::TokeParser;       # In HTML::Parser.
use WWW::Mechanize;
use Getopt::Long;
use Date::Manip;
use YAML;

# ------------------------------------------------
our $VERSION = '0.01';

use constant TIME_BETWEEN_PAGELOADS => 4;
use constant SCOOP_WEB_SITE         => "http://www.kuro5hin.org";

# ------------------------------------------------
# Help text:
my $usage = "
This program is used to screen scrape 'kuro5hin.org' for your comments.
You own your comments on K5 but not other people's comments, so this
just supports getting what you have written yourself.

It downloads the full html text, so do NOT use 'nested' for
'Comment display mode'! Nesting hurts usability when grep-ping
(searching) your saved comments, anyway.

Use:
    perl k5download.pl OPTIONS ...

Options are:
    -u username  Specify the username at kuro5hin.
    -p password  Password so the program can log in.
    --dump dir   Save all your comments in dir.
    -n no        Just get the 'no' last comments.
    -log         Log what's done on STDERR. (Simple debug.)
    --evil       Usually, the program waits a bit between every read
                 from K5. Not with this flag. (-: If you use it, I'll
                 laugh if rusty cancels your account. :-)
    --help       Shows this text.

'-p' and '--dump' are mandatory. If no user name is specified, the
login name will be used (-: that should work for rusty, at least :-).

Examples of use:
    Dump html of ALL posts to K5 in dir 'foo':
        perl k5download.pl -u bar -p XX --dump 'foo' --log
    Get html of the last 200 posts to K5 in a dir:
        perl k5download.pl -u bar -p XX --dump foo --log -n 200

To Do:
- Utility to parse out the html to text files, so they are easier
  to grep (search, for win people). Trivial.
- Utility that downloads new things you've written (and new comments
  and mods on old stuff) and updates without downloading everything!
- Utility that downloads modding and finds who modbombs you
  (Hello 'V' and 'jxg'!)
- Utility that parses the data to find sniper comments added much
  later, to get the last word (Hello, 'A Bore'!)
- Remove warning when downloading the comment list. :-)

Notes:
- The generated yaml file contains all the info an updater program
  would need -- and it is both human readable and not Perl specific.
  Have fun!
- Not tested on Win; sorry, I can only run Unix stuff at home.
- Don't use the same dir for multiple downloads. An information file
  will be overwritten.
- This will certainly need modification to work with other Scoop
  sites.
- I should have written an extension to Scoop and published that
  instead, which rusty could have installed. More efficient.
";

# ------------------------------------------------
# Parse parameters:
my($username, $password, $dir, $max_comments, $log, $evil, $help);
my($rslt) = GetOptions('u|user:s'  => \$username,
                       'p|pwd=s'   => \$password,
                       'h|help'    => \$help,
                       evil        => \$evil,
                       'n|maxno:i' => \$max_comments,
                       'dump:s'    => \$dir,
                       log         => \$log,
                      );
die "Bad parameters?!$usage" unless $rslt;
if ($help) { print $usage; exit 0; }
die "Password is mandatory.\nTry '--help'" unless $password;
# Trailing newline from whoami would break the login URL:
chomp($username = `whoami`) unless $username;
die "'--dump' is mandatory" unless $dir;

# Try to create dir before slow && painful download:
chop $dir if $dir =~ m!/$!;
mkdir $dir unless -d $dir;
die "'--dump' can't make dir '$dir'" unless -d $dir;

# ------------------------------------------------
# Init and try to log in at the site:
my $mech = WWW::Mechanize->new();
$mech->stack_depth(2);          # Don't use so much RAM.

$mech->get(SCOOP_WEB_SITE . "?uname=$username&pass=$password");
die "Failed to login with user $username"
    if $mech->content() !~ /Logout from all locations/
    && $mech->content() !~ /Logout $username/;
print STDERR "Logged in\n" if $log;
sleep TIME_BETWEEN_PAGELOADS unless $evil;

# ------------------------------------------------
# Read down Comment list:
my($cmts) = [];                 # Store comment info here.
load_from_comment_list($mech, $username, $cmts, $max_comments,
                       $log, $evil);
# Note for you guys with multiple accounts: you could just add
# calls here.

# ------------------------------------------------
# Get all comments.
# (Already checked that there is a dir $dir.)
# (Start with the earliest days, so the numbering of comments on a
# given day stays the same even if comments are added the same day.)
foreach my $c (reverse @$cmts) {
    $mech->get(SCOOP_WEB_SITE . $c->{url});

    # - - - Create file name:
    my($date)  = ParseDate($c->{date});
    my($fname) = UnixDate($date, "%y-%m-%d");
    my($no)    = 1;
    while (-f "$dir/$fname-$no.html") { $no++; }
    $c->{fname} = "$fname-$no.html";
    $fname      = "$dir/$fname-$no.html";

    # - - - Log, etc:
    print STDERR "Got $fname " . $c->{url} . "\n" if $log;
    sleep TIME_BETWEEN_PAGELOADS unless $evil;

    # - - - Prepare data for storing:
    my($html) = $mech->content();
    # Add an html comment with easily parsed known info about the comment:
    my($text) = sprintf "mod %s,%s. repl %s date %s, url '%s'"
                      . " story %s title '%s'.",
                      $c->{mods}, $c->{mod}, $c->{answers}, $c->{date},
                      $c->{url}, $c->{storyurl}, $c->{title};
    # Don't let the text close the html comment early:
    $text =~ s/-->/--&gt;/g;
    die "Couldn't find '</head>' in " . $c->{url}
        unless $html =~ s@</HEAD@<!-- $text --> </HEAD@i;

    # - - - Store:
    if (open(my $fh, '>', $fname)) {
        print $fh $html;
        close $fh;
    } else {
        warn "Couldn't save to file '$fname'";
    }
}

# ------------------------------------------------
# Dump YAML Data description (important).
# This yaml file makes an update script trivial to write.
YAML::DumpFile("$dir/comments.yaml", $cmts);

# ------------------------------------------------
# Dump comment list as simple html.
# Make a utility html list with references to all comments.
print STDERR "Save list to $dir/comment-list.html\n" if $log;
if (open(my $fh, '>', "$dir/comment-list.html")) {
    print $fh "<html><head><title>Comments By $username</title></head><body>\n";
    foreach my $c (@$cmts) {
        print $fh '<p>';
        print $fh '<a href="', SCOOP_WEB_SITE, $c->{url}, '">',
                  $c->{title}, '</a> on ', $c->{date};
        print $fh "<br />\n";
        my($mod) = $c->{mod} ? $c->{mod} : '';
        printf $fh 'Rated by <a href="%s">%s</a> for %s, %s replies.',
               SCOOP_WEB_SITE . $c->{modurl}, $c->{mods}, $mod,
               $c->{answers};
        printf $fh ' Story <a href="%s">%s</a>',
               SCOOP_WEB_SITE . $c->{storyurl}, $c->{storyname};
        print $fh "</p>\n";
    }
    print $fh "\n</body></html>\n";
    close $fh;
} else {
    warn "ERR! Couldn't save to '$dir/comment-list.html'!?\n";
}

# ------------------------------------------------
# Get Comment list from Scoop site:
sub load_from_comment_list {
    my($mech, $username, $cmts, $max_comments, $log, $evil) = @_;
    my($cmts_url) = SCOOP_WEB_SITE . "/user/$username/comments";
    # Suitable test; user with quite few posts:
    #$cmts_url = SCOOP_WEB_SITE
    #          . "/user/Zombie%20Abu%20Musab%20al%20Zarqawi/comments";

    # - - - Read first page of comment list:
    $mech->get($cmts_url);
    # XXX Err test for success here!
    print STDERR "Got 1st comment list page\n" if $log;
    sleep TIME_BETWEEN_PAGELOADS unless $evil;

    # - - - Loop over comment list pages and get comment refs:
    while (1) {
        my($no_cmts) = scalar(@$cmts);
        last if $max_comments && $max_comments < $no_cmts;

        my($content) = $mech->content();
        my($cmtinfo) = parse_cmtlist(\$content, $cmts);
        my($c_now)   = scalar(@$cmts);
        print STDERR "Has read $c_now cmts\n" if $log;

        # - - - Go to next page, if any, for the comment list:
        # (Quite new in the WWW::Mechanize API, so don't use it.)
        #my($form) = $mech->form_with_fields('next');
        my($form);
        for (my $i = 1; $i <= 4; $i++) {   # form_number() is 1-based.
            $form = $mech->form_number($i);
            last unless $form;
            last if $form->find_input('next');
        }
        last unless $form;      # No more data.
        $mech->select('count', '50');
        $mech->click_button(name => 'next');
        print STDERR "Got next comment page\n" if $log;
        sleep TIME_BETWEEN_PAGELOADS unless $evil;
    }
}

# ------------------------------------------------
# Get Comment info from Comment list page:
sub parse_cmtlist {
    my($data, $results) = @_;
    my $parse = HTML::TokeParser->new($data);

    # Skip everything before the comment list:
    my($flag, $tok);
    while ( $tok = $parse->get_tag('a') ) {
        my($url) = $tok->[1]->{href};
        #print "Prerun $url\n";
        if ( $url =~ m'^/comments/[#/0-9]+$' ) { # ' (for emacs)
            # Unget doesn't work for get_tag()???
            #$parse->unget_token($tok);
            $flag = 1;
            last;
        }
    }
    print STDERR "No Comments found!\n" unless $flag;
    return undef unless $flag;

    while ( $flag || ($tok = $parse->get_tag('a')) ) {
        my($url) = $tok->[1]->{href};
        $flag = 0;              # 'unget' doesn't work??
        next if $url =~ m!^/user/!;
        next if $url =~ m!^/story/!;
        if ( $url !~ m'^/comments/[#/0-9]+$' ) { # ' (for emacs)
            # This will be after all comments. Just loop
            # through them, to be certain.
            next;
        }

        # Data structure for comment:
        my %c = ( url   => $url,
                  title => $parse->get_text(),
                );

        # - - - Get modding info:
        $tok = $parse->get_tag('a');
        $c{modurl} = $tok->[1]->{href};
        my($modinfo) = $parse->get_text();
        if ($modinfo =~ m!([^/]*)\s*/\s*(\d+)!) {
            $c{mods} = 0 + $2;
            $c{mod}  = $1 if $c{mods};
            $c{mod}  = '' unless $c{mod};
        } else {
            warn "Failed to parse mod info '$modinfo'";
        }

        # - - - Get Reply info:
        $tok = $parse->get_tag('b');
        $c{answers} = $parse->get_text() + 0; # No of answers.

        # - - - Get Posting date:
        $tok = $parse->get_tag('/a');
        my($date) = $parse->get_text();
        $date =~ s/^\s*on\s*//i;
        $date =~ s/\s*$//;
        $c{date} = $date;

        # - - - Get Story ref:
        $tok = $parse->get_tag('a');
        $c{storyurl}  = $tok->[1]->{href};
        $c{storyname} = $parse->get_text();

        # - - -
        push @$results, \%c;
    }
    return $results;
}
|