Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked
 
PerlMonks  

k5download

by BerntB (Deacon)
on Nov 13, 2006 at 15:19 UTC ( #583725=sourcecode: print w/replies, xml ) Need Help??
Category: Web Stuff
Author/Contact Info Bernt 'Bug' Budde
Description: Screen scrape kuro5hin.org.

Downloads all your posted comments to a directory. Generates a yaml file with definitions of all the comments, suitable for other scripts to use for parsing the data.

Can be the first of a number of scripts to e.g. find mod-bombers, etc.

Tab size 4.

#!/usr/bin/perl

# k5download.pl

# This has the same license as Perl.

use strict;
use warnings;

# Probably needs to install these modules.

# See the cpan command or your local variant like apt-get
# for Debian based systems and PPM for Win. All these should
# be in Debian Testing/Ubuntu.

use HTML::TokeParser;            # In HTML::Parser.
use WWW::Mechanize;
use Getopt::Long;
use Date::Manip;
use YAML;

# ------------------------------------------------
our $VERSION = '0.01';

use constant TIME_BETWEEN_PAGELOADS    => 4;
use constant SCOOP_WEB_SITE            => "http://www.kuro5hin.org";


# ------------------------------------------------
# Help text:

my $usage = "
This program is used to screen scrape 'kuro5hin.org' for your comments
+.

You own your comments on K5 but not other people's comments, so this
just supports getting what you have written yourself. It d/l the full
html text, so do NOT use 'nested' for 'Comment display mode'! It hurts
usability for grep-ping (searching) your saved comments, anyway.

    Use: perl k5download.pl OPTIONS ...

    Options are:

  -u username   Specify the username at kuro5hin.
  -p password   Password so the program can log in.
  --dump dir    Save all your comments in dir.

  -n   no       Just get the 'no' last comments.
  -log          Log what's done on STD ERR. (Simple debug.)
 --evil         Usually, the program waits a bit between every read
                from K5. Not with this flag. (-: If you use it, I'll
                laugh if rusty cancels your account. :-)
 --help         Shows this text.


'-p' and '--dump' are mandatory.

If no user name is specified, the login name will be used (-: that
should work for rusty, at least :-).


    Examples of use:

Dumps html of ALL posts to K5 in dir 'foo'.
  perl k5download.pl -u bar -p XX --dump 'foo' --log
Gets html of the last 200 posts to K5 in a dir.
  perl k5download.pl -u bar -p XX --dump foo --log -n 200

    To Do:

- Utility to parse out the html to textfiles, so they are easier to
  grep (search, for win people). 3vial.
- Utility that d/l new things you've written (and when you get comment
+s
  and mods on old stuff) and updates without d/l everything!
- Utility that d/l modding and finds who modbombs you (Hello 'V' and
  'jxg'!)
- Utility that parse data to find sniper comments added much later, to
  get the last word (Hello, 'A Bore'!)
- Remove warning when d/l comment list. :-)

    Notes:

- The generated yaml file contains all the info an updater program wou
+ld
  need -- and it is both human readable and not Perl specific. Have fu
+n!
- Not tested on Win; sorry, I can only run Unix stuff at home.
- Don't use the same dir for multiple downloads. An information file
  will be overwritten.
- This will certainly needs modification to work with other Scoop site
+s.
- I should have written an extension to Scoop and published instead,
  which rusty could have installed. More efficient.

";



# ------------------------------------------------
# Parse parameters:

my($username, $password, $dir,
   $max_comments, $log, $evil,  $help);


my($rslt)    =GetOptions('u|user:s'    => \$username,
                        'p|pwd=s'    => \$password,
                        'h|help'    => \$help,
                        evil        => \$evil,
                        'n|maxno:i'    => \$max_comments,
                        'dump:s'    => \$dir,
                        log            => \$log,
                     );


die "Bad parameters?!$usage"                unless $rslt;

if ($help) {
    print $usage;
    exit 0;
}

die "Password is mandatory. Try '--help'"    unless $password;
$username    = `whoami`                        unless $username;
die "Mandatory with '--dump'"                unless $dir;


# Try to create dir before slow && painful d/l:
chop $dir                                    if $dir =~ m!/$!;
mkdir $dir                                    unless -d $dir;
die "'--dump' can't make dir '$dir'"        unless -d $dir;


# ------------------------------------------------
# Init and try to login at the site:


my $mech        = WWW::Mechanize->new();
$mech->stack_depth(2);                # Don't use so much RAM

$mech->get(SCOOP_WEB_SITE . "?uname=$username&pass=$password");

die "Failed to login with user $username"
            if    $mech->content() !~ /Logout from all locations/ &&
                $mech->content() !~ /Logout $username/;

print STDERR "Logged in\n"                    if $log;
sleep TIME_BETWEEN_PAGELOADS                unless $evil;



# ------------------------------------------------
# Read down Comment list:

my($cmts) = [];                        # Store comment info here

load_from_comment_list($mech, $username, $cmts,
                       $max_comments, $log, $evil);

# Note for you guys with multiple accounts. You could just add
# calls here.

# ------------------------------------------------
# Get all comments.

# (Already checked so there is a dir $dir.)

# (Start with earliest days, so numbering of comments a given day
# is always the same even if adds comments the same day.)
foreach my $c (reverse @$cmts) {
    $mech->get(SCOOP_WEB_SITE . $c->{url});

    # - - - Create file name:
    my($date)    = ParseDate($c->{date});
    my($fname)    = UnixDate($date, "%y-%m-%d");
    my($no)        = 1;
    while (-f "$dir/$fname-$no.html") {
        $no++;
    }
    $c->{fname}    = "$fname-$no.html";
    $fname        = "$dir/$fname-$no.html";

    # - - - Log, etc:
    print STDERR "Got $fname " . $c->{url} ."\n"    if $log;
    sleep TIME_BETWEEN_PAGELOADS            unless $evil;

    # - - - Prepare data for storing:
    my($html)    = $mech->content();

    # Add html comment with easily parsed known info about comment:
    my($text) = sprintf    "mod %s,%s. repl %s date %s, url '%s'"
                    .    " story %s title '%s'.",
                        $c->{mods}, $c->{mod}, $c->{answers},
                        $c->{date}, $c->{url},
                        $c->{storyurl}, $c->{title};
    $text =~ s/-->/-->/g;
    die "Couldn't find '</head>' in " . $c->{url}
                    unless $html =~ s@</HEAD@<!-- $text --> </HEAD@i;

    # - - - Store:
    if (open(my $fh, '>', $fname)) {
        print $fh $html;
        close $fh;
    } else {
        warn "Couldn't save to file '$fname'";
    }
}


    
# ------------------------------------------------
# Dump YAML Data description (important)

# This yaml file makes an update script trivial to write.

YAML::DumpFile("$dir/comments.yaml", $cmts);


# ------------------------------------------------
# Dump comment list as simple html.

# Make utility html list with references to all comments.

print STDERR "Save list to $dir/comment-list.html\n"    if $log;
if (open(my $fh, '>', "$dir/comment-list.html")) {
    print $fh "
<html><head><TITLE>Comments By $username</title><body>
";
    foreach my $c (@$cmts) {
        print $fh '<p>';
        print $fh '<a href="', SCOOP_WEB_SITE, $c->{url}, '">',
                  $c->{title}, '</a> on ', $c->{date};
        print $fh "<br />\n";
        my($mod) = $c->{mod} ? $c->{mod} : '';
        printf $fh 'Rated by <a href="%s">%s</a> for %s, %s replies.',
                    SCOOP_WEB_SITE . $c->{modurl}, $c->{mods},
                    $mod, $c->{answers};
        printf $fh ' Story <a href="%s">%s</a>',
                    SCOOP_WEB_SITE . $c->{storyurl}, $c->{storyname};
        print $fh "</p>\n";
    }
    print $fh "\n</body></html>\n";
    close $fh;
} else {
    warn "ERR! Couldn't save to '$dir/comment-list.html'!?\n";
}



# ------------------------------------------------
# Get Comment list from Scoop site:

sub load_from_comment_list {
    my($mech, $username, $cmts, $max_comments, $log, $evil) = @_;

    my($cmts_url)    = SCOOP_WEB_SITE . "/user/$username/comments";

    # Suitable Tst; user with quite few posts:
    #$cmts_url        = SCOOP_WEB_SITE . "/user/Zombie%20Abu%20Musab%2
+0al%20Zarqawi/comments";

    # - - - Read first page of comment list:
    $mech->get($cmts_url);
    # XXX Err test for success here!
    print STDERR "Got 1st comment list page\n"    if $log;
    sleep TIME_BETWEEN_PAGELOADS                unless $evil;

    # - - - Loop over comment list pages and get comment refs:

    while (1) {
        my($no_cmts)= scalar(@$cmts);
        last        if $max_comments && $max_comments < $no_cmts;

        my($content)= $mech->content();
        my($cmtinfo)= parse_cmtlist(\$content, $cmts);
        my($c_now)    = scalar(@$cmts);
        print STDERR "Has read $c_now cmts\n"    if $log;

        # - - - Go to next page, if any for comment list:

        # (Quite new in WWW::Mechanize API, so don't use it.)
        #my($form)    = $mech->form_with_fields('next');

        # This will give a warning, but sigh...
        my($form);
        for(my $i = 0; $i < 4; $i++) {
            $form    = $mech->form_number($i);
            last                    unless $form;
            last                    if $form->find_input('next');
        }
        last                        unless $form; # No more data

        $mech->select('count', '50');
        $mech->click_button(name => 'next');
        print STDERR "Got next comment page\n"    if $log;
        sleep TIME_BETWEEN_PAGELOADS        unless $evil;
    }
       
}


# ------------------------------------------------
# Get Comment info from Comment list page

sub parse_cmtlist {
    my($data, $results) = @_;

    my $parse = HTML::TokeParser->new($data);

    # Skip everything before comment list:
    my($flag, $tok);
    while ( $tok    = $parse->get_tag( 'a' ) ) {
        my($url)    = $tok->[1]->{href};
        #print "Prerun $url\n";
        if ( $url =~ m'^/comments/[#/0-9]+$' ) { # ' (for emacs)
            # Unget doesn't work for get_tag()???
            #$parse->unget_token($tok);
            $flag    = 1;
            last;
        }
    }

    print STDERR "No Comments found!\n"    unless $flag;
    return undef                        unless $flag;

    while ( $flag || ($tok = $parse->get_tag( 'a' ) ) ) {

        my($url)    = $tok->[1]->{href};
        $flag        = 0;        # 'unget' doesn't work??

        next                            if $url =~ m!^/user/!;
        next                            if $url =~ m!^/story/!;
        if ( $url !~ m'^/comments/[#/0-9]+$') { # ' (for emacs)
            # This will be after all comments. Just loop
            # through them, to be certain.
            next;
        }

        # Data structure for comment:
        my %c = (
            url        => $url,
            title    => $parse->get_text(),
        );

        # - - - Get modding info:
        $tok = $parse->get_tag( 'a' );
        $c{modurl}    = $tok->[1]->{href};
        my($modinfo)= $parse->get_text();
        if ($modinfo =~ m!([^/]*)\s*/\s*(\d+)!) {
            $c{mods}= 0 + $2;
            $c{mod}    = $1                if $c{mods};
            $c{mod}    = ''                unless $c{mod};
        } else {
            warn "Failed to parse mod info '$modinfo;";
        }

        # - - - Get Reply info:
        $tok = $parse->get_tag( 'b' );
        $c{answers}    = $parse->get_text() + 0; # No of answers

        # - - - Get Posting date:
        $tok = $parse->get_tag( '/a' );
        my($date)    = $parse->get_text();
        $date =~ s/^\s*on\s*//i;
        $date =~ s/\s*$//;
        $c{date}    = $date;

        # - - - Get Story ref:
        $tok = $parse->get_tag( 'a' );
        $c{storyurl}= $tok->[1]->{href};
        $c{storyname}= $parse->get_text();

        # - - -
        push @$results, \%c;
    }
    return $results;
}


Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://583725]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others chilling in the Monastery: (3)
As of 2019-12-07 11:11 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    Strict and warnings: which comes first?



    Results (160 votes). Check out past polls.

    Notices?