Beefy Boxes and Bandwidth Generously Provided by pair Networks
laziness, impatience, and hubris
 
PerlMonks  

Simple HTML doc retrieval and analysis script

by cjf (Parson)
on Jun 10, 2002 at 08:09 UTC ( #173054=sourcecode: print w/ replies, xml ) Need Help??

Category: Miscellaneous
Author/Contact Info /msg cjf
Description:

Simple HTML document retrieval script. Takes a url and a keyword as args, grabs the document, archives it in a directory hierarchy, does a very simple (soon to be improved) analysis for the specified keyword, and returns a relevance score. As always, suggestions for improvements are appreciated.

#!/usr/bin/perl -w

use strict;

use Getopt::Std;
use URI;
use File::Path;
use LWP::UserAgent;
use HTML::TokeParser;

my %opts;

getopt('dk', \%opts);

unless ($opts{d} && $opts{k}) {
    error("Usage = $0 -d url -k keyword");
}

my $retrieved_document = retrieve_document($opts{d});

archive_document($opts{d}, $retrieved_document);

my $doc_rating = analyze_document($retrieved_document, $opts{k});

print "Document rating for keyword: ", $opts{k}, " = $doc_rating\n";

sub archive_document {

    # creates a hierarchy of directories to
    # store data in based on domain/file path

    my ($url, $doc) = @_;

    my $path = url_to_path($url);

    mkpath([$path], 1, 0755);

    open DATA, ">$path/data" or error("Can't write data: $!\n");
        print DATA $doc;
    close DATA;

}


sub error {

    # before you ask, this is here so I can easily change
    # the formatting of the error messages later on (think HTML)

    my $error = shift;
    print "Error: $error\n";
    exit;
}

sub analyze_document {

    # takes an html document and performs a (very) crude
    # analysis to determine relevance to the given keyword
    # returns an integer relevance rating

    my ($doc, $keyword) = @_;

    my $p = HTML::TokeParser->new(\$doc) || die "$!";

    my %tag_weights = (
        a => {
            text => 2,
        },
        title => {
            text => 5,
        },
        p => {
            text => 1,
        }
    );


    my $rating = 0;

    # This ain't pretty, suggestions for improvements
    # are greatly appreciated

    while (my $token = $p->get_token) {

        my $token_type = shift @{$token};

        if ($token_type eq "S") {

            my ($tag, $attr, $attrseq, $rawtxt) = @{$token};

            for (keys %tag_weights) {

                if ($tag eq $_) {

                    if ($p->get_text("/$tag") =~ /\Q$keyword\E/i) {
                        $rating += $tag_weights{$tag}{text};
                    }
                }

            }
        }
    }

    return $rating;

}


sub retrieve_document {

    my $url = shift;

    my $ua = LWP::UserAgent->new;

    $ua->agent("cjf/0.0.1");

    my $req = HTTP::Request->new(GET => $url);

    $req->header('Accept' => 'text/html');

    my $res = $ua->request($req);

    if ($res->is_success) {
        return $res->content;
    } else {
        error($res->status_line);
    }

    # should probably add a check on the size of the document
    # not a huge concern yet because it's locally submitted

}

sub url_to_path {

    my $url  = URI->new(shift);

    print $url, "\n";

    my $path = $url->host;

    $path  =~ tr[.][/];

    $path .= $url->path;

    unless (substr($url->path, -1) eq "/") {
        $path .= '/';
    }

    return $path;

}

Comment on Simple HTML doc retrieval and analysis script
Download Code

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://173054]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others imbibing at the Monastery: (8)
As of 2014-09-23 21:30 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    How do you remember the number of days in each month?











    Results (241 votes), past polls