Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

Simple HTML doc retrieval and analysis script

by cjf (Parson)
on Jun 10, 2002 at 08:09 UTC ( #173054=sourcecode: print w/ replies, xml ) Need Help??

Category: Miscellaneous
Author/Contact Info /msg cjf
Description:

Simple HTML document retrieval script. Takes a url and a keyword as args, grabs the document, archives it in a directory hierarchy, does a very simple (soon to be improved) analysis for the specified keyword, and returns a relevance score. As always, suggestions for improvements are appreciated.

#!/usr/bin/perl -w

use strict;

use Getopt::Std;
use URI;
use File::Path;
use LWP::UserAgent;
use HTML::TokeParser;

my %opts;

getopt('dk', \%opts);

unless ($opts{d} && $opts{k}) {
    error("Usage = $0 -d url -k keyword");
}

my $retrieved_document = retrieve_document($opts{d});

archive_document($opts{d}, $retrieved_document);

my $doc_rating = analyze_document($retrieved_document, $opts{k});

print "Document rating for keyword: ", $opts{k}, " = $doc_rating\n";

sub archive_document {

    # creates a hierarchy of directories to
    # store data in based on domain/file path

    my ($url, $doc) = @_;

    my $path = url_to_path($url);

    mkpath([$path], 1, 0755);

    open DATA, ">$path/data" or error("Can't write data: $!\n");
        print DATA $doc;
    close DATA;

}


sub error {

    # before you ask, this is here so I can easily change
    # the formatting of the error messages later on (think HTML)

    my $error = shift;
    print "Error: $error\n";
    exit;
}

sub analyze_document {

    # takes an html document and performs a (very) crude
    # analysis to determine relevance to the given keyword
    # returns an integer relevance rating

    my ($doc, $keyword) = @_;

    my $p = HTML::TokeParser->new(\$doc) || die "$!";

    my %tag_weights = (
        a => {
            text => 2,
        },
        title => {
            text => 5,
        },
        p => {
            text => 1,
        }
    );


    my $rating = 0;

    # This ain't pretty, suggestions for improvements
    # are greatly appreciated

    while (my $token = $p->get_token) {

        my $token_type = shift @{$token};

        if ($token_type eq "S") {

            my ($tag, $attr, $attrseq, $rawtxt) = @{$token};

            for (keys %tag_weights) {

                if ($tag eq $_) {

                    if ($p->get_text("/$tag") =~ /\Q$keyword\E/i) {
                        $rating += $tag_weights{$tag}{text};
                    }
                }

            }
        }
    }

    return $rating;

}


sub retrieve_document {

    my $url = shift;

    my $ua = LWP::UserAgent->new;

    $ua->agent("cjf/0.0.1");

    my $req = HTTP::Request->new(GET => $url);

    $req->header('Accept' => 'text/html');

    my $res = $ua->request($req);

    if ($res->is_success) {
        return $res->content;
    } else {
        error($res->status_line);
    }

    # should probably add a check on the size of the document
    # not a huge concern yet because it's locally submitted

}

sub url_to_path {

    my $url  = URI->new(shift);

    print $url, "\n";

    my $path = $url->host;

    $path  =~ tr[.][/];

    $path .= $url->path;

    unless (substr($url->path, -1) eq "/") {
        $path .= '/';
    }

    return $path;

}

Comment on Simple HTML doc retrieval and analysis script
Download Code

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://173054]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others about the Monastery: (9)
As of 2014-12-18 12:00 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (51 votes), past polls