Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot

Simple HTML doc retrieval and analysis script

by cjf (Parson)
on Jun 10, 2002 at 08:09 UTC ( #173054=sourcecode: print w/replies, xml ) Need Help??
Category: Miscellaneous
Author/Contact Info /msg cjf

Simple HTML document retrieval script. Takes a URL and a keyword as arguments, grabs the document, archives it in a directory hierarchy, performs a very simple (soon to be improved) analysis for the specified keyword, and returns a relevance score. As always, suggestions for improvements are appreciated.

#!/usr/bin/perl -w

use strict;

use Getopt::Std;
use URI;
use File::Path;
use LWP::UserAgent;
use HTML::TokeParser;

# Command-line entry point: fetch the document named by -d, archive it on
# disk, then score it against the keyword given with -k.
my %opts;

# getopt('dk', ...) declares that both -d and -k take a value.
getopt('dk', \%opts);

# Test with defined/length rather than plain truth so that a keyword such
# as "0" is still accepted.
for my $switch (qw(d k)) {
    next if defined $opts{$switch} && length $opts{$switch};

    error("Usage = $0 -d url -k keyword");

    # error() only reports the problem; stop here rather than carrying on
    # with an undefined URL or keyword.
    exit 1;
}

my $retrieved_document = retrieve_document($opts{d});

# Without any content there is nothing to archive or analyze.
unless (defined $retrieved_document) {
    error("No document retrieved from $opts{d}");
    exit 1;
}

archive_document($opts{d}, $retrieved_document);

my $doc_rating = analyze_document($retrieved_document, $opts{k});

print "Document rating for keyword: ", $opts{k}, " = $doc_rating\n";

sub archive_document {

    # Stores $doc in a file named "data" inside a directory hierarchy
    # derived from the URL's domain and file path (see url_to_path).
    #
    # Parameters:
    #   $url - address the document was fetched from
    #   $doc - document content to write
    #
    # Returns 1 on success. On failure the problem is reported through
    # error() and an empty list / undef is returned.

    my ($url, $doc) = @_;

    unless (defined $doc) {
        error("No document content to archive for $url");
        return;
    }

    my $path = url_to_path($url);

    # Make sure directory and file name are joined by exactly one slash.
    $path .= '/' unless $path =~ m{/\z};

    # Second argument 1 = verbose: mkpath prints each directory it creates.
    mkpath([$path], 1, 0755);

    my $file = $path . 'data';

    # Three-argument open with a lexical handle: the file name can never be
    # read as an open mode, and Perl's special DATA handle is not clobbered.
    my $fh;
    unless (open $fh, '>', $file) {
        error("Can't write data: $!\n");
        return;
    }

    # The content is written exactly as received, without newline
    # translation.
    binmode $fh;

    unless (print {$fh} $doc) {
        error("Can't write data: $!\n");
        close $fh;
        return;
    }

    # Buffered write errors only surface when the handle is closed.
    unless (close $fh) {
        error("Can't write data: $!\n");
        return;
    }

    return 1;
}


sub error {

    # Central place for reporting errors, so the formatting of the
    # messages can easily be changed later on (think HTML).
    #
    # Parameters:
    #   $error - message text; it is printed to STDOUT prefixed with
    #            "Error: " and followed by a newline
    #
    # This routine only reports: it does not exit or die, so callers
    # decide for themselves how to proceed afterwards.

    my $error = shift;
    print "Error: $error\n";

    return;
}

sub analyze_document {

    # Takes an HTML document and performs a (very) crude analysis to
    # determine its relevance to the given keyword.
    #
    # Parameters:
    #   $doc     - HTML source as a string
    #   $keyword - literal text to look for (matched case-insensitively)
    #
    # Returns an integer relevance rating: for every weighted start tag
    # whose text contains the keyword, that tag's weight is added.
    # An empty/undefined document or keyword rates 0.

    my ($doc, $keyword) = @_;

    return 0 unless defined $doc     && length $doc;

    # An empty pattern would match the text of every tag.
    return 0 unless defined $keyword && length $keyword;

    my $p = HTML::TokeParser->new(\$doc)
        or die "Can't create HTML parser: $!";

    # Score contributed by a keyword hit in the text of each tag.
    my %tag_weights = (
        a     => { text => 2 },
        title => { text => 5 },
        p     => { text => 1 },
    );

    my $rating = 0;

    while (my $token = $p->get_token) {

        # For start tags ("S") the second element of the token is the
        # tag name.
        my ($token_type, $tag) = @{$token};

        next unless $token_type eq 'S';
        next unless exists $tag_weights{$tag};

        # get_text("/$tag") collects the text up to the matching end tag.
        # NOTE(review): tags nested inside appear to be consumed by this
        # call and so are not scored separately -- confirm against the
        # HTML::TokeParser documentation.
        my $text = $p->get_text("/$tag");

        # \Q...\E makes the keyword match literally, not as a regex.
        $rating += $tag_weights{$tag}{text}
            if $text =~ /\Q$keyword\E/i;
    }

    return $rating;
}


sub retrieve_document {

    # Fetches the document at $url with an HTTP GET request that asks
    # for text/html.
    #
    # Parameters:
    #   $url - address of the document to fetch
    #
    # Returns the response content on success. On failure the response's
    # status line is reported through error() and an empty list / undef
    # is returned.

    my $url = shift;

    my $ua = LWP::UserAgent->new;

    my $req = HTTP::Request->new(GET => $url);

    $req->header('Accept' => 'text/html');

    my $res = $ua->request($req);

    unless ($res->is_success) {
        error("Can't retrieve $url: " . $res->status_line);
        return;
    }

    # TODO: should probably add a check on the size of the document;
    # not a huge concern yet because it's locally submitted

    return $res->content;
}


sub url_to_path {

    # Converts a URL into a relative directory path used for archiving:
    # the dots in the host name become directory separators and the URL
    # path is appended, e.g.
    #   http://www.example.com/a/b.html  ->  www/example/com/a/b.html/
    #
    # Parameters:
    #   (first argument) - the URL as a string
    #
    # Returns the path, always terminated by a single "/".
    # Dies when neither the host nor the path yields a usable component,
    # because the result would otherwise be the filesystem root.

    my $url  = URI->new(shift);

    print $url, "\n";

    # Not every URI class provides host(), and it may return undef.
    my $host = $url->can('host') ? $url->host : undef;
    $host = '' unless defined $host;

    my @parts = grep { length } split /\./, $host;

    # Drop empty, "." and ".." segments so that a crafted URL cannot
    # steer the archive outside of its own directory tree.
    push @parts,
        grep { length && $_ ne '.' && $_ ne '..' } split m{/}, $url->path;

    die "Can't derive an archive path from '$url'\n" unless @parts;

    return join('/', @parts) . '/';
}

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://173054]
[Corion]: choroba: I'm still using spod5 as my pod-to-ooxml- converter didn't really get off the ground so far
[choroba]: Slidy is basically HTML, so I'm not sure I really fly high :)
choroba should make his presentation scaffolding public
[Corion]: choroba: spod5 converts pod to S5 HTML, so it's also still basic. I find the lack of animations (in the sense of "highlight this", "highlight that" in code) somewhat tedious as I do it with rendered PNGs
[Corion]: I haven't found a good way to include/use the source SVGs I use for creating the PNGs directly as animations
[ambrus]: Presentations come in many different shapes, and so slides do as well.
[Corion]: Doing that in Powerpoint or Ooxml would be nice(r) but I'm much quicker doing the outline of a presentation and the code as Pod
[Corion]: Hurrr - on a machine that is behaving weirdly, I have two processes CROND running. I guess that is the source of unattended jobs sometimes not finding their files anymore...
[ambrus]: Most of the time if I make slides, they're just a formatted document with a medium level of formality (between a well written article and an informal draft), with usually the page breaks chosen carefully and possibly some content repeated between pages.

How do I use this? | Other CB clients
Other Users?
Others rifling through the Monastery: (10)
As of 2017-09-26 10:04 GMT
Find Nodes?
    Voting Booth?
    During the recent solar eclipse, I:

    Results (293 votes). Check out past polls.