comment on

I abondoned the idea. To do the indexing I envisioned justice, I would have had to download the greater majority of PM's nodes. At the mandated rate of 1 every 5 seconds+, it would require 500 hours. Split that into 2 hour chunks of connect time and it becomes untenable.

Hence I've never bothered to extend the scripts beyond their simplist form:

PMDown.pl takes a filename containing a list of PM nodeid's to download:

BewareEven with 1 thread running, this will far exceed the download rate approved.

#! perl -slw
use strict;
use threads qw[ yield async ];
use Thread::Queue;
use LWP::Simple;

$|=1;

our $THREADS ||= 4;

my $Qwork    = new Thread::Queue;
my $Qresults = new Thread::Queue;

sub work{
    my $tid = threads->self->tid();
    sleep 1 until $Qwork->pending;
    while( $Qwork->pending or ( sleep(1) and $Qwork->pending ) ) {
        my $work = $Qwork->dequeue;
        print "$tid checking: $work";
        if( -e "c:/perlmonks/$work.xml" and not -z _ ) {
            $Qresults->enqueue( "$work returned exists not fetched" );
            next;
        }
        print "$tid fetching: $work";
        my $rc = getstore(
            "http://perlmonks.com/index.pl?node_id=$work&displaytype=x
+ml",
            "c:/perlmonks/$work.xml"
        );

        $Qresults->enqueue( "$work returned $rc" );
    }
    return;
}

my @t = map{ threads->new( \&work ) } 1 .. $THREADS;

open IN, '<', $ARGV[ 0 ] or die "$ARGV[ 0 ] : $!";

my $cResults = 0;
while( <IN> ) {
    chomp;
    $Qwork->enqueue( $_ ) and $cResults++;
    sleep 1 while $Qwork->pending > 100;
}

$_->join for @t;

my %summary;
$Qresults->dequeue =~ m[(\S+) returned (\S+)]
    and push @{ $summary{ $2 } }, $1
    while $Qresults->pending;

for my $key ( sort keys %summary ) {
    print $key, scalar @{ $summary{ $key } };
}

printf 'Enter to see details or ^C'; <STDIN>;

for my $key ( sort keys %summary ) {
    print $key;
    my $details =  join ' ', @{ $summary{ $key } };
    $details =~ s[.{1,80} ][\n]g;
    print $details;
}
[download]

ExtractWords.pl

#! perl -slw
use G;

my %words;
while( <> ) {
    $words{ $_ }++ for m[\b([a-zA-Z][a-zA-Z']+[a-zA-Z])\b]g;
}

open WORDS, '>', 'words.dat' or die $!;
print WORDS for sort keys %words;
close WORDS;
[download]

IndexDocs.pl

#! perl -slw
use strict;
use G;

$|=1;

chomp( my @words = do{ open my $fh, '<words.dat'; <$fh> } );
print "loaded: " . @words . ' words';

local $/;

my %index; @index{ @ARGV } = ('') x @ARGV;
while(  <> ) {
    chomp( my $file = lc );
    1+index( $file, $words[ $_ ] )
        and vec( $index{ $ARGV }, $_, 1 ) = 1
            for 0 .. $#words;
}

open INDEX, '>', 'index.dat' or die $!;
print INDEX "$_(@{[ unpack '%b*', $index{ $_ } ]}) : [@{[ unpack 'b*',
+ $index{ $_ } ]}]" 
    for sort keys %index;
close INDEX;
[download]

.\171594.txt : all an and anonymous asked at back be better but by com
+ concerning contain create directories even excluding expression foll
+owing for gone has have hours in index jun last list looking monks of
+ on over pl probably question renders replies round seekers simple th
+anks that the this to want wisdom without would
.\171599.txt : am and are at be being brothers but by com comes create
+ darkness directories doubt enlightenment etiquette help here if in i
+ndex jun light list living me my no not of on order piece pl re repli
+es reply seeking so someone strong sure tell that the thread to unsur
+e until way weak will with without

[download]