Snippets Permuted Index without Stopwords
This script is a modification of code
received from jdporter. The original
code generates a permuted index of the
perlmonks code Snippets titles.
I added two subroutines, to read a list
of stopwords, and to determine if one
of those stopwords was in a Snippet title.
If found, that term would not be used
as an index term in the permuted index.
This program should make a much smaller
permuted index, without the clutter
of index entries based on stopwords.
#!/usr/bin/perl
# generate_snip_indx_nostop.pl clpoda 2009.08.30
# Derived from generate_snippets_index.pl
use Getopt::Long;
use LWP::Simple;
use Data::Dumper; $Data::Dumper::Indent=1;
use HTML::Entities;
use strict;
use warnings;
my $DEBUG = 0;
my( $get_new, $by_titles, $permuted, $nostopwords )=(0)x99;
GetOptions(
'getnew|new!' => \$get_new,
'titles|by_titles!' => \$by_titles,
'permuted!' => \$permuted,
'nostopwords!' => \$nostopwords,
);
$get_new + $by_titles + $permuted + $nostopwords == 1 or
die "Usage: $0 [-getnew] [-titles] [-permuted] [-nostopwords] (exa
+ctly one)\n";
my @stopwords;
read_stopwords ();
# the files we refer to:
my $snippets_data_file = 'snippets.pld'; # read/written by get_new; re
+ad by the others
my $last_date_file = 'snippets_last_date.pld'; # read/written by get_n
+ew.
# if title is not provided, the id will be used as the title.
# XXX this really should be made to get the title from the xml record.
sub get_one_snippet
{
my( $id, $title ) = @_;
warn "\tgetting $id\n";
local $_ = get( "http://perlmonks.org/?displaytype=xml;node_id=$id
+" );
my( $author_id, $author_name ) = m#<author id="(\d+)">\s*(.*)</aut
+hor>#;
$author_id == 52855 and return();
my( $date ) = m#<node id=.* created="([^"]*)" updated=#;
my( $desc ) = m#<field name="snippetdesc">\s*(.*?)</field>#s;
$desc =~ s/\r//g;
# any other cleanup needed?
{
id => $id,
date => $date,
author_id => $author_id,
author_name => $author_name,
desc => decode_entities($desc),
title => decode_entities($title),
}
}
my $snips = do $snippets_data_file; # load the snippets base (ref to A
+oH)
if ( $get_new )
{
my %snips = map { ( $_->{'id'} => $_ ) } @$snips; # key by id
warn scalar(keys %snips)." snippets already in local cache.\n";
my $last_date = do $last_date_file;
my( $year, $mon, $mday ) = @{$last_date}{qw( year mon mday )};
# back the date up by a day or three
# sure, we could use a module for this, but that would be
# way heavier-weight than we need. Fuzzy is fine.
if ( $mday > 1 ) { $mday--; }
elsif ( $mon > 1 ) { $mon--; $mday = $mon == 2 ? 28 : 30; }
else { $year--; $mon = 12; $mday = 31; }
my $query = sprintf "yr=%04d;mo=%02d;dy=%02d", $year, $mon, $mday;
warn "Querying for snippets posted since $query ...\n";
$query = "node_id=3989;nf=0;$query;re=N;Sn;go=Search;as_user=961";
+ # anonymonk
my $html = get( "http://www.perlmonks.org/bare/?".$query );
$html =~ /\(searched [.0-9]+% of DB\)/
or die "Failed to get good result from Super Search!";
my $n_added = 0;
for my $tr ( $html =~ m#<tr>.*?</tr>#sg )
{
local $_ = $tr;
s/[\r\n]+//g;
/<td><a href="\?node_id=1980">Snippet<\/a><\/td>/ or next;
my( $y, $m, $d ) = m/<td>(\d\d\d\d)?(\d\d)?(\d\d)<\/td>/;
my( $author_id, $author_name, $id, $title, ) =
m/<td><a href="\?node_id=(\d+)">(.*?)<\/a><\/td>\s*<td><a href
+="\?node_id=(\d+)">(.*?)<\/a><\/td>/;
if ( $author_id == 52855 )
{
warn "Skipping $id, owned by $author_name\n";
next;
}
if ( $snips{$id} )
{
warn "Skipping $id, already got it.\n";
next;
}
my $r = get_one_snippet( $id, $title );
unless ( defined $r )
{
warn "Bogus! No valid snippet data returned for $id ".
qq("$title" by $author_name ($author_id)\n);
next;
}
warn "Adding new snippet $id, posted on $y-$m-$d\n";
$snips{$id} = $r;
$n_added++;
}
print $n_added ? "Added $n_added new snippets.\n" : "No new snippe
+ts found.\n";
@$snips = values %snips;
open F, ">", $snippets_data_file or die "write $snippets_data_file
+ - $!";
print F Dumper($snips);
close F;
my @now = gmtime;
$last_date->{'year'} = $now[5] + 1900;
$last_date->{'mon'} = $now[4] + 1;
$last_date->{'mday'} = $now[3];
open F, ">", $last_date_file or die "write $last_date_file - $!";
print F Dumper($last_date);
close F;
exit 0;
}
=pod
=begin comment
typical:
{
'id' => 248201,
'title' => 'Quick and Dirty Seti@home Server Status',
'author_name' => 'Mr. Muskrat',
'author_id' => 155876,
'date' => '2003-04-04 17:14:39',
'desc' => 'Perform a quick and dirty check of the Seti@home server
+s.',
}
=end comment
=cut
sub header
{
my( $title, $ref_id, $ref_title ) = @_;
my( $year, $mon, $day ) = (gmtime)[5,4,3];
my $date = sprintf "%04d-%02d-%02d", $year+1900, $mon+1, $day;
<<EOF
<html><head><title>$title</title></head><body>
<h1>$title</h1>
<p>See <a href="http://perlmonks.org/?node_id=$ref_id">$ref_title</a>.
+</p>
<p>Generated on $date</p>
EOF
}
sub linkId
{
my( $id, $title ) = @_;
if (0)
{
return defined $title ? "[id://$id|$title]" : "[id://$id]"
}
defined $title or $title = $id;
qq(<a href="http://perlmonks.org/?node_id=$id">$title</a>)
}
# by title/id
if ( $by_titles )
{
print header('PerlMonks Snippets - By Title',619683,'New Snippets
+Index'), "<table>\n";
for ( sort {
$a->{'title'} cmp $b->{'title'}
or
$a->{'id'} <=> $b->{'id'}
} @$snips )
{
my $d = $_->{'date'};
$d =~ s/ .*//; # strip off time part
print
'<tr><td>',
linkId( $_->{'id'}, $_->{'title'} ),
'</td><td>',
linkId( $_->{'author_id'}, $_->{'author_name'} ),
"</td><td>$d</td></tr>\n"
}
print "</table></body></html>\n";
exit 0;
}
# permuted title index
if ( $permuted or $nostopwords )
{
print header('PerlMonks Snippets - Permuted Titles',619691,'Snippe
+ts Permuted Index'), "<pre>\n";
my @permut;
for my $s ( @$snips )
{
local $_ = $s->{'title'};
/untitled node/ and next;
print "Analyzing input string .$_. \n" if $DEBUG;
while ( /\b[\w]/g )
{
my $p = pos;
my $l = substr $_, 0, $p-1;
my $r = substr $_, $p-1;
#
# Skip a record where a stopword is the index term
# and if this cmd line option was specified: 'nostopwords'
next if ( $nostopwords && stopword_record_found($r) );
#
push @permut, [ $l, $r, $s ];
}
}
my $limit = 40;
my $lmax = 0;
for ( @permut )
{
$lmax < length($_->[0]) and
$lmax = length($_->[0]);
}
$lmax > $limit and
$lmax = $limit;
for my $p ( sort {
lc($a->[1]) cmp lc($b->[1])
or
$a->[2]{'id'} <=> $b->[2]{'id'}
or
lc($a->[0]) cmp lc($b->[0])
} @permut )
{
my $l = $p->[0];
my $r = $p->[1];
my $id = $p->[2]{'id'};
#$l =~ /./ and $l = "[id://$id|$l]";
#$r =~ /./ and $r = "[id://$id|$r]";
#print qq(<tr><td align=right>$l</td><td>$r</td></tr>\n);
if ( length($l) > $lmax )
{
my $chop = length($l) - $lmax; # how many to chop
substr( $l, 0, $chop ) = '';
}
else
{
print ' ' x ( $lmax - length($l) );
}
print linkId( $id, $l.$r ),"\n";
}
print "</pre></body></html>\n";
exit 0;
}
# </pre></body></html>
sub stopword_record_found
{
my @string = split (' ', shift);
foreach my $w (@stopwords) {
#D print "Now testing stopword .$w.\n";
if ( (lc $string[0]) eq $w ) {
# A stopword was found.
print "Stopword was found; input word .$string[0]. and sto
+pword .$w. .\n" if $DEBUG;
return 1;
}
}
# Stopword was not found.
#D print "Stopword was not found; returning 0 for word .$string[0]
+. .\n";
return 0;
}
sub read_stopwords
{
# Terms removed from stopword list.
# always
# off
# on
# only
#D my @stopwords2;
@stopwords=qw(
a
about
above
according
across
actually
adj
after
afterwards
again
against
all
almost
alone
along
already
also
although
among
amongst
an
and
another
any
anyhow
anyone
anything
anywhere
are
aren't
around
as
at
be
became
because
become
becomes
becoming
been
before
beforehand
begin
beginning
behind
being
below
beside
besides
between
beyond
billion
both
but
by
can
can't
cannot
caption
co
company
corp
corporation
could
couldn't
did
didn't
do
does
doesn't
don't
down
during
each
eg
eight
eighty
either
else
elsewhere
end
ending
enough
etc
even
ever
every
everyone
everything
everywhere
except
few
fifty
first
five
for
former
formerly
forty
found
four
from
further
had
has
hasn't
have
haven't
he
he'd
he'll
he's
hence
her
here
here's
hereafter
hereby
herein
hereupon
hers
herself
him
himself
his
how
however
hundred
i
i'd
i'll
i'm
i've
ie
if
in
inc
indeed
instead
into
is
isn't
it
it's
its
itself
last
later
latter
latterly
least
less
let
let's
like
likely
ltd
made
make
makes
many
maybe
me
meantime
meanwhile
might
million
miss
more
moreover
most
mostly
mr
mrs
much
must
my
myself
namely
neither
never
nevertheless
next
nine
ninety
no
nobody
none
nonetheless
noone
nor
not
nothing
now
nowhere
of
often
once
one
one's
onto
or
other
others
otherwise
our
ours
ourselves
out
over
overall
own
per
perhaps
rather
recent
recently
same
seem
seemed
seeming
seems
seven
seventy
several
she
she'd
she'll
she's
should
shouldn't
since
six
sixty
so
some
somehow
someone
something
sometime
sometimes
somewhere
still
stop
such
taking
ten
than
that
that'll
that's
that've
the
their
them
themselves
then
thence
there
there'd
there'll
there're
there's
there've
thereafter
thereby
therefore
therein
thereupon
these
they
they'd
they'll
they're
they've
thirty
this
those
though
thousand
three
through
throughout
thru
thus
to
together
too
toward
towards
trillion
twenty
two
under
unless
unlike
unlikely
until
up
upon
us
used
using
very
via
ve
was
wasn't
we
we'd
we'll
we're
we've
well
were
weren't
what
what'll
what's
what've
whatever
when
whence
whenever
where
where's
whereafter
whereas
whereby
wherein
whereupon
wherever
whether
which
while
whither
who
who'd
who'll
who's
whoever
whole
whom
whomever
whose
why
will
with
within
without
won't
would
wouldn't
yeah
yes
yet
you
you'd
you'll
you're
you've
your
yours
yourself
yourselves
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
);
}
=head1 NAME
generate_snip_indx_nostop - Generate permuted index
of titles of code snippets from perlmonks.org, ignoring stopwords.
=head1 USAGE
generate_snip_indx_nostop [-getnew] [-titles] [-permuted] [-nostopwo
+rds] (exactly one)
# Make a permuted index without showing any stopwords in the index pos
+ition:
generate_snip_indx_nostop -nostopwords
=head1 OPTIONS
One and only one of the following options
must be specified:
-getnew -titles -permuted -nostopwords
B<-getnew>
Retrieve metadata
that does not yet exist
on the local machine,
from perlmonks.org.
B<-titles> Print a list of titles of code snippets.
B<-permuted> Print a list of titles of code snippets
in the 'permuted' format,
using each word from each title as an index term.
B<-nostopwords> Print a permuted index
that does not use any stopword as an
index term.
=head1 DESCRIPTION
First,
get a list of titles from Code Snippets section of the perlmonks.org s
+ite,
using the -getnew option:
generate_snip_indx_nostop -getnew
Next,
make a permuted index without stopwords,
using the -nostopwords option:
generate_snip_indx_nostop -nostopwords
=head1 DEPENDENCIES
The
get_new function requires access to the perlmonks.org
web site
(using LWP::Simple),
to get titles of the code snippets.
=head1 BUGS AND LIMITATIONS
B<A title containing angle brackets is not handled correctly.>
=over
The angle brackets and any text inside them do not appear
in the output.
This does not apply to such brackets used in the
corresponding URL for the title.
Any text to the right of such a term is shifted
to the left in the permuted index output line,
causing the wrong text to be placed in the index term's
location for that line.
If other index terms for that line follow the angle brackets,
those lines in the permuted index will also be
misaligned by the number of missing spaces
in the index term plus the two angle brackets.
Eg, the line
"Search for <n>th occurrence of regex"
will produce entries shifted left
by three spaces in the permuted
index for these terms:
<n>th,
occurrence,
and regex.
Idea: Consider encoding angle brackets
so they appear
properly on the permuted index web page.
=back
=head1 SEE ALSO
Snippets Permuted Index
at Perlmonks:
L<http://perlmonks.org/?node_id=619691>
=head1 ACKNOWLEDGEMENTS
The original Perlmonks permuted index program was provided by jdporter
+.
The list of stopwords was also from perlmonks:
Removing Stopwords from a String,
L<http://perlmonks.org/?node_id=50257>
=head1 AUTHOR
C. Poda,
clppm at poda.net
=head1 LICENSE AND COPYRIGHT
This program is free software; you can redistribute it
and/or modify it under the same terms as Perl itself. See
L<perlartistic>.
This program is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied
warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE.
=cut
|