#!/usr/local/bin/perl5.8.0 # -*- cperl -*- # Version 0.02 released Mon Sep 29 09:32:41 CEST 2003 # -- Fixed stupid syntax errors # Version 0.01 released Mon Sep 29 09:14:41 CEST 2003 use warnings; use strict; use LWP::Simple ('get', '$ua'); use URI::Escape; use XML::Simple; use Data::Dumper; use HTML::TableExtract; use HTML::TreeBuilder; use DBI; my $dbh=DBI->connect('dbi:mysql:perlmonks', 'dbusername', 'dbpassword'); my $pmuser=uri_escape('pmusername'); my $pmpasswd=uri_escape('pmpassword'); my $node=uri_escape(shift); my $nodetype=shift; $ua->cookie_jar({}); $ua->cookie_jar() or die "Couldn't create cookie jar: $!"; get("http://perlmonks.org/?op=login;user=$pmuser;passwd=$pmpasswd") or die "Couldn't log in to perlmonks: $!"; #warn "http://perlmonks.org/?node=$node;displaytype=xml" . # (defined $nodetype?";type=$nodetype":''); my $xml = get("http://perlmonks.org/?node=$node;displaytype=xml" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; $xml = XMLin($xml); print Dumper $xml; my $type = nowhitespace($xml->{type}{content}); my $NODE; $NODE->{node}{node_id}= $xml->{id}; $NODE->{node}{type_nodetype}=$xml->{type}{id}; $NODE->{node}{title}= $xml->{title}; $NODE->{node}{author_user}= $xml->{author}{id}; $NODE->{node}{createtime}= $xml->{created}; $NODE->{node}{lastupdate}= $xml->{updated}; # $NODE->{node}{hits} # $NODE->{node}{reputation} # $NODE->{node}{votescast} # $NODE->{node}{lockedby_user} # $NODE->{node}{locktime} # $NODE->{node}{core} # $NODE->{node}{package} # $NODE->{node}{postbonus} # $NODE->{node}{ucreatetime} # $NODE->{node}{node_iip} if ($type eq 'dbtable') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $table = HTML::TableExtract->new( headers => [qw(Field Type Null Default Key Extra)] ); $table->parse($html); $table = $table->first_table_state_found(); if (!$table) { die "Couldn't extract HTML table"; } my $statement = "CREATE TABLE $node ("; foreach ($table->rows()) { my %col; @col{qw(field type null default key extra)} = map {$_ eq chr(0xA0) ? undef : $_} @$_; $statement .= " $col{field} $col{type}"; if (!$col{null}) { $statement .= " NOT"; } $statement .= " NULL"; if (defined $col{default}) { $statement .= " DEFAULT '$col{default}'"; } if (defined($col{extra}) && $col{extra} =~ /auto_increment/) { $statement .= " AUTO_INCREMENT"; } # if (defined($col{key}) && $col{key} eq 'PRI') { # $statement .= " PRIMARY KEY"; # } $statement .= ","; } ## Now do it all again for the indeces. $html = get("http://perlmonks.org/?node=$node;type=dbtable;displaytype=index") or die "Couldnt't get node $node: $!"; $table = HTML::TableExtract->new (headers => [qw(Name Dup Seq Column Coll Card SubPt Packed Comment)]); $table->parse($html); $table = $table->first_table_state_found(); if (!$table) { die "Couldn't extract HTML table for indeces"; } my %indexes; foreach ($table->rows()) { my %col; @col{qw(name dup seq column coll card subpt packed comment)} = map {$_ eq chr(0xA0) ? undef : $_} @$_; $indexes{$col{name}}{dup} = $col{dup}; # seq starts at 1, but we want to map to a perl array. $indexes{$col{name}}{cols}[$col{seq}-1] = $col{column}; } foreach my $idxname (keys %indexes) { # PRIMARY KEY: Named PRIMARY # KEY: "Norm. a synonym for INDEX..." # INDEX: # UNIQUE INDEX: if ($idxname eq 'PRIMARY') { $statement .= " PRIMARY KEY"; } else { $statement .= " UNIQUE" if (!$indexes{dup}); $statement .= " INDEX"; } $statement .= " ("; $statement .= join(', ', @{$indexes{$idxname}{cols}}); $statement .= "), "; } #FIXME: Join $statement =~ s?,\s*$?);?; print $statement, "\n"; my $sth = $dbh->prepare($statement) or die "Preparing: $DBI::errstr"; $sth->execute or die "Executing: $DBI::errstr"; } elsif ($type eq 'linktype') { # There's nothing here; these are just node nodes by another name. } elsif ($type eq 'strangedoc') { my $html = get("http://perlmonks.org/?node=$node;displaytype=viewcode" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'td', 'class', 'main_content'); my $pre = $main_content->look_down('_tag', 'pre'); my $code = $pre->as_text; $code=~s/^[ 0-9]{4}: //mg; $NODE->{document}{doctext} = $code; } elsif ($type eq 'note') { # Don't have note.rank or document.lastedit $NODE->{note}{parent_node} = nowhitespace($xml->{data}{field}{parent_node}{content}); $NODE->{note}{root_node} = nowhitespace($xml->{data}{field}{root_node}{content}); $NODE->{document}{doctext} = $xml->{data}{field}{doctext}{content}; } elsif ($type eq 'opcode' or $type eq 'strangedoc') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content'); print $main_content->dump; my $pre = $main_content->look_down('_tag', 'pre'); my $code = $pre->as_text; $code=~s/^[ 0-9]{4}: //mg; #htmlcode $NODE->{htmlcode}{code} = $code; } elsif ($type eq 'nodetype') { # Handle these by hand for now, please. } elsif ($type eq 'image') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content'); my $img = $main_content->look_down('_tag', 'img'); my $p = $main_content->look_down('_tag', 'p'); $NODE->{image}{src} = $img->attr('src'); $NODE->{image}{alt} = $img->attr('alt'); # thumbsrc $NODE->{image}{description} = $p->as_text; } elsif ($type eq 'pmmodule') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $elem = $tree->look_down('_tag', 'font', 'size', '-1', sub { $_[0]->is_inside('tt'); }); my $file = IO::File->new(">".$NODE->{node}{title}) or die "Can't open ".$NODE->{node}{title}." for writing"; print $file $elem->as_text; } elsif ($type eq 'nodelet') { #Table: nodelet # nltext $NODE->{nodelet}{nlcode} = $xml->{data}{field}{nlcode}{content}; # updateinterval # nlgoto $NODE->{nodelet}{parent_container} = $xml->{data}{field}{parent_container}{content}+0; # lastupdate } elsif ($type eq 'htmlpage') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content'); my %data; my @bolds = $main_content->look_down('_tag', 'b'); foreach (@bolds) { $_->normalize_content; my $key=$_->as_text; my $value= ($_->right)[1]; #Two to the right; there's a ": " text element, then the A or I tag. $data{$key}=$value; print "$key: $value\n"; } #Table: htmlpage $NODE->{htmlpage}{pagetype_nodetype} = a_element_to_nodeid($data{pagetype}); $NODE->{htmlpage}{displaytype} = nowhitespace($xml->{data}{field}{displaytype}{content}); $NODE->{htmlpage}{page} = $xml->{data}{field}{page}{content}; $NODE->{htmlpage}{parent_container} = a_element_to_nodeid($data{'parent container'}); if ($data{theme}->attr('_tag') eq 'i') { # /this htmlpage does not belong to a theme/ $NODE->{htmlpage}{ownedby_theme} = 0; } else { $NODE->{htmlpage}{ownedby_theme} = a_element_to_nodeid($data{theme}); } # mimetype -- no way to find? $NODE->{htmlpage}{mimetype} = 'text/html'; } elsif ($type eq 'htmlcode') { $NODE->{htmlcode}{code} = $xml->{data}{field}{content}; } elsif ($type eq 'setting' or $type eq 'theme') { use URI::Escape; my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my @vars; } elsif ($type eq 'setting' or $type eq 'theme') { use URI::Escape; my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my @vars; my $table = HTML::TableExtract->new(headers => ['Setting', 'Value']); $table->parse($html); foreach my $row ($table->rows) { my $key = uri_escape($row->[0]); my $val = uri_escape($row->[1]); $val='+' if ($val eq ''); push @vars, "$key=$val"; } $NODE->{setting}{vars} = join '&', @vars; } elsif ($type eq 'nodegroup' or $type eq 'usergroup' or $type eq 'nodeletgroup') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content'); my @nodes = $main_content->look_down('_tag', 'li'); my $groupsth = $dbh->prepare("INSERT nodegroup(nodegroup_id, rank, node_id, orderby) VALUES ($NODE->{node}{node_id}, ?, ?, ?)"); my $n=0; foreach my $node (@nodes) { my $a=$node->look_down('_tag', 'a'); my $href = $a->attr('href'); $href =~ m|(\d+)|; my $id = $1; print "$n: $id\n"; $groupsth->execute($n, $id, $n); $n++; } } elsif ($type eq 'data') { my $html = get("http://perlmonks.org/?node=$node;displaytype=print" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; my $tree = HTML::TreeBuilder->new_from_content($html); my $main_content = $tree->look_down('_tag', 'div', 'id', 'content'); $NODE->{document}{doctext} = nowhitespace($main_content->as_text); } elsif ($type eq 'rawpage' or $type eq 'css') { my $doctext = get("http://perlmonks.org/?node=$node" . (defined $nodetype?";type=$nodetype":'')) or die "Couldnt't get node $node: $!"; #Table document: $NODE->{document}{doctext}=$doctext; #Table rawpage: $NODE->{rawpage}={}; $NODE->{rawpage}{datatype} = 'text/css' if ($type eq 'css'); } elsif ($type eq 'pmdevsuperdoc' or $type eq 'superdoc' or $type eq 'document' or $type eq 'fullpage' or $type eq 'perlquestion' or $NODE->{document}{doctext} = $xml->{data}{field}{content}; # $NODE->{document}{lastedit} } elsif ($type eq 'user') { # Table: user # nick # passwd # realname # email $NODE->{user}{lasttime} = nowhitespace($xml->{data}{field}{lasttime}{content}); # karma $NODE->{user}{experience} = nowhitespace($xml->{data}{field}{experience}{content}); # votesleft # votes # imgsrc # lastupdate # scratchpad # givevotes $NODE->{setting} = {}; $NODE->{document} = {}; $NODE->{document}{doctext} = $xml->{data}{field}{doctext}{content} if (exists $xml->{data}{field}{doctext}{content}); } elsif ($type eq 'container') { #Table container: # context # parent_container $NODE->{container}{parent_container} = nowhitespace($xml->{data}{field}{parent_container}{content}); $NODE->{container}{context} = nowhitespace($xml->{data}{field}{context}{content}); } else { die "Unknown type $type"; } foreach my $table (keys %$NODE) { my @keys; my @values; if (not exists $NODE->{$table}{$table.'_id'}) { $NODE->{$table}{$table.'_id'} = $NODE->{node}{node_id}; } print "$table:\n"; foreach my $key (keys %{$NODE->{$table}}) { print "\t $key: $NODE->{$table}{$key}\n"; push @keys, $key; push @values, $NODE->{$table}{$key}; } # We don't care about the success of this line. $dbh->do("delete from $table where ".$table."_id=".$NODE->{node}{node_id}); my $sth = $dbh->prepare("INSERT INTO $table (".join(',', @keys).") values (".join(',', ('?')x@values).')'); $sth->execute(@values) or die; } sub a_element_to_nodeid { my $a=shift; my $href = $a->attr('href'); $href =~ m|(\d+)|; return "$1"; } # Strips leading and trailing whitespace (but not whitespace in the middle. sub nowhitespace { local $_=shift; s/^\s+//; s/\s+$//; return $_; }