Hi all,
I have produced a new version of a simple walker to extract the data from an ASPX page.
#!/usr/bin/perl
package ASPXXTRAKTOR;
# HAPPY SOFTWARE FREEDOM DAY 2009!
#
# This program processes an ASPX database page and dumps the data.
# Copyright (C) 2009 James Michael Du Pont <h4ck3rm1k3@flossk.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# see also http://www.perlmonks.org/index.pl?node_id=786884
#
# parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
# Copyright 2008 Evan Carroll, all rights reserved.
# L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
use strict;
use warnings;
use WWW::Mechanize;
use LWP::UserAgent;
use Getopt::Long;
## SPEC
# Command-line configuration. Every option takes a string value and stays
# an empty string when it is not given on the command line.
my $site= "";
my $searchform = "";
my $searchfield = '';
my $searchbutton = '';
my $searchterm = '' ; #'HOTEL'
my $searchnext = '';
my $formbase = '';
my $result = GetOptions (
"form=s" => \$searchform, # name of the search form
"site=s" => \$site, # URL of the page holding the search form
"field=s" => \$searchfield, # name of the input that receives the search term
"term=s" => \$searchterm, # the text to search for
"next=s" => \$searchnext, # event target of the "next page" control
"base=s" => \$formbase, # the base of the form
"button=s" => \$searchbutton); # name of the button that starts the search
# GetOptions returns false when the command line could not be parsed; it has
# already printed a diagnostic for each problem, so just stop here.
die "Error in command line arguments\n" unless $result;

# Make a search term usable as part of a file name: every whitespace
# character, dot, dash and tilde becomes an underscore.
#   $text   - the raw search term
#   returns - the sanitised copy; the argument itself is not modified
sub _sanitize_for_filename
{
    my ($text) = @_;
    # A character class is required here: without the brackets the pattern
    # only matches the exact sequence "whitespace . - ~" and nothing is
    # ever replaced.
    $text =~ s/[\s.\-~]/_/g;
    return $text;
}
# make a filename for the results
our $searchtermname = _sanitize_for_filename($searchterm);
# Shared state used by the subs below.
our $viewstate; # latest __VIEWSTATE value read back from the server's form
our $itemnumber = 1; # number used in the name of the next record file (DumpData)
our $pagenumber = 1; # number used in the name of the next result-page file (DumpAgent)
our $agent = WWW::Mechanize->new( autocheck => 1 ); # the one browser session every request goes through
# Save the page currently held by $agent to
# DataExtractor_<searchtermname>_P<pagenumber>.htm in the current directory,
# announce the file on STDOUT and advance $pagenumber.
# Takes no arguments. Dies if the file cannot be opened, written or closed.
sub DumpAgent
{
    my $content  = $agent->content();
    my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";
    # Lexical handle: a bareword OUT would be a package-wide global shared
    # with DumpData.
    # NOTE(review): no encoding layer is set on the handle - confirm whether
    # content() yields characters or bytes for the target site.
    open my $out, ">", $filename
        or die "Cannot open $filename for writing: $!";
    print "creating new page :$filename\n";
    $pagenumber++;
    print {$out} $content
        or die "Cannot write to $filename: $!";
    # Buffered write errors only surface at close, so check it too.
    close $out
        or die "Cannot close $filename: $!";
}
# Save one record page to
# DataExtractor_<searchtermname>_DataPage<itemnumber>.htm in the current
# directory, announce the file on STDOUT and advance $itemnumber.
#   $content - the page text to write
# Dies if the file cannot be opened, written or closed.
sub DumpData
{
    my $content  = shift;
    my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";
    # Lexical handle: a bareword OUT would be a package-wide global shared
    # with DumpAgent.
    open my $out, ">", $filename
        or die "Cannot open $filename for writing: $!";
    print "creating new record $filename\n";
    $itemnumber++;
    print {$out} $content
        or die "Cannot write to $filename: $!";
    # Buffered write errors only surface at close, so check it too.
    close $out
        or die "Cannot close $filename: $!";
}
# Pull the post-back event target and event argument out of a link href.
#
#   $href   - href text containing a WebForm_PostBackOptions(target, argument, ...)
#             call or a plain __doPostBack(target, argument) call
#   returns - the list ($eventTarget, $eventArgument), each with a leading
#             and trailing ' or " removed
#
# Croaks with 'Please submit a valid __doPostBack' when the href is
# undefined, contains neither call, or does not yield two arguments.
sub parseDoPostBack {
    #taken from
    #Copyright 2008 Evan Carroll, all rights reserved.
    #L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
    my $href= shift;
    # Carp is not loaded anywhere else in this file.
    require Carp;
    Carp::croak('Please submit a valid __doPostBack')
        unless defined $href;
    # Capture straight into a lexical and check that the match succeeded:
    # after a failed match $1 would still hold whatever the caller matched last.
    my ($args) = $href =~ /(?:WebForm_PostBackOptions|__doPostBack)\((.*)\)/
        or Carp::croak('Please submit a valid __doPostBack');
    # Unescape on the copy; $1 itself is read-only and cannot be edited in place.
    $args =~ s/\\'/'/g;
    # Only the first two comma-separated arguments are of interest.
    my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
    # Checked while the values are still quoted, so an empty argument
    # written as '' or "" passes.
    Carp::croak('Please submit a valid __doPostBack')
        unless $eventTarget && $eventArgument;
    for my $value ($eventTarget, $eventArgument) {
        # The trailing quote is only removed when a leading one was found.
        if ($value =~ s/^'//) { $value =~ s/'$//; }
        #added a filter for "
        if ($value =~ s/^\"//) { $value =~ s/\"$//; }
    }
    return ($eventTarget, $eventArgument );
}
# Walk every link on the page currently held by $agent. For each link whose
# id attribute matches ctl<digits>_lnkMore, submit the post-back described by
# its href on a clone of the agent and hand the resulting page to DumpData.
# Takes no arguments; reads $agent, $searchform and $viewstate.
sub ProcessLinks
{
    for my $link ($agent->links()) {
        my $target_href = $link->url();
        my $id = $link->attrs()->{id};
        # Links without an id, or with an unrelated id, are skipped.
        next unless $id;
        next unless $id =~ /ctl(\d+)_lnkMore/;
        print "Found item $1\n";
        # Work on a clone so the main agent stays on the result page.
        my $detail_agent = $agent->clone();
        my ($eventTarget, $eventArgument) = parseDoPostBack($target_href);
        $detail_agent->submit_form(
            form_name => $searchform,
            fields    => {
                '__EVENTTARGET'   => $eventTarget,
                '__EVENTARGUMENT' => $eventArgument,
                '__VIEWSTATE'     => $viewstate,
            },
        );
        DumpData($detail_agent->content());
    }
}
# Run the whole crawl: load $site, submit the search, then page through the
# results. Every result page is saved with DumpAgent and every page's record
# links are followed with ProcessLinks.
# Takes no arguments; reads the command-line settings and updates $viewstate.
sub GetForm
{
    $agent->get($site);
    $agent->form_name($searchform);
    $agent->field($searchfield, $searchterm);
    $agent->click($searchbutton);
    ## this runs the search first
    DumpAgent();
    $viewstate = $agent->field('__VIEWSTATE');
    my $content = $agent->content();
    # A page has a successor when it mentions the "next" control. The name is
    # compared as literal text, so no regex escaping is needed. The length
    # guard stops an empty --next from matching every page.
    while (length $searchnext && index($content, $searchnext) >= 0)
    {
        ### EDIT// had to process the links of the first page!
        # now recurse into the record
        ProcessLinks();
        #get the next one
        # NOTE(review): the search button is clicked again before the
        # next-page post-back is sent; kept exactly as it was - confirm
        # that the site needs it.
        $agent->click($searchbutton);
        # NOTE(review): the form name is the literal "Form" here rather than
        # $searchform - confirm against the target page.
        $agent->submit_form(
            form_name => "Form",
            fields => {
                ScriptManager => $formbase . '|'. $searchnext ,
                '__EVENTTARGET' => $searchnext,
                $searchfield => $searchterm,
                '__VIEWSTATE' => $viewstate,
                '__EVENTARGUMENT' => '',
                "ScrollTop" => '',
                "__dnnVariable" => '',
                "__VIEWSTATEENCRYPTED" =>''
            },
        );
        $viewstate = $agent->field('__VIEWSTATE');
        DumpAgent();
        $content = $agent->content();
    }
    # The loop stops on the first page without a "next" control, before that
    # page's links were followed. Handle them here so the last page, or the
    # only page, is not skipped.
    ProcessLinks();
}
# Entry point: the crawl starts as soon as this file is executed or loaded.
GetForm();
# Final true value, as Perl expects from a file loaded with require/use.
1;
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.