Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked
 
PerlMonks  

comment on

( [id://3333]=superdoc: print w/replies, xml ) Need Help??
Hi all,

I have produced a new version of a simple walker that extracts the data from an ASPX page.

#!/usr/bin/perl

package ASPXXTRAKTOR;

#    HAPPY SOFTWARE FREEDOM DAY 2009!
#    
#    This program processes an ASPX database page and dumps the data.
#    Copyright (C) 2009 James Michael Du Pont  <h4ck3rm1k3@flossk.org>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

#   see also http://www.perlmonks.org/index.pl?node_id=786884
#
#   parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
#   Copyright 2008 Evan Carroll, all rights reserved.
#   L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>

use strict;
use warnings;

use Carp;
use Getopt::Long;
use LWP::UserAgent;
use WWW::Mechanize;

## SPEC -- command-line configuration for the scrape
my $site         = '';   # URL of the ASPX page to scrape
my $searchform   = '';   # name of the search <form>
my $searchfield  = '';   # name of the search input field
my $searchbutton = '';   # name of the search submit button
my $searchterm   = '';   # term to search for, e.g. 'HOTEL'
my $searchnext   = '';   # postback target of the "next page" control
my $formbase     = '';   # the base of the form (ScriptManager prefix)

my $result = GetOptions(
    "form=s"   => \$searchform,   # string
    "site=s"   => \$site,         # string
    "field=s"  => \$searchfield,  # string
    "term=s"   => \$searchterm,   # string
    "next=s"   => \$searchnext,   # string
    "base=s"   => \$formbase,     # the base of the form
    "button=s" => \$searchbutton, # button
);

# Build a filesystem-safe version of the search term for output filenames.
# BUG FIX: the original s/\s\.\-\~/_/g matched only the literal 4-character
# sequence "<space>.-~"; a character class replaces each such character.
our $searchtermname = $searchterm;
$searchtermname =~ s/[\s.\-~]/_/g;

our $agent = WWW::Mechanize->new( autocheck => 1 );
our $pagenumber = 1; # how many results pages we have dumped so far
our $itemnumber = 1; # how many individual records we have dumped so far
our $viewstate;      # the server's encrypted __VIEWSTATE, round-tripped from the client

# Dump the agent's current page content to a numbered results-page file
# (DataExtractor_<term>_P<n>.htm) and advance the page counter.
# Uses the file-level globals $agent, $searchtermname and $pagenumber.
sub DumpAgent
{
    my $content  = $agent->content();
    my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";
    # lexical filehandle + checked close: buffered write errors surface at close
    open my $out, '>', $filename or die "open $filename: $!";
    print "creating new page :$filename\n";
    $pagenumber++;
    print {$out} $content;
    close $out or die "close $filename: $!";
}

# Dump one record's detail-page HTML (passed as the sole argument) to a
# numbered file (DataExtractor_<term>_DataPage<n>.htm) and advance the
# item counter. Uses the file-level globals $searchtermname and $itemnumber.
sub DumpData
{
    my $content  = shift;
    my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";
    # lexical filehandle + checked close: buffered write errors surface at close
    open my $out, '>', $filename or die "open $filename: $!";
    print "creating new record $filename\n";
    $itemnumber++;
    print {$out} $content;
    close $out or die "close $filename: $!";
}

# Parse an ASP.NET postback href of the form
#   ...WebForm_PostBackOptions('<eventTarget>', '<eventArgument>'...)...
# and return the unquoted ($eventTarget, $eventArgument) pair.
# Croaks when the href does not contain a parseable postback call.
#
# derived from HTML-TreeBuilderX-ASP_NET
# Copyright 2008 Evan Carroll, all rights reserved.
# L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
sub parseDoPostBack {
    my $href = shift;

    # Capture the argument list into a lexical immediately.
    # BUG FIX: the original ran s/// directly on $1, but the capture
    # variables are read-only -- it died with "Modification of a read-only
    # value attempted" whenever the href contained an escaped quote.
    my ($args) = $href =~ /WebForm_PostBackOptions\((.*)\)/
        or Carp::croak 'Please submit a valid __doPostBack';

    $args =~ s/\\'/'/g;    # unescape JavaScript-escaped single quotes

    my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
    Carp::croak 'Please submit a valid __doPostBack'
        unless $eventTarget && $eventArgument;

    # strip the surrounding single or double quotes from each value
    s/^'// && s/'$// for ($eventTarget, $eventArgument);
    s/^\"// && s/\"$// for ($eventTarget, $eventArgument);

    return ($eventTarget, $eventArgument);
}

# Walk every link on the agent's current results page; for each "More"
# link (id matching ctl<N>_lnkMore) replay its __doPostBack on a cloned
# agent -- so the paging state of $agent is untouched -- and dump the
# resulting detail page via DumpData().
# Uses the file-level globals $agent, $searchform and $viewstate.
sub ProcessLinks
{
    my @links = $agent->links();
    foreach my $link (@links)
    {
	my $href = $link->url();
	my $id   = $link->attrs()->{id};

	# guard clauses: only links that carry a ctl<N>_lnkMore id matter
	next unless $id;
	next unless $id =~ /ctl(\d+)_lnkMore/;

	print "Found item $1\n";

	my $newagent = $agent->clone();
	my ($eventTarget, $eventArgument) = parseDoPostBack($href);

	# Replay the postback with the saved __VIEWSTATE so the server
	# accepts the request as originating from the current page.
	$newagent->submit_form(
	    form_name => $searchform,
	    fields    => {
		'__EVENTTARGET'   => $eventTarget,
		'__EVENTARGUMENT' => $eventArgument,
		'__VIEWSTATE'     => $viewstate,
	    },
	);

	DumpData($newagent->content());
    }
}
# Run the initial search on $site, then page through the result set:
# dump each results page (DumpAgent), scrape each record's detail page
# (ProcessLinks), and post back to the "next page" control until the
# next-link marker no longer appears in the page content.
# Uses the file-level option globals and $agent/$viewstate.
sub GetForm
{
    $agent->get($site);
    $agent->form_name($searchform);
    $agent->field($searchfield, $searchterm);
    $agent->click($searchbutton);

    ## this runs the search first; dump the first results page
    DumpAgent();

    $viewstate = $agent->field('__VIEWSTATE');

    my $content = $agent->content();

    # BUG FIX: the original escaped only '$' by hand before interpolating
    # $searchnext into the match; \Q...\E quotes ALL regex metacharacters,
    # so e.g. '.' in a control name can no longer match arbitrary text.
    while ($content =~ /\Q$searchnext\E/) # do we have more results?
    {
	# process the "More" links of the current page
	ProcessLinks();

	# request the next results page
	$agent->click($searchbutton);
	$agent->submit_form(
	    # NOTE(review): "Form" is hard-coded here while the search above
	    # uses $searchform -- presumably the outer DNN page form; confirm.
	    form_name => "Form",
	    fields    => {
		ScriptManager          => $formbase . '|' . $searchnext,
		'__EVENTTARGET'        => $searchnext,
		$searchfield           => $searchterm,
		'__VIEWSTATE'          => $viewstate,
		'__EVENTARGUMENT'      => '',
		"ScrollTop"            => '',
		"__dnnVariable"        => '',
		"__VIEWSTATEENCRYPTED" => '',
	    },
	);
	$viewstate = $agent->field('__VIEWSTATE');
	DumpAgent();

	$content = $agent->content();
    }
}


# Entry point: run the search and scrape every results page.
GetForm();


1;

In reply to Re^2: scraping ASP page, __VIEWSTATE problem by mdupont
in thread scraping ASP page, __VIEWSTATE problem by Anonymous Monk

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Are you posting in the right place? Check out Where do I post X? to know for sure.
  • Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
    <code> <a> <b> <big> <blockquote> <br /> <dd> <dl> <dt> <em> <font> <h1> <h2> <h3> <h4> <h5> <h6> <hr /> <i> <li> <nbsp> <ol> <p> <small> <strike> <strong> <sub> <sup> <table> <td> <th> <tr> <tt> <u> <ul>
  • Snippets of code should be wrapped in <code> tags not <pre> tags. In fact, <pre> tags should generally be avoided. If they must be used, extreme care should be taken to ensure that their contents do not have long lines (<70 chars), in order to prevent horizontal scrolling (and possible janitor intervention).
  • Want more info? How to link or How to display code and escape characters are good places to start.
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others learning in the Monastery: (5)
As of 2024-03-29 13:13 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found