Beefy Boxes and Bandwidth Generously Provided by pair Networks RobOMonk
Pathologically Eclectic Rubbish Lister
 
PerlMonks  

Re^2: scraping ASP page, __VIEWSTATE problem

by mdupont (Scribe)
on Sep 19, 2009 at 18:32 UTC ( #796318=note: print w/ replies, xml ) Need Help??


in reply to Re: scraping ASP page, __VIEWSTATE problem
in thread scraping ASP page, __VIEWSTATE problem

Hi all,

I have produced a new version of a simple walker to extract the data from an ASPX page.

#!/usr/bin/perl

package ASPXXTRAKTOR;

#    HAPPY SOFTWARE FREEDOM DAY 2009!
#    
#    This program processes an ASPX database page and dumps the data.
#    Copyright (C) 2009 James Michael Du Pont  <h4ck3rm1k3@flossk.org>
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

#   see also http://www.perlmonks.org/index.pl?node_id=786884
#
#   parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
#   Copyright 2008 Evan Carroll, all rights reserved.
#   L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>

use strict;
use warnings;

use WWW::Mechanize;
use LWP::UserAgent;

use Getopt::Long;

## SPEC
# Command-line configuration.  Every option defaults to the empty string.
my $site         = '';   # --site   : URL fetched first (passed to $agent->get)
my $searchform   = '';   # --form   : name of the HTML form holding the search
my $searchfield  = '';   # --field  : name of the form field that takes the term
my $searchbutton = '';   # --button : name of the button clicked to run the search
my $searchterm   = '';   # --term   : text typed into the search field, e.g. 'HOTEL'
my $searchnext   = '';   # --next   : __EVENTTARGET of the "next page" postback;
                         #            also the marker looked for in each page
my $formbase     = '';   # --base   : prefix joined to --next with '|' for the
                         #            ScriptManager field


# GetOptions returns false when it meets an unknown or malformed option (it
# has already printed the reason to STDERR).  Stop here in that case rather
# than scraping with a half-filled configuration.
my $result = GetOptions (
    "form=s"   => \$searchform, # string
    "site=s"   => \$site, # string
    "field=s"  => \$searchfield, # string
    "term=s"   => \$searchterm, # string
    "next=s"   => \$searchnext, # string
    "base=s"   => \$formbase, # the base of the form
    "button=s" => \$searchbutton) # button
    or die "Usage: $0 --site=URL --form=NAME --field=NAME --term=TEXT "
         . "--button=NAME --next=TARGET --base=NAME\n";


# Make a file-name-safe copy of the search term for the result files.
our $searchtermname = $searchterm;
# Replace every whitespace, '.', '-' or '~' character with '_'.  The
# characters must be in a bracketed class: written as a plain sequence the
# pattern only matches those four characters appearing together in that order.
$searchtermname =~ s/[\s.\-~]/_/g;

# Shared state used by all of the subs below.
our $agent = WWW::Mechanize->new( autocheck => 1 ); # main browser session; autocheck is enabled so failed requests are presumably fatal (see WWW::Mechanize docs)
our $pagenumber = 1; # index used in the name of the next result-page file; DumpAgent increments it
our $itemnumber = 1; # index used in the name of the next record file; DumpData increments it
our $viewstate; # latest __VIEWSTATE form value read from the server; sent back with each postback

# DumpAgent()
#
# Write the main agent's current page content to
# "DataExtractor_<term>_P<n>.htm" in the current directory, then advance
# $pagenumber so the next call writes a new file.
#
# Takes no arguments; reads the file-level $agent, $searchtermname and
# $pagenumber.  Dies if the file cannot be opened or closed.
sub DumpAgent
{
    my $content  = $agent->content();
    my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";

    # Lexical handle rather than the bareword OUT, which is a package-wide
    # global that DumpData would otherwise share.
    open my $out, ">", $filename or die "Cannot open $filename: $!";
    print "creating new page :$filename\n";
    $pagenumber++;
    print {$out} $content;

    # Buffered write errors (for example a full disk) only show up at close.
    close $out or die "Cannot close $filename: $!";
}

# DumpData($content)
#
# Write $content (the body of one record page) to
# "DataExtractor_<term>_DataPage<n>.htm" in the current directory, then
# advance $itemnumber so the next record gets its own file.
#
# Reads the file-level $searchtermname and $itemnumber.  Dies if the file
# cannot be opened or closed.
sub DumpData
{
    my $content  = shift;
    my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";

    # Lexical handle rather than the bareword OUT, which is a package-wide
    # global that DumpAgent would otherwise share.
    open my $out, ">", $filename or die "Cannot open $filename: $!";
    print "creating new record $filename\n";
    $itemnumber++;
    print {$out} $content;

    # Buffered write errors (for example a full disk) only show up at close.
    close $out or die "Cannot close $filename: $!";
}

# parseDoPostBack($href)
#
# Extract the first two arguments of a "WebForm_PostBackOptions(...)" call
# embedded in a link's href and return them as
# ($eventTarget, $eventArgument) with one layer of surrounding single or
# double quotes removed.
#
# Croaks with 'Please submit a valid __doPostBack' when the href does not
# contain such a call or when fewer than two usable arguments are found.
#
# taken from
# Copyright 2008 Evan Carroll, all rights reserved.
# L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
sub parseDoPostBack {
    my $href = shift;

    # Loaded here so croak is available even if no other module pulled it in.
    require Carp;

    # Check the match explicitly: on failure $1 would be undef or left over
    # from an earlier, unrelated match.
    Carp::croak 'Please submit a valid __doPostBack'
        unless defined $href
            && $href =~ /WebForm_PostBackOptions\((.*)\)/;

    # Copy the capture before editing it: $1 is read-only, so substituting
    # on it directly dies as soon as an escaped quote is present.
    my $args = $1;
    $args =~ s/\\'/'/g;

    my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
    Carp::croak 'Please submit a valid __doPostBack'
        unless $eventTarget && $eventArgument;

    for ( $eventTarget, $eventArgument ) {
        s/^'// && s/'$//;

        #added a filter for "
        s/^\"// && s/\"$//;
    }
    return ( $eventTarget, $eventArgument );
}

# ProcessLinks()
#
# Walk every link on the main agent's current page.  For each link whose id
# attribute matches ctl<digits>_lnkMore, replay its postback on a clone of
# the agent (so the main agent stays on the result list) and hand the
# resulting page to DumpData.
#
# Takes no arguments; reads the file-level $agent, $searchform and
# $viewstate.
sub ProcessLinks
{
    for my $link ($agent->links())
    {
        my $href   = $link->url();
        my $linkid = $link->attrs()->{id};

        # Only links carrying a "more details" id are of interest.
        next unless $linkid;
        next unless $linkid =~ /ctl(\d+)_lnkMore/;

        print "Found item $1\n";

        my $detailagent = $agent->clone();
        my ($eventTarget, $eventArgument) = parseDoPostBack($href);

        $detailagent->submit_form(
            form_name => $searchform,
            fields    => {
                '__EVENTTARGET'    => $eventTarget,
                '__EVENTARGUMENT'  => $eventArgument,
                '__VIEWSTATE'      => $viewstate,
            },
        );

        DumpData($detailagent->content());
    }

}
# GetForm()
#
# Drive the whole scrape: load $site, fill $searchfield with $searchterm in
# the form named $searchform, click $searchbutton, and save the first result
# page.  Then, for as long as the current page contains the $searchnext
# marker, fetch the records linked from it (ProcessLinks), post the
# "next page" event and save the page that comes back.
#
# Takes no arguments; reads the file-level configuration variables and
# updates $viewstate after every page.
sub GetForm
{
    $agent->get($site);
    $agent->form_name($searchform);
    $agent->field($searchfield, $searchterm);
    $agent->click($searchbutton);

    ## this runs the search first
    DumpAgent();

    $viewstate = $agent->field('__VIEWSTATE');

    my $content = $agent->content();

    # Look for the marker as a literal substring.  Control names contain
    # '$' and may contain other regex metacharacters, so matching them as a
    # pattern is unsafe; an empty marker is treated as "no paging" because
    # an empty pattern or substring would match every page.
    #
    # NOTE(review): the page on which the marker is absent never reaches
    # ProcessLinks -- confirm whether the last result page can hold records.
    while (length($searchnext) && index($content, $searchnext) >= 0) # do we have results?
    {
### EDIT// had to process the links of the first page!
        # now recurse into the record
        ProcessLinks();

        #get the next one
        # NOTE(review): the search button is clicked again before the
        # "next" postback is submitted with the saved view state; kept
        # as-is -- confirm it is required by the target site.
        $agent->click($searchbutton);
        $agent->submit_form(
            form_name => "Form",
            fields => {
                ScriptManager      => $formbase . '|'. $searchnext ,
                '__EVENTTARGET'    => $searchnext,
                $searchfield => $searchterm,
                '__VIEWSTATE' => $viewstate,
                '__EVENTARGUMENT'  => '',
                "ScrollTop" => '',
                "__dnnVariable" => '',
                "__VIEWSTATEENCRYPTED" =>''
            },
            );
        $viewstate = $agent->field('__VIEWSTATE');
        DumpAgent();

        $content = $agent->content();
    }

}


# Run the scrape using the configuration taken from the command line.
GetForm();


# True value so the file can also be loaded with require/use.
1;


Comment on Re^2: scraping ASP page, __VIEWSTATE problem
Re^3: scraping ASP page, __VIEWSTATE problem
by mdupont (Scribe) on Sep 20, 2009 at 10:51 UTC
      I am having a problem with a new site, and need your help. The main entry point is here: http://www.kqz-ks.org/SKQZ-WEB/en/shv/vbk.html The webpage does not respond at all to my POST.

      Here is my current revision : BZR Repo copy

      Unfortunately I cannot get it to work, and I get no debug message back from the server.

      The server I am trying to interface with is here: form. You can call the script like this:

      perl aspxtraktor_vote.pl --fname=John --lname=Smith --dob=12 --mob=Pri --yob=1980
      The code dumps out all the data needed when you run it. Here is the output from Tamper Data when I run it in Firefox: tamper data. Here is the log file: Log File. Any tips or help would be appreciated.

      thanks,

      mike

        As usual, track with Wireshark the difference between what your Perl script sends and what your browser sends when you access the page with it.

        Also, consider just automating the page using WWW::Scripter or WWW::Mechanize::Firefox, both of which know how to handle Javascript.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://796318]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others imbibing at the Monastery: (10)
As of 2014-04-16 23:56 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    April first is:







    Results (436 votes), past polls