Beefy Boxes and Bandwidth Generously Provided by pair Networks
XP is just a number

Re^2: scraping ASP page, __VIEWSTATE problem

by mdupont (Scribe)
on Sep 19, 2009 at 18:32 UTC ( #796318=note: print w/replies, xml ) Need Help??

in reply to Re: scraping ASP page, __VIEWSTATE problem
in thread scraping ASP page, __VIEWSTATE problem

Hi all,

I have produced a new version of a simple walker to extract the data from an ASPX page.



#    This program processes an ASPX database page and dumps the data.
#    Copyright (C) 2009 James Michael Du Pont  <>
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU Affero General Public License as
#    published by the Free Software Foundation, either version 3 of the
#    License, or (at your option) any later version.
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU Affero General Public License for more details.
#    You should have received a copy of the GNU Affero General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.

#   see also
#   parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
#   Copyright 2008 Evan Carroll, all rights reserved.
#   L<>

use strict;
use warnings;

use WWW::Mechanize;
use LWP::UserAgent;

use Getopt::Long;

# Command-line configuration.  Every setting defaults to the empty string
# and is overridden by the matching --name=value switch.
my $site         = '';    # --site   : not referenced in the visible part of the script
my $searchform   = '';    # --form   : form name posted when a record link is followed
my $searchfield  = '';    # --field  : name of the form field that receives the search term
my $searchbutton = '';    # --button : not referenced in the visible part of the script
my $searchterm   = '';    # --term   : text to search for, e.g. 'HOTEL'
my $searchnext   = '';    # --next   : control name used to detect and request the next page
my $formbase     = '';    # --base   : prefix of the ScriptManager field value

# GetOptions returns false when it meets an unknown or malformed switch.
# Stop there instead of scraping with half-empty settings.
my $result = GetOptions(
    "form=s"   => \$searchform,
    "site=s"   => \$site,
    "field=s"  => \$searchfield,
    "term=s"   => \$searchterm,
    "next=s"   => \$searchnext,
    "base=s"   => \$formbase,
    "button=s" => \$searchbutton,
) or die "Usage: $0 --site=SITE --form=NAME --field=NAME --term=TEXT "
       . "--next=CONTROL --base=NAME --button=NAME\n";

# Build the tag used in the dump file names from the search term: every
# whitespace character, '.', '-' and '~' is replaced by '_'.
# The pattern has to be a character class; written without the brackets it
# only matched the literal sequence "<whitespace>.-~" and so never fired
# on ordinary search terms.
our $searchtermname = $searchterm;
$searchtermname =~ s/[\s.\-~]/_/g;

# Shared state used by the subs below.
our $agent      = WWW::Mechanize->new( autocheck => 1 );    # the browsing session
our $pagenumber = 1;    # how many pages of data did we get (used in DumpAgent file names)
our $itemnumber = 1;    # how many items did we see (used in DumpData file names)
our $viewstate;         # opaque server state carried in the __VIEWSTATE form field

# Save the HTML currently held by the shared $agent to
# DataExtractor_<searchtermname>_P<pagenumber>.htm in the working directory
# and announce the file name on STDOUT.
#
# Takes no arguments.  Returns 1 on success; dies when the file cannot be
# opened or when buffered output fails to flush on close.
sub DumpAgent {
    my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";
    my $content  = $agent->content();

    # A lexical handle avoids the shared bareword OUT handle.
    open my $out, '>', $filename or die "Cannot open $filename: $!";
    print "creating new page :$filename\n";
    print {$out} $content;

    # Write errors on a buffered handle only surface at close time.
    close $out or die "Cannot close $filename: $!";
    return 1;
}

# Save one record page to
# DataExtractor_<searchtermname>_DataPage<itemnumber>.htm in the working
# directory and announce the file name on STDOUT.
#
# Takes:   $content - the text to write.
# Returns: 1 on success; dies when the file cannot be opened or when
#          buffered output fails to flush on close.
sub DumpData {
    my ($content) = @_;
    my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";

    # A lexical handle avoids the shared bareword OUT handle.
    open my $out, '>', $filename or die "Cannot open $filename: $!";
    print "creating new record $filename\n";
    print {$out} $content;

    # Write errors on a buffered handle only surface at close time.
    close $out or die "Cannot close $filename: $!";
    return 1;
}

# Extract the postback event target and event argument from a link href
# that contains a "WebForm_PostBackOptions(...)" call.
#
# Derived from HTML-TreeBuilderX-ASP_NET,
# Copyright 2008 Evan Carroll, all rights reserved.
#
# Takes:   $href - the href string of the link.
# Returns: ($eventTarget, $eventArgument), each with one pair of enclosing
#          single or double quotes removed.
# Croaks when $href is undefined, contains no WebForm_PostBackOptions(...)
# call, or when either of its first two arguments is missing.
sub parseDoPostBack {
    my $href = shift;

    require Carp;    # croak is called fully qualified; make sure Carp is loaded

    Carp::croak('Please submit a valid __doPostBack') unless defined $href;

    # Copy the capture into a lexical at once: $1 is read-only (editing it
    # in place dies) and is reset by the next successful match.
    my ($args) = $href =~ /WebForm_PostBackOptions\((.*)\)/;
    Carp::croak('Please submit a valid __doPostBack') unless defined $args;

    # Turn escaped quotes (\') back into plain quotes.
    $args =~ s/\\'/'/g;

    # Only the first two comma-separated arguments are of interest.
    my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
    Carp::croak('Please submit a valid __doPostBack')
        unless $eventTarget && $eventArgument;

    for ( $eventTarget, $eventArgument ) {
        s/^'// && s/'$//;      # strip enclosing single quotes
        s/^\"// && s/\"$//;    # strip enclosing double quotes
    }
    return ( $eventTarget, $eventArgument );
}

# Walk every link on the agent's current page and pick out the ones whose
# id attribute matches /ctl(\d+)_lnkMore/.  For each of those the agent is
# cloned and the link's postback target/argument are combined with the
# saved __VIEWSTATE into a field set.
#
# NOTE(review): this listing looks truncated.  The braces that open the
# sub, the loop and both ifs, the end of the $fields hash, and the call
# that receives form_name/fields are not present -- presumably a
# submit_form() on $newagent followed by saving the result; confirm
# against the original script before relying on this.
sub ProcessLinks
    my @links = $agent->links();
    foreach my $l (@links)
	my $href  = $l->url();
	my $id  = $l->attrs()->{id};
	# Links without an id attribute are ignored.
	if ($id)
#	    warn "Found $id and $href";
	    if ($id =~ /ctl(\d+)_lnkMore/)
		print "Found item $1\n";
		# Work on a copy of the agent rather than on $agent itself.
		my $newagent = $agent->clone();
		my ($eventTarget,$eventArgument) =parseDoPostBack($href);
		# $oldtarget is not used anywhere in the visible code.
		my $oldtarget = $eventTarget;

		# Postback fields built from the parsed link and the saved state.
		my $fields =  {
		    '__EVENTTARGET'    => $eventTarget,
		    '__EVENTARGUMENT'  => $eventArgument,
		    '__VIEWSTATE'      => $viewstate,


		    # NOTE(review): the next two pairs look like arguments of a
		    # separate call (the hash refers to itself via $fields);
		    # the lines in between appear to be missing.
		    form_name => $searchform,
		    fields => $fields,


		} # if pattern match
	    }# if id
	} ## foreach

# Put the search term into the search field, remember the page's
# __VIEWSTATE, then repeat for as long as the page content still contains
# the "next" control name, refreshing $viewstate and $content each round.
#
# NOTE(review): this listing looks truncated.  The braces of the sub and
# the loop, the statements that load the page and submit the search, the
# call into record processing, and the call that owns the
# form_name/fields arguments below are not present -- confirm against the
# original script before relying on this.
sub GetForm
    $agent->field($searchfield, $searchterm);

    ## this runs the search first 

    # Remember the state value of the current page.
    $viewstate = $agent->field('__VIEWSTATE');

    my $content = $agent->content();
    # Escape '$' so the control name is matched literally by the loop
    # pattern; other regex metacharacters are left as they are.
    my $searchnextl = $searchnext;
    $searchnextl =~ s/\$/\\\$/g;

    while ($content =~ /${searchnextl}/) # do we have results?
### EDIT// had to process the links of the first page!
	# now recurse into the record

	#get the next one
	# NOTE(review): presumably the arguments of a submit_form() call whose
	# opening and closing lines are missing.
	    form_name => "Form",
	    fields => {
		ScriptManager      => $formbase . '|'. $searchnext ,
		'__EVENTTARGET'    => $searchnext,
		$searchfield => $searchterm,
		'__VIEWSTATE' => $viewstate,
		'__EVENTARGUMENT'  => '',
		"ScrollTop" => '',
		"__dnnVariable" => '',
	# Pick up the state and content of the page that is current now.
	$viewstate = $agent->field('__VIEWSTATE');

	$content = $agent->content();



  • Comment on Re^2: scraping ASP page, __VIEWSTATE problem

Replies are listed 'Best First'.
Re^3: scraping ASP page, __VIEWSTATE problem
by mdupont (Scribe) on Sep 20, 2009 at 10:51 UTC
      I am having a problem with a new site and need your help. The main entry point is here. The web page does not respond at all to my POST.

      Here is my current revision : BZR Repo copy

      Unfortunately I cannot get it to work, and I get no debug message back from the server.

      The server I am trying to interface with is here: form. You can call it like this:

      perl --fname=John --lname=Smith --dob=12 --mob=Pri --yob=1980
      The code dumps out all the data needed when you run it. Here is the output from Tamper Data when I run it in Firefox: tamper data. Here is the log file: Log File. Any tips or help would be appreciated.



        As usual, track with Wireshark the difference between what your Perl script sends and what your browser sends when you access the page with it.

        Also, consider just automating the page using WWW::Scripter or WWW::Mechanize::Firefox, both of which know how to handle Javascript.

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://796318]
[marto]: and I'm off next week1
[Corion]: Whee! ;)
[Corion]: I was away last week already, so I know how good a week off is ;))
[Discipulus]: a barter with your time (above the poorline time is always better than money)

How do I use this? | Other CB clients
Other Users?
Others lurking in the Monastery: (7)
As of 2017-02-28 10:16 GMT
Find Nodes?
    Voting Booth?
    Before electricity was invented, what was the Electric Eel called?

    Results (399 votes). Check out past polls.