Beefy Boxes and Bandwidth Generously Provided by pair Networks
XP is just a number

Re: scraping ASP page, __VIEWSTATE problem

by EvanCarroll (Chaplain)
on Sep 17, 2009 at 19:54 UTC ( #795961=note: print w/replies, xml ) Need Help??

in reply to scraping ASP page, __VIEWSTATE problem

I'm the author of HTML::TreeBuilderX::ASP_NET. It was brought to my attention that mst was trash talking me and my module in a channel I'm banned from (because he won't let an op unban me). Logs are as follows:
06:59 <go|dfish> you're being badmouthed in #Perl
06:59 <go|dfish> lol
07:00 <go|dfish> 12:58           mst │ Phurl: yeah, do note that the author of that module is banned from, well, everywhere
07:00 <go|dfish> 12:58           mst │ Phurl: the guy who wrote ::ASP_NET
07:00 <go|dfish> 12:58         Phurl │ what module? the asp one?
07:00 <go|dfish> 12:58         Phurl │ it sux
07:00 <go|dfish> 12:58           mst │ right
07:00 <go|dfish> 12:59           mst │ he's a known troll
07:00 <go|dfish> 12:59         Phurl │ god
07:00 <go|dfish> 12:59         Phurl │ even the synopsis is bad
07:00 <go|dfish> >_<
07:04 <go|dfish> oh man, harsh.
07:04 <go|dfish> 13:00           mst │ please don't think anything produced by that fucktarded sack of shit is "normal" Moose code :) 
Anyway, this module uses HTTP::Request::Form which uses HTML::TreeBuilder which currently has a bug in it. Otherwise, this would be a /very/ simple process.
A test case is in the works; essentially, HTML::TreeBuilder doesn't associate the __EVENT* fields with the form, so I can't generate a proper request without that data.

Evan Carroll
The most respected person in the whole perl community.
  • Comment on Re: scraping ASP page, __VIEWSTATE problem

Replies are listed 'Best First'.
Re^2: scraping ASP page, __VIEWSTATE problem
by mdupont (Scribe) on Sep 19, 2009 at 18:32 UTC
    Hi all,

    I have produced a new version of a simple walker to extract the data from an ASPX page.

    package ASPXXTRAKTOR;
    #    This program processes a ASPX database page and dumps the data.
    #    Copyright (C) 2009 James Michael Du Pont  <>
    #    This program is free software: you can redistribute it and/or modify
    #    it under the terms of the GNU Affero General Public License as
    #    published by the Free Software Foundation, either version 3 of the
    #    License, or (at your option) any later version.
    #    This program is distributed in the hope that it will be useful,
    #    but WITHOUT ANY WARRANTY; without even the implied warranty of
    #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    #    GNU Affero General Public License for more details.
    #    You should have received a copy of the GNU Affero General Public License
    #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    #   see also
    #   parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
    #   Copyright 2008 Evan Carroll, all rights reserved.
    #   L<>
    use strict;
    use warnings;
    use WWW::Mechanize;
    use LWP::UserAgent;
    use Getopt::Long;
    ## SPEC
    # Command-line configuration. Every option takes a string value and
    # defaults to the empty string when it is not supplied.
    my $site         = '';    # --site   (not referenced in the visible code; presumably the start URL)
    my $searchform   = '';    # --form   name of the form to submit (passed as form_name)
    my $searchfield  = '';    # --field  name of the form field that receives the search term
    my $searchbutton = '';    # --button (not referenced in the visible code; presumably the submit button)
    my $searchterm   = '';    # --term   text to search for, e.g. 'HOTEL'
    my $searchnext   = '';    # --next   postback target of the "next page" link
    my $formbase     = '';    # --base   prefix joined to --next for the ScriptManager field

    # GetOptions returns false when the command line could not be parsed
    # (unknown option, missing value). Stop here instead of scraping with a
    # half-filled configuration.
    my $result = GetOptions(
        "form=s"   => \$searchform,
        "site=s"   => \$site,
        "field=s"  => \$searchfield,
        "term=s"   => \$searchterm,
        "next=s"   => \$searchnext,
        "base=s"   => \$formbase,
        "button=s" => \$searchbutton,
    ) or die "Usage: $0 --site=URL --form=NAME --field=NAME --term=TEXT"
           . " --next=TARGET --base=NAME --button=NAME\n";
    # Make a filename-safe version of the search term for the result files.
    # Each whitespace character, '.', '-' or '~' becomes '_'. The pattern must
    # be a character class; written as a plain sequence it only matched the
    # literal run "<space>.-~" and so practically never replaced anything.
    our $searchtermname = $searchterm;
    $searchtermname =~ s/[\s.~-]/_/g;

    # Shared run state used by the subs below.
    our $agent = WWW::Mechanize->new( autocheck => 1 );
    our $pagenumber = 1;    # how many pages of data did we get
    our $itemnumber = 1;    # how many items did we see
    our $viewstate;         # current __VIEWSTATE form value; read from the
                            # page and sent back with every postback
    # DumpAgent: write the content of the page currently held by $agent to
    # DataExtractor_<searchtermname>_P<pagenumber>.htm in the working directory.
    # Takes no arguments; reads the globals $agent, $searchtermname and
    # $pagenumber. Dies if the file cannot be opened, written or closed.
    # Returns 1 on success.
    sub DumpAgent {
        my $content  = $agent->content();
        my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";

        # Lexical handle: the bareword OUT was a package-wide handle shared
        # with DumpData.
        open my $out, ">", $filename or die "Cannot open $filename: $!";
        print "creating new page :$filename\n";
        print {$out} $content or die "Cannot write $filename: $!";

        # Buffered write errors only surface at close, so check it.
        close $out or die "Cannot close $filename: $!";
        return 1;
    }
    # DumpData: write one record's content to
    # DataExtractor_<searchtermname>_DataPage<itemnumber>.htm in the working
    # directory.
    #   $content - the text to store.
    # Reads the globals $searchtermname and $itemnumber. Dies if the file
    # cannot be opened, written or closed. Returns 1 on success.
    sub DumpData {
        my $content  = shift;
        my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";

        # Lexical handle: the bareword OUT was a package-wide handle shared
        # with DumpAgent.
        open my $out, ">", $filename or die "Cannot open $filename: $!";
        print "creating new record $filename\n";
        print {$out} $content or die "Cannot write $filename: $!";

        # Buffered write errors only surface at close, so check it.
        close $out or die "Cannot close $filename: $!";
        return 1;
    }
    # parseDoPostBack: extract the postback target and argument from a link
    # href that contains a WebForm_PostBackOptions(...) call.
    # Derived from HTML-TreeBuilderX-ASP_NET,
    # Copyright 2008 Evan Carroll, all rights reserved.
    #   $href - the href text of the link.
    # Returns the list ($eventTarget, $eventArgument) with one pair of
    # surrounding single or double quotes removed from each value.
    # Croaks when $href holds no WebForm_PostBackOptions(...) call or the call
    # does not yield two non-empty, comma-separated arguments.
    sub parseDoPostBack {
        my $href = shift;
        require Carp;    # croak is used below; do not rely on another module loading Carp

        # Check the match itself. Without this, a failed match silently reused
        # whatever $1 the caller's last successful match had left behind.
        Carp::croak('Please submit a valid __doPostBack')
            unless defined $href
                && $href =~ /WebForm_PostBackOptions\((.*)\)/;

        # Copy the capture before editing it: $1 is read-only and cannot be
        # the target of s///.
        my $args = $1;
        $args =~ s/\\'/'/g;

        my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
        Carp::croak('Please submit a valid __doPostBack')
            unless $eventTarget && $eventArgument;

        # Strip a leading quote and, only when one was found, the trailing one:
        # first single quotes, then double quotes.
        for my $value ( $eventTarget, $eventArgument ) {
            $value =~ s/^'// and $value =~ s/'$//;
            $value =~ s/^"// and $value =~ s/"$//;
        }
        return ( $eventTarget, $eventArgument );
    }
    # ProcessLinks: walk every link on the page currently held by $agent. For
    # each link whose id attribute matches ctl<digits>_lnkMore, print the item
    # number, clone the agent, decode the postback target/argument from the
    # link's href with parseDoPostBack, and build the form fields for that
    # postback (__EVENTTARGET, __EVENTARGUMENT, __VIEWSTATE).
    # NOTE(review): this listing is damaged. The opening braces of the sub, of
    # the foreach and of both ifs are missing, and the code between the $fields
    # hash and "form_name => ..." (presumably the end of the hash and a
    # submit_form-style call on $newagent) has been lost. It does not compile
    # as shown; restore it from the author's original before running.
    sub ProcessLinks
        my @links = $agent->links();
        foreach my $l (@links)
    	my $href  = $l->url();
    	my $id  = $l->attrs()->{id};
    	# links without an id attribute are ignored
    	if ($id)
    #	    warn "Found $id and $href";
    	    # $1 is the number captured from the ctl<digits>_lnkMore id
    	    if ($id =~ /ctl(\d+)_lnkMore/)
    		print "Found item $1\n";
    		# work on a copy of the agent; $newagent is not used again in the visible lines
    		my $newagent = $agent->clone();
    		my ($eventTarget,$eventArgument) =parseDoPostBack($href);
    		# $oldtarget is not used again in the visible lines
    		my $oldtarget = $eventTarget;
    		my $fields =  {
    		    '__EVENTTARGET'    => $eventTarget,
    		    '__EVENTARGUMENT'  => $eventArgument,
    		    '__VIEWSTATE'      => $viewstate,
    		    # NOTE(review): lines are missing here -- the two keys below look like
    		    # arguments of a separate call that received $fields, not hash entries.
    		    form_name => $searchform,
    		    fields => $fields,
    		} # if pattern match
    	    }# if id
    	} ## foreach
    # GetForm: fill the search field with the search term, remember the page's
    # __VIEWSTATE, and then loop for as long as the page content still contains
    # the "next" postback target, posting a form that requests the next page
    # and refreshing $viewstate and $content from the response.
    # NOTE(review): this listing is damaged. The braces of the sub and of the
    # while loop are missing, as are the statement that submits the search and
    # the call that owns the "form_name => ..., fields => {...}" arguments
    # (presumably $agent->submit_form). It does not compile as shown; restore
    # it from the author's original before running.
    sub GetForm
        $agent->field($searchfield, $searchterm);
        ## this runs the search first
        $viewstate = $agent->field('__VIEWSTATE');
        my $content = $agent->content();
        # Escape '$' so the target can be used as a regex below.
        # NOTE(review): only '$' is escaped; any other regex metacharacter in
        # --next is still interpreted by the match. \Q...\E would cover them all.
        my $searchnextl = $searchnext;
        $searchnextl =~ s/\$/\\\$/g;
        while ($content =~ /${searchnextl}/) # do we have results?
    ### EDIT// had to process the links of the first page!
    	# now recurse into the record
    	#get the next one
    	    # NOTE(review): the start of the call taking these arguments is missing
    	    form_name => "Form",
    	    fields => {
    		ScriptManager      => $formbase . '|'. $searchnext ,
    		'__EVENTTARGET'    => $searchnext,
    		$searchfield => $searchterm,
    		'__VIEWSTATE' => $viewstate,
    		'__EVENTARGUMENT'  => '',
    		"ScrollTop" => '',
    		"__dnnVariable" => '',
    	# pick up the view state and content of the page just loaded for the next pass
    	$viewstate = $agent->field('__VIEWSTATE');
    	$content = $agent->content();
        I am having a problem with a new site and need your help. The main entry point is here. The webpage does not respond at all to my POST request.

        Here is my current revision : BZR Repo copy

        Unfortunately I cannot get it to work, and I get no debug message back from the server.

        The server I am trying to interface with is here: form. You can call it like this:

        perl --fname=John --lname=Smith--dob=12 --mob=Pri + --yob=1980
        The code dumps out all the data needed when you run it. Here is the output from Tamper Data when I run it in Firefox: tamper data. Here is the log file: Log File. Any tips or help would be appreciated.



Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://795961]
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others examining the Monastery: (11)
As of 2018-03-23 11:52 GMT
Find Nodes?
    Voting Booth?
    When I think of a mole I think of:

    Results (292 votes). Check out past polls.