http://www.perlmonks.org?node_id=795961


in reply to scraping ASP page, __VIEWSTATE problem

This node falls below the community's threshold of quality. You may see it by logging in.
  • Comment on Re: scraping ASP page, __VIEWSTATE problem

Replies are listed 'Best First'.
Re^2: scraping ASP page, __VIEWSTATE problem
by mdupont (Scribe) on Sep 19, 2009 at 18:32 UTC
    Hi all,

    I have produced a new version of a simple walker to extract the data from an ASPX page.

    #!/usr/bin/perl
    
    package ASPXXTRAKTOR;
    
    #    HAPPY SOFTWARE FREEDOM DAY 2009!
    #    
    #    This program processes a ASPX database page and dumps the data.
    #    Copyright (C) 2009 James Michael Du Pont  <h4ck3rm1k3@flossk.org>
    #
    #    This program is free software: you can redistribute it and/or modify
    #    it under the terms of the GNU Affero General Public License as
    #    published by the Free Software Foundation, either version 3 of the
    #    License, or (at your option) any later version.
    #
    #    This program is distributed in the hope that it will be useful,
    #    but WITHOUT ANY WARRANTY; without even the implied warranty of
    #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    #    GNU Affero General Public License for more details.
    #
    #    You should have received a copy of the GNU Affero General Public License
    #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    
    #   see also http://www.perlmonks.org/index.pl?node_id=786884
    #
    #   parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
    #   Copyright 2008 Evan Carroll, all rights reserved.
    #   L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
    
    use strict;
    use warnings;

    use Carp ();
    use Getopt::Long;
    use LWP::UserAgent;
    use WWW::Mechanize;
    
    ## SPEC
    # Command-line configuration.  Every option takes a string value and
    # defaults to the empty string when it is not supplied.
    my $site         = '';    # --site:   URL fetched to obtain the search form
    my $searchform   = '';    # --form:   name of the form to select and submit
    my $searchfield  = '';    # --field:  name of the form field holding the term
    my $searchbutton = '';    # --button: name of the button clicked to search
    my $searchterm   = '';    # --term:   value typed into the field, e.g. 'HOTEL'
    my $searchnext   = '';    # --next:   control name posted as __EVENTTARGET to page
    my $formbase     = '';    # --base:   prefix of the ScriptManager value

    my $result = GetOptions(
        "form=s"   => \$searchform,
        "site=s"   => \$site,
        "field=s"  => \$searchfield,
        "term=s"   => \$searchterm,
        "next=s"   => \$searchnext,
        "base=s"   => \$formbase,
        "button=s" => \$searchbutton,
    );

    # GetOptions returns false when the command line could not be parsed
    # (unknown option, missing value).  Stop here instead of scraping with a
    # partially filled configuration.
    die "Usage: $0 --site=URL --form=NAME --field=NAME --term=TEXT "
      . "--button=NAME --next=NAME --base=NAME\n"
      unless $result;
    
    
    # make a filename for the results
    # Every whitespace character, dot, dash and tilde in the search term is
    # replaced by an underscore.  A character class is required here: without
    # the brackets the pattern only matches those four characters in sequence.
    our $searchtermname = $searchterm;
    $searchtermname =~ s/[\s.\-~]/_/g;

    our $agent = WWW::Mechanize->new( autocheck => 1 );
    our $pagenumber = 1; # how many pages of data did we get
    our $itemnumber = 1; # how many items did we see
    our $viewstate; # the encrypted state of the server, stored on the client crap
    
    # Save the content of the page currently held by $agent to
    # "DataExtractor_<searchtermname>_P<pagenumber>.htm", announce the file on
    # STDOUT and advance $pagenumber.  Dies if the file cannot be opened or if
    # closing it fails.
    sub DumpAgent
    {
        my $content  = $agent->content();
        my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";

        # Lexical handle: a bareword OUT would be a package-wide global shared
        # with DumpData.
        open my $out, ">", $filename or die "Cannot open $filename: $!";
        print "creating new page :$filename\n";
        $pagenumber++;
        print {$out} $content;

        # Buffered write errors only surface at close time.
        close $out or die "Cannot close $filename: $!";
    }
    
    # Save one record page to
    # "DataExtractor_<searchtermname>_DataPage<itemnumber>.htm", announce the
    # file on STDOUT and advance $itemnumber.
    #
    # Takes the page content to write as its only argument.  Dies if the file
    # cannot be opened or if closing it fails.
    sub DumpData
    {
        my $content  = shift;
        my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";

        # Lexical handle: a bareword OUT would be a package-wide global shared
        # with DumpAgent.
        open my $out, ">", $filename or die "Cannot open $filename: $!";
        print "creating new record $filename\n";
        $itemnumber++;
        print {$out} $content;

        # Buffered write errors only surface at close time.
        close $out or die "Cannot close $filename: $!";
    }
    
    # Extract the event target and event argument from a JavaScript postback
    # link.
    #
    # Takes the href text, which must contain either
    # "WebForm_PostBackOptions(...)" or "__doPostBack(...)".  Returns the list
    # ($eventTarget, $eventArgument) with one pair of surrounding single or
    # double quotes removed from each value.  Croaks when the href is undefined,
    # contains neither call, or supplies fewer than two non-empty arguments.
    sub parseDoPostBack {
    #taken from 
    #Copyright 2008 Evan Carroll, all rights reserved.
    #L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>

        my $href = shift;

        # Check the match itself: on failure $1 would still hold whatever an
        # earlier successful match (possibly in the caller) captured.
        Carp::croak 'Please submit a valid __doPostBack'
            unless defined $href
                && $href =~ /(?:WebForm_PostBackOptions|__doPostBack)\((.*)\)/;

        # Copy the capture before editing it; $1 itself is read-only.
        my $args = $1;
        $args =~ s/\\'/'/g;      # un-escape \' as found inside quoted attributes
        $args =~ s/^\s+//;       # tolerate padding inside the parentheses
        $args =~ s/\s+$//;

        my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;

        # The values are still quoted here, so an empty argument arrives as ''
        # or "" and passes the length test; only a missing one is rejected.
        Carp::croak 'Please submit a valid __doPostBack'
            unless defined($eventTarget)
                && length($eventTarget)
                && defined($eventArgument)
                && length($eventArgument);

        for my $value ( $eventTarget, $eventArgument ) {
            # A trailing quote is only removed when a leading one was found.
            $value =~ s/^'// && $value =~ s/'$//;

            #added a filter for "
            $value =~ s/^\"// && $value =~ s/\"$//;
        }

        return ( $eventTarget, $eventArgument );
    }
    
    # Walk the links of the page currently held by $agent.  For every link
    # whose id attribute matches ctl<digits>_lnkMore, a clone of the agent posts
    # the link's event target and argument together with the saved $viewstate
    # through the form named $searchform, and the response is handed to
    # DumpData.
    sub ProcessLinks
    {
        for my $link ( $agent->links() )
        {
            my $href = $link->url();
            my $id   = $link->attrs()->{id};

            # Links without an id, or with an unrelated one, are skipped.
            next unless $id;
            next unless $id =~ /ctl(\d+)_lnkMore/;

            print "Found item $1\n";

            # Submit from a clone so the listing page held by $agent stays put.
            my $newagent = $agent->clone();
            my ( $eventTarget, $eventArgument ) = parseDoPostBack($href);

            $newagent->submit_form(
                form_name => $searchform,
                fields    => {
                    '__EVENTTARGET'   => $eventTarget,
                    '__EVENTARGUMENT' => $eventArgument,
                    '__VIEWSTATE'     => $viewstate,
                },
            );

            DumpData( $newagent->content() );
        }

    }
    # Run the search and walk through the result pages.
    #
    # Fetches $site, fills $searchfield of the form $searchform with
    # $searchterm and clicks $searchbutton.  The first result page is dumped;
    # then, for as long as the current page contains the text $searchnext, the
    # record links on it are processed and a postback targeting $searchnext is
    # submitted to fetch the following page, which is dumped as well.
    sub GetForm
    {
        $agent->get($site);
        $agent->form_name($searchform);
        $agent->field($searchfield, $searchterm);
        $agent->click($searchbutton);

        ## this runs the search first
        DumpAgent();

        # Kept in the global because ProcessLinks posts it back as well.
        $viewstate = $agent->field('__VIEWSTATE');

        my $content = $agent->content();

        # $searchnext is a control name, not a pattern, so look for it as a
        # literal substring.  The length guard matters: an empty needle is
        # "found" in every page and would make this loop endless.
        # NOTE(review): a page that does not contain $searchnext never reaches
        # ProcessLinks; if the last result page lacks the marker, its records
        # are not dumped -- confirm against the target site.
        while ( length($searchnext) && index( $content, $searchnext ) >= 0 ) # do we have results?
        {
    ### EDIT// had to process the links of the first page!
            # now recurse into the record
            ProcessLinks();

            #get the next one
            $agent->click($searchbutton);

            # NOTE(review): the form name is hard-coded here while the rest of
            # the script uses $searchform -- confirm this is intended.
            $agent->submit_form(
                form_name => "Form",
                fields => {
                    ScriptManager      => $formbase . '|'. $searchnext ,
                    '__EVENTTARGET'    => $searchnext,
                    $searchfield => $searchterm,
                    '__VIEWSTATE' => $viewstate,
                    '__EVENTARGUMENT'  => '',
                    "ScrollTop" => '',
                    "__dnnVariable" => '',
                    "__VIEWSTATEENCRYPTED" =>''
                },
                );
            $viewstate = $agent->field('__VIEWSTATE');
            DumpAgent();

            $content = $agent->content();
        }

    }
    
    
    GetForm();
    
    
    1;
    
        I am having a problem with a new site and need your help. The main entry point is here: http://www.kqz-ks.org/SKQZ-WEB/en/shv/vbk.html — the webpage does not respond at all to my POST.

        Here is my current revision : BZR Repo copy

        Unfortunately I cannot get it to work, and I get no debug message back from the server.

        The server I am trying to interface with is here: form. You can call the script like this:

        perl aspxtraktor_vote.pl --fname=John --lname=Smith --dob=12 --mob=Pri + --yob=1980
        The code dumps out all the data needed when you run it. Here is the output from Tamper Data when I run it in Firefox: tamper data. Here is the log file: Log File. Any tips or help would be appreciated.

        thanks,

        mike