Beefy Boxes and Bandwidth Generously Provided by pair Networks
go ahead... be a heretic
 
PerlMonks  

Re: scraping ASP page, __VIEWSTATE problem

by EvanCarroll (Chaplain)
on Sep 17, 2009 at 19:54 UTC ( #795961=note: print w/ replies, xml ) Need Help??


in reply to scraping ASP page, __VIEWSTATE problem

I'm the author of HTML::TreeBuilderX::ASP_NET. It was brought to my attention that mst was trash talking me and my module in a channel I'm banned from (because he won't let an op unban me). Logs are as follows:

06:59 <go|dfish> you're being badmouthed in #Perl
06:59 <go|dfish> lol
07:00 <go|dfish> 12:58           mst │ Phurl: yeah, do note that the author of that module is banned from, well, everywhere
07:00 <go|dfish> 12:58           mst │ Phurl: the guy who wrote ::ASP_NET
07:00 <go|dfish> 12:58         Phurl │ what module? the asp one?
07:00 <go|dfish> 12:58         Phurl │ it sux
07:00 <go|dfish> 12:58           mst │ right
07:00 <go|dfish> 12:59           mst │ he's a known troll
07:00 <go|dfish> 12:59         Phurl │ god
07:00 <go|dfish> 12:59         Phurl │ even the synopsis is bad
07:00 <go|dfish> >_<
07:04 <go|dfish> oh man, harsh.
07:04 <go|dfish> 13:00           mst │ please don't think anything produced by that fucktarded sack of shit is "normal" Moose code :) 
Anyway, this module uses HTTP::Request::Form which uses HTML::TreeBuilder which currently has a bug in it. Otherwise, this would be a /very/ simple process.
A test case in the works is at http://gist.github.com/188669, essentially HTML::TreeBuilder doesn't associate the __EVENT* with the form so I can't generate a proper request without that data.


Evan Carroll
The most respected person in the whole perl community.
www.EvanCarroll.com


Comment on Re: scraping ASP page, __VIEWSTATE problem
Re^2: scraping ASP page, __VIEWSTATE problem
by mdupont (Scribe) on Sep 19, 2009 at 18:32 UTC
    Hi all,

    I have produced a new version of a simple walker to extract the data from an ASPX page.

    #!/usr/bin/perl
    
    package ASPXXTRAKTOR;
    
    #    HAPPY SOFTWARE FREEDOM DAY 2009!
    #    
    #    This program processes a ASPX database page and dumps the data.
    #    Copyright (C) 2009 James Michael Du Pont  <h4ck3rm1k3@flossk.org>
    #
    #    This program is free software: you can redistribute it and/or modify
    #    it under the terms of the GNU Affero General Public License as
    #    published by the Free Software Foundation, either version 3 of the
    #    License, or (at your option) any later version.
    #
    #    This program is distributed in the hope that it will be useful,
    #    but WITHOUT ANY WARRANTY; without even the implied warranty of
    #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    #    GNU Affero General Public License for more details.
    #
    #    You should have received a copy of the GNU Affero General Public License
    #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    
    #   see also http://www.perlmonks.org/index.pl?node_id=786884
    #
    #   parseDoPostBack derived from HTML-TreeBuilderX-ASP_NET
    #   Copyright 2008 Evan Carroll, all rights reserved.
    #   L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
    
    use strict;
    use warnings;
    
    use WWW::Mechanize;
    use LWP::UserAgent;
    
    use Getopt::Long;
    
    ## SPEC
    # Command-line configuration. Every option defaults to the empty string
    # and is overwritten by GetOptions below when given on the command line.
    my $site         = '';    # --site:   URL fetched to reach the search form
    my $searchform   = '';    # --form:   name of the search form
    my $searchfield  = '';    # --field:  name of the form field that takes the term
    my $searchbutton = '';    # --button: name of the button clicked to run the search
    my $searchterm   = '';    # --term:   value put into the search field, e.g. 'HOTEL'
    my $searchnext   = '';    # --next:   __EVENTTARGET posted to request the next page
    my $formbase     = '';    # --base:   prefix joined with --next for the ScriptManager field
    
    
    my $result = GetOptions (
        "form=s"   => \$searchform, # string
        "site=s"   => \$site, # string
        "field=s"  => \$searchfield, # string
        "term=s"   => \$searchterm, # string
        "next=s"   => \$searchnext, # string
        "base=s"   => \$formbase, # the base of the form
        "button=s" => \$searchbutton); # button
    
    
    # Return a copy of $text with every whitespace character, '.', '-' and '~'
    # replaced by '_', so the text can be embedded in an output file name.
    # The characters must be listed in a character class: written as a plain
    # sequence (\s\.\-\~) the pattern only matches those four characters in a row.
    sub _filename_safe
    {
        my ($text) = @_;
        $text =~ s/[\s.\-~]/_/g;
        return $text;
    }
    
    # make a filename for the results
    our $searchtermname = _filename_safe($searchterm);
    
    # Script-wide state shared by the subs in this file.
    our ( $agent, $pagenumber, $itemnumber, $viewstate );
    $agent      = WWW::Mechanize->new( autocheck => 1 );    # the browsing session
    $pagenumber = 1;    # number used in the name of the next result-page dump
    $itemnumber = 1;    # number used in the name of the next record dump
    # $viewstate starts undefined; GetForm stores the form's __VIEWSTATE value in it.
    
    # Write the agent's current page content to
    # DataExtractor_<searchtermname>_P<pagenumber>.htm in the current directory,
    # announce the file on STDOUT and advance $pagenumber.
    # Takes no arguments. Dies if the file cannot be created, written or closed.
    sub DumpAgent
    {
        my $content  = $agent->content();
        my $filename = "DataExtractor_${searchtermname}_P${pagenumber}.htm";
    
        # Lexical handle: a bareword handle is a package global that any other
        # code using the same name would silently share.
        open my $out, ">", $filename or die "Cannot create $filename: $!";
        print "creating new page :$filename\n";
        $pagenumber++;
        print {$out} $content or die "Cannot write $filename: $!";
    
        # Buffered write errors only surface when the handle is closed.
        close $out or die "Cannot close $filename: $!";
    }
    
    # Write one record page to
    # DataExtractor_<searchtermname>_DataPage<itemnumber>.htm in the current
    # directory, announce the file on STDOUT and advance $itemnumber.
    # Takes: the page content to store.
    # Dies if the file cannot be created, written or closed.
    sub DumpData
    {
        my $content  = shift;
        my $filename = "DataExtractor_${searchtermname}_DataPage${itemnumber}.htm";
    
        # Lexical handle: a bareword handle is a package global that any other
        # code using the same name would silently share.
        open my $out, ">", $filename or die "Cannot create $filename: $!";
        print "creating new record $filename\n";
        $itemnumber++;
        print {$out} $content or die "Cannot write $filename: $!";
    
        # Buffered write errors only surface when the handle is closed.
        close $out or die "Cannot close $filename: $!";
    }
    
    # Extract the event target and event argument from a postback link.
    #
    # taken from
    # Copyright 2008 Evan Carroll, all rights reserved.
    # L<http://search.cpan.org/dist/HTML-TreeBuilderX-ASP_NET>
    #
    # Takes:   the href text of a link containing "WebForm_PostBackOptions(...)".
    # Returns: ($eventTarget, $eventArgument) - the first two comma-separated
    #          arguments of that call, each with one pair of surrounding single
    #          or double quotes removed.
    # Dies (Carp::croak) when the href does not contain the call or when either
    # of the first two arguments is missing.
    sub parseDoPostBack {
        my $href = shift;
    
        # Load Carp here instead of relying on another module having loaded it.
        require Carp;
    
        Carp::croak('Please submit a valid __doPostBack')
            unless defined $href
            && $href =~ /WebForm_PostBackOptions\((.*)\)/;
    
        # Work on a copy: $1 is read-only, so substituting on it directly is a
        # fatal error as soon as the captured text contains an escaped quote.
        my $args = $1;
        $args =~ s/\\'/'/g;
    
        my ( $eventTarget, $eventArgument ) = split /\s*,\s*/, $args;
    
        # Checked before the quotes are stripped, so an empty argument written
        # as '' or "" still counts as present.
        Carp::croak('Please submit a valid __doPostBack')
            unless $eventTarget && $eventArgument;
    
        for ( $eventTarget, $eventArgument ) {
            s/^'// && s/'$//;
    
            #added a filter for "
            s/^\"// && s/\"$//;
        }
        return ( $eventTarget, $eventArgument );
    }
    
    # Visit every link on the agent's current page whose id attribute matches
    # ctl<digits>_lnkMore. For each one, a clone of the agent posts the link's
    # event target/argument together with the stored __VIEWSTATE to the search
    # form, and the page that comes back is saved with DumpData.
    # Takes no arguments; works on the file-level $agent, $searchform and $viewstate.
    sub ProcessLinks
    {
        for my $link ( $agent->links() ) {
            my $href   = $link->url();
            my $linkid = $link->attrs()->{id};
    
            # Links without an id, or with an unrelated id, are not records.
            next unless $linkid;
            next unless $linkid =~ /ctl(\d+)_lnkMore/;
            print "Found item $1\n";
    
            # Use a clone so the main agent stays on the result list page.
            my $detailagent = $agent->clone();
            my ( $eventTarget, $eventArgument ) = parseDoPostBack($href);
    
            $detailagent->submit_form(
                form_name => $searchform,
                fields    => {
                    '__EVENTTARGET'   => $eventTarget,
                    '__EVENTARGUMENT' => $eventArgument,
                    '__VIEWSTATE'     => $viewstate,
                },
            );
    
            DumpData( $detailagent->content() );
        }
    
    }
    # Run the search and page through the results.
    #
    # Fetches $site, fills $searchfield of form $searchform with $searchterm and
    # clicks $searchbutton. The first result page is dumped with DumpAgent and
    # its __VIEWSTATE remembered. Then, for as long as the current page content
    # contains the text given with --next, the records linked from the page are
    # fetched (ProcessLinks), the next page is requested and dumped, and the
    # stored __VIEWSTATE is refreshed.
    # Takes no arguments; works on the file-level configuration and $agent.
    sub GetForm
    {
        $agent->get($site);
        $agent->form_name($searchform);
        $agent->field($searchfield, $searchterm);
        $agent->click($searchbutton);
    
        ## this runs the search first
        DumpAgent();
    
        $viewstate = $agent->field('__VIEWSTATE');
    
        my $content = $agent->content();
    
        # Look for the "next" marker as literal text. Interpolating it into a
        # regex would treat characters such as . | ( [ as pattern syntax, and an
        # empty marker would become the empty pattern, which Perl replaces with
        # the last successfully matched pattern. With no marker there is nothing
        # to page through, so the loop is skipped.
        while ( length($searchnext) && index( $content, $searchnext ) >= 0 ) # do we have results?
        {
            ### EDIT// had to process the links of the first page!
            # now recurse into the record
            ProcessLinks();
    
            #get the next one
            # NOTE(review): the search button is clicked again before the paging
            # post, and the paging form name is the literal "Form" rather than
            # $searchform - presumably specific to the target site; confirm.
            $agent->click($searchbutton);
            $agent->submit_form(
                form_name => "Form",
                fields => {
                    ScriptManager      => $formbase . '|'. $searchnext ,
                    '__EVENTTARGET'    => $searchnext,
                    $searchfield => $searchterm,
                    '__VIEWSTATE' => $viewstate,
                    '__EVENTARGUMENT'  => '',
                    "ScrollTop" => '',
                    "__dnnVariable" => '',
                    "__VIEWSTATEENCRYPTED" =>''
                },
                );
            $viewstate = $agent->field('__VIEWSTATE');
            DumpAgent();
    
            $content = $agent->content();
        }
    
    }
    
    
    # Script entry point: run the search and page through its results.
    GetForm();
    
    
    # Final statement of the file evaluates to a true value.
    1;
    
        I am having a problem with a new site, and need your help. The main entry point is here: http://www.kqz-ks.org/SKQZ-WEB/en/shv/vbk.html. The webpage does not respond at all to my post.

        Here is my current revision : BZR Repo copy

        Unfortunately I cannot get it to work, and I get no debug message back from the server.

        The server I am trying to interface with is here: form. You can call it like this:

        perl aspxtraktor_vote.pl --fname=John --lname=Smith --dob=12 --mob=Pri --yob=1980
        The code dumps out all the data needed when you run it. Here is the output from Tamper Data when I run it in Firefox: tamper data. Here is the log file: Log File. Any tips or help would be appreciated.

        thanks,

        mike

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://795961]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others studying the Monastery: (6)
As of 2014-07-12 09:37 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    When choosing user names for websites, I prefer to use:








    Results (239 votes), past polls