Re: Scraping an ASP form I don't have any control over

Sometimes I just like to take out the bigger hammer, or the chainsaw if you like.

use strict; use warnings; 
  use LWP;
  use HTML::Entities            qw/decode_entities/;
  use Data::Dumper; 


  # Shared user agent used by getter() and poster().  A browser-like
  # User-Agent string is sent because what the site returned changed
  # depending on the agent.
  my $ua;
  $ua = LWP::UserAgent->new(
      keep_alive => 1,
      agent      => "Mozilla/5.0 (Windows NT 5.1; rv:51.0) Gecko/20100101 Firefox/51.0",
  #   ssl_opts   => { verify_hostname => 0,
  #                   SSL_verify_mode => 'SSL_VERIFY_NONE',
  #                 },
  #   timeout    => $timeout,
  );

  #  use HTTP::Cookies::Mozilla;
  #  $cookie_jar_obj = HTTP::Cookies::Mozilla->new(file=>$cookie_file);
  #  $ua->cookie_jar( $cookie_jar_obj );

  my $response;
  my $response2;

  # Set $reuse1 to reload the first response from mbrverify.txt instead of
  # fetching it again; set $reuse2 to also reload the second response from
  # mbrverify2.txt.  Reusing step 2 implies reusing step 1.
  my $reuse1 = 0;
  my $reuse2 = 0;
  $reuse1 = 1 if $reuse2;

  # Step 1: obtain the page that contains the form.
  if ($reuse1) {
      $response = unsaver('mbrverify.txt');
  }
  else {
      $response = getter('http://www.appaloosa.com/web/mbrverify.aspx');
      saver('mbrverify.txt', $response);
  }

  #print Dumper($response);

  # Step 2: fill in the first form of that page and submit it.
  if ($reuse2) {
      $response2 = unsaver('mbrverify2.txt');
  }
  else {
      my $res = form_parser1($response);
      #print Dumper($res);

      # Without this check a page with no form would autovivify an empty
      # form below and submit nothing useful.
      my $form = $res->[0]
          or die "no <form> found in the first response\n";
      my $mynumber = 0;    # id number
      $form->{pairs}{txtMemNumA} = $mynumber;
      #print Dumper($res);
      $response2 = backer($response, $form);
      saver('mbrverify2.txt', $response2);
  }

  #print Dumper($response2);

  exit;

sub getter {
  # Issue a GET request through the shared user agent.
  #   $uri  - target address (string or URI object)
  #   $args - optional array ref of key/value pairs placed in the query string
  # Returns the response object; dies with the status line when the
  # request is not successful.
  my ($uri, $args) = @_;

  my $page = URI->new($uri);
  $page->query_form(@$args) if $args;

  my $response = $ua->request(HTTP::Request->new(GET => $page));
  die $response->status_line unless $response->is_success;

  return $response;
} # getter

sub poster {
  # Issue a POST request through the shared user agent.
  #   $uri  - target address (string or URI object)
  #   $vars - array ref of name/value pairs sent as the form content
  # Returns the response object; dies with the status line when the
  # request is not successful.
  my ($uri, $vars) = @_;
  my $page = URI->new($uri);

  # Declared locally: without "my" this assignment overwrote the
  # file-level $response holding the first page.
  my $response = $ua->post($page, $vars);
  unless ($response->is_success) {
      die $response->status_line;
  }
  return $response;
  }  # poster


sub form_parser1 {
  # Regex-based extraction of the forms in a response.
  #   $response - object providing decoded_content()
  # Returns an array ref with one hash ref per <form>...</form> found:
  #   method   - value of the form's method attribute (undef if absent)
  #   action   - value of the form's action attribute (undef if absent)
  #   pairs    - hash ref of input name => value (undef when the input
  #              has no value attribute, '' when it is value="")
  #   infields - array ref of input names in document order
  # Only double-quoted attributes and self-closed <input ... /> tags are
  # recognised.
  my $response=shift;

  # Entities are decoded per attribute AFTER extraction; decoding the
  # whole page first would turn &quot; into a literal quote and cut the
  # attribute value short.
  my $page=$response->decoded_content;
  my @forms=$page=~m!(<form.+?</form.*?>)!msg;
  my $res=[];
  for my $form (@forms){
    my ($fid)=$form=~m!(<form.+?>)!ms;
    my $fh={};
    for my $attr (qw(method action)) {
      my ($raw)=$fid=~m!\s$attr="(.*?)"!msi;
      $fh->{$attr}=defined $raw ? decode_entities($raw) : undef;
      }

    my @inputs=$form=~m!(<input.+?/>)!msg;
    my $infields=[];
    my $inhash={};
    $fh->{pairs}=$inhash;
    for my $input (@inputs) {
      my ($name)=$input=~m!\sname="([^"]+)"!i;
      next unless defined $name;   # unnamed inputs are never submitted
      # [^"]* (not .+?) so that value="" yields '' instead of swallowing
      # the closing quote and running on into the next attribute.
      my ($value)=$input=~m!\svalue="([^"]*)"!i;
      $name=decode_entities($name);
      $value=decode_entities($value) if defined $value;
      $inhash->{$name}=$value;
      push @$infields,$name;
      } # input
    $fh->{infields}=$infields;
    push @$res,$fh;
    } # form
  return $res;
  } # parser


sub backer {
  # Submit a parsed form back to the server.  Only handles POST and GET.
  #   $responseto - the response the form was parsed from; its request URI
  #                 is the base for resolving the form's action
  #   $form       - one hash ref as produced by form_parser1()
  # Returns the response from poster() or getter(), which die on failure.
  my ($responseto, $form) = @_;

  # The request URI is read straight out of the object internals because
  #   my $base=$responseto->base;
  # gave: Can't locate object method "scheme" via package "URI::http"
  # NOTE(review): presumably that happens for responses re-created by
  # unsaver(), whose URI subclass was never loaded -- confirm.
  my $base=${$responseto->{_request}{_uri}};

  # A form without an action is submitted to the page it came from.
  my $action=defined $form->{action} ? $form->{action} : '';
  my $page = URI->new_abs($action, $base);

  # Flatten the fields into name/value pairs, keeping document order.
  my $vars=[];
  my $hash=$form->{pairs};
  for my $id (@{ $form->{infields} || [] }) {
    push @$vars,$id,$hash->{$id};
    }

  # Anything other than POST (including a missing method) is sent as GET.
  my $method=defined $form->{method} ? uc($form->{method}) : 'GET';
  if ($method eq 'POST') {
      return poster($page,$vars);
      } # post
  return getter($page,$vars);
} # backer

sub saver {
  # Write a data structure (here: a response object) to a file as
  # Data::Dumper text, so it can be inspected in an editor and reloaded
  # later with unsaver().
  #   $fn       - path of the file to (over)write
  #   $response - the value to dump
  # Returns 1; dies if the file cannot be opened, written or closed.
  my ($fn, $response) = @_;

  open (my $rep,'>',$fn) or die "Cannot open '$fn' for writing: $!";
  binmode $rep, ":encoding(UTF-8)";

  # Purity emits extra statements where needed so the dump can be
  # eval'ed back; Sortkeys keeps the output stable between runs.
  local $Data::Dumper::Deepcopy=1;
  local $Data::Dumper::Purity=1;
  local $Data::Dumper::Sortkeys=1;
  local $Data::Dumper::Indent=1;

  print {$rep} Dumper($response) or die "Cannot write to '$fn': $!";
  # Buffered write errors only surface at close.
  close $rep or die "Cannot close '$fn': $!";
  return 1;
  } #saver

sub unsaver {
  # Reload a value previously written by saver().
  #   $fn - path of a file containing Data::Dumper output
  # Returns the restored value; dies if the file cannot be opened or its
  # contents fail to evaluate.
  # SECURITY: the file is executed as Perl code via string eval -- only
  # use this on files you wrote yourself.
  my $fn=shift;
  open (my $rep,'<',$fn) or die "Cannot open '$fn' for reading: $!";
  binmode $rep, ":encoding(UTF-8)";
  my $stuff=do { local $/;<$rep>};
  close $rep;

  # The dump assigns to $VAR1.  With Purity enabled it may be followed by
  # fix-up statements, in which case the value of the eval is the last
  # fix-up rather than the structure, so capture $VAR1 in a lexical the
  # eval'ed code can see and prefer it.
  my $VAR1;
  my $result;
  {
    no strict;
    $result=eval $stuff;
  }
  die "Cannot restore '$fn': $@" if $@;
  return defined $VAR1 ? $VAR1 : $result;
  } # unsaver
[download]

This saves the returned data to files in Data::Dumper form that you can view with an editor, and it can reuse parts so you don't have to start at the beginning each time. But I used the big hammer rather than HTML::Form, too.

I left in, as comments, some code that I might otherwise have used. If you were logged in via Firefox, using its cookies may let you bypass the verification stage. What I got back changed based on my agent string. If the site were HTTPS and failing the CA check, my bigger-hammer method would be to bypass certificate checking.

Comment on Re: Scraping an ASP form I don't have any control over Download Code

Replies are listed 'Best First'.
Re^2: Scraping an ASP form I don't have any control over by wveagle81 (Novice) on Apr 15, 2017 at 23:47 UTC
Thanks, I'll take a close look at what you've provided. My current solution works for just the one piece of data I'm pulling, but I can do so much more with the data I can retrieve.	[reply]
Re^3: Scraping an ASP form I don't have any control over by huck (Prior) on Apr 16, 2017 at 01:31 UTC
If what you have via WWW::Mechanize is good for you, that's just fine though. I just thought you might find it of use to see a more nuts-and-bolts method of doing it too, a closer-to-"core" method. One thing I do like about my method is the dumper file with all of the request, not just the content. It helps in figuring out what is going on. I don't "mech", but there may be a way of changing `$mech->save_content("formfile.txt");` to something like `$mech->save_response("formfile.txt");`, since the content is in the response. Related is the ability to rerun off the saved files, so you can fix an intermediate part locally with temporary prints without having to go to the website again.	[reply] [d/l] [select]
Re^3: Scraping an ASP form I don't have any control over by stevieb (Canon) on Apr 16, 2017 at 00:02 UTC
If it's not in a proper/standard format, and the return isn't documented, be prepared to modify your code every single time the remote site decides to change their minds...	[reply]
Re^4: Scraping an ASP form I don't have any control over by huck (Prior) on Apr 16, 2017 at 01:14 UTC
That is true even with an API though. I'm on my third rewrite because of API changes at YouTube, but then I think there were at least half a dozen rewrites when I was just scraping the channel's video indexes for my stats, before there even was an API.	[reply]
Re^5: Scraping an ASP form I don't have any control over by stevieb (Canon) on Apr 16, 2017 at 01:33 UTC
Re^6: Scraping an ASP form I don't have any control over by huck (Prior) on May 31, 2017 at 07:20 UTC


Think about Loose Coupling
	PerlMonks