Beefy Boxes and Bandwidth Generously Provided by pair Networks
go ahead... be a heretic
 
PerlMonks  

Re^4: weird issue with HTML::TokeParser and Fork

by arikamir (Initiate)
on May 06, 2008 at 00:43 UTC ( [id://684794]=note: print w/replies, xml ) Need Help??


in reply to Re^3: weird issue with HTML::TokeParser and Fork
in thread [Click the star to watch this topic] weird issue with HTML::TokeParser and Fork

this is the code that forks the others
#!/usr/bin/perl use strict; use DBI; require 'scraping_amazon.pl'; require 'scraping_office_depot.pl'; require 'scraping_buy_com.pl'; require 'scraping_staples.pl'; my $pidAmazon; my $pidOfficeDepot; my $pidBuyCom; my $pidStaples; #################################################################### # Connect to database # #################################################################### my $data = DBI-> connect('DBI:mysql:database=ads;host=LocalHost','User +name','password') or die "Can't connect to database:$DBI::errstr\n"; #################################################################### # Prepare sql statement # #################################################################### my $sqlstatement = "select distinct OEM_PartNum, Description from Item +s_tbl"; my $result = $data->prepare($sqlstatement) or die "Can't prepare SQL statement: $DBI::errstr\n"; #################################################################### # Execute SQL statement # #################################################################### $result->execute() or die "Can't execute SQL statement: $DBI::errstr\n"; #################################################################### # Retrieve rows of data from ebay # #################################################################### my $ref; my $i=0; while($ref = $result->fetchrow_hashref) { # my $OEM_PartNum = $ref->{OEM_PartNum}; # my $ItemDescription = $ref->{Description}; # GetAmazon($OEM_PartNum); # GetOfficeDepot($OEM_PartNum); # GetBuy($OEM_PartNum); # GetStaples($OEM_PartNum); # $pidAmazon=fork(); # die "Cannot fork: $!" if (! defined $pidAmazon); # if (not defined $pidAmazon) { # print "esources not avilable.\n"; # } elsif ($pidAmazon == 0){ # GetAmazon($ref->{OEM_PartNum},$ref->{Descript +ion}); # exit(0); # } $pidOfficeDepot=fork(); die "Cannot fork: $!" if (! defined $pidOfficeDepo +t); if (not defined $pidOfficeDepot) { print "esources not avilable.\n"; } elsif ($pidOfficeDepot == 0){ GetOfficeDepot($ref->{OEM_PartNum},$ref->{Desc +ription}); exit(0); } # $pidBuyCom=fork(); # die "Cannot fork: $!" if (! defined $pidBuyCom); # if (not defined $pidBuyCom) { # print "esources not avilable.\n"; # } elsif ($pidBuyCom == 0){ # GetBuy($ref->{OEM_PartNum},$ref->{Description +}); # exit(0); # } $pidStaples=fork(); die "Cannot fork: $!" if (! defined $pidStaples); if (not defined $pidStaples) { print "esources not avilable.\n"; } elsif ($pidStaples == 0){ GetStaples($ref->{OEM_PartNum},$ref->{Descript +ion}); exit(0); } processcleanup(); fork } $data->disconnect; sub processcleanup { waitpid($pidAmazon,0); waitpid($pidOfficeDepot,0); waitpid($pidBuyCom,0); waitpid($pidStaples,0); }
this is the buy.com scraping code
#!/usr/bin/perl use strict; use LWP::UserAgent; use HTTP::Request; use HTTP::Headers; use XML::Simple; use DBI; use WWW::Mechanize; use HTML::TokeParser; #GetBuy("C9731A","Laser, Compatible, LaserJet 5500 Series,Cyan"); sub GetBuy { my $oem_PN = $_[0]; my $ItemDesc = $_[1]; my @ItemDesc = split(',',$ItemDesc); my $price; my $description; my $type; my $title; my $numofitems; my $descriptionCheck = 'FALSE'; my $Item; #print @ItemDesc; my $agent = WWW::Mechanize->new(); $agent->get("http://www.buy.com/retail/usersearchresults.asp? +querytype=home&qu=". $oem_PN. "&qxt=home&display=&dclksa=1"); my $stream = HTML::TokeParser->new(\$agent->{content}); for my $i (1,2){ my $tag = $stream->get_tag("a"); while (($tag->[1]{class} ne "medbluetext") && ($stream->{ +pullparser_eof} ne '1')){ $tag = $stream->get_tag("a"); } $tag = $stream->get_tag("a"); $description = $stream->get_trimmed_text("/a"); my $tag = $stream->get_tag("b"); while (($tag->[1]{class} ne "adPrice") && ($stream->{pull +parser_eof} ne '1')){ $tag = $stream->get_tag("b"); } $price = $stream->get_trimmed_text("/b"); if (($description =~ /COMPATIBLE/i) or ($description =~ / +Replacement/i)){ $type = 'Compatible'; } else { $type = 'OEM'; } foreach $Item(@ItemDesc){ if ($description =~ /$Item/i){ $descriptionCheck = 'TRUE'; } } if ($descriptionCheck) { insertrecord($oem_PN,$price,$description,$type); } } } sub insertrecord { my $oem_PN = $_[0]; my $price = $_[1]; my $description = $_[2]; my $type = $_[3]; ################################################################## +## # Connect to database + # ################################################################## +## my $buydataconnection = DBI-> connect('DBI:mysql:database=ads;host +=localhost','UserName','Passwrod') or die "Can't connect to database:$DBI::errstr\n"; ################################################################## +## # Prepare sql statement + # ################################################################## +## my $buysqlstatement = "insert into buy (OEM_PartNum,Price,Descript +ion,Type) Values ('".$oem_PN."','".$price."','".$description."','".$t +ype."')"; #print $sqlstatement."\n"; my $result = $buydataconnection->prepare($buysqlstatement) or die "Can't prepare SQL statement: $DBI::errstr\n"; ################################################################## +## # Execute SQL statement + # ################################################################## +## $result->execute() or die "Can't execute SQL statement: $DBI::errstr\n"; $buydataconnection->disconnect; $buysqlstatement = ''; my $oem_PN = ''; $result = 0; $price = 0; } return 1;
and this is the staples scraping code
#!/usr/bin/perl use strict; use LWP::UserAgent; use HTTP::Request; use HTTP::Headers; use XML::Simple; use DBI; use WWW::Mechanize; use HTML::TokeParser; #GetStaples("TN550","Laser, Compatible, HL5240, 5250, 5280DW,7,000 Pag +e Yield - Same as ADSTN580"); sub GetStaples { my $oem_PN = $_[0]; my $ItemDesc = $_[1]; my @ItemDesc = split(',',$ItemDesc); my $Item; my $price; my $description; my $type; my $title; my $numofitems; my $descriptionCheck - 'FALSE'; my $agent = WWW::Mechanize->new(autocheck => 1, cookie_jar => + undef); $agent->get("http://www.staples.com/webapp/wcs/stores/servlet +/home?&langId=-1&storeId=10001&catalogId=10051"); $agent->form_name("headerSearchForm"); $agent->field("searchkey",$oem_PN); $agent->click(); my $stream = HTML::TokeParser->new(\$agent->{content}); my $tag = $stream->get_tag("title"); $title = $stream->get_trimmed_text("/title"); if ($title !~ /that was easy/){ print "Title:".$title."--".$ItemDesc."--".$oem_PN."\n"; # open(OUTFILE, ">>output.html") or die "Can't open output. +txt: $!"; # print OUTFILE $agent->content(); # close(OUTFILE); if ($title !~ /Generic Error/){ $description = $title; $stream = HTML::TokeParser->new(\$agent->{content}); $tag = $stream->get_tag("td"); # while ((($tag->[1]{class} ne "pricenew") or ($tag->[1 +]{class} ne "pricenew specon")) && ($stream->{pullparser_eof} ne '1') +){ # $tag = $stream->get_tag("td"); # } $tag = $stream->get_tag("dd"); while (($tag->[1]{class} ne "pis") && ($stream->{pull +parser_eof} ne '1')){ $tag = $stream->get_tag("dd"); } $price = $stream->get_trimmed_text("/i"); if ($price eq ''){ $price = 'NULL'; } if ($description =~ /Compatible/){ $type = 'Compatible'; } else { $type = 'OEM'; } foreach $Item(@ItemDesc){ if ($description =~ /$Item/i){ $descriptionCheck = 'TRUE'; } } print $descriptionCheck; if ($descriptionCheck == 'TRUE') { print "insertrecord(".$oem_PN.",".$price.",".$ +description.",".$type.")"; insertrecord($oem_PN,$price,$description,$typ +e); } } } $stream = 0; $tag = 0; } sub insertrecord { my $oem_PN = $_[0]; my $price = $_[1]; my $description = $_[2]; my $type = $_[3]; ################################################################## +## # Connect to database + # ################################################################## +## my $staplesdataconnection = DBI-> connect('DBI:mysql:database=ads; +host=localhost','Username','password') or die "Can't connect to database:$DBI::errstr\n"; ################################################################## +## # Prepare sql statement + # ################################################################## +## my $sqlstatement = "insert into ads.`Staples` (OEM_PartNum,Price,D +escription,Type,Site) Values ('".$oem_PN."','".$price."','".$descript +ion."','".$type."','Staples')"; #print $sqlstatement; my $result = $staplesdataconnection->prepare($sqlstatement) or die "Can't prepare SQL statement: $DBI::errstr\n"; ################################################################## +## # Execute SQL statement + # ################################################################## +## $result->execute() or die "Can't execute SQL statement: $DBI::errstr\n"; $staplesdataconnection->disconnect; $sqlstatement = ''; my $oem_PN = ''; $result = 0; $price = 0; } return 1;

Replies are listed 'Best First'.
Re^5: weird issue with HTML::TokeParser and Fork
by ikegami (Patriarch) on May 06, 2008 at 02:27 UTC

    What's with the fork at the end of the while($ref = $result->fetchrow_hashref) loop?

    $data and $result are being destroyed in each of your children. That can have unfortunate side effects. You should be using _exit (in POSIX) instead of exit.

    General rule: require and use are for modules that have a package statement. do is for those without. This applies here.

    As for your actual question, turn on your warnings and pay attention to them. You should be getting a few "Subroutine insertrecord redefined" errors. Suggested fix:

    #!/usr/bin/perl ... use Scraping::Amazon; ... Scraping::Amazon::get($ref->{OEM_PartNum}, $ref->{Description}); ...

    scraping/amazon.pm: (File "amazon.pm" — note the extension change — in subdirectory "scraping")

    package Scraping::Amazon; ... sub get { ... } sub insertrecords { ... } 1;

    You don't have to use a subdirectory. Just remove "Scraping::" from everywhere if you put the modules in the same dir as the main script.

      the fork at the end of the loop is a typo, I took it off. If I understand your recommendation correctly you would package the sub scripts. I don't get how that will affect the destruction of $data and $results, and if $data and $result are being destructed in the child processes is a problem, does that mean that I can set the $sqlstatement from the child processes and execute it in the parent process (which would save on a connection to the DB)? Thanks

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://684794]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others meditating upon the Monastery: (4)
As of 2024-03-19 04:46 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found