My Mojo::UserAgent crawler prints both of these messages for the same link, a PDF that should never have been enqueued in the first place:

    Not Adding Link: http://www.vasco.com/Images/end_of_life.pdf
    url getting problem: http://www.vasco.com/Images/end_of_life.pdf

The script:

```perl
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);

use Mojo::IOLoop;
use Mojo::URL;
use Mojo::UserAgent;
use Try::Tiny;

# FIFO queue of URLs still to crawl
my @urls = map { Mojo::URL->new($_) } qw(
    http://www.f5.com/
);

# Sites to process, one crawl per site
my @allUrls = (
    "http://www.novartis.com",
    "http://www.vasco.com",
    "http://www.ravenind.com",
    "http://www.nepstar.cn",
    "http://www.f5.com",
    "http://www.lorillard.com/",
    "http://www.lowes.com/",
    "http://www.leggmason.com/",
);

my $totalPagesVisited = 0;
my $maxPages          = 100;

# Anchor text that suggests a link leads to a feed/news page
my @highProbabableMatch = (
    "RSS", "subscribe", "news feed", "press", "feed", "investor",
);

# Limit parallel connections to 4
my $max_conn = 4;

my $incorrectAttempts = 0;

# User agent following up to 5 redirects
my $ua = Mojo::UserAgent
    ->new(max_redirects => 5)
    ->detect_proxy;

# Keep track of active connections
my $active = 0;

Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop or $totalPagesVisited > $maxPages)
                unless my $url = shift @urls;

            # Fetch non-blocking just by adding a callback
            # and marking the connection as active
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

sub get_callback {
    my (undef, $tx) = @_;

    # Deactivate
    --$active;

    # Request URL
    my $url = $tx->req->url;

    # Parse only OK HTML responses
    if (   !$tx->res->is_status_class(200)
        or ($tx->res->headers->content_type // '') !~ m{^text/html\b}i)
    {
        say "url getting problem: $url";
        return;
    }

    parse_html($url, $tx);
    return;
}

sub parse_html {
    my ($url, $tx) = @_;

    my $rssPageFound = 0;
    my $linkAndTitle;
    try {
        $linkAndTitle = $tx->res->dom->at('html title')->text;
    }
    catch {
        say "was not able to get content from link: $url";
    };

    # Extract and enqueue URLs
    for my $e ($tx->res->dom('a[href]')->each) {

        # Validate href attribute
        my $link = Mojo::URL->new($e->{href});
        next if 'Mojo::URL' ne ref $link;

        # "Normalize" link
        $link = $link->to_abs($tx->req->url)->fragment(undef);
        next unless grep { $link->protocol eq $_ } qw(http https);

        # Skip links we do not want to crawl at all (e.g. PDFs)
        if (!testNegativeLinkMatch($link->to_string)) {
            say "Not Adding Link: " . $link->to_string;
            next;
        }

        # Access every link only once
        state $uniq = {};
        ++$uniq->{$url->to_string};
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts
        next if $link->host ne $url->host;

        if (testMatchIfRSS(\@highProbabableMatch, $e)) {
            $rssPageFound = 1;
        }

        if ($rssPageFound == 1) {
            $totalPagesVisited++;
            if ($totalPagesVisited < $maxPages) {
                say "adding link: " . $link->to_string;
                push @urls, $link;
            }
        }
    }
    return;
}

# Return true if the anchor's markup mentions one of the feed keywords.
# The tail of the original regex was garbled when the post was pasted;
# the pattern below matches the keyword between the anchor's tags.
sub testMatchIfRSS {
    my ($rssUrls, $toMatchInfo) = @_;
    foreach my $match (@$rssUrls) {
        return 1 if $toMatchInfo =~ />(.*?)$match(.*?)</si;
    }
    return 0;
}

# The body of this sub was lost in the paste. Judging by the
# "Not Adding Link" output it returns false for links that should
# not be crawled; the extension filter below is an assumption.
sub testNegativeLinkMatch {
    my ($link) = @_;
    return 0 if $link =~ /\.(?:pdf|jpe?g|png|gif|zip|exe)\z/i;
    return 1;
}

# main
foreach my $site (@allUrls) {
    my $href    = $site;    # the original read this from a ->{webSite} hash entry
    my $origUrl = $href;

    @urls              = ();
    $incorrectAttempts = 0;
    $totalPagesVisited = 0;
    push @urls, Mojo::URL->new($origUrl);
    $active = 0;

    say "Processing: $href";

    # Start event loop if necessary
    Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
}
# endMain
```
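Not part of the original script: one way to avoid fetching documents like the PDF above at all is to check the `Content-Type` with a lightweight `HEAD` request before a link is enqueued. A minimal sketch, reusing a `Mojo::UserAgent` as in the script; `enqueue_if_html` is a hypothetical helper, not something from the post:

```perl
use Mojo::URL;
use Mojo::UserAgent;

my $ua = Mojo::UserAgent->new(max_redirects => 5);

# Push $link onto @$queue only when a non-blocking HEAD request reports
# an HTML Content-Type. Runs inside the already-running Mojo::IOLoop.
sub enqueue_if_html {
    my ($queue, $link) = @_;
    $ua->head($link => sub {
        my (undef, $tx) = @_;
        my $type = $tx->res->headers->content_type // '';
        push @$queue, Mojo::URL->new($link) if $type =~ m{^text/html\b}i;
    });
}
```

This trades one extra request per candidate link for never downloading PDFs or images, so the "url getting problem" branch is left for genuine errors.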
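The keyword test runs a regex over the anchor's raw markup; since `$e` is already a `Mojo::DOM` element, its visible text can be matched directly, which sidesteps the garbled-regex problem entirely. A sketch under that assumption, using the same keyword list as the script (`anchor_mentions_feed` is a made-up name):

```perl
# Case-insensitive substring match of the feed keywords against the
# anchor's visible text and its href. $e is a Mojo::DOM element.
sub anchor_mentions_feed {
    my ($e, $keywords) = @_;
    my $haystack = lc($e->all_text . ' ' . ($e->{href} // ''));
    return scalar grep { index($haystack, lc $_) >= 0 } @$keywords;
}
```

It would be called as `anchor_mentions_feed($e, \@highProbabableMatch)` in place of `testMatchIfRSS`.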