My Mojo::UserAgent crawler prints both of these messages for the same link, a PDF that should never have been enqueued in the first place:

    Not Adding Link: http://www.vasco.com/Images/end_of_life.pdf
    url getting problem: http://www.vasco.com/Images/end_of_life.pdf

The script:

```perl
#!/usr/bin/env perl
use 5.010;
use open qw(:locale);
use strict;
use utf8;
use warnings qw(all);

use Mojo::IOLoop;
use Mojo::URL;
use Mojo::UserAgent;
use Try::Tiny;

# FIFO queue of URLs still to crawl
my @urls = map { Mojo::URL->new($_) } qw(
    http://www.f5.com/
);

# Sites to process, one crawl per site
my @allUrls = (
    "http://www.novartis.com",
    "http://www.vasco.com",
    "http://www.ravenind.com",
    "http://www.nepstar.cn",
    "http://www.f5.com",
    "http://www.lorillard.com/",
    "http://www.lowes.com/",
    "http://www.leggmason.com/",
);

my $totalPagesVisited = 0;
my $maxPages          = 100;

# Anchor text that suggests a link leads to a feed/news page
my @highProbabableMatch = (
    "RSS", "subscribe", "news feed", "press", "feed", "investor",
);

# Limit parallel connections to 4
my $max_conn = 4;

my $incorrectAttempts = 0;

# User agent following up to 5 redirects
my $ua = Mojo::UserAgent
    ->new(max_redirects => 5)
    ->detect_proxy;

# Keep track of active connections
my $active = 0;

Mojo::IOLoop->recurring(
    0 => sub {
        for ($active + 1 .. $max_conn) {

            # Dequeue or halt if there are no active crawlers anymore
            return ($active or Mojo::IOLoop->stop or $totalPagesVisited > $maxPages)
                unless my $url = shift @urls;

            # Fetch non-blocking just by adding a callback
            # and marking the connection as active
            ++$active;
            $ua->get($url => \&get_callback);
        }
    }
);

sub get_callback {
    my (undef, $tx) = @_;

    # Deactivate
    --$active;

    # Request URL
    my $url = $tx->req->url;

    # Parse only OK HTML responses
    if (   !$tx->res->is_status_class(200)
        or ($tx->res->headers->content_type // '') !~ m{^text/html\b}i)
    {
        say "url getting problem: $url";
        return;
    }

    parse_html($url, $tx);
    return;
}

sub parse_html {
    my ($url, $tx) = @_;

    my $rssPageFound = 0;
    my $linkAndTitle;
    try {
        $linkAndTitle = $tx->res->dom->at('html title')->text;
    }
    catch {
        say "was not able to get content from link: $url";
    };

    # Extract and enqueue URLs
    for my $e ($tx->res->dom('a[href]')->each) {

        # Validate href attribute
        my $link = Mojo::URL->new($e->{href});
        next if 'Mojo::URL' ne ref $link;

        # "Normalize" link
        $link = $link->to_abs($tx->req->url)->fragment(undef);
        next unless grep { $link->protocol eq $_ } qw(http https);

        # Skip links we do not want to crawl at all (e.g. PDFs)
        if (!testNegativeLinkMatch($link->to_string)) {
            say "Not Adding Link: " . $link->to_string;
            next;
        }

        # Access every link only once
        state $uniq = {};
        ++$uniq->{$url->to_string};
        next if ++$uniq->{$link->to_string} > 1;

        # Don't visit other hosts
        next if $link->host ne $url->host;

        if (testMatchIfRSS(\@highProbabableMatch, $e)) {
            $rssPageFound = 1;
        }

        if ($rssPageFound == 1) {
            $totalPagesVisited++;
            if ($totalPagesVisited < $maxPages) {
                say "adding link: " . $link->to_string;
                push @urls, $link;
            }
        }
    }
    return;
}

# Return true if the anchor's markup mentions one of the feed keywords.
# The tail of the original regex was garbled when the post was pasted;
# the pattern below matches the keyword between the anchor's tags.
sub testMatchIfRSS {
    my ($rssUrls, $toMatchInfo) = @_;
    foreach my $match (@$rssUrls) {
        return 1 if $toMatchInfo =~ />(.*?)$match(.*?)</si;
    }
    return 0;
}

# The body of this sub was lost in the paste. Judging by the
# "Not Adding Link" output it returns false for links that should
# not be crawled; the extension filter below is an assumption.
sub testNegativeLinkMatch {
    my ($link) = @_;
    return 0 if $link =~ /\.(?:pdf|jpe?g|png|gif|zip|exe)\z/i;
    return 1;
}

# main
foreach my $site (@allUrls) {
    my $href    = $site;    # the original read this from a ->{webSite} hash entry
    my $origUrl = $href;

    @urls              = ();
    $incorrectAttempts = 0;
    $totalPagesVisited = 0;
    push @urls, Mojo::URL->new($origUrl);
    $active = 0;

    say "Processing: $href";

    # Start event loop if necessary
    Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
}
# endMain
```
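Not part of the original script: one way to avoid fetching documents like the PDF above at all is to check the `Content-Type` with a lightweight `HEAD` request before a link is enqueued. A minimal sketch, reusing a `Mojo::UserAgent` as in the script; `enqueue_if_html` is a hypothetical helper, not something from the post:

```perl
use Mojo::URL;
use Mojo::UserAgent;

my $ua = Mojo::UserAgent->new(max_redirects => 5);

# Push $link onto @$queue only when a non-blocking HEAD request reports
# an HTML Content-Type. Runs inside the already-running Mojo::IOLoop.
sub enqueue_if_html {
    my ($queue, $link) = @_;
    $ua->head($link => sub {
        my (undef, $tx) = @_;
        my $type = $tx->res->headers->content_type // '';
        push @$queue, Mojo::URL->new($link) if $type =~ m{^text/html\b}i;
    });
}
```

This trades one extra request per candidate link for never downloading PDFs or images, so the "url getting problem" branch is left for genuine errors.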
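The keyword test runs a regex over the anchor's raw markup; since `$e` is already a `Mojo::DOM` element, its visible text can be matched directly, which sidesteps the garbled-regex problem entirely. A sketch under that assumption, using the same keyword list as the script (`anchor_mentions_feed` is a made-up name):

```perl
# Case-insensitive substring match of the feed keywords against the
# anchor's visible text and its href. $e is a Mojo::DOM element.
sub anchor_mentions_feed {
    my ($e, $keywords) = @_;
    my $haystack = lc($e->all_text . ' ' . ($e->{href} // ''));
    return scalar grep { index($haystack, lc $_) >= 0 } @$keywords;
}
```

It would be called as `anchor_mentions_feed($e, \@highProbabableMatch)` in place of `testMatchIfRSS`.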