Hi i wish to scrape the content and store that in its respective names.When I prints the crawl content it doesn't print any special characters.All special characters are replaced by some junk values. for example (€)euro is printed as (-aA). I am scraping the site which is full of special characters and German language. So most of the crawled content are different from original content.Thanks in advance
use LWP::Simple;
use File::Compare;
use HTML::TreeBuilder::XPath;
use LWP::UserAgent;
open(FILE, "C:/Users/jk/Desktop/input/input.txt");
{
while(<FILE>)
{
chomp;
$url=$_;
foreach ($url)
{
($domain) = $url =~ m|www.([A-Z a-z 0-9]+.{3}).|x;
}
do 'C:/Users/jk/Desktop/perl/mainsub.pl';
&domain_check();
my $ua = LWP::UserAgent->new(agent => "Mozilla/5.0");
my $req = HTTP::Request->new(GET => "$url");
my $res = $ua->request($req);
die("error") unless $res->is_success;
my $xp = HTML::TreeBuilder::XPath->new_from_content($res->content)
+;
my @node = $xp->findnodes_as_strings("$xpath");
die("node doesn't exist") if $#node == -1;
foreach(<@node>)
{
$death=$_;
open HTML ">C:/Users/jk/Desktop/fun/perl/$site.html";
print HTML "$death\n";
}
}
}
subroutine
use LWP::Simple;
use File::Compare;
use HTML::TreeBuilder::XPath;
use LWP::UserAgent;
sub domain_check
{
sub domain_check
{
if($domain eq 'goo.eu')
{
$competitor = 'goo.eu';
$xpath ='//p/strong'
}
if ($domain eq 'mov.it')
{
$competitor = 'mov.it';
$xpath = '//div//table//td';
}
elsif ($domain eq 'lot.it')
{
$competitor = 'lot.it';
$xpath = '//div//table';
}
}