Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

unique visitors from html logfile

by Anonymous Monk
on Nov 17, 2012 at 02:09 UTC ( #1004276=perlquestion: print w/ replies, xml ) Need Help??
Anonymous Monk has asked for the wisdom of the Perl Monks concerning the following question:

I'm trying to count unique IPs based on an HTML log file and print that data to a webpage. I have the log file parsed, but I am having trouble figuring out how to select only the unique IP addresses to print to the webpage. Any advice?

#!/usr/bin/perl
# Read access.log and write an HTML table of its entries to log.html.
#
# Transcription repairs only: the script is laid out one statement per line
# again, a forum line-wrap marker ("+") that had ended up inside a print
# statement is removed, and the output file open is now checked.  The
# counting logic is left exactly as posted; the NOTE(review) comments mark
# the spots that stop the unique-visitor count from working.
use strict;
use warnings;
use 5.010;
use POSIX;

# Yesterday's date formatted as dd/Mon/yyyy.
my $yesterday = strftime("%d/%b/%Y",localtime(time()-86400));

open(LOGFILE,"<", "access.log")or die"Could not open log file.";

my $yesterdayHits=0;
my $IPcount=0;
my $totalhits=0;
my $webPage='log.html';
my $startDate;

open(WEBPAGE,">",$webPage) or die "Could not open $webPage for writing: $!";
print WEBPAGE ("<HEAD><TITLE>Access Counts</TITLE></HEAD>");
print WEBPAGE ("<BODY>");
print WEBPAGE ("<H1> Today is: ",scalar(localtime), "</H1>");
print WEBPAGE ("<h3>Yesterday was $yesterday</h3>");
# NOTE(review): $IPcount is still 0 here and is never assigned anywhere
# below, so this heading always reports 0 unique visitors.
print WEBPAGE ("<h3>There are $IPcount unique vistors in the log</h3>");
print WEBPAGE ("<TABLE BORDER=1 CELLPADDING=10 width='500px'>");
print WEBPAGE ("<Tr><td>IP</td><TD>LOGFILE</TD></Tr>\n\n");

foreach my $line (<LOGFILE>) {
    # NOTE(review): declared inside the loop, so the hash starts out empty
    # for every line and never holds more than one address.
    my %ips=();
    $totalhits++;

    # One lazy capture group per space-separated field of the log line.
    my $w = "(.+?)";
    # NOTE(review): the result of the match is not checked; on a line that
    # does not match, $1..$11 do not describe this line.
    $line =~ m/^$w $w $w \[$w:$w $w\] "$w $w $w" $w $w$/;
    my @sort=$line;
    my $site = $1;
    my $logName = $2;
    my $fullName = $3;
    my $date = $4;
    my $time = $5;
    my $gmt = $6;
    my $req = $7;
    my $file = $8;
    my $proto = $9;
    my $status = $10;
    my $length = $11;

    # Stores the whole line under the address (a value, not a counter).
    $ips{$site} = $line;
    #print %ips;
    #foreach my $key ( sort keys %ips ) {
    #    print $key, " => ", $ips{$key}, "\n";
    #}

    # $day/$month/$year and %dates are computed but not used further down.
    my ($day,$month,$year)=split"\/",$date;
    my %dates = (
        'Jan' => '01',
        'Feb' => '02',
        'Mar' => '03',
        'Apr' => '04',
        'May' => '05',
        'Jun' => '06',
        'Jul' => '07',
        'Aug' => '08',
        'Sep' => '09',
        'Oct' => '10',
        'Nov' => '11',
        'Dec' => '12',
    );

    # One table row per log line: the address, then the raw line.
    print WEBPAGE ("<Tr><td>$site</td><TD>$line</TD></Tr>\n\n")
}
close(LOGFILE);

#print WEBPAGE ("<h2>Start Date is $startDate</h2>");
print WEBPAGE ("<h2>Total hits: $totalhits</h2>");
# NOTE(review): $yesterdayHits is never incremented, so this is always 0.
print WEBPAGE ("<h3>Hits Yesterday: $yesterdayHits</h3>");
print WEBPAGE ("</TABLE></P>");
print WEBPAGE ("</BODY></HTML>");
close(WEBPAGE);

Comment on unique visitors from html logfile
Download Code
Re: unique visitors from html logfile
by Kenosis (Priest) on Nov 17, 2012 at 02:30 UTC

    You can use a hash where the keys are the IPs and the values are the count:

    # Count how many times each IPv4 address occurs in the sample log below
    # and report the number of distinct addresses.
    use strict;
    use warnings;
    use Regexp::Common qw/net/;

    # IP address => number of log lines in which it was the first address found.
    my %hash;

    while (<DATA>) {
        # The pattern is unanchored: the first IPv4-looking token on the line
        # is the one that gets counted.
        $hash{$1}++ if /($RE{net}{IPv4})/;
    }

    # Sorted so the listing comes out in the same order on every run
    # (plain "keys" makes no promise about order).
    print "$_ => $hash{$_}\n" for sort keys %hash;

    # "keys" in scalar context yields the number of keys.
    my $uniqueIPs = keys %hash;
    print "Number of unique IPs: $uniqueIPs\n";

    __DATA__
    127.0.0.1 - - [10/Apr/2007:10:39:11 +0300] "GET / HTTP/1.1" 500 606 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:39:11 +0300] "GET /favicon.ico HTTP/1.1" 200 766 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    139.12.0.2 - - [10/Apr/2007:10:40:54 +0300] "GET / HTTP/1.1" 500 612 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    139.12.0.2 - - [10/Apr/2007:10:40:54 +0300] "GET /favicon.ico HTTP/1.1" 200 766 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:53:10 +0300] "GET / HTTP/1.1" 500 612 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:54:08 +0300] "GET / HTTP/1.0" 200 3700 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:54:08 +0300] "GET /style.css HTTP/1.1" 200 614 "http://pti.local/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:54:08 +0300] "GET /img/pti-round.jpg HTTP/1.1" 200 17524 "http://pti.local/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:54:21 +0300] "GET /unix_sysadmin.html HTTP/1.1" 200 3880 "http://pti.local/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    217.0.22.3 - - [10/Apr/2007:10:54:51 +0300] "GET / HTTP/1.1" 200 34 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    217.0.22.3 - - [10/Apr/2007:10:54:51 +0300] "GET /favicon.ico HTTP/1.1" 200 11514 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    217.0.22.3 - - [10/Apr/2007:10:54:53 +0300] "GET /cgi/pti.pl HTTP/1.1" 500 617 "http:/contact.local/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    127.0.0.1 - - [10/Apr/2007:10:54:08 +0300] "GET / HTTP/0.9" 200 3700 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    217.0.22.3 - - [10/Apr/2007:10:58:27 +0300] "GET / HTTP/1.1" 200 3700 "-" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    217.0.22.3 - - [10/Apr/2007:10:58:34 +0300] "GET /unix_sysadmin.html HTTP/1.1" 200 3880 "http://pti.local/" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"
    217.0.22.3 - - [10/Apr/2007:10:58:45 +0300] "GET /talks/Fundamentals/read-excel-file.html HTTP/1.1" 404 311 "http://pti.local/unix_sysadmin.html" "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.3) Gecko/20061201 Firefox/2.0.0.3 (Ubuntu-feisty)"

    Output:

    127.0.0.1 => 8
    139.12.0.2 => 2
    217.0.22.3 => 6
    Number of unique IPs: 3

    If you're not interested in the count for each IP, you can just use $hash{$1} = 1 instead of incrementing. Either way, the keys will contain the log file's unique IPs. Regexp::Common is used in capturing the IPs.

Re: unique visitors from html logfile
by mbethke (Hermit) on Nov 17, 2012 at 07:03 UTC
    There are actually only two little things that keep your code from working:
    1. You declare (and unnecessarily clear) %ips inside the loop.
    2. You assign the line contents to $ips{$site} instead of incrementing a counter there. $ips{$site}++ is fine---if the element doesn't exist, it will read as undef which in numeric context is a zero.

    I don't know how big your logs are but another thing you could do is improve that regexp: be as specific as you can about each field. At the very least that means specifying a non-blank character where you need one. On my machine, parsing a typical log line with your version takes about 5.2µs; if I change $w to /(\S+?)/, it's just about 0.9µs. Adding an /o flag to only have it interpolate and compile the regexp once brings it down to 0.35µs. I'm not sure why the difference is so big as there's not that much backtracking¹ but anyway it helps. I didn't benchmark whether it makes a speed difference but the assignment is much shorter to write as follows:

    # A match in list context returns its captures in order, so all eleven
    # fields can be assigned in one statement (the list is empty when the
    # match fails).  The "..." presumably stands in for the rest of the
    # pattern from the original script.
    my ($site, $logName, $fullName, $date, $time, $gmt,
        $req, $file, $proto, $status, $length) = $line =~ /^$w .../o;

    ¹ Using $w = "(.+)", which really backtracks a lot, takes a whopping 185µs per line.

Re: unique visitors from html logfile
by space_monk (Chaplain) on Nov 17, 2012 at 09:16 UTC

    It's not a good idea to use lots of print statements to output your webpage. Either use the CGI module or use heredocs to output your page in one or two statements. I prefer the latter, but I can understand that using the former may allow your pages to automatically keep up with changing standards.

    Also try and use the following when creating html:

    • Try to consistently use lowercase elements. Uppercase is so AOL/1995 coding style.
    • Use th for table heading columns, not td
    • Use CSS for layout where possible; having a css element saying your table is 500px wide is better than having the width set with the old (deprecated?) width attribute
    • Run your page through the W3C Validator or a local validator program to see how standards compliant your page is

    Also, it is probably a good idea not to hard-code the name of the logfile and the web output page into the file; these are really things that should be command line parameters so you can read any log file and output your web page to any file name you like.

    perl your_program <access.log >log.html

    Your code was resetting the list of IPs on every line, whereas you want a count through the whole logfile

    #!/usr/bin/perl
    # Summarise a web-server access log as an HTML page.
    #
    # Usage:  perl your_program access.log > log.html
    #    or:  perl your_program < access.log > log.html
    #
    # Log lines are read from the files named on the command line (or from
    # STDIN).  The HTML report goes to STDOUT; a per-address hit count and a
    # warning for every line that could not be parsed go to STDERR.
    use strict;
    use warnings;
    use 5.010;
    use POSIX qw(strftime);

    # create this outside the loop - it doesn't change
    # in fact it isn't used so why is it here at all? left in just in case
    my %dates = (
        'Jan' => '01',
        'Feb' => '02',
        'Mar' => '03',
        'Apr' => '04',
        'May' => '05',
        'Jun' => '06',
        'Jul' => '07',
        'Aug' => '08',
        'Sep' => '09',
        'Oct' => '10',
        'Nov' => '11',
        'Dec' => '12',
    );

    # Yesterday as dd/Mon/yyyy, so it can be compared with the date field
    # captured from each log line.
    my $yesterday     = strftime("%d/%b/%Y", localtime(time() - 86400));
    my $tm            = scalar(localtime);
    my $yesterdayHits = 0;
    my $totalhits     = 0;
    my $startDate;          # date field of the first parsed line
    my %ips;                # client address => number of hits
    my @rows;               # one HTML table row per parsed line

    # Compiled once, outside the loop.  Written with qr{}x rather than built
    # from a double-quoted "(\S+?)" string: inside double quotes the backslash
    # is dropped and the pattern would look for a literal "S".  Each field is
    # "non-blank" rather than ".", and anything after the length field (the
    # referer and user agent of the combined log format) is allowed but
    # ignored.
    my $log_re = qr{
        \A
        (?<site>\S+) [ ] (?<logName>\S+) [ ] (?<fullName>\S+) [ ]
        \[ (?<date>[^:\s]+) : (?<time>\S+) [ ] (?<gmt>[^\]\s]+) \] [ ]
        " (?<req>\S+) [ ] (?<file>\S+) [ ] (?<proto>[^"\s]+) " [ ]
        (?<status>\S+) [ ] (?<length>\S+)
        (?: [ ] .* )?
        \z
    }x;

    # Return $text with the characters that are special in HTML replaced by
    # entities.  Log lines are client-controlled (request path, user agent),
    # so they must not be written into the page verbatim.
    sub escape_html {
        my ($text) = @_;
        $text =~ s/&/&amp;/g;    # must come first
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    }

    # read from logfile(s) supplied on command line, instead of fixed file;
    # "while" reads one line at a time instead of loading the whole log.
    while (my $line = <>) {
        $line =~ s/\r?\n\z//;
        next if $line eq '';

        if ($line !~ $log_re) {
            warn "Skipping unparsable line $.: $line\n";
            next;
        }
        # Copy the captures straight away, before any other regex runs.
        my ($site, $date) = @+{qw(site date)};

        $totalhits++;       # counts parsed lines only
        $ips{$site}++;
        $startDate //= $date;
        $yesterdayHits++ if $date eq $yesterday;

        push @rows, sprintf "<tr><td>%s</td><td>%s</td></tr>\n",
            escape_html($site), escape_html($line);
    }

    # Real Men use Data::Dumper :-)
    foreach my $key ( sort keys %ips ) {
        print STDERR $key, " => ", $ips{$key}, "\n";
    }

    # The number of distinct keys is the number of unique visitors.
    my $IPcount = keys %ips;
    $startDate //= 'n/a';   # empty input: nothing to report
    my $rows = join '', @rows;

    # write to output file specified on command line instead...
    # (The closing EOF has to start in column 0 for perl to recognise it.)
    print <<"EOF";
    <html>
    <head><title>Access Counts</title></head>
    <body>
    <h1>Today is: $tm</h1>
    <h3>Yesterday was $yesterday</h3>
    <h3>There are $IPcount unique visitors in the log</h3>
    <table border="1" cellpadding="10" style="width: 500px">
    <tr><th>IP</th><th>LOGFILE</th></tr>
    $rows</table>
    <h2>Start Date is $startDate</h2>
    <h2>Total hits: $totalhits</h2>
    <h3>Hits Yesterday: $yesterdayHits</h3>
    </body>
    </html>
EOF
    A Monk aims to give answers to those who have none, and to learn from those who know more.

      The dates hash came about when I was trying to find the start date of the log; that idea fell apart, and I thought I had removed it from the code before posting.

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: perlquestion [id://1004276]
Approved by Athanasius
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others about the Monastery: (11)
As of 2014-07-22 20:00 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    My favorite superfluous repetitious redundant duplicative phrase is:









    Results (126 votes), past polls