Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW

log parser fails

by kazak (Beadle)
on Aug 07, 2012 at 08:20 UTC ( #985912=perlquestion: print w/replies, xml ) Need Help??
kazak has asked for the wisdom of the Perl Monks concerning the following question:

Hi to all. I'm trying to write some log parser for my needs and idea is following:

1. Parser determines locations of all access_logs on this host.(works).

2. Prser opens them for reading one by one.(it works)

3. Parses them with regexp from last line to first one, (but only in bounds of some time range) in order to count all ip hits, referrerers, etc. (works ONLY with file handle, if I use File::ReadBackwards regexp stops to work). I just don't get it why it fails, and need some help for this to solve, thanks in advance to all community for all help.

Here a sample of strings I'm trying to parse:, - - [21/Apr/2012:04:35:01 +0200 +] "GET /seo/vbseocp.php HTTP/1.0" 404 300 "-" "Internet Explorer 6.0", - - [22/Apr/2012:04:00:43 +0200] "G +ET / HTTP/1.0" 200 10211 " +m" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1;)"

And here is my code:

#!/usr/bin/perl use strict; use warnings; use File::Find; use DateTime; use Getopt::Std; use DateTime::Format::HTTP; use Data::Dumper; use File::ReadBackwards qw(); my (@apache_confs,@log_paths,@tmp); my $start_point = "/etc/apache2/sites-enabled"; my $conf_ext = ".conf"; my ($second, $minute, $hour, $day, $month, $year, $weekday, $dayofyear +, $isDST) = localtime(time); $year += 1900; $month++; my $log_date = 'DateTime::Format::HTTP'; my %options = (); my %source_ip = (); my %referrers= (); my %urls = (); my %agents =(); getopts ('d:r:' => \%options); die "No time range duration object defined" if !$options{'d'}; die "No time range defined for back tracing." if !$options{'r'}; my $now = DateTime->new(year => $year, month => $month, day => $day, +hour => $hour, minute => $minute, second => $second); my $start_time = $now->clone->subtract( $options{'d'} => $options{'r'} + ); print "Start from: $start_time Now: $now\n"; my ($line,$source_host,$my_host,$internal_redirect,$date,$url_with_met +hod,$status,$size,$referrer,$agent,$end_time,$check_time,$vhost_name) +; finddepth(\&stat, $start_point); sub stat { my ($log_path); my $apache_configs_found = scalar(@apache_confs); open (F, "<","$File::Find::name") and push @apache_confs, "$Fi +le::Find::name" && print "$File::Find::name FOUND apache configs COU +NT: $apache_configs_found \n" if m/$conf_ext/; while (<F>) { chomp; s/(\s)//g; s/#.*//; next if /^(\s)*$/; if (/CustomLog/) { $log_path = $_; $log_path =~ s/CustomLog//; $log_path =~ s/combined//; push @log_paths, $log_path; print "Logpath $log_path\n"; } } close(F); foreach (@log_paths) { my $fh_in = File::ReadBackwards->new($_) or die("Unabl +e to open \"$_\": $!\n"); while (defined($line = $fh_in->readline())) { chomp($line); print "$line|\n"; ($source_host,$my_host,$internal_redirect,$date,$url_with_meth +od,$status,$size,$referrer,$agent) = $line =~ m@^(\S+?), (\S+) (\S+) +- - \[(\d{2})/(\w+)/(\d{4}):\s*(\d{2}):(\d{2}):\s*(\d{2}) \+(\d{4})\] + ".*?" (\d{3}) (\d+) "(.*?)" "(.*?)"@; print Data::Dumper->Dump( [ \$line, \$source_host,\$my_host,\$internal_r +edirect,\$date,\$url_with_method,\$status,\$size,\$referrer,\$agent ] +, [qw(*line *source_host *my_host *internal_redi +rect *date *url_with_method *status *size *referrer *agent)], ), qq{\n}; $date =~ s/\[//g; $date =~ s/\]//g; eval { $check_time = $log_date->parse_datetime +($date)}; my $cmp = DateTime->compare( $check_time, $sta +rt_time ); next if $cmp >= 0 ; print "CMP$cmp STIME:$check_time SH:$source_ho +st, MH:$my_host, IR:$internal_redirect, D:$date, U:$url_with_method, +S:$status, SZ:$size, R:$referrer, A:$agent\n"; $source_ip{$source_host}++; $referrers{$referrer}++; $urls{$url_with_method}++; $agents{$agent}++; } }

Replies are listed 'Best First'.
Re: log parser fails
by aitap (Deacon) on Aug 07, 2012 at 08:57 UTC
    There is a trouble in your code at this line:
    ($source_host,$my_host,$internal_redirect,$date,$url_with_method,$status,$size,$referrer,$agent) = $line =~ m@^(\S+?), (\S+) (\S+) - - \[(\d{2})/(\w+)/(\d{4}):\s*(\d{2}):(\d{2}):\s*(\d{$
    Your search pattern which began with @ is not terminated anywhere in the file. It seems that a lot of text from the end of this line was lost.
    Sorry if my advice was wrong.
      Sorry, this trouble occured during copying and that's not it. But thanks anyway. I'll fix it in order to show whole picture.
        After moving some brackets in your regular expression it looks like this:
        m@^(\S+?), (\S+) (\S+) - - (\[\d{2}/\w+/\d{4}:\s*\d{2}:\d{2}:\s*\d{2} +\+\d{4}\]) "(.*?)" (\d{3}) (\d+) "(.*?)" "(.*?)"@;
        I can run the following code:
        #!/usr/bin/perl use warnings; use strict; use File::ReadBackwards; use Data::Dumper; my $fh_in = File::ReadBackwards->new($ARGV[0]) or die("Unable to open +\"$_\": $!\n"); my ($line,$source_host,$my_host,$internal_redirect,$date,$url_with_met +hod,$status,$size,$referrer,$agent,$end_time,$check_time,$vhost_name) +; while (defined($line = $fh_in->readline())) { chomp($line); print "$line|\n"; ($source_host,$my_host,$internal_redirect,$date,$url_with_meth +od,$status,$size,$referrer,$agent) = $line =~ m@^(\S+?), (\S+) (\S+) +- - \[(\d{2}/\w+/\d{4}:\s*\d{2}:\d{2}:\s*\d{2} \+\d{4})\] "(.*?)" (\d +{3}) (\d+) "(.*?)" "(.*?)"@; print Data::Dumper->Dump( [ \$line, \$source_host,\$my_host,\$internal_r +edirect,\$date,\$url_with_method,\$status,\$size,\$referrer,\$agent ] +, [qw(*line *source_host *my_host *internal_redi +rect *date *url_with_method *status *size *referrer *agent)], ), qq{\n}; print "SH:$source_host, MH:$my_host, IR:$internal_redirect, D: +$date, U:$url_with_method, S:$status, SZ:$size, R:$referrer, A:$agent +\n"; }
        on your sample data, and it seems to detect all fields properly.

        // Удачи!
        Sorry if my advice was wrong.

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: perlquestion [id://985912]
Front-paged by Corion
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others avoiding work at the Monastery: (6)
As of 2017-03-28 10:58 GMT
Find Nodes?
    Voting Booth?
    Should Pluto Get Its Planethood Back?

    Results (329 votes). Check out past polls.