http://www.perlmonks.org?node_id=135860

   1: #!/usr/bin/perl
   2: 
   3: #############################
   4: #
   5: # Version 2.0
   6: #
   7: # A simple multi-format log parser which is intended to
   8: # be used as a filter.  Could be faster, but it does
   9: # allow you to define a pretty output format.
  10: #
  11: # Author: Chris Jensen
  12: #
  13: # Update:
  14: #
  15: #     - If log format is unspecified, an attempt is
  16: #       made to determine the closest matching format
  17: #       by analyzing a log entry.
  18: #
  19: #     - Reduced amount of code; Sub-formats defined
  20: #       similar to log formats; Minor changes.
  21: #
  22: 
  23: use Getopt::Long;
  24: 
  25: my %optctl;   # filled by GetOptions with the keys 'type' and 'pattern'
  26: GetOptions (\%optctl, "type|t=s", "pattern|p=s");
  27: # Log formats: name => [ regex with one capture per field, [field letters in capture order] ].
  28: # NOTE(review): the bytes field is (\d+), so an entry logged with "-" for bytes will not match.
  29: my $log_formats = {
  30:   'common'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c b)] ],
  31:   'virtual'  => [ qr{(\S+) (\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+)}, [qw(v h l u t r c b)] ],
  32:   'combined' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\"}, [qw(h l u t r c b R A)] ],
  33:   'referer'  => [ qr{(\S+) \-\> (\S+)}, [qw(R r)] ],
  34:   'agent'    => [ qr{(\S+)}, [qw(A)] ],
  35:   'extended' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c b R A P T)] ],
  36:   'custom'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+)}, [qw(h l u t r c b A R T)] ],   # NOTE(review): A precedes R here, unlike 'combined'/'extended' - confirm against the custom log layout
  37: };
  38: 
  39: 
  40: my $type = $optctl{type} || 'unknown';   # 'unknown' => auto-detect in the read loop
  41: 
  42: my $pattern = $optctl{pattern} or usage();
  43: 
  44: # Reject a mistyped -t: otherwise $format stays undefined and the read loop matches against an empty regex.
  45: die "Unknown log type '$type'\n" unless $type eq 'unknown' || exists $log_formats->{$type};
  46: my ($format, $control) = @{ $log_formats->{$type} || [] };
  47: 
  48: # One pass builds both the field list (@pats) and the printf template ($outpat) so they cannot drift apart: "%%" stays a literal percent, "%<flags><letter>" becomes "%<flags>s" and records <letter>; text before the first "%" is never taken for a field.
  49: my @pats;
  50: (my $outpat = $pattern . "\n") =~
  51:     s/(\%\%)|(\%[^a-zA-Z\_\%]*)([a-zA-Z\_])/defined $1 ? $1 : do { push(@pats, $3); $2 . 's' }/ge;
  52: 
  53: 
  54: # Sub-formats split one captured field further; they use the same layout as $log_formats:
  55: # field letter => [ regex applied to that field's value, [letters assigned to its captures, in order] ].
  56: my $sub_formats = {
  57:   't' => [ qr{(\d+)\/(\w+)\/(\d+)\:(\d+)\:(\d+)\:(\d+)\s}, [qw(d m y H M S)] ],   # time: day, month, year, hour, minute, second
  58:   'r' => [ qr{(\w+)\s([^\?]*)\??([^\s]*)?\s(.*)}, [qw(a f q p)] ],   # request: method, file path, query string, protocol
  59:   'u' => [ qr{(\w*)\-(\w*)}, [qw(s i)] ],   # user: session id, user id
  60:   'R' => [ qr{.*\:\/\/([^\/]+)(\/.*)}, [qw(o F)] ]   # referrer: host, file path
  61: };
  62: 
  63: 
  64: while (<>) {
  65: 
  66:     # Auto-detect the log type from the first line read: of all formats
  67:     # whose regex matches the line, keep the one with the most fields.
  68: 
  69:     if ($type eq 'unknown') {
  70:         my $most = 0;
  71:         while (my ($name, $spec) = each %{$log_formats}) {
  72:             my ($re, $fields) = @{$spec};
  73:             next unless /$re/ && @{$fields} > $most;
  74:             $most = @{$fields};
  75:             ($type, $format, $control) = ($name, $re, $fields);
  76:         }
  77:         die "Can't auto-determine log type\n" if ($type eq 'unknown');
  78:     }
  79: 
  80:     # Match in list context so the captures arrive as a plain list. This
  81:     # replaces symbolic references such as ${++$x} (illegal under "use strict")
  82:     # and does not depend on $1..$n being restored after the nested match.
  83: 
  84:     my %info;
  85:     if (my @caps = /$format/) {
  86: 
  87:         # Top-level fields: one capture per control letter, in order.
  88:         foreach my $ctl (@{$control}) {
  89:             $info{$ctl} = shift @caps;
  90: 
  91:             # Fields that have a sub-format are split further; a sub-format
  92:             # that does not match leaves its letters unset (printed empty).
  93:             my ($sfmt, $sctl) = @{ $sub_formats->{$ctl} || [] };
  94:             next unless defined($sfmt);
  95:             my @sub = $info{$ctl} =~ /$sfmt/ or next;
  96:             @info{ @{$sctl} } = @sub;
  97:         }
  98: 
  99:         # Values in the order the output pattern asked for them.
 100:         my @vals = @info{@pats};
 101: 
 102:         printf $outpat, @vals;
 103:     }
 104: }
 105: 
 106: 
 107: sub usage {
 108:     # Print the help text and exit; never returns. It goes to STDERR because STDOUT carries the filtered log output.
 109:     print STDERR qq{
 110: usage: logparse [-t=<type>] -p=<pattern>
 111: 
 112: example: tail -50 access_log | logparse -t=extended -p="%H:%M  %-15o  %f"
 113: 
 114: Formatting characters:
 115: 
 116:   v   - The virtual host name/address
 117:   h   - The host IP name/address
 118:   l   - The remote logname
 119:   u   - Remote User/Session
 120:   t   - The time of the request
 121:   r   - The full request
 122:   c   - The HTTP code (302, 200, etc)
 123:   b   - Bytes
 124:   R   - Referrer string
 125:   A   - User Agent string
 126:   P   - Process ID
 127:   T   - Time taken in seconds
 128: 
 129: Request string breakdown:
 130: 
 131:   a   - Action/Method (GET, POST, etc)
 132:   f   - File path
 133:   q   - Query string
 134:   p   - HTTP protocol version
 135: 
 136: Time of request breakdown:
 137: 
 138:   d   - Day of the month
 139:   m   - Month (Apr, May, etc)
 140:   y   - Year
 141:   H   - Hour
 142:   M   - Minute
 143:   S   - Second
 144: 
 145: User Session breakdown:
 146: 
 147:   s   - Session ID
 148:   i   - User ID
 149: 
 150: Referrer string breakdown:
 151: 
 152:   o   - Host of referrer
 153:   F   - File path of referrer
 154: 
 155: };
 156:     # usage() is only called when -p is missing, which is an error: report failure to the shell.
 157:     exit(1);
 158: 
 159: }

Replies are listed 'Best First'.
Re: Multi-Format Log Parser - Version 2.0
by grinder (Bishop) on Jan 16, 2002 at 17:07 UTC
    Neat stuff. ++ for using \"([^\"]*)\" instead of \"(.*?)\" that is all too often seen.

    Bear in mind though, that strange User-Agent strings can break your regexp. Specifically, I once encountered "Slurp 1.0" (literally, with the quotes) as a user agent in my log file.

    This was a real bugger to work around. I suppose a sufficiently well crafted regexp could extract foo from "foo" as well as bar from ""bar"". I solved the problem in a two-step process, by matching the prior fields, and then matching the latter fields, and then what was left was the user agent field. Keep in mind that ""user "foo" bar" could appear as a user agent. It gets icky.

    --
    g r i n d e r
    print@_{sort keys %_},$/if%_=split//,'= & *a?b:e\f/h^h!j+n,o@o;r$s-t%t#u';
      You're right about the improper use of quotes within a user agent string. That could cause pattern matches to fail, and those would be skipped. I'm thinking about adding an option to print log lines that don't match the currently selected format to STDERR, or a count of lines that didn't match. From using this on a fairly large web site, I know the patterns match our traffic fairly well, but it will be interesting to see how many lines don't match and why. I did a dump of counts per unique user agent string using this log parser a few days ago for our QA department and in one day's worth of logs there were 82,279 unique user agent strings. Our QA guys are after percentages of traffic per browser and platform, and I don't relish their job of parsing all the user agent strings to get that information since they don't follow any standardized format.
        I implemented a quick debug option that spits non-matches out to STDERR. In testing I found a pattern bug with byte counts of 304 log entries. Both are fixed in the following diff:
        26c26
        < GetOptions (\%optctl, "type|t=s", "pattern|p=s");
        ---
        > GetOptions (\%optctl, "type|t=s", "pattern|p=s", "debug|d=i");
        30,32c30,32
        <   'common'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c b)] ],
        <   'virtual'  => [ qr{(\S+) (\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+)}, [qw(v h l u t r c b)] ],
        <   'combined' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\"}, [qw(h l u t r c b R A)] ],
        ---
        >   'common'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) ([\d\-]+)}, [qw(h l u t r c b)] ],
        >   'virtual'  => [ qr{(\S+) (\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) ([\d\-]+)}, [qw(v h l u t r c b)] ],
        >   'combined' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) ([\d\-]+) \"([^\"]*)\" \"([^\"]*)\"}, [qw(h l u t r c b R A)] ],
        35,36c35,36
        <   'extended' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c b R A P T)] ],
        <   'custom'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) (\d+) \"([^\"]*)\" \"([^\"]*)\" (\d+)}, [qw(h l u t r c b A R T)] ],
        ---
        >   'extended' => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) ([\d\-]+) \"([^\"]*)\" \"([^\"]*)\" (\d+) (\d+)}, [qw(h l u t r c b R A P T)] ],
        >   'custom'   => [ qr{(\S+) (\S+) (\S+) \[([^\]]*)\] \"([^\"]*)\" (\d+) ([\d\-]+) \"([^\"]*)\" \"([^\"]*)\" (\d+)}, [qw(h l u t r c b A R T)] ],
        102a103,104
        >     } elsif ($optctl{debug} == 1) {
        >         print STDERR $_;

        With the new patterns, a quick match against 79154 lines from an access log of 'extended' format had 8 lines which didn't match. All of them were because of quotes in the request or the user agent strings.

        Here's a user agent that didn't match...
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; Q312461; <HTML><A%20HREF="http://www.pghconnect.com/">www.pghconnect.com</a></HTML>)"