Beefy Boxes and Bandwidth Generously Provided by pair Networks
There's more than one way to do things
 
PerlMonks  

Multiline log parsing (solution)

by Jenda (Abbot)
on Sep 26, 2002 at 21:41 UTC ( #201070=perlquestion: print w/ replies, xml ) Need Help??
Jenda has asked for the wisdom of the Perl Monks concerning the following question:

Two days ago I asked for ideas regarding parsing logs, looking for unexpected messages, incomplete multiline messages and similar stuff. sauoq and rje suggested "state machine" ... which was just what I needed.

For those interested, here is the result:

Template for machine definitions

#Empty.parser my ($error); # local variables my $error_fun = sub {output("Error: $1\n\tat line $_[0]")}; # shared e +rror function { '!INIT' => ['START', sub {$error = 0;}], # reset local variables # [ the initial state, the initialization ] START => [ [ qr'^ServiceName \d+\.\d+.\d+$' => 'START'], # [ regexp, state to go to, function to run (first paramet +er is the line number, second the line itself) ] [ qr'^---- Ticking: \d{4}/\d\d/\d\d \d\d:\d\d:\d\d - \d\d:\d\d +:\d\d$' => 'START'], [ qr'^Blah ' => 'OTHER_STATE', sub { }], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], START_EOF => sub {output("OK") unless $error}, # specifies the function to execute if EOF is encountered in S +TART state ERROR_TIME => [ [ qr'\tat ' => 'OTHER_STATE'], ], '!EOF' => sub {output("Unexpected EOF!")}, # specifies the function to execute if EOF is encountered in a + state that doesn't define its own handler '!UNEXPECTED' => ['START', sub {output("Unexpected text at line $_ +[0]"); $error++}], # [ the state to move to if no regexp of the current state mat +ched the line, function to call ] }

An actual machine (one of several)

#FileCreate.parser my ($sites, $files, $jobs, $site_start, $error); my $error_fun = sub {output("Error: $1\n\tat line $_[0]")}; { '!INIT' => ['START', sub {$sites = 0; $files = 0; $jobs = 0;$error + = 0;}], START => [ [ qr'^FileCreate \d+\.\d+.\d+$' => 'START'], [ qr'^---- Ticking: \d{4}/\d\d/\d\d \d\d:\d\d:\d\d - \d\d:\d\d +:\d\d$' => 'START'], [ qr'^Creating files for site ' => 'FILES', sub {$sites++; $si +te_start=$_[0];}], [ qr'^Going down at ' => 'START'], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], START_EOF => sub {output("OK ($sites Sites, $files Files, $jobs Jo +bs)") unless $error}, FILES => [ [ qr'^\tCreating file ' => 'FILE', sub {$files++}], [ qr'^File generation succeeded for site ' => 'START'], [ qr'^File generation FAILED for site (.*)' => 'START', sub {o +utput("Failed for $1 at line $site_start"); $error=1}], [ qr'^Jobs for site \d+ with parameter type "\w+" are to be pr +ocessed by HTTPPost or something.' => 'FILES'], [ qr'^Site \d+ has posting parameters either only for single o +r for package jobs!!!' => 'FILES'], [ qr'^Sending delete request for job' => 'FILES', sub {$jobs++ +}], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], FILE => [ [ qr'^\t\tThere are \d+ jobs.$' => 'FILE_OPT'], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], FILE_OPT => [ [ qr'^\t\tFlat File$' => 'JOBS'], [ qr'^\t\tXML File$' => 'JOBS_XML'], [ qr'^\t\t((Comma|Tab) Delimited|Delimited by .*)$' => 'JOBS_D +ELIM'], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], JOBS_DELIM => [ [ qr'^\t\tCreating 2 delimited files: ' => 'JOBS'], [ qr'^\t\t\tJob ID:' => 'JOBS', sub {$jobs++}], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], JOBS_XML => [ [ qr'^\t\t(using DTD: .*|without DTD)' => 'JOBS'], [ qr'^\t\t\tJob ID:' => 'JOBS', sub {$jobs++}], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], JOBS => [ [ qr'^\t\t\tJob ID:' => 'JOBS', sub {$jobs++}], [ qr'^\t\tZipped' => 'JOBS_END'], [ qr'^\t\tDONE' => 'FILES'], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], JOBS_END => [ [ qr'^\t\tDONE' => 'FILES'], [ qr'^ERROR: (.*)$' => ERROR_TIME, $error_fun], ], ERROR_TIME => [ [ qr'\tat ' => 'FILES'], ], '!EOF' => sub {output("Unexpected EOF!")}, '!UNEXPECTED' => ['START', sub {output("Unexpected text at line $_ +[0]"); $error++}], }

A script I use to debug them

#parse.pl use strict; BEGIN { # dirty hack. If I define the DEBUG as a constant, not as a variab +le the debug prints are optimized out during compilation if ($ARGV[0] =~ m{^[/-]d}) { shift(@ARGV); eval "sub DEBUG () {1}" } elsif ($ARGV[0] =~ m{^[/-]D}) { shift(@ARGV); eval "sub DEBUG () {2}" } else { eval "sub DEBUG () {0}" } } my $machines = readParsers('c:\dir\with\machines'); my $type = shift(@ARGV) or die "usage: parse type [files...]"; if (! @ARGV) { @ARGV = glob "logs/*.$type.log"; } foreach my $file (@ARGV) { do_file( $file, $machines->{lc $type}); } sub output {print @_,"\n"} # funcs sub readParsers { my %machines; my $dir = shift(); opendir my $DIR, $dir or die "Can't open directory $dir : $!\n"; while (defined(my $file = readdir $DIR)) { next unless $file =~ /^(.*)\.parser$/i; my $type = $1; $machines{lc $type} = do "$dir/$type.parser" or die "Failed to read the state machine $dir/$type.parser +:\n $@\n"; } return \%machines; } sub do_file { my ($file, $machine) = @_; die "Second parameter to do_file must be a state machine!\n" unless defined $machine and ref $machine eq 'HASH' and exists +$machine->{'!INIT'}; open IN, '< ' . $file or return print "Can't open $file : $!\n"; print "$file\n"; my $state_name = $machine->{'!INIT'}[0]; my $state = $machine->{$state_name}; $machine->{'!INIT'}[1]->() if (ref $machine->{'!INIT'}[1] eq 'CODE +'); my $line_no = 0; LINE: while (defined(my $line = <IN>)) { chomp($line); $line_no++; next if $line eq ''; print "\nSTATE: $state_name\nLINE: $line\n" if DEBUG; foreach my $transition (@{$state}) { print "RE: ".$transition->[0]."\n" if DEBUG > 1; next unless $line =~ $transition->[0]; $transition->[2]->($line_no, $line) if (ref $transition->[2] eq 'CODE'); $state_name = $transition->[1]; $state = $machine->{$state_name}; die qq{Unknown state "$state_name" !!!\n} unless $state; next LINE; } # no regexp matched! if (ref $machine->{'!UNEXPECTED'} and ref $machine->{'!UNEXPEC +TED'}->[1] eq 'CODE') { $machine->{'!UNEXPECTED'}->[1]->( $line_no, $line); } else { output("Unexpected input at line $line_no:\n\t$line"); } if (ref $machine->{'!UNEXPECTED'} and $machine->{'!UNEXPECTED' +}->[0]) { $state_name = $machine->{'!UNEXPECTED'}->[0]; $state = $machine->{$state_name}; } else { $state_name = $machine->{'!INIT'}[0]; $state = $machine->{$state_name}; } die qq{Unknown state "$state_name" !!!\n} unless $state; } print "STATE at EOF: $state_name\n" if DEBUG; if (exists $machine->{$state_name . "_EOF"}) { $machine->{$state_name . "_EOF"}->(); } else { if (ref $machine->{'!EOF'} and ref $machine->{'!EOF'} eq 'CODE +') { $machine->{'!EOF'}->(); } else { output("Unexpected EOF"); } } }

Please take this just as an example! Comments appreciated (though ... it does what I need so ... ;-)

Jenda

Comment on Multiline log parsing (solution)
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: perlquestion [id://201070]
Approved by valdez
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others taking refuge in the Monastery: (8)
As of 2014-10-02 17:51 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    What is your favourite meta-syntactic variable name?














    Results (68 votes), past polls