#!/usr/bin/perl
################################################
# Title: crunchrep.pl
# Author: Clovis Sangrail
# Description: The crunchrep.pl Perl script will process one or more input
# data files consisting of GT.M Journal extract log records. It produces a
# report of exceptional transactions for possible further scrutiny by audit
# or security personnel. 
#
# Revision History
# Ver           Author		Date
# ---		------		----
# 1.0		Clovis Sangrail	Apr 2012
#	Initial port/rewrite of audit.gawk.
# 1.1		Clovis Sangrail	9-May-2012
#	Rewrite regex for Global Variable analysis to handle subscripts
# like """""ABU%"""" . Then change back.
################################################

#####################
# Calling Arguments #
#####################
# See usage function.

###########
# Modules #
###########
use strict;
use Getopt::Std;			# Process switches.
use File::Basename;

#############
# Variables #
#############
# File-level state shared by every section of the script below.
$| = 1;					# Flush the selected output handle after every write.
my $NMF = "namefile";			# Name of the file of userids to report upon. (Default.)
my $ALLF;				# Name of the journal extract log (data) file.
# Report data, indexed as $rpt{uid}{pid}{global}. Each leaf is a reference
# to a three-element array: [ timestamp, global text, update count ].
my %rpt;
my %opts;				# Hash of commandline switch => value.
my %rname;				# Hash of uid => real (GECOS) name, or "Name_Not_Found".
my @fields;				# Scratch array of fields split from the current record.
my $PWLINE;				# Scratch for the password-entry lookup result.
# $uid: userid currently being processed; $cnt: number of file-name arguments.
my ( $uid , $cnt);
my %matchpid;				# Hash of pid => uid, filled from '01' records.

#############
# Functions #
#############
sub usage				# Print the command-line help text.
{
    # Show only the file-name part of $0 so the help reads the same no
    # matter which path was used to start the script.
    my $script = basename($0);

    # The body of the here-document is emitted verbatim and its terminator
    # must sit in column one, so it cannot be indented or commented inline.
    my $help = <<"EOF";

$script usage:

$script -f <Glbfld#> [ -F <skipregex> -k <keepregex> <names>] <data>

The $script program reads input files of GT.M Journal Extract Log recs
generated by the MUPIP program and Unix UIDs and produces a report of
transactions performed by the users in the namefile. The <data> file
is required, the <names> file defaults to "namefile" in the current
directory.

Switches and Parameters:
    -f = Field number of Global Variable, currently 7 or 9 so far. This
         may change with new versions of GT.M and MUPIP. Required switch.
    -F = Perl regex of Global Variable names to omit from report. Optional.
    -k = Perl regex of Global Variable names for which subscripts will be
         preserved and reported on separately. Default is to aggregate all
         subscripts into single count. Optional.
<names>  File of User ID's (one per line) on which to report. Optional,
         defaults to "namefile" in current directory.
<data>   File of 01/04/05/10 mupip journal extract log records, sorted by
         PID and rectype within PID. Required parameter.

EOF

    print $help;
}

########
# Main #
########

#####################################################################
# Process the switch arguments. (See 'usage{}' function above for a #
# description of the switches and other commandline parameters.)    #
#####################################################################
getopt( 'fFk', \%opts );		# Each of -f, -F and -k takes a value.

# Pull the three switch values out in one go:
#   $gfld    - field number of the Global Variable (from -f),
#   $fltr    - regex of Globals to skip (from -F),
#   $keepsub - regex of Globals whose subscripts are not aggregated (-k).
# The two regexes are optional and stay undefined when not supplied.
my ( $gfld, $fltr, $keepsub ) = @opts{qw(f F k)};

# Only field numbers 7 and 9 are accepted; anything else is fatal.
unless ( $gfld == 7 || $gfld == 9 ) {
    usage();
    die "Invalid logfile global field number\n\n";
}
$gfld--;				# Perl arrays are indexed from zero.

##############################
#DBG#$fltr='^\^LOG\(?|^\^BCHLOG\(?|^\^ORSLOG\(?|^\^ZLOG\(?|^\^PROCID\(?|^\^SYSLOG\(?';
#DBG#$keepsub='^\^SCAU\(?';
#DBG#print "\n";
#DBG#print " 'f'  switch value: $opts{\"f\"} " . "\n";
#DBG#print " 'F'  switch value: $opts{\"F\"} " . "\n";
#DBG#print "fltr = $fltr \n";
#DBG#print " keepsub = $keepsub \n";
#DBG#print " 'k'  switch value: $opts{\"k\"} " . "\n";
#DBG#print "\n";
#DBG#exit(0);
##############################

###########################################################################
# Process filename arguments. With one arg it is the data file of journal #
# records and the namefile is the default. With two args the 1st is the   #
# namefile and the 2nd is the data file. Any other arg count is an error. #
###########################################################################
$cnt = scalar @ARGV;			# Args left once the switches are removed.

if ( 0 == $cnt ) {			# No data file named at all.
    usage();
    die "Need a journal extract log file name\n\n";
} elsif ( 1 == $cnt ) {			# Just the log file.
    $ALLF = $ARGV[0];
} elsif ( 2 == $cnt ) {			# Names file, then log file.
    ( $NMF, $ALLF ) = @ARGV;
} else {
    # Extra arguments used to be ignored without setting $ALLF, which only
    # surfaced later as a failure to open an empty file name.
    usage();
    die "Too many file name arguments\n\n";
}

#######################################################
# Open the $NAMES and $ALL files, err out on failure. #
#######################################################
# Use the three-argument form of open so both names are always treated as
# plain files to read. With the two-argument form, characters such as '|',
# '<' or '>' in a name are taken as an open mode or as a command to run.
open( NAMES, '<', $NMF )  or die "Cannot open $NMF : $!";
open( ALL,   '<', $ALLF ) or die "Cannot open $ALLF : $!";

############################################################################
# Process NAMES file. This is the list of User IDs (UIDs) on which we wish #
# to report. Look up each UID in the password database. If found set the   #
# rname{UID} hash entry to the real name (GECOS) field of its entry. If    #
# not found set that hash entry to "Name_Not_Found". Blank lines in the    #
# namefile are ignored.                                                    #
############################################################################
while ( my $id = <NAMES> ) {		# Read till EOF.

    $id =~ s/^\s+|\s+$//g;		# Trim newline and stray whitespace.
    next if $id eq "";			# A blank line names nobody.

    # Query the password database directly rather than interpolating the
    # userid into a shell "grep" command: the value can no longer be
    # interpreted by the shell, and only an exact login-name match counts
    # (grep also matched the text anywhere on any passwd line).
    my @pw = getpwnam($id);

    # The grep-based lookup could also hit a numeric uid field, so an
    # all-digit entry that is not a login name is retried as a uid.
    @pw = getpwuid($id) if !@pw && $id =~ /^\d+$/;

    # Element 6 of the returned list is the GECOS (real name) field.
    my $real = @pw ? $pw[6] : "Name_Not_Found";

    # The main loop tests these entries with defined(), so never store
    # undef for a userid that was listed.
    $rname{$id} = defined($real) ? $real : "";
}
close NAMES;				# Namefile is not read again.

######################################
#DBG##my ( $c , $mcnt , $pmil );
#DBG#my $uid;
#DBG#foreach $uid ( sort keys %rname ) {
#DBG#    print "uid: $uid , name: \"$rname{$uid}\" \n";
#DBG#}
#DBG#exit(0);
#DBG#$c=0;
#DBG#$mcnt=0;
#DBG#print "\n.";
######################################

############################################################################
# This is the main part. Read each line of the input file of MUPIP journal #
# extract log records. Input file is sorted by PID and by rectype within   #
# each PID. Skip all but '01', '04', '05', and '10' record types. Skip '01'#
# recs if UID (6th field) was not among those in the namefile, else save   #
# PID and set the matchpid entry for that PID to be the UID. For 04/05/10  #
# skip if matchpid entry for that PID not defined, or if Global Var is in  #
# the ignore list. Replace any subscripts with "(..)" unless Global is in  #
# the do-not-aggregate list. Translate timestamp and either make new array #
# entry of [ timestamp, global, cnt=1 ] for rpt{uid}{pid}{global} or       #
# increment count of the existing entry.                                   #
############################################################################
while (<ALL>) {

    my $curpid;				# PID of the current update record.
    my ( $wholeglob, $saveglob, $startglob );
					# Parts of Global Var.
    my $stamp;				# Xlate of Horolog format.

    chomp;				# Remove trailing newline.

    ################################
    #DBG#if( 100 == $c++ ) {
    #DBG#    $c=0;
    #DBG#    print ".";
    #DBG#    if( 0 == ( ++$mcnt % 10 ) ) {
    #DBG#        $pmil = $mcnt / 10;
    #DBG#        print "\n $pmil K recs\n";
    #DBG#    }
    #DBG#}
    ################################

    # Split on backslashes. The limit stops the split at the Global Var
    # field, so that field keeps any backslashes found in the rest of
    # the record.
    @fields = split /\\/ , $_ , $gfld+1;

    ################################
    #DBG#print "$_ \nrectype: x$fields[0]x\n";
    ################################

    ##################################################################
    # If we find an '01' record, skip if username is not among those #
    # found when the namefile was processed.                         #
    ##################################################################
    if( "01" eq $fields[0] ) {		# Record type '01'?
        next unless defined( $rname{$fields[5]} );
					# Skip if not read from namefile.
        $matchpid{ $fields[3] } = $fields[5];
					# Save UID matching PID.
        #DBG#print "Inside 01 compare. pid = $fields[3] , matching uid = $matchpid{$fields[3]}\n";
    }

    #####################################################################
    # Global variables are altered by record types '04, '05', and '10'. #
    #####################################################################
    # The type is matched as a whole field, the same way the '01' test
    # above compares it, so a longer field that merely contains one of
    # these digit pairs is not mistaken for an update record.
    if( $fields[0] =~ /\A(?:04|05|10)\z/ ) {
        #DBG#print "Inside 05 compare. pid = $fields[3] , matching uid = $matchpid{$fields[3]}\n";
        next unless defined( $matchpid{$fields[3]} );
					# Skip PIDs with no accepted '01' rec.
        $curpid = $fields[3];
        $uid = $matchpid{$curpid};

        ##################################################################
        # Now extract the initial portion of the Global Variable being   #
        # affected by this record, and also extract the whole Variable.  #
        # (include any subscripts present).                              #
        ##################################################################
        #DBG#print "Global Field = $fields[$gfld] \n";
        # Earlier, stricter pattern kept for reference:
        #if($fields[$gfld] =~ /^((\^[%A-Za-z\d]+)($|=|\(([^"]+|(("+)[^"]+\6)+)+\)))/ ) {
        if($fields[$gfld] =~ /^((\^[%A-Za-z\d]+)($|=|.*?\)))/ ) {
            $wholeglob = $1;		# Name plus whatever followed it.
            $startglob = $2;		# Name only, e.g. ^ABC.
        } else {
            print "Warning: Cannot parse $fields[$gfld] for pid $curpid \n";
            next;			# Report and drop unparsable records.
        }
        #DBG#print "whole global = $wholeglob \n";

        #################################################################
        # If this Global's start matches the regex, input via switch on #
        # the commandline, of variables to skip, then skip it.          #
        #################################################################
        if( ($fltr ne "") &&  ($startglob =~ /$fltr/) ) {
            #DBG#print "Skipping global $wholeglob \n";
            next;
        }

        ###############################################################
        # The GTM Journal timestamp is in HoroLog format: DDDDD,SSSSS #
        # where DDDDD = days since 12/31/1840 midnight and SSSSS =    #
        # seconds since midnight. We need to convert this to Timestamp#
        # format (secs since start of 1/1/1970). (Note: 47117 is the  #
        # number of days between midnight 12/31/1840 and 1/1/1970.)   #
        ###############################################################
        if( $fields[1] =~ /^(\d+),(\d+)/ ) {
					# Extract DDDDD & SSSSS via regex.
            # NOTE(review): the six-hour term is a fixed CST offset;
            # daylight saving time is not taken into account.
            $stamp = ( $1 - 47117 ) * 86400 + $2 + 3600 * 6;
        } else {
            # The record is still counted below; $stamp just stays
            # undefined for it.
            print "Cannot parse horolog field $fields[1] \n";
        }

        #################################################################
        # Now match the start of the Global variable against the search #
        # regex of variables that we do not aggregate. These are vars   #
        # for which we keep separate counts of each different set of    #
        # subscripts that are modified. If we find a match, then leave  #
        # the variable alone. If no match, then replace any subscripts  #
        # with the string (..) .                                        #
        #################################################################
        $saveglob = "";			# Empty means "was not aggregated".
        if( ($keepsub ne "") && ($startglob =~ /$keepsub/) ) {
            # If match, preserve Global
            #DBG#print "$wholeglob will not be aggregated  \n";
        } else {
            if( $wholeglob =~ /\(/ ) {  # Aggregate if have subscripts.
                $saveglob = $wholeglob;	# Remember the subscripted form.
                $wholeglob = $startglob . "(..)";
                #DBG#print "Aggregating $saveglob into $wholeglob \n";
            }
        }

        ##############################################################
        # Finally, if we already have an entry for this (possibly an #
        # aggregated) Global then increment its count. If this is a  #
        # new one then create the initial [ timestamp, global, 1 ]   #
        # array for the newly-created hash element to reference.     #
        ##############################################################
        if( defined( $rpt{$uid}{$curpid}{$wholeglob} ) ) {
					# Have entry, increment count.
            $rpt{$uid}{$curpid}{$wholeglob}[2]++;
            #DBG#print "inc node: uid = $uid pid = $curpid Global = $wholeglob \n";
        } else {			# Create new node, cnt=1.
            #DBG#print "new node: uid = $uid pid = $curpid Global = $wholeglob tstamp = $stamp \n";
            # Element 1 keeps the Global as first seen: the original
            # subscripted text when it was aggregated, else the key.
            $rpt{$uid}{$curpid}{$wholeglob} =
                [ $stamp , ( $saveglob eq "" ? $wholeglob : $saveglob ) , 1 ];
        }
    }
    #DBG#print "Did /04/05/10 \n";
}

####################################################################
# At this point the three-dimensional %rpt hash is loaded with the #
# report data, we're ready to print the report. The %rpt hash is   #
# dimensioned as rpt{uid}{pid}{global}.                            #
####################################################################
for my $user ( sort keys %rpt ) {	# One report section per UID.

    #################################
    # Heading for the current UID.  #
    #################################
    printf "%s: %s\n" , $user , $rname{$user};
    print  "YYYYMMDD hh:mm       PID     Global\n";
    print  "-------------- ------------- ------\n";

    my @lines;				# Formatted detail lines for this UID.
    my $by_pid = $rpt{$user};		# Second level: pid => globals.

    for my $pid ( keys %$by_pid ) {

        my $by_glob = $by_pid->{$pid};	# Third level: global => entry.

        for my $glob ( keys %$by_glob ) {

            # Each entry holds the timestamp of the first access, the
            # Global text saved with it, and the number of accesses.
            my ( $first_seen, $saved_text, $updates ) = @{ $by_glob->{$glob} };
            my $single = ( 1 == $updates );

            # A single update shows the saved text (with its subscripts);
            # several updates show the hash key, which is "(..)" unless
            # the Global's subscripts were kept.
            my $label = $single ? $saved_text : $glob;

            # Only minute .. year of the broken-down local time are used.
            my ( $min, $hour, $mday, $mon, $year ) =
                ( localtime $first_seen )[ 1 .. 5 ];

            # YYYYMMDD HH:MM  pid  Globalname
            my $text = sprintf "%4d%02d%02d %02d:%02d%13s  %s" ,
                   1900 + $year, 1 + $mon, $mday,
                   $hour, $min, $pid, $label;

            # Append the update count only when there was more than one.
            push @lines , $single
                ? sprintf( "%s\n", $text )
                : sprintf( "%s\t(%d updates)\n" , $text, $updates );
        }
    }

    #######################################################################
    # Print the detail lines for this UID. A plain string sort puts them  #
    # in date/time order, and PID order within that.                      #
    #######################################################################
    print sort @lines;
    print "\n";				# Blank line before the next user.
}