Beefy Boxes and Bandwidth Generously Provided by pair Networks
good chemistry is complicated,
and a little bit messy -LW
 
PerlMonks  

Substring Finding/Counting

by Guildenstern (Deacon)
on Aug 30, 2000 at 01:03 UTC ( #30216=sourcecode: print w/ replies, xml ) Need Help??

Category: Miscellaneous
Author/Contact Info Chad Johnston cjohnston@rockstardevelopment.com Guildenstern
Description: Originally written to find common substrings in raw packet data. Can take a file of strings and will count all occurrences of substrings. Can take parameters for smallest substring and number of minimum matches to show to speed it up. I only tested it with the characters 0-9 and A-Z, so the regex may need to be protected if you use different data.
# Packet parser
# Version 1.5 - 29 Aug 2000
#
# I wrote this to inspect raw hex data from network packets
# and find how many times all possible substrings occur.
# Example 1: I was able to determine for one product that
# a particular string was present in all packets and decrypted
# to the administrator's username.
# Example 2: Another program uses tokens inserted into the packets
# to verify to teh application that the remote connections is allowed.
# I was able to determine what that token was using this program.
#
# Typical usage: pp.pl -f myinput.txt > output.fil
use strict;
use IO::File;        # we _need_ a temp file
sub usage();
sub get_cmd_opts();

usage() unless @ARGV;

my $level;           # length of substring to cut off at
my $fname;           # file (data) to be parsed
my $threshold;       # number of matches to cut off at
my $sortby;          # sort method
my @packets = ();    # input data
my $tmpfile;         # tempfile for storing intermediate results
my %all = ();        # substrings and match numbers

get_cmd_opts();

while (<>){push(@packets,$_);}
chomp(@packets);

@packets = sort { length($a) <=> length($b) } @packets;

$tmpfile = IO::File->new_tmpfile or die "Could not create tempfile: $!
+\n";
$tmpfile->autoflush(1);

do
{
 my $p = $packets[0];                  # get first packet
 foreach my $l ($level..length($p))    # substring lengths come from h
+ere
 {
  foreach my $pos (0..length($p)-$l)   # position of substring
  {
   my $str = substr($p,$pos,$l);       # create substr
    my $num = 0;                       # number matches for this subst
+r
    for (0..$#packets)                   # check against all packets
    {
     if ($l <= length($packets[$_]))   # useless to check if our subst
+r is too long
     {
      pos($packets[$_]) = 0;           # makes subsequent matches work
      while ($packets[$_] =~ /$str/g)  # find all matches
      {
       $num ++; 
      }
     }
    }
   if ($num >= $threshold)        # enough matches to keep it?
   {
    print $tmpfile "$str:$num\n"; # dump to temp
   }
  }
 }
 shift(@packets);  # get rid of the packet we just checked
} while ($#packets >= 0);


# first step is done, now read results from tempfile and
# generate output
seek($tmpfile,0,0) or die "Seek error: $!\n";

while (<$tmpfile>)
{
 chomp($_);
 my ($str,$val) = split(":",$_);
 if (!exists($all{$str})) # duplicates mess up counts
 {
  $all{$str} = $val;
 }
}

#my $value;
my @sort = ();

# hack to sort by command line param
# could be improved, i'm sure
if ($sortby eq "l")
{
 # reverse sort by length
 @sort = sort {length($b) <=> length($a) } keys %all;
}
else
{
 # reverse sort by number
 # should also sort by length since output looks a
 # tad funky
 @sort = sort { $all{$b} <=> $all{$a} } keys %all;
}

foreach (@sort)
{
 if (($all{$_} >= $threshold) && (length($_) >= $level))
 {
  print "(",length($_),") \"$_\": $all{$_} times.\n";
 }
 delete($all{$_});
}
undef @sort;

exit;

sub get_cmd_opts()
{
 use Getopt::Std;
 my %opts = ();       # command line params
 
 getopts('f:l:s:n:',\%opts);

 if (!($fname = $opts{'f'})) # file name
 {
  usage(); 
 }
 push(@ARGV,$fname); # cheap hack to avoid opening file, etc, later

 $level = exists($opts{'l'}) ? $opts{'l'} : 5;  # minimum substr lengt
+h

 if ((!exists($opts{'s'})) || ($opts{'s'} eq 'l')) # sort method
 {
  $sortby = "l";   # by length of string
 }
 elsif ($opts{'s'} eq 'f')
 {
  $sortby = "f";   # by frequency of match
 }
 else
 {
  usage();
 }

 $threshold = exists($opts{'n'}) ? $opts{'n'} : 2; # minimum match cou
+nt
}

sub usage()
{
 print "Packet Parser v1.5\n";
 print "Reads in a file of strings and shows all matches for substring
+s.\n";
 print "Note: Tested only with 0-9 and A-Z since that's all we need to
+ make hex characters.\n";
 print "Usage: pp.pl -f inputfile [-l matchlen] [-s sorttype] [-n thre
+shold]\n";
 print "-f\tinputfile: File containing data to be evaluated.\n";
 print "-l\tmatchlen: Specify minimum string length to show matches fo
+r. (Default: 5)\n";
 print "-s\tsorttype: Use l to specify sort by string length. Use f to
+ specify sort by match frequency. (Default: l)\n";
 print "-n\tthreshold: Specify minimum match number in output. (Defaul
+t: 2)\n";
 exit;
}

Comment on Substring Finding/Counting
Download Code
RE: Substring Finding/Counting
by nuance (Hermit) on Aug 30, 2000 at 20:49 UTC
    when you first mentioned this in the chatterbox you asked if anyone had any suggestions to improve it. So here goes FWIW.

    You are writing your matches to a temporary file and then reading them in again to construct a hash. That means that if you find any substrings with more than one occurrence, you will end up with entries in your data file for all of them. For instance if you find that token you mentioned and it's in every line of your - lets say - 500 record file. Then your first mention in the text file says 500 occurrences, the next says 499 and so on down to 2 occurrences. You don't get an entry that says one, but you will have checked for it.

    If instead of writting to that file you created the hash as you process the file, then right at the top you can just check if it exists. If it does then dont bother checking any further, you've already found all these matches. For the example I gave this equates to leaving out 124750 checks and that's just for one pattern.

    like this:

    do { my $p = $packets[0]; foreach my $l ($level..length($p)) { foreach my $pos (0..length($p)-$l) { my $str = substr($p,$pos,$l); next if exists $all{$str}; # if we've already found this # string somewhere else, exit # this iteration my $num = 0; for (0..$#packets) { if ($l <= length($packets[$_])) { pos($packets[$_]) = 0; while ($packets[$_] =~ /$str/g) { $num ++; } } } unless (exists $all{$str}) { $all{$str} = $num unless $num < $threshold; } } } shift(@packets); } while ($#packets >= 0);

    Nuance

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://30216]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others musing on the Monastery: (7)
As of 2014-12-29 07:27 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Is guessing a good strategy for surviving in the IT business?





    Results (185 votes), past polls