Thanks for your input. I've attached the latest and greatest which includes:
Some aspects are customized to our environment (shares, RE's, and a few tidbits), but overall I'm pleased with what I have so far and I think it's easy enough to expand. It takes under an hour to scour ~1TB of shares and comes back with ~2,500 offenders totalling 25G. I just finished updating the script so I need to review the code for bugs and tweaks, but I've included it below nonetheless.
For now I've forgone checksums for similarly named files because this should not be an issue for us. Also, the 10 character lengths were a coincidence—I'm looking for variable lengths.
Choose the shares to search
---------------------------
1) /data 69%
2) /export/home 53%
3) /tmp 0%
4) /usr/local 75%
5) /xppdata 79%
Shares: (1 - 5, separate multiple entries with spaces) 1 2 3 4 5
Shares searched:
/data
/export/home
/tmp
/usr/local
/xppdata
WordNet words loaded: 55,191
Solaris words loaded: 20,068
Words used in ngram creation: 74,874
Words in assembled dictionary: 60,968
Ngrams used:
ion: 3,387 ( 4.5%)
tio: 2,738 ( 3.7%)
ess: 2,693 ( 3.6%)
ing: 2,643 ( 3.5%)
ter: 2,531 ( 3.4%)
ati: 2,523 ( 3.4%)
nes: 2,300 ( 3.1%)
ent: 1,942 ( 2.6%)
ine: 1,649 ( 2.2%)
ate: 1,557 ( 2.1%)
ant: 1,374 ( 1.8%)
ist: 1,356 ( 1.8%)
per: 1,194 ( 1.6%)
con: 1,117 ( 1.5%)
ity: 1,105 ( 1.5%)
man: 1,059 ( 1.4%)
men: 1,059 ( 1.4%)
lin: 1,043 ( 1.4%)
tor: 1,033 ( 1.4%)
ali: 1,019 ( 1.4%)
tra: 1,018 ( 1.4%)
and: 1,001 ( 1.3%)
ida: 992 ( 1.3%)
eri: 991 ( 1.3%)
ene: 977 ( 1.3%)
ste: 972 ( 1.3%)
ble: 939 ( 1.3%)
her: 930 ( 1.2%)
ari: 927 ( 1.2%)
ran: 926 ( 1.2%)
ism: 911 ( 1.2%)
lit: 909 ( 1.2%)
ous: 906 ( 1.2%)
tic: 904 ( 1.2%)
the: 884 ( 1.2%)
pro: 880 ( 1.2%)
str: 876 ( 1.2%)
nce: 875 ( 1.2%)
rat: 868 ( 1.2%)
tin: 865 ( 1.2%)
ast: 842 ( 1.1%)
era: 835 ( 1.1%)
ver: 830 ( 1.1%)
tri: 824 ( 1.1%)
sta: 820 ( 1.1%)
der: 817 ( 1.1%)
ill: 807 ( 1.1%)
rin: 807 ( 1.1%)
gra: 801 ( 1.1%)
ace: 800 ( 1.1%)
est: 797 ( 1.1%)
ria: 790 ( 1.1%)
res: 782 ( 1.0%)
dae: 779 ( 1.0%)
nti: 771 ( 1.0%)
sti: 763 ( 1.0%)
ian: 761 ( 1.0%)
lat: 732 ( 1.0%)
all: 728 ( 1.0%)
ell: 726 ( 1.0%)
ect: 721 ( 1.0%)
ina: 719 ( 1.0%)
Ngram RE:
(?-xism:(?:i(?:n[aeg]|[ao]n|s[mt]|da|ll|ty)|a(?:l[il]|n[dt]|t[ei]|ce|r
+i|st)|e(?:n[et]|r[ai]|s[st]|ct|ll)|t(?:i[cno]|[eo]r|r[ai]|he)|r(?:a[n
+t]|i[an]|es)|(?:m[ae]|co)n|l(?:i[nt]|at)|n(?:ce|es|ti)|d(?:ae|er)|p(?
+:er|ro)|st[aeir]|[hv]er|ble|gra|ous))
Search began on Friday, January 14, 2011 @ 03:49:05 PM
...
Search ended on Friday, January 14, 2011 @ 04:40:08 PM and took 0 hour
+s, 51 minutes, and 3 seconds.
Complete.
Log written to /tmp/suspect_tmp_files_log_01-14-2011_15.48.txt.
Report written to /tmp/suspect_tmp_files_rpt_01-14-2011_15.48.txt
use warnings;
use strict;
use File::Find::Rule;
use File::stat;
use Number::Bytes::Human qw(format_bytes);
use Date::Format;
use DateTime;
use Filesys::DfPortable;
use Filesys::DiskUsage qw(du);
use List::Compare;
use List::MoreUtils qw(all);
use Log::Dispatch;
use Term::Prompt;
use SelectSaver;
use Path::Class;
use Number::Format qw(format_number);
use Regexp::Assemble;
use Sort::Key::Natural qw(natsort_inplace);
use Sort::Naturally;
use WordNet::QueryData;
my $files = 0;
my $total_size = 0;
my $time_zone = DateTime::TimeZone->new(name => 'local');
my $load_stamp = DateTime->now(time_zone => $time_zone);
#---------#
# Logging #
#---------#
my $log_file = file(
'/tmp',
sprintf 'suspect_tmp_files_log_%s.txt', $load_stamp->strftime('%m-
+%d-%Y_%H.%M')
)->stringify;
my $rpt_file = file(
'/tmp',
sprintf 'suspect_tmp_files_rpt_%s.txt', $load_stamp->strftime('%m-
+%d-%Y_%H.%M')
)->stringify;
my $log = Log::Dispatch->new(
outputs =>
[
[
'File',
min_level => 'debug',
filename => $log_file,
],
[ 'Screen', min_level => 'info' ],
],
);
$SIG{__DIE__} = sub {
### Ignore evals.
return if $^S;
$log->error(shift);
exit 1;
};
$SIG{__WARN__} = sub { $log->warning(shift); };
#-------------------#
# Prompt for shares #
#-------------------#
my @shares = qw(/data /xppdata /tmp /usr/local /export/home);
natsort_inplace @shares;
my @use_shares = prompt(
'm',
{
title => 'Choose the shares to search',
prompt => 'Shares:',
items => [ map {
my $df_ref = dfportable($_) || die "Unable to determine di
+sk usage for $_!\n";
sprintf "%-19s %2d%%", $_, $df_ref->{per};
} @shares ],
accept_multiple_selections => 1,
accept_empty_selection => 0,
cols => 1,
},
'',
''
);
print "\n";
$log->info(
"Shares searched:\n\t" .
(join "\n\t" => @shares[@use_shares]) .
"\n\n"
);
#-----------------#
# Hash user names #
#-----------------#
my %uid;
while (my ($name, undef, $uid) = getpwent()) {
$uid{$uid} = $name;
}
#-----------#
# Set times #
#-----------#
my $now = time;
my $a_month_ago = $now - (86_400 * 30);
my $years_ago = $now - (86_400 * 365 * 2);
#---------------------------------------#
# Prep for finding temporary-like names #
#---------------------------------------#
my %ngram;
my %dict;
my $ngram_total;
my $temp_re = qr/
\Acore\z |
copy |
dupe |
t(?:e?mp|rash) |
ba(?:k|ck_?up) |
\b(?:old|test) |
prev(?!iew) |
([a-z_])\1\1 |
\bfoo\b |
\.log |
[~\$]
/ix;
### Prep our dictionaries: Solaris and WordNet.
my @dict;
open my $DICT, '<', '/usr/share/lib/dict/words' or die $!;
my $wn = WordNet::QueryData->new(
dir => '/usr/local/wordnet/dict/',
noload => 1,
);
### Gather words from WordNet.
my @wn = grep { /^[a-z]+$/ } $wn->listAllWords;
$log->info('WordNet words loaded:', format_number(scalar @wn), "\n");
### Gather words from Solaris' dictionary.
push @dict, $_ for grep { /^[a-z]+$/ } <$DICT>;
$log->info('Solaris words loaded:', format_number(scalar @dict), "\n\n
+");
### Compare for union.
my $lc = List::Compare->new(\@wn, \@dict);
### Gather ngrams.
### Only allow words that are lowercase and have 3+ letters.
for (grep { /^[a-z]{3,}$/ } $lc->get_union) {
s/\s+\z//;
### Gather letter trios (ngrams, or, more specifically, trigrams).
my $word = $_;
my @ngrams = map { substr($word, $_, 3) } 0 .. (length $_) - 3;
### Tally.
++$ngram{$_} for @ngrams;
++$ngram_total;
### Only add 4+ lengths to the dictionary as many temps were match
+ing lengths of 3.
++$dict{$_} if length >= 4;
}
$log->info('Words used in ngram creation:', format_number($ngram_total
+), "\n");
### Done with these.
undef $lc;
undef @wn;
undef @dict;
### Remove suspect temporary words and months from our dictionary.
for my $word (
qw(archive backup copy core defunct dupe log old previous sample t
+emporary test trash hold),
qw(january february march april may june july august september oct
+ober november december)
) {
delete $dict{$word} if exists $dict{$word};
}
$log->info('Words in assembled dictionary:', format_number(scalar keys
+ %dict), "\n\n");
$log->info("Ngrams used:\n\n");
### Remove ngrams with less than 1% occurrence.
for my $ngram (sort {$ngram{$b} <=> $ngram{$a}} keys %ngram) {
my $percentage = format_number(($ngram{$ngram} / $ngram_total) * 1
+00, 1, 1);
delete $ngram{$ngram} and next if $percentage < 1;
$log->info(sprintf "\t%3s: %5s (%4s%%)\n", $ngram, format_number($
+ngram{$ngram}), $percentage);
}
$log->info("\n");
### Build an RE based on the ngrams.
my $ra = Regexp::Assemble->new;
$ra->add($_) for keys %ngram;
$log->info("Ngram RE:\n\n" . $ra->re, "\n\n");
### Files must match these to be considered temporary.
my @REs = (
### Lower/upper case letters not in the extension.
qr/\A[^.]+[a-z]/,
qr/\A[^.]+[A-Z]/,
### Digit.
qr/\d/,
### Name only contains upper/lower case letters or digits; may end
+ in a tilde, dollars, or ext.
qr/\A[a-zA-Z\d]+(?:[~\$]|\..{1,4})*\z/,
);
#------------#
# Find Files #
#------------#
my @files;
my $start_stamp = DateTime->now(time_zone => $time_zone);
$log->info('Search began on', $start_stamp->strftime('%A, %B %d, %Y @
+%r'), "\n");
### Start with anything a month old, perform an initial check, then lo
+ok for guaranteeds or possibilities.
File::Find::Rule
->atime("<$a_month_ago")
->exec(\&init_check)
->or(
### Check for those that match the temp RE and prune.
File::Find::Rule
->name($temp_re)
->prune,
### Check for possible temporaries and don't prune (because th
+ey're possible, not guaranteed)
File::Find::Rule
->exec(\&soft_check)
)
->exec(\&report)
->in(@shares[@use_shares]);
my $end_stamp = DateTime->now(time_zone => $time_zone);
my $diff_stamp = $start_stamp->subtract_datetime($end_stamp);
$diff_stamp->in_units(qw(hours minutes seconds));
$log->info(
sprintf "\nSearch ended on %s and took %d hours, %d minutes, and %
+d seconds.\n\n",
$end_stamp->strftime('%A, %B %d, %Y @ %r'),
map { $diff_stamp->$_} qw(hours minutes seconds)
);
#--------------#
# Write Report #
#--------------#
{
open my $RPT, '>', $rpt_file or die $!;
my $saver = SelectSaver->new($RPT);
print 'Suspected Temporary Files: ', format_number($files), "\n";
print 'Total Size: ', format_bytes($total_size), "\n";
my $info_format = "%-5s %-14s %s\n";
my $last_user = '';
for my $file (
sort {
ncmp($a->{user}, $b->{user}) ||
ncmp($b->{size}, $a->{size}) ||
ncmp($b->{how_old}, $a->{how_old})
} @files
) {
my $user = $file->{user};
if ($user ne $last_user) {
print "\n$user\n";
print '=' x 8, "\n";
printf $info_format, 'Size', 'Last Accessed', 'How Old?';
print '-' x 40, "\n";
}
$file->{size} = format_bytes($file->{size});
printf $info_format, @{$file}{qw(size atime how_old)};
### Print the path on its own line (so it can easily be copied
+/pasted).
print "\n ";
print $file->{path};
print "\n\n";
$last_user = $user;
}
}
print "Complete.\n\nLog written to $log_file.\nReport written to $rpt_
+file\n\n";
#-----------#
# Functions #
#-----------#
sub init_check {
my ($name, undef, $path) = @_;
### Ignore links, sockets, and empties.
return 0 if -l $path || -z _ || -S _;
my $stat = stat($path) or die $!;
### Only bother with files 5MB+.
return 0 if -f $path && $stat->size < 5_242_880;
### Ignore directories matching "prev", "previous", etc. per our s
+et up.
return 0 if -d $path && $name =~ /prev(?!iew)/;
return 1;
}
sub soft_check {
my ($name, undef, $path) = @_;
### Test for REs, words, then ngrams.
return 0 unless all { $name =~ $_ } @REs;
if (length $name >= 4) {
for ($name =~ /([A-Za-z][a-z]{3,}|[A-Z]{4,})/g) {
if (exists $dict{lc $_}) {
$log->info("\tSkipping '$name' due to presence of '$_'
+\n");
$log->info("\t\t$path\n");
return 0;
}
}
}
return 0 if lc $name =~ $ra->re;
return 1;
}
sub report {
my ($name, undef, $path) = @_;
my $stat = stat($path) or die $!;
### Proceed with reporting.
my $user = $uid{$stat->uid} || '?';
my $file_stamp = DateTime->from_epoch(epoch => $stat->atime, time_
+zone => $time_zone);
my $age_stamp = $load_stamp->subtract_datetime($file_stamp);
$age_stamp->in_units(qw(years months weeks days));
my $how_old = sprintf '%dyr, %dmo, %dw, %dd', map { $age_stamp->$_
+} qw(years months weeks days);
my $size;
push @files, {
user => $user,
size => $size = -d $path ? du($path) : $stat->size,
atime => time2str('%D %H:%M', $stat->atime),
how_old => $how_old,
path => -d $path ? "$path/" : $path,
};
$total_size += $size;
++$files;
}