Choose the shares to search
---------------------------
1) /data 69%
2) /export/home 53%
3) /tmp 0%
4) /usr/local 75%
5) /xppdata 79%
Shares: (1 - 5, separate multiple entries with spaces) 1 2 3 4 5
Shares searched:
/data
/export/home
/tmp
/usr/local
/xppdata
WordNet words loaded: 55,191
Solaris words loaded: 20,068
Words used in ngram creation: 74,874
Words in assembled dictionary: 60,968
Ngrams used:
ion: 3,387 ( 4.5%)
tio: 2,738 ( 3.7%)
ess: 2,693 ( 3.6%)
ing: 2,643 ( 3.5%)
ter: 2,531 ( 3.4%)
ati: 2,523 ( 3.4%)
nes: 2,300 ( 3.1%)
ent: 1,942 ( 2.6%)
ine: 1,649 ( 2.2%)
ate: 1,557 ( 2.1%)
ant: 1,374 ( 1.8%)
ist: 1,356 ( 1.8%)
per: 1,194 ( 1.6%)
con: 1,117 ( 1.5%)
ity: 1,105 ( 1.5%)
man: 1,059 ( 1.4%)
men: 1,059 ( 1.4%)
lin: 1,043 ( 1.4%)
tor: 1,033 ( 1.4%)
ali: 1,019 ( 1.4%)
tra: 1,018 ( 1.4%)
and: 1,001 ( 1.3%)
ida: 992 ( 1.3%)
eri: 991 ( 1.3%)
ene: 977 ( 1.3%)
ste: 972 ( 1.3%)
ble: 939 ( 1.3%)
her: 930 ( 1.2%)
ari: 927 ( 1.2%)
ran: 926 ( 1.2%)
ism: 911 ( 1.2%)
lit: 909 ( 1.2%)
ous: 906 ( 1.2%)
tic: 904 ( 1.2%)
the: 884 ( 1.2%)
pro: 880 ( 1.2%)
str: 876 ( 1.2%)
nce: 875 ( 1.2%)
rat: 868 ( 1.2%)
tin: 865 ( 1.2%)
ast: 842 ( 1.1%)
era: 835 ( 1.1%)
ver: 830 ( 1.1%)
tri: 824 ( 1.1%)
sta: 820 ( 1.1%)
der: 817 ( 1.1%)
ill: 807 ( 1.1%)
rin: 807 ( 1.1%)
gra: 801 ( 1.1%)
ace: 800 ( 1.1%)
est: 797 ( 1.1%)
ria: 790 ( 1.1%)
res: 782 ( 1.0%)
dae: 779 ( 1.0%)
nti: 771 ( 1.0%)
sti: 763 ( 1.0%)
ian: 761 ( 1.0%)
lat: 732 ( 1.0%)
all: 728 ( 1.0%)
ell: 726 ( 1.0%)
ect: 721 ( 1.0%)
ina: 719 ( 1.0%)
Ngram RE:
(?-xism:(?:i(?:n[aeg]|[ao]n|s[mt]|da|ll|ty)|a(?:l[il]|n[dt]|t[ei]|ce|ri|st)|e(?:n[et]|r[ai]|s[st]|ct|ll)|t(?:i[cno]|[eo]r|r[ai]|he)|r(?:a[nt]|i[an]|es)|(?:m[ae]|co)n|l(?:i[nt]|at)|n(?:ce|es|ti)|d(?:ae|er)|p(?:er|ro)|st[aeir]|[hv]er|ble|gra|ous))
Search began on Friday, January 14, 2011 @ 03:49:05 PM
...
Search ended on Friday, January 14, 2011 @ 04:40:08 PM and took 0 hours, 51 minutes, and 3 seconds.
Complete.
Log written to /tmp/suspect_tmp_files_log_01-14-2011_15.48.txt.
Report written to /tmp/suspect_tmp_files_rpt_01-14-2011_15.48.txt
####
Suspected Temporary Files: 2,453
Total Size: 25G
user_name
=========
Size Last Accessed How Old?
----------------------------------------
3.3G 07/10/09 05:59 1yr, 6mo, 0w, 4d
/path/with/a/directory/named/backups/
11M 11/17/04 16:36 6yr, 1mo, 3w, 5d
/path/to/a/database/named/test.something
##
##
use warnings;
use strict;
use File::Find::Rule;
use File::stat;
use Number::Bytes::Human qw(format_bytes);
use Date::Format;
use DateTime;
use Filesys::DfPortable;
use Filesys::DiskUsage qw(du);
use List::Compare;
use List::MoreUtils qw(all);
use Log::Dispatch;
use Term::Prompt;
use SelectSaver;
use Path::Class;
use Number::Format qw(format_number);
use Regexp::Assemble;
use Sort::Key::Natural qw(natsort_inplace);
use Sort::Naturally;
use WordNet::QueryData;
### Running totals; report() bumps both for every suspect it records.
my ($files, $total_size) = (0, 0);

### A single load-time stamp names the output files and anchors the age calculations.
my $time_zone  = DateTime::TimeZone->new(name => 'local');
my $load_stamp = DateTime->now(time_zone => $time_zone);

#---------#
# Logging #
#---------#
### The log and the report share the same stamp suffix so a run's files pair up.
my $stamp_suffix = $load_stamp->strftime('%m-%d-%Y_%H.%M');
my $log_file = file('/tmp', sprintf('suspect_tmp_files_log_%s.txt', $stamp_suffix))->stringify;
my $rpt_file = file('/tmp', sprintf('suspect_tmp_files_rpt_%s.txt', $stamp_suffix))->stringify;

### Everything (debug and up) goes to the log file; info and up is echoed to the screen.
my $log = Log::Dispatch->new(
    outputs => [
        [ 'File',   min_level => 'debug', filename => $log_file ],
        [ 'Screen', min_level => 'info' ],
    ],
);

### Route fatal errors through the logger, then stop with a failing exit status.
$SIG{__DIE__} = sub {
    my ($message) = @_;

    ### Ignore evals.
    return if $^S;

    $log->error($message);
    exit 1;
};

### Route warnings through the logger as well.
$SIG{__WARN__} = sub {
    my ($message) = @_;
    $log->warning($message);
};
#-------------------#
# Prompt for shares #
#-------------------#
### Candidate mount points, shown to the user in natural sort order.
my @shares = qw(/data /xppdata /tmp /usr/local /export/home);
natsort_inplace @shares;

### prompt() hands back the chosen menu entries as indexes into @shares.
my @use_shares = do {
    ### Each menu label is the mount point followed by its percentage in use.
    my @menu_items;
    for my $share (@shares) {
        my $df_ref = dfportable($share) || die "Unable to determine disk usage for $share!\n";
        push @menu_items, sprintf "%-19s %2d%%", $share, $df_ref->{per};
    }

    prompt(
        'm',
        {
            title                      => 'Choose the shares to search',
            prompt                     => 'Shares:',
            items                      => \@menu_items,
            accept_multiple_selections => 1,
            accept_empty_selection     => 0,
            cols                       => 1,
        },
        '',
        ''
    );
};
print "\n";

### Record the selection in the log.
my $searched = join "\n\t", @shares[@use_shares];
$log->info("Shares searched:\n\t" . $searched . "\n\n");
#-----------------#
# Hash user names #
#-----------------#
### Map every numeric uid in the password database to its login name so that
### report() can label each file with its owner. When several names share a
### uid, the last one enumerated wins.
my %uid;
while (my ($name, undef, $uid) = getpwent()) {
    $uid{$uid} = $name;
}
### Close the password database enumeration now that we are done with it.
endpwent();
#-----------#
# Set times #
#-----------#
### Epoch-second cutoffs, computed once from the current time (86_400 seconds per day).
my $now = time;
### 30 days back; the file search compares access times against this value.
my $a_month_ago = $now - (86_400 * 30);
### 730 days back. NOTE(review): not referenced anywhere in the visible code -- confirm before removing.
my $years_ago = $now - (86_400 * 365 * 2);
#---------------------------------------#
# Prep for finding temporary-like names #
#---------------------------------------#
### Trigram => number of occurrences across the dictionary words.
my %ngram;
### Dictionary words (4+ letters) whose presence in a name marks it as meaningful.
my %dict;
### Number of words tallied; used as the denominator for the ngram percentages.
my $ngram_total;
### Names matching this pattern (case-insensitively) are treated as definite temporaries.
### The alternatives, in order:
###   - a name that is exactly "core"
###   - "copy" or "dupe" anywhere
###   - "temp", "tmp", or "trash" anywhere
###   - "bak", "backup", or "back_up" anywhere
###   - "old" or "test" starting at a word boundary
###   - "prev" unless it begins "preview"
###   - the same letter or underscore three times in a row
###   - "foo" as a whole word
###   - ".log" anywhere
###   - a tilde or dollar sign anywhere
my $temp_re = qr/
\Acore\z |
copy |
dupe |
t(?:e?mp|rash) |
ba(?:k|ck_?up) |
\b(?:old|test) |
prev(?!iew) |
([a-z_])\1\1 |
\bfoo\b |
\.log |
[~\$]
/ix;
### Prep our dictionaries: Solaris and WordNet.
my @dict;
open my $DICT, '<', '/usr/share/lib/dict/words'
    or die "Unable to open /usr/share/lib/dict/words: $!";
my $wn = WordNet::QueryData->new(
    dir    => '/usr/local/wordnet/dict/',
    noload => 1,
);

### Gather words from WordNet (all-lowercase entries only).
my @wn = grep { /^[a-z]+$/ } $wn->listAllWords;
$log->info('WordNet words loaded:', format_number(scalar @wn), "\n");

### Gather words from Solaris' dictionary (all-lowercase entries only).
### The line ending must be removed here: if it were kept, "word\n" would never
### compare equal to WordNet's "word", the union below would not merge the two
### lists, and every shared word would be tallied twice.
while (my $line = <$DICT>) {
    chomp $line;
    push @dict, $line if $line =~ /\A[a-z]+\z/;
}
close $DICT or die "Unable to close /usr/share/lib/dict/words: $!";
$log->info('Solaris words loaded:', format_number(scalar @dict), "\n\n");

### Compare for union.
my $lc = List::Compare->new(\@wn, \@dict);

### Gather ngrams.
### Only allow words that are lowercase and have 3+ letters.
for my $word (grep { /\A[a-z]{3,}\z/ } $lc->get_union) {
    ### Tally every letter trio (ngram, or, more specifically, trigram) in the word.
    ++$ngram{ substr $word, $_, 3 } for 0 .. length($word) - 3;

    ### Despite its name, this counts words; it is the denominator for the percentages.
    ++$ngram_total;

    ### Only add 4+ lengths to the dictionary as many temps were matching lengths of 3.
    ++$dict{$word} if length($word) >= 4;
}
$log->info('Words used in ngram creation:', format_number($ngram_total), "\n");

### Done with these.
undef $lc;
undef @wn;
undef @dict;
### Words that themselves hint at temporary data, and month names, must not
### count as "meaningful" dictionary words, so drop them in one slice delete.
delete @dict{
    qw(archive backup copy core defunct dupe log old previous sample temporary test trash hold),
    qw(january february march april may june july august september october november december),
};
$log->info('Words in assembled dictionary:', format_number(scalar keys %dict), "\n\n");
$log->info("Ngrams used:\n\n");

### Walk the ngrams from most to least frequent; discard any whose (rounded)
### share of the word total is under 1% and log the ones that are kept.
NGRAM:
for my $trigram (sort { $ngram{$b} <=> $ngram{$a} } keys %ngram) {
    my $count      = $ngram{$trigram};
    my $percentage = format_number(($count / $ngram_total) * 100, 1, 1);

    if ($percentage < 1) {
        delete $ngram{$trigram};
        next NGRAM;
    }

    $log->info(sprintf "\t%3s: %5s (%4s%%)\n", $trigram, format_number($count), $percentage);
}
$log->info("\n");

### Assemble the surviving ngrams into a single alternation RE.
my $ra = Regexp::Assemble->new;
for my $trigram (keys %ngram) {
    $ra->add($trigram);
}
$log->info("Ngram RE:\n\n" . $ra->re, "\n\n");
### Structural patterns for "possible" temporaries. soft_check() requires a name
### to match every one of these before it is considered any further.
my @REs = (
### A lowercase letter, and an uppercase letter, each preceded by at least one
### character and located before the first dot (i.e. not in the extension).
qr/\A[^.]+[a-z]/,
qr/\A[^.]+[A-Z]/,
### At least one digit anywhere in the name.
qr/\d/,
### Name only contains upper/lower case letters or digits; may end in any number
### of tildes, dollar signs, or dot-extensions of 1 to 4 characters.
qr/\A[a-zA-Z\d]+(?:[~\$]|\..{1,4})*\z/,
);
#------------#
# Find Files #
#------------#
### One hash ref per suspect, filled in by report().
my @files;
my $start_stamp = DateTime->now(time_zone => $time_zone);
$log->info('Search began on', $start_stamp->strftime('%A, %B %d, %Y @ %r'), "\n");

### Start with anything a month old, perform an initial check, then look for guaranteeds or possibilities.
File::Find::Rule
    ->atime("<$a_month_ago")
    ->exec(\&init_check)
    ->or(
        ### Check for those that match the temp RE and prune.
        File::Find::Rule
            ->name($temp_re)
            ->prune,
        ### Check for possible temporaries and don't prune (because they're possible, not guaranteed)
        File::Find::Rule
            ->exec(\&soft_check)
    )
    ->exec(\&report)
    ->in(@shares[@use_shares]);

my $end_stamp = DateTime->now(time_zone => $time_zone);

### Subtract the start from the end so the duration is positive, and use the
### list that in_units() returns. Calling in_units() in void context (as was
### done before) has no effect on the duration object.
my $diff_stamp = $end_stamp->subtract_datetime($start_stamp);
$log->info(
    sprintf "\nSearch ended on %s and took %d hours, %d minutes, and %d seconds.\n\n",
    $end_stamp->strftime('%A, %B %d, %Y @ %r'),
    $diff_stamp->in_units(qw(hours minutes seconds))
);
#--------------#
# Write Report #
#--------------#
{
    open my $RPT, '>', $rpt_file
        or die "Unable to open $rpt_file for writing: $!";

    {
        ### Make the report the default output handle for this inner block only;
        ### the previous handle is restored when $saver goes out of scope.
        my $saver = SelectSaver->new($RPT);

        print 'Suspected Temporary Files: ', format_number($files), "\n";
        print 'Total Size: ', format_bytes($total_size), "\n";

        my $info_format = "%-5s %-14s %s\n";
        my $last_user   = '';

        ### Group by user, then largest first, then oldest first.
        for my $file (
            sort {
                ncmp($a->{user}, $b->{user}) ||
                ncmp($b->{size}, $a->{size}) ||
                ncmp($b->{how_old}, $a->{how_old})
            } @files
        ) {
            my $user = $file->{user};

            ### Print a header each time the owner changes.
            if ($user ne $last_user) {
                print "\n$user\n";
                print '=' x 8, "\n";
                printf $info_format, 'Size', 'Last Accessed', 'How Old?';
                print '-' x 40, "\n";
            }

            ### Sizes are kept in bytes until now so the sort above is numeric.
            $file->{size} = format_bytes($file->{size});
            printf $info_format, @{$file}{qw(size atime how_old)};

            ### Print the path on its own line (so it can easily be copied/pasted).
            print "\n ";
            print $file->{path};
            print "\n\n";

            $last_user = $user;
        }
    }

    ### Buffered write errors (e.g. a full disk) only surface at close, so
    ### check it before telling the user the report was written.
    close $RPT or die "Unable to finish writing $rpt_file: $!";
}
print "Complete.\n\nLog written to $log_file.\nReport written to $rpt_file\n\n";
#-----------#
# Functions #
#-----------#
### First-pass filter for the file search.
###
### Receives the three arguments File::Find::Rule passes to an exec callback;
### the first is used as the entry's name and the third as its path.
### Returns 1 if the entry should be examined further, 0 if it should be skipped.
sub init_check {
    my ($name, undef, $path) = @_;

    ### Ignore links, sockets, and empties.
    return 0 if -l $path || -z _ || -S _;

    ### An entry can disappear while a long scan is running. Skip it with a
    ### warning rather than aborting the whole search.
    my $stat = stat($path);
    if (!$stat) {
        warn "Unable to stat $path: $!\n";
        return 0;
    }

    ### Only bother with files 5MB+.
    return 0 if -f $path && $stat->size < 5_242_880;

    ### Ignore directories matching "prev", "previous", etc. per our set up.
    return 0 if -d $path && $name =~ /prev(?!iew)/;

    return 1;
}
### Decide whether a name that is not an outright match for the temp RE still
### looks like a temporary (machine-generated) name.
###
### Receives the three arguments File::Find::Rule passes to an exec callback;
### the first is used as the entry's name and the third as its path.
### Returns 1 when the entry is a suspect, 0 when it should be left alone.
sub soft_check {
    my ($name, undef, $path) = @_;

    ### Test for REs, words, then ngrams.
    ### The name has to satisfy every structural pattern in @REs.
    return 0 unless all { $name =~ $_ } @REs;

    ### Any run of 4+ letters that is a dictionary word means the name was
    ### probably chosen by a person; log the reason and skip it.
    if (length $name >= 4) {
        for my $candidate ($name =~ /([A-Za-z][a-z]{3,}|[A-Z]{4,})/g) {
            next if !exists $dict{lc $candidate};

            $log->info("\tSkipping '$name' due to presence of '$candidate'\n");
            $log->info("\t\t$path\n");
            return 0;
        }
    }

    ### Names containing a common trigram are assumed to be word-like.
    ### The parentheses matter: `lc $name =~ $re` parses as `lc($name =~ $re)`,
    ### which matched the name without lowercasing it, so upper-case names
    ### could never hit the all-lowercase ngram RE.
    return 0 if lc($name) =~ $ra->re;

    return 1;
}
### Record one suspect in @files and add it to the running totals.
###
### Receives the three arguments File::Find::Rule passes to an exec callback;
### only the third (used as the entry's path) is needed here.
### Returns 1 after recording the entry, 0 if the entry could not be examined.
sub report {
    my ($name, undef, $path) = @_;

    ### An entry can disappear while a long scan is running. Skip it with a
    ### warning rather than aborting the whole search.
    my $stat = stat($path);
    if (!$stat) {
        warn "Unable to stat $path: $!\n";
        return 0;
    }

    ### Proceed with reporting.
    my $is_dir = -d $path;
    my $user   = $uid{$stat->uid} || '?';

    ### Age is measured from the script's load time back to the last access.
    ### in_units() returns the broken-down values; calling it in void context
    ### (as was done before) has no effect.
    my $file_stamp = DateTime->from_epoch(epoch => $stat->atime, time_zone => $time_zone);
    my $age_stamp  = $load_stamp->subtract_datetime($file_stamp);
    my $how_old    = sprintf '%dyr, %dmo, %dw, %dd',
        $age_stamp->in_units(qw(years months weeks days));

    ### Directories are sized with du(); plain files use their stat size.
    my $size = $is_dir ? du($path) : $stat->size;

    push @files, {
        user    => $user,
        size    => $size,
        atime   => time2str('%D %H:%M', $stat->atime),
        how_old => $how_old,
        ### A trailing slash marks directories in the report.
        path    => $is_dir ? "$path/" : $path,
    };

    $total_size += $size;
    ++$files;

    return 1;
}