use warnings; use strict; use File::Find::Rule; use File::stat; use Number::Bytes::Human qw(format_bytes); use Date::Format; use DateTime; use Filesys::DfPortable; use Filesys::DiskUsage qw(du); use List::Compare; use List::MoreUtils qw(all); use Log::Dispatch; use Term::Prompt; use SelectSaver; use Path::Class; use Number::Format qw(format_number); use Regexp::Assemble; use Sort::Key::Natural qw(natsort_inplace); use Sort::Naturally; use WordNet::QueryData; my $files = 0; my $total_size = 0; my $time_zone = DateTime::TimeZone->new(name => 'local'); my $load_stamp = DateTime->now(time_zone => $time_zone); #---------# # Logging # #---------# my $log_file = file( '/tmp', sprintf 'suspect_tmp_files_log_%s.txt', $load_stamp->strftime('%m-%d-%Y_%H.%M') )->stringify; my $rpt_file = file( '/tmp', sprintf 'suspect_tmp_files_rpt_%s.txt', $load_stamp->strftime('%m-%d-%Y_%H.%M') )->stringify; my $log = Log::Dispatch->new( outputs => [ [ 'File', min_level => 'debug', filename => $log_file, ], [ 'Screen', min_level => 'info' ], ], ); $SIG{__DIE__} = sub { ### Ignore evals. return if $^S; $log->error(shift); exit 1; }; $SIG{__WARN__} = sub { $log->warning(shift); }; #-------------------# # Prompt for shares # #-------------------# my @shares = qw(/data /xppdata /tmp /usr/local /export/home); natsort_inplace @shares; my @use_shares = prompt( 'm', { title => 'Choose the shares to search', prompt => 'Shares:', items => [ map { my $df_ref = dfportable($_) || die "Unable to determine disk usage for $_!\n"; sprintf "%-19s %2d%%", $_, $df_ref->{per}; } @shares ], accept_multiple_selections => 1, accept_empty_selection => 0, cols => 1, }, '', '' ); print "\n"; $log->info( "Shares searched:\n\t" . (join "\n\t" => @shares[@use_shares]) . "\n\n" ); #-----------------# # Hash user names # #-----------------# my %uid; while (my ($name, undef, $uid) = getpwent()) { $uid{$uid} = $name; } #-----------# # Set times # #-----------# my $now = time; my $a_month_ago = $now - (86_400 * 30); my $years_ago = $now - (86_400 * 365 * 2); #---------------------------------------# # Prep for finding temporary-like names # #---------------------------------------# my %ngram; my %dict; my $ngram_total; my $temp_re = qr/ \Acore\z | copy | dupe | t(?:e?mp|rash) | ba(?:k|ck_?up) | \b(?:old|test) | prev(?!iew) | ([a-z_])\1\1 | \bfoo\b | \.log | [~\$] /ix; ### Prep our dictionaries: Solaris and WordNet. my @dict; open my $DICT, '<', '/usr/share/lib/dict/words' or die $!; my $wn = WordNet::QueryData->new( dir => '/usr/local/wordnet/dict/', noload => 1, ); ### Gather words from WordNet. my @wn = grep { /^[a-z]+$/ } $wn->listAllWords; $log->info('WordNet words loaded:', format_number(scalar @wn), "\n"); ### Gather words from Solaris' dictionary. push @dict, $_ for grep { /^[a-z]+$/ } <$DICT>; $log->info('Solaris words loaded:', format_number(scalar @dict), "\n\n"); ### Compare for union. my $lc = List::Compare->new(\@wn, \@dict); ### Gather ngrams. ### Only allow words that are lowercase and have 3+ letters. for (grep { /^[a-z]{3,}$/ } $lc->get_union) { s/\s+\z//; ### Gather letter trios (ngrams, or, more specifically, trigrams). my $word = $_; my @ngrams = map { substr($word, $_, 3) } 0 .. (length $_) - 3; ### Tally. ++$ngram{$_} for @ngrams; ++$ngram_total; ### Only add 4+ lengths to the dictionary as many temps were matching lengths of 3. ++$dict{$_} if length >= 4; } $log->info('Words used in ngram creation:', format_number($ngram_total), "\n"); ### Done with these. undef $lc; undef @wn; undef @dict; ### Remove suspect temporary words and months from our dictionary. for my $word ( qw(archive backup copy core defunct dupe log old previous sample temporary test trash hold), qw(january february march april may june july august september october november december) ) { delete $dict{$word} if exists $dict{$word}; } $log->info('Words in assembled dictionary:', format_number(scalar keys %dict), "\n\n"); $log->info("Ngrams used:\n\n"); ### Remove ngrams with less than 1% occurrence. for my $ngram (sort {$ngram{$b} <=> $ngram{$a}} keys %ngram) { my $percentage = format_number(($ngram{$ngram} / $ngram_total) * 100, 1, 1); delete $ngram{$ngram} and next if $percentage < 1; $log->info(sprintf "\t%3s: %5s (%4s%%)\n", $ngram, format_number($ngram{$ngram}), $percentage); } $log->info("\n"); ### Build an RE based on the ngrams. my $ra = Regexp::Assemble->new; $ra->add($_) for keys %ngram; $log->info("Ngram RE:\n\n" . $ra->re, "\n\n"); ### Files must match these to be considered temporary. my @REs = ( ### Lower/upper case letters not in the extension. qr/\A[^.]+[a-z]/, qr/\A[^.]+[A-Z]/, ### Digit. qr/\d/, ### Name only contains upper/lower case letters or digits; may end in a tilde, dollars, or ext. qr/\A[a-zA-Z\d]+(?:[~\$]|\..{1,4})*\z/, ); #------------# # Find Files # #------------# my @files; my $start_stamp = DateTime->now(time_zone => $time_zone); $log->info('Search began on', $start_stamp->strftime('%A, %B %d, %Y @ %r'), "\n"); ### Start with anything a month old, perform an initial check, then look for guaranteeds or possibilities. File::Find::Rule ->atime("<$a_month_ago") ->exec(\&init_check) ->or( ### Check for those that match the temp RE and prune. File::Find::Rule ->name($temp_re) ->prune, ### Check for possible temporaries and don't prune (because they're possible, not guaranteed) File::Find::Rule ->exec(\&soft_check) ) ->exec(\&report) ->in(@shares[@use_shares]); my $end_stamp = DateTime->now(time_zone => $time_zone); my $diff_stamp = $start_stamp->subtract_datetime($end_stamp); $diff_stamp->in_units(qw(hours minutes seconds)); $log->info( sprintf "\nSearch ended on %s and took %d hours, %d minutes, and %d seconds.\n\n", $end_stamp->strftime('%A, %B %d, %Y @ %r'), map { $diff_stamp->$_} qw(hours minutes seconds) ); #--------------# # Write Report # #--------------# { open my $RPT, '>', $rpt_file or die $!; my $saver = SelectSaver->new($RPT); print 'Suspected Temporary Files: ', format_number($files), "\n"; print 'Total Size: ', format_bytes($total_size), "\n"; my $info_format = "%-5s %-14s %s\n"; my $last_user = ''; for my $file ( sort { ncmp($a->{user}, $b->{user}) || ncmp($b->{size}, $a->{size}) || ncmp($b->{how_old}, $a->{how_old}) } @files ) { my $user = $file->{user}; if ($user ne $last_user) { print "\n$user\n"; print '=' x 8, "\n"; printf $info_format, 'Size', 'Last Accessed', 'How Old?'; print '-' x 40, "\n"; } $file->{size} = format_bytes($file->{size}); printf $info_format, @{$file}{qw(size atime how_old)}; ### Print the path on its own line (so it can easily be copied/pasted). print "\n "; print $file->{path}; print "\n\n"; $last_user = $user; } } print "Complete.\n\nLog written to $log_file.\nReport written to $rpt_file\n\n"; #-----------# # Functions # #-----------# sub init_check { my ($name, undef, $path) = @_; ### Ignore links, sockets, and empties. return 0 if -l $path || -z _ || -S _; my $stat = stat($path) or die $!; ### Only bother with files 5MB+. return 0 if -f $path && $stat->size < 5_242_880; ### Ignore directories matching "prev", "previous", etc. per our set up. return 0 if -d $path && $name =~ /prev(?!iew)/; return 1; } sub soft_check { my ($name, undef, $path) = @_; ### Test for REs, words, then ngrams. return 0 unless all { $name =~ $_ } @REs; if (length $name >= 4) { for ($name =~ /([A-Za-z][a-z]{3,}|[A-Z]{4,})/g) { if (exists $dict{lc $_}) { $log->info("\tSkipping '$name' due to presence of '$_'\n"); $log->info("\t\t$path\n"); return 0; } } } return 0 if lc $name =~ $ra->re; return 1; } sub report { my ($name, undef, $path) = @_; my $stat = stat($path) or die $!; ### Proceed with reporting. my $user = $uid{$stat->uid} || '?'; my $file_stamp = DateTime->from_epoch(epoch => $stat->atime, time_zone => $time_zone); my $age_stamp = $load_stamp->subtract_datetime($file_stamp); $age_stamp->in_units(qw(years months weeks days)); my $how_old = sprintf '%dyr, %dmo, %dw, %dd', map { $age_stamp->$_} qw(years months weeks days); my $size; push @files, { user => $user, size => $size = -d $path ? du($path) : $stat->size, atime => time2str('%D %H:%M', $stat->atime), how_old => $how_old, path => -d $path ? "$path/" : $path, }; $total_size += $size; ++$files; }