Choose the shares to search --------------------------- 1) /data 69% 2) /export/home 53% 3) /tmp 0% 4) /usr/local 75% 5) /xppdata 79% Shares: (1 - 5, separate multiple entries with spaces) 1 2 3 4 5 Shares searched: /data /export/home /tmp /usr/local /xppdata WordNet words loaded: 55,191 Solaris words loaded: 20,068 Words used in ngram creation: 74,874 Words in assembled dictionary: 60,968 Ngrams used: ion: 3,387 ( 4.5%) tio: 2,738 ( 3.7%) ess: 2,693 ( 3.6%) ing: 2,643 ( 3.5%) ter: 2,531 ( 3.4%) ati: 2,523 ( 3.4%) nes: 2,300 ( 3.1%) ent: 1,942 ( 2.6%) ine: 1,649 ( 2.2%) ate: 1,557 ( 2.1%) ant: 1,374 ( 1.8%) ist: 1,356 ( 1.8%) per: 1,194 ( 1.6%) con: 1,117 ( 1.5%) ity: 1,105 ( 1.5%) man: 1,059 ( 1.4%) men: 1,059 ( 1.4%) lin: 1,043 ( 1.4%) tor: 1,033 ( 1.4%) ali: 1,019 ( 1.4%) tra: 1,018 ( 1.4%) and: 1,001 ( 1.3%) ida: 992 ( 1.3%) eri: 991 ( 1.3%) ene: 977 ( 1.3%) ste: 972 ( 1.3%) ble: 939 ( 1.3%) her: 930 ( 1.2%) ari: 927 ( 1.2%) ran: 926 ( 1.2%) ism: 911 ( 1.2%) lit: 909 ( 1.2%) ous: 906 ( 1.2%) tic: 904 ( 1.2%) the: 884 ( 1.2%) pro: 880 ( 1.2%) str: 876 ( 1.2%) nce: 875 ( 1.2%) rat: 868 ( 1.2%) tin: 865 ( 1.2%) ast: 842 ( 1.1%) era: 835 ( 1.1%) ver: 830 ( 1.1%) tri: 824 ( 1.1%) sta: 820 ( 1.1%) der: 817 ( 1.1%) ill: 807 ( 1.1%) rin: 807 ( 1.1%) gra: 801 ( 1.1%) ace: 800 ( 1.1%) est: 797 ( 1.1%) ria: 790 ( 1.1%) res: 782 ( 1.0%) dae: 779 ( 1.0%) nti: 771 ( 1.0%) sti: 763 ( 1.0%) ian: 761 ( 1.0%) lat: 732 ( 1.0%) all: 728 ( 1.0%) ell: 726 ( 1.0%) ect: 721 ( 1.0%) ina: 719 ( 1.0%) Ngram RE: (?-xism:(?:i(?:n[aeg]|[ao]n|s[mt]|da|ll|ty)|a(?:l[il]|n[dt]|t[ei]|ce|ri|st)|e(?:n[et]|r[ai]|s[st]|ct|ll)|t(?:i[cno]|[eo]r|r[ai]|he)|r(?:a[nt]|i[an]|es)|(?:m[ae]|co)n|l(?:i[nt]|at)|n(?:ce|es|ti)|d(?:ae|er)|p(?:er|ro)|st[aeir]|[hv]er|ble|gra|ous)) Search began on Friday, January 14, 2011 @ 03:49:05 PM ... Search ended on Friday, January 14, 2011 @ 04:40:08 PM and took 0 hours, 51 minutes, and 3 seconds. Complete. Log written to /tmp/suspect_tmp_files_log_01-14-2011_15.48.txt. 
Report written to /tmp/suspect_tmp_files_rpt_01-14-2011_15.48.txt
####
Suspected Temporary Files: 2,453
Total Size: 25G

user_name
=========
Size  Last Accessed  How Old?
----------------------------------------
3.3G  07/10/09 05:59 1yr, 6mo, 0w, 4d

 /path/with/a/directory/named/backups/

11M   11/17/04 16:36 6yr, 1mo, 3w, 5d

 /path/to/a/database/named/test.something

####
use warnings;
use strict;

use File::Find::Rule;
use File::stat;
use Number::Bytes::Human qw(format_bytes);
use Date::Format;
use DateTime;
use Filesys::DfPortable;
use Filesys::DiskUsage qw(du);
use List::Compare;
use List::MoreUtils qw(all);
use Log::Dispatch;
use Term::Prompt;
use SelectSaver;
use Path::Class;
use Number::Format qw(format_number);
use Regexp::Assemble;
use Sort::Key::Natural qw(natsort_inplace);
use Sort::Naturally;
use WordNet::QueryData;

### Running totals, updated by report() for every file or directory recorded.
my $files      = 0;
my $total_size = 0;

my $time_zone  = DateTime::TimeZone->new(name => 'local');
my $load_stamp = DateTime->now(time_zone => $time_zone);

#---------#
# Logging #
#---------#

### The log and the report share the same timestamp suffix.
my $stamp_suffix = $load_stamp->strftime('%m-%d-%Y_%H.%M');

my $log_file = file(
    '/tmp',
    sprintf 'suspect_tmp_files_log_%s.txt', $stamp_suffix
)->stringify;

my $rpt_file = file(
    '/tmp',
    sprintf 'suspect_tmp_files_rpt_%s.txt', $stamp_suffix
)->stringify;

### Everything (debug and up) goes to the log file; info and up is echoed to the screen.
my $log = Log::Dispatch->new(
    outputs => [
        [
            'File',
            min_level => 'debug',
            filename  => $log_file,
        ],
        [ 'Screen', min_level => 'info' ],
    ],
);

### Route fatal errors through the logger, then exit non-zero.
$SIG{__DIE__} = sub {
    ### Ignore evals.
    return if $^S;

    $log->error(shift);
    exit 1;
};

### Route warnings through the logger as well.
$SIG{__WARN__} = sub {
    $log->warning(shift);
};

#-------------------#
# Prompt for shares #
#-------------------#

my @shares = qw(/data /xppdata /tmp /usr/local /export/home);
natsort_inplace @shares;

### Multi-select menu; each item shows the share and its percentage used.
### The returned values are used below as indexes into @shares.
my @use_shares = prompt(
    'm',
    {
        title  => 'Choose the shares to search',
        prompt => 'Shares:',
        items  => [
            map {
                my $df_ref = dfportable($_) || die "Unable to determine disk usage for $_!\n";
                sprintf "%-19s %2d%%", $_, $df_ref->{per};
            } @shares
        ],
        accept_multiple_selections => 1,
        accept_empty_selection     => 0,
        cols                       => 1,
    },
    '',
    ''
);

print "\n";

$log->info(
    "Shares searched:\n\t" .
    (join "\n\t" => @shares[@use_shares]) .
    "\n\n"
);

#-----------------#
# Hash user names #
#-----------------#

### Map uid => login name so report() can show owners by name.
my %uid;

while (my ($name, undef, $uid) = getpwent()) {
    $uid{$uid} = $name;
}

### Release the passwd database now that we have what we need.
endpwent();

#-----------#
# Set times #
#-----------#

my $now         = time;
my $a_month_ago = $now - (86_400 * 30);

### NOTE(review): $years_ago is not referenced anywhere in the visible code.
my $years_ago   = $now - (86_400 * 365 * 2);

#---------------------------------------#
# Prep for finding temporary-like names #
#---------------------------------------#

### %ngram:       trigram => number of occurrences across all words.
### %dict:        words of 4+ letters that mark a name as "real", not temporary.
### $ngram_total: number of WORDS tallied (the denominator for the percentages).
my %ngram;
my %dict;
my $ngram_total = 0;

### Names matching this are treated as guaranteed temporaries.
my $temp_re = qr/
    \Acore\z
  | copy
  | dupe
  | t(?:e?mp|rash)
  | ba(?:k|ck_?up)
  | \b(?:old|test)
  | prev(?!iew)
  | ([a-z_])\1\1
  | \bfoo\b
  | \.log
  | [~\$]
/ix;

### Prep our dictionaries: Solaris and WordNet.
my @dict;

open my $DICT, '<', '/usr/share/lib/dict/words' or die $!;

my $wn = WordNet::QueryData->new(
    dir    => '/usr/local/wordnet/dict/',
    noload => 1,
);

### Gather words from WordNet.
my @wn = grep { /^[a-z]+$/ } $wn->listAllWords;

$log->info('WordNet words loaded:', format_number(scalar @wn), "\n");

### Gather words from Solaris' dictionary.
### Each line is chomped BEFORE it is stored: with the newline left on, the
### union below can never merge a Solaris word with the same WordNet word,
### and every shared word gets tallied twice.
while (my $line = <$DICT>) {
    chomp $line;
    push @dict, $line if $line =~ /\A[a-z]+\z/;
}

close $DICT or die $!;

$log->info('Solaris words loaded:', format_number(scalar @dict), "\n\n");

### Compare for union.
my $lc = List::Compare->new(\@wn, \@dict);

### Gather ngrams.
### Only allow words that are lowercase and have 3+ letters.
for (grep { /^[a-z]{3,}$/ } $lc->get_union) {
    ### Work on a copy, defensively stripped of any trailing whitespace.
    (my $word = $_) =~ s/\s+\z//;

    ### Gather letter trios (ngrams, or, more specifically, trigrams) and tally.
    ++$ngram{ substr $word, $_, 3 } for 0 .. length($word) - 3;
    ++$ngram_total;

    ### Only add 4+ lengths to the dictionary as many temps were matching lengths of 3.
    ++$dict{$word} if length($word) >= 4;
}

$log->info('Words used in ngram creation:', format_number($ngram_total), "\n");

### Done with these.
undef $lc;
undef @wn;
undef @dict;

### Remove suspect temporary words and months from our dictionary.
delete @dict{
    qw(archive backup copy core defunct dupe log old previous sample temporary test trash hold),
    qw(january february march april may june july august september october november december)
};

$log->info('Words in assembled dictionary:', format_number(scalar keys %dict), "\n\n");
$log->info("Ngrams used:\n\n");

### Remove ngrams with less than 1% occurrence.
### The comparison uses the value rounded to one decimal place, exactly as logged.
for my $ngram (sort { $ngram{$b} <=> $ngram{$a} } keys %ngram) {
    my $percentage = format_number(($ngram{$ngram} / $ngram_total) * 100, 1, 1);

    if ($percentage < 1) {
        delete $ngram{$ngram};
        next;
    }

    $log->info(sprintf "\t%3s: %5s (%4s%%)\n", $ngram, format_number($ngram{$ngram}), $percentage);
}

$log->info("\n");

### Build an RE based on the ngrams.
my $ra = Regexp::Assemble->new;
$ra->add($_) for keys %ngram;

### Fetch the assembled RE once; soft_check() applies it to every candidate name.
my $ngram_re = $ra->re;

$log->info("Ngram RE:\n\n" . $ngram_re, "\n\n");

### Files must match these to be considered temporary.
my @REs = (
    ### Lower/upper case letters not in the extension.
    qr/\A[^.]+[a-z]/,
    qr/\A[^.]+[A-Z]/,

    ### Digit.
    qr/\d/,

    ### Name only contains upper/lower case letters or digits; may end in a tilde, dollars, or ext.
    qr/\A[a-zA-Z\d]+(?:[~\$]|\..{1,4})*\z/,
);

#------------#
# Find Files #
#------------#

### Filled in by report(): one hash ref per suspected temporary.
my @files;

my $start_stamp = DateTime->now(time_zone => $time_zone);

$log->info('Search began on', $start_stamp->strftime('%A, %B %d, %Y @ %r'), "\n");

### Start with anything a month old, perform an initial check, then look for guaranteeds or possibilities.
File::Find::Rule
    ->atime("<$a_month_ago")
    ->exec(\&init_check)
    ->or(
        ### Check for those that match the temp RE and prune.
        File::Find::Rule
            ->name($temp_re)
            ->prune,

        ### Check for possible temporaries and don't prune (because they're possible, not guaranteed)
        File::Find::Rule
            ->exec(\&soft_check)
    )
    ->exec(\&report)
    ->in(@shares[@use_shares]);

my $end_stamp = DateTime->now(time_zone => $time_zone);

### Elapsed time is end minus start.
my $diff_stamp = $end_stamp->subtract_datetime($start_stamp);

$log->info(
    sprintf "\nSearch ended on %s and took %d hours, %d minutes, and %d seconds.\n\n",
        $end_stamp->strftime('%A, %B %d, %Y @ %r'),
        map { $diff_stamp->$_ } qw(hours minutes seconds)
);

#--------------#
# Write Report #
#--------------#

{
    open my $RPT, '>', $rpt_file or die $!;

    {
        ### Make the report handle the default for print/printf inside this scope only.
        my $saver = SelectSaver->new($RPT);

        print 'Suspected Temporary Files: ', format_number($files), "\n";
        print 'Total Size: ', format_bytes($total_size), "\n";

        my $info_format = "%-5s %-14s %s\n";
        my $last_user   = '';

        ### Group by user, then largest first, then oldest first.
        my @sorted = sort {
            ncmp($a->{user}, $b->{user}) ||
            ncmp($b->{size}, $a->{size}) ||
            ncmp($b->{how_old}, $a->{how_old})
        } @files;

        for my $file (@sorted) {
            my $user = $file->{user};

            ### Print a header whenever the owner changes.
            if ($user ne $last_user) {
                print "\n$user\n";
                print '=' x 8, "\n";

                printf $info_format, 'Size', 'Last Accessed', 'How Old?';
                print '-' x 40, "\n";
            }

            ### Sorting is finished, so the raw byte count can be replaced by its display form.
            $file->{size} = format_bytes($file->{size});

            printf $info_format, @{$file}{qw(size atime how_old)};

            ### Print the path on its own line (so it can easily be copied/pasted).
            print "\n ";
            print $file->{path};
            print "\n\n";

            $last_user = $user;
        }
    }

    ### Close explicitly so a failed (buffered) write is not silently lost.
    close $RPT or die $!;
}

print "Complete.\n\nLog written to $log_file.\nReport written to $rpt_file\n\n";

#-----------#
# Functions #
#-----------#

### init_check($name, undef, $path)
### First filter applied to everything old enough to be considered.
### Returns 1 to keep evaluating the entry, 0 to drop it.
sub init_check {
    my ($name, undef, $path) = @_;

    ### Ignore links, sockets, and empties.
    return 0 if -l $path || -z _ || -S _;

    ### NOTE(review): a stat failure here (e.g. the file vanishes mid-scan) aborts the whole run.
    my $stat = stat($path) or die $!;

    ### Only bother with files 5MB+.
    return 0 if -f $path and $stat->size < 5_242_880;

    ### Ignore directories matching "prev", "previous", etc. per our set up.
    return 0 if -d $path and $name =~ /prev(?!iew)/;

    return 1;
}

### soft_check($name, undef, $path)
### Decides whether a name merely LOOKS temporary (machine-generated).
### Returns 1 for a possible temporary, 0 otherwise.
sub soft_check {
    my ($name, undef, $path) = @_;

    ### Test for REs, words, then ngrams.
    return 0 unless all { $name =~ $_ } @REs;

    ### Any dictionary word inside the name means it was probably named by a person.
    if (length $name >= 4) {
        for my $chunk ($name =~ /([A-Za-z][a-z]{3,}|[A-Z]{4,})/g) {
            next unless exists $dict{lc $chunk};

            $log->info("\tSkipping '$name' due to presence of '$chunk'\n");
            $log->info("\t\t$path\n");

            return 0;
        }
    }

    ### The ngram RE is lowercase only, so lowercase the name first.
    ### The parentheses matter: "lc $name =~ $re" parses as "lc($name =~ $re)".
    return 0 if lc($name) =~ $ngram_re;

    return 1;
}

### report($name, undef, $path)
### Records one suspected temporary in @files and updates the running totals.
sub report {
    my ($name, undef, $path) = @_;

    ### NOTE(review): as in init_check(), a stat failure aborts the whole run.
    my $stat = stat($path) or die $!;

    ### Proceed with reporting.
    my $user = $uid{$stat->uid} || '?';

    ### Age is measured from the time the script was loaded back to the last access.
    my $file_stamp = DateTime->from_epoch(epoch => $stat->atime, time_zone => $time_zone);
    my $age_stamp  = $load_stamp->subtract_datetime($file_stamp);

    my $how_old = sprintf '%dyr, %dmo, %dw, %dd', map { $age_stamp->$_ } qw(years months weeks days);

    ### Directories are sized by their disk usage; files by their own size.
    my $is_dir = -d $path;
    my $size   = $is_dir ? du($path) : $stat->size;

    push @files, {
        user    => $user,
        size    => $size,
        atime   => time2str('%D %H:%M', $stat->atime),
        how_old => $how_old,
        path    => $is_dir ? "$path/" : $path,
    };

    $total_size += $size;
    ++$files;
}