use warnings;
use strict;
use File::Find::Rule;
use File::stat;
use Number::Bytes::Human qw(format_bytes);
use Date::Format;
use DateTime;
use Filesys::DfPortable;
use Filesys::DiskUsage qw(du); 
use List::Compare;
use List::MoreUtils qw(all);
use Log::Dispatch;
use Term::Prompt;
use SelectSaver;
use Path::Class;
use Number::Format qw(format_number);
use Regexp::Assemble;
use Sort::Key::Natural qw(natsort_inplace);
use Sort::Naturally;
use WordNet::QueryData;

my $files = 0;
my $total_size = 0;
my $time_zone = DateTime::TimeZone->new(name => 'local');
my $load_stamp = DateTime->now(time_zone => $time_zone);

#---------#
# Logging #
#---------#
my $log_file = file(
	'/tmp',
	sprintf 'suspect_tmp_files_log_%s.txt', $load_stamp->strftime('%m-%d-%Y_%H.%M')
)->stringify;
my $rpt_file = file(
	'/tmp',
	sprintf 'suspect_tmp_files_rpt_%s.txt', $load_stamp->strftime('%m-%d-%Y_%H.%M')
)->stringify;
my $log = Log::Dispatch->new(
	outputs =>
		[
			[
				'File',
				min_level => 'debug',
				filename => $log_file,
			],
			[ 'Screen', min_level => 'info' ],
		],
);
$SIG{__DIE__} = sub {
	### Ignore evals.
	return if $^S;
	$log->error(shift);
	exit 1;
};
$SIG{__WARN__} = sub { $log->warning(shift); };

#-------------------#
# Prompt for shares #
#-------------------#
my @shares = qw(/data /xppdata /tmp /usr/local /export/home);
natsort_inplace @shares;
my @use_shares = prompt(
	'm',
	{
		title => 'Choose the shares to search',
		prompt => 'Shares:',
		items => [ map {
			my $df_ref = dfportable($_) || die "Unable to determine disk usage for $_!\n";
			sprintf "%-19s  %2d%%", $_, $df_ref->{per};
		} @shares ],
		accept_multiple_selections => 1,
		accept_empty_selection => 0,
		cols => 1,
	},
	'',
	''
);
print "\n";
$log->info(
	"Shares searched:\n\t" .
	(join "\n\t" => @shares[@use_shares]) .
	"\n\n"
);

#-----------------#
# Hash user names #
#-----------------#
my %uid;
while (my ($name, undef, $uid) = getpwent()) {
	$uid{$uid} = $name;
}

#-----------#
# Set times #
#-----------#
my $now = time;
my $a_month_ago = $now - (86_400 * 30);
my $years_ago = $now - (86_400 * 365 * 2);

#---------------------------------------#
# Prep for finding temporary-like names #
#---------------------------------------#
my %ngram;
my %dict;
my $ngram_total;
my $temp_re = qr/
	\Acore\z |
	copy |
	dupe |
	t(?:e?mp|rash) |
	ba(?:k|ck_?up) |
	\b(?:old|test) |
	prev(?!iew) |
	([a-z_])\1\1 |
	\bfoo\b |
	\.log |
	[~\$]
/ix;

### Prep our dictionaries: Solaris and WordNet.
my @dict;
open my $DICT, '<', '/usr/share/lib/dict/words' or die $!;
my $wn = WordNet::QueryData->new(
	dir => '/usr/local/wordnet/dict/',
	noload => 1,
);

### Gather words from WordNet.
my @wn = grep { /^[a-z]+$/ } $wn->listAllWords;
$log->info('WordNet words loaded:', format_number(scalar @wn), "\n");

### Gather words from Solaris' dictionary.
push @dict, $_ for grep { /^[a-z]+$/ } <$DICT>;
$log->info('Solaris words loaded:', format_number(scalar @dict), "\n\n");

### Compare for union.
my $lc = List::Compare->new(\@wn, \@dict);

### Gather ngrams.
### Only allow words that are lowercase and have 3+ letters.
for (grep { /^[a-z]{3,}$/ } $lc->get_union) {
	s/\s+\z//;
	### Gather letter trios (ngrams, or, more specifically, trigrams).
	my $word = $_;
	my @ngrams = map { substr($word, $_, 3) } 0 .. (length $_) - 3;
	### Tally.
	++$ngram{$_} for @ngrams;
	++$ngram_total;
	### Only add 4+ lengths to the dictionary as many temps were matching lengths of 3.
	++$dict{$_} if length >= 4;
}
$log->info('Words used in ngram creation:', format_number($ngram_total), "\n");

### Done with these.
undef $lc;
undef @wn;
undef @dict;

### Remove suspect temporary words and months from our dictionary.
for my $word (
	qw(archive backup copy core defunct dupe log old previous sample temporary test trash hold),
	qw(january february march april may june july august september october november december)
) {
	delete $dict{$word} if exists $dict{$word};
}
$log->info('Words in assembled dictionary:', format_number(scalar keys %dict), "\n\n");

$log->info("Ngrams used:\n\n");
### Remove ngrams with less than 1% occurrence.
for my $ngram (sort {$ngram{$b} <=> $ngram{$a}} keys %ngram) {
	my $percentage = format_number(($ngram{$ngram} / $ngram_total) * 100, 1, 1);
	delete $ngram{$ngram} and next if $percentage < 1;
	$log->info(sprintf "\t%3s: %5s (%4s%%)\n", $ngram, format_number($ngram{$ngram}), $percentage);
}
$log->info("\n");

### Build an RE based on the ngrams.
my $ra = Regexp::Assemble->new;
$ra->add($_) for keys %ngram;
$log->info("Ngram RE:\n\n" . $ra->re, "\n\n");

### Files must match these to be considered temporary.
my @REs = (
	### Lower/upper case letters not in the extension.
	qr/\A[^.]+[a-z]/,
	qr/\A[^.]+[A-Z]/,
	### Digit.
	qr/\d/,
	### Name only contains upper/lower case letters or digits; may end in a tilde, dollars, or ext.
	qr/\A[a-zA-Z\d]+(?:[~\$]|\..{1,4})*\z/,
);

#------------#
# Find Files #
#------------#
my @files;
my $start_stamp = DateTime->now(time_zone => $time_zone);
$log->info('Search began on', $start_stamp->strftime('%A, %B %d, %Y @ %r'), "\n");
### Start with anything a month old, perform an initial check, then look for guaranteeds or possibilities.
File::Find::Rule
	->atime("<$a_month_ago")
	->exec(\&init_check)
	->or(
		### Check for those that match the temp RE and prune.
		File::Find::Rule
			->name($temp_re)
			->prune,
		### Check for possible temporaries and don't prune (because they're possible, not guaranteed)
		File::Find::Rule
			->exec(\&soft_check)
	)
	->exec(\&report)
	->in(@shares[@use_shares]);

my $end_stamp = DateTime->now(time_zone => $time_zone);
my $diff_stamp = $start_stamp->subtract_datetime($end_stamp);
$diff_stamp->in_units(qw(hours minutes seconds));
$log->info(
	sprintf "\nSearch ended on %s and took %d hours, %d minutes, and %d seconds.\n\n",
		$end_stamp->strftime('%A, %B %d, %Y @ %r'),
		map { $diff_stamp->$_} qw(hours minutes seconds)
);

#--------------#
# Write Report #
#--------------#
{
	open my $RPT, '>', $rpt_file or die $!;
	my $saver = SelectSaver->new($RPT);
	
	print 'Suspected Temporary Files: ', format_number($files), "\n";
	print 'Total Size: ', format_bytes($total_size), "\n";
	
	my $info_format = "%-5s   %-14s   %s\n";

	my $last_user = '';
	for my $file (
		sort {
				ncmp($a->{user}, $b->{user}) ||
				ncmp($b->{size}, $a->{size}) ||
				ncmp($b->{how_old}, $a->{how_old})
		} @files
	) {

		my $user = $file->{user};
		if ($user ne $last_user) {
			print "\n$user\n";
			print '=' x 8, "\n";
			printf $info_format, 'Size', 'Last Accessed', 'How Old?';
			print '-' x 40, "\n";
		}
		
		$file->{size} = format_bytes($file->{size});
		
		printf $info_format, @{$file}{qw(size atime how_old)};
		### Print the path on its own line (so it can easily be copied/pasted).
		print "\n  ";
		print $file->{path};
		print "\n\n";

		$last_user = $user;
	}
}

print "Complete.\n\nLog written to $log_file.\nReport written to $rpt_file\n\n";

#-----------#
# Functions #
#-----------#
sub init_check {
	my ($name, undef, $path) = @_;
	### Ignore links, sockets, and empties.
	return 0 if -l $path || -z _ || -S _;
	
	my $stat = stat($path) or die $!;
	### Only bother with files 5MB+.
	return 0 if -f $path && $stat->size < 5_242_880;
	### Ignore directories matching "prev", "previous", etc. per our set up.
	return 0 if -d $path && $name =~ /prev(?!iew)/;
	return 1;
}

sub soft_check {
	my ($name, undef, $path) = @_;
	### Test for REs, words, then ngrams.
	return 0 unless all { $name =~ $_ } @REs;
	if (length $name >= 4) {
		for ($name =~ /([A-Za-z][a-z]{3,}|[A-Z]{4,})/g) {
			if (exists $dict{lc $_}) {
				$log->info("\tSkipping '$name' due to presence of '$_'\n");
				$log->info("\t\t$path\n");
				return 0;
			}
		}
	}
	return 0 if lc $name =~ $ra->re;
	return 1;
}

sub report {
	my ($name, undef, $path) = @_;
	my $stat = stat($path) or die $!;
	### Proceed with reporting.
	my $user = $uid{$stat->uid} || '?';
	my $file_stamp = DateTime->from_epoch(epoch => $stat->atime, time_zone => $time_zone);
	my $age_stamp = $load_stamp->subtract_datetime($file_stamp);
	$age_stamp->in_units(qw(years months weeks days));
	my $how_old = sprintf '%dyr, %dmo, %dw, %dd', map { $age_stamp->$_} qw(years months weeks days);
	my $size;
	push @files, {
		user => $user,
		size => $size = -d $path ? du($path) : $stat->size,
		atime => time2str('%D %H:%M', $stat->atime),
		how_old => $how_old,
		path => -d $path ? "$path/" : $path,
	};
	$total_size += $size;
	++$files;
}