#!/usr/bin/perl use warnings; use strict; use File::Copy; my $gui; my $mw; sub doit; sub remdupes; sub remuntr; print "\nSwitch on the GUI?\ny/n "; chomp ($gui = ); $gui or $gui = "y"; if ($gui eq "n") {$gui = ""}; if ($gui) {require Tk;import Tk;$mw = MainWindow->new; # $mw->title("Filterdupes"); $mw->geometry('300x100'); } my ($sourcefile_full, $folder, $sourcefile, $sourcefile_noext, $dupes, $untranslated, $unfiltered, $filtered); # GUI if ($gui) { my $intro = $mw -> Label(-text=>"This app strips duplicate entries from tab delimited txt files.\nPress \"Browse\" to pick your input file.") -> pack(); my $butt_browse; $butt_browse = $mw -> Button(-text => "Browse", -command => sub {$butt_browse->destroy;$intro->destroy;doit})-> pack(); MainLoop(); } else { doit; } sub doit { if ($gui) { $sourcefile_full = $mw->getOpenFile( -title => "Please choose the input file" # -multiple => 1 ); } else { print "\nDrag and drop the tab delimited source file here and press enter.\nThe file must be in UTF-8 encoding.\n"; chomp ($sourcefile_full = ); } do { # windows doesn't add quotes if there is no space in the path, linux adds single quotes # strip any leading and trailing spaces and quotes; $1=everything up to last / or \, $2= everything from there up to the end except spaces and "'. $sourcefile_full =~ /^ *[\"\']?(.*)[\/\\]([^\"\']*)[\"\']? *$/; $folder = $1; $sourcefile = $2; $sourcefile =~ /(.*)\.(.*)/; $sourcefile_noext = $1; # strip quotes $sourcefile_full =~ s/^ *[\"\']?([^\"\']*)[\"\']? *$/$1/; print "\nFile not found (maybe its path or its filename contains accented letters). Try again!\n\n" unless (-e "$sourcefile_full"); }until (-e "$sourcefile_full"); copy ("$sourcefile_full", "$folder/${sourcefile_noext}_filtered.txt") or die "cannot create backup: $!"; # sleep 5; # DUPES Q if ($gui) { # my $frm_remdupes = $mw -> Frame(-borderwidth => 4, -relief => 'groove') -> pack(-anchor=>'w'); my $frm_remdupes = $mw -> Frame() -> pack(); my $butt_removedupes = $frm_remdupes -> Button(-text => "Remove dupes", -command => sub {$dupes = "y";$frm_remdupes -> destroy; remdupes})-> pack(); my $butt_dontremovedupes = $frm_remdupes -> Button(-text => "Do not remove dupes", -command => sub {$dupes = "n";$frm_remdupes -> destroy; remdupes})-> pack(); } else { do { print "\nRemove duplicates? (case sensitive)\n[y/n] "; chomp ($dupes = lc()); $dupes or $dupes = "y"; print "\nAnswer with y or n.\n\n" unless $dupes eq "y" or $dupes eq "n"; }until ($dupes eq "y" or $dupes eq "n"); remdupes; } } # REMOVE DUPES sub remdupes { if ($dupes eq "y") { open (IN, "<:encoding(UTF-8)", "$folder/${sourcefile_noext}_filtered.txt") or die "Can't open file: $!"; open (OUT, ">>:encoding(UTF-8)", "$folder/${sourcefile_noext}_filtered_mod.txt") or die "Can't open file: $!"; my %seen; # hash that contains uique records (hash lookups are faster than array lookups) my $key; # key to be put in hash while () { /^([^\t]*\t[^\t]*)/; # only watch first two fields chomp ($key = $1); # only watch first two fields #$key = $_; # watch the full line print OUT $_ if (! $seen{ $key }++); # add to hash, and if new, print to file } $unfiltered = $.; $filtered = keys %seen; if ($gui) { my $lab_dupes = $mw -> Label(-text=>"Filtered out dupes: $unfiltered -> $filtered") -> pack(-anchor=> 'w'); } else { print "\nFiltered out dupes: $unfiltered -> $filtered\n"; } undef %seen; # free up memory # reporting close IN; close OUT; rename ("$folder/${sourcefile_noext}_filtered_mod.txt", "$folder/${sourcefile_noext}_filtered.txt") or die "Can't rename file: $!"; } # UNTRANSLATED Q #c if ($gui) { # my $frm_remuntr = $mw -> Frame(-borderwidth => 4, -relief => 'groove') -> pack(-anchor=>'w'); my $frm_remuntr = $mw -> Frame() -> pack(); my $butt_removeuntr = $frm_remuntr -> Button(-text => "Remove untranslated entries", -command => sub {$untranslated = "y";$frm_remuntr -> destroy; remuntr})-> pack(); my $butt_dontremoveuntr = $frm_remuntr -> Button(-text => "Do not remove untranslated", -command => sub {$untranslated = "n";$frm_remuntr -> destroy; remuntr})-> pack(); } else { do { print "\nRemove untranslated entries (where the first two fields are identical)?\n[y/n] "; chomp ($untranslated = lc()); $untranslated or $untranslated = "y"; print "\nAnswer with y or n.\n\n" unless $untranslated eq "y" or $untranslated eq "n"; }until ($untranslated eq "y" or $untranslated eq "n"); } } # REMOVE UNTRANSLATED sub remuntr { if ($untranslated eq "y") { open (IN, "<:encoding(UTF-8)", "$folder/${sourcefile_noext}_filtered.txt") or die "Can't open file: $!"; open (OUT, ">>:encoding(UTF-8)", "$folder/${sourcefile_noext}_filtered_mod.txt") or die "Can't open file: $!"; my $filtered = "0"; while () { /^([^\t]*)\t([^\t]*)/; next if $1 eq $2; print OUT $_; $filtered++; } my $unfiltered = $.; if ($gui) { my $lab_untr = $mw -> Label(-text=>"Filtered out untranslated: $unfiltered -> $filtered") -> pack(-anchor=> 'w'); } else { print "\nFiltered out untranslated: $unfiltered -> $filtered\n"; } # reporting close IN; close OUT; rename ("$folder/${sourcefile_noext}_filtered_mod.txt", "$folder/${sourcefile_noext}_filtered.txt") or die "Can't rename file: $!"; } # END if ($gui) { my $butt_exit = $mw -> Button(-text => "Exit", -command => sub {$mw->destroy})-> pack(); } else { print "\nPress enter to quit"; ; } }