Beefy Boxes and Bandwidth Generously Provided by pair Networks
"be consistent"
 
PerlMonks  

Word Count Script+

by xgunnerx (Initiate)
on May 22, 2002 at 15:07 UTC ( #168456=sourcecode: print w/ replies, xml ) Need Help??

Category: Miscellaneous
Author/Contact Info xgunnerx
Description: A word count script that "attempts" to be more accurate. It takes into account words at the end of a line that end with a hyphen, words that contain hyphens, etc. It also gives a frequency count for every character and for every word, the average length of words, and the average number of words per sentence. Results go in results.txt
#!/usr/bin/perl -w

use strict;
# Prompt for the file to analyse, declare the counters shared by the rest of
# the script, and slurp the file's lines into @array.
print "Enter a filename to analyze: ";
my $file_read = <STDIN>;
# <STDIN> returns undef at end of input; stop here rather than chomping undef
# and trying to open an empty name.
die "No filename given\n" unless defined $file_read;
chomp $file_read;
# Two-argument open used to ignore surrounding whitespace in the name; trim it
# explicitly so padded input keeps working with the three-argument form below.
$file_read =~ s/^\s+//;
$file_read =~ s/\s+$//;
my %w_counter = ();     # word      => number of occurrences
my %c_counter = ();     # character => number of occurrences
my $totalcount = 0;     # running total of words
my $charcount = 0;      # running total of characters
my $p_val = 0;          # scratch value used while relabelling whitespace chars
my $var = '';           # the whole text joined into one string; starts empty
                        # so an empty input file does not leave it undefined

# Three-argument open: the typed name is always treated as a plain path to
# read, never as a mode prefix or a pipe specification.
open(my $in_fh, '<', $file_read) or die "Could not open file: $!\n";
my @array = <$in_fh>;
close $in_fh;

# Flatten the file into the single string $var, one line at a time:
#   * a line ending in a hyphenated compound plus a further hyphen
#     (e.g. "mother-in-") keeps that hyphen; only its newline becomes a space;
#   * any other line whose last word is followed by a hyphen has the hyphen
#     and the newline replaced by a single space;
#   * every remaining line just has its newline turned into a space.
for my $line (@array) {
    my $keeps_hyphen   = $line =~ /\b\w+\-\w+\-$/;
    my $ends_in_hyphen = $line =~ /\b-$/;

    if ($ends_in_hyphen && !$keeps_hyphen) {
        $line =~ s/\b\-\n+/ /sg;
    }
    else {
        $line =~ s/\n/ /sg;
    }
    $var .= $line;
}
# Normalise the joined text and tally every word in %w_counter.
# An empty input file leaves $var undefined; treat that as empty text so the
# operations below do not emit "uninitialized value" warnings under -w.
$var = '' unless defined $var;
$var =~ s/-{2}/ /g;     # a double hyphen (typed dash) separates words
$var =~ tr/A-Z/a-z/;    # fold ASCII case so "The" and "the" count together
# A word is a run of word characters, optionally followed by a hyphen and/or
# an apostrophe and more word characters, then optionally one more hyphenated
# part (e.g. "don't", "well-known", "mother-in-law").
my $word_re = qr/([\w]+[-]?[']?(?:\w*)?[-]?(?:\w*)?)/;
while ($var =~ /$word_re/g) {
    $w_counter{$1}++;
}

# Sort comparator: orders words by how often they were seen, most frequent
# first. Reads the counts from %w_counter; $a and $b are supplied by sort.
sub sort_byval_w {
    my ($count_left, $count_right) = ($w_counter{$a}, $w_counter{$b});
    return $count_right <=> $count_left;
}
# Write the word-frequency report (most frequent first) and accumulate the
# total word count. The RESULTS handle is left open for the later sections.
# Fail loudly if the report cannot be created instead of printing to a
# handle that never opened.
open(RESULTS, '>', 'results.txt')
    or die "Could not open results.txt for writing: $!\n";
foreach my $key (sort sort_byval_w(keys %w_counter)) {
    print RESULTS "The word $key was seen $w_counter{$key} times\n";
    $totalcount += $w_counter{$key};
}

# Second pass over the input: tally every single character (including
# whitespace and line terminators) in %c_counter.
# Three-argument open keeps the user-supplied name from being interpreted as
# a mode or pipe specification.
open(my $char_fh, '<', $file_read) or die "Could not open file: $!\n";

while (my $line = <$char_fh>) {
    # split // yields one element per character, newline included.
    $c_counter{$_}++ for split //, $line;
}
close $char_fh;

# Sort comparator: orders characters by how often they were seen, most
# frequent first. Reads the counts from %c_counter; $a and $b are supplied
# by sort.
sub sort_byval_c {
    my ($count_left, $count_right) = ($c_counter{$a}, $c_counter{$b});
    return $count_right <=> $count_left;
}

# Write the character-frequency report (most frequent first) and accumulate
# the total character count. Invisible characters are re-keyed in %c_counter
# under a readable label so the report line shows e.g. "<TAB>" instead of a
# raw tab.
my %label_for = (
    "\t" => "<TAB>",
    " "  => "<SPACE>",
    "\n" => "<NEWLINE>",
    "\r" => "<RETURN>",
);

foreach my $key (sort sort_byval_c(keys %c_counter)) {
    if (exists $label_for{$key}) {
        # Fetch the count under the ORIGINAL key before renaming it.
        # Renaming first (as the carriage-return case used to) looks up the
        # label, which has no entry, and silently loses the count.
        $p_val = delete $c_counter{$key};
        $key = $label_for{$key};
        $c_counter{$key} = $p_val;
    }
    # One report line per character; the format string must not contain a
    # stray line break or "+" marker.
    print RESULTS "The char $key was seen $c_counter{$key} times\n";
    $charcount += $c_counter{$key};
}

## Get average number of words per sentence and average word length

# A sentence is approximated by each ".", "?" or "!" seen. A terminator that
# never occurs has no entry in %c_counter, so count it as 0 instead of adding
# undef (which warns under -w).
my $sentences = 0;
foreach my $terminator (".", "?", "!") {
    $sentences += $c_counter{$terminator} || 0;
}

# Guard both divisions: input with no sentence terminator, or with no words at
# all, would otherwise die with "Illegal division by zero".
my $avgwords_sent = $sentences  ? $totalcount / $sentences  : 0;
# NOTE(review): $charcount covers every character read (spaces and punctuation
# included), so this is characters-per-word rather than a strict word length.
my $avgwords_tot  = $totalcount ? $charcount  / $totalcount : 0;

print RESULTS "\n";
print RESULTS "-----------------------------\n";
print RESULTS "Total characters are: $charcount\n";
print RESULTS "Total words: $totalcount\n";
print RESULTS "Average words per sentence is: $avgwords_sent\n";
print RESULTS "Average length of words is: $avgwords_tot\n";
# Buffered write errors only surface at close, so check it.
close RESULTS or die "Could not close results.txt: $!\n";
print "See results.txt for your results\n";

Comment on Word Count Script+
Download Code
Re: Word Count Script+
by d4vis (Chaplain) on May 22, 2002 at 20:44 UTC
    Nifty...
    though if it doesn't find a "." "?" or "!" in the input file, you'll get a warning:
    Use of uninitialized value in addition (+) at wc.pl line 95.

    ~monk d4vis
    #!/usr/bin/fnord

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://168456]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others examining the Monastery: (9)
As of 2014-07-25 04:29 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    My favorite superfluous repetitious redundant duplicative phrase is:









    Results (167 votes), past polls