Beefy Boxes and Bandwidth Generously Provided by pair Networks
P is for Practical
 
PerlMonks  

Word Count Script+

by xgunnerx (Initiate)
on May 22, 2002 at 15:07 UTC ( #168456=sourcecode: print w/ replies, xml ) Need Help??

Category: Miscellaneous
Author/Contact Info xgunnerx
Description: A word count script that "attempts" to be more accurate. It takes into account words at the end of a line that end with a hyphen, words that contain hyphens, etc. It also reports the frequency of every character, the frequency of every word, the average word length, and the average number of words per sentence. Results go in results.txt.
#!/usr/bin/perl -w

# Word Count Script+ -- setup section.
# Asks the user for a filename, declares the file-wide counters used by the
# rest of the script, and loads the whole file into @array (one element per
# line).

use strict;

print "Enter a filename to analyze: ";
my $file_read = <STDIN>;
die "No filename given\n" unless defined $file_read;   # STDIN hit EOF
chomp $file_read;

my %w_counter  = ();    # word      => number of times seen
my %c_counter  = ();    # character => number of times seen
my $totalcount = 0;     # total number of words
my $charcount  = 0;     # total number of characters
my $p_val      = 0;     # scratch value
my $var        = '';    # all input lines joined into one string; starts
                        # empty so an empty file does not leave it undef
my $avglen;             # declared but not used in the visible script

# Three-argument open: the filename is typed in by the user, so it must
# never be interpreted as a mode or pipe specification.
open(my $in_fh, '<', $file_read) or die "Could not open file: $!\n";
my @array = <$in_fh>;
close $in_fh;

# Flatten the input lines into one lower-cased string ($var) and tally every
# word-like token found in it into %w_counter.

$var = '' unless defined $var;    # nothing was appended if the file is empty

foreach my $line (@array) {
    # A line ending in "word-" (but not in "word-word-") has its trailing
    # hyphen removed together with the newline; every other line just has
    # its newline turned into a space.
    # NOTE(review): the hyphen is replaced by a space, so a word broken
    # across two lines is still counted as two tokens -- confirm whether
    # joining the halves was intended.
    if ($line !~ /\b\w+-\w+-$/ and $line =~ /\b-$/) {
        $line =~ s/\b-\n+/ /g;
    }
    else {
        $line =~ s/\n/ /g;
    }
    $var .= $line;
}

$var =~ s/-{2}/ /g;     # "--" acts as a separator, not part of a word
$var =~ tr/A-Z/a-z/;    # fold case; tr/// takes bare ranges, not [...] classes

# A token is: word characters, an optional hyphen, an optional apostrophe,
# more word characters, another optional hyphen, more word characters.
my $word_re = qr/(\w+-?'?\w*-?\w*)/;
while ($var =~ /$word_re/g) {
    $w_counter{$1}++;
}

# Sort comparator for the keys of %w_counter (uses sort's $a and $b).
# Orders by count, highest first; words with the same count are ordered
# alphabetically so the report does not depend on hash ordering.
sub sort_byval_w {
    return $w_counter{$b} <=> $w_counter{$a}
        || $a cmp $b;
}
# Write the word-frequency section of the report (most frequent first) and
# add up the total number of words while doing so.
# RESULTS is kept as a bareword handle because the later sections of the
# script print to it and close it.
open(RESULTS, '>', 'results.txt')
    or die "Could not open results.txt for writing: $!\n";
foreach my $word (sort sort_byval_w(keys %w_counter)) {
    my $seen = $w_counter{$word};
    print RESULTS "The word $word was seen $seen times\n";
    $totalcount += $seen;
}

# Second pass over the input file: tally every single character, including
# whitespace and line terminators, into %c_counter.
# Three-argument open keeps the user-supplied filename from being read as a
# mode or pipe specification.
open(my $char_fh, '<', $file_read) or die "Could not open file: $!\n";

while (my $line = <$char_fh>) {
    $c_counter{$_}++ for split //, $line;
}
close $char_fh;

# Sort comparator for the keys of %c_counter (uses sort's $a and $b).
# Orders by count, highest first; characters with the same count are ordered
# by string comparison so the report does not depend on hash ordering.
sub sort_byval_c {
    return $c_counter{$b} <=> $c_counter{$a}
        || $a cmp $b;
}

# Write the character-frequency section of the report (most frequent first)
# and add up the total number of characters.
# Invisible characters are printed under a readable label; the count is
# always read under the real character before the label is chosen, so the
# carriage-return count is reported like every other one.
my %label_for = (
    "\t" => "<TAB>",
    " "  => "<SPACE>",
    "\n" => "<NEWLINE>",
    "\r" => "<RETURN>",
);

foreach my $char (sort sort_byval_c(keys %c_counter)) {
    my $seen  = $c_counter{$char};
    my $label = exists $label_for{$char} ? $label_for{$char} : $char;

    # One report line per character, terminated by a single newline.
    print RESULTS "The char $label was seen $seen times\n";
    $charcount += $seen;
}

## Summary: totals, average words per sentence, average word length

# Sentences are approximated by counting sentence terminators. A character
# that never occurred has no entry in %c_counter, so count it as zero
# instead of adding undef.
my $sentences = 0;
foreach my $terminator (".", "?", "!") {
    $sentences += $c_counter{$terminator} || 0;
}

# Number of characters that belong to counted words. The overall character
# total also contains whitespace and punctuation, so it cannot be used for
# the average word length.
my $word_chars = 0;
foreach my $word (keys %w_counter) {
    $word_chars += length($word) * $w_counter{$word};
}

# Guard both divisions: input without sentence terminators or without any
# words would otherwise abort with "Illegal division by zero".
my $avgwords_sent = $sentences  ? $totalcount / $sentences  : 0;
my $avgwords_tot  = $totalcount ? $word_chars / $totalcount : 0;

print RESULTS "\n";
print RESULTS "-----------------------------\n";
print RESULTS "Total characters are: $charcount\n";
print RESULTS "Total words: $totalcount\n";
print RESULTS "Average words per sentence is: $avgwords_sent\n";
print RESULTS "Average length of words is: $avgwords_tot\n";

# Buffered write errors only show up at close, so check it.
close(RESULTS) or die "Could not close results.txt: $!\n";
print "See results.txt for your results\n";

Comment on Word Count Script+
Download Code
Re: Word Count Script+
by d4vis (Chaplain) on May 22, 2002 at 20:44 UTC
    Nifty...
    though if it doesn't find a "." "?" or "!" in the input file, you'll get a warning:
    Use of uninitialized value in addition (+) at wc.pl line 95.

    ~monk d4vis
    #!/usr/bin/fnord

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://168456]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others lurking in the Monastery: (6)
As of 2015-07-02 23:10 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (47 votes), past polls