Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl Monk, Perl Meditation
 
PerlMonks  

Word Count Script+

by xgunnerx (Initiate)
on May 22, 2002 at 15:07 UTC ( [id://168456]=sourcecode: print w/replies, xml ) Need Help??
Category: Miscellaneous
Author/Contact Info xgunnerx
Description: A word count script that "attempts" to be more accurate. It takes into account words at the end of a line that end with a hyphen, words that contain hyphens, etc. It also reports the frequency of every character, the frequency of every word, the average word length, and the average number of words per sentence. Results are written to results.txt.
#!/usr/bin/perl -w

# Word / character frequency report.
# Prompts for a file name on STDIN, reads that file into memory, and sets up
# the counters the rest of the script fills in.

use strict;
use warnings;

print "Enter a filename to analyze: ";
my $file_read = <STDIN>;
# <STDIN> yields undef at end of input (closed terminal, empty pipe); stop
# with a clear message instead of chomping and opening an undefined value.
defined $file_read or die "No filename given\n";
chomp $file_read;

my %w_counter  = ();    # word      => number of occurrences
my %c_counter  = ();    # character => number of occurrences
my $totalcount = 0;     # total number of words
my $charcount  = 0;     # total number of characters
my $p_val      = 0;     # general-purpose scratch value
my $var        = '';    # the whole text joined into one string; starts empty
                        # so an empty input file never leaves it undefined
my $avglen;             # declared but never assigned in this script

# Three-argument open with an explicit read mode and a lexical handle: the
# name was typed by the user, and the two-argument form would interpret
# characters such as '>' or '|' in it as a mode or a command to run.
open(my $in, '<', $file_read) or die "Could not open file: $!\n";
my @array = <$in>;      # one element per input line, line endings included
close $in;

# Join the lines held in @array into one long string ($var), then count every
# word found in it into %w_counter.

# An empty input file gives the loop nothing to append; make sure $var is a
# defined string before the substitutions below touch it.
$var = '' unless defined $var;

foreach my $line (@array) {
    # A line that ends in a word followed by one hyphen ("exam-"), but not in
    # a hyphenated compound followed by a hyphen ("well-known-"), has the
    # hyphen and the line break replaced together by a single space.
    # Every other line simply has its line break replaced by a space.
    # NOTE(review): because a space is inserted, a word hyphenated across two
    # lines is counted as two separate fragments; if the goal is to rejoin
    # such words, the replacement should be the empty string - confirm intent.
    if ($line =~ /\b-$/ && $line !~ /\b\w+-\w+-$/) {
        $line =~ s/\b-\n+/ /sg;
    }
    else {
        $line =~ s/\n/ /sg;
    }
    $var .= $line;
}

$var =~ s/-{2}/ /g;     # a double hyphen acts as a word separator
$var =~ tr/A-Z/a-z/;    # fold ASCII upper case; tr/// takes ranges directly,
                        # square brackets would be literal characters

# A word is a run of word characters, optionally followed (in this order) by
# a hyphen, an apostrophe, more word characters, a second hyphen and more
# word characters - e.g. "don't", "well-known", "mother-in-law".
my $subexpr = qr/(\w+-?'?\w*-?\w*)/;
while ($var =~ /$subexpr/g) {
    $w_counter{$1}++;
}

# Sort comparator for the word report: highest count first.
# Called by sort, which supplies two keys of %w_counter in the package
# variables $a and $b. Words with equal counts are ordered alphabetically so
# the report comes out the same on every run instead of following hash order.
sub sort_byval_w {
    return $w_counter{$b} <=> $w_counter{$a}
        || $a cmp $b;
}
# Create results.txt and write the word-frequency section, most frequent
# word first, adding each word's count to the running total in $totalcount.
# The open is checked: without it a read-only directory or full disk would
# silently produce no report at all.
open(RESULTS, '>', 'results.txt')
    or die "Could not open results.txt for writing: $!\n";
foreach my $key (sort sort_byval_w(keys %w_counter)) {
    my $times = $w_counter{$key};
    print RESULTS "The word $key was seen $times times\n";
    $totalcount += $times;
}

# Second pass over the input file: tally every single character, including
# spaces, tabs and line endings, into %c_counter.
# Three-argument open with an explicit read mode, because $file_read comes
# straight from user input and must be treated purely as a file name.
open(my $char_fh, '<', $file_read) or die "Could not open file: $!\n";
while (my $line = <$char_fh>) {
    # split with an empty pattern yields the line one character at a time
    $c_counter{$_}++ for split //, $line;
}
close $char_fh;

# Sort comparator for the character report: highest count first.
# Called by sort, which supplies two keys of %c_counter in the package
# variables $a and $b. Characters with equal counts are ordered by string
# comparison so the report order is stable from run to run.
sub sort_byval_c {
    return $c_counter{$b} <=> $c_counter{$a}
        || $a cmp $b;
}

# Write the character-frequency section of the report, most frequent
# character first, and add every count to the running total in $charcount.
# Characters that would be invisible in the report are printed under a
# readable placeholder; %c_counter itself is left untouched, only the
# printed label differs.
my %label_for = (
    "\t" => "<TAB>",
    " "  => "<SPACE>",
    "\n" => "<NEWLINE>",
    "\r" => "<RETURN>",
);
foreach my $key (sort sort_byval_c(keys %c_counter)) {
    # Read the count under the real character before choosing the label, so
    # carriage returns are reported and totalled like every other character.
    my $seen  = $c_counter{$key};
    my $label = exists $label_for{$key} ? $label_for{$key} : $key;
    print RESULTS "The char $label was seen $seen times\n";
    $charcount += $seen;
}

## Summary: totals, average words per sentence, average word length

# The number of sentences is taken to be the number of '.', '?' and '!'
# characters seen. A character that never occurred has no entry in
# %c_counter, so each lookup falls back to 0 instead of an undefined value.
my $sentences = 0;
foreach my $terminator (".", "?", "!") {
    $sentences += $c_counter{$terminator} || 0;
}

# Text without any sentence terminator would otherwise divide by zero.
my $avgwords_sent = $sentences ? $totalcount / $sentences : "n/a";

# Average word length is based on the letters of the counted words only;
# dividing the overall character total by the word count would also include
# spaces, punctuation and line endings in the "length".
my $word_chars = 0;
foreach my $word (keys %w_counter) {
    $word_chars += length($word) * $w_counter{$word};
}
# An input without any words would otherwise divide by zero.
my $avgwords_tot = $totalcount ? $word_chars / $totalcount : "n/a";

print RESULTS "\n";
print RESULTS "-----------------------------\n";
print RESULTS "Total characters are: $charcount\n";
print RESULTS "Total words: $totalcount\n";
print RESULTS "Average words per sentence is: $avgwords_sent\n";
print RESULTS "Average length of words is: $avgwords_tot\n";
# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so the result of close is checked before reporting success.
close(RESULTS) or die "Could not finish writing results.txt: $!\n";
print "See results.txt for your results\n";
Replies are listed 'Best First'.
Re: Word Count Script+
by d4vis (Chaplain) on May 22, 2002 at 20:44 UTC
    Nifty...
    though if it doesn't find a "." "?" or "!" in the input file, you'll get a warning:
    Use of uninitialized value in addition (+) at wc.pl line 95.

    ~monk d4vis
    #!/usr/bin/fnord

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: sourcecode [id://168456]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others chanting in the Monastery: (3)
As of 2025-07-20 06:56 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found

    Notices?
    erzuuli: Anonymous Monks are no longer allowed to use Super Search, due to an excessive use of this resource by robots.