Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl: the Markov chain saw
 
PerlMonks  

A little cleanup ...

by pablopelos (Initiate)
on Nov 04, 2011 at 06:09 UTC ( #935837=note: print w/ replies, xml ) Need Help??


in reply to Naive Bayes Classifier Using Laplacian Smoothing

A little code cleanup goes a long way. When your were adding to your dictionary it wasn't adding the total count of each word it was just over-riding with the new count. Put in the quicker way to calculate the dictionary size. Also the word count on a bag was not returning a zero if the word did not occur in that bag leading to uninitialized errors.

#!/usr/local/bin/perl -w # Using Info from AI class implement a classifier that learns using Na +ive Bayes with Laplace # Smoothing to guess whether a title is a book song or movie. # Store wrong issues as training data in books.txt, movies.txt, and so +ngs.txt use strict; use Data::Dumper; import bag; import Dictionary; # training Data my $bookbag = new bag(); $bookbag->addItem("Small Business Kit For Fools"); $bookbag->addItem("Small Business Kit For Dummies"); $bookbag->addItem("S-Corporation: Small Business Start-Up Kit"); $bookbag->addItem("Home Business Tax Deductions"); $bookbag->addItem("Essential Technical Analysis"); $bookbag->addItem("Technical Analysis of the Markets"); $bookbag->addItem("Introduction to probability and statistics"); $bookbag->addItem("Differential Equations"); $bookbag->addItem("Principles of Physics"); $bookbag->addItem("Perl Testing, A developer's notebook"); $bookbag->addItem("BreakAWAY Careers"); $bookbag->addItem("A Game of Thrones"); $bookbag->addItem("The Cleric Quintet"); $bookbag->addItem("Exile's Song"); $bookbag->addItem("Honor's War"); my $moviebag = new bag(); $moviebag->addItem("Star Wars Episode IV: A New Hope"); $moviebag->addItem("Shrek"); $moviebag->addItem("Rocky"); $moviebag->addItem("Shrek II"); $moviebag->addItem("Rocky II"); $moviebag->addItem("Star Trek"); $moviebag->addItem("Star Trek II: The Wrath of Khan"); $moviebag->addItem("Independence Day"); $moviebag->addItem("I know what you did last summer"); $moviebag->addItem("A Nightmare on Elm Street"); $moviebag->addItem("Transformers"); $moviebag->addItem("Pirates of The Carribean II: The Black Pearl"); $moviebag->addItem("A Cinderella Story"); $moviebag->addItem("The Never Ending Story"); my $songbag = new bag(); $songbag->addItem("Short Skirt /Long Jacket"); $songbag->addItem("My name is Jonas"); $songbag->addItem("Wild Wild Life"); $songbag->addItem("You can call Me Al"); $songbag->addItem("If you only Knew"); $songbag->addItem("Sweet Sacrifice"); $songbag->addItem("Tear Away"); $songbag->addItem("Taking Care of Business Business"); my $dictionary = new Dictionary( $bookbag, $moviebag, $songbag ); #printf " Size of dictionary is :%d \n", $dictionary->size(); #print Dumper $dictionary; #die; #### Laplace Smoothing Co-efficient. our $k = 7; our $input =''; my $bookprob = base_probability( $k, $bookbag, $moviebag, $songb +ag); my $movieprob = base_probability( $k, $moviebag, $songbag, $bookb +ag); my $songprob = base_probability( $k, $songbag, $moviebag, $bookb +ag); prompt(); $input = <STDIN>; chomp ($input); while ( $input ne 'q') { ####DO my %choice = (); # calc Probability my $m = titleprob($input, $moviebag, $movieprob, $k, $dictionary) +; my $bk = titleprob($input, $bookbag, $movieprob, $k, $dictionary); my $s = titleprob($input, $songbag, $movieprob, $k, $dictionary); $choice{MOVIE} = $m / ( $m + $bk + $s ); $choice{BOOK} = $bk / ( $m + $bk + $s ); $choice{SONG} = $s / ( $m + $bk + $s ); #select the most likely my @sortedchoice = sort {$choice{$b} <=> $choice{$a} }( keys %cho +ice ); printf "\n\nThat title is probaby a %s with a %d %% confidence \n +", $sortedchoice[0], $choice{$sortedchoice[0]} *100; CHECK: print "Correct? (y/yes) \nif not enter b if it should be a book,\ +n\t m if it should be a movie and\n\t s if it should be a son +g\n"; my $check = <STDIN>; chomp($check); if ($check =~ m/[bms]/i){ print " Noted correction\n"; # add title to appropriate bag switch: { $check eq 'b' && do { $bookbag->addItem($input); last swi +tch;}; $check eq 'm' && do { $moviebag->addItem($input);last swi +tch;}; $check eq 's' && do { $songbag->addItem($input);last swit +ch;}; } # re-create dictionary #print Dumper $bookbag; $dictionary = new Dictionary( $moviebag, $songbag, $bookbag ); #print Dumper $dictionary; #recalculate base probs $bookprob = base_probability( $k, $bookbag, $moviebag, $ +songbag); $movieprob = base_probability( $k, $moviebag, $songbag, $ +bookbag); $songprob = base_probability( $k, $songbag, $moviebag, $ +bookbag); } elsif ($check =~ m/yes|y/i){ print "Cool!\n" } else{ goto CHECK; } ####Stuff prompt(); $input = <STDIN>; chomp ($input); last if ( $input eq 'q' ); } ###################################################################### sub final_probability { # ( $title, moviebag, bookbag,songbag, k, dict +ionary } sub base_probability { # return base probability for class represented as the first bag # this is a number between 0 and 1 my ($k, @listofbags) = @_; my $total = 0; foreach my $bag (@listofbags) { $total += $bag->itemCount(); } my $probability = ( $listofbags[0]->itemCount() + $k)/ ( $total + +$k * @listofbags ); return $probability; } sub wordprob { # Give probability of a particular word in a bag my ($word, $bag, $dictionary, $k) = @_; my $prob = ($bag->instanceCount($word) + $k ) / ($bag->bagSize() + + $k*$dictionary->size() ); return $prob; } sub titleprob { # Given a base probability and a bag calculate the particular prob + of the title. my ($title, $bag, $bprob, $k, $dictionary) = @_; #split up title into words my @words = split(/\s/,$title); my $prob = $bprob; foreach my $word (@words) { $prob *= wordprob($word, $bag, $dictionary, $k); } return $prob; } sub prompt { printf("movie %.4f book %.4f song %.4f \n", $movieprob, $bookpro +b, $songprob); printf("There are %d words in the dictionary\n", $dictionary->size +()); print "Enter a title. (Enter 'q' to quit) :"; } ###################################################################### +######## #### Data structures package bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } # Bag sub new { my $class = shift; my $title = shift; my $self = {}; bless $self, $class; if ( defined $title ) { addItem ($title);} return $self; } # Bag Ops sub addItem { # addItem ('Title') # Accept a title, and put words into bag my $self = shift; my $item = shift; $self->{TITLECOUNT} +=1; #chomp it chomp($item); #clear punctuation $item =~ s/[-:?.]|\// /g; my @wordcollection = split /\s+/,$item; foreach my $word (@wordcollection) { $self->{BAGSIZE} +=1; $self->{BAG}{uc($word)} +=1; } } sub instanceCount { # instanceCount('Word') # returns the number of times a word occurs in a bag. my $self = shift; my $word = shift; return $self->{BAG}{uc($word)} || 0 ; } sub setSize { # setSize() # Returns the number of unique elements in bag. my $self = shift; return scalar keys (%{$self->{BAG}}); } sub bagSize { # bagSize() # Returns the number of words in bag counting all duplicates. my $self = shift; return $self->{BAGSIZE}; } sub itemCount { #return how many titles were added to the bag. my $self = shift; return $self->{TITLECOUNT}; } 1; ###################################################################### +######## package Dictionary; use Data::Dumper; import bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } # Dictionary sub new { my $class = shift; my $self = {}; bless $self, $class; # bags are @_ if (@_) { $self->addBags(@_); } bless $self,$class; } # Dictionary Ops sub addBags { my $self = shift; my @bags = @_; my %dictionary = (); #sort by set size @bags = sort { $b->setSize()<=> $a->setSize()} @bags; foreach my $bag (@bags) { foreach (keys %{$bag->{BAG}}) { $dictionary{$_} += $bag->{BAG}->{$_}; } #print Dumper(\%dictionary); } $self->{DICTIONARY} = \%dictionary; } sub size { #my $self = shift; return scalar keys %{shift()->{DICTIONARY}}; } 1;


Comment on A little cleanup ...
Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://935837]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others cooling their heels in the Monastery: (19)
As of 2015-07-28 20:47 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (258 votes), past polls