Beefy Boxes and Bandwidth Generously Provided by pair Networks
Do you know where your variables are?
 
PerlMonks  

Naive Bayes Classifier Using Laplacian Smoothing

by talwyn (Monk)
on Nov 02, 2011 at 19:34 UTC ( #935480=CUFP: print w/ replies, xml ) Need Help??

So I'm following the AI class from Stanford and we just finished a grueling section on probability and how it is used in the real world and how AI can use it to classify things.

Below is a kinda rough implementation extending the in-class example to three classes of input and using un-even training set with the K coefficient set to 7 to smooth it out.

It works for all training data and does appear to learn correctly from new examples given by the user. Let me know what you guys think.

# Using Info from AI class implement a classifier that learns using Na +ive Bayes with Laplace # Smoothing to guess whether a title is a book song or movie. # Store wrong issues as training data in books.txt, movies.txt, and so +ngs.txt use strict; use Data::Dumper; import bag; import Dictionary; # training Data my $bookbag = new bag(); $bookbag->addItem("Small Business Kit For Dummies"); $bookbag->addItem("S-Corporation: Small Business Start-Up Kit"); $bookbag->addItem("Home Business Tax Deductions"); $bookbag->addItem("Essential Technical Analysis"); $bookbag->addItem("Technical Analysis of the Markets"); $bookbag->addItem("Introduction to probability and statistics"); $bookbag->addItem("Differential Equations"); $bookbag->addItem("Principles of Physics"); $bookbag->addItem("Perl Testing, A developer's notebook"); $bookbag->addItem("BreakAWAY Careers"); $bookbag->addItem("A Game of Thrones"); $bookbag->addItem("The Cleric Quintet"); $bookbag->addItem("Exile's Song"); $bookbag->addItem("Honor's War"); my $moviebag = new bag(); $moviebag->addItem("Star Wars Episode IV: A New Hope"); $moviebag->addItem("Shrek"); $moviebag->addItem("Rocky"); $moviebag->addItem("SHrek II"); $moviebag->addItem("Rocky II"); $moviebag->addItem("Star Trek"); $moviebag->addItem("Star Trek II: The Wrath of Khan"); $moviebag->addItem("Independence Day"); $moviebag->addItem("I know what you did last summer"); $moviebag->addItem("A Nightmare on Elm Street"); $moviebag->addItem("Transformers"); $moviebag->addItem("Pirates of The Carribean II: The Black Pearl"); $moviebag->addItem("A Cinderella Story"); $moviebag->addItem("The Never Ending Story"); my $songbag = new bag(); $songbag->addItem("Short Skirt /Long Jacket"); $songbag->addItem("My name is Jonas"); $songbag->addItem("Wild Wild Life"); $songbag->addItem("You can call Me Al"); $songbag->addItem("If you only Knew"); $songbag->addItem("Sweet Sacrifice"); $songbag->addItem("Tear Away"); my $dictionary = new Dictionary( $bookbag, $moviebag, $songbag ); #printf " Size of dictionary is :%d \n", $dictionary->size(); #print Dumper $dictionary; #die; #### Laplace Smoothing Co-efficient. our $k = 7; our $input =''; my $bookprob = base_probability( $k, $bookbag, $moviebag, $songb +ag); my $movieprob = base_probability( $k, $moviebag, $songbag, $bookb +ag); my $songprob = base_probability( $k, $songbag, $moviebag, $bookb +ag); prompt(); $input = <STDIN>; chomp ($input); while ( $input ne 'q') { ####DO my %choice = (); # calc Probability $choice{MOVIE}= my $m = titleprob($input, $moviebag, $movieprob, +$k, $dictionary); $choice{BOOK} = my $bk = titleprob($input, $bookbag, $movieprob, $ +k, $dictionary); $choice{SONG} = my $s = titleprob($input, $songbag, $movieprob, $k +, $dictionary); $choice{MOVIE} =$m / ( $m + $bk + $s ); $choice{BOOK} = $bk / ( $m + $bk + $s ); $choice{SONG} = $s / ( $m + $bk + $s ); #select the most likely my @sortedchoice = sort {$choice{$b} <=>$choice{$a} }( keys %choi +ce ); printf "\n\nThat title is probaby a %s with a %d %% confidence \n +", $sortedchoice[0], $choice{$sortedchoice[0]} *100; CHECK: print "Correct? enter b if it should be a book, m if it should be +a movie and s if it should be a song\n"; my $check = <STDIN>; chomp($check); if ($check =~ m/[bms]/i){ print " Noted correction\n"; # add title to appropriate bag switch: { $check eq 'b' && do { $bookbag->addItem($input); last swi +tch;}; $check eq 'm' && do { $moviebag->addItem($input);last swi +tch;}; $check eq 's' && do { $songbag->addItem($input);last swit +ch;}; } # re-create dictionary #print Dumper $bookbag; $dictionary = new Dictionary( $moviebag, $songbag, $bookbag ); #print Dumper $dictionary; #recalculate base probs $bookprob = base_probability( $k, $bookbag, $moviebag, $ +songbag); $movieprob = base_probability( $k, $moviebag, $songbag, $ +bookbag); $songprob = base_probability( $k, $songbag, $moviebag, $ +bookbag); } elsif ($check=~m/yes|y/i){ print "Cool!\n" } else{ goto CHECK; } ####Stuff prompt(); $input = <STDIN>; chomp ($input); last if ( $input eq 'q' ); } ###################################################################### sub final_probability { # ( $title, moviebag, bookbag,songbag, k, dict +ionary } sub base_probability() {# ( $k,@listofbags ) { # return base probability for class represented as the first bag # this is a number between 0 and 1 my $k = shift; my @listofbags = @_; my $total = 0; foreach my $bag (@listofbags) { $total += $bag->itemCount(); } my $probability = ( $listofbags[0]->itemCount() + $k)/ ( $total + +$k * @listofbags ); return $probability; } sub wordprob {#( $word, $bag, $dictionary, $k) { #Give probability of a particular word in a bag my $word = shift; my $bag = shift; my $prob = shift; my $k = shift; my $prob = ($bag->instanceCount($word) + $k )/ ($bag->bagSize() + +$k*$dictionary->size() ); return $prob; } sub titleprob{ #( $title, $bag , $baseprobability, $k, $dictionary){ # Given a base probability and a bag calculate the particular prob + of the title. # my $title = shift; my $bag = shift; my $bprob = shift; my $k = shift; my $dictionary = shift; #split up title into words my @words = split / /, $title; my $prob = $bprob; foreach my $word (@words) { $prob *= wordprob($word, $bag, $dictionary, $k); } return $prob; } sub prompt { printf"movie %.4f book %.4f song %.4f \n", $movieprob, $bookprob +, $songprob; printf "There are %d words in the dictionary\n", $dictionary->size +(); print "Enter a title. (Enter 'q' to quit) :"; } ###################################################################### +######## #### Data structures package bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } ## Bags sub new { my $class = shift; my $title = shift; my $self = {}; bless $self, $class; if ( defined $title ) { addItem ($title);} return $self; } ###Ops sub addItem(){ # addItem ('Title') # Accept a title, and put words into bag my $self = shift; my $item = shift; $self->{TITLECOUNT} +=1; #chomp it chomp($item); #clear punctuation $item =~ s/[-:?.]|\// /g; my @wordcollection = split /\s+/,$item; foreach my $word (@wordcollection) { $self->{BAGSIZE} +=1; $self->{BAG}{uc ($word)} +=1; } } sub instanceCount(){ # instanceCount('Word') # returns the number of times a word occurs in a bag. my $self = shift; my $word = shift; return $self->{BAG}{uc($word)}; } sub setSize(){ # setSize() # Returns the number of unique elements in bag. my $self = shift; return scalar keys (%{$self->{BAG}}); } sub bagSize(){ # bagSize() # Returns the number of words in bag counting all duplicates. my $self = shift; return $self->{BAGSIZE}; } sub itemCount() { #return how many titles were added to the bag. my $self = shift; return $self->{TITLECOUNT}; } 1; ###################################################################### +######## package Dictionary; use Data::Dumper; import bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } sub new () { my $class = shift; my $self = {}; bless $self, $class; my @bags = @_; if (defined @bags) { $self->addBags(@bags); } bless $self,$class; } ## Dictionary sub addBags( @bagName ){ my $self = shift; my @bags = @_; my %dictionary = (); #sort by set size @bags = sort { $b->setSize()<=> $a->setSize()} @bags; foreach my $bag (@bags) { %dictionary = ( %{$bag->{BAG}}, %dictionary); #print Dumper %dictionary; } $self->{DICTIONARY} = \%dictionary; } sub size(){ # setSize() # Returns the number of unique elements in bag. my $self = shift; my %dictionary = %{$self->{DICTIONARY}}; # remove empty string key created by concat operation #delete $self->{DICTIONARY}{''}; return scalar keys (%dictionary); } 1;

Comment on Naive Bayes Classifier Using Laplacian Smoothing
Download Code
Re: Naive Bayes Classifier Using Laplacian Smoothing
by BrowserUk (Pope) on Nov 02, 2011 at 19:49 UTC
    Let me know what you guys think.

    The first thing that leapt of the page for me was:

    sub size(){ # setSize() # Returns the number of unique elements in bag. my $self = shift; my %dictionary = %{$self->{DICTIONARY}}; # remove empty string key created by concat operation #delete $self->{DICTIONARY}{''}; return scalar keys (%dictionary); }

    Copying an entire hash from a reference into a local hash just to return how many keys it has is insanely wasteful. Especially as you are calling this over and over again in a sort comparator.

    Coded as:

    sub size { # setSize() # Returns the number of unique elements in bag. return scalar keys %{ shift() }; }

    Will speed up your sort by orders of magnitude.

    Also, you should not be using -- and would be getting a messages telling you so if you were using strict & warnings -- an empty prototype sub size() { on a subroutine that takes parameters.


    With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
    Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
    "Science is about questioning the status quo. Questioning authority".
    In the absence of evidence, opinion is indistinguishable from prejudice.

      As I said this is a rough see-how /if it works implementation. That said. I do appreciate the input I was having lots of trouble with references. Making a 'real' hash was only thing that was working for me.

      I'll see if I can get your revision to work for me. I did not notice any hit at N < = 200 words in the dictionary so its probably something I'll run into when I implement persistence and actually save the learned data to become the next round of training data.

A little cleanup ...
by pablopelos (Initiate) on Nov 04, 2011 at 06:09 UTC

    A little code cleanup goes a long way. When your were adding to your dictionary it wasn't adding the total count of each word it was just over-riding with the new count. Put in the quicker way to calculate the dictionary size. Also the word count on a bag was not returning a zero if the word did not occur in that bag leading to uninitialized errors.

    #!/usr/local/bin/perl -w # Using Info from AI class implement a classifier that learns using Na +ive Bayes with Laplace # Smoothing to guess whether a title is a book song or movie. # Store wrong issues as training data in books.txt, movies.txt, and so +ngs.txt use strict; use Data::Dumper; import bag; import Dictionary; # training Data my $bookbag = new bag(); $bookbag->addItem("Small Business Kit For Fools"); $bookbag->addItem("Small Business Kit For Dummies"); $bookbag->addItem("S-Corporation: Small Business Start-Up Kit"); $bookbag->addItem("Home Business Tax Deductions"); $bookbag->addItem("Essential Technical Analysis"); $bookbag->addItem("Technical Analysis of the Markets"); $bookbag->addItem("Introduction to probability and statistics"); $bookbag->addItem("Differential Equations"); $bookbag->addItem("Principles of Physics"); $bookbag->addItem("Perl Testing, A developer's notebook"); $bookbag->addItem("BreakAWAY Careers"); $bookbag->addItem("A Game of Thrones"); $bookbag->addItem("The Cleric Quintet"); $bookbag->addItem("Exile's Song"); $bookbag->addItem("Honor's War"); my $moviebag = new bag(); $moviebag->addItem("Star Wars Episode IV: A New Hope"); $moviebag->addItem("Shrek"); $moviebag->addItem("Rocky"); $moviebag->addItem("Shrek II"); $moviebag->addItem("Rocky II"); $moviebag->addItem("Star Trek"); $moviebag->addItem("Star Trek II: The Wrath of Khan"); $moviebag->addItem("Independence Day"); $moviebag->addItem("I know what you did last summer"); $moviebag->addItem("A Nightmare on Elm Street"); $moviebag->addItem("Transformers"); $moviebag->addItem("Pirates of The Carribean II: The Black Pearl"); $moviebag->addItem("A Cinderella Story"); $moviebag->addItem("The Never Ending Story"); my $songbag = new bag(); $songbag->addItem("Short Skirt /Long Jacket"); $songbag->addItem("My name is Jonas"); $songbag->addItem("Wild Wild Life"); $songbag->addItem("You can call Me Al"); $songbag->addItem("If you only Knew"); $songbag->addItem("Sweet Sacrifice"); $songbag->addItem("Tear Away"); $songbag->addItem("Taking Care of Business Business"); my $dictionary = new Dictionary( $bookbag, $moviebag, $songbag ); #printf " Size of dictionary is :%d \n", $dictionary->size(); #print Dumper $dictionary; #die; #### Laplace Smoothing Co-efficient. our $k = 7; our $input =''; my $bookprob = base_probability( $k, $bookbag, $moviebag, $songb +ag); my $movieprob = base_probability( $k, $moviebag, $songbag, $bookb +ag); my $songprob = base_probability( $k, $songbag, $moviebag, $bookb +ag); prompt(); $input = <STDIN>; chomp ($input); while ( $input ne 'q') { ####DO my %choice = (); # calc Probability my $m = titleprob($input, $moviebag, $movieprob, $k, $dictionary) +; my $bk = titleprob($input, $bookbag, $movieprob, $k, $dictionary); my $s = titleprob($input, $songbag, $movieprob, $k, $dictionary); $choice{MOVIE} = $m / ( $m + $bk + $s ); $choice{BOOK} = $bk / ( $m + $bk + $s ); $choice{SONG} = $s / ( $m + $bk + $s ); #select the most likely my @sortedchoice = sort {$choice{$b} <=> $choice{$a} }( keys %cho +ice ); printf "\n\nThat title is probaby a %s with a %d %% confidence \n +", $sortedchoice[0], $choice{$sortedchoice[0]} *100; CHECK: print "Correct? (y/yes) \nif not enter b if it should be a book,\ +n\t m if it should be a movie and\n\t s if it should be a son +g\n"; my $check = <STDIN>; chomp($check); if ($check =~ m/[bms]/i){ print " Noted correction\n"; # add title to appropriate bag switch: { $check eq 'b' && do { $bookbag->addItem($input); last swi +tch;}; $check eq 'm' && do { $moviebag->addItem($input);last swi +tch;}; $check eq 's' && do { $songbag->addItem($input);last swit +ch;}; } # re-create dictionary #print Dumper $bookbag; $dictionary = new Dictionary( $moviebag, $songbag, $bookbag ); #print Dumper $dictionary; #recalculate base probs $bookprob = base_probability( $k, $bookbag, $moviebag, $ +songbag); $movieprob = base_probability( $k, $moviebag, $songbag, $ +bookbag); $songprob = base_probability( $k, $songbag, $moviebag, $ +bookbag); } elsif ($check =~ m/yes|y/i){ print "Cool!\n" } else{ goto CHECK; } ####Stuff prompt(); $input = <STDIN>; chomp ($input); last if ( $input eq 'q' ); } ###################################################################### sub final_probability { # ( $title, moviebag, bookbag,songbag, k, dict +ionary } sub base_probability { # return base probability for class represented as the first bag # this is a number between 0 and 1 my ($k, @listofbags) = @_; my $total = 0; foreach my $bag (@listofbags) { $total += $bag->itemCount(); } my $probability = ( $listofbags[0]->itemCount() + $k)/ ( $total + +$k * @listofbags ); return $probability; } sub wordprob { # Give probability of a particular word in a bag my ($word, $bag, $dictionary, $k) = @_; my $prob = ($bag->instanceCount($word) + $k ) / ($bag->bagSize() + + $k*$dictionary->size() ); return $prob; } sub titleprob { # Given a base probability and a bag calculate the particular prob + of the title. my ($title, $bag, $bprob, $k, $dictionary) = @_; #split up title into words my @words = split(/\s/,$title); my $prob = $bprob; foreach my $word (@words) { $prob *= wordprob($word, $bag, $dictionary, $k); } return $prob; } sub prompt { printf("movie %.4f book %.4f song %.4f \n", $movieprob, $bookpro +b, $songprob); printf("There are %d words in the dictionary\n", $dictionary->size +()); print "Enter a title. (Enter 'q' to quit) :"; } ###################################################################### +######## #### Data structures package bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } # Bag sub new { my $class = shift; my $title = shift; my $self = {}; bless $self, $class; if ( defined $title ) { addItem ($title);} return $self; } # Bag Ops sub addItem { # addItem ('Title') # Accept a title, and put words into bag my $self = shift; my $item = shift; $self->{TITLECOUNT} +=1; #chomp it chomp($item); #clear punctuation $item =~ s/[-:?.]|\// /g; my @wordcollection = split /\s+/,$item; foreach my $word (@wordcollection) { $self->{BAGSIZE} +=1; $self->{BAG}{uc($word)} +=1; } } sub instanceCount { # instanceCount('Word') # returns the number of times a word occurs in a bag. my $self = shift; my $word = shift; return $self->{BAG}{uc($word)} || 0 ; } sub setSize { # setSize() # Returns the number of unique elements in bag. my $self = shift; return scalar keys (%{$self->{BAG}}); } sub bagSize { # bagSize() # Returns the number of words in bag counting all duplicates. my $self = shift; return $self->{BAGSIZE}; } sub itemCount { #return how many titles were added to the bag. my $self = shift; return $self->{TITLECOUNT}; } 1; ###################################################################### +######## package Dictionary; use Data::Dumper; import bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } # Dictionary sub new { my $class = shift; my $self = {}; bless $self, $class; # bags are @_ if (@_) { $self->addBags(@_); } bless $self,$class; } # Dictionary Ops sub addBags { my $self = shift; my @bags = @_; my %dictionary = (); #sort by set size @bags = sort { $b->setSize()<=> $a->setSize()} @bags; foreach my $bag (@bags) { foreach (keys %{$bag->{BAG}}) { $dictionary{$_} += $bag->{BAG}->{$_}; } #print Dumper(\%dictionary); } $self->{DICTIONARY} = \%dictionary; } sub size { #my $self = shift; return scalar keys %{shift()->{DICTIONARY}}; } 1;
Re: Naive Bayes Classifier Using Laplacian Smoothing
by blakew (Monk) on Nov 17, 2011 at 19:33 UTC
    Very nice. I refreshed my understanding of the model to check your code - the only nit I found is that it seems that "Laplace smoothing" refers specifically to the case where $k=1.

    An interesting extension might be to try out Text::DoubleMetaphone

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: CUFP [id://935480]
Approved by Corion
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others contemplating the Monastery: (14)
As of 2014-11-25 20:51 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    My preferred Perl binaries come from:














    Results (158 votes), past polls