Beefy Boxes and Bandwidth Generously Provided by pair Networks
Keep It Simple, Stupid
 
PerlMonks  

Naive Bayes Classifier Using Laplacian Smoothing

by talwyn (Monk)
on Nov 02, 2011 at 19:34 UTC ( #935480=CUFP: print w/ replies, xml ) Need Help??

So I'm following the AI class from Stanford and we just finished a grueling section on probability and how it is used in the real world and how AI can use it to classify things.

Below is a kinda rough implementation extending the in-class example to three classes of input and using un-even training set with the K coefficient set to 7 to smooth it out.

It works for all training data and does appear to learn correctly from new examples given by the user. Let me know what you guys think.

# Using Info from AI class implement a classifier that learns using Na +ive Bayes with Laplace # Smoothing to guess whether a title is a book song or movie. # Store wrong issues as training data in books.txt, movies.txt, and so +ngs.txt use strict; use Data::Dumper; import bag; import Dictionary; # training Data my $bookbag = new bag(); $bookbag->addItem("Small Business Kit For Dummies"); $bookbag->addItem("S-Corporation: Small Business Start-Up Kit"); $bookbag->addItem("Home Business Tax Deductions"); $bookbag->addItem("Essential Technical Analysis"); $bookbag->addItem("Technical Analysis of the Markets"); $bookbag->addItem("Introduction to probability and statistics"); $bookbag->addItem("Differential Equations"); $bookbag->addItem("Principles of Physics"); $bookbag->addItem("Perl Testing, A developer's notebook"); $bookbag->addItem("BreakAWAY Careers"); $bookbag->addItem("A Game of Thrones"); $bookbag->addItem("The Cleric Quintet"); $bookbag->addItem("Exile's Song"); $bookbag->addItem("Honor's War"); my $moviebag = new bag(); $moviebag->addItem("Star Wars Episode IV: A New Hope"); $moviebag->addItem("Shrek"); $moviebag->addItem("Rocky"); $moviebag->addItem("SHrek II"); $moviebag->addItem("Rocky II"); $moviebag->addItem("Star Trek"); $moviebag->addItem("Star Trek II: The Wrath of Khan"); $moviebag->addItem("Independence Day"); $moviebag->addItem("I know what you did last summer"); $moviebag->addItem("A Nightmare on Elm Street"); $moviebag->addItem("Transformers"); $moviebag->addItem("Pirates of The Carribean II: The Black Pearl"); $moviebag->addItem("A Cinderella Story"); $moviebag->addItem("The Never Ending Story"); my $songbag = new bag(); $songbag->addItem("Short Skirt /Long Jacket"); $songbag->addItem("My name is Jonas"); $songbag->addItem("Wild Wild Life"); $songbag->addItem("You can call Me Al"); $songbag->addItem("If you only Knew"); $songbag->addItem("Sweet Sacrifice"); $songbag->addItem("Tear Away"); my $dictionary = new Dictionary( $bookbag, $moviebag, $songbag ); #printf " Size of dictionary is :%d \n", $dictionary->size(); #print Dumper $dictionary; #die; #### Laplace Smoothing Co-efficient. our $k = 7; our $input =''; my $bookprob = base_probability( $k, $bookbag, $moviebag, $songb +ag); my $movieprob = base_probability( $k, $moviebag, $songbag, $bookb +ag); my $songprob = base_probability( $k, $songbag, $moviebag, $bookb +ag); prompt(); $input = <STDIN>; chomp ($input); while ( $input ne 'q') { ####DO my %choice = (); # calc Probability $choice{MOVIE}= my $m = titleprob($input, $moviebag, $movieprob, +$k, $dictionary); $choice{BOOK} = my $bk = titleprob($input, $bookbag, $movieprob, $ +k, $dictionary); $choice{SONG} = my $s = titleprob($input, $songbag, $movieprob, $k +, $dictionary); $choice{MOVIE} =$m / ( $m + $bk + $s ); $choice{BOOK} = $bk / ( $m + $bk + $s ); $choice{SONG} = $s / ( $m + $bk + $s ); #select the most likely my @sortedchoice = sort {$choice{$b} <=>$choice{$a} }( keys %choi +ce ); printf "\n\nThat title is probaby a %s with a %d %% confidence \n +", $sortedchoice[0], $choice{$sortedchoice[0]} *100; CHECK: print "Correct? enter b if it should be a book, m if it should be +a movie and s if it should be a song\n"; my $check = <STDIN>; chomp($check); if ($check =~ m/[bms]/i){ print " Noted correction\n"; # add title to appropriate bag switch: { $check eq 'b' && do { $bookbag->addItem($input); last swi +tch;}; $check eq 'm' && do { $moviebag->addItem($input);last swi +tch;}; $check eq 's' && do { $songbag->addItem($input);last swit +ch;}; } # re-create dictionary #print Dumper $bookbag; $dictionary = new Dictionary( $moviebag, $songbag, $bookbag ); #print Dumper $dictionary; #recalculate base probs $bookprob = base_probability( $k, $bookbag, $moviebag, $ +songbag); $movieprob = base_probability( $k, $moviebag, $songbag, $ +bookbag); $songprob = base_probability( $k, $songbag, $moviebag, $ +bookbag); } elsif ($check=~m/yes|y/i){ print "Cool!\n" } else{ goto CHECK; } ####Stuff prompt(); $input = <STDIN>; chomp ($input); last if ( $input eq 'q' ); } ###################################################################### sub final_probability { # ( $title, moviebag, bookbag,songbag, k, dict +ionary } sub base_probability() {# ( $k,@listofbags ) { # return base probability for class represented as the first bag # this is a number between 0 and 1 my $k = shift; my @listofbags = @_; my $total = 0; foreach my $bag (@listofbags) { $total += $bag->itemCount(); } my $probability = ( $listofbags[0]->itemCount() + $k)/ ( $total + +$k * @listofbags ); return $probability; } sub wordprob {#( $word, $bag, $dictionary, $k) { #Give probability of a particular word in a bag my $word = shift; my $bag = shift; my $prob = shift; my $k = shift; my $prob = ($bag->instanceCount($word) + $k )/ ($bag->bagSize() + +$k*$dictionary->size() ); return $prob; } sub titleprob{ #( $title, $bag , $baseprobability, $k, $dictionary){ # Given a base probability and a bag calculate the particular prob + of the title. # my $title = shift; my $bag = shift; my $bprob = shift; my $k = shift; my $dictionary = shift; #split up title into words my @words = split / /, $title; my $prob = $bprob; foreach my $word (@words) { $prob *= wordprob($word, $bag, $dictionary, $k); } return $prob; } sub prompt { printf"movie %.4f book %.4f song %.4f \n", $movieprob, $bookprob +, $songprob; printf "There are %d words in the dictionary\n", $dictionary->size +(); print "Enter a title. (Enter 'q' to quit) :"; } ###################################################################### +######## #### Data structures package bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } ## Bags sub new { my $class = shift; my $title = shift; my $self = {}; bless $self, $class; if ( defined $title ) { addItem ($title);} return $self; } ###Ops sub addItem(){ # addItem ('Title') # Accept a title, and put words into bag my $self = shift; my $item = shift; $self->{TITLECOUNT} +=1; #chomp it chomp($item); #clear punctuation $item =~ s/[-:?.]|\// /g; my @wordcollection = split /\s+/,$item; foreach my $word (@wordcollection) { $self->{BAGSIZE} +=1; $self->{BAG}{uc ($word)} +=1; } } sub instanceCount(){ # instanceCount('Word') # returns the number of times a word occurs in a bag. my $self = shift; my $word = shift; return $self->{BAG}{uc($word)}; } sub setSize(){ # setSize() # Returns the number of unique elements in bag. my $self = shift; return scalar keys (%{$self->{BAG}}); } sub bagSize(){ # bagSize() # Returns the number of words in bag counting all duplicates. my $self = shift; return $self->{BAGSIZE}; } sub itemCount() { #return how many titles were added to the bag. my $self = shift; return $self->{TITLECOUNT}; } 1; ###################################################################### +######## package Dictionary; use Data::Dumper; import bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } sub new () { my $class = shift; my $self = {}; bless $self, $class; my @bags = @_; if (defined @bags) { $self->addBags(@bags); } bless $self,$class; } ## Dictionary sub addBags( @bagName ){ my $self = shift; my @bags = @_; my %dictionary = (); #sort by set size @bags = sort { $b->setSize()<=> $a->setSize()} @bags; foreach my $bag (@bags) { %dictionary = ( %{$bag->{BAG}}, %dictionary); #print Dumper %dictionary; } $self->{DICTIONARY} = \%dictionary; } sub size(){ # setSize() # Returns the number of unique elements in bag. my $self = shift; my %dictionary = %{$self->{DICTIONARY}}; # remove empty string key created by concat operation #delete $self->{DICTIONARY}{''}; return scalar keys (%dictionary); } 1;

Comment on Naive Bayes Classifier Using Laplacian Smoothing
Download Code
Re: Naive Bayes Classifier Using Laplacian Smoothing
by BrowserUk (Pope) on Nov 02, 2011 at 19:49 UTC
    Let me know what you guys think.

    The first thing that leapt of the page for me was:

    sub size(){ # setSize() # Returns the number of unique elements in bag. my $self = shift; my %dictionary = %{$self->{DICTIONARY}}; # remove empty string key created by concat operation #delete $self->{DICTIONARY}{''}; return scalar keys (%dictionary); }

    Copying an entire hash from a reference into a local hash just to return how many keys it has is insanely wasteful. Especially as you are calling this over and over again in a sort comparator.

    Coded as:

    sub size { # setSize() # Returns the number of unique elements in bag. return scalar keys %{ shift() }; }

    Will speed up your sort by orders of magnitude.

    Also, you should not be using -- and would be getting a messages telling you so if you were using strict & warnings -- an empty prototype sub size() { on a subroutine that takes parameters.


    With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
    Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
    "Science is about questioning the status quo. Questioning authority".
    In the absence of evidence, opinion is indistinguishable from prejudice.

      As I said this is a rough see-how /if it works implementation. That said. I do appreciate the input I was having lots of trouble with references. Making a 'real' hash was only thing that was working for me.

      I'll see if I can get your revision to work for me. I did not notice any hit at N < = 200 words in the dictionary so its probably something I'll run into when I implement persistence and actually save the learned data to become the next round of training data.

A little cleanup ...
by pablopelos (Initiate) on Nov 04, 2011 at 06:09 UTC

    A little code cleanup goes a long way. When your were adding to your dictionary it wasn't adding the total count of each word it was just over-riding with the new count. Put in the quicker way to calculate the dictionary size. Also the word count on a bag was not returning a zero if the word did not occur in that bag leading to uninitialized errors.

    #!/usr/local/bin/perl -w # Using Info from AI class implement a classifier that learns using Na +ive Bayes with Laplace # Smoothing to guess whether a title is a book song or movie. # Store wrong issues as training data in books.txt, movies.txt, and so +ngs.txt use strict; use Data::Dumper; import bag; import Dictionary; # training Data my $bookbag = new bag(); $bookbag->addItem("Small Business Kit For Fools"); $bookbag->addItem("Small Business Kit For Dummies"); $bookbag->addItem("S-Corporation: Small Business Start-Up Kit"); $bookbag->addItem("Home Business Tax Deductions"); $bookbag->addItem("Essential Technical Analysis"); $bookbag->addItem("Technical Analysis of the Markets"); $bookbag->addItem("Introduction to probability and statistics"); $bookbag->addItem("Differential Equations"); $bookbag->addItem("Principles of Physics"); $bookbag->addItem("Perl Testing, A developer's notebook"); $bookbag->addItem("BreakAWAY Careers"); $bookbag->addItem("A Game of Thrones"); $bookbag->addItem("The Cleric Quintet"); $bookbag->addItem("Exile's Song"); $bookbag->addItem("Honor's War"); my $moviebag = new bag(); $moviebag->addItem("Star Wars Episode IV: A New Hope"); $moviebag->addItem("Shrek"); $moviebag->addItem("Rocky"); $moviebag->addItem("Shrek II"); $moviebag->addItem("Rocky II"); $moviebag->addItem("Star Trek"); $moviebag->addItem("Star Trek II: The Wrath of Khan"); $moviebag->addItem("Independence Day"); $moviebag->addItem("I know what you did last summer"); $moviebag->addItem("A Nightmare on Elm Street"); $moviebag->addItem("Transformers"); $moviebag->addItem("Pirates of The Carribean II: The Black Pearl"); $moviebag->addItem("A Cinderella Story"); $moviebag->addItem("The Never Ending Story"); my $songbag = new bag(); $songbag->addItem("Short Skirt /Long Jacket"); $songbag->addItem("My name is Jonas"); $songbag->addItem("Wild Wild Life"); $songbag->addItem("You can call Me Al"); $songbag->addItem("If you only Knew"); $songbag->addItem("Sweet Sacrifice"); $songbag->addItem("Tear Away"); $songbag->addItem("Taking Care of Business Business"); my $dictionary = new Dictionary( $bookbag, $moviebag, $songbag ); #printf " Size of dictionary is :%d \n", $dictionary->size(); #print Dumper $dictionary; #die; #### Laplace Smoothing Co-efficient. our $k = 7; our $input =''; my $bookprob = base_probability( $k, $bookbag, $moviebag, $songb +ag); my $movieprob = base_probability( $k, $moviebag, $songbag, $bookb +ag); my $songprob = base_probability( $k, $songbag, $moviebag, $bookb +ag); prompt(); $input = <STDIN>; chomp ($input); while ( $input ne 'q') { ####DO my %choice = (); # calc Probability my $m = titleprob($input, $moviebag, $movieprob, $k, $dictionary) +; my $bk = titleprob($input, $bookbag, $movieprob, $k, $dictionary); my $s = titleprob($input, $songbag, $movieprob, $k, $dictionary); $choice{MOVIE} = $m / ( $m + $bk + $s ); $choice{BOOK} = $bk / ( $m + $bk + $s ); $choice{SONG} = $s / ( $m + $bk + $s ); #select the most likely my @sortedchoice = sort {$choice{$b} <=> $choice{$a} }( keys %cho +ice ); printf "\n\nThat title is probaby a %s with a %d %% confidence \n +", $sortedchoice[0], $choice{$sortedchoice[0]} *100; CHECK: print "Correct? (y/yes) \nif not enter b if it should be a book,\ +n\t m if it should be a movie and\n\t s if it should be a son +g\n"; my $check = <STDIN>; chomp($check); if ($check =~ m/[bms]/i){ print " Noted correction\n"; # add title to appropriate bag switch: { $check eq 'b' && do { $bookbag->addItem($input); last swi +tch;}; $check eq 'm' && do { $moviebag->addItem($input);last swi +tch;}; $check eq 's' && do { $songbag->addItem($input);last swit +ch;}; } # re-create dictionary #print Dumper $bookbag; $dictionary = new Dictionary( $moviebag, $songbag, $bookbag ); #print Dumper $dictionary; #recalculate base probs $bookprob = base_probability( $k, $bookbag, $moviebag, $ +songbag); $movieprob = base_probability( $k, $moviebag, $songbag, $ +bookbag); $songprob = base_probability( $k, $songbag, $moviebag, $ +bookbag); } elsif ($check =~ m/yes|y/i){ print "Cool!\n" } else{ goto CHECK; } ####Stuff prompt(); $input = <STDIN>; chomp ($input); last if ( $input eq 'q' ); } ###################################################################### sub final_probability { # ( $title, moviebag, bookbag,songbag, k, dict +ionary } sub base_probability { # return base probability for class represented as the first bag # this is a number between 0 and 1 my ($k, @listofbags) = @_; my $total = 0; foreach my $bag (@listofbags) { $total += $bag->itemCount(); } my $probability = ( $listofbags[0]->itemCount() + $k)/ ( $total + +$k * @listofbags ); return $probability; } sub wordprob { # Give probability of a particular word in a bag my ($word, $bag, $dictionary, $k) = @_; my $prob = ($bag->instanceCount($word) + $k ) / ($bag->bagSize() + + $k*$dictionary->size() ); return $prob; } sub titleprob { # Given a base probability and a bag calculate the particular prob + of the title. my ($title, $bag, $bprob, $k, $dictionary) = @_; #split up title into words my @words = split(/\s/,$title); my $prob = $bprob; foreach my $word (@words) { $prob *= wordprob($word, $bag, $dictionary, $k); } return $prob; } sub prompt { printf("movie %.4f book %.4f song %.4f \n", $movieprob, $bookpro +b, $songprob); printf("There are %d words in the dictionary\n", $dictionary->size +()); print "Enter a title. (Enter 'q' to quit) :"; } ###################################################################### +######## #### Data structures package bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } # Bag sub new { my $class = shift; my $title = shift; my $self = {}; bless $self, $class; if ( defined $title ) { addItem ($title);} return $self; } # Bag Ops sub addItem { # addItem ('Title') # Accept a title, and put words into bag my $self = shift; my $item = shift; $self->{TITLECOUNT} +=1; #chomp it chomp($item); #clear punctuation $item =~ s/[-:?.]|\// /g; my @wordcollection = split /\s+/,$item; foreach my $word (@wordcollection) { $self->{BAGSIZE} +=1; $self->{BAG}{uc($word)} +=1; } } sub instanceCount { # instanceCount('Word') # returns the number of times a word occurs in a bag. my $self = shift; my $word = shift; return $self->{BAG}{uc($word)} || 0 ; } sub setSize { # setSize() # Returns the number of unique elements in bag. my $self = shift; return scalar keys (%{$self->{BAG}}); } sub bagSize { # bagSize() # Returns the number of words in bag counting all duplicates. my $self = shift; return $self->{BAGSIZE}; } sub itemCount { #return how many titles were added to the bag. my $self = shift; return $self->{TITLECOUNT}; } 1; ###################################################################### +######## package Dictionary; use Data::Dumper; import bag; BEGIN{ use strict; $| = 1; #unbuffered screen output. } # Dictionary sub new { my $class = shift; my $self = {}; bless $self, $class; # bags are @_ if (@_) { $self->addBags(@_); } bless $self,$class; } # Dictionary Ops sub addBags { my $self = shift; my @bags = @_; my %dictionary = (); #sort by set size @bags = sort { $b->setSize()<=> $a->setSize()} @bags; foreach my $bag (@bags) { foreach (keys %{$bag->{BAG}}) { $dictionary{$_} += $bag->{BAG}->{$_}; } #print Dumper(\%dictionary); } $self->{DICTIONARY} = \%dictionary; } sub size { #my $self = shift; return scalar keys %{shift()->{DICTIONARY}}; } 1;
Re: Naive Bayes Classifier Using Laplacian Smoothing
by blakew (Monk) on Nov 17, 2011 at 19:33 UTC
    Very nice. I refreshed my understanding of the model to check your code - the only nit I found is that it seems that "Laplace smoothing" refers specifically to the case where $k=1.

    An interesting extension might be to try out Text::DoubleMetaphone

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: CUFP [id://935480]
Approved by Corion
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others chanting in the Monastery: (6)
As of 2014-08-23 21:27 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The best computer themed movie is:











    Results (178 votes), past polls