Your skill will accomplish what the force of many cannot — PerlMonks

### Naive Bayes Classifier Using Laplacian Smoothing

by talwyn (Monk)
 on Nov 02, 2011 at 19:34 UTC ( #935480=CUFP: print w/replies, xml ) Need Help??

So I'm following the AI class from Stanford and we just finished a grueling section on probability and how it is used in the real world and how AI can use it to classify things.

Below is a kinda rough implementation extending the in-class example to three classes of input and using un-even training set with the K coefficient set to 7 to smooth it out.

It works for all training data and does appear to learn correctly from new examples given by the user. Let me know what you guys think.

```# Using Info from AI class implement a classifier that learns using Na
+ive Bayes with Laplace
# Smoothing to guess whether a title is a book song or movie.
# Store wrong issues as training data in books.txt, movies.txt, and so
+ngs.txt
use strict;
use Data::Dumper;
import bag;
import Dictionary;

# training Data

# ---- Training data -------------------------------------------------------
# Each "bag" holds word counts for one class of title.
my $bookbag = bag->new();       # direct method call instead of indirect "new bag()"

my $moviebag = bag->new();
$moviebag->addItem("Star Wars Episode IV: A New Hope");
$moviebag->addItem("Star Trek II: The Wrath of Khan");
$moviebag->addItem("I know what you did last summer");
$moviebag->addItem("Pirates of The Carribean II: The Black Pearl");

my $songbag = bag->new();

# The dictionary is the union of the words seen across all bags.
my $dictionary = Dictionary->new( $bookbag, $moviebag, $songbag );

#### Laplace smoothing coefficient (k = 1 is classic Laplace smoothing).
our $k = 7;

our $input = '';

# Prior (base) probability of each class, smoothed by k.
my $bookprob  = base_probability( $k, $bookbag,  $moviebag, $songbag );
my $movieprob = base_probability( $k, $moviebag, $songbag,  $bookbag );
my $songprob  = base_probability( $k, $songbag,  $moviebag, $bookbag );

prompt();
$input = <STDIN>;
chomp $input;

while ( $input ne 'q' ) {

    my %choice;

    # Unnormalized posterior of the title under each class.
    # BUGFIX: the book and song scores previously reused $movieprob as
    # their prior; each class must use its own base probability.
    my $m  = titleprob( $input, $moviebag, $movieprob, $k, $dictionary );
    my $bk = titleprob( $input, $bookbag,  $bookprob,  $k, $dictionary );
    my $s  = titleprob( $input, $songbag,  $songprob,  $k, $dictionary );

    my $total = $m + $bk + $s;
    $choice{MOVIE} = $m  / $total;
    $choice{BOOK}  = $bk / $total;
    $choice{SONG}  = $s  / $total;

    # Pick the most likely class.
    my @sortedchoice = sort { $choice{$b} <=> $choice{$a} } keys %choice;

    printf "\n\nThat title is probably a %s  with a %d %% confidence \n",
        $sortedchoice[0], $choice{ $sortedchoice[0] } * 100;

    # Ask for feedback until we get a recognizable answer.  Replaces the
    # old CHECK:/goto loop; the match is anchored so that "yes" is no
    # longer mistaken for an 's' correction.
    my $check;
    while (1) {
        print "Correct? enter b if it should be a book, m if it should be a movie and s if it should be a song\n";
        $check = <STDIN>;
        chomp $check;
        last if $check =~ m/\A(?:[bms]|y|yes)\z/i;
    }

    if ( $check =~ m/\A[bms]\z/i ) {
        print " Noted correction\n";

        # Add the title to the appropriate bag (replaces the bare-label
        # "switch:" block).
        my $which = lc $check;
        if    ( $which eq 'b' ) { $bookbag->addItem($input)  }
        elsif ( $which eq 'm' ) { $moviebag->addItem($input) }
        else                    { $songbag->addItem($input)  }

        # Re-create the dictionary and recompute the priors from the
        # enlarged training data.
        $dictionary = Dictionary->new( $moviebag, $songbag, $bookbag );

        $bookprob  = base_probability( $k, $bookbag,  $moviebag, $songbag );
        $movieprob = base_probability( $k, $moviebag, $songbag,  $bookbag );
        $songprob  = base_probability( $k, $songbag,  $moviebag, $bookbag );
    }
    else {
        print "Cool!\n";
    }

    prompt();
    $input = <STDIN>;
    chomp $input;
}

######################################################################

# TODO: unimplemented stub.  Intended signature (per the author's note):
#   final_probability( $title, $moviebag, $bookbag, $songbag, $k, $dictionary )
# No caller in this file uses it; it currently returns an empty list.
sub final_probability {

}

# base_probability( $k, @listofbags )
#
# Prior probability of the class represented by the FIRST bag, with
# Laplace smoothing:
#     (count_first + k) / (total_count + k * num_classes)
# Returns a number strictly between 0 and 1.
#
# BUGFIX: the empty prototype "()" is removed -- a sub that takes
# arguments must not carry an empty prototype (it alters parsing and
# would reject the arguments once the sub is predeclared).
sub base_probability {
    my ( $k, @listofbags ) = @_;

    my $total = 0;
    $total += $_->itemCount() for @listofbags;

    # k in the numerator and k * (number of classes) in the denominator
    # keeps an empty bag from producing a zero prior.
    return ( $listofbags[0]->itemCount() + $k )
         / ( $total + $k * @listofbags );
}

# wordprob( $word, $bag, $dictionary, $k )
#
# Laplace-smoothed probability of one word within a bag:
#     (occurrences + k) / (bag size + k * dictionary size)
#
# BUGFIX: the third argument IS the dictionary, but it was unpacked into
# a variable named $prob and then shadowed by a second "my $prob"; the
# body only worked because a file-level $dictionary happened to be in
# scope.  The parameter is now unpacked under its real name.
sub wordprob {
    my ( $word, $bag, $dictionary, $k ) = @_;

    return ( $bag->instanceCount($word) + $k )
         / ( $bag->bagSize() + $k * $dictionary->size() );
}

# titleprob( $title, $bag, $baseprobability, $k, $dictionary )
#
# Naive Bayes score of a title for one class: the class prior multiplied
# by the smoothed probability of each word of the title in that class's
# bag.  The result is unnormalized -- the caller normalizes across the
# three classes.
sub titleprob {
    my ( $title, $bag, $bprob, $k, $dictionary ) = @_;

    # Split on runs of whitespace so tokens match how bag::addItem
    # tokenizes training titles (the old "split / /" produced empty
    # tokens on consecutive spaces).
    my @words = split /\s+/, $title;

    my $prob = $bprob;
    $prob *= wordprob( $_, $bag, $dictionary, $k ) for @words;

    return $prob;
}

# Show the current class priors and dictionary size, then ask the user
# for a title (reads nothing itself; the caller reads STDIN).
sub prompt {
    printf "movie %.4f  book %.4f  song %.4f \n",
        $movieprob, $bookprob, $songprob;
    printf "There are %d words in the dictionary\n", $dictionary->size();
    print "Enter a title. (Enter 'q' to quit) :";
}

######################################################################
+########

#### Data structures

package bag;

BEGIN {
    use strict;

    $| = 1;    # unbuffered screen output.
}

## Bag: a word-count multiset plus a count of the titles added to it.

# bag->new( [$title] )
# Construct an empty bag; optionally seed it with one title.
sub new {
    my $class = shift;
    my $title = shift;

    my $self = {};
    bless $self, $class;

    # BUGFIX: addItem was called as a plain sub ("addItem($title)"), so
    # the title was treated as the invocant and its words were never
    # stored.  It must be invoked as a method on the new object.
    $self->addItem($title) if defined $title;

    return $self;
}
### Ops

# addItem( $title )
# Tokenize a title and add its words (uppercased) to the bag.
# NOTE(review): the "sub addItem {" header was lost in the original
# posting; it is restored here so the package compiles.
sub addItem {
    my $self = shift;
    my $item = shift;

    $self->{TITLECOUNT} += 1;
    chomp $item;

    # Strip punctuation that would otherwise glue words together.
    $item =~ s{[-:?.]|/}{ }g;

    for my $word ( split /\s+/, $item ) {
        $self->{BAGSIZE} += 1;
        $self->{BAG}{ uc $word } += 1;
    }
}

# instanceCount( $word )
# Number of times a word (case-insensitive) occurs in the bag.
# BUGFIX: returns 0 rather than undef for unseen words, avoiding
# uninitialized-value warnings in wordprob().
sub instanceCount {
    my ( $self, $word ) = @_;
    return $self->{BAG}{ uc $word } || 0;
}

# setSize()
# Number of unique words in the bag.
sub setSize {
    my $self = shift;
    return scalar keys %{ $self->{BAG} };
}

# bagSize()
# Total number of words in the bag, counting duplicates (0 when empty).
sub bagSize {
    my $self = shift;
    return $self->{BAGSIZE} || 0;
}

# itemCount()
# Number of titles added to the bag (0 when empty).
sub itemCount {
    my $self = shift;
    return $self->{TITLECOUNT} || 0;
}

1;
######################################################################
+########
package Dictionary;
use Data::Dumper;

BEGIN {
    use strict;

    $| = 1;    # unbuffered screen output.
}

## Dictionary: the union of the word sets of one or more bags.

# Dictionary->new( @bags )
# Build the dictionary from the given bags and return the object.
# NOTE(review): the original posting lost the header of the sub that
# merged the bags, leaving new() a stub that never stored anything; the
# merge logic is folded into the constructor here, which matches how the
# main script uses the class.  The runtime "import bag;" line was
# removed -- it was a no-op at best (bag defines no import method).
sub new {
    my ( $class, @bags ) = @_;

    my $self = bless {}, $class;

    my %dictionary;

    # Merge larger word sets first so a later (smaller) bag does not
    # clobber entries already collected -- same order as the original.
    @bags = sort { $b->setSize() <=> $a->setSize() } @bags;

    foreach my $bag (@bags) {
        %dictionary = ( %{ $bag->{BAG} }, %dictionary );
    }

    $self->{DICTIONARY} = \%dictionary;

    return $self;
}

# size()
# Number of unique words across all merged bags.
sub size {
    # Count keys directly through the reference instead of copying the
    # whole hash into a lexical (see BrowserUk's note in the thread --
    # the copy made the sort comparator above orders of magnitude slower).
    return scalar keys %{ $_[0]->{DICTIONARY} };
}

1;

Replies are listed 'Best First'.
Re: Naive Bayes Classifier Using Laplacian Smoothing
by BrowserUk (Pope) on Nov 02, 2011 at 19:49 UTC
Let me know what you guys think.

The first thing that leapt off the page for me was:

```sub size(){
# setSize()
# Returns the number of unique elements in bag.
my \$self = shift;
my %dictionary = %{\$self->{DICTIONARY}};
# remove empty string key created by concat operation
#delete  \$self->{DICTIONARY}{''};

return scalar keys (%dictionary);

}

Copying an entire hash from a reference into a local hash just to return how many keys it has is insanely wasteful. Especially as you are calling this over and over again in a sort comparator.

Coded as:

```sub size {
# setSize()
# Returns the number of unique elements in bag.
return scalar keys %{ shift() };
}

Will speed up your sort by orders of magnitude.

Also, you should not be using -- and would be getting a messages telling you so if you were using strict & warnings -- an empty prototype sub size() { on a subroutine that takes parameters.

With the rise and rise of 'Social' network sites: 'Computers are making people easier to use everyday'
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.

As I said, this is a rough see-how/if-it-works implementation. That said, I do appreciate the input — I was having lots of trouble with references, and making a 'real' hash was the only thing that was working for me.

I'll see if I can get your revision to work for me. I did not notice any hit at N < = 200 words in the dictionary so its probably something I'll run into when I implement persistence and actually save the learned data to become the next round of training data.

A little cleanup ...
by pablopelos (Initiate) on Nov 04, 2011 at 06:09 UTC

A little code cleanup goes a long way. When you were adding to your dictionary it wasn't adding the total count of each word; it was just overriding it with the new count. I put in the quicker way to calculate the dictionary size. Also, the word count on a bag was not returning zero when the word did not occur in that bag, leading to uninitialized-value errors.

```#!/usr/local/bin/perl -w

# Using Info from AI class implement a classifier that learns using Na
+ive Bayes with Laplace
# Smoothing to guess whether a title is a book song or movie.
# Store wrong issues as training data in books.txt, movies.txt, and so
+ngs.txt
use strict;
use Data::Dumper;
import bag;
import Dictionary;

# training Data

# ---- Training data -------------------------------------------------------
# Each "bag" holds word counts for one class of title.
my $bookbag = bag->new();       # direct method call instead of indirect "new bag()"

my $moviebag = bag->new();
$moviebag->addItem("Star Wars Episode IV: A New Hope");
$moviebag->addItem("Star Trek II: The Wrath of Khan");
$moviebag->addItem("I know what you did last summer");
$moviebag->addItem("Pirates of The Carribean II: The Black Pearl");

my $songbag = bag->new();

# The dictionary is the union of the words seen across all bags.
my $dictionary = Dictionary->new( $bookbag, $moviebag, $songbag );

#### Laplace smoothing coefficient (k = 1 is classic Laplace smoothing).
our $k = 7;

our $input = '';

# Prior (base) probability of each class, smoothed by k.
my $bookprob  = base_probability( $k, $bookbag,  $moviebag, $songbag );
my $movieprob = base_probability( $k, $moviebag, $songbag,  $bookbag );
my $songprob  = base_probability( $k, $songbag,  $moviebag, $bookbag );

prompt();
$input = <STDIN>;
chomp $input;

while ( $input ne 'q' ) {

    my %choice;

    # Unnormalized posterior of the title under each class.
    # BUGFIX (still present in this cleaned-up version): the book and
    # song scores reused $movieprob as their prior; each class must use
    # its own base probability.
    my $m  = titleprob( $input, $moviebag, $movieprob, $k, $dictionary );
    my $bk = titleprob( $input, $bookbag,  $bookprob,  $k, $dictionary );
    my $s  = titleprob( $input, $songbag,  $songprob,  $k, $dictionary );

    my $total = $m + $bk + $s;
    $choice{MOVIE} = $m  / $total;
    $choice{BOOK}  = $bk / $total;
    $choice{SONG}  = $s  / $total;

    # Pick the most likely class.
    my @sortedchoice = sort { $choice{$b} <=> $choice{$a} } keys %choice;

    printf "\n\nThat title is probably a %s  with a %d %% confidence \n",
        $sortedchoice[0], $choice{ $sortedchoice[0] } * 100;

    # Ask for feedback until we get a recognizable answer.  Replaces the
    # old CHECK:/goto loop; the match is anchored so that "yes" is no
    # longer mistaken for an 's' correction.
    my $check;
    while (1) {
        print "Correct? (y/yes)  \nif not enter b if it should be a book,\n\t     m if it should be a movie and\n\t     s if it should be a song\n";
        $check = <STDIN>;
        chomp $check;
        last if $check =~ m/\A(?:[bms]|y|yes)\z/i;
    }

    if ( $check =~ m/\A[bms]\z/i ) {
        print " Noted correction\n";

        # Add the title to the appropriate bag (replaces the bare-label
        # "switch:" block).
        my $which = lc $check;
        if    ( $which eq 'b' ) { $bookbag->addItem($input)  }
        elsif ( $which eq 'm' ) { $moviebag->addItem($input) }
        else                    { $songbag->addItem($input)  }

        # Re-create the dictionary and recompute the priors from the
        # enlarged training data.
        $dictionary = Dictionary->new( $moviebag, $songbag, $bookbag );

        $bookprob  = base_probability( $k, $bookbag,  $moviebag, $songbag );
        $movieprob = base_probability( $k, $moviebag, $songbag,  $bookbag );
        $songprob  = base_probability( $k, $songbag,  $moviebag, $bookbag );
    }
    else {
        print "Cool!\n";
    }

    prompt();
    $input = <STDIN>;
    chomp $input;
}

######################################################################

# TODO: unimplemented stub.  Intended signature (per the author's note):
#   final_probability( $title, $moviebag, $bookbag, $songbag, $k, $dictionary )
# No caller in this file uses it; it currently returns an empty list.
sub final_probability {

}

# Prior probability of the class represented by the FIRST bag, with
# Laplace smoothing across all classes:
#     (count_first + k) / (total_count + k * num_classes)
# Returns a value strictly between 0 and 1.
sub base_probability {
    my $smooth = shift;
    my @bags   = @_;

    my $grand_total = 0;
    $grand_total += $_->itemCount() for @bags;

    my $first_count = $bags[0]->itemCount();

    return ( $first_count + $smooth )
         / ( $grand_total + $smooth * scalar @bags );
}

# Laplace-smoothed probability of one word within a bag:
#     (occurrences + k) / (bag size + k * dictionary size)
sub wordprob {
    my ( $word, $bag, $dictionary, $k ) = @_;

    my $numerator   = $bag->instanceCount($word) + $k;
    my $denominator = $bag->bagSize() + $k * $dictionary->size();

    return $numerator / $denominator;
}

# Naive Bayes score of a title for one class: the class prior multiplied
# by the smoothed probability of each of its words in that class's bag.
# Unnormalized -- the caller normalizes across classes.
sub titleprob {
    my ( $title, $bag, $bprob, $k, $dictionary ) = @_;

    my $score = $bprob;
    for my $word ( split /\s/, $title ) {
        $score *= wordprob( $word, $bag, $dictionary, $k );
    }

    return $score;
}

# Show the current class priors and dictionary size, then ask the user
# for a title (reads nothing itself; the caller reads STDIN).
sub prompt {
    printf( "movie %.4f  book %.4f  song %.4f \n",
        $movieprob, $bookprob, $songprob );
    printf( "There are %d words in the dictionary\n", $dictionary->size() );
    print "Enter a title. (Enter 'q' to quit) :";
}

######################################################################
+########

#### Data structures

package bag;

BEGIN {
    use strict;

    $| = 1;    # unbuffered screen output.
}

## Bag: a word-count multiset plus a count of the titles added to it.

# bag->new( [$title] )
# Construct an empty bag; optionally seed it with one title.
sub new {
    my $class = shift;
    my $title = shift;

    my $self = {};
    bless $self, $class;

    # BUGFIX: addItem was called as a plain sub ("addItem($title)"), so
    # the title was treated as the invocant and its words were never
    # stored.  It must be invoked as a method on the new object.
    $self->addItem($title) if defined $title;

    return $self;
}

# Bag Ops

# addItem( $title )
# Tokenize a title and add its words (uppercased) to the bag.
# NOTE(review): the "sub addItem {" header was lost in the original
# posting; it is restored here so the package compiles.
sub addItem {
    my $self = shift;
    my $item = shift;

    $self->{TITLECOUNT} += 1;
    chomp $item;

    # Strip punctuation that would otherwise glue words together.
    $item =~ s{[-:?.]|/}{ }g;

    for my $word ( split /\s+/, $item ) {
        $self->{BAGSIZE} += 1;
        $self->{BAG}{ uc $word } += 1;
    }
}

# instanceCount( $word )
# Number of times a word (case-insensitive) occurs in the bag; 0 when
# the word was never seen (avoids uninitialized-value warnings).
sub instanceCount {
    my ( $self, $word ) = @_;
    return $self->{BAG}{ uc $word } || 0;
}

# setSize()
# Number of unique words in the bag.
sub setSize {
    my $self = shift;
    return scalar keys %{ $self->{BAG} };
}

# bagSize()
# Total number of words in the bag, counting duplicates (0 when empty).
sub bagSize {
    my $self = shift;
    return $self->{BAGSIZE} || 0;
}

# itemCount()
# Number of titles added to the bag (0 when empty).
sub itemCount {
    my $self = shift;
    return $self->{TITLECOUNT} || 0;
}

1;
######################################################################
+########
package Dictionary;
use Data::Dumper;

BEGIN {
    use strict;

    $| = 1;    # unbuffered screen output.
}

## Dictionary: the union of the word sets of one or more bags, with the
## per-word counts summed across bags (pablopelos's fix).

# Dictionary->new( @bags )
# Build the dictionary from the given bags and return the object.
# NOTE(review): the original posting lost the header of the sub that
# merged the bags, leaving new() a stub that never stored anything; the
# merge logic is folded into the constructor here, which matches how the
# object is used by the main script.  The runtime "import bag;" line was
# removed -- it was a no-op at best (bag defines no import method).
sub new {
    my ( $class, @bags ) = @_;

    my $self = bless {}, $class;

    my %dictionary;

    # Same ordering as the original (largest word set first); harmless
    # now that counts are summed rather than overwritten.
    @bags = sort { $b->setSize() <=> $a->setSize() } @bags;

    foreach my $bag (@bags) {
        # Sum counts across bags instead of letting one bag's count
        # overwrite another's.
        foreach my $word ( keys %{ $bag->{BAG} } ) {
            $dictionary{$word} += $bag->{BAG}{$word};
        }
    }

    $self->{DICTIONARY} = \%dictionary;

    return $self;
}

# size()
# Number of unique words across all merged bags.
sub size {
    # Count keys directly through the reference instead of copying the
    # whole hash (BrowserUk's performance note).
    return scalar keys %{ $_[0]->{DICTIONARY} };
}

1;
Re: Naive Bayes Classifier Using Laplacian Smoothing
by blakew (Monk) on Nov 17, 2011 at 19:33 UTC
Very nice. I refreshed my understanding of the model to check your code - the only nit I found is that it seems that "Laplace smoothing" refers specifically to the case where \$k=1.

An interesting extension might be to try out Text::DoubleMetaphone

Create A New User
Node Status?
node history
Node Type: CUFP [id://935480]
Approved by Corion
help
Chatterbox?
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others exploiting the Monastery: (8)
As of 2017-05-26 07:52 GMT
Sections?
Information?
Find Nodes?
Leftovers?
Voting Booth?
My favorite model of computation is ...

Results (189 votes). Check out past polls.