#!/usr/bin/perl
######################################################################
+##########################
# QUINLANS ALGORITHM FOR DECISION TREE
# Author : Sandhya Manickavasagam email- smanicak@syr.edu
######################################################################
+##########################
use DBI;
use Tree::DAG_Node;
use AI::DecisionTree;
use Graph::Easy;
$table ="income";
my $file_to_append = "C:\\Documents and Settings\\smanicka\\Desktop\\q
+uinlans\\output_quinlans.txt";
open (MYFILE,">$file_to_append")or warn "Can't open file to append";
print MYFILE "\t\t\tCSE 787 - ANALYTICAL DATA MINING - PROJECT 2\n====
+=====================================================================
+=======\n\n";
print MYFILE "\n\n\t\t\tQUINLAN'S DECISION TREE ALGORITHM\n-----------
+---------------------------------------------------------------------
+-\n";
print MYFILE "\n\t\t\tName :\t Sandhya Manickavasagam\n\t\t\tSU-ID:660
+185882\n=============================================================
+===================\n\n";
my $dtree = new AI::DecisionTree(noise_mode=>'pick_best');
$dbh = DBI->connect('dbi:ODBC:driver=microsoft access driver (*.mdb);d
+bq=C:\Documents and Settings\smanicka\Desktop\quinlans\sample.mdb') o
+r warn("Sorry,Cant connect to the table\n$DBI::errstr \n") or warn "
+Cannot connect to database";
my %data;
$query = qq(select count(*) from $table);
$sth = $dbh->prepare($query) or print "cannot prepare";
$sth ->execute();
$count_row = $sth->fetchrow_array();
my $query = "select * from $table";
print MYFILE "\nExecuting $query \n";
$sth = $dbh->prepare($query) or print "cannot prepare";
$sth ->execute();
while (@rows=$sth->fetchrow_array()) {
$count_col =0;
foreach $row (@rows){
# print "$row,";
$count_col ++;
}
# print "\n";
}
print MYFILE "There are $count_row rows of data in this table\n";
print MYFILE "\nThere are $sth->{NUM_OF_FIELDS} columns\n\nAssuming th
+at the first column is the transaction number and hence not a part of
+ the data used for mining : \n\nThe columns under consideration are :
+\n\n\t$sth->{NAME}->[1]\n\t$sth->{NAME}->[2]\n\t$sth->{NAME}->[3]\n\t
+$sth->{NAME}->[4]\n\t$sth->{NAME}->[5]\n";
for (my $i=1;$i<$count_col;$i++){
push (@columns_list,"$sth->{NAME}->[$i]");
}
print MYFILE "\nThe Distinct entries are : \n ";
foreach $c (@columns_list){
print MYFILE "\n$c\n================\n";
$query = "select distinct $c from $table";
#print "$query\n";
$sth = $dbh->prepare($query) or print "cannot prepare";
$sth ->execute();
while (@rows = $sth->fetchrow_array()) {
foreach $row (@rows){
print MYFILE "$row\n";
push((@{"$c"}),$row);
}
}
}
print MYFILE "\n";
foreach $c (@columns_list){
foreach $k ( @{"$c"}) {
#print "$k \n";
$occ = 0;
$query = qq(select $c from $table where $c = '$k');
$stmnt = qq(select $c from $table where $c = $k);
$sth = $dbh->prepare($query) or print "cannot prepare\n";
$sth ->execute() or (($sth = $dbh->prepare($stmnt) or print "cannot pr
+epare") and $sth ->execute());
while (@rows = $sth->fetchrow_array()) {
$occ++;
}
print MYFILE "$k occurs $occ times\n";
$ratio = $occ/$count_row;
#${$c}{"$k"} = $ratio ;
$data{"$k"} = $ratio;
}
print MYFILE "\n";
}
# print "\n";
#foreach $c (@columns_list){
#print "\n$c\n";
#foreach $r (@{"$c"}){
# print "$r \t";
#}
#}
print MYFILE "\nStoring the entries into a hash table\n---------------
+---------------------------------------\n";
while (($key, $value) = each(%data)){
print MYFILE $key.", ".$value."\n";
}
# Calculate ENTROPY
sub log2 {
my $n = $_;
return log($n)/log(2);
}
#gain of each column should be calculated and column with max gain sho
+uld become root
#my $root = Tree::DAG_Node->new();
#$root->name("Outlook");
#$new_daughter = Tree::DAG_Node->new();
# $new_daughter->name("");
# $root->add_daughter($new_daughter);
# A set of training data for deciding whether to play
$query = qq(select * from $table);
print MYFILE "\nExecuting $query \n";
$sth = $dbh->prepare($query) or print "cannot prepare";
$sth ->execute();
while(@rows = $sth->fetchrow_array()){
#if($rows[4] eq 1){
#$rows[4] = "yes";
#}
#else{
#$rows[4] = "no";
#}
foreach $r (@rows){
print "$r \t\t";
}
print "\n";
#}
######################################################################
+##########
# For the Most part, the entire program will work for any given data.I
+f a new #
# table needs to be used, please change the mappings below to reflect
+the new #
# table values.
+ #
######################################################################
+##########
$dtree->add_instance
(attributes=> {Income=> qq($rows[1]),
Student=> qq ($rows[2]),
Credit_Rating=> qq($rows[3])
},
result => qq($rows[4]));
}
$dtree ->train();
@rules = $dtree->rule_statements();
print MYFILE "\n Mining the Data \n===================================
+=======\n";
$i =0;
print MYFILE "\n\nDecision Rules \n===================================
+=============\n";
foreach $rule (@rules){
print MYFILE "$rule \n";
$i = $i + 1;
@{"arr".$i} = split (/\s+/,$rule);
foreach $a (@{"arr".$i}){
#print "$a \t";
if ($a eq "if") {
next;
}
if ($a eq "and"){
last;
}
else{
if ($a =~ m/=/){
@ar = split (/=/,$a);
$root_of_tree = "<root>$ar[0]</root> \n";
}
}
}
}
print MYFILE "\n\n DECISION TREE |N===================================
+====\n\n$root_of_tree\n";
print "\n$root_of_tree\n";
@rules = $dtree->rule_statements();
$k = 0;
foreach $rule (@rules){
$k ++;
}
foreach $rules (@rules){
@rule_arr = split (/\s+/,$rules);
for($t =0;$t< $#rule_arr;$t++){
my @node;
if ($rule_arr[$t] =~m/=/){
@leaf = split (/\=/,$rule_arr[$t]);
push(@node," <branch node = \"$leaf[0]\" attribute =\"$leaf[1]\" \/>")
+;
}
if ($rule_arr[$t] =~ m/\-\>/){
print "\n <Class value>\"$rule_arr[$t+1]\"</Class value>\n\n";
print MYFILE "\n <Class value>\"$rule_arr[$t+1]\"</Class value>\n\n";
}
foreach $n (@node){
print "$n \n";
print MYFILE "$n \n";
}
}
}
#if ($rule_arr[$t] =~ m/\=/){
# @leaf = split (/\=/,$rule_arr);
# print "Leaf node : $leaf[0]\n";
# }
#}
#}
1 high no fair No
2 high no excellent No
3 high no fair Yes
4 medium no fair Yes
5 low yes fair Yes
6 low yes excellent No
7 low yes excellent Yes
8 medium no fair No
9 low yes fair Yes
10 medium yes fair Yes
11 medium yes excellent Yes
12 medium no excellent Yes
13 high yes fair Yes
14 medium no excellent No
|