package Text::Plagiarized; $REVISION = '$Id: Plagiarized.pm,v 1.0 2003/07/13 19:15:57 ovid Exp $'; $VERSION = '0.01'; use 5.006; use strict; use warnings; use String::Approx qw/amatch/; use Text::Sentence qw/split_sentences/; sub new { my $class = shift; my $self = bless { original => {}, comparison => {}, matches => [], threshold => 80, } => $class; } sub original { my ($self, $text) = @_; local $_ = $text; $self->{original} = { text => $text, sentences => [split_sentences($text)], }; return $self; } sub comparison { my ($self, $text) = @_; local $_ = $text; $self->{comparison} = { text => $text, sentences => [split_sentences($text)], }; return $self; } my %percentage = map { $_ => 1 } 0 .. 100; # wow. This is a cheap hack sub threshold { my $self = shift; if (@_) { my $num = shift; unless (exists $percentage{$num}) { require Carp; Carp::croak("threshold must be an integer between 0 and 100, inclusive"); } $self->{threshold} = 100 - $num; } $self->{threshold}; } sub analyze { my $self = shift; my @sentences; my $threshold = $self->threshold; foreach my $sentence1 (@{$self->{original}{sentences}}) { foreach my $sentence2 (@{$self->{comparison}{sentences}}) { my ($hash1, $hash2) = _hash($sentence1, $sentence2); if ($hash1 eq $hash2 || amatch($hash1, ["$threshold%"], $hash2)) { push @sentences => [$sentence1 => $sentence2]; last; } } } $self->{matches} = \@sentences; } sub matches { shift->{matches} } sub percent { my $self = shift; my $precision = shift || 0; my $matches = @{$self->matches}; my $sentences = @{$self->{original}{sentences}}; sprintf "%.${precision}f" => ($matches/$sentences) * 100; } # starts to break down if we have more than 26 different words # use Unicode characters? # stop words? # memoize this sub _hash { my @string = map lc $_ => @_; s/[^[:alnum:][:space:]]//g foreach @string; s/[[:space:]]+/ /g foreach @string; my %words; my $letter = 'a'; s/(\S+)/ unless (exists $words{$1}) { $words{$1} = $letter; $letter++; } $words{$1} /eg foreach @string; s/ //g foreach @string; return @string; } 1;