use 5.026; use strict; use warnings; use Benchmark qw/cmpthese/; use Devel::Size qw/size total_size/; use Set::Similarity::Jaccard; my @all = qw< alpha bravo charlie delta echo foxtrot >; my %mask; $mask{ $all[$_] } = 2 << $_ for 0..$#all; my %first; $first{$_} = substr $_, 0, 1 for @all; # For set_to_string my @set1 = qw< alpha charlie delta >; my @set2 = qw< alpha delta echo foxtrot >; # Convert a list to bits. e.g. qw : 0b101011 sub set_to_bits { my $bits = 0; $bits |= $mask{$_} for @_; $bits; } # Convert a list to a string representation. Note we can't just work with the # full keywords here, as Set::Similarity::Jaccard would look at the keywords # character-wise, which would give an incorrect result. # e.g.: qw : 'ace' sub set_to_string { join '', map { $first{$_} } @_; } # Compute Jaccard similarity of two sets, using hashes only sub jaccard_hash { my ($set1, $set2) = @_; my %set1 = map { $_ => 1 } @$set1; my %set2 = map { $_ => 1 } @$set2; my $int_count = 0; $int_count += 1 for grep { $set1{$_} && $set2{$_} } keys %set1; my $uni_count = 0; $uni_count += 1 for grep { $set1{$_} || $set2{$_} } @all; $uni_count ? $int_count/$uni_count : 1; } # Compute Jaccard similarity of two sets by converting to bits first (inlined) sub jaccard_bits { my ($set1, $set2) = @_; my ($bits1, $bits2) = (0,0); # Inline set_to_bits for speed $bits1 |= $mask{$_} for @$set1; $bits2 |= $mask{$_} for @$set2; # Calculate intersection and union my $int = $bits1 & $bits2; my $uni = $bits1 | $bits2; # Compute Hamming weights of $int and $uni to get # of bits set (inlined) my $ic = $int; $ic -= (($ic >> 1) & 0x55555555); $ic = ($ic & 0x33333333) + (($ic >> 2) & 0x33333333); $ic = ((($ic + ($ic >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24; my $uc = $uni; $uc -= (($uc >> 1) & 0x55555555); $uc = ($uc & 0x33333333) + (($uc >> 2) & 0x33333333); $uc = ((($uc + ($uc >> 4)) & 0x0f0f0f0f) * 0x01010101) >> 24; $uc ? $ic/$uc : 1; } # Verification my $jac = Set::Similarity::Jaccard->new; my $sim = $jac->similarity(set_to_string(@set1), set_to_string(@set2)); say "CPAN Jaccard: $sim"; say "Bits Jaccard: " . jaccard_bits(\@set1, \@set2); say "Hash Jaccard: " . jaccard_hash(\@set1, \@set2); printf "\nSizes:\n"; printf " %20s: %4d bytes, %4d bytes total\n", $_, eval "size($_)", eval "total_size($_)" for qw< \@set1 set_to_bits(@set1) set_to_string(@set1) >; printf "\nPerformance:\n"; my $str1 = set_to_string(@set1); my $str2 = set_to_string(@set2); my $r; cmpthese(-5, { 'similarity only' => sub { $r = Set::Similarity::Jaccard->new->similarity($str1, $str2) }, '$jac->similarity' => sub { $r = Set::Similarity::Jaccard->new->similarity(set_to_string(@set1), set_to_string(@set2)) }, 'jaccard_bits' => sub { $r = jaccard_bits(\@set1, \@set2) }, 'jaccard_hash' => sub { $r = jaccard_hash(\@set1, \@set2) }, });