Re: Comparison by position and value

For some reason I'm really enjoying this problem.

I've benchmarked all of the promising-looking solutions here. If you don't see yours and you think it's a contender, let me know, and ideally post your benchmark code and results as a followup to this (or stick it in a scratchpad and /msg me and I'll add it).

Some of mine use Inline::C; if you don't have it and don't want it, just comment out the multi-line use statement and the Init statement, and remove the benchmarks for sgifford_csimple3 and sgifford_cclever3.

Results (slightly doctored^* for better display):

           ambrus:  3s ( 3.19 usr + 0.01 sys = 3.20 CPU) @ 7812.50/s 
        aristotle:  7s ( 5.71 usr + 0.02 sys = 5.73 CPU) @ 4363.00/s 
       aristotle2:  5s ( 4.56 usr + 0.00 sys = 4.56 CPU) @ 5482.46/s 
              ccn:  2s ( 1.74 usr + 0.00 sys = 1.74 CPU) @ 14367.82/s 
sgifford_cclever3:  1s ( 0.96 usr + 0.00 sys = 0.96 CPU) @ 26041.67/s 
 sgifford_clever2:  9s ( 8.56 usr + 0.05 sys = 8.61 CPU) @ 2903.60/s 
 sgifford_clever3:  2s ( 1.19 usr + 0.01 sys = 1.20 CPU) @ 20833.33/s 
sgifford_csimple3:  1s ( 1.04 usr + 0.00 sys = 1.04 CPU) @ 24038.46/s 
          simple3:  5s ( 4.23 usr + 0.01 sys = 4.24 CPU) @ 5896.23/s
[download]

Code follows.

#!/usr/bin/perl

use Benchmark;
use Inline C => 'DATA',
  VERSION => 0.0,
  NAME => 'SimpleTest',
  OPTIMIZE => '-O3';
Inline->init;

sub simple3
{
  my($a,$b)=@_;
  my(@seen);

  return undef if (length($a) != length($b));
  foreach my $i (0..length($a))
  {
    my($ac,$bc)=(substr($a,$i,1),substr($b,$i,1));
    if ($ac eq $bc)
    {
      ; # Do nothing
    }
    elsif ($ac eq '_')
    {
      return undef if (++$seen[$bc] > 1);
    }
    elsif ($bc eq '_')
    {
      return undef if (++$seen[$ac] > 1);
    }
    else { return undef }
  }
  1;
}

# Represent each string as two strings and two masks.
sub sgifford_clever2
{
  (my $a = $_[0]) =~ tr/_/\0/;
  (my $b = $_[1]) =~ tr/_/\0/;
  # Data transformations; could be done beforehand in linear time.
  my($a3,$b3)=("\0"x10,"\0"x10);
  foreach my $i (0..(length($a)-1))
  {
    my $c = substr($a,$i,1);
    next if $c eq "\0";
    substr($a3,$c,1)=$i;
  }
  foreach my $i (0..(length($b)-1))
  {
    my $c = substr($b,$i,1);
    next if $c eq "\0";
    substr($b3,$c,1)=$i;
  }

  my $a_new = $a . $a3;
  my $b_new = $b . $b3;
  (my $a_mask = $a_new)  =~ tr/\0/\xff/c;
  (my $b_mask = $b_new)  =~ tr/\0/\xff/c;

#  (my $print = "\t$a_new\n\t$b_mask\nvs.\t$b_new\n\t$a_mask\n\n") =~ 
+tr/\0\xff/_!/;
#  print $print;

  # Comparisons; must be done for each comparison.
  return (($a_new & $b_mask) eq ($b_new & $a_mask));
}

sub sgifford_clever3
{
  #    a   and mask[b]  eq      b   and mask[a] 
  ($_[0][1] & $_[1][2]) eq ($_[1][1] & $_[0][2]);
}

sub sgifford_clever3_xform
{
  (my $a = $_[0]) =~ tr/_/\0/;

  # Data transformations; could be done beforehand in linear time.
  my($a3)="\0"x10;
  foreach my $i (0..(length($a)-1))
  {
    my $c = substr($a,$i,1);
    next if $c eq "\0";
    substr($a3,$c,1)=$i;
  }

  my $a_new = $a . $a3;
  (my $a_mask = $a_new)  =~ tr/\0/\xff/c;
  return [$_[0],$a_new,$a_mask];
}

# From [ccn]
sub ccn
{
  local $_ = $_[0] ^ $_[1];
  return  not (/[\001-\017]/ or /([\020-\031]).*?\1/s);
}

# From [ambrus]
sub ambrus
{
  my ($s1, $s2) = @_;
  my $m = length($s1) - 1;
  my($n, $p) = ($m - 1, $m + 1);
  ($s1 . $s2) !~ /^.{0,$m}?([^_]).{$m}(?!\1)[^_]/ and
    ($s1 . $s2) !~ /^.{0,$m}?([^_])(?:.{0,$n}|.{$p,})\1/;
}

# From [aristotle]
sub aristotle
{
  my( $l, $r ) = @_;
  
# underscores are insignificant
  tr/_/\0/ for $l, $r;
  
# cancel out identical values
  my $xor = $l ^ $r;
  
# convert to bitmasks
  tr/\0/\377/c for $l, $r;
  
  my $mask = $l & $r;
  
# masked chars must be identical
  return !1 if ( $xor & $mask ) =~ tr/\0//c;
                                  
# and there may not be dupes of non-identical characters
  return 0 == grep {
                      my $char = substr( $xor, $_, 1 );
                      $char ne "\0" and index( $xor, $char, $_ + 1 ) !
+= -1
                   } 0 .. length( $xor ) - 1;
}

sub aristotle2_xform
{
  my($a)=@_;

# underscores are insignificant
  $a =~ tr/_/\0/;
  (my $mask = $a) =~ tr/\0/\377/c;
  
  return [$_[0],$a,$mask];
}

sub aristotle2
{
  my( $l, $r ) = @_;
  
# cancel out identical values
  my $xor = $l->[1] ^ $r->[1];
  my $mask = $l->[2] & $r->[1];
  
# masked chars must be identical
  return !1 if ( $xor & $mask ) =~ tr/\0//c;
                                  
# and there may not be dupes of non-identical characters
  return 0 == grep {
                      my $char = substr( $xor, $_, 1 );
                      $char ne "\0" and index( $xor, $char, $_ + 1 ) !
+= -1
                   } 0 .. length( $xor ) - 1;
}

my @tests =
(qw/
     _8__3__19
     48____7__

     _8__3__19
     4_2___7__

     _8__3__19
     4_8___7__

     __8_3__19
     48____7__

     __8_3__19
     84____7__

     _8__3__19
     48_____7_
   /
);

sub run_tests
{
  my($func,$verbose,@tests)=@_;
  my ($s1, $s2);
  while (defined($s1 = shift(@tests)))
  {
      $s2 = shift(@tests);
      my $result = $func->($s1, $s2);
      if ($verbose)
      {
        if (ref($s1) && ref($s2))
        {
          $s1 = $s1->[0];
          $s2 = $s2->[0];
        }
        print "$s1\n$s2: ",$result?"compatible":"not compatible","\n";
      }
  }
}

my @tests_clever3 = map { sgifford_clever3_xform($_) } @tests;
my @tests_ccn = map { my $tmp = $_; $tmp =~ tr/_/ /; $tmp } @tests;
my @tests_aristotle2 = map { aristotle2_xform($_) } @tests;

#run_tests(\&simple3, 1, @tests);
#run_tests(\&ambrus, 1, @tests);
#run_tests(\&aristotle,1,@tests);
#run_tests(\&aristotle2,1,@tests_aristotle2);
#run_tests(\&clever3, 1, @tests_clever3);
#run_tests(\&cclever3, 1, @tests_clever3);

timethese(25_000, {
                 simple3 => sub { run_tests(\&simple3, 0, @tests) },
                 sgifford_csimple3 => sub { run_tests(\&sgifford_csimp
+le3, 0, @tests) },
                 sgifford_clever2 => sub { run_tests(\&sgifford_clever
+2, 0, @tests) },
                 sgifford_clever3 => sub { run_tests(\&sgifford_clever
+3, 0, @tests_clever3) },
                 sgifford_cclever3 => sub { run_tests(\&sgifford_cclev
+er3, 0, @tests_clever3) },
                 ccn => sub { run_tests(\&ccn, 0, @tests_ccn) },
                 ambrus => sub { run_tests(\&ambrus, 0, @tests) },
                 aristotle => sub { run_tests(\&aristotle, 0, @tests) 
+},
                 aristotle2 => sub { run_tests(\&aristotle2, 0, @tests
+_aristotle2) },
                });


__DATA__
__C__
int sgifford_csimple3(const char *a, const char *b)
{
  int i;
  int l;
  unsigned char seen[256];

  memset(seen,0,256);

  if ((l=strlen(a)) != strlen(b))
    return 0;
  for(i=0;i<l;i++)
  {
    if (a[i] == b[i])
    {
      ; /* Do nothing */
    }
    else if (a[i] == '_')
    {
      if (++seen[b[i]] > 1)
        return 0;
    }
    else if (b[i] == '_')
    {
      if (++seen[a[i]] > 1)
        return 0;
    }
    else
      return 0;
  }
  return 1;
}

int sgifford_cclever3(SV *a, SV *b)
{
  AV *a_arr, *b_arr;
  SV **tmp;
  char *a_val, *a_mask, *b_val, *b_mask;
  int i;
  
  /* First  get the arrays from the references */
  if (!SvROK(a) || !SvROK(b))
    croak("a or b not arrayrefs!");
  a_arr = (AV*)SvRV(a);
  b_arr = (AV*)SvRV(b);

  /* Now pull out the data */

  if ( (tmp = av_fetch(a_arr, 1, 0)) == NULL)
    croak("a[1] is undef");
  if ((a_val = SvPV(*tmp, PL_na)) == NULL)
    croak("a[1] contains NULL pointer?");

  if ( (tmp = av_fetch(a_arr, 2, 0)) == NULL)
    croak("a[2] is undef");
  if ((a_mask = SvPV(*tmp, PL_na)) == NULL)
    croak("a[2] contains NULL pointer?");


  if ( (tmp = av_fetch(b_arr, 1, 0)) == NULL)
    croak("b[1] is undef");
  if ((b_val = SvPV(*tmp, PL_na)) == NULL)
    croak("b[1] contains NULL pointer?");


  if ( (tmp = av_fetch(b_arr, 2, 0)) == NULL)
    croak("b[2] is undef");
  if ((b_mask = SvPV(*tmp, PL_na)) == NULL)
    croak("b[2] contains NULL pointer?");

  /* OK, finally we have all of the data! */
  for(i=0;i<20;i++)
  {
    if ((a_val[i] & b_mask[i]) != (b_val[i] & a_mask[i]))
      return 0;
  }
  return 1;
}
[download]

^*Benchmark Doctoring Code:

perl -F: -ane'$F[1] =~ s/ wallclock secs/s/; $F[1] =~ s/\(n=.*$//; $F[1] =~ s/ */ /g; printf "%17s: %s", @F;'

Comment on Re: Comparison by position and value Select or Download Code


Problems? Is your data what you think it is?
	PerlMonks