Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot
 
PerlMonks  

Re: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer)

by quidity (Pilgrim)
on Sep 12, 2007 at 15:10 UTC ( #638592=note: print w/ replies, xml ) Need Help??


in reply to Challenge: CPU-optimized byte-wise or-equals (for a meter of beer)

Here is my dirty approach:

use Inline C => qq{ SV* swap(SV* one, SV* two) { char *buf1, *buf2, *buf3; STRLEN idx, len; SV* ret; if (!SvPOK(one) || !SvPOK(two) || sv_len(one) > sv_len(two)) return newSVsv(&PL_sv_undef); len = sv_len(one); buf1 = SvPVX(one); buf2 = SvPVX(two); buf3 = malloc(len); for (idx=0; idx < len; idx++) { buf3[idx] = buf1[idx] ? buf1[idx] : buf2[idx]; } ret = newSVpv(buf3, len); free(buf3); return ret; } }; # [ your code, but with ] cmpthese( -5, { 'split1' => sub { my $s3 = split1( $s1, $s2 ) }, 'substr1' => sub { my $s3 = substr1( $s1, $s2 ) }, 'inline' => sub { my $s3 = swap($s1, $s2) }, });

Which gives, er, some improvement...

          Rate  split1 substr1  inline
split1  4.45/s      --    -88%   -100%
substr1 37.6/s    746%      --    -99%
inline  6955/s 156241%  18378%      --

Although you have to make assumptions about your data, like it not being encoded in utf8. You can test for all this as part of the C, if you need to.

If you need to be this specific about direct byte faffing, always think of Inline::C. Then wrap the dirtyness in a module.


Comment on Re: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer)
Download Code
Re^2: Challenge: CPU-optimized byte-wise or-equals (for a meter of beer)
by avar (Beadle) on Sep 12, 2007 at 17:22 UTC
    I'm a really sore^Hthirsty loser. So I wrote an XS function that did the replacement in-place by operating directly on the SV'S char vector. It's faster than your swap function but still slower than my s///-pos stuff, I don't really know why.
    use Inline C => q[ void avar_cee(SV *sv1, SV *sv2) { char *sv1p, *sv2p; STRLEN sv1len, sv2len; SV *sv1_sv; STRLEN i; if (!SvROK(sv1) || !SvPOK((SV*)SvRV(sv1)) || !SvPOK(sv2)) { croak("Usage: avar_cee(\$s1, $s2)"); } sv1_sv = (SV*)SvRV(sv1); sv1p = SvPV(sv1_sv, sv1len); sv2p = SvPV(sv2, sv2len); if (sv1len != sv2len) { croak("The given strings must be of the same length"); } for (i = 0; i < sv2len; i++) { if (sv1p[i] == '\0') { sv1p[i] = sv2p[i]; } } }
    It could be sped up with something like OpenMP by putting #pragma omp paralell for before the for loop but I didn't have gcc 4.2 to test it. I changed the test script so that it wouldn't be unfair to the in-place functions. Previously I was copying the data anyway so that testing would work which defeated the purpose of doing it in-place. Results:
    split1 5.07/s -- -82% -87% -99% -99% -99% + -99% -100% -100% -100% -100% -100% substr1 27.8/s 449% -- -31% -96% -96% -97% + -97% -98% -98% -99% -99% -99% ikegami_s 40.3/s 695% 45% -- -94% -95% -95% + -96% -97% -98% -98% -99% -99% avar 700/s 13700% 2414% 1636% -- -10% -18% + -30% -42% -62% -70% -80% -86% avar2 778/s 15243% 2695% 1830% 11% -- -8% + -22% -35% -58% -67% -78% -85% corion 848/s 16635% 2948% 2005% 21% 9% -- + -15% -29% -54% -63% -76% -83% ikegami_tr 999/s 19609% 3490% 2379% 43% 28% 18% + -- -17% -45% -57% -72% -80% avar2_pos 1201/s 23591% 4215% 2880% 72% 54% 42% + 20% -- -34% -48% -66% -76% moritz 1832/s 36032% 6481% 4445% 162% 135% 116% + 83% 53% -- -21% -49% -64% swap 2324/s 45742% 8250% 5666% 232% 199% 174% + 133% 93% 27% -- -35% -54% avar_c_inplace 3567/s 70267% 12717% 8751% 410% 359% 320% + 257% 197% 95% 53% -- -30% avar2_pos_inplace 5063/s 99781% 18093% 12464% 624% 551% 497% + 407% 322% 176% 118% 42% --
    benchmark.pl:
    #!/usr/bin/perl use 5.6.0; use strict; use warnings FATAL => 'all'; use Benchmark qw( cmpthese ); use Inline C => q[ void avar_c_inplace(SV *sv1, SV *sv2) { char *sv1p, *sv2p; STRLEN sv1len, sv2len; SV *sv1_sv; STRLEN i; if (!SvROK(sv1) || !SvPOK((SV*)SvRV(sv1)) || !SvPOK(sv2)) { croak("Usage: avar_cee(\$s1, $s2)"); } sv1_sv = (SV*)SvRV(sv1); sv1p = SvPV(sv1_sv, sv1len); sv2p = SvPV(sv2, sv2len); if (sv1len != sv2len) { croak("The given strings must be of the same length"); } for (i = 0; i < sv2len; i++) { if (sv1p[i] == '\0') { sv1p[i] = sv2p[i]; } } } SV* swap(SV* one, SV* two) { char *buf1, *buf2, *buf3; STRLEN idx, len; SV* ret; if (!SvPOK(one) || !SvPOK(two) || sv_len(one) > sv_len(two)) return newSVsv(&PL_sv_undef); len = sv_len(one); buf1 = SvPVX(one); buf2 = SvPVX(two); buf3 = malloc(len); for (idx=0; idx < len; idx++) { buf3[idx] = buf1[idx] ? buf1[idx] : buf2[idx]; } ret = newSVpv(buf3, len); free(buf3); return ret; } ]; my $s1 = do_rand(0, 100_000); my $s2 = do_rand(1, 100_000); #my $s1 = "a\0b\0c\0d\0"; #my $s2 = "aXbYcZdA"; my @z = $s1 =~ m/(\0)/g; my $num = @z; warn "\$s1 has $num zeroes"; my $cp0 = $s1; my $cp1 = $s1; my %subs = ( 'split1' => sub { my $s3 = split1( $s1, $s2 ) }, 'substr1' => sub { my $s3 = substr1( $s1, $s2 ) }, # 'kyle' => sub { my $s3 = kyle( $s1, $s2 ) }, 'moritz' => sub { my $s3 = moritz( $s1, $s2 ) }, 'corion' => sub { my $s3 = corion( $s1, $s2 ) }, 'ikegami_s' => sub { my $s3 = ikegami_s( $s1, $s2 ) }, 'ikegami_tr' => sub { my $s3 = ikegami_tr( $s1, $s2 ) }, 'avar' => sub { my $s3 = avar( $s1, $s2 ) }, 'avar2' => sub { my $s3 = avar2( $s1, $s2 ) }, 'avar2_pos' => sub { my $s3 = avar2_pos( $s1, $s2 ) }, 'avar2_pos_inplace' => sub { avar2_pos_inplace( \$cp0, $s2 ); $cp +0 }, 'avar_c_inplace' => sub { avar_c_inplace( \$cp1, $s2 ); $cp1 }, 'swap' => sub { swap( $s1, $s2 ) }, # 'bogus' => sub { "oh noes" }, ); cmpthese( -2, \%subs ); use Test::More; plan 'tests' => scalar keys %subs; my $correct; { my $tmp = $s1; avar2_pos_inplace(\$tmp, $s2); $correct = $tmp; } foreach my $subname ( keys %subs ) { my $sub = $subs{$subname}; ok($sub->() eq $correct, "$subname returned the correct value"); } sub split1 { my ($s1, $s2) = @_; my @s1 = split //, $s1; my @s2 = split //, $s2; foreach my $idx ( 0 .. $#s1 ) { if ( $s1[$idx] eq chr(0) ) { $s1[$idx] = $s2[$idx]; } } return join '', @s1; } sub substr1 { my ($s1, $s2) = @_; for my $idx ( 0 .. length($s1) ) { if ( substr($s1,$idx,1) eq chr(0) ) { substr($s1, $idx, 1) = substr($s2, $idx, 1); } } return $s1; } sub kyle { my ($s1, $s2) = @_; my $out = $s1; while ( $s1 =~ m/\000/g ) { my $pos = pos; substr( $out, $pos, 1 ) = substr( $s2, $pos, 1 ); } return $out; } sub moritz { my ($s1, $s2) = @_; my $pos = 0; while ( 0 < ( $pos = index $s1, "\000", $pos ) ) { substr( $s1, $pos, 1 ) = substr( $s2, $pos, 1 ); } return $s1; } sub ikegami_s { my ($s1, $s2) = @_; (my $mask = $s1) =~ s/[^\x00]/\xFF/g; return ($s1 & $mask) | ($s2 & ~$mask); } sub ikegami_tr { my ($s1, $s2) = @_; (my $mask = $s1) =~ tr/\x00/\xFF/c; return ($s1 & $mask) | ($s2 & ~$mask); } sub corion { my ($s1, $s2) = @_; my $ofs = 0; return join "", map { $ofs += length; $_ => substr $s2, $ofs++, 1 } + split /\0/, $s1, -1; } sub avar { my ($s1, $s2) = @_; my $s3 = $s1; { use bytes; $s3 =~ s/(\0)/substr $s2, $+[0]-1, 1/eg; } $s3; } sub avar2 { my ($s1, $s2) = @_; use bytes; $s1 =~ s/(\0)/substr $s2, $+[0]-1, 1/eg; return $s1; } sub avar2_pos { my ($s1, $s2) = @_; use bytes; $s1 =~ s/\0/substr $s2, pos($s1), 1/eg; return $s1; } sub avar2_pos_inplace { my ($s1, $s2) = @_; use bytes; $$s1 =~ s/\0/substr $s2, pos($$s1), 1/eg; } sub avar2_pos_inplace2 { my ($s1, $s2) = @_; use bytes; $$s1 =~ s/\0/substr $$s2, pos($$s1), 1/eg; } # This makes sure that $s1 has chr(0)'s in it and $s2 does not. sub do_rand { my $min = shift; my $len = shift; my $n = ""; for (1 .. $len) { $n .= chr( rand(255-$min)+$min ); } return $n; } #sub do_rand { # my $n = (shift) ? int(rand(255)) : int(rand(254)) + 1; # return chr( $n ); #} __END__

      You are calculating array offsets over and over. The fast way to do this in C is more like:

      int len; char* pBeg= SvPV(...,len); char* pSrc= pBeg + len; char* pDst= SvPV(....) + len; while( pBeg <= pSrc ) { if( ! *pDst ) { *pDst= *pSrc; } --pSrc; --pDst; }

      - tye        

        I'd already coded up an XS version and used memchr for my search instead of your explicit loop. I don't know why but I found that using memchr got a function that was twice as fast. I included the searching part of the code below.

        Rate avar avar2 ikegami_tr avar2_pos corion morit +z avar2_pos_inplace dio_c2 dio_c avar 109/s -- -6% -18% -40% -51% -70 +% -84% -91% -95% avar2 116/s 6% -- -13% -36% -48% -68 +% -83% -90% -95% ikegami_tr 133/s 22% 15% -- -27% -41% -64 +% -81% -89% -94% avar2_pos 182/s 66% 56% 36% -- -19% -50 +% -74% -84% -91% corion 224/s 105% 93% 68% 23% -- -39 +% -67% -81% -89% moritz 366/s 235% 215% 175% 101% 63% - +- -47% -69% -83% avar2_pos_inplace 686/s 527% 490% 415% 278% 206% 87 +% -- -41% -68% dio_c2 1172/s 971% 908% 779% 545% 422% 220 +% 71% -- -45% dio_c 2118/s 1836% 1722% 1488% 1066% 844% 479 +% 209% 81% -- dio_c2 // Do it while ( dpv < dpv_end ) { if ( ! *dpv ) *dpv = *spv; ++spv; ++dpv; } dio_c // Do it while ( 1 ) { ptr = (char*)memchr( ptr, '\0', ptr_end - ptr ); if ( ! ( ptr && ptr < ptr_end ) ) break; *ptr = *(ptr - dpv + spv); ++ ptr; }

        ⠤⠤ ⠙⠊⠕⠞⠁⠇⠑⠧⠊

      I just tested this snippet on gcc 4.2.1 and it had the same speed on a two-processor, multi-core machine as it did without the OpenMP annotations as well as when it was compiled by gcc 3.whatever.

      #pragma omp parallel for for( ; dsvc < dsvc_end; ++dsvc, ++ssvc ) { if ( ! *dsvc ) *dsvc = *ssvc; }

      Just to see if things changed if I simplified the for(;;) loop, this also had no effect. Or perhaps the openmp version got slower. It's been about a 1000 calls slower the last few times I tried it. I'm sure that's just because the loop has to do the pointer math all the time instead of just incrementing like the previous version. OpenMP just isn't getting enabled, I think.

      /* Do it. */ #pragma omp parallel for for( o = 0; o < len; ++o ) { if ( ! *( dsvc + o ) ) *(dsvc+o) = *(ssvc+o); }

      In the below chart I deleted a pile of uninteresting programs and kept only the ones I was testing and things that performed at least as well as my worst candidate.

      Rate moritz_inplace dio_openmp swap dio_noopenmp + avar_tye_c_inplace avar_c_inplace dio_c moritz_inplace 1948/s -- -64% -66% -68% + -69% -73% -83% dio_openmp 5344/s 174% -- -7% -13% + -15% -26% -54% swap 5745/s 195% 8% -- -6% + -9% -20% -50% dio_noopenmp 6124/s 214% 15% 7% -- + -3% -15% -47% avar_tye_c_inplace 6286/s 223% 18% 9% 3% + -- -12% -45% avar_c_inplace 7178/s 269% 34% 25% 17% + 14% -- -38% dio_c 11495/s 490% 115% 100% 88% + 83% 60% --

      ⠤⠤ ⠙⠊⠕⠞⠁⠇⠑⠧⠊

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://638592]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others imbibing at the Monastery: (12)
As of 2014-08-01 14:33 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    Who would be the most fun to work for?















    Results (25 votes), past polls