Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine
 
PerlMonks  

Re: The sum of absolute differences in the counts of chars in two strings.

by Perlbotics (Canon)
on Nov 19, 2011 at 16:47 UTC ( #938983=note: print w/ replies, xml ) Need Help??


in reply to The sum of absolute differences in the counts of chars in two strings.

Here's my brute-force try. Don't know if the call-overhead is bigger than the gain? I assumed a one-byte-is-one-character relationship (no wchar's yet). HTH

use Inline C; use strict; use warnings; use Time::HiRes qw(time); my $str1 = "abcdefabc"; my $str2 = "xyzabcabc@@@"; my $now = time; my $res; $res = deltasum( $str1, $str2 ) for (1..1_000_000); print "res=$res time=", ( time() - $now ), "us\n"; #res=9 time=0.819153070449829us __END__ __C__ #define DEBUG(x) int counter_tab[256]; int deltasum(char* a, char* b) { int i; int sum = 0; DEBUG( printf("IN: (%d:%s) (%d:%s)\n", strlen(a), a, strlen(b), b); +) bzero( counter_tab, sizeof( counter_tab ) ); for ( ; *a ; ++a ) ++counter_tab[ *a ]; for ( ; *b ; ++b ) --counter_tab[ *b ]; for ( i = 0; i < 256; ++i ) { /* maybe reduced to significant range? + */ sum += abs( counter_tab[i] ); DEBUG( if ( counter_tab[i] ) printf( "'%c' (%3d) x %5d\n", i, i, c +ounter_tab[i]); ) } DEBUG( printf("OUT: %d\n", sum); ) return sum; }
Update: Moved i and sum to head of function to cope with ancient compilers (thanks to davido). Simplified sizeof() expression.

Debug sample:

IN: (9:abcdefabc) (12:xyzabcabc@@@) '@' ( 64) x -3 'd' (100) x 1 'e' (101) x 1 'f' (102) x 1 'x' (120) x -1 'y' (121) x -1 'z' (122) x -1 OUT: 9

Update: Approx. 60% speed-up when limiting to [ACGTacgt], still room for improvement...

... __END__ __C__ #define DEBUG(x) #define GATTACA(x) (x)=='A' || (x)=='C' || (x)=='G' || (x)=='T' || \ (x)=='a' || (x)=='c' || (x)=='g' || (x)=='t' int deltasum_acgt(char* a, char* b) { int i; int sum = 0; int tab[8] = {0}; int ignored = 0; DEBUG( printf("IN (%d:%s) (%d:%s)!\n", strlen(a), a, strlen(b), b); +) for ( ; *a ; ++a ) { if ( GATTACA( *a ) ) ++tab[ *a & 0x07 ]; else ++ignored; } for ( ; *b ; ++b ) { if ( GATTACA( *b ) ) --tab[ *b & 0x07 ]; else ++ignored; } sum += abs( tab[1] ); /* A */ sum += abs( tab[3] ); /* C */ sum += abs( tab[7] ); /* G */ sum += abs( tab[4] ); /* T */ sum += ignored << 20; /* signal ill chars */ DEBUG( for (i=0; i<7; ++i) printf("%d -> %d\n", i, tab[i]); ) DEBUG( printf("OUT: %d ign=%d delta=%d\n", sum, ignored, sum & 0xff +fff); ) return sum; }
Script shall mask ignored characters from returned value by using $delta = $res & 0xfffff;. Number of chars ignored is $ign = $res >> 20; (think system / $?).
Assumption: Maximum number of differences is not much more than one million.


Comment on Re: The sum of absolute differences in the counts of chars in two strings.
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://938983]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others chilling in the Monastery: (15)
As of 2015-07-06 16:17 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (77 votes), past polls