Beefy Boxes and Bandwidth Generously Provided by pair Networks
XP is just a number
 
PerlMonks  

Re: The sum of absolute differences in the counts of chars in two strings.

by Perlbotics (Abbot)
on Nov 19, 2011 at 16:47 UTC ( #938983=note: print w/ replies, xml ) Need Help??


in reply to The sum of absolute differences in the counts of chars in two strings.

Here's my brute-force try. Don't know if the call-overhead is bigger than the gain? I assumed a one-byte-is-one-character relationship (no wchar's yet). HTH

use Inline C; use strict; use warnings; use Time::HiRes qw(time); my $str1 = "abcdefabc"; my $str2 = "xyzabcabc@@@"; my $now = time; my $res; $res = deltasum( $str1, $str2 ) for (1..1_000_000); print "res=$res time=", ( time() - $now ), "us\n"; #res=9 time=0.819153070449829us __END__ __C__ #define DEBUG(x) int counter_tab[256]; int deltasum(char* a, char* b) { int i; int sum = 0; DEBUG( printf("IN: (%d:%s) (%d:%s)\n", strlen(a), a, strlen(b), b); +) bzero( counter_tab, sizeof( counter_tab ) ); for ( ; *a ; ++a ) ++counter_tab[ *a ]; for ( ; *b ; ++b ) --counter_tab[ *b ]; for ( i = 0; i < 256; ++i ) { /* maybe reduced to significant range? + */ sum += abs( counter_tab[i] ); DEBUG( if ( counter_tab[i] ) printf( "'%c' (%3d) x %5d\n", i, i, c +ounter_tab[i]); ) } DEBUG( printf("OUT: %d\n", sum); ) return sum; }
Update: Moved i and sum to head of function to cope with ancient compilers (thanks to davido). Simplified sizeof() expression.

Debug sample:

IN: (9:abcdefabc) (12:xyzabcabc@@@) '@' ( 64) x -3 'd' (100) x 1 'e' (101) x 1 'f' (102) x 1 'x' (120) x -1 'y' (121) x -1 'z' (122) x -1 OUT: 9

Update: Approx. 60% speed-up when limiting to [ACGTacgt], still room for improvement...

... __END__ __C__ #define DEBUG(x) #define GATTACA(x) (x)=='A' || (x)=='C' || (x)=='G' || (x)=='T' || \ (x)=='a' || (x)=='c' || (x)=='g' || (x)=='t' int deltasum_acgt(char* a, char* b) { int i; int sum = 0; int tab[8] = {0}; int ignored = 0; DEBUG( printf("IN (%d:%s) (%d:%s)!\n", strlen(a), a, strlen(b), b); +) for ( ; *a ; ++a ) { if ( GATTACA( *a ) ) ++tab[ *a & 0x07 ]; else ++ignored; } for ( ; *b ; ++b ) { if ( GATTACA( *b ) ) --tab[ *b & 0x07 ]; else ++ignored; } sum += abs( tab[1] ); /* A */ sum += abs( tab[3] ); /* C */ sum += abs( tab[7] ); /* G */ sum += abs( tab[4] ); /* T */ sum += ignored << 20; /* signal ill chars */ DEBUG( for (i=0; i<7; ++i) printf("%d -> %d\n", i, tab[i]); ) DEBUG( printf("OUT: %d ign=%d delta=%d\n", sum, ignored, sum & 0xff +fff); ) return sum; }
Script shall mask ignored characters from returned value by using $delta = $res & 0xfffff;. Number of chars ignored is $ign = $res >> 20; (think system / $?).
Assumption: Maximum number of differences is not much more than one million.


Comment on Re: The sum of absolute differences in the counts of chars in two strings.
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://938983]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others avoiding work at the Monastery: (10)
As of 2014-07-28 22:06 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    My favorite superfluous repetitious redundant duplicative phrase is:









    Results (210 votes), past polls