rtillian has asked for the wisdom of the Perl Monks concerning the following question:
Hi Mahatmas,
the target is to get a string into an array.
split does it perfectly but is slow.
unpack is extremly fast but produces 97, 98, 99 instead of a,b,c
so you have to chr() it to get 'a' out of 97 - again slow
What would you suggest?
A sample code:
use strict;
use warnings;
# Fill a buffer with bytes
my $begin = time();
my $buffer = '';
for( 0 .. 10_000 ) { $buffer .= "abcdef\x00ghik"; }
#
# Split is slow
#
print "split 20 times\n";
for( 0 .. 20 ) {
print "loop $_\n";
my @a = split( //, $buffer );
}
printf( "split consumed %d second(s)\n", time() - $begin );
$begin = time();
#
# pack is extremly fast
#
print "pack alone\n";
for( 0 .. 20 ) {
print "loop $_\n";
my @a = unpack( "C*", $buffer );
}
printf( "pack consumed %d seconds\n", time() - $begin );
$begin = time();
#
# pack is extremly fast but needs chr() to get characters - slow!
#
print "pack and chr in map\n";
for( 0 .. 20 ) {
print "loop $_\n";
my @a = unpack( "C*", $buffer );
map{ $_ = chr( $_ ) } @a;
}
printf( "pack and chr in map consumed %d seconds\n", time() - $begin );
exit;
Re: Convert string to array - performance challenge
by BrowserUk (Patriarch) on Apr 07, 2010 at 17:04 UTC
|
my @a = unpack '(a1)*', $buffer;
Update: But your benchmark is broken. You've included the timing of building the buffer in with the time for split.
Corrected with a couple of alternatives added, substr proves to be fastest.
use strict;
use warnings;
use Time::HiRes qw[ time ];
my $buffer = "abcdef\x00ghik" x 10_000;
my $begin = time();
for( 0 .. 20 ) {
my @a = split( //, $buffer );
}
printf( "split consumed %.3f second(s)\n", time() - $begin );
$begin = time();
for( 0 .. 20 ) {
my @a = unpack( "C*", $buffer );
}
printf( "pack consumed %.3f seconds\n", time() - $begin );
$begin = time();
for( 0 .. 20 ) {
my @a = unpack( "C*", $buffer );
map{ $_ = chr( $_ ) } @a;
}
printf( "pack and chr in map consumed %.3f seconds\n", time() - $begin
+ );
$begin = time();
for( 0 .. 20 ) {
my @a = unpack( "(a1)*", $buffer );
}
printf( "unpack '{a1)* consumed %.3f seconds\n", time() - $begin );
$begin = time();
for( 0 .. 20 ) {
## Out-by-1 corrected courtesy [AnomalousMonk].
my @a; $#a = length( $buffer )-1;
$a[ $_ ] = substr $buffer, $_, 1 for 0 .. $#a;
}
printf( "substr for consumed %.3f seconds\n", time() - $begin );
exit;
__END__
C:\test>junk47
split consumed 2.427 second(s)
pack consumed 0.347 seconds ## incomplete
pack and chr in map consumed 4.132 seconds
unpack '{a1)* consumed 2.527 seconds
substr for consumed 1.654 seconds
Examine what is said, not who speaks -- Silence betokens consent -- Love the truth but pardon error.
"Science is about questioning the status quo. Questioning authority".
In the absence of evidence, opinion is indistinguishable from prejudice.
| [reply] [d/l] [select] |
|
| [reply] |
|
| [reply] |
Re: Convert string to array - performance challenge
by JavaFan (Canon) on Apr 07, 2010 at 17:16 UTC
|
I cannot beat the split. Here's some code inside code tags, and some output. Please do provide the output of your program if you want to make a point - relying on others to run the program reduces the chances people will actually spend time answering your questing. It's your loss!
use Benchmark 'cmpthese';
our $buffer = "abcdef\x00ghik" x 10_000;
our %hash = map {(ord($_) => $_)} "a" .. "z";
our %array;
foreach my $ch ("a" .. "z") {
$::array[ord $ch] = $ch;
}
cmpthese -1, {
split => 'my @a = split //, $::buffer',
pack => 'my @a = unpack "C*", $::buffer',
pack_chr => 'my @a = map chr, unpack "C*", $::buffer',
regexp => 'my @a = $::buffer =~ /./g',
hash => 'my @a = map $::hash{$_}, unpack "C*", $::buffer',
array => 'my @a = map $::array[$_], unpack "C*", $::buffer',
};
__END__
Rate hash array pack_chr regexp split pack
hash 3.81/s -- -35% -39% -39% -56% -87%
array 5.83/s 53% -- -6% -6% -32% -80%
pack_chr 6.19/s 63% 6% -- -0% -28% -79%
regexp 6.19/s 63% 6% 0% -- -28% -79%
split 8.57/s 125% 47% 38% 38% -- -71%
pack 29.6/s 678% 409% 378% 378% 246% --
| [reply] [d/l] [select] |
Re: Convert string to array - performance challenge
by ikegami (Patriarch) on Apr 07, 2010 at 17:14 UTC
|
For that input, split is the fastest for me.
Rate regex unpack_C unpack_a split
regex 9.95/s -- -2% -46% -50%
unpack_C 10.1/s 2% -- -45% -49%
unpack_a 18.3/s 84% 80% -- -9%
split 20.0/s 101% 97% 9% --
use strict;
use warnings;
use Benchmark qw( cmpthese );
my %tests = (
split => q{ my @a = split //, $buf; },
regex => q{ my @a = $buf =~ /./sg; },
unpack_C => q{ my @a = map chr, unpack 'C*', $buf; },
unpack_a => q{ my @a = unpack '(a)*', $buf; },
);
$_ = "use strict; use warnings; our \$buf; $_ 1"
for values(%tests);
local our $buf = "abcdef\x00ghik" x 10_000;
cmpthese(-2, \%tests);
Update: Changed (A)* to (a)* in response to BrowserUk's reply.
| [reply] [d/l] [select] |
|
[0] Perl> $buf = "abcdef\x00ghik";;
[0] Perl> @a = unpack '(A)*', $buf;;
[0] Perl> print length for @a;;
1
1
1
1
1
1
0
1
1
1
1
| [reply] [d/l] [select] |
|
Very interesting! With the EXACT same code as Ikegami, (with the (a)* mod), I get:
Rate regex split unpack_C unpack_a
regex 3.00/s -- -3% -9% -19%
split 3.09/s 3% -- -6% -17%
unpack_C 3.29/s 10% 7% -- -11%
unpack_a 3.71/s 24% 20% 13% --
I have seen before that the Ikegami machine is faster than mine on certain benchmarks. But, I am unable to understand how split() can be the fastest. I am running a Prescott processor, an old 3 Ghz hyper-threaded critter. With Active State Perl 5.10.1.
| [reply] [d/l] |
|
I think it depends a lot on the system, c-runtime
ActivePerl v5.8.9 built for MSWin32-x86-multi-thread, Binary build 825
+ [288577]
Rate unpack_C unpack_a split regex
unpack_C 3.48/s -- -5% -7% -9%
unpack_a 3.66/s 5% -- -3% -4%
split 3.76/s 8% 3% -- -1%
regex 3.82/s 10% 4% 1% --
Rate unpack_C regex unpack_a split
unpack_C 3.56/s -- -5% -5% -6%
regex 3.74/s 5% -- -0% -1%
unpack_a 3.74/s 5% 0% -- -1%
split 3.79/s 7% 2% 1% --
Rate split unpack_C regex unpack_a
split 3.45/s -- -4% -9% -25%
unpack_C 3.58/s 4% -- -6% -22%
regex 3.79/s 10% 6% -- -18%
unpack_a 4.60/s 34% 29% 21% --
The newer versions (built using MinGW) are more consistent between runs
C:\perl\5.10.1\bin\MSWin32-x86-multi-thread\perl.exe
Rate unpack_C regex split unpack_a
unpack_C 3.66/s -- -2% -15% -22%
regex 3.74/s 2% -- -14% -21%
split 4.33/s 18% 16% -- -8%
unpack_a 4.70/s 28% 26% 9% --
C:\perl\5.11.1\bin\MSWin32-x86-multi-thread\perl.exe
Rate regex unpack_C split unpack_a
regex 2.74/s -- -28% -38% -39%
unpack_C 3.82/s 39% -- -13% -14%
split 4.40/s 60% 15% -- -2%
unpack_a 4.47/s 63% 17% 2% --
| [reply] [d/l] [select] |
|
|
Re: Convert string to array - performance challenge
by BrowserUk (Patriarch) on Apr 07, 2010 at 17:40 UTC
|
Update: Don't consider or use this. It is broken per the replies below!
If you really feel the need for speed (at the expense of a little clarity), add this to your becnhmark:
$begin = time();
for( 0 .. 20 ) {
my @a = reverse map chop( $buffer), 0 .. length( $buffer ) -1;
}
printf( "reverse map chop consumed %.3f seconds\n", time() - $begin );
__END__
C:\test>junk47
split consumed 2.368 second(s)
pack consumed 0.347 seconds
pack and chr in map consumed 4.041 seconds
unpack '{a1)* consumed 2.507 seconds
substr for consumed 1.652 seconds
reverse map chop consumed 0.477 seconds
| [reply] [d/l] |
|
Your benchmark is broken, because after the first iteration of for( 0 .. 20 ), $buffer will have been reduced to length zero (because of the chops).
With that corrected, "reverse map chop" isn't any faster than the other solutions.
| [reply] [d/l] [select] |
|
| [reply] |
|
You didn't take into account chop()'s destructive behaviour. For 20 of the 21 passes, you're splitting a zero-length string.
Rate rev_chop regex unpack_C unpack_a split
rev_chop 10.0/s -- -1% -3% -44% -50%
regex 10.1/s 1% -- -2% -44% -50%
unpack_C 10.3/s 3% 2% -- -43% -49%
unpack_a 18.0/s 80% 77% 74% -- -11%
split 20.2/s 102% 99% 95% 12% --
use strict;
use warnings;
use Benchmark qw( cmpthese );
my %tests = (
split => q{ my @a = split //, $buf; },
regex => q{ my @a = $buf =~ /./sg; },
unpack_C => q{ my @a = map chr, unpack 'C*', $buf; },
unpack_a => q{ my @a = unpack '(a)*', $buf; },
rev_chop => q{ my @a = reverse map chop($buf), 1..length($buf); },
);
$_ = "use strict; use warnings; my \$buf = our \$buffer; $_ 1"
for values(%tests);
local our $buffer = "abcdef\x00ghik" x 10_000;
cmpthese(-2, \%tests);
| [reply] [d/l] [select] |
|
my %tests = (
split => q{ my @a = split //, $buf; },
regex => q{ my @a = $buf =~ /./sg; },
unpack_C => q{ my @a = map chr, unpack 'C*', $buf; },
unpack_a => q{ my @a = unpack '(a)*', $buf; },
rev_chop => q{ my @a = reverse map chop($buf), 1..length($buf); },
chop => q{ my @a; $a[ $_ ] = chop $buffer for length($buffer) .. 0;
+ },
);
And result was
Rate rev_chop unpack_C split regex unpack_a chop
rev_chop 17.6/s -- -6% -13% -20% -30% -100%
unpack_C 18.7/s 6% -- -7% -15% -25% -100%
split 20.2/s 15% 8% -- -8% -20% -100%
regex 22.0/s 25% 17% 9% -- -13% -100%
unpack_a 25.1/s 43% 34% 24% 14% -- -100%
chop 68478/s 388556% 365714% 339175% 311691% 272542% --
However I am yet to read "Benchmark" so someone please explain this output. :)
UPDATE: Sorry! That was a silly mistake.
Thanks almut.
| [reply] [d/l] [select] |
|
|
|
| [reply] |
Re: Convert string to array - performance challenge (our @a = split)
by ikegami (Patriarch) on Apr 08, 2010 at 17:33 UTC
|
split has an optimisation that occurs under very specific circumstances.
$ perl -MO=Concise,-exec -e'my @a = split //, $buf;'
1 <0> enter
2 <;> nextstate(main 1 -e:1) v:{
3 <0> pushmark s
4 </> pushre(/""/) s/64
5 <#> gvsv[*buf] s
6 <$> const[IV 0] s
7 <@> split[t3] lK
8 <0> pushmark s
9 <0> padav[@a:1,2] lRM*/LVINTRO
a <2> aassign[t4] vKS
b <@> leave[1 ref] vKP/REFC
-e syntax OK
$ perl -MO=Concise,-exec -e'our @a = split //, $buf;'
1 <0> enter
2 <;> nextstate(main 1 -e:1) v:{
3 </> pushre(/""/ => @a) s/64
4 <#> gvsv[*buf] s
5 <$> const[IV 0] s
6 <@> split[t5] vK
7 <@> leave[1 ref] vKP/REFC
-e syntax OK
In the second program, the assignment is removed and split stores the result into the array itself. It's as if
our @a = split(//, $buf);
became
split_into(@a, //, $buf);
Unfortunately, it doesn't currently work for lexicals.
$ perl -MO=Concise,-exec -e'my @a; @a = split //, $buf;'
1 <0> enter
2 <;> nextstate(main 1 -e:1) v:{
3 <0> padav[@a:1,2] vM/LVINTRO
4 <;> nextstate(main 2 -e:1) v:{
5 <0> pushmark s
6 </> pushre(/""/) s/64
7 <#> gvsv[*buf] s
8 <$> const[IV 0] s
9 <@> split[t3] lK
a <0> pushmark s
b <0> padav[@a:1,2] lRM*
c <2> aassign[t4] vKS
d <@> leave[1 ref] vKP/REFC
-e syntax OK
When taking advantage of that optimisation, one gets a solution that's almost twice as fast as the previous solution:
Rate regex unpack_C unpack_a split split_pa
regex 10.4/s -- -2% -44% -50% -73%
unpack_C 10.6/s 2% -- -43% -49% -73%
unpack_a 18.8/s 79% 76% -- -10% -52%
split 20.9/s 100% 97% 12% -- -46%
split_pa 38.8/s 271% 264% 107% 85% --
use strict;
use warnings;
use Benchmark qw( cmpthese );
my %tests = (
split => q{ my @a = split //, $buf; },
split_pa => q{ local our @a; @a = split //, $buf; },
regex => q{ my @a = $buf =~ /./sg; },
unpack_C => q{ my @a = map chr, unpack 'C*', $buf; },
unpack_a => q{ my @a = unpack '(a)*', $buf; },
);
$_ = "use strict; use warnings; our \$buf; $_ 1"
for values(%tests);
local our $buf = "abcdef\x00ghik" x 10_000;
cmpthese(-2, \%tests);
| [reply] [d/l] [select] |
|
|