Dearest Monks,
My mind has been amused with Collatz conjecture. See 1nickt's post about obtaining the top 20 sequences. Below is code for obtaining the longest progression. Here I try a GCC compiler intrinsic to further increase performance. That went well and so updated my prior post adding collatz_count_c2 there.
use strict;
use warnings;
use feature 'say';
use MCE::Flow;
use Inline C => Config =>
CCFLAGSEX => '-O2 -fomit-frame-pointer', clean_after_build => 0;
use Inline C => <<'END_OF_C_CODE';
#include <stdlib.h>
#include <stdint.h>
#if defined(_WIN32)
#define strtoull _strtoui64
#endif
void collatz_longest_c1( SV* _beg_n, SV* _end_n )
{
uint64_t beg_n, end_n, i, n, steps;
uint64_t number = 0, highest = 0;
Inline_Stack_Vars;
#ifdef __LP64__
beg_n = SvUV( _beg_n );
end_n = SvUV( _end_n );
#else
beg_n = strtoull( SvPV_nolen( _beg_n ), NULL, 10 );
end_n = strtoull( SvPV_nolen( _end_n ), NULL, 10 );
#endif
for ( i = end_n; i >= beg_n; i-- ) {
n = i, steps = 0;
/* count using the T(x) notation */
do {
n % 2 ? ( steps += 2, n = (3 * n + 1) >> 1 )
: ( steps += 1, n = n >> 1 );
} while ( n != 1 );
if ( steps >= highest ) {
number = i, highest = steps;
}
}
Inline_Stack_Reset;
Inline_Stack_Push( sv_2mortal( newSVuv(number ) ) );
Inline_Stack_Push( sv_2mortal( newSVuv(highest) ) );
Inline_Stack_Done;
}
void collatz_longest_c2( SV* _beg_n, SV* _end_n )
{
uint64_t beg_n, end_n, i, n, steps;
uint64_t number = 0, highest = 0;
Inline_Stack_Vars;
#ifdef __LP64__
beg_n = SvUV( _beg_n );
end_n = SvUV( _end_n );
#else
beg_n = strtoull( SvPV_nolen( _beg_n ), NULL, 10 );
end_n = strtoull( SvPV_nolen( _end_n ), NULL, 10 );
#endif
/* based on GCC compiler intrinsics demonstration by Alex Shirley
+*/
/* https://stackoverflow.com/questions/38114205/c-collatz-conjectu
+re-optimization */
/* https://www.go4expert.com/articles/builtin-gcc-functions-builti
+nclz-t29238 */
for ( i = beg_n; i <= end_n; i++ ) {
n = i, steps = 0;
if ( n % 2 == 0 ) {
steps += __builtin_ctz(n); /* account for all evens */
n >>= __builtin_ctz(n); /* always returns an odd */
}
/* when we enter we're always working on an odd number */
do {
n = 3 * n + 1;
steps += __builtin_ctz(n) + 1; /* account for odd and even
+ */
n >>= __builtin_ctz(n); /* always returns an odd */
} while ( n != 1 );
if ( steps > highest ) {
number = i, highest = steps;
}
}
Inline_Stack_Reset;
Inline_Stack_Push( sv_2mortal( newSVuv(number ) ) );
Inline_Stack_Push( sv_2mortal( newSVuv(highest) ) );
Inline_Stack_Done;
}
END_OF_C_CODE
sub collatz_longest {
my ( $beg_n, $end_n ) = @_;
my ( $number, $highest ) = ( 0, 0 );
my ( $i, $n, $steps );
for ( $i = $end_n; $i >= $beg_n; $i-- ) {
$n = $i, $steps = 0;
# count using the T(x) notation
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1;
$number = $i, $highest = $steps
if ( $steps >= $highest );
}
return ( $number, $highest );
}
no warnings 'once';
#*collatz = \&collatz_longest; # choose collatz here
#*collatz = \&collatz_longest_c1; # using T(x) notation
*collatz = \&collatz_longest_c2; # using compiler intrinsics
my $m = shift || '1e7';
my ( @sizes, $chunk_size );
$chunk_size = int( $m / MCE::Util::get_ncpu() / 80 + 1 );
$chunk_size += 1 if $chunk_size % 2;
mce_flow_s {
max_workers => MCE::Util::get_ncpu(),
chunk_size => $chunk_size,
gather => \@sizes,
bounds_only => 1,
}, sub {
MCE->gather([ collatz( @{ $_[1] } ) ]);
}, 1, $m;
MCE::Flow->finish;
# Output the longest progression for the initial starting number.
# https://en.wikipedia.org/wiki/Collatz_conjecture
my $highest = ( sort { $b->[1] <=> $a->[1] } @sizes )[ 0 ]->[ 1 ];
say "Longest Collatz (index and value)";
say "@$_"
for ( sort { $a->[0] <=> $b->[0] }
grep { $_->[1] == $highest } @sizes )[ 0..0 ];
Output
The times include launching Perl, loading modules, spawning workers, reaping workers, and output (~ 0.100 seconds).
This outputs the longest progression for the number of steps to reach 1.
These numbers are the lowest ones with the indicated step count.
1e7 : 8400511 685
1 core
collatz_longest 1m16.034s
collatz_longest_c1 0m01.868s
collatz_longest_c2 0m00.778s
2 cores
collatz_longest 0m37.912s
collatz_longest_c1 0m00.965s
collatz_longest_c2 0m00.422s
4 cores
collatz_longest 0m19.799s
collatz_longest_c1 0m00.516s
collatz_longest_c2 0m00.239s
8 cores
collatz_longest 0m10.042s
collatz_longest_c1 0m00.285s
collatz_longest_c2 0m00.147s
16 cores
collatz_longest 0m05.196s
collatz_longest_c1 0m00.178s
collatz_longest_c2 0m00.109s
32 cores
collatz_longest 0m02.717s
collatz_longest_c1 0m00.137s
collatz_longest_c2 0m00.105s
collatz_longest_c1 (Inline C), collatz_longest (Perl)
32 cores
1e8 : 63728127 949 Inline C 0m00.738s Perl 0m30.554s
1e9 : 670617279 986 Inline C 0m07.198s Perl 5m51.938s
1e10 : 9780657630 1132 Inline C 1m17.059s
1e11 : 75128138247 1228 Inline C 13m51.122s
collatz_longest_c2 (Inline C)
32 cores
1e8 : 63728127 949 Inline C 0m00.340s
1e9 : 670617279 986 Inline C 0m03.023s
1e10 : 9780657630 1132 Inline C 0m33.152s
1e11 : 75128138247 1228 Inline C 6m10.355s
I can now sleep knowing that MCE can handle this. Just be sure to use sequence generation in MCE (i.e. mce_flow_s) with the bounds_only option.
Regards, Mario
Re: Longest Collatz using MCE::Flow, Inline::C, and GCC compiler intrinsics (Caching)
by marioroy (Prior) on Apr 12, 2020 at 05:47 UTC
|
Hi again,
I tried caching using an array and File::Map. All run using one core and are reasonably fast. The main goal here are attempts made at reducing memory consumption.
Update 1: Added PDL example.
Update 2: Improved performance.
Update 3: Added scalar example.
use strict;
use warnings;
use feature 'say';
use File::Map qw/map_anonymous unmap/;
use PDL;
# based on the caching demonstration by iM71
# https://stackoverflow.com/a/55361008
# https://stackoverflow.com/questions/38114205/c-collatz-conjecture-op
+timization
sub collatz_longest_pdl {
my ( $size ) = @_;
my ( $length, $number ) = ( 0, 0 );
my ( $n, $steps, $tmp, $n2 );
my $cache = zeros( short(), $size + 1 );
for my $current ( 2 .. $size ) {
$n = $current, $steps = 0;
# count using the T(x) notation
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1 && $n >= $current;
$tmp = $steps + at( $cache, $n );
set( $cache, $current, $tmp );
$length = $tmp, $number = $current
if $tmp > $length;
}
return ( $number, $length );
}
sub collatz_longest_array {
my ( $size ) = @_;
my ( $length, $number ) = ( 0, 0 );
my ( $n, $steps, $tmp );
my @cache;
for my $current ( 2 .. $size ) {
$n = $current, $steps = 0;
# count using the T(x) notation
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1 && $n >= $current;
$tmp = $steps + ( $cache[ $n ] // 0 );
$cache[ $current ] = $tmp;
$length = $tmp, $number = $current
if $tmp > $length;
}
return ( $number, $length );
}
sub collatz_longest_filemap {
my ( $size ) = @_;
my ( $length, $number ) = ( 0, 0 );
my ( $n, $steps, $tmp );
map_anonymous my $cache, $size * 2 + 2, 'shared';
# fill cache with zero's
substr($cache, 0, $size * 2 + 2, pack('s', 0) x ( $size + 1 ));
for my $current ( 2 .. $size ) {
$n = $current, $steps = 0;
# count using the T(x) notation
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1 && $n >= $current;
$tmp = $steps + unpack('s', substr($cache, $n * 2, 2));
substr($cache, $current * 2, 2, pack('s', $tmp));
$length = $tmp, $number = $current
if $tmp > $length;
}
unmap $cache;
return ( $number, $length );
}
sub collatz_longest_scalar {
my ( $size ) = @_;
my ( $length, $number ) = ( 0, 0 );
my ( $n, $steps, $tmp );
# fill cache with zero's
my $cache = pack('s', 0) x ( $size + 1 );
for my $current ( 2 .. $size ) {
$n = $current, $steps = 0;
# count using the T(x) notation
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1 && $n >= $current;
$tmp = $steps + unpack('s', substr($cache, $n * 2, 2));
substr($cache, $current * 2, 2, pack('s', $tmp));
$length = $tmp, $number = $current
if $tmp > $length;
}
return ( $number, $length );
}
#*collatz = \&collatz_longest_pdl; # choose collatz here
#*collatz = \&collatz_longest_array;
#*collatz = \&collatz_longest_filemap;
*collatz = \&collatz_longest_scalar;
my ( $number, $length ) = collatz( shift || '1e7' );
say "Longest Collatz (index and value)";
say "$number $length";
Output
1e7 : 8400511 685
1 core
collatz_longest 1m16.034s MCE Perl code
collatz_longest_pdl 0m13.691s Consumes 39 MiB
collatz_longest_filemap 0m06.615s Consumes 59 MiB
collatz_longest_scalar 0m06.148s Consumes 39 MiB
collatz_longest_array 0m04.986s Consumes 330 MiB
collatz_longest_c1 0m01.868s Inline C code
collatz_longest_c2 0m00.778s Inline C code
I've been wanting to try File::Map and pleasantly surprised.
Regards, Mario
| [reply] [d/l] [select] |
|
Greetings,
I am pleased to provide the final work. Computing collatz_longest for 1e9 requires ~ 3.8 GiB of available memory. Try 5e8 for lesser memory consumption.
This is a parallel demonstration using MCE::Flow and File::Map for caching. The 2nd example uses Inline::C for counting the number of steps. Unlike the other examples, workers working on chunks 2 and higher require results from prior chunks. This is accounted for. Obviously the worker processing chunk 1 doesn't need prior results. Once the workers being processing (i.e. after the mapped cache creation), it doesn't take long before full CPU utilization kicks in.
Q. Why does this work, especially when subsequent chunks need results from prior chunks?
A. The magic lies with using a smaller chunk size set to 2500. The initial ramp up is a one time occurrence. One cool thing about MCE is that input IO is sequential. This applies to number sequences as well. A worker obtaining chunk 1 begins processing immediately (there's no wasting time). The worker obtaining the next chunk begins processing but may need to pause a little here and there. Eventually, the chunks as far as timing goes (starting) are spread out where workers need to pause left often. This can be seen by looking at the CPU utilization. The Power of Randomness kicks in at some point with CPU utilization near 100% until completion.
Credit to PerlMonks choroba, Laurent_R, 1nickt, rjt, and vr. See this thread. Worthy mention goes to Leon Timmermans, author of File::Map. Wow!
Credit for the caching technique used here is based on the caching demonstration by iM71, a response in that thread. My first attempt at parallelization failed. I tried again by maximizing on MCE's strengths described above. It's surreal :)
Credit for reducing the number of loop iterations was from watching Notation and compressed dynamics, one minute into the video (i.e. the T(x) notation).
Below, the minimum and maximum argument (size) is 1e6 and 1e9 respectively. The two scripts will set to limit quietly if exceeded.
Cache miss update:
Cache miss is less than 1%. Therefore, it is faster to compute for $n than waiting for the result.
Final update:
#!/usr/bin/env perl
use strict;
use warnings;
use feature qw/say/;
use File::Map qw/map_anonymous unmap/;
use Time::HiRes qw/usleep/;
use MCE::Flow;
my $size = shift || 1e6;
$size = 1e6 if $size < 1e6; # minimum
$size = 1e9 if $size > 1e9; # maximum
map_anonymous my $cache, $size * 2 + 2, 'shared';
# fill cache with zeroes
substr($cache, 0, $size * 2 + 2, pack('s', 0) x ( $size + 1 ));
# local to workers and the manager process
my ( $length, $number ) = ( 0, 0 );
sub collatz_longest {
my ( $chunk_id, $seq_beg, $seq_end ) = @_;
my ( $n, $steps, $tmp );
for my $input ( $seq_beg .. $seq_end ) {
$n = $input, $steps = 0;
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1 && $n >= $input;
$tmp = unpack('s', substr($cache, $n * 2, 2));
# another worker with a lesser chunk_id is not yet
# completed processing $n, so pause a little
# if ( $tmp == 0 && $chunk_id > 1 ) {
# do {
# usleep 100;
# $tmp = unpack('s', substr($cache, $n * 2, 2));
# } while ( $tmp == 0 );
# }
# do this instead (faster): compute $n if cache miss
$tmp = _collatz($n) if $tmp == 0 && $chunk_id > 1;
substr($cache, $input * 2, 2, pack('s', $steps += $tmp));
$length = $steps, $number = $input if $steps > $length;
}
}
sub _collatz {
my ( $input ) = @_;
my ( $n, $steps ) = ( $input, 0 );
$n % 2 ? ( $steps += 2, $n = (3 * $n + 1) >> 1 )
: ( $steps += 1, $n = $n >> 1 )
while $n != 1 && $n >= $input;
my $tmp = unpack('s', substr($cache, $n * 2, 2));
$tmp = _collatz($n) if $tmp == 0;
substr($cache, $input * 2, 2, pack('s', $steps += $tmp));
return $steps
}
my $chunk_size;
$chunk_size = int( $size / MCE::Util::get_ncpu() / 80 + 1 );
$chunk_size += 1 if $chunk_size % 2;
MCE::Flow->init(
max_workers => MCE::Util::get_ncpu(),
chunk_size => $chunk_size, # specify 2500 if pausing above
bounds_only => 1,
gather => sub {
$length = $_[0], $number = $_[1] if $_[0] > $length;
},
user_end => sub {
MCE->gather($length, $number);
},
);
mce_flow_s sub {
my ( $mce, $seq_ref, $chunk_id ) = @_;
collatz_longest($chunk_id, @{ $seq_ref });
}, 2, $size;
MCE::Flow->finish; unmap $cache;
say "Longest Collatz (index and value)";
say "$number $length";
Count steps via Inline C:
#!/usr/bin/env perl
use strict;
use warnings;
use feature qw/say/;
use File::Map qw/map_anonymous unmap/;
use Time::HiRes qw/usleep/;
use MCE::Flow;
use Inline C => Config => CCFLAGSEX => '-O2 -fomit-frame-pointer';
use Inline C => <<'END_OF_C_CODE';
#include <stdint.h>
void num_steps_c( SV* _n, SV* _s )
{
uint64_t n, input;
int steps = 0;
n = input = SvUV(_n);
while ( n != 1 && n >= input ) {
n % 2 ? ( steps += 2, n = (3 * n + 1) >> 1 )
: ( steps += 1, n = n >> 1 );
}
sv_setuv(_n, n);
sv_setiv(_s, steps);
return;
}
END_OF_C_CODE
my $size = shift || 1e6;
$size = 1e6 if $size < 1e6; # minimum
$size = 1e9 if $size > 1e9; # maximum
map_anonymous my $cache, $size * 2 + 2, 'shared';
# fill cache with zeroes
substr($cache, 0, $size * 2 + 2, pack('s', 0) x ( $size + 1 ));
# local to workers and the manager process
my ( $length, $number ) = ( 0, 0 );
sub collatz_longest {
my ( $chunk_id, $seq_beg, $seq_end ) = @_;
my ( $n, $steps, $tmp );
for my $input ( $seq_beg .. $seq_end ) {
num_steps_c($n = $input, $steps);
$tmp = unpack('s', substr($cache, $n * 2, 2));
# another worker with a lesser chunk_id is not yet
# completed processing $n, so pause a little
# if ( $tmp == 0 && $chunk_id > 1 ) {
# do {
# usleep 100;
# $tmp = unpack('s', substr($cache, $n * 2, 2));
# } while ( $tmp == 0 );
# }
# do this instead (faster): compute $n if cache miss
$tmp = _collatz($n) if $tmp == 0 && $chunk_id > 1;
substr($cache, $input * 2, 2, pack('s', $steps += $tmp));
$length = $steps, $number = $input if $steps > $length;
}
}
sub _collatz {
my ( $input ) = @_;
num_steps_c( my $n = $input, my $steps );
my $tmp = unpack('s', substr($cache, $n * 2, 2));
$tmp = _collatz($n) if $tmp == 0;
substr($cache, $input * 2, 2, pack('s', $steps += $tmp));
return $steps
}
my $chunk_size;
$chunk_size = int( $size / MCE::Util::get_ncpu() / 80 + 1 );
$chunk_size += 1 if $chunk_size % 2;
MCE::Flow->init(
max_workers => MCE::Util::get_ncpu(),
chunk_size => $chunk_size, # specify 2500 if pausing above
bounds_only => 1,
gather => sub {
$length = $_[0], $number = $_[1] if $_[0] > $length;
},
user_end => sub {
MCE->gather($length, $number);
},
);
mce_flow_s sub {
my ( $mce, $seq_ref, $chunk_id ) = @_;
collatz_longest($chunk_id, @{ $seq_ref });
}, 2, $size;
MCE::Flow->finish; unmap $cache;
say "Longest Collatz (index and value)";
say "$number $length";
Results: Unix time (i.e. time perl script.pl 1e9).
1e9, 32 cores
collatz_longest_final 0m26.371s
collatz_longest_inline_c 0m15.634s
Longest Collatz (index and value)
670617279 986
Regards, Mario
| [reply] [d/l] [select] |
Re: Longest Collatz using MCE::Flow, Inline::C, and GCC compiler intrinsics
by etj (Priest) on Jul 28, 2024 at 02:32 UTC
|
Using Inline::Pdlpp, it performs comparably with _longest_c1, use this diff to the above:
--- collatz.orig 2024-07-27 23:53:49.665967800 +0100
+++ collatz 2024-07-28 03:21:55.085180253 +0100
@@ -119,11 +119,19 @@
return ( $number, $highest );
}
+use Inline Pdlpp => 'DATA', clean_after_build => 0;
+sub PDL::collatz_pdl {
+ PDL::_collatz_pdl_int(@_, my $n = PDL->null, my $h = PDL->null);
+ # Perl scalars because is across forked processes in MCE
+ map $_->sclr, $n, $h;
+}
+
no warnings 'once';
#*collatz = \&collatz_longest; # choose collatz here
#*collatz = \&collatz_longest_c1; # using T(x) notation
- *collatz = \&collatz_longest_c2; # using compiler intrinsics
+#*collatz = \&collatz_longest_c2; # using compiler intrinsics
+ *collatz = \&PDL::collatz_pdl; # using PDL
my $m = shift || '1e7';
my ( @sizes, $chunk_size );
@@ -151,3 +159,27 @@
say "@$_"
for ( sort { $a->[0] <=> $b->[0] }
grep { $_->[1] == $highest } @sizes )[ 0..0 ];
+
+__DATA__
+__Pdlpp__
+pp_def('collatz_pdl',
+ Pars => 'beg(); end(); [o]number(); [o]highest()',
+ GenericTypes => ['Q'],
+ PMCode => '', # trigger the _int but keep the Perl code in main bod
+y
+ Code => '
+ PDL_Indx i, number = 0, highest = 0;
+ for ( i = $end(); i >= $beg(); i-- ) {
+ PDL_Indx n = i, steps = 0;
+ /* count using the T(x) notation */
+ do {
+ n % 2 ? ( steps += 2, n = (3 * n + 1) >> 1 )
+ : ( steps += 1, n = n >> 1 );
+ } while ( n != 1 );
+ if ( steps >= highest ) {
+ number = i, highest = steps;
+ }
+ }
+ $number() = number;
+ $highest() = highest;
+ ',
+);
It revealed a quirk to Inline::Pdlpp, which makes/loads the XS code, but drops the Perl code, so PMCode gets ignored. I've added that to the doc which will be in next release.
The most idiomatic way to get PDL to do this would be to broadcast the above operation over a higher dim that has as many things in its as cores you have, and let the PDL pthreads do their thing instead of MCE. That would have the challenge that the Collatz stuff is quite non-deterministic in processing need, which reminds me that I've meant for a while to break PDL pthreading into smaller chunks and distribute across a thread pool which would mitigate this. PRs to beat me to it very welcome!
A PDL solution doesn't work great in MCE because the outputs needed to be converted to Perl scalars to be successfully passed across forked processes. | [reply] [d/l] [select] |
|
|