I have a large set of points.
Each point has an x and y coordinate.
The range of values for x and y are -2,147,483,648 to 2,147,483,647 (i.e. 32-bit signed int).
I am running a 64-bit version of Perl; this code is not required to run in a 32-bit environment.
I need to insert the set of points into a Perl data structure ...
then perform many lookups to tell if a given point is in the set or not.
I want the lookup to be as fast as possible.
The most obvious way I could think of to do this is in Perl is with a hash.
My first attempt is shown below.
For the unique hash key, I tried three different ways to do it:
- String: $x . ':' . $y
- pack/unpack: pack "ii", $x, $y
- 64-bit int: ($y << 32) | ($x & 0xFFFFFFFF)
The string was the fastest in my simple benchmark shown below.
Benchmark: timing 200000 iterations of Big, Pak, Str...
Big: 6 wallclock secs ( 5.64 usr + 0.00 sys = 5.64 CPU) @ 35454.71/s
+(n=200000)
Pak: 4 wallclock secs ( 4.47 usr + 0.00 sys = 4.47 CPU) @ 44752.74/s
+(n=200000)
Str: 4 wallclock secs ( 3.95 usr + 0.00 sys = 3.95 CPU) @ 50594.49/s
+(n=200000)
Suggestions for a faster lookup are welcome.
use strict;
# use warnings;
use Benchmark qw(timethese);
my @points = (
[ 0, 0 ],
[ -1, -2 ],
[ 1, 2 ],
[ -1, 2 ],
[ 1, -2 ],
[ 0, 1 ],
[ 1, 0 ],
[ -1, 0 ],
[ 0, -1 ],
[ 2147483647, 2147483647 ],
[ 2147483647, -2147483647 ],
[ -2147483647, 2147483647 ],
[ -2147483647, -2147483647 ],
[ -1, 2147483647 ],
[ 2147483647, 1 ],
[ -2, 2147483646 ],
[ 1, -2147483647 ],
[ 1234561, 1234562 ],
[ 1234563, -1234564 ],
[ -1234565, 1234566 ],
[ -1234567, -1234568 ],
[ 10, 11 ],
[ 11, 12 ],
[ 12, 13 ],
[ 13, 14 ],
[ 14, 15 ],
[ 15, 16 ],
[ 16, 17 ],
[ 17, 18 ],
[ 18, 19 ],
[ 19, 20 ],
[ 1001, 1002 ],
[ 1003, 1004 ],
[ 1005, 1006 ],
[ 1007, 1008 ],
[ 1009, 1010 ],
[ 1011, 1012 ],
[ 1013, 1014 ],
[ 1015, 1016 ],
[ 1017, 1018 ],
[ 1019, 1020 ],
[ -1001, -1002 ],
[ -1003, -1004 ],
[ -1005, -1006 ],
[ -1007, -1008 ],
[ -1009, -1010 ],
[ -1011, -1012 ],
[ -1013, -1014 ],
[ -1015, -1016 ],
[ -1017, -1018 ],
[ -1019, -1020 ],
[ 99910, 99911 ],
[ 99911, 99912 ],
[ 99912, 99913 ],
[ 99913, 99914 ],
[ 99914, 99915 ],
[ 99915, 99916 ],
[ 99916, 99917 ],
[ 99917, 99918 ],
[ 99918, 99919 ],
[ 99919, 99920 ],
[ -99910, -99911 ],
[ -99911, -99912 ],
[ -99912, -99913 ],
[ -99913, -99914 ],
[ -99914, -99915 ],
[ -99915, -99916 ],
[ -99916, -99917 ],
[ -99917, -99918 ],
[ -99918, -99919 ],
[ -99919, -99920 ],
[ 1099910, 1099911 ],
[ 1099911, 1099912 ],
[ 1099912, 1099913 ],
[ 1099913, 1099914 ],
[ 1099914, 1099915 ],
[ 1099915, 1099916 ],
[ 1099916, 1099917 ],
[ 1099917, 1099918 ],
[ 1099918, 1099919 ],
[ 1099919, 1099920 ],
[ -1099910, -1099911 ],
[ -1099911, -1099912 ],
[ -1099912, -1099913 ],
[ -1099913, -1099914 ],
[ -1099914, -1099915 ],
[ -1099915, -1099916 ],
[ -1099916, -1099917 ],
[ -1099917, -1099918 ],
[ -1099918, -1099919 ],
[ -1099919, -1099920 ],
[ 91099910, 91099911 ],
[ 91099911, 91099912 ],
[ 91099912, 91099913 ],
[ 91099913, 91099914 ],
[ 91099914, 91099915 ],
[ 91099915, 91099916 ],
[ 91099916, 91099917 ],
[ 91099917, 91099918 ],
[ 91099918, 91099919 ],
[ 91099919, 91099920 ],
[ -91099910, -91099911 ],
[ -91099911, -91099912 ],
[ -91099912, -91099913 ],
[ -91099913, -91099914 ],
[ -91099914, -91099915 ],
[ -91099915, -91099916 ],
[ -91099916, -91099917 ],
[ -91099917, -91099918 ],
[ -91099918, -91099919 ],
[ -91099919, -91099920 ],
[ 491099910, 491099911 ],
[ 491099911, 491099912 ],
[ 491099912, 491099913 ],
[ 491099913, 491099914 ],
[ 491099914, 491099915 ],
[ 491099915, 491099916 ],
[ 491099916, 491099917 ],
[ 491099917, 491099918 ],
[ 491099918, 491099919 ],
[ 491099919, 491099920 ],
[ -491099910, -491099911 ],
[ -491099911, -491099912 ],
[ -491099912, -491099913 ],
[ -491099913, -491099914 ],
[ -491099914, -491099915 ],
[ -491099915, -491099916 ],
[ -491099916, -491099917 ],
[ -491099917, -491099918 ],
[ -491099918, -491099919 ],
[ -491099919, -491099920 ],
);
my $npoints = @points;
sub str_hash {
# print "string_hash---------------\n";
my %cells;
# insert the points into the hash
for my $p (@points) {
my $h = $p->[0] . ':' . $p->[1];
my ( $x, $y ) = split ':', $h;
# print "x='$p->[0]' y='$p->[1]' h='$h' (x='$x' y='$y')\n";
if ($x != $p->[0]) { die; }
if ($y != $p->[1]) { die; }
$cells{$h} = undef;
# ++$cells{$h};
}
scalar(keys %cells) == $npoints or die;
# lookup each points in the hash
for my $p (@points) {
my $h = $p->[0] . ':' . $p->[1];
exists $cells{$h} or die;
}
exists $cells{'notfound'} and die;
exists $cells{'notfound2'} and die;
exists $cells{'notfound3'} and die;
return \%cells;
}
sub big_hash {
# print "bigint_hash---------------\n";
my %cells;
# insert the points into the hash
for my $p (@points) {
my $h = ($p->[1] << 32) | ($p->[0] & 0xFFFFFFFF);
my $x = $h & 0x00000000FFFFFFFF;
my $y = ($h & 0xFFFFFFFF00000000) >> 32;
if ($x >> 31) { $x -= 0xFFFFFFFF + 1 }
if ($y >> 31) { $y -= 0xFFFFFFFF + 1 }
# print "x='$p->[0]' y='$p->[1]' h='$h' (x='$x' y='$y')\n";
if ($x != $p->[0]) { die; }
if ($y != $p->[1]) { die; }
$cells{$h} = undef;
# ++$cells{$h};
}
scalar(keys %cells) == $npoints or die;
# lookup each point in the hash
for my $p (@points) {
my $h = ($p->[1] << 32) | ($p->[0] & 0xFFFFFFFF);
exists $cells{$h} or die;
}
exists $cells{'notfound'} and die;
exists $cells{'notfound2'} and die;
exists $cells{'notfound3'} and die;
return \%cells;
}
sub pak_hash {
# print "unpack_hash---------------\n";
my %cells;
# insert the points into the hash
for my $p (@points) {
my $h = pack "ii", $p->[0], $p->[1];
my ( $x, $y ) = unpack "ii", $h;
# print "x='$p->[0]' y='$p->[1]' h='$h' (x='$x' y='$y')\n";
if ($x != $p->[0]) { die; }
if ($y != $p->[1]) { die; }
$cells{$h} = undef;
# ++$cells{$h};
}
scalar(keys %cells) == $npoints or die;
# lookup each point in the hash
for my $p (@points) {
my $h = pack "ii", $p->[0], $p->[1];
exists $cells{$h} or die;
}
exists $cells{'notfound'} and die;
exists $cells{'notfound2'} and die;
exists $cells{'notfound3'} and die;
return \%cells;
}
sub str_look {
my $cells = shift;
for my $p (@points) {
# my $h = $p->[0] . ':' . $p->[1];
exists $cells->{$p->[0] . ':' . $p->[1]} or die;
}
exists $cells->{'notfound'} and die;
exists $cells->{'notfound2'} and die;
exists $cells->{'notfound3'} and die;
}
sub big_look {
my $cells = shift;
for my $p (@points) {
# my $h = ($p->[1] << 32) | ($p->[0] & 0xFFFFFFFF);
exists $cells->{($p->[1] << 32) | ($p->[0] & 0xFFFFFFFF)} or die
+;
}
exists $cells->{'notfound'} and die;
exists $cells->{'notfound2'} and die;
exists $cells->{'notfound3'} and die;
}
sub pak_look {
my $cells = shift;
for my $p (@points) {
# my $h = pack "ii", $p->[0], $p->[1];
exists $cells->{pack "ii", $p->[0], $p->[1]} or die;
}
exists $cells->{'notfound'} and die;
exists $cells->{'notfound2'} and die;
exists $cells->{'notfound3'} and die;
}
my $str_ref = str_hash();
my $big_ref = big_hash();
my $pak_ref = pak_hash();
timethese 200000, {
Str => sub { str_look($str_ref) },
Big => sub { big_look($big_ref) },
Pak => sub { pak_look($pak_ref) },
};
Update: Much later it was discovered that pack 'i2' was the fastest in this simple GoL algorithm.
References
References Added Later
Update: Updated References long after original node was written.
-
Are you posting in the right place? Check out Where do I post X? to know for sure.
-
Posts may use any of the Perl Monks Approved HTML tags. Currently these include the following:
<code> <a> <b> <big>
<blockquote> <br /> <dd>
<dl> <dt> <em> <font>
<h1> <h2> <h3> <h4>
<h5> <h6> <hr /> <i>
<li> <nbsp> <ol> <p>
<small> <strike> <strong>
<sub> <sup> <table>
<td> <th> <tr> <tt>
<u> <ul>
-
Snippets of code should be wrapped in
<code> tags not
<pre> tags. In fact, <pre>
tags should generally be avoided. If they must
be used, extreme care should be
taken to ensure that their contents do not
have long lines (<70 chars), in order to prevent
horizontal scrolling (and possible janitor
intervention).
-
Want more info? How to link
or How to display code and escape characters
are good places to start.