After reading A short meditation about hash search performance because of Hash Search is VERY slow I wanted to investigate the performance of inserting into a hash and array as the number of existing elements increases. The methodology is to Benchmark::timeit() a number of insertions into an empty hash or an empty array. For array, insertion means "push at the end". My aim was to see if the time to insert is constant, linearly increasing with the number of items inserted. I believe this is confirmed. Inserts of N elements into a fresh hash are timed and recorded for different N. I did not just time only 1 insert because there's the suspicion that existing size may affect each new insert. I.e. it's faster to insert into an empty container. So I inserted N items and timed that in total.
Linear regression is used to build the relationship T = aX+b where T is the total time to insert X items into a fresh container. The error of the regression will be indicative of how well the hypothesis of constant insert time holds.
I interpret the results as follows: inserting into a hash or array takes constant time and is independent of size: e.g. if insertion of 1000 items into a fresh hash takes 1 unit of time then 2000 items take 2 units of time. Same with array. Caveat: hash keys provided by random_key() were extremely random. The confidence of this conclusion is stronger for arrays than for hash (the error of the linear regression). Inserting into an array is faster than inserting into a hash. Twice as fast in my computer/OS. Perl version used 5.32.1, Linux Fedora 5.13.19.
Caveat #2 The methodology is simplistic so please add on if you have better inspiration.
5' Edit: The ascii plot looks volatile but in reality the numbers show a nice linear relationship, which is confirmed by the low error of the least squares. Also "linear regression" and "least squares" mean the same in this post.
There have been some more edits within 20 minutes of posting. E.g. to clarify that array insert means push.
10000 array insertions took 1.00004911422729 seconds.
11000 array insertions took 1.09900379180908 seconds.
12000 array insertions took 1.20203304290771 seconds.
13000 array insertions took 1.32625269889832 seconds.
14000 array insertions took 1.42015385627747 seconds.
15000 array insertions took 1.50754404067993 seconds.
16000 array insertions took 1.60211801528931 seconds.
17000 array insertions took 1.70919871330261 seconds.
18000 array insertions took 1.80735301971436 seconds.
19000 array insertions took 1.90634799003601 seconds.
20000 array insertions took 1.99797105789185 seconds.
21000 array insertions took 2.09790301322937 seconds.
22000 array insertions took 2.20532703399658 seconds.
23000 array insertions took 2.30040597915649 seconds.
24000 array insertions took 2.40066504478455 seconds.
25000 array insertions took 2.50436305999756 seconds.
26000 array insertions took 2.59865713119507 seconds.
27000 array insertions took 2.70806193351746 seconds.
28000 array insertions took 2.79667997360229 seconds.
29000 array insertions took 2.89690399169922 seconds.
30000 array insertions took 2.99919009208679 seconds.
Array inserts:
Y = 9.94971160764818e-05 * X + 0.0141616115322365 with error 0.0014576
+9586141692
Y : total time to insert X elements.
10000 hash insertions took 2.02198815345764 seconds.
11000 hash insertions took 2.21846413612366 seconds.
12000 hash insertions took 2.43116092681885 seconds.
13000 hash insertions took 2.62970495223999 seconds.
14000 hash insertions took 2.83560991287231 seconds.
15000 hash insertions took 3.04865097999573 seconds.
16000 hash insertions took 3.23556900024414 seconds.
17000 hash insertions took 3.44259905815125 seconds.
18000 hash insertions took 3.65537810325623 seconds.
19000 hash insertions took 3.85966873168945 seconds.
20000 hash insertions took 4.09639000892639 seconds.
21000 hash insertions took 4.35650420188904 seconds.
22000 hash insertions took 4.53428721427917 seconds.
23000 hash insertions took 4.74114084243774 seconds.
24000 hash insertions took 5.05467200279236 seconds.
25000 hash insertions took 5.18268871307373 seconds.
26000 hash insertions took 5.41770219802856 seconds.
27000 hash insertions took 5.68814396858215 seconds.
28000 hash insertions took 5.90737128257751 seconds.
29000 hash insertions took 6.0479040145874 seconds.
30000 hash insertions took 6.26542806625366 seconds.
Hash inserts:
Y = 0.000215204607047044 * X + -0.176900404356259 with error 0.0075377
+646200501
Y : total time to insert X elements.
7 +-----------------------------------------------------------------
+-----+
| + + +
+ |
| 'arr2'
+A |
6 |-+ 'hash2'
+BB +-|
| B B
+ |
| B
+ |
| B B
+ |
5 |-+
+ +-|
S | B B
+ |
E | B
+ |
C4 |-+ B B
+ +-|
O | B
+ |
N | B
+ |
D3 |-+ B B
+ +-|
S | B A A
+ A |
| B B A A A
+ |
| B A A A
+ |
2 |-+ A A A
+ +-|
| A A A
+ |
| A A A + + +
+ |
1 +-----------------------------------------------------------------
+-----+
10000 15000 20000 25000
+ 30000
number of items
use strict;
use warnings;
use Benchmark qw/timeit :hireswallclock/;
my $num_repeats = 50;
my @bobjs;
my %ahash;
my @anarray;
my @results;
my $linreg;
# map { $_ * 1000 } 10..30 simulates a loop with step of 1000
# Do array inserts
@results = ();
@bobjs = ();
@anarray = ();
for my $num_inserts ( map { $_ * 1000 } 10..30 ){
# warmup
do_array_inserts($num_inserts, \@anarray); @anarray = ();
# time it
my $abobj = timeit($num_repeats,
sub { do_array_inserts($num_inserts, \@anarray); @anarray = ()
+; }
);
push @bobjs, [$num_inserts, $abobj];
push @results, [$num_inserts, $abobj->real];
print $num_inserts." array insertions took ". $abobj->real . " sec
+onds.\n"
}
$linreg = ordinary_least_squares(@results);
print "Array inserts:\n";
print "Y = ".$linreg->{A}." * X + ".$linreg->{B}." with error ".$linre
+g->{E}."\n";
print "Y : total time to insert X elements.\n";
# Do hash inserts
@results = ();
%ahash = ();
@bobjs = ();
for my $num_inserts ( map { $_ * 1000 } 10..30 ){
# warmup
do_hash_inserts($num_inserts, \%ahash); %ahash = ();
# time it
my $abobj = timeit($num_repeats,
sub { do_hash_inserts($num_inserts, \%ahash); %ahash = (); }
);
push @bobjs, [$num_inserts, $abobj];
push @results, [$num_inserts, $abobj->real];
print $num_inserts." hash insertions took ". $abobj->real . " seco
+nds.\n"
}
$linreg = ordinary_least_squares(@results);
print "Hash inserts:\n";
print "Y = ".$linreg->{A}." * X + ".$linreg->{B}." with error ".$linre
+g->{E}."\n";
print "Y : total time to insert X elements.\n";
#### end
sub do_hash_inserts {
my ($repeats, $container) = @_;
for (1..$repeats){
$container->{random_key()} = random_value();
}
}
sub do_array_inserts {
my ($repeats, $container) = @_;
for (1..$repeats){
push @$container, random_value();
}
}
sub random_key {
return join '', map { rand } 1..3
}
sub random_value {
return join '', map { rand } 1..3
}
sub ordinary_least_squares {
# each element of input array is [num_insertions, time_seconds]
my @observations = @_;
# coefficients to find assuming Y = A*X + B
my ($A, $B);
my $diff;
my $num_observations = @observations;
my $cov = 0.0;
my $var = 0.0;
my $meanx = 0.0;
my $meany = 0.0;
for my $anobs (@observations){
my ($x, $y) = @$anobs;
$meanx += $x;
$meany += $y;
}
$meanx /= $num_observations;
$meany /= $num_observations;
for my $anobs (@observations){
my ($x, $y) = @$anobs;
my $diffx = ($x - $meanx);
$cov += $diffx * ($y - $meany);
$var += $diffx * $diffx;
}
$A = $cov / $var;
$B = $meany - $A * $meanx;
my $error = 0.0;
for my $anobs (@observations){
my ($x, $y) = @$anobs;
my $ypred = $A * $x + $B;
$error += ($y - $ypred)**2
}
$error = sqrt($error) / $num_observations;
return { A => $A, B => $B, E => $error }
}
bw, bliako