#
# A test program to compare the various line count methods under Perl. It turns
# out that each of them may have its own specific issues.
#
# The script creates a large test file (if it is not already present), then
# times three ways of counting its lines:
#
#   test_while_dot       read line by line and take the count from $.
#   test_while_variable  read line by line and increment a counter
#   test_block_read      read fixed-size blocks and count newline bytes
#
# Timings are whole seconds, as returned by time().
#

use strict;
use warnings;

my $test_file="large_test_file.txt";    # change name to test size/length differences

#
# progress related code
#

#
# quick_touch FILE
#
# Creates FILE if it does not exist; an existing file is opened for append and
# closed again, so its content is left untouched. Only needed by the
# (commented out) tag-file variant of progress_message. A failure is reported
# with warn() rather than die() so that progress reporting never aborts a run.
#
sub quick_touch($)
{
    my $path=$_[0];

    # append mode: will create if file does not exist but not destroy existing
    if (open(my $tch,'>>',$path))
    {
        close($tch);
    }
    else
    {
        warn "cannot touch $path: $!\n";
    }
    return;
}

$|=1;    # unbuffered STDOUT so we can provide screen feedback

my $last_progress_file;
my $progress_interval=3;      # seconds between progress messages
my $next_progress_time=0;     # time greater than this requires feedback - setting to 0 will feedback immediately

#
# progress_message LABEL, VALUE
#
# Prints "LABEL VALUE" followed by a carriage return (so the next message
# overwrites it), but at most once every $progress_interval seconds. Calls
# made before the next scheduled time return without printing.
#
sub progress_message($$)
{
    my ($label,$value)=@_;

    my $now=time();
    return if $now<$next_progress_time;
    $next_progress_time=$now+$progress_interval;

    print "$label $value\r";

    #
    # to touch a tag file instead of using screen feedback
    #
    #unlink $last_progress_file if defined $last_progress_file && $last_progress_file ne "";
    #$last_progress_file="$label.$value";
    #quick_touch $last_progress_file;
    return;
}

#
# create the test file
#
# Does nothing when $test_file already exists. Otherwise writes the file one
# line at a time, showing progress, and dies if the file cannot be created or
# written completely.
#
sub create_test_file
{
    return if -e $test_file;    # we do not want to create if it already exists....

    open(my $tout,'>',$test_file) or die "cannot create $test_file: $!\n";

    #
    # write a file of a specified number of lines. want to have enough to
    # defeat the cache and/or take long enough to minimize external random
    # effects.
    #
    # 10 million lines is just over 1GB with this for output:
    #
    # "qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890\n"
    #
    my $text="qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890\n";

    #for (my $line=0;$line<200_000_000;$line++)
    for (my $line=0;$line<50_000_000;$line++)
    {
        print {$tout} $text or die "write to $test_file failed: $!\n";
        progress_message "create_test_file",$line;    # show progress when writing since it may be taking a while
    }

    # buffered write errors (e.g. disk full) only surface at close
    close($tout) or die "close of $test_file failed: $!\n";
    return;
}

####sub create_test_file_short_line
####{
####    return if -e $test_file;    # we do not want to create if it already exists....
####    open TOUT,">$test_file";
####    #
####    # write a file of a specified number of lines. 100 million lines is about 600MB.
####    # shorter lines may impact various things. it may be better to write many short
####    # lines in a single print to save time when writing the file.
####    # it can be very slow doing one line at a time so it may make sense to write several
####    # qwert\n strings in a single line (say 10) to speed up the write process.
####    #
####    for (my $line=0;$line<1_000_000_000;$line++)
####    {
####        print TOUT "qwert\n";
####        progress_message "create_test_file",$line;
####    }
####    close TOUT;
####}

#
# test_while_dot
#
# Reads $test_file line by line and returns the line count taken from $.
# (the input line number of the last-read filehandle). Dies if the file
# cannot be opened.
#
sub test_while_dot
{
    open(my $tin,'<',$test_file) or die "cannot open $test_file: $!\n";

    local $_;    # the bare readline below assigns to $_; do not clobber the caller's
    while (<$tin>)
    {
        ####progress_message "test_while_dot",$.;
    }

    # $. must be captured before close(): closing the handle resets it
    my $linecount=$.;
    close($tin);

    return $linecount;
}

#
# test_while_variable
#
# Reads $test_file line by line, incrementing a counter for every line read,
# and returns that counter. Dies if the file cannot be opened.
#
sub test_while_variable
{
    my $linecount=0;

    open(my $tin,'<',$test_file) or die "cannot open $test_file: $!\n";

    local $_;    # the bare readline below assigns to $_; do not clobber the caller's
    while (<$tin>)
    {
        $linecount++;
        ####progress_message "test_while_variable",$linecount;
    }
    close($tin);

    return $linecount;
}

#
# test_block_read BLOCK_SIZE
#
# Reads $test_file in binary mode in blocks of BLOCK_SIZE bytes and counts
# the newline (\012) bytes in each block. A final line that is not terminated
# by a newline is counted as well, so the result agrees with the line-by-line
# methods above. Returns the line count. Dies if the file cannot be opened or
# a read fails.
#
sub test_block_read($)
{
    my $block_size=$_[0];

    open(my $tin,'<',$test_file) or die "cannot open $test_file: $!\n";
    binmode $tin;

    my $data;
    my $newlinecount=0;
    my $last_char="\012";    # an empty file has no unterminated last line

    while (1)
    {
        my $got=read($tin,$data,$block_size);
        die "read error on $test_file: $!\n" unless defined $got;    # undef means error, 0 means EOF
        last if $got==0;

        $newlinecount+=($data =~ tr/\012//);    # tr with empty replacement only counts
        $last_char=substr($data,-1);
        ####progress_message "test_block_read",$newlinecount;
    }
    close($tin);

    # count a trailing line that has no newline terminator
    $newlinecount++ if $last_char ne "\012";

    return $newlinecount;    # return the line count
}

#
# calling these routines requires loading entire file into memory which crashes
# Linux 32 bit on large files. problem occurs with both for and foreach.
#
###sub test_foreach_dot
###{
###    open TIN,"<$test_file";
###    for (<TIN>)
###    {
###        ####progress_message "test_foreach_dot $.",$.;
###    }
###    close TIN;
###    return $.;
###}
###sub test_foreach_variable
###{
###    my $linecount=0;
###    open TIN,"<$test_file";
###    for (<TIN>)
###    {
###        $linecount++;
###        ####progress_message "test_foreach_variable $.",$linecount;
###    }
###    close TIN;
###    return $linecount;
###}

#
# do the test
#
my $start_time;
my $delta_time;

$start_time=time();
create_test_file();    # create the test file if not already present
$delta_time=time()-$start_time;
print "\ncreate_test_file: $delta_time\n";

$start_time=time();
test_while_dot();
$delta_time=time()-$start_time;
print "\ntest_while_dot: $delta_time\n";

$start_time=time();
test_while_variable();
$delta_time=time()-$start_time;
print "\ntest_while_variable: $delta_time\n";

foreach my $block_size (4096,256*1024,1*1024*1024,4*1024*1024,16*1024*1024)
{
    $start_time=time();
    test_block_read($block_size);
    $delta_time=time()-$start_time;

    my $bsizek=$block_size/1024;
    print "\ntest_block_read ${bsizek}K: $delta_time\n";
}