Re: Count number of lines in a text file

by Anonymous Monk
on Aug 18, 2014 at 21:57 UTC


in reply to Count number of lines in a text file

If you are counting lines for some kind of "production" process, you probably want to avoid most of the "tricks" mentioned here, since they are likely to be harder to maintain. Any variation of for or foreach processing should also be avoided, because slurping the whole file into a list can run you out of memory on a large file. I also tend to avoid $., because I have run into cases where its value is not correct when I am working with multiple files, so I maintain my own count.
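For what it is worth, a minimal sketch of the "maintain my own count" approach with a plain while loop might look like the following (the file names are only placeholders; the foreach here iterates over file names, not file contents, so it does not slurp anything):

    use strict;
    use warnings;

    my $total_lines = 0;
    foreach my $file ("first.txt", "second.txt")    # placeholder file names
    {
        open my $fh, "<", $file or die "cannot open $file: $!";
        my $linecount = 0;
        $linecount++ while <$fh>;       # count lines myself instead of relying on $.
        close $fh;
        print "$file: $linecount\n";
        $total_lines += $linecount;
    }
    print "total: $total_lines\n";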

As noted in a previous comment, there is a problem with sysread resulting in an incorrect count (a fix has since been posted in a separate comment).

The best generic method is probably a simple while loop, but if you are processing large files and need to provide some kind of progress indication, you may find that a while loop over sysread or read is a better choice. The program below can be used to test several variations on a file in a single run (edit as needed for file size and/or progress).

    #
    # a test program to test the various line count methods under Perl.  it turns
    # out that they each may have their own specific issues.
    #
    use strict;

    my $test_file="large_test_file.txt";    # change name to test size/length differences

    #
    # progress related code
    #
    sub quick_touch($)
    {
        open TCH,">>$_[0]";     # will create if file does not exist but not destroy existing
        close TCH;
    }

    $|=1;       # so we can provide screen feedback

    my $last_progress_file;
    my $progress_interval=3;
    my $next_progress_time=0;   # time greater than this requires feedback - setting to 0 will feedback immediately

    sub progress_message($$)
    {
        return if time()<$next_progress_time;
        $next_progress_time=time()+$progress_interval;
        print "$_[0] $_[1]\r";
        #
        # to touch a tag file instead of using screen feedback
        #
        #unlink $last_progress_file if defined $last_progress_file && $last_progress_file ne "";
        #$last_progress_file="$_[0].$_[1]";
        #quick_touch $tag_file;
    }

    #
    # create the test file
    #
    sub create_test_file
    {
        return if -e $test_file;        # we do not want to create if it already exists....
        open TOUT,">$test_file";
        #
        # write a file of a specified number of lines.  want to have enough to
        # defeat the cache and/or take long enough to minimize external random
        # effects.
        #
        # 10 million lines is just over 1GB with this for output:
        #
        #   "qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890\n"
        #
        #for (my $line=0;$line<200_000_000;$line++)
        for (my $line=0;$line<50_000_000;$line++)
        {
            print TOUT "qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890qwertyuiopasdfghjklzxcvbnm1234567890\n";
            progress_message "create_test_file",$line;      # show progress when writing since it may be taking a while
        }
        close TOUT;
    }

    ####sub create_test_file_short_line
    ####{
    ####    return if -e $test_file;        # we do not want to create if it already exists....
    ####    open TOUT,">$test_file";
    ####    #
    ####    # write a file of a specified number of lines.  100 million lines is about 600MB.
    ####    # shorter lines may impact various things.  it may be better to write many short
    ####    # lines in a single print to save time when writing the file.
    ####    # it can be very slow doing one line at a time so it may make sense to write several
    ####    # qwert\n strings in a single line (say 10) to speed up the write process.
    ####    #
    ####    for (my $line=0;$line<1_000_000_000;$line++)
    ####    {
    ####        print TOUT "qwert\n";
    ####        progress_message "create_test_file",$line;
    ####    }
    ####    close TOUT;
    ####}

    sub test_while_dot
    {
        open TIN,"<$test_file";
        while (<TIN>)
        {
            ####progress_message "test_while_dot",$.;
        }
        close TIN;
        return $.;
    }

    sub test_while_variable
    {
        my $linecount=0;
        open TIN,"<$test_file";
        while (<TIN>)
        {
            $linecount++;
            ####progress_message "test_while_variable",$linecount;
        }
        close TIN;
        return $linecount;
    }

    sub test_block_read($)
    {
        my $block_size=$_[0];
        open TIN,"<$test_file";
        binmode TIN;
        my ($data, $n);
        my $newlinecount=0;
        while ((read TIN, $data, $_[0]) != 0)
        {
            $newlinecount+=($data =~ tr/\012//);
            ####progress_message "test_block_read",$newlinecount;
        }
        close(TIN);
        return $newlinecount;   # return the line count
    }

    #
    # calling these routines requires loading entire file into memory which crashes
    # Linux 32 bit on large files.  problem occurs with both for and foreach.
    #
    ###sub test_foreach_dot
    ###{
    ###    open TIN,"<$test_file";
    ###    for (<TIN>)
    ###    {
    ###        ####progress_message "test_foreach_dot $.",$.;
    ###    }
    ###    close TIN;
    ###    return $.;
    ###}

    ###sub test_foreach_variable
    ###{
    ###    my $linecount=0;
    ###    open TIN,"<$test_file";
    ###    for (<TIN>)
    ###    {
    ###        $linecount++;
    ###        ####progress_message "test_foreach_variable $.",$linecount;
    ###    }
    ###    close TIN;
    ###    return $linecount;
    ###}

    #
    # do the test
    #
    my $start_time;
    my $delta_time;

    $start_time=time();
    create_test_file;   # create the test file if not already present
    $delta_time=time()-$start_time;
    print "\ncreate_test_file: $delta_time\n";

    $start_time=time();
    test_while_dot;
    $delta_time=time()-$start_time;
    print "\ntest_while_dot: $delta_time\n";

    $start_time=time();
    test_while_variable;
    $delta_time=time()-$start_time;
    print "\ntest_while_variable: $delta_time\n";

    foreach my $block_size (4096,256*1024,1*1024*1024,4*1024*1024,16*1024*1024)
    {
        $start_time=time();
        test_block_read $block_size;
        $delta_time=time()-$start_time;
        my $bsizek=$block_size/1024;
        print "\ntest_block_read ${bsizek}K: $delta_time\n";
    }
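If you also want to time the unbuffered sysread variation mentioned above, a sketch along the same lines as test_block_read could be dropped into the program (test_sysread_block is not part of the original; counting newlines with tr, block by block, keeps the count independent of where line boundaries fall within a block):

    sub test_sysread_block($)
    {
        my $block_size=$_[0];
        open TIN,"<$test_file";
        binmode TIN;
        my $data;
        my $newlinecount=0;
        while ((sysread TIN,$data,$block_size) > 0)     # unbuffered read, one block at a time
        {
            $newlinecount+=($data =~ tr/\012//);        # count the newlines in this block
            ####progress_message "test_sysread_block",$newlinecount;
        }
        close(TIN);
        return $newlinecount;
    }

It can be timed with the same foreach block-size loop used for test_block_read.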
