Re^2: Digest::MD5 addfile() w/ tied filehandle

Here is a MWE:

#!/usr/bin/perl

use strict;
use warnings;

use B2B::BGZF::Reader;
use Digest::MD5;
use Digest::Perl::MD5;

my ($fn_compressed, $fn_uncompressed) = @ARGV;

# open filehandle to compressed data
my $fh_c = B2B::BGZF::Reader->new_filehandle( $fn_compressed )
    or die "failed to open compressed file";

# open filehandle to uncompressed data
open my $fh_u, '<:raw', $fn_uncompressed
    or die "failed to open uncompressed file";

# compare pure-Perl vs XS behavior
for my $class (qw/ Digest::Perl::MD5 Digest::MD5 /) {
    seek $fh_c, 0, 0;
    seek $fh_u, 0, 0;
    my $md5_c = $class->new()->addfile($fh_c)->hexdigest;
    my $md5_u = $class->new()->addfile($fh_u)->hexdigest;
    print "Compressed v Uncompressed ($class):\n";
    print   "\t$md5_c\n";
    print   "\t$md5_u\n";
}
[download]

Below is a pared-down version of the package to use with the above. For the test files, I will include following that a Base64-encoded BGZF file (is there a better way to attach binary files?). Once decoded it can be used as the compressed file. Since BGZF is a backward-compatible variant of gzip, you can use gunzip to get the original uncompressed data to use for the example. Hopefully this provides a complete working example.

package B2B::BGZF::Reader;

use v5.10.1;
use strict;
use warnings;

use Carp;
use Compress::Zlib;
use IO::Uncompress::RawInflate qw/rawinflate $RawInflateError/;

use constant BGZF_MAGIC => pack "H*", '1f8b0804';
use constant HEAD_BYTES => 12; # not including extra fields
use constant FOOT_BYTES => 8;

## no critic
# allow for filehandle tie'ing
sub TIEHANDLE { B2B::BGZF::Reader::new(@_) }
sub READ      { B2B::BGZF::Reader::_read(@_) }
sub READLINE  { B2B::BGZF::Reader::getline(@_) }
sub SEEK      { B2B::BGZF::Reader::_seek(@_) }
sub CLOSE     { close $_[0]->{fh} }
sub TELL      { return $_[0]->{u_offset} }
sub EOF       { return $_[0]->{buffer_len} == -1 }

# accessors
sub usize { return $_[0]->{u_file_size} };

## use critic

sub new_filehandle {

    #-----------------------------------------------------------------
+--------
    # ARG 0 : BGZF input filename
    #-----------------------------------------------------------------
+--------
    # RET 0 : Filehandle GLOB (internally an IO::File object tied to
    #         B2B::BGZF::Reader)
    #-----------------------------------------------------------------
+--------

    my ($class, $fn_in) = @_;
    croak "input filename required" if (! defined $fn_in);
    open my $fh, '<', undef;
    tie *$fh, $class, $fn_in or croak "failed to tie filehandle";

    return $fh;

}

sub new {

    #-----------------------------------------------------------------
+--------
    # ARG 0 : BGZF input filename
    #-----------------------------------------------------------------
+--------
    # RET 0 : B2B::BGZF::Reader object
    #-----------------------------------------------------------------
+--------
    
    my ($class, $fn_in) = @_;
    my $self = bless {}, $class;

    # initialize
    $self->{fn_in} = $fn_in
        or croak "Input name required";

    # open compressed file in binary mode
    open my $fh, '<:raw', $fn_in or croak "Failed to open input file";
+ ## no critic
    $self->{fh} = $fh;

    # these member variables allow for data extraction
    $self->{buffer}        = ''; # contents of currently uncompressed 
+block
    $self->{buffer_len}    = 0;  # save time on frequent length() call
+s
    $self->{block_offset}  = 0;  # offset of current block in compress
+ed data
    $self->{buffer_offset} = 0;  # offset of current pos in uncompress
+ed block
    $self->{block_size}    = 0;  # compressed size of current block
    $self->{file_size}     = -s $fn_in; # size of compressed file

    # these variables are tracked to allow for full seek implementatio
+n
    $self->{u_offset}      = 0;  # calculated current uncompressed off
+set 
    $self->{u_file_size}   = 0;  # size of uncompressed file (filled d
+uring
                                 #  indexing

    $self->_generate_index(); # on-the-fly

    # load initial block
    $self->_load_block();

    return $self;

}

sub _load_block {
    
    #-----------------------------------------------------------------
+--------
    # ARG 0 : offset of block start in compressed file
    #-----------------------------------------------------------------
+--------
    # no returns
    #-----------------------------------------------------------------
+--------

    my ($self, $block_offset) = @_;

    # loading a block should always reset buffer offset
    $self->{buffer_offset} = 0;

    # avoid reload of current block
    return if (defined $block_offset
        && $block_offset == $self->{block_offset});


    # if no offset given (e.g. sequential reads), move to next block
    if (! defined $block_offset) {
        $block_offset = $self->{block_offset} + $self->{block_size};
    }

    $self->{block_offset} = $block_offset;

    # deal with EOF
    croak "Read past file end (perhaps corrupted/truncated input?)"
        if ($self->{block_offset} > $self->{file_size});
    if ($self->{block_offset} == $self->{file_size}) {
        $self->{buffer} = '';
        $self->{buffer_len} = -1;
        return;
    }

    # never assume we're already there
    sysseek $self->{fh}, $self->{block_offset}, 0;

    # parse block according to GZIP spec, including content inflation
    my ($block_size, $uncompressed_size, $content)
        = $self->_unpack_block(1);

    $self->{block_size} = $block_size;
    $self->{buffer_len} = $uncompressed_size;
    $self->{buffer}     = $content;

    return;

}

sub _unpack_block {

    #-----------------------------------------------------------------
+--------
    # ARG 0 : bool indicating whether to inflate (and return) actual p
+ayload
    #-----------------------------------------------------------------
+--------
    # RET 0 : compressed block size
    # RET 1 : uncompressed content size
    # RET 2 : content (if ARG 0)
    #-----------------------------------------------------------------
+--------

    my ($self, $do_unpack) = @_;

    my @return_values;


    my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv',
        _safe_sysread($self->{fh}, HEAD_BYTES);
    my $t = sysseek $self->{fh}, 0, 1;
    croak "invalid header at $t (corrupt file or not BGZF?)"
        if ($magic ne BGZF_MAGIC);

    # extract stated block size according to BGZF spec
    my $block_size;
    my $l = 0;
    while ($l < $len_extra) {
        my ($field_id, $field_len) = unpack 'A2v',
            _safe_sysread($self->{fh}, 4);
        if ($field_id eq 'BC') {
            croak "invalid BC length" if ($field_len != 2);
            croak "multiple BC fields" if (defined $block_size);
            $block_size = unpack 'v',
                _safe_sysread($self->{fh} => $field_len);
            $block_size += 1; # convert to 1-based
        }
        $l += 4 + $field_len;
    }
    croak "invalid extra field length" if ($l != $len_extra);
    croak "failed to read block size" if (! defined $block_size);

    push @return_values, $block_size;
    my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_ext
+ra;
    my $content;
    if ($do_unpack) {
        
        # decode actual content
        my $payload = _safe_sysread($self->{fh}, $payload_len);
        rawinflate( \$payload => \$content )
            or croak "Error inflating: $RawInflateError\n";
        my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4);
        croak "content CRC32 mismatch" if ( $crc_given != crc32($conte
+nt) );

    }
    else { sysseek $self->{fh}, $payload_len + 4, 1; }

    # unpack and possibly check uncompressed payload size
    my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4);
    croak "content length mismatch"
        if ( defined $content && $size_given != length($content) );
    push @return_values, $size_given;
    push @return_values, $content if (defined $content);

    return @return_values;

}

sub _read {

    #-----------------------------------------------------------------
+--------
    # ARG 0 : buffer to write to
    # ARG 1 : bytes to attempt to read
    # ARG 3 : (optional) offset in buffer to start write (default: 0)
    #-----------------------------------------------------------------
+--------
    # RET 0 : bytes read (0 at EOF, undef on error)
    #-----------------------------------------------------------------
+--------

    # we try to emulate the built-in 'read' call, so we will
    # overwrite $_[1] and return the number of bytes read. To facilita
+te this,
    # make $buf a reference to the buffer passed
    my $self = shift;
    my $buf = \shift; # must be a reference !!
    my $bytes = shift;
    my $offset = shift;

    # handle cases when $offset is passed in
    my $prefix = '';
    if (defined $offset && $offset != 0) {
        $prefix = substr $$buf, 0, $offset;
        $prefix .= "\0" x ( $offset - length($$buf) )
            if ( $offset > length($$buf) );
    }

    $$buf = ''; # reset (only AFTER grabbing any prefix above)

    ITER:
    while (length($$buf) < $bytes) {

        my $l = length($$buf);
        my $remaining = $bytes - $l;

        # if read is within current block
        if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}
+) {
            $$buf .= substr $self->{buffer}, $self->{buffer_offset}, $
+remaining;
            $self->{buffer_offset} += $remaining;
            $self->_load_block()
                if ($self->{buffer_offset} == $self->{buffer_len});
        }
        else {
            last ITER if ($self->{buffer_len} < 0); #EOF
            $$buf .= substr $self->{buffer}, $self->{buffer_offset};
            $self->_load_block();
        }

    }

    $$buf = $prefix . $$buf;
    my $l = length($$buf);
    $self->{u_offset} += $l;

    return $l;

}


sub getline {

    #-----------------------------------------------------------------
+--------
    # No arguments
    #-----------------------------------------------------------------
+--------
    # RET 0 : string read (undef at EOF)
    #-----------------------------------------------------------------
+--------

    my ($self) = @_;

    my $data;

    while (1) {

        # return immediately if EOF
        return $data if ($self->{buffer_len} < 0);

        # search current block for record separator
        # start searching from the current buffer offset
        pos($self->{buffer}) = $self->{buffer_offset};

        if ($self->{buffer} =~ m|$/|g) {
            my $pos = pos $self->{buffer};
            $data .= substr $self->{buffer}, $self->{buffer_offset},
                $pos - $self->{buffer_offset};

            $self->{buffer_offset} = $pos;

            # if we're at the end of the block, load next
            $self->_load_block if ($pos == $self->{buffer_len});

            $self->{u_offset} += length($data);
            return $data;

        }

        # otherwise, add rest of block to data and load next block
        $data .= substr $self->{buffer}, $self->{buffer_offset};
        $self->_load_block;

    }

    return;

}

sub _generate_index {

    #-----------------------------------------------------------------
+--------
    # No arguments
    #-----------------------------------------------------------------
+--------
    # No returns
    #-----------------------------------------------------------------
+--------

    my ($self) = @_;

    my $uncmp_offset     = 0;
    my $cmp_offset       = 0;
    my $i                = 0;
    $self->{u_file_size} = 0;
    $self->{idx}         = [];
    $self->{ridx}        = {};

    sysseek $self->{fh}, 0, 0;

    while ($cmp_offset < $self->{file_size}) {

        push @{$self->{idx}}, [$cmp_offset, $uncmp_offset];
        $self->{ridx}->{$cmp_offset} = $uncmp_offset;

        my ($block_size, $uncompressed_size) = $self->_unpack_block(0)
+;

        $cmp_offset += $block_size;
        $uncmp_offset += $uncompressed_size;
        $self->{u_file_size} += $uncompressed_size;

    }

    sysseek $self->{fh}, $self->{block_offset}, 0;

    return;

}


sub _seek {

    #-----------------------------------------------------------------
+--------
    # ARG 0 : byte offset to which to seek
    # ARG 1 : relativity of offset (0: file start, 1: current, 2: file
+ end)
    #-----------------------------------------------------------------
+--------
    # no returns
    #-----------------------------------------------------------------
+--------

    my ($self, $pos, $whence) = @_;

    $pos += $self->{u_offset} if ($whence == 1);
    $pos  = $self->{u_file_size} + $pos if ($whence == 2);

    return if ($pos < 0 || $pos >= $self->{u_file_size});

    # Do seeded search for nearest block start >= $pos
    # (although we don't know the size of each block, we can determine
+ the
    # mean length and usually choose a starting value close to the act
+ual -
    # benchmarks much faster than binary search)
    # TODO: benchmark whether breaking this out and Memoizing speeds t
+hings up

    my $s = scalar( @{$self->{idx}} );
    my $idx = int($pos/($self->{u_file_size}) * $s);
    while (1) {
        if ($pos < $self->{idx}->[$idx]->[1]) {
            --$idx;
            next;
        }
        if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) {
            ++$idx;
            next;
        }
        last;
    }

    my $block_o   = $self->{idx}->[$idx]->[0];
    my $block_o_u = $self->{idx}->[$idx]->[1];
    my $buff_o    = $pos - $block_o_u;

    $self->_load_block( $block_o );
    $self->{buffer_offset} = $buff_o;
    $self->{u_offset} = $block_o_u + $buff_o;

    return 1;

}

sub _safe_sysread {

    # sysread wrapper that checks return count and returns read
    # (internally we should never read off end of file - doing so indi
+cates
    # either a software bug or a corrupt input file so we croak)

    #-----------------------------------------------------------------
+--------
    # ARG 0 : bytes to read
    #-----------------------------------------------------------------
+--------
    # RET 0 : string read
    #-----------------------------------------------------------------
+--------

    my ($fh, $len) = @_;
    my $buf = '';
    my $r = sysread $fh, $buf, $len;
    croak "returned unexpected byte count" if ($r != $len);

    return $buf;

}

1;
[download]

And the Base64-encoded BGZF:

H4sIBAAAAAAA/wYAQkMCAKsBXZJLbtxADET3PgVPoDsYQXb5ADGcPafFGRPo3zRJI8cP2S
+1P5CwE
SS2qWPXIX9RtZ6w7Euwtt8EXE3hv2bqiUgEaow3AlEywKluBq90QOg3pnBiVZYOvCldMnF
+mg8htn
oMpl6VG5G23wxeIOhavLC+1AfxJ1tcFHh1PLnjER6vb09HPnBpUksVWFajkj3A2942/KSx
+9Q3EtF
lwISL2p1g+94qwyDpumI5j/RQIW0bPTmPQ9/Am+cgE20bfBsCixKU0robuwuXuIOcS37qd
+X4tFo+
YtSQxuJ+xAp51wnKJR/v7vNKwx8dU0GevacHK+cUgx4Hq+biMDwDZb7Fzxu8OocCXM/QtK
+mfhcnJ
fpAPZ7VKbXjeGNMP1xdAi/rHvILXN7xMF+9OlW0cqxA17vdIPc9nF890WofirCO280LqtK
+gBuolj
tE7wVaHj4CPefxsX02kOkkZiz8Ktur1kueMn0jsnxY/RtlLd+z/4Z7QHrrWIwlU38D3STy
+sboGSO
aylhwI5kL7FoTsi3LhblOI9y3P4CBU6J0CwDAAAfiwgEAAAAAAD/BgBCQwIAGwADAAAAAA
+AAAAAA
[download]

Comment on Re^2: Digest::MD5 addfile() w/ tied filehandle Select or Download Code


good chemistry is complicated, and a little bit messy -LW
	PerlMonks