#!/usr/bin/perl
use strict;
use warnings;
use B2B::BGZF::Reader;
use Digest::MD5;
use Digest::Perl::MD5;
my ($fn_compressed, $fn_uncompressed) = @ARGV;
# open filehandle to compressed data
my $fh_c = B2B::BGZF::Reader->new_filehandle( $fn_compressed )
or die "failed to open compressed file";
# open filehandle to uncompressed data
open my $fh_u, '<:raw', $fn_uncompressed
or die "failed to open uncompressed file";
# compare pure-Perl vs XS behavior
for my $class (qw/ Digest::Perl::MD5 Digest::MD5 /) {
seek $fh_c, 0, 0;
seek $fh_u, 0, 0;
my $md5_c = $class->new()->addfile($fh_c)->hexdigest;
my $md5_u = $class->new()->addfile($fh_u)->hexdigest;
print "Compressed v Uncompressed ($class):\n";
print "\t$md5_c\n";
print "\t$md5_u\n";
}
Below is a pared-down version of the package to use with the above. For the test files, I will include following that a Base64-encoded BGZF file (is there a better way to attach binary files?). Once decoded it can be used as the compressed file. Since BGZF is a backward-compatible variant of gzip, you can use gunzip to get the original uncompressed data to use for the example. Hopefully this provides a complete working example.
package B2B::BGZF::Reader;
use v5.10.1;
use strict;
use warnings;
use Carp;
use Compress::Zlib;
use IO::Uncompress::RawInflate qw/rawinflate $RawInflateError/;
use constant BGZF_MAGIC => pack "H*", '1f8b0804';
use constant HEAD_BYTES => 12; # not including extra fields
use constant FOOT_BYTES => 8;
## no critic
# allow for filehandle tie'ing
sub TIEHANDLE { B2B::BGZF::Reader::new(@_) }
sub READ { B2B::BGZF::Reader::_read(@_) }
sub READLINE { B2B::BGZF::Reader::getline(@_) }
sub SEEK { B2B::BGZF::Reader::_seek(@_) }
sub CLOSE { close $_[0]->{fh} }
sub TELL { return $_[0]->{u_offset} }
sub EOF { return $_[0]->{buffer_len} == -1 }
# accessors
sub usize { return $_[0]->{u_file_size} };
## use critic
sub new_filehandle {
#-----------------------------------------------------------------
+--------
# ARG 0 : BGZF input filename
#-----------------------------------------------------------------
+--------
# RET 0 : Filehandle GLOB (internally an IO::File object tied to
# B2B::BGZF::Reader)
#-----------------------------------------------------------------
+--------
my ($class, $fn_in) = @_;
croak "input filename required" if (! defined $fn_in);
open my $fh, '<', undef;
tie *$fh, $class, $fn_in or croak "failed to tie filehandle";
return $fh;
}
sub new {
#-----------------------------------------------------------------
+--------
# ARG 0 : BGZF input filename
#-----------------------------------------------------------------
+--------
# RET 0 : B2B::BGZF::Reader object
#-----------------------------------------------------------------
+--------
my ($class, $fn_in) = @_;
my $self = bless {}, $class;
# initialize
$self->{fn_in} = $fn_in
or croak "Input name required";
# open compressed file in binary mode
open my $fh, '<:raw', $fn_in or croak "Failed to open input file";
+ ## no critic
$self->{fh} = $fh;
# these member variables allow for data extraction
$self->{buffer} = ''; # contents of currently uncompressed
+block
$self->{buffer_len} = 0; # save time on frequent length() call
+s
$self->{block_offset} = 0; # offset of current block in compress
+ed data
$self->{buffer_offset} = 0; # offset of current pos in uncompress
+ed block
$self->{block_size} = 0; # compressed size of current block
$self->{file_size} = -s $fn_in; # size of compressed file
# these variables are tracked to allow for full seek implementatio
+n
$self->{u_offset} = 0; # calculated current uncompressed off
+set
$self->{u_file_size} = 0; # size of uncompressed file (filled d
+uring
# indexing
$self->_generate_index(); # on-the-fly
# load initial block
$self->_load_block();
return $self;
}
sub _load_block {
#-----------------------------------------------------------------
+--------
# ARG 0 : offset of block start in compressed file
#-----------------------------------------------------------------
+--------
# no returns
#-----------------------------------------------------------------
+--------
my ($self, $block_offset) = @_;
# loading a block should always reset buffer offset
$self->{buffer_offset} = 0;
# avoid reload of current block
return if (defined $block_offset
&& $block_offset == $self->{block_offset});
# if no offset given (e.g. sequential reads), move to next block
if (! defined $block_offset) {
$block_offset = $self->{block_offset} + $self->{block_size};
}
$self->{block_offset} = $block_offset;
# deal with EOF
croak "Read past file end (perhaps corrupted/truncated input?)"
if ($self->{block_offset} > $self->{file_size});
if ($self->{block_offset} == $self->{file_size}) {
$self->{buffer} = '';
$self->{buffer_len} = -1;
return;
}
# never assume we're already there
sysseek $self->{fh}, $self->{block_offset}, 0;
# parse block according to GZIP spec, including content inflation
my ($block_size, $uncompressed_size, $content)
= $self->_unpack_block(1);
$self->{block_size} = $block_size;
$self->{buffer_len} = $uncompressed_size;
$self->{buffer} = $content;
return;
}
sub _unpack_block {
#-----------------------------------------------------------------
+--------
# ARG 0 : bool indicating whether to inflate (and return) actual p
+ayload
#-----------------------------------------------------------------
+--------
# RET 0 : compressed block size
# RET 1 : uncompressed content size
# RET 2 : content (if ARG 0)
#-----------------------------------------------------------------
+--------
my ($self, $do_unpack) = @_;
my @return_values;
my ($magic, $mod, $flags, $os, $len_extra) = unpack 'A4A4CCv',
_safe_sysread($self->{fh}, HEAD_BYTES);
my $t = sysseek $self->{fh}, 0, 1;
croak "invalid header at $t (corrupt file or not BGZF?)"
if ($magic ne BGZF_MAGIC);
# extract stated block size according to BGZF spec
my $block_size;
my $l = 0;
while ($l < $len_extra) {
my ($field_id, $field_len) = unpack 'A2v',
_safe_sysread($self->{fh}, 4);
if ($field_id eq 'BC') {
croak "invalid BC length" if ($field_len != 2);
croak "multiple BC fields" if (defined $block_size);
$block_size = unpack 'v',
_safe_sysread($self->{fh} => $field_len);
$block_size += 1; # convert to 1-based
}
$l += 4 + $field_len;
}
croak "invalid extra field length" if ($l != $len_extra);
croak "failed to read block size" if (! defined $block_size);
push @return_values, $block_size;
my $payload_len = $block_size - HEAD_BYTES - FOOT_BYTES - $len_ext
+ra;
my $content;
if ($do_unpack) {
# decode actual content
my $payload = _safe_sysread($self->{fh}, $payload_len);
rawinflate( \$payload => \$content )
or croak "Error inflating: $RawInflateError\n";
my $crc_given = unpack 'V', _safe_sysread($self->{fh} => 4);
croak "content CRC32 mismatch" if ( $crc_given != crc32($conte
+nt) );
}
else { sysseek $self->{fh}, $payload_len + 4, 1; }
# unpack and possibly check uncompressed payload size
my $size_given = unpack 'V', _safe_sysread($self->{fh} => 4);
croak "content length mismatch"
if ( defined $content && $size_given != length($content) );
push @return_values, $size_given;
push @return_values, $content if (defined $content);
return @return_values;
}
sub _read {
#-----------------------------------------------------------------
+--------
# ARG 0 : buffer to write to
# ARG 1 : bytes to attempt to read
# ARG 3 : (optional) offset in buffer to start write (default: 0)
#-----------------------------------------------------------------
+--------
# RET 0 : bytes read (0 at EOF, undef on error)
#-----------------------------------------------------------------
+--------
# we try to emulate the built-in 'read' call, so we will
# overwrite $_[1] and return the number of bytes read. To facilita
+te this,
# make $buf a reference to the buffer passed
my $self = shift;
my $buf = \shift; # must be a reference !!
my $bytes = shift;
my $offset = shift;
# handle cases when $offset is passed in
my $prefix = '';
if (defined $offset && $offset != 0) {
$prefix = substr $$buf, 0, $offset;
$prefix .= "\0" x ( $offset - length($$buf) )
if ( $offset > length($$buf) );
}
$$buf = ''; # reset (only AFTER grabbing any prefix above)
ITER:
while (length($$buf) < $bytes) {
my $l = length($$buf);
my $remaining = $bytes - $l;
# if read is within current block
if ($self->{buffer_offset} + $remaining <= $self->{buffer_len}
+) {
$$buf .= substr $self->{buffer}, $self->{buffer_offset}, $
+remaining;
$self->{buffer_offset} += $remaining;
$self->_load_block()
if ($self->{buffer_offset} == $self->{buffer_len});
}
else {
last ITER if ($self->{buffer_len} < 0); #EOF
$$buf .= substr $self->{buffer}, $self->{buffer_offset};
$self->_load_block();
}
}
$$buf = $prefix . $$buf;
my $l = length($$buf);
$self->{u_offset} += $l;
return $l;
}
sub getline {
#-----------------------------------------------------------------
+--------
# No arguments
#-----------------------------------------------------------------
+--------
# RET 0 : string read (undef at EOF)
#-----------------------------------------------------------------
+--------
my ($self) = @_;
my $data;
while (1) {
# return immediately if EOF
return $data if ($self->{buffer_len} < 0);
# search current block for record separator
# start searching from the current buffer offset
pos($self->{buffer}) = $self->{buffer_offset};
if ($self->{buffer} =~ m|$/|g) {
my $pos = pos $self->{buffer};
$data .= substr $self->{buffer}, $self->{buffer_offset},
$pos - $self->{buffer_offset};
$self->{buffer_offset} = $pos;
# if we're at the end of the block, load next
$self->_load_block if ($pos == $self->{buffer_len});
$self->{u_offset} += length($data);
return $data;
}
# otherwise, add rest of block to data and load next block
$data .= substr $self->{buffer}, $self->{buffer_offset};
$self->_load_block;
}
return;
}
sub _generate_index {
#-----------------------------------------------------------------
+--------
# No arguments
#-----------------------------------------------------------------
+--------
# No returns
#-----------------------------------------------------------------
+--------
my ($self) = @_;
my $uncmp_offset = 0;
my $cmp_offset = 0;
my $i = 0;
$self->{u_file_size} = 0;
$self->{idx} = [];
$self->{ridx} = {};
sysseek $self->{fh}, 0, 0;
while ($cmp_offset < $self->{file_size}) {
push @{$self->{idx}}, [$cmp_offset, $uncmp_offset];
$self->{ridx}->{$cmp_offset} = $uncmp_offset;
my ($block_size, $uncompressed_size) = $self->_unpack_block(0)
+;
$cmp_offset += $block_size;
$uncmp_offset += $uncompressed_size;
$self->{u_file_size} += $uncompressed_size;
}
sysseek $self->{fh}, $self->{block_offset}, 0;
return;
}
sub _seek {
#-----------------------------------------------------------------
+--------
# ARG 0 : byte offset to which to seek
# ARG 1 : relativity of offset (0: file start, 1: current, 2: file
+ end)
#-----------------------------------------------------------------
+--------
# no returns
#-----------------------------------------------------------------
+--------
my ($self, $pos, $whence) = @_;
$pos += $self->{u_offset} if ($whence == 1);
$pos = $self->{u_file_size} + $pos if ($whence == 2);
return if ($pos < 0 || $pos >= $self->{u_file_size});
# Do seeded search for nearest block start >= $pos
# (although we don't know the size of each block, we can determine
+ the
# mean length and usually choose a starting value close to the act
+ual -
# benchmarks much faster than binary search)
# TODO: benchmark whether breaking this out and Memoizing speeds t
+hings up
my $s = scalar( @{$self->{idx}} );
my $idx = int($pos/($self->{u_file_size}) * $s);
while (1) {
if ($pos < $self->{idx}->[$idx]->[1]) {
--$idx;
next;
}
if ($idx+1 < $s && $pos >= $self->{idx}->[$idx+1]->[1]) {
++$idx;
next;
}
last;
}
my $block_o = $self->{idx}->[$idx]->[0];
my $block_o_u = $self->{idx}->[$idx]->[1];
my $buff_o = $pos - $block_o_u;
$self->_load_block( $block_o );
$self->{buffer_offset} = $buff_o;
$self->{u_offset} = $block_o_u + $buff_o;
return 1;
}
sub _safe_sysread {
# sysread wrapper that checks return count and returns read
# (internally we should never read off end of file - doing so indi
+cates
# either a software bug or a corrupt input file so we croak)
#-----------------------------------------------------------------
+--------
# ARG 0 : bytes to read
#-----------------------------------------------------------------
+--------
# RET 0 : string read
#-----------------------------------------------------------------
+--------
my ($fh, $len) = @_;
my $buf = '';
my $r = sysread $fh, $buf, $len;
croak "returned unexpected byte count" if ($r != $len);
return $buf;
}
1;
And the Base64-encoded BGZF:
H4sIBAAAAAAA/wYAQkMCAKsBXZJLbtxADET3PgVPoDsYQXb5ADGcPafFGRPo3zRJI8cP2S
+1P5CwE
SS2qWPXIX9RtZ6w7Euwtt8EXE3hv2bqiUgEaow3AlEywKluBq90QOg3pnBiVZYOvCldMnF
+mg8htn
oMpl6VG5G23wxeIOhavLC+1AfxJ1tcFHh1PLnjER6vb09HPnBpUksVWFajkj3A2942/KSx
+9Q3EtF
lwISL2p1g+94qwyDpumI5j/RQIW0bPTmPQ9/Am+cgE20bfBsCixKU0robuwuXuIOcS37qd
+X4tFo+
YtSQxuJ+xAp51wnKJR/v7vNKwx8dU0GevacHK+cUgx4Hq+biMDwDZb7Fzxu8OocCXM/QtK
+mfhcnJ
fpAPZ7VKbXjeGNMP1xdAi/rHvILXN7xMF+9OlW0cqxA17vdIPc9nF890WofirCO280LqtK
+gBuolj
tE7wVaHj4CPefxsX02kOkkZiz8Ktur1kueMn0jsnxY/RtlLd+z/4Z7QHrrWIwlU38D3STy
+sboGSO
aylhwI5kL7FoTsi3LhblOI9y3P4CBU6J0CwDAAAfiwgEAAAAAAD/BgBCQwIAGwADAAAAAA
+AAAAAA