Beefy Boxes and Bandwidth Generously Provided by pair Networks
Problems? Is your data what you think it is?

LZW Demystified (the code)

by demerphq (Chancellor)
on Jun 29, 2003 at 20:44 UTC ( #270020=note: print w/replies, xml ) Need Help??

in reply to LZW Demystified

And heres the code. It may be a bit scruffy right now. Theres lots of $DEBUG stuff going on that I normally would remove, except that its no fun to play with if you cant see what happening under the hood! $DEBUG=1 shows lots of diagnostics. $DEBUG>1 shows even more. If run directly as a script it will compress and expand itself under $DEBUG circumstances. Otherwise it should be usable now. I will add POD later, but for now I'll leave it be.

Note that once the output code bug is fixed changing the magic number output should make it compress compatible. Currently it is not however.

package LZW; use strict; use warnings; use Data::Dumper; use constant MAGIC_1=>0x2f; # magic number 1 (Set to 0x1f when comp +ress compatible) use constant MAGIC_2=>0x9d; # compress magic number 2 use constant MAGIC=>pack 'C*',MAGIC_1,MAGIC_2; use constant BIT_MASK=>0x1f; # bit_mask for third byte to get # the max bits in the file. use constant BLOCK_MODE=>0x80; # bit_mask for whether we are # compress 2.0 compat (we arent) use constant WIPE=>256; # code to cause a string table dump use constant INIT_BITS=>9; # number of bits for the first emitted +code use constant MAX_BITS=>12; # maximum bits to represent. this means # we can have 2**MAX_BITS codes and cor +responding # hash table entries. use constant CODE_EOF=>2**MAX_BITS+1; # we use this to signal EOF. $|++; our $DEBUG=0; # used for debugging sub qquote { my $val=shift; return 'undef' unless defined $val; return qq("") unless length $val; my @parts=$val=~m/\G(.{1,80})/sg; #warn scalar @parts,"\n"; return (@parts>1?"\n":"").join(".\n", map { Data::Dumper::qquote($_) } @parts); } BEGIN { $Data::Dumper::Useqq=1; my @chars=map { chr ($_) } 0..255; sub _init { my $self=shift; print STDERR "_init()\n" if $DEBUG; my %default=( # index is the codes id, value is the string codes => [ @chars,'' ], # lookup from string to id strs => { map { $_ => ord($_) } @chars[0..255] }, # number of bits to output per code code_bits => INIT_BITS, # (2**code_bits) When the number of codes equals # this we add a bit to the code length threshold => (2**INIT_BITS), ); $self->{$_}=$default{$_} foreach keys %default; return $self; } sub init { my $self=shift; %$self=( # (2**n-1) Size of buffer holding chars representing bits. buf_max => 1023, # user supplied attributes go above here %$self, # per object initializers go below here # string representing the bitstream of the compressed file bit_buf => "", in => undef, # in filehandle out => undef, # out filehandle rbits =>0, # bits read wbits =>0, # bits written ); $self->_init(); } } sub new { my $class=shift; my $self=bless {@_},$class; return $self->init() } sub _open_files { my ($self,$infile,$outfile)=@_; if (-e $outfile) { if ($self->{overwrite}) { warn "Overwriting '$outfile'\n"; } else { Carp::croak("Won't overwrite '$outfile' unless". " overwrite option set\n"); } } open my $in,"<",$infile or die "Reading '$infile' : $!"; open my $out,">",$outfile or die "Writing '$outfile' : $!"; binmode $in; binmode $out; return ($in,$out); } sub expand_file { my ($self,$infile,$outfile)=@_; $infile or die "Must have a file to expand!"; unless ($outfile) { $outfile=$infile; $outfile=~s/\.plzw\z// or $outfile.=".out"; } warn "Expanding '$infile' to '$outfile'\n"; $self->expand_fh($self->_open_files($infile,$outfile)); return $outfile; } sub compress_file { my ($self,$infile,$outfile)=@_; $outfile||= "$infile.plzw"; warn "Compressing '$infile' to '$outfile'\n"; $self->compress_fh($self->_open_files($infile,$outfile)); return $outfile; } sub compress_fh { my ($self,$in,$out)=@_; $self->init() if $self->{in} or $self->{out}; @{$self}{qw(in out)}=($in,$out); my $string; my $buf=""; my $codes=$self->{codes}; read($in,$string,1) or die "Empty file or error! $!"; # compress signature, mode and maxbit and first char. print $out MAGIC,chr(BLOCK_MODE|MAX_BITS),$string; print STDERR "O > char : ".qquote($string)."\n" if $DEBUG; $self->{rbits}+=32; my $notfirst=0; while (<$in>) { $buf.=$_; print STDERR "READ:".qquote($buf)."\n" if $DEBUG; while ($buf) { my $char=substr($buf,0,1,''); my $append=$string.$char; $self->{rbits}+=8; if (exists($self->{strs}{$append})) { $string=$append; } else { if ($notfirst) { $self->output_code($self->{strs}{$string}) } else { $notfirst=1; } # add the code if (@$codes<CODE_EOF) { push @$codes,$append; $self->{strs}{$append}=$#$codes; printf STDERR "C>> %6d : %s\n",$#$codes,qquote($ap +pend) if $DEBUG; } elsif ($self->{wbits}/$self->{rbits}<.7) { $self->output_code(WIPE); $self->_init(); #$char=""; } $string=$char } } } $self->output_code($self->{strs}{$string} ) if length $string; $self->output_code( CODE_EOF ); warn "Read %d bits, wrote %d bits producing %%%5.2f compression\n" +, $self->{rbits},$self->{wbits},$self->{wbits}/$self->{rb +its}; return 1 } # output_code $fh $code # Takes a filehandle and a numeric code and outputs it with the # correct number of code_bits for the number of codes so far involved. # thus we use 9 bit codes until we hit 512 then we go to 10 bit codes # etc. # We cache the codes emitted until we have a multiple of 8 (as we outp +ut # bytes/chars and not code_bits), and at the same time we do a bit of +caching # so we call print and pack less often. sub output_code { my ($self,$code)=@_; Carp::confess 'undef!' unless defined $code; my $out=$self->{out}; # the condition here is against <= $#.. different from input_code while ($self->{threshold} <= $#{$self->{codes}} and $self->{code_bits}<MAX_BITS) { $self->{threshold}<<=1; $self->{code_bits}++; } unless ($code==CODE_EOF) { my $bits=sprintf "%0*b",$self->{code_bits},$code; $self->{bit_buf}.=$bits; $self->{wbits}+=$self->{code_bits}; printf STDERR "O > %-6d : %12s : %s \t| ", $code,$bits,qquote($self->{codes}[$code]) if $DEBUG; } my $len=length($self->{bit_buf}); if (($len>$self->{buf_max}) or (CODE_EOF == $code)) { my $chunk=substr($self->{bit_buf},0, ($len>$self->{buf_max} ? int($len / 8) * 8 : $len) +,"" ); my $pack=pack "B*",$chunk; printf STDERR "\nOC> %2d:%2d:%4d:%s\n %s\n", length($pack),$self->{code_bits},length($chunk), qquote($chunk),qquote($pack),"\n" if $DEBUG; print $out $pack; #$self->{bit_buf}=""; } } sub expand_fh { my $self=shift; my ($in,$out)=@_; $self->init() if $self->{in} or $self->{out}; $self->{in}=$in; $self->{out}=$out; my $head; my $bytes=read($in,$head,3); unless ($bytes == 3 and substr($head,0,2) eq MAGIC) { Carp::confess("Not a Perl LZW file"); } if (ord(substr($head,3,1)) & BIT_MASK > MAX_BITS) { Carp::confess("Can't decompress this file, it used too large " +. " a bitsize for me to handle.\n"); } my ($char,$old_code); read($in,$char,1) or die "Bad file!\n"; $old_code=ord($char); my $codes=$self->{codes}; print STDERR "First char: $char\n" if $DEBUG; print $out $char; $char=""; #reset char so the first symbol entered isnt the first c +har doubled. #old_code holds its ord value anyway. while ((my $new_code=$self->input_code()) != CODE_EOF) { if ($new_code==WIPE) { $self->_init(); $char=""; $old_code=WIPE; next; } print STDERR "Read: $new_code " .qquote($codes->[$new_code])." +\n" if $DEBUG; my $str; if ( defined $codes->[$new_code]) { $str=$codes->[$new_code]; print STDERR "Found $new_code: ",qquote($str),"\n" if $DEBUG>1; } else { # this handles the KwKwK case $str=$codes->[$old_code].$char; print STDERR "Initializing current string for $new_code (" +, qquote($str).") from ".qquote($codes->[$ol +d_code]), " and ",qquote($char)."\n" if $DEBUG; } $self->output_decoded($str); $char=substr($str,0,1); if (@$codes<CODE_EOF) { push @$codes,$codes->[$old_code].$char; printf STDERR "E<< Created: %6d : %s\n", $#$codes,qquote($codes->[$old_code].$ch +ar) if $DEBUG; } $old_code=$new_code; } } # input_code $in # Reads a code from the input filehandle. # This is a bit tricky. We are extracting variable numbers of code_bit +s from the stream each time. # As with the output we maintain a cache, but this time for the purpos +e of allowing us to easily # slice off the correct set of code_bits from the front of the stream. sub input_code { my $self=shift; my $in=$self->{in}; # make sure we are reading in the correct number of bits # the condition here is against <= @.. different from output_code while ($self->{threshold} <= @{$self->{codes}} and $self->{code_bits}<MAX_BITS) { $self->{threshold}<<=1; $self->{code_bits}++; } # do we need to fill the buffer up with more bits? if (length($self->{bit_buf})<$self->{code_bits}) { if (not(eof $in)) { local $/=\do{my $size=int ($self->{buf_max} / 8)}; $self->{bit_buf}.=unpack "B*",scalar <$in>; print STDERR "RC< ",qquote($self->{bit_buf})."\n" if $DEBUG; } else { if ($DEBUG && $self->{bit_buf}=~/1/) { # this actually shouldnt be an issue but we do it anyw +ay Carp::carp(length($self->{bit_buf})."<".$self->{code_b +its}. ": $self->{bit_buf} : Corrupt file?\n".Dumper($self)); } return CODE_EOF; } } # read the bits from the buffer my $code_bits=substr($self->{bit_buf},0,$self->{code_bits},""); # now turn it into a number my $code=unpack('n',pack "B*",(0 x (16-length $code_bits)).$code_b +its); print STDERR " < $code $self->{code_bits} '$code_bits'\n" if $DEBU +G; return $code; } sub output_decoded { my ($self,$str)=@_; my $fh=$self->{out}; print $fh $str; } unless (caller) { local $DEBUG=1; my $obj=__PACKAGE__->new(overwrite=>1); my $compressed_as=$obj->compress_file($0); my $expanded_as=$obj->expand_file($compressed_as,$compressed_as.". +out"); } 1; __END__ # From # decompress Read OLD_CODE output OLD_CODE CHARACTER = OLD_CODE WHILE there are still input characters DO Read NEW_CODE IF NEW_CODE is not in the translation table THEN STRING = get translation of OLD_CODE STRING = STRING+CHARACTER ELSE STRING = get translation of NEW_CODE END of IF output STRING CHARACTER = first character in STRING add OLD_CODE + CHARACTER to the translation table OLD_CODE = NEW_CODE END of WHILE # compress STRING = get input character WHILE there are still input characters DO CHARACTER = get input character IF STRING+CHARACTER is in the string table then STRING = STRING+character ELSE output the code for STRING add STRING+CHARACTER to the string table STRING = CHARACTER END of IF END of WHILE output the code for STRING

The part after the __END__ marker is the LZW algorithm in pseudo code from the Dr. Dobbs Journal article mentioned above.


<Elian> And I do take a kind of perverse pleasure in having an OO assembly language...

Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://270020]
[james28909]: id just rename the subrouties perl what kind of data they are getting. something easy to remember too. like if get_data were returning movies titles, rename get_data to movies or movie_titles or get_movie_titles.
[james28909]: s/perl/per/
[james28909]: but i dont really know what your end goal is. if it works dont fix it lol
[Lady_Aleena]: james28909, I use get_data in a subroutine called movies.

How do I use this? | Other CB clients
Other Users?
Others wandering the Monastery: (4)
As of 2017-05-24 04:30 GMT
Find Nodes?
    Voting Booth?