<?xml version="1.0" encoding="windows-1252"?>
<node id="270020" title="LZW Demystified (the code)" created="2003-06-29 16:44:59" updated="2005-08-14 09:24:11">
<type id="11">
note</type>
<author id="108447">
demerphq</author>
<data>
<field name="doctext">
&lt;!--
&lt;p&gt;&lt;em&gt;&lt;/em&gt;&lt;/p&gt;
&lt;code&gt;&lt;/code&gt;
&lt;i&gt;&lt;/i&gt;
&lt;b&gt;&lt;/b&gt;
&amp;#91; &amp;#93; 
--&gt;
&lt;p&gt;
And heres the code. It may be a bit scruffy right now. Theres lots of $DEBUG stuff going on that I normally would remove, except that its no fun to play with if you cant see what happening under the hood! $DEBUG=1 shows lots of diagnostics. $DEBUG&gt;1 shows even more. If run directly as a script it will compress and expand itself under $DEBUG circumstances. Otherwise it should be usable now. I will add POD later, but for now I'll leave it be.
&lt;/p&gt;
&lt;p&gt;
Note that once the output code bug is fixed changing the magic number output should make it &lt;Code&gt;compress&lt;/code&gt; compatible. Currently it is not however.
&lt;readmore&gt;
&lt;code&gt;
package LZW;
use strict;
use warnings;
use Data::Dumper;
use constant MAGIC_1=&gt;0x2f;    # magic number 1 (Set to 0x1f when compress compatible)
use constant MAGIC_2=&gt;0x9d;    # compress magic number 2
use constant MAGIC=&gt;pack 'C*',MAGIC_1,MAGIC_2;

use constant BIT_MASK=&gt;0x1f;   # bit_mask for third byte to get
                               # the max bits in the file.
use constant BLOCK_MODE=&gt;0x80; # bit_mask for whether we are
                               # compress 2.0 compat (we arent)
use constant WIPE=&gt;256;        # code to cause a string table dump
use constant INIT_BITS=&gt;9;     # number of bits for the first emitted code
use constant MAX_BITS=&gt;12;     # maximum bits to represent. this means
                               # we can have 2**MAX_BITS codes and corresponding
                               # hash table entries.
use constant CODE_EOF=&gt;2**MAX_BITS+1; # we use this to signal EOF.

$|++;

our $DEBUG=0;

# used for debugging
sub qquote {
    my $val=shift;
    return 'undef' unless defined $val;
    return qq("") unless length $val;
    my @parts=$val=~m/\G(.{1,80})/sg;
    #warn scalar @parts,"\n";
    return (@parts&gt;1?"\n":"").join(".\n",
               map { Data::Dumper::qquote($_) }
               @parts);
}


BEGIN {
    $Data::Dumper::Useqq=1;
    my @chars=map { chr ($_) } 0..255;

    sub _init {
        my $self=shift;
        print STDERR "_init()\n" if $DEBUG;
        my %default=(
             # index is the codes id, value is the string
            codes      =&gt; [ @chars,'' ],
            # lookup from string to id
            strs       =&gt; {
                                        map { $_ =&gt; ord($_) }
                                        @chars[0..255]
                                      },
            # number of bits to output per code
            code_bits  =&gt; INIT_BITS,
            # (2**code_bits) When the number of codes equals
                        # this we add a bit to the code length
            threshold  =&gt; (2**INIT_BITS),
        );

        $self-&gt;{$_}=$default{$_}
            foreach keys %default;
        return $self;
    }

    sub init {
        my $self=shift;
        %$self=(
            # (2**n-1) Size of buffer holding chars representing bits.
            buf_max    =&gt; 1023,
            # user supplied attributes go above here
            %$self,             
            # per object initializers go below here
            # string representing the bitstream of the compressed file
            bit_buf    =&gt; "",      
            in         =&gt; undef,   # in filehandle
            out        =&gt; undef,   # out filehandle
            rbits      =&gt;0,        # bits read
            wbits      =&gt;0,        # bits written
        );
        $self-&gt;_init();
    }
}

sub new {
    my $class=shift;
    my $self=bless {@_},$class;
    return $self-&gt;init()
}


sub _open_files {
    my ($self,$infile,$outfile)=@_;
    if (-e $outfile) {
        if ($self-&gt;{overwrite}) {
            warn "Overwriting '$outfile'\n";
        } else {
            Carp::croak("Won't overwrite '$outfile' unless".
                        " overwrite option set\n");
        }
    }
    open my $in,"&lt;",$infile or die "Reading '$infile' : $!";
    open my $out,"&gt;",$outfile or die "Writing '$outfile' : $!";
    binmode $in;
    binmode $out;
    return ($in,$out);
}

sub expand_file {
    my ($self,$infile,$outfile)=@_;
    $infile or die "Must have a file to expand!";
    unless ($outfile) {
        $outfile=$infile;
        $outfile=~s/\.plzw\z// or $outfile.=".out";
    }
    warn "Expanding '$infile' to '$outfile'\n";
    $self-&gt;expand_fh($self-&gt;_open_files($infile,$outfile));
    return $outfile;
}

sub compress_file {
    my ($self,$infile,$outfile)=@_;
    $outfile||= "$infile.plzw";
    warn "Compressing '$infile' to '$outfile'\n";
    $self-&gt;compress_fh($self-&gt;_open_files($infile,$outfile));
    return $outfile;
}

sub compress_fh {
    my ($self,$in,$out)=@_;
    $self-&gt;init() if $self-&gt;{in} or $self-&gt;{out};
    @{$self}{qw(in out)}=($in,$out);

    my $string;
    my $buf="";
    my $codes=$self-&gt;{codes};

    read($in,$string,1) or die "Empty file or error! $!";

    # compress signature, mode and maxbit and first char.
    print $out MAGIC,chr(BLOCK_MODE|MAX_BITS),$string;


    print STDERR "O &gt; char : ".qquote($string)."\n"
        if $DEBUG;

    $self-&gt;{rbits}+=32;
    my $notfirst=0;

    while (&lt;$in&gt;) {
        $buf.=$_;
        print STDERR "READ:".qquote($buf)."\n" if $DEBUG;

        while ($buf) {
            my $char=substr($buf,0,1,'');
            my $append=$string.$char;
            $self-&gt;{rbits}+=8;
            if (exists($self-&gt;{strs}{$append})) {
                $string=$append;
            } else {
                if ($notfirst) {
                    $self-&gt;output_code($self-&gt;{strs}{$string})
                } else {
                    $notfirst=1;
                }
                # add the code
                if (@$codes&lt;CODE_EOF) {

                    push @$codes,$append;
                    $self-&gt;{strs}{$append}=$#$codes;

                    printf STDERR "C&gt;&gt; %6d : %s\n",$#$codes,qquote($append) 
                           if $DEBUG;

                } elsif ($self-&gt;{wbits}/$self-&gt;{rbits}&lt;.7) {
                    $self-&gt;output_code(WIPE);
                    $self-&gt;_init();
                    #$char="";
                }
                $string=$char
            }
        }
    }
    $self-&gt;output_code($self-&gt;{strs}{$string} )
        if length $string;
    $self-&gt;output_code( CODE_EOF );
    warn "Read %d bits, wrote %d bits producing %%%5.2f compression\n",
               $self-&gt;{rbits},$self-&gt;{wbits},$self-&gt;{wbits}/$self-&gt;{rbits};
    return 1
}


# output_code $fh $code
# Takes a filehandle and a numeric code and outputs it with the
# correct number of code_bits for the number of codes so far involved.
# thus we use 9 bit codes until we hit 512 then we go to 10 bit codes
# etc.
# We cache the codes emitted until we have a multiple of 8 (as we output
# bytes/chars and not code_bits), and at the same time we do a bit of caching
# so we call print and pack less often.

sub output_code {
    my ($self,$code)=@_;
    Carp::confess 'undef!' unless defined $code;

    my $out=$self-&gt;{out};

    # the condition here is against &lt;= $#.. different from input_code
    while ($self-&gt;{threshold} &lt;= $#{$self-&gt;{codes}}
               and $self-&gt;{code_bits}&lt;MAX_BITS)
        {
        $self-&gt;{threshold}&lt;&lt;=1;
        $self-&gt;{code_bits}++;
    }



    unless ($code==CODE_EOF) {
                my $bits=sprintf "%0*b",$self-&gt;{code_bits},$code;
        $self-&gt;{bit_buf}.=$bits;
        $self-&gt;{wbits}+=$self-&gt;{code_bits};
        printf STDERR "O &gt; %-6d : %12s : %s \t| ",
                       $code,$bits,qquote($self-&gt;{codes}[$code])
                   if $DEBUG;
    }
    my $len=length($self-&gt;{bit_buf});

    if (($len&gt;$self-&gt;{buf_max}) or (CODE_EOF == $code))
    {
        my $chunk=substr($self-&gt;{bit_buf},0,
                    ($len&gt;$self-&gt;{buf_max} ? int($len / 8) * 8 : $len),""
                );
        my $pack=pack "B*",$chunk;
        printf STDERR "\nOC&gt; %2d:%2d:%4d:%s\n     %s\n",
                    length($pack),$self-&gt;{code_bits},length($chunk),
                    qquote($chunk),qquote($pack),"\n"
                    if $DEBUG;

        print $out $pack;
        #$self-&gt;{bit_buf}="";
    }
}

sub expand_fh {
    my $self=shift;
    my ($in,$out)=@_;

    $self-&gt;init() if $self-&gt;{in} or $self-&gt;{out};
    $self-&gt;{in}=$in;
    $self-&gt;{out}=$out;

    my $head;
    my $bytes=read($in,$head,3);
    unless ($bytes == 3 and substr($head,0,2) eq MAGIC) {
        Carp::confess("Not a Perl LZW file");
    }
    if (ord(substr($head,3,1)) &amp; BIT_MASK &gt; MAX_BITS) {
        Carp::confess("Can't decompress this file, it used too large ".
                " a bitsize for me to handle.\n");
    }

    my ($char,$old_code);
    read($in,$char,1) or die "Bad file!\n";
    $old_code=ord($char);
    my $codes=$self-&gt;{codes};

    print STDERR "First char: $char\n"
        if $DEBUG;

    print $out $char;
    $char=""; #reset char so the first symbol entered isnt the first char doubled.
              #old_code holds its ord value anyway.

    while ((my $new_code=$self-&gt;input_code()) != CODE_EOF) {

         if ($new_code==WIPE) {
            $self-&gt;_init();
            $char="";
            $old_code=WIPE;
            next;
        }
        print STDERR "Read: $new_code " .qquote($codes-&gt;[$new_code])."\n" if $DEBUG;
        my $str;
        if ( defined $codes-&gt;[$new_code]) {

            $str=$codes-&gt;[$new_code];

            print STDERR "Found $new_code: ",qquote($str),"\n"
                            if $DEBUG&gt;1;

        } else {
            # this handles the KwKwK case

            $str=$codes-&gt;[$old_code].$char;

            print STDERR "Initializing current string for $new_code (",
                            qquote($str).") from ".qquote($codes-&gt;[$old_code]),
                            " and ",qquote($char)."\n"
                            if $DEBUG;
        }

        $self-&gt;output_decoded($str);
        $char=substr($str,0,1);
        if (@$codes&lt;CODE_EOF) {
            push @$codes,$codes-&gt;[$old_code].$char;

            printf STDERR "E&lt;&lt; Created: %6d : %s\n",
                               $#$codes,qquote($codes-&gt;[$old_code].$char)
                               if $DEBUG;

        }
        $old_code=$new_code;
    }
}


# input_code $in
# Reads a code from the input filehandle.
# This is a bit tricky. We are extracting variable numbers of code_bits from the stream each time.
# As with the output we maintain a cache, but this time for the purpose of allowing us to easily
# slice off the correct set of code_bits from the front of the stream.


sub input_code {
    my $self=shift;
    my $in=$self-&gt;{in};

    # make sure we are reading in the correct number of bits
    # the condition here is against &lt;= @.. different from output_code
    while ($self-&gt;{threshold} &lt;= @{$self-&gt;{codes}}
               and $self-&gt;{code_bits}&lt;MAX_BITS) {
        $self-&gt;{threshold}&lt;&lt;=1;
        $self-&gt;{code_bits}++;
    }
    # do we need to fill the buffer up with more bits?
    if (length($self-&gt;{bit_buf})&lt;$self-&gt;{code_bits}) {
        if (not(eof $in)) {
            local $/=\do{my $size=int ($self-&gt;{buf_max} / 8)};
            $self-&gt;{bit_buf}.=unpack "B*",scalar &lt;$in&gt;;
                    print STDERR "RC&lt; ",qquote($self-&gt;{bit_buf})."\n"
                        if $DEBUG;
        } else {
            if ($DEBUG  &amp;&amp; $self-&gt;{bit_buf}=~/1/) {
                # this actually shouldnt be an issue but we do it anyway
                Carp::carp(length($self-&gt;{bit_buf})."&lt;".$self-&gt;{code_bits}.
                ": $self-&gt;{bit_buf} : Corrupt file?\n".Dumper($self));
            }
            return CODE_EOF;
        }
    }

    # read the bits from the buffer
    my $code_bits=substr($self-&gt;{bit_buf},0,$self-&gt;{code_bits},"");
    # now turn it into a number
    my $code=unpack('n',pack "B*",(0 x (16-length $code_bits)).$code_bits);

    print STDERR " &lt; $code $self-&gt;{code_bits} '$code_bits'\n" if $DEBUG;

    return $code;
}


sub output_decoded {
    my ($self,$str)=@_;
    my $fh=$self-&gt;{out};
    print $fh $str;
}

unless (caller) {
    local $DEBUG=1;
    my $obj=__PACKAGE__-&gt;new(overwrite=&gt;1);
    my $compressed_as=$obj-&gt;compress_file($0);
    my $expanded_as=$obj-&gt;expand_file($compressed_as,$compressed_as.".out");
}

1;
__END__
# From http://dogma.net/markn/articles/lzw/lzw.htm
# decompress
Read OLD_CODE
output OLD_CODE
CHARACTER = OLD_CODE
WHILE there are still input characters DO
    Read NEW_CODE
    IF NEW_CODE is not in the translation table THEN
        STRING = get translation of OLD_CODE
        STRING = STRING+CHARACTER
    ELSE
        STRING = get translation of NEW_CODE
    END of IF
    output STRING
    CHARACTER = first character in STRING
    add OLD_CODE + CHARACTER to the translation table
    OLD_CODE = NEW_CODE
END of WHILE

# compress
STRING = get input character
WHILE there are still input characters DO
    CHARACTER = get input character
    IF STRING+CHARACTER is in the string table then
        STRING = STRING+character
    ELSE
        output the code for STRING
        add STRING+CHARACTER to the string table
        STRING = CHARACTER
    END of IF
END of WHILE
output the code for STRING
&lt;/code&gt;
&lt;/readmore&gt;
&lt;p&gt;
The part after the __END__ marker is the LZW algorithm in pseudo code from the  [http://dogma.net/markn/articles/lzw/lzw.htm|Dr. Dobbs Journal] article mentioned above.
&lt;/p&gt;
&lt;br /&gt;
---
&lt;br /&gt; 
demerphq&lt;br /&gt;
&lt;br /&gt;
&lt;sub&gt;&amp;lt;[Elian]&amp;gt; And I do take a kind of perverse pleasure in having an OO assembly language... 
&lt;!--
&lt;hr /&gt;
&lt;p&gt;
&lt;strong&gt;&amp;bull; Update:&amp;nbsp;&amp;nbsp;&lt;/strong&gt;&lt;br /&gt;

&lt;/p&gt;
--&gt;
&lt;/sub&gt;
&lt;br /&gt;
</field>
<field name="root_node">
270016</field>
<field name="parent_node">
270016</field>
</data>
</node>
