package Huffman_GMCH ; use strict ; use warnings ; #========================================================================================= # huffman -- take array of symbol counts and return an encoding # # The array of symbol counts is assumed to be indexed by symbol ordinal. Any symbol # with a zero count will have no representation in the resulting code. # # The encoding is represented as an array of arrays: [ \@lengths, \@counts, \@symbols ] # # where: @lengths is a list of symbol lengths used, in ascending order # # @counts is a list of the number of symbols encoded at the corresponding length # # @symbols is a list of the symbol values, sorted first by encoded length, # and second by symbol ordinal. # # It's not strictly necessary to sort by symbol ordinal, but this looks tidy, and (more # importantly) it can be used to advantage when encoding the encoding. # # Requires: $r_counts -- ref array of symbol counts # # Returns: $r_code -- [ \@lengths, \@counts, \@symbols ] -- see above sub huffman { my ($r_counts) = @_ ; # Fill the @tree list with leaf nodes, ignoring symbols with zero counts my @tree = () ; for (my $s = 0 ; $s <= $#{$r_counts} ; $s++) { if ($r_counts->[$s]) { push @tree, [$r_counts->[$s], $s] } ; } ; # Build the tree but successively combining the two lowest weight nodes. # # We sort the node list each time around the loop. This is a poor man's priority # queue. With luck the sort is fast where the list is mostly ordered. # # NB: this sorts into ascending order of weigth (hence $b->[0] <=> $a->[0]). # # NB: where weights are equal, we treat an internal node as slightly heavier, which # has the effect of minimising tree height. The extension to the comparison will # leave two internal nodes in the current order. Two leaf nodes will be sorted # so that the *smaller* ordinal has the *greater* weight (hence the reversed # order in the final '<=>'). while ($#tree) { @tree = sort { ($b->[0] <=> $a->[0]) || ( ref($b->[1]) ? +1 : ref($a->[1]) ? -1 : $a->[1] <=> $b->[1] ) } @tree ; my $r = pop @tree ; my $l = pop @tree ; push @tree, [$l->[0] + $r->[0], $l, $r] ; } ; # Establish the encoded length of the symbols my @symbol_lengths = () ; _huff_walk(pop @tree, 0, \@symbol_lengths) ; # Now generate and return the encoding my @code = () ; for my $s (0..$#symbol_lengths) { if (my $l = $symbol_lengths[$s]) { push @{$code[$l]}, $s ; } ; } ; my @lengths = () ; my @counts = () ; my @symbols = () ; for my $l (1..$#code) { if (my $ss = $code[$l]) { push @lengths, $l ; push @counts, scalar(@{$ss}) ; push @symbols, @{$ss} ; } ; } ; return [\@lengths, \@counts, \@symbols] ; } ; sub _huff_walk { my ($node, $length, $r_lengths) = @_ ; my ($p, $l, $r) = @$node ; if (ref($l)) { $length++ ; _huff_walk($l, $length, $r_lengths) ; _huff_walk($r, $length, $r_lengths) ; } else { $r_lengths->[$l] = $length ; } ; return ; } ; #========================================================================================= # huffman_encode_map -- construct encode map from given encoding # # The encode map is an array indexed by symbol ordinal giving its Huffman encoding. # # Symbols with no encoding will map to undef (of course). # # The huffman encoding is represented as an integer, with the LS bit being the first bit # to transmit. Above the MS bit is a "guard bit". This particular representation allows # an output loop of the form: # # while ($v != 1) { vec($m, $p++, 1) = $v & 1 ; $v =>> 1 ; } ; # # Requires: $code -- [ \@lengths, \@counts, \@symbols ] -- as returned by huffman() # # Returns: $r_emap -- ref:Array as described above sub huffman_encode_map { my ($code) = @_ ; my ($r_lengths, $r_counts, $r_symbols) = @{$code} ; my @emap = () ; my $hs = 0 ; # Huffman symbol value my $hl = 0 ; # Current huffman symbol length my $hc = 0 ; # Count of symbols remaining at current length my $li = 0 ; # Index of next symbol length to use foreach my $s (@{$r_symbols}) { if ($hc == 0) { # If exhausted current symbol length, advance to next my $l = $hl ; $hl = $r_lengths->[$li] ; $hc = $r_counts->[$li] ; $hs <<= ($hl - $l) ; # Increased symbol length $li++ ; # Index of next length to use } ; my $e = 1 ; # Start map entry with "guard bit" my $v = $hs ; # Copy of of huffman symbol, to be processed LS bit first for (1..$hl) { $e <<= 1 ; $e += $v & 1 ; $v >>= 1 ; } ; $emap[$s] = $e ; # Set map entry $hs++ ; # Next huffman symbol $hc-- ; # Count down symbols ar current length } ; return \@emap ; } ; #========================================================================================= # huffman_enc -- encode given string into given bit-vector according to given encoding map # # Bit-Vector is updated, in place. # # Requires: $r_emap -- encoding map as generated by huffman_encode_map() # $vec -- vector to encode to *** updated in place *** # $p -- next bit of vector # $string -- string to encode # # Returns: new next bit of vector sub huffman_enc { my ($r_emap, undef, $p, $string) = @_ ; my $v ; foreach my $ch (split //, $string) { if (defined($v = $r_emap->[ord($ch)])) { while ($v != 1) { vec($_[1], $p++, 1) = $v & 1 ; $v >>= 1 ; } ; } else { die "No encoding for character '$ch'" ; } ; } ; return $p ; } ; #========================================================================================= # huffman_dec -- decode one symbol from given bit-vector, using given encoding # # The bit pointer is updated, in place. # # Requires: $code -- huffman encoding as returned huffman() # $vec -- vector to decode # $p -- next bit of vector *** updated in place *** # # Returns: next symbol ordinal sub huffman_dec { my ($code, $vec, $p) = @_ ; my ($r_lengths, $r_counts, $r_symbols) = @{$code} ; my $hs = 0 ; # Huffman symbol my $hl = 0 ; # Current huffman symbol length my $hc = 0 ; # Count of symbols at current length my $li = 0 ; # Next length index my $v = 0 ; # What we've read so far do { $hs += $hc ; # Step to first symbol at next length my $l = $hl ; $hl = $r_lengths->[$li] ; # Next length $v -= $hc ; # Strip prefix for current length while ($l < $hl) { # Fetch bits to next length $v <<= 1 ; $v += vec($vec, $p++, 1) ; $l++ ; } ; $hc = $r_counts->[$li] ; # Count of symbols at new length $li++ ; # Index of next length to use } until ($v < $hc) ; $_[2] = $p ; # Update pointer -- in place return $r_symbols->[$hs+$v] ; } ; 1 ;