good chemistry is complicated,and a little bit messy -LW PerlMonks

### Re^5: Data compression by 50% + : is it possible?

by tybalt89 (Parson)
 on May 15, 2019 at 17:42 UTC Need Help??

I just modified my 54 bits/line solution to do base 50 math like you suggested to get 51 bits/line.

It does work --- but ---it is SLOW!!!

```#!/usr/bin/perl

# https://perlmonks.org/?node_id=1233613

use strict;
use warnings;
use bignum;

my @legal = grep !/11/ && tr/1// <= 3, glob '{0,1}' x 8;
my %code;
@code{@legal} = 0 .. \$#legal;
my %decode = reverse %code;
\$_ = [ split ' ', '23456789' & tr/01/ ?/r ] for values %decode;

sub tobits
{
my \$n = shift;
my \$bits = '';
\$bits = \$n % 2 . \$bits, \$n >>= 1 for 1 .. 51;
\$bits;
}

sub tonum
{
my \$n = 0;
\$n = 2 * \$n + \$_ for split //, shift;
\$n;
}

#print tobits(42), "\n";
#exit;

sub compress
{
my \$coded = '';
for ( shift =~ /(.*)\n/g )
{
my @lookup = (0) x 123;
@lookup[ unpack 'C*', \$_ ] = (1) x length;
my \$n = 0;
for( my \$group = 35; \$group < 123; \$group += 10 )
{
\$n = \$n * 50 + \$code{ join '', @lookup[\$group .. \$group + 7] };
}
#   print "\$n\n";
\$coded .= tobits(\$n);
}
return pack 'b*', \$coded;
}

sub decompress
{
my \$decoded = '';
for my \$line ( unpack('b*', shift) =~ /.{51}/g )
{
my \$n = tonum(\$line);
#   print "\$n\n";
my \$digit = 8;
for( my \$group = 33; \$group < 123; \$group += 10 )
{
\$decoded .= pack 'C*',
map \$group + \$_, @{ \$decode{ int \$n / 50 ** \$digit % 50 } };
\$digit--;
}
\$decoded .= "\n";
}
return \$decoded;
}

my \$input = '';
for (1 .. 80)
{
for (my \$x=0; \$x<90; \$x+=10)
{
my @c;
push(@c, int (rand(10)+\$x));
push(@c, int (rand(10)+\$x));
push(@c, int (rand(10)+\$x));
push(@c, int (rand(10)+\$x));
@c = sort{\$a<=>\$b}@c;
for (my \$i = 1; \$i < @c; \$i++)
{
\$input .= chr(33+\$c[\$i]) if \$c[\$i] != \$c[\$i-1] && \$c[\$i] != \$c[\$
+i-1]+1;
}
}
\$input .= "\n";
}

#use Data::Dump 'dd'; dd \$_ for \$input =~ /.*\n/g;
print "\n     input length ", length \$input, "\n";

my \$compressed = compress(\$input);
my \$compressedlength = length \$compressed;
print "compressed length \$compressedlength\n";
my \$restored = decompress(\$compressed);
if( \$input eq \$restored )
{
printf "\nMatched, compression ratio = %.1f%%\n",
100 * (1 - length(\$compressed) / length(\$restored));
}
else
{
print "----------------------------------------failed\n";
use Data::Dump 'dd'; dd \$_ for \$restored =~ /.*\n/g;
}

Replies are listed 'Best First'.
Re^6: Data compression by 50% + : is it possible?
by LanX (Archbishop) on May 15, 2019 at 21:23 UTC
Hi thanks. :)

I'm not surprised, I expected a trade off of speed and compression.

But look at the bright side,  BUK owes you (or Roboticus?) a dinner now. ;)

Honestly if I really needed it I'd rather try to precompute 9 look up tables.

But at this point I'd most probably just try Huffman coding with a single table and even higher compression.

Cheers Rolf
(addicted to the Perl Programming Language :)
Wikisyntax for the Monastery FootballPerl is like chess, only without the dice

> But at this point I'd most probably just try Huffman coding with a single table and even higher compression.

Or maybe not, I just tried to calculate the Huffman codes, and the resulting extra win is only at 5% more.

``` use strict;
use warnings;
use Data::Dump qw/pp dd/;

# my @freq = qw/5 7 10 15 20 45/;
# my \$sym = "1";
# my @tree = map { [ \$_ => \$sym++ ] } @freq;

my %freq = (
""    => 592,
"2"   => 74,
"24"  => 60,
"246" => 24,
"247" => 24,
"248" => 24,
"249" => 24,
"25"  => 84,
"257" => 24,
"258" => 24,
"259" => 24,
"26"  => 84,
"268" => 24,
"269" => 24,
"27"  => 84,
"279" => 24,
"28"  => 84,
"29"  => 60,
"3"   => 208,
"35"  => 144,
"357" => 48,
"358" => 48,
"359" => 48,
"36"  => 192,
"368" => 48,
"369" => 48,
"37"  => 192,
"379" => 48,
"38"  => 192,
"39"  => 144,
"4"   => 366,
"46"  => 228,
"468" => 72,
"469" => 72,
"47"  => 300,
"479" => 72,
"48"  => 300,
"49"  => 228,
"5"   => 524,
"57"  => 312,
"579" => 96,
"58"  => 408,
"59"  => 312,
"6"   => 682,
"68"  => 396,
"69"  => 396,
"7"   => 840,
"79"  => 336,
"8"   => 830,
"9"   => 508,
);

my @tree = map { [ \$freq{\$_} => \$_ ] } keys %freq;

#warn pp \@tree;
#exit;

while ( @tree > 1 ) {
@tree = sort { \$a->[0] <=> \$b->[0] } @tree;
my \$left  = shift @tree;
my \$right = shift @tree;
unshift @tree, [  (\$left->[0] + \$right->[0]) =>  [ \$left, \$right ]
+ ];
}

#warn pp \@tree;

my %huff;
create_code(\$tree[0],"");

#warn pp \%huff;

my \$avr = 0;
\$avr += \$freq{\$_} * length(\$huff{\$_}) / 10000
for keys %freq;

my \$classic = log(50)/log(2);

my \$old=0;
\$old += \$freq{\$_} * length(\$_) * 8 / 10000
for keys %freq;

\$old += 8/9 ;                              # \n

warn pp {
"old"          => \$old,
"line"         => \$old*9/8 . " bytes",
"huff average" => \$avr ,
"classic length" => \$classic ,
"huff win"     => \$avr/\$old*100     ." %",
"classic win"  => \$classic/\$old*100 ." %",
};

sub create_code {
my (\$node,\$code) = @_;
my \$sym  = \$node->[1];
if ( ref \$sym ) {
create_code( \$sym->[0], \$code."0" );
create_code( \$sym->[1], \$code."1" );
} else {
\$huff{\$sym} = \$code;
}
}

Bits needed per group:

```{
"classic length" => 5.64385618977472,
"classic win" => "42.155801598081 %",
"huff average" => 4.9824,
"huff win" => "37.215169703086 %",
"line" => "15.0616 bytes",
"old" => 13.3880888888889,
} at d:/exp/huffman.pl line 98.

##### Update

OTOH does the Huffman with ~5 bits/group need 1/6 less than you did with 6 bits/group (the 54 bit/line solution)

Depends on the POV I think! :)

FWIW here the Huffman code

```{
""    => "0110",
"2"   => 1001000,
"24"  => "0101000",
"246" => 101111110,
"247" => 101111011,
"248" => 101111010,
"249" => 110010100,
"25"  => 1001010,
"257" => 101111001,
"258" => 101111101,
"259" => 101111000,
"26"  => 1011100,
"268" => 101111100,
"269" => 101111111,
"27"  => 1001011,
"279" => 110010101,
"28"  => 1001001,
"29"  => 11101011,
"3"   => 111011,
"35"  => "010111",
"357" => 11001011,
"358" => 11001001,
"359" => 10111011,
"36"  => 110000,
"368" => 11101010,
"369" => 10111010,
"37"  => 110001,
"379" => 11001000,
"38"  => 110011,
"39"  => "010110",
"4"   => 10110,
"46"  => "00100",
"468" => "0101011",
"469" => "0101010",
"47"  => "01110",
"479" => "0101001",
"48"  => "01111",
"49"  => "00101",
"5"   => "0100",
"57"  => 10000,
"579" => 1110100,
"58"  => 11100,
"59"  => 10001,
"6"   => 1010,
"68"  => 11011,
"69"  => 11010,
"7"   => "000",
"79"  => 10011,
"8"   => 1111,
"9"   => "0011",
} at d:/exp/huffman.pl line 83.

Cheers Rolf
(addicted to the Perl Programming Language :)
Wikisyntax for the Monastery FootballPerl is like chess, only without the dice

Just because I wanted to see what the compress and decompress subs looked like :)

```#!/usr/bin/perl

# https://perlmonks.org/?node_id=1233613

use strict;
use warnings;
use re 'eval';

my %huffman = (         # from LanX https://perlmonks.org/?node_id=111
+00047
""    => "0110",
"2"   => 1001000,
"24"  => "0101000",
"246" => 101111110,
"247" => 101111011,
"248" => 101111010,
"249" => 110010100,
"25"  => 1001010,
"257" => 101111001,
"258" => 101111101,
"259" => 101111000,
"26"  => 1011100,
"268" => 101111100,
"269" => 101111111,
"27"  => 1001011,
"279" => 110010101,
"28"  => 1001001,
"29"  => 11101011,
"3"   => 111011,
"35"  => "010111",
"357" => 11001011,
"358" => 11001001,
"359" => 10111011,
"36"  => 110000,
"368" => 11101010,
"369" => 10111010,
"37"  => 110001,
"379" => 11001000,
"38"  => 110011,
"39"  => "010110",
"4"   => 10110,
"46"  => "00100",
"468" => "0101011",
"469" => "0101010",
"47"  => "01110",
"479" => "0101001",
"48"  => "01111",
"49"  => "00101",
"5"   => "0100",
"57"  => 10000,
"579" => 1110100,
"58"  => 11100,
"59"  => 10001,
"6"   => 1010,
"68"  => 11011,
"69"  => 11010,
"7"   => "000",
"79"  => 10011,
"8"   => 1111,
"9"   => "0011",
);

my %tohuff = map { \$_, \$huffman{ ('23456789' & tr/01/ ?/r) =~ tr/ //dr
+ } }
grep !/11/ && tr/1// <= 3, glob '{0,1}' x 8;
my @allcodes = map "\$huffman{\$_}(?{\$_})", keys %huffman;
my \$fromhuff = do { local \$" = '|'; qr/(?:@allcodes)/ };

sub compress
{
my \$coded = '';
for ( shift =~ /(.*)\n/g )
{
my @lookup = (0) x 123;
@lookup[ unpack 'C*', \$_ ] = (1) x length;
for( my \$group = 35; \$group < 123; \$group += 10 )
{
\$coded .= \$tohuff{ join '', @lookup[\$group .. \$group + 7] };
}
}
return pack 'b*', \$coded;
}

sub decompress
{
my \$decoded = '';
local \$_ = unpack 'b*', shift;
while( /\$fromhuff/g )
{
\$decoded .= pack 'C*', map \$decade + \$_, split //, (\$^R // '');
(\$decade += 10) >= 123 and \$decade = 33, \$decoded .= "\n";
}
return \$decoded =~ s/^.+\z//mr;
}

my \$input = '';
for (1 .. 100)
{
for (my \$x=0; \$x<90; \$x+=10)
{
my @c;
push(@c, int (rand(10)+\$x));
push(@c, int (rand(10)+\$x));
push(@c, int (rand(10)+\$x));
push(@c, int (rand(10)+\$x));
@c = sort{\$a<=>\$b}@c;
for (my \$i = 1; \$i < @c; \$i++)
{
\$input .= chr(33+\$c[\$i]) if \$c[\$i] != \$c[\$i-1] && \$c[\$i] != \$c[\$
+i-1]+1;
}
}
\$input .= "\n";
}

#use Data::Dump 'dd'; dd \$_ for \$input =~ /.*\n?/g;
print "\n     input length ", length \$input, "\n";

my \$compressed = compress(\$input);
my \$compressedlength = length \$compressed;
print "compressed length \$compressedlength\n";
my \$restored = decompress(\$compressed);
if( \$input eq \$restored )
{
printf "\nMatched, compression ratio = %.1f%%\n",
100 * (1 - length(\$compressed) / length(\$restored));
}
else
{
print "----------------------------------------failed\n";
use Data::Dump 'dd'; dd \$_ for \$restored =~ /.*\n?/g;
}

Typical output:

```     input length 1512
compressed length 557

Matched, compression ratio = 63.2%

Create A New User
Node Status?
node history
Node Type: note [id://11100032]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others browsing the Monastery: (5)
As of 2019-06-26 22:12 GMT
Sections?
Information?
Find Nodes?
Leftovers?
Voting Booth?
Is there a future for codeless software?

Results (111 votes). Check out past polls.

Notices?