Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked
 
PerlMonks  

B::Deobfuscate - Deobfuscates symbol names

by diotalevi (Canon)
on Mar 20, 2003 at 15:23 UTC ( #244604=sourcecode: print w/ replies, xml ) Need Help??

Category: Miscellaneous
Author/Contact Info Joshua Jore jjore@cpan.org
Description:

B::Deobfuscate is a backend module for the Perl compiler that generates perl source code, based on the internal compiled structure that perl itself creates after parsing a program. It adds symbol renaming functions to the B::Deparse module. An obfuscated program is already parsed and interpreted correctly by the B::Deparse program. Unfortunately, if the obfuscation involved variable renaming then the resulting program also has obfuscated symbols.

This module takes the last step and fixes names like $z5223ed336 to be a word from a dictionary. While the name still isn't meaningful it is at least easier to distinguish and read.

package B::Deobfuscate;
use strict;
use warnings;
use vars '$VERSION';
use base 'B::Deparse';
use B ();
use B::Keywords ();

# Some functions may require() YAML

$VERSION = '0.03';

sub load_keywords {
    my $self = shift;
    my $p = $self->{+__PACKAGE__};

    return $p->{'keywords'} = {
               map { $_, undef }
                   @B::Keywords::Barewords,
                   # Snip the sigils.
                   map(substr($_,1), @B::Keywords::Symbols) };
}

sub load_unknown_dict {
    my $self = shift;
    my $p = $self->{+__PACKAGE__};
    my $dict_file = $p->{'unknown_dict_file'};
    length $dict_file or return;
    my $dict_data;

    # slurp the entire dictionary at once
    open DICT, '<', $dict_file
        or die "Cannot open dictionary $dict_file: $!";
    read DICT, $dict_data, -s DICT;
    close DICT or die "Cannot close $dict_file: $!";

    my $k = $self->load_keywords;

    $p->{'unknown_dict_data'} =
        [ sort { length $a <=> length $b or $a cmp $b }
          grep { ! /\W/ and ! exists $k->{$_} }
          split /\n/, $dict_data ];
}


sub next_short_dict_symbol {
    my $self = shift;
    my $p = $self->{+__PACKAGE__};

    my $sym = shift @{ $p->{'unknown_dict_data'} };
    push @{ $p->{'used_symbols'} }, $sym.
    return $sym;
}

sub next_long_dict_symbol {
    my $self = shift;
    my $p = $self->{+__PACKAGE__};

    my $sym = pop @{ $p->{'unknown_dict_data'} };
    push @{ $p->{'used_symbols'} }, $sym;
    return $sym;
}

sub load_user_config {
    my $self = shift;
    my $p = $self->{+__PACKAGE__};
    my $config_file = $p->{'user_config'};
    defined $config_file and length $config_file or return;

    -f $config_file or die "Configuration file $config_file doesn't ex
+ist";

    require YAML;
    my $config = (YAML::LoadFile( $config_file ))[0];
    $p->{'globals_to_ignore'} = $config->{'globals_to_ignore'};
    $p->{'pad_symbols'} = $config->{'lexicals'};
    $p->{'gv_symbols'} = $config->{'globals'};
    defined $config->{'dictionary'} and
        $p->{'unknown_dict_file'} = $config->{'dictionary'};
    if (defined $config->{'global_regex'}) {
        my $r = $config->{'global_regex'};
        $p->{'global_regex'} = qr/$r/;
    }

    # Symbols that are listed with an undef value actually
    # just aren't renamed at all.
    for my $symt_nym (qw/pad gv/) {
        my $symt = $p->{"${symt_nym}_symbols"};
        for my $symt_key (keys %$symt) {
            not defined $symt->{$symt_key} and
                $symt->{$symt_key} = $symt_key;
        }
    }
}

sub gv_should_be_renamed {
    my $self = shift;
    my $name = shift;
    my $p = $self->{+__PACKAGE__};
    my $k = $p->{'keywords'};

    # Ignore keywords
    return if exists $k->{$name} or
              $name =~ m{\A[[:digit:]]\z};

    if (exists $p->{'gv_symbols'}{$name} or
        $name =~ $p->{'gv_match'} ) {
        return 1;
    }
    return;
}

sub rename_pad {
    my $self = shift;
    my $p = $self->{+__PACKAGE__};
    my $name = shift;

    $name =~ m{\A(\W+)} or die "Invalid pad variable name $name";
    my $sigil = $1;

    my $dict = $p->{'pad_symbols'};
    return $dict->{$name} if exists $dict->{$name};

    $dict->{$name} = $name;
    return $dict->{$name} = lc $sigil . $self->next_short_dict_symbol;
}

sub rename_gv {
    my $self = shift;
    my $name = shift;
    my $p = $self->{+__PACKAGE__};

    return $name unless $self->gv_should_be_renamed( $name );

    my $dict = $p->{'gv_symbols'};
    return $dict->{$name} if exists $dict->{$name};
    return $dict->{$name} = ucfirst $self->next_long_dict_symbol;
}

sub new {
    my $class = shift;
    my $self = $class->SUPER::new( @_ );
    my $p = $self->{+__PACKAGE__} = {};
    $p->{'unknown_dict_file'} = '/usr/share/dict/stop';
    $p->{'unknown_dict_data'} = undef;
    $p->{'user_config'} = undef;
    $p->{'gv_match'} = qw/\A[[:lower:][:digit:]_]+\z/;
    $p->{'pad_symbols'} = {};
    $p->{'gv_symbols'} = {};
    $p->{'output_yaml'} = 0;

    while (my $arg = shift @_) {
        if ($arg =~ m{\A-d([^,]+)}) {
            $p->{'unknown_dict_file'} = $1;
        } elsif ($arg =~ m{\A-c([^,]+)} ) {
            $p->{'user_config'} = $1;
        } elsif ($arg =~ m{\A-m/([^/]+)/} ) {
            $p->{'gv_match'} = $1;
        } elsif ($arg =~ m{\A-y}) {
            $p->{'output_yaml'} = 1;
        }
    }

    $self->load_user_config;
    $self->load_unknown_dict;

    return $self;
}

sub compile {
    my(@args) = @_;
    return sub {
        my $source = '';
        my $self = B::Deobfuscate->new(@args);
        $self->stash_subs("main");
        $self->{'curcv'} = B::main_cv;
        $self->walk_sub(B::main_cv, B::main_start);
        $source .= join '', $self->print_protos;
        @{$self->{'subs_todo'}} =
          sort {$a->[0] <=> $b->[0]} @{$self->{'subs_todo'}};
        $source .= join '', $self->indent($self->deparse(B::main_root,
+ 0)), "\n"
          unless B::Deparse::null B::main_root ;
        my @text;
        while (scalar(@{$self->{'subs_todo'}})) {
            push @text, $self->next_todo;
        }
        $source .= join '', $self->indent(join("", @text)), "\n" if @t
+ext;

        my $p = $self->{+__PACKAGE__};
        my %dump = ( lexicals => $p->{'pad_symbols'},
                     globals => $p->{'gv_symbols'},
                     dictionary => $p->{'unknown_dict_file'},
                     global_regex => $p->{'gv_match'} );

        if ($p->{'output_yaml'}) {
            require YAML;
            print YAML::Dump(\%dump, $source);
        }
        else {
            print $source;
        }
    }
}

sub padname {
    my $self = shift;
    my $padname = $self->SUPER::padname( @_ );

    return $self->rename_pad( $padname );
}

sub gv_name {
    my $self = shift;
    my $gv_name = $self->SUPER::gv_name( @_ );

    return $self->rename_gv( $gv_name );
}

1;

__END__

=head1 NAME

B::Deobfuscate - Extension to B::Deparse for use in de-obfuscating sou
+rce code

=head1 SYNOPSIS

  perl -MO=Deobfuscate,-csynthetic.yml,-y synthetic.pl

=head1 DESCRIPTION

B::Deobfuscate is a backend module for the Perl compiler that generate
+s perl
source code, based on the internal compiled structure that perl itself
creates after parsing a program. It adds symbol renaming functions to 
+the
B::Deparse module. An obfuscated program is already parsed and interpr
+eted
correctly by the B::Deparse program. Unfortunately, if the obfuscation
involved variable renaming then the resulting program also has obfusca
+ted
symbols.

This module takes the last step and fixes names like $z5223ed336 to be
+ a word
from a dictionary. While the name still isn't meaningful it is at leas
+t easier
to distinguish and read. Here are two examples - one from B::Deparse a
+nd one
from B::Deobfuscate.

After B::Deparse:

  if (@z6a703c020a) {
      (my($z5a5fa8125d, $zcc158ad3e0) = File::Temp::tempfile('UNLINK',
+ 1));
      print($z5a5fa8125d "=over 8\n\n");
      (print($z5a5fa8125d @z6a703c020a) or die((((q[Can't print ] . $z
+cc158ad3e0) . ': ') . $!)));
      print($z5a5fa8125d "=back\n");
      (close(*$z5a5fa8125d) or die((((q[Can't close ] . *$za5fa8125d) 
+. ': ' . $!)));
      (@z8374cc586e = $zcc158ad3e0);
      ($z9e5935eea4 = 1);
  }

After B::Deobfuscate:

  if (@parenthesises) {
      (my($scrupulousity, $postprocesser) = File::Temp::tempfile('UNLI
+NK', 1));
      print($scrupulousity "=over 8\n\n");
      (print($scrupulousity @parenthesises) or die((((q[Can't print ] 
+. $postprocesser) . ': ') . $!)));
      print($scrupulousity "=back\n");
      (close(*$scrupulousity) or die((((q[Can't close ] . *$postproces
+ser) . ': ') . $!)));
      (@interruptable = $postprocesser);
      ($propagandaist = 1);
  }

You'll note that the only real difference is that instead of variable 
+names
like $z9e5935eea4 you get $propagandist.

Please note that this module is mainly new and untested code and is
still under development, so it may change in the future.

=head1 OPTIONS

As with all compiler backend options, these must follow directly after
the '-MO=Deobfuscate', separated by a comma but not any white space.
All options defined in B::Deparse are supported here - see the B::Depa
+rse
documentation page to see what options are provided and how to use the
+m.

=over 4

=item B<-d>I<DICTIONARY>

Normally B::Deobfuscate reads the dictionary file at /usr/share/dict/s
+top. If
you would like to specify a different dictionary follow the -d paramet
+er with
the path the file. The path may not have commas in it and only lines i
+n the
dictionary that do not match /\W/ will be used. The entire dictionary 
+will be
loaded into memory at once.

  -d/usr/share/dict/stop

=item B<-m>I<REGEX>

Supply a different regular expression for deciding which symbols to re
+name.
The default value is /\A[[:lower:][:digit:]_]+\z/. Your expression mus
+t be
delimited by the '/' characters and you may not use that character wit
+hin the
expression. That shouldn't be an issue because '/' isn't valid in a sy
+mbol
name anyway.

  -a/\A[[:lower:][:digit:]_]+\z/

=item B<-y>

print two B<YAML> documents to STDOUT instead of the deparsed source c
+ode.
The first document is a configuration document suitable for use with t
+he B<-c>
parameter. The second document is the deparsed source code. Use this f
+eature
to generate a configuration document for further, iterative reverse en
+gineering.

=item B<-c>I<FILENAME>

Supply a filename to a B<YAML> configuration file. Normally you would 
+generate this
file by saving the results of the B<-y> parameter to a file. You can t
+hen edit the
file to provide your own names for symbols and not rely on the random 
+symbol picker
in B<B::Deobfuscate>. You may create your own B<YAML> configuration fi
+le as well.

=back

=head1 CONFIGURATION FILE

The B::Deobfuscation symbol renamer can be controlled with by a config
+uration file.
Use of this feature requires the L<YAML> module be installed.

 dictionary: '/usr/share/dict/propernames'
 global_regex: '(?:)'
 globals:
   kSDsfDS: Slartibartfast
   HGFdsfds: Triantaphyllos
 lexicals:
   '$SdfSd': '$No'
   '$GsdDd': '$Ed'
   '$Ksdfs': '$Ji'

The following keys are recognized:

=over 4

=item B<dictionary>

This is a filename path to the operative dictionary.

 dictionary: /usr/share/dict/stop

=item B<global_regex>

This regular expression tests global symbols. Only symbols that match 
+this
expression may be renamed. The default value is '\A[[:lower:][:digit:]
+_]\z/.
In perl, global symbols are independent of their sigil so the values b
+eing
tested are bare. Future versions of B::Deobfuscate may add the sigil t
+o the
symbol name.

 global_regex: '\A[[:lower:][:digit:]_]\z'

=item B<globals>

This is a hash detailing symbol names as used in the original source a
+nd the
name used in the deobfuscated source. For example - if the original so
+urce
has a variable named @z12345 and you wish to rename all occurrances to
+ 
@URLList then the hash would associate 'z12345' with 'URLList'. The di
+ctionary
picker fills these values in automatically.

If you wish to prevent B::Deobfuscate from renaming a symbol then spec
+ify the
new value as '~' (which in YAML terms is undef).

 globals:
   catfile: ~
   opt_n: ~
   opt_t: ~
   opt_u: ~
   z1234567890: Postprocesser
   z2345678901: Constructable
   z3456789012: Photosynthesises
   z4567890123: Undiscriminate
   z5678901234: Parenthesises
   z6789012345: Animadvertion

=item B<lexicals>

Lexicals is a hash exactly like `globals' except that all the symbol n
+ames
include the sigil which doesn't currently happen for globals.

 lexicals:
   '$k1234567890': '$ivs'
   '$k2345678901': '$ehs'
   '$k3456789012': '$ans'
   '$k4567890123': '$ons'
   '$k5678901234': '$ofs'
   '$k6789012345': '$gos'
   '$k7890123456': '$dus'
   '$k8901234567': '$iis'
   '$k9012345678': '$ats'
   '$k0123456780': '$ets'

=back

=head1 AUTHOR

Joshua b. Jore <jjore@cpan.org>

=head1 SEE ALSO

L<B::Deparse>
L<http://www.perlmonks.org/index.pl?node_id=243011>
L<http://www.perlmonks.org/index.pl?node_id=244604>

=cut

Comment on B::Deobfuscate - Deobfuscates symbol names
Download Code
Replies are listed 'Best First'.
B::Deobfuscate - New version - 0.03
by diotalevi (Canon) on Mar 24, 2003 at 13:17 UTC

    New version 0.03. This adds configuration file support, symbol renaming table dumping and better known-perl-keyword support.

    Example configuration file: (run on the Stunnix Perl-obfus sampe output)

    $ perl -MO=Deobfuscate,-d/usr/share/dict/propernames,-m/\(?:\)/,-y /ho +me/josh/bin/stun.pl dictionary: '/usr/share/dict/propernames' global_regex: '(?:)' globals: File::Temp::tempfile: Triantaphyllos catfile: Christopher opt_n: Sundaresan opt_t: Konstantinos opt_u: Rathnakumar z1387c3e117: Vidhyanath z3d52c17699: Srinivasan z6a703c020a: Slartibartfast z8374cc586e: Ravindranath z9101bb5154: Tiefenthal zf9d3fbdfa8: Pratapwant lexicals: '$z1dfa4cd057': '$ti' '$z5a5fa8125d': '$ji' '$z5da4d3837d': '$al' '$z5f7e4d50b8': '$ami' '$z7d70dd4ea9': '$alf' '$z8348550157': '$po' '$z9e5935eea4': '$hy' '$zbea0aee021': '$ro' '$zc11e41b4ef': '$ed' '$zcc158ad3e0': '$no'
Re: B::Deobfuscate - Deobfuscates symbol names
by John M. Dlugosz (Monsignor) on Apr 03, 2003 at 16:59 UTC
    Suggestion: I think the biometric word lists used by PGP-phone would make excellent dictionaries. They are words designed to be uniformly "far apart" in sound, so are easily distinguished. Two lists are available: one is 256 2-sylable words, the other is 256 3-sylable words.
Re: B::Deobfuscate - Deobfuscates symbol names
by George_Sherston (Vicar) on Jun 23, 2003 at 23:26 UTC
    I enjoyed the controversy that helped give birth to this fine bit of kit - well done for showing him the door in a way that does credit to the community. I just wanted to suggest proper names for your vars. I quite often use proper names for variables in development - I find them euphonious and easy to remember. @bob = ($sally,$eric,$tarquin). Or even get the module to tell the user how many var names it needs, so he/she can input his/her own list of memorable phonemes. Or whatever - you've probably had enough of it!

    George Sherston

      My CVS copy is going to have a built in dictionary of words. Right now it uses the PGP phoneme set but I can't use that dictionary because I think RSA Security owns it. I like your proper-name suggestion and I'll likely incorporate it. I'm going to rework it somewhat so that other modules can supply their own dictionaries. I still want that Acme::Floral (currently a dead link) slot. BTW, was that you that mass upvoted me in that thread just now?

      Updated: Acme::Floral has been created.

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://244604]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others rifling through the Monastery: (8)
As of 2015-07-31 11:02 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (276 votes), past polls