#!/usr/bin/perl -w # bad_crack0.pl - Don Yang (uguu.org) # # Doesn't work, using bigrams results in more noisy choices of shift offsets, # and also makes the cracking process too slow. # # 2015-05-29 use strict; use constant MINIMUM_TRAINING_SAMPLE_SIZE => 0x1000; use constant MAX_KEY_SIZE => 8; use constant DEBUG => 0; # Convert all alpha characters in string to bytes in [0..25] range, and # remove all other characters. sub ConvertInput($) { my ($data) = @_; $data = lc $data; $data =~ y/a-z//cd; $data =~ y/a-z/\0-\31/; return $data; } # Given a nonempty string containing only bytes in [0..25] range, compute # frequency distribution for each byte. sub GetCharDistribution($$) { my ($text, $dist) = @_; if( DEBUG ) { die unless defined $text; die unless length($text) > 0; die unless defined $dist; } # Get frequencies my @freq = (); $freq[$_] = 0 foreach (0..25); $freq[$_]++ foreach unpack 'C*', $text; # Convert frequencies to probabilities @$dist = map {$_ / length($text)} @freq; if( DEBUG ) { print "GetCharDistribution(", length($text), ")\n"; die unless $#$dist == 25; for(my $i = 0; $i < 26; $i++) { printf ' %.3f', $$dist[$i]; print "\n" if( ($i + 1) % 13 == 0 ); } } } # Given a nonempty string containing only bytes in [0..25] range, compute # frequency distribution for each bigram. sub GetBigramDistribution($$) { my ($text, $bigram) = @_; if( DEBUG ) { die unless defined $text; die unless length($text) > 0; die unless defined $bigram; } # Get frequencies my @freq = (); my $last = undef; foreach my $current (unpack 'C*', $text) { if( defined($last) ) { $freq[$last][$current]++; } $last = $current; } # Convert frequencies to probabilities for(my $i = 0; $i < 26; $i++) { for(my $j = 0; $j < 26; $j++) { $$bigram[$i][$j] = ($freq[$i][$j] || 0) / (length($text) - 1); } } } # Load sample data from file sub LoadSample($$$) { my ($file, $dist, $bigram) = @_; open my $handle, "<$file" or die $!; my $data = ConvertInput(join '', <$handle>); close $handle; if( length($data) < MINIMUM_TRAINING_SAMPLE_SIZE ) { die "Insufficient alpha characters in $file\n"; } GetCharDistribution($data, $dist); GetBigramDistribution($data, $bigram); } # Split string into array of bytes by stride sub SplitToStrides($$$) { my ($input, $stride, $output) = @_; if( DEBUG ) { die unless defined $input; die unless length($input) > 0; die unless defined $output; } $#$output = $stride - 1; my $index = 0; foreach my $c (unpack 'C*', $input) { $$output[$index++] .= chr($c); $index %= $stride; } } # Given an expected distribution and a nonempty string containing only # bytes in [0..25] range, output an array of shift offsets, sorted by # error amount. sub GetShift($$$) { my ($dist, $input, $output) = @_; if( DEBUG ) { die unless defined $input; die unless length($input) > 0; die unless defined $output; } my @input_dist; GetCharDistribution($input, \@input_dist); my @offsets = (); for(my $o = 0; $o < 26; $o++) { # Apply shift offset and compute error amount my $error = 0; for(my $i = 0; $i < 26; $i++) { my $diff = $$dist[$i] - $input_dist[($i + $o) % 26]; $error += $diff * $diff; } # Record (error, shift offset) pair push @offsets, [$error, $o]; } # Add offsets to output, sorted by error. foreach my $p (sort {$$a[0] <=> $$b[0]} @offsets) { push @$output, $$p[1]; if( DEBUG ) { print "offset=", $$p[1], ", error=", $$p[0], "\n"; } } } # Apply shift amounts to input and return the shifted string. Both # input and shift amounts are specified as bytes in [0..25] range. sub ApplyShift($$) { my ($input, $key) = @_; if( DEBUG ) { die unless defined $input; die unless length($input) > 0; die unless defined $key; die unless length($key) > 0; die unless $key =~ /^[\0-\31]*$/; } my $index = 0; my $output = ""; foreach my $c (unpack 'C*', $input) { $output .= chr(($c + ord(substr($key, $index++, 1))) % 26); $index %= length($key); } if( DEBUG ) { print "test:"; foreach my $c (unpack 'C*', $key) { printf ' %c%c', ord('A') + $c, ord('a') + 25 - $c; } print "\n"; } return $output; } # Get difference in bigram distribution sub CompareBigramDistribution($$) { my ($expected_dist, $text) = @_; if( DEBUG ) { die unless defined $expected_dist; die unless defined $text; die unless length($text) > 0; die unless $text =~ /^[\0-\31]*$/; } my @dist; GetBigramDistribution($text, \@dist); my $error = 0; for(my $i = 0; $i < 26; $i++) { for(my $j = 0; $j < 26; $j++) { my $diff = $$expected_dist[$i][$j] - $dist[$i][$j]; $error += $diff * $diff; } } if( DEBUG ) { print "error = $error\n"; } return $error; } # Convert binary string with shift amounts to ASCII key sub ConvertKey($) { my ($key) = @_; return join '', map {chr(ord('A') + $_)} unpack 'C*', $key; } # Given a nonempty string containing only bytes in [0..25] range, output # possible keys to stdout. sub Crack($$$) { my ($dist, $bigram, $input) = @_; if( DEBUG ) { die unless defined $dist; die unless defined $bigram; die unless defined $input; die unless length($input) > 0; die unless $input =~ /^[\0-\31]*$/; print "Input size = ", length($input), "\n"; } # Keep track of minimum error against bigram distribution, and only # output new key candidate if it improves upon this error. # # Maximum possible error is 1.0 difference from all 26*26 character # pairs, hence the 26*26 here. my $error = 26 * 26; # Try key lengths up to 1/5 of original input size. This is done # by simply searching through all key sizes linearly. # # A more efficient way to do this is to try common divisors for all # offsets of repeated strings, but that doesn't work so well with # short input strings and we end up having to try all sizes # linearly anyways. my $max_key_size = length($input) / 5; if( $max_key_size > MAX_KEY_SIZE ) { $max_key_size = MAX_KEY_SIZE; } for(my $key_size = 1; $key_size < $max_key_size; $key_size++) { if( DEBUG ) { print "Key size = $key_size\n"; } # Split input to strides by key size. Because key size is much # smaller than input, this guarantees that each stride will always # have a few characters in it. my @strides; SplitToStrides($input, $key_size, \@strides); # Get candidate shift amounts for each stride my @shift = (); $#shift = $#strides; for(my $i = 0; $i < $key_size; $i++) { GetShift($dist, $strides[$i], \@{$shift[$i]}); } # Try the top two choices for each position for(my $i = 0; $i < 1 << $key_size; $i++) { my $test_key = ''; for(my $j = 0; $j < $key_size; $j++) { $test_key .= chr($shift[$j][($i & (1 << $j)) == 0]); } my $test_error = CompareBigramDistribution($bigram, ApplyShift($input, $test_key)); # Keep the keys that minimizes errors against bigram distribution. # # In practice, this actually makes the candidate keys worse :( if( $error > $test_error ) { $error = $test_error; print ConvertKey($test_key), "\n"; } } } } if( $#ARGV < 0 ) { die "$0 \n"; } # Load expected distributions from training sample my $sample = shift @ARGV; my (@dist, @bigram); LoadSample($sample, \@dist, \@bigram); # Get input text to crack my $input = join '', <>; Crack(\@dist, \@bigram, ConvertInput($input));