#!/usr/bin/perl -w use strict; use constant DEBUG => 0; use constant MAX_KEY_SIZE => 64; # Convert all alpha characters in string to bytes in [0..25] range, and # remove all other characters. sub ConvertInput($) { my ($data) = @_; $data = lc $data; $data =~ y/a-z//cd; $data =~ y/a-z/\0-\31/; return $data; } # Compute character frequency distribution from byte array sub GetDistribution($$) { my ($data, $dist) = @_; if( DEBUG ) { die unless defined $data; die unless length($data) > 0; die unless defined $dist; } # Get frequencies my @freq = (); $freq[$_] = 0 foreach (0..25); $freq[$_]++ foreach unpack 'C*', $data; # Convert frequencies to probabilities @$dist = map {$_ / length($data)} @freq; } # Split string into array of bytes by stride sub SplitToStrides($$$) { my ($input, $stride, $output) = @_; if( DEBUG ) { die unless defined $input; die unless length($input) > 0; die unless defined $output; } $#$output = $stride - 1; my $index = 0; foreach my $c (unpack 'C*', $input) { $$output[$index++] .= chr($c); $index %= $stride; } } # Find best shift offset for a single distribution. Returns undef if # no suitable shift offset is found. sub GetShiftOffset($$$) { my ($dist, $input, $confidence) = @_; if( DEBUG ) { die unless defined $input; die unless length($input) > 0; die unless defined $dist; } my @input_dist; GetDistribution($input, \@input_dist); my @offsets = (); for(my $o = 0; $o < 26; $o++) { # Apply shift offset and compute error amount my $error = 0; for(my $i = 0; $i < 26; $i++) { my $diff = $$dist[$i] - $input_dist[($i + $o) % 26]; $error += $diff * $diff; } # Record (error, shift offset) pair push @offsets, [$error, $o]; } # Sort offsets by error amount @offsets = sort {$$a[0] <=> $$b[0]} @offsets; # If we guessed the right key size, the best fitting shift amount # usually results in significantly less error than the next best # fit. This is due to how most people choose keys with distinct # characters, such when an incorrect key size is used, each stride # contains samples of characters that are shifted by different # amounts instead of a single shift amount, and the resulting mix # of character frequencies means no single shift amount is # significantly better than other shift amounts. # # Thus, this scheme doesn't work well when people use keys with # repeated characters, such as "banana". # # The threshold for controlling of much error is "significantly # less" is controlled by multiplying standard deviation with # $confidence parameter, which is gradually decreased until we can # find some key. my $average_error = 0; foreach my $p (@offsets) { $average_error += $$p[0]; } $average_error /= 26; my $error_deviation = 0; foreach my $p (@offsets) { my $delta = $$p[0] - $average_error; $error_deviation += $delta * $delta; } $error_deviation = sqrt($error_deviation / 26); my $margin = $offsets[1][0] - $offsets[0][0]; if( $margin > $error_deviation * $confidence ) { if( DEBUG ) { print "Shift = $offsets[0][1], error = $offsets[0][0]\n"; } return $offsets[0][1]; } if( DEBUG ) { print "No candidate shift found, minimum error = $offsets[0][0]\n"; } return undef; } # Find the best key that would minimize error for a particular key length. # Returns the key if a reasonable candidate exists, or undef otherwise. sub FindKeyWithLength($$$$) { my ($dist, $input, $key_size, $confidence) = @_; # Split input to strides by key size. Because key size is much # smaller than input, this guarantees that each stride will always # have a few characters in it. my @strides; SplitToStrides($input, $key_size, \@strides); # Get candidate shift amounts for each stride my $key = ""; for(my $i = 0; $i < $key_size; $i++) { my $offset = GetShiftOffset($dist, $strides[$i], $confidence); return undef unless defined $offset; $key .= chr($offset); } return $key; } # Convert shift amounts to lowercase key sub ConvertToLowercaseKey($) { my ($key) = @_; return join '', map {chr(ord('a') + (26 - $_) % 26)} unpack 'C*', $key; } # Convert shift amounts to uppercase key sub ConvertToUppercaseKey($) { my ($key) = @_; return join '', map {chr(ord('A') + $_)} unpack 'C*', $key; } # Find candidate key in string of bytes. Output possible keys to stdout. # Returns 1 if a key has been found, 0 otherwise. sub Crack($$$) { my ($dist, $input, $confidence) = @_; # Try key lengths up to 1/5 of original input size. This is done # by simply searching through all key sizes linearly. # # A more efficient way to do this is to try common divisors for all # offsets of repeated strings, but that doesn't work so well with # short input strings and we end up having to try all sizes # linearly anyways. my $max_key_size = length($input) / 5; if( $max_key_size > MAX_KEY_SIZE ) { $max_key_size = MAX_KEY_SIZE; } my %found_keys = (); my $longest_key = undef; for(my $key_size = 1; $key_size < $max_key_size; $key_size++) { if( DEBUG ) { print "Key size = $key_size\n"; } my $key = FindKeyWithLength($dist, $input, $key_size, $confidence); next unless defined $key; next if exists $found_keys{$key}; # Remember longest unique key that is not a multiple of previous # shorter keys. $longest_key = $key; my $base_key = $key; for(my $j = $key_size; $j < $max_key_size; $j += $key_size) { $key .= $base_key; $found_keys{$key} = 1; } } # Output key if( defined $longest_key ) { print ConvertToUppercaseKey($longest_key), "\n", ConvertToLowercaseKey($longest_key), "\n"; return 1; } return 0; } # Expected character probabilities, from "Alice in Wonderland". # http://www.gutenberg.org/ebooks/11 my @expected_distribution = ( .07969, .01419, .02439, .04446, .12517, .01936, .02392, .06414, .07019, .00191, .01048, .04236, .02005, .06546, .07705, .01600, .00178, .05374, .05909, .09919, .03234, .00782, .02400, .00143, .02101, .00065, ); # Find candidate keys for stdin my $input = join '', <>; for(my $c = 20; $c > 5; $c--) { last if Crack(\@expected_distribution, ConvertInput($input), $c / 10); }