#!/usr/bin/perl -w use strict; use constant MAX_KEY_SIZE => 64; # Read input from stdin, or files specified on command line my $input = join '', <>; # Convert all input characters in string to bytes in [0..25] range, and # remove all other characters. $input = lc $input; $input =~ y/a-z//cd; $input =~ y/a-z/\0-\31/; # Expected character probabilities, from "Alice in Wonderland". # http://www.gutenberg.org/ebooks/11 my @expected_distribution = ( .07969, .01419, .02439, .04446, .12517, .01936, .02392, .06414, .07019, .00191, .01048, .04236, .02005, .06546, .07705, .01600, .00178, .05374, .05909, .09919, .03234, .00782, .02400, .00143, .02101, .00065, ); # Find best shift offset for a single distribution. # # Returns (offset, confidence) pair, where "confidence" is some # measure of how likely this offset is correct, higher is better. sub GetShiftOffset($) { my ($data) = @_; # Compute character frequency distribution from byte array my @freq = (); $freq[$_] = 0 foreach (0..25); $freq[$_]++ foreach unpack 'C*', $data; my @input_dist = map {$_ / length($data)} @freq; my @offsets = (); for(my $o = 0; $o < 26; $o++) { # Apply shift offset and compute error amount my $error = 0; for(my $i = 0; $i < 26; $i++) { my $diff = $expected_distribution[$i] - $input_dist[($i + $o) % 26]; $error += $diff * $diff; } # Record (error, shift offset) pair push @offsets, [$error, $o]; } # Sort offsets by error amount @offsets = sort {$$a[0] <=> $$b[0]} @offsets; # If we guessed the right key size, the best fitting shift amount # usually results in significantly less error than the next best # fit. This is due to how most people choose keys with distinct # characters, such when an incorrect key size is used, each stride # contains samples of characters that are shifted by different # amounts instead of a single shift amount, and the resulting mix # of character frequencies means no single shift amount is # significantly better than other shift amounts. # # Thus, this scheme doesn't work well when people use keys with # repeated characters, such as "banana". # # This difference in error amount is returned in units of standard # deviation, which the caller will make use of after having # collected all key candidates. my $average_error = 0; foreach my $p (@offsets) { $average_error += $$p[0]; } $average_error /= 26; my $error_deviation = 0; foreach my $p (@offsets) { my $delta = $$p[0] - $average_error; $error_deviation += $delta * $delta; } $error_deviation = sqrt($error_deviation / 26); # If error_deviation is close to zero, it means the characters were # pretty much uniformly distributed, and it's not possible to crack. if( $error_deviation < 1e-6 ) { return (0, 0); } my $margin = $offsets[1][0] - $offsets[0][0]; # Compute confidence in log units of standard deviation. Anything # in the same threshold bucket seem equally good or bad: # - 1 < confidence # - 0.5 < confidence <= 1 # - 0.25 < confidence <= 0.5 # - confidence <= 0.25 # # We bucketize these confidence values since they aren't really # that precise, and if there are two key candidates with comparable # confidence, we want to keep both of them. my $confidence = $margin / $error_deviation; $confidence = $confidence < 0.25 ? 0 : $confidence < 0.5 ? 1 : $confidence < 1 ? 2 : 3; return ($offsets[0][1], $confidence); } # Find the best key that would minimize error for a particular key length. # Returns (key, confidence), where confidence is a value that indicates the # likelihood of this key being the right one, higher is better. # # If no key with confidence above $threshold is found, return (undef, 0) sub FindKeyWithLength($$) { my ($key_size, $threshold) = @_; # Split input to strides by key size. Because key size is much # smaller than input, this guarantees that each stride will always # have a few characters in it. my @strides; $#strides = $key_size - 1; my $index = 0; foreach my $c (unpack 'C*', $input) { $strides[$index++] .= chr($c); $index %= $key_size; } # Get candidate shift amounts for each stride my $key = ""; my $min_confidence = undef; for(my $i = 0; $i < $key_size; $i++) { my ($offset, $confidence) = GetShiftOffset($strides[$i]); $key .= chr($offset); if( !defined($min_confidence) || $min_confidence > $confidence ) { $min_confidence = $confidence; if( $min_confidence < $threshold ) { return (undef, 0); } } } return ($key, $min_confidence); } # Find candidate key in string of bytes. Output possible keys to stdout. sub Crack() { # Try key lengths up to 1/5 of original input size. This is done # by simply searching through all key sizes linearly. # # A more efficient way to do this is to try common divisors for all # offsets of repeated strings, but that doesn't work so well with # short input strings and we end up having to try all sizes # linearly anyways. my $max_key_size = length($input) / 5; if( $max_key_size > MAX_KEY_SIZE ) { $max_key_size = MAX_KEY_SIZE; } # Minimum accepted confidence threshold. Because we sort keys by # confidence later, if we already found keys with higher confidence, # there is no need to examine partial keys with lower confidence. my $min_confidence = 0; my %found_keys = (); for(my $key_size = 1; $key_size < $max_key_size; $key_size++) { my ($key, $confidence) = FindKeyWithLength($key_size, $min_confidence); next unless defined $key; next if exists $found_keys{$key}; # Remember confidence value for this key $found_keys{$key} = $confidence; # Remove other keys that are multiples of this key. This is so that # if we see a key candidate in the form of "XX", we would return just # "X" as the canonical key. my $base_key = $key; for(my $j = $key_size; $j < $max_key_size; $j += $key_size) { $key .= $base_key; $found_keys{$key} = 0; } # Update minimum confidence level. This new key must be at equal or # higher confidence, otherwise it would have been rejected already. $min_confidence = $confidence; } # Output key with the highest confidence, break ties by preferring # longer keys. foreach my $key (sort {$found_keys{$b} <=> $found_keys{$a} || length($b) <=> length($a)} keys %found_keys) { last if $found_keys{$key} == 0; # Convert shift amounts to uppercase and lowercase keys print ( (join '', map {chr(ord('A') + $_)} unpack 'C*', $key), "\n", (join '', map {chr(ord('a') + (26 - $_) % 26)} unpack 'C*', $key), "\n" ); last; } } Crack();