#!/usr/bin/perl -w
# bad_crack1.pl - Don Yang (uguu.org)
#
# Forked from crack04c.pl
#
# Doesn't work, sorting by overall character frequencies without considering
# error margins results in long keys that tend to overfit.
#
# 2015-06-13

use strict;
use constant DEBUG => 1;
use constant MAX_KEY_SIZE => 64;

# Convert all alpha characters in string to bytes in [0..25] range, and
# remove all other characters.
sub ConvertInput($)
{
   my ($data) = @_;
   $data = lc $data;
   $data =~ y/a-z//cd;
   $data =~ y/a-z/\0-\31/;
   return $data;
}

# Compute character frequencies from byte array
sub GetFrequency($$)
{
   my ($data, $freq) = @_;

   if( DEBUG )
   {
      die unless defined $data;
      die unless length($data) > 0;
      die unless defined $freq;
   }

   $$freq[$_] = 0 foreach (0..25);
   $$freq[$_]++ foreach unpack 'C*', $data;
}

# Split string into array of bytes by stride
sub SplitToStrides($$$)
{
   my ($input, $stride, $output) = @_;

   if( DEBUG )
   {
      die unless defined $input;
      die unless length($input) > 0;
      die unless defined $output;
   }

   $#$output = $stride - 1;

   my $index = 0;
   foreach my $c (unpack 'C*', $input)
   {
      $$output[$index++] .= chr($c);
      $index %= $stride;
   }
}

# Find best shift offset for a single distribution.
sub GetShiftOffset($$$)
{
   my ($dist, $input, $freq) = @_;

   if( DEBUG )
   {
      die unless defined $input;
      die unless length($input) > 0;
      die unless defined $dist;
   }

   GetFrequency($input, $freq);
   my @input_dist = map {$_ / length($input)} @$freq;

   my $best_offset;
   my $min_error = 27;
   my @offsets = ();
   for(my $o = 0; $o < 26; $o++)
   {
      # Apply shift offset and compute error amount
      my $error = 0;
      for(my $i = 0; $i < 26; $i++)
      {
         my $diff = $$dist[$i] - $input_dist[($i + $o) % 26];
         $error += $diff * $diff;
      }

      # Remember offset with the smallest error
      if( $min_error > $error )
      {
         $min_error = $error;
         $best_offset = $o;
      }
   }
   return $best_offset;
}

# Find the best key that would minimize error for a particular key length.
# Returns (key, error), where error is sum of differences squared from
# expected distribution.
sub FindKeyWithLength($$$)
{
   my ($dist, $input, $key_size) = @_;

   # Split input to strides by key size.  Because key size is much
   # smaller than input, this guarantees that each stride will always
   # have a few characters in it.
   my @strides;
   SplitToStrides($input, $key_size, \@strides);

   # Find shift amounts for each stride, and also update shifted frequencies
   my $key = "";
   my @freq = ();
   $freq[$_] = 0 foreach (0..25);
   for(my $i = 0; $i < $key_size; $i++)
   {
      my @stride_freq;
      my $offset = GetShiftOffset($dist, $strides[$i], \@stride_freq);

      for(my $i = 0; $i < 26; $i++)
      {
         $freq[$i] += $stride_freq[($i + $offset) % 26];
      }

      $key .= chr($offset);
   }

   # Compute error from expected distribution
   my $error = 0;
   for(my $i = 0; $i < 26; $i++)
   {
      my $delta = $freq[$i] / length($input) - $$dist[$i];
      $error += $delta * $delta;
   }

   return ($key, $error);
}

# Convert shift amounts to lowercase key
sub ConvertToLowercaseKey($)
{
   my ($key) = @_;
   return join '', map {chr(ord('a') + (26 - $_) % 26)} unpack 'C*', $key;
}

# Convert shift amounts to uppercase key
sub ConvertToUppercaseKey($)
{
   my ($key) = @_;
   return join '', map {chr(ord('A') + $_)} unpack 'C*', $key;
}

# Find candidate key in string of bytes.  Output possible keys to stdout.
sub Crack($$)
{
   my ($dist, $input) = @_;

   # Try key lengths up to 1/5 of original input size.  This is done
   # by simply searching through all key sizes linearly.
   #
   # A more efficient way to do this is to try common divisors for all
   # offsets of repeated strings, but that doesn't work so well with
   # short input strings and we end up having to try all sizes
   # linearly anyways.
   my $max_key_size = length($input) / 5;
   if( $max_key_size > MAX_KEY_SIZE )
   {
      $max_key_size = MAX_KEY_SIZE;
   }

   my %found_keys = ();
   for(my $key_size = 1; $key_size < $max_key_size; $key_size++)
   {
      if( DEBUG )
      {
         print "Key size = $key_size\n";
      }

      my ($key, $error) = FindKeyWithLength($dist, $input, $key_size);
      next unless defined $key;
      next if exists $found_keys{$key};

      # Remember error value for this key
      $found_keys{$key} = $error;

      # Remove other keys that are multiples of this key.  This is so that
      # if we see a key candidate in the form of "XX", we would return just
      # "X" as the canonical key.
      my $base_key = $key;
      for(my $j = $key_size; $j < $max_key_size; $j += $key_size)
      {
         $key .= $base_key;
         $found_keys{$key} = 26;
      }
   }

   # Output key with the lowest error, break ties by preferring
   # longer keys.
   foreach my $key (sort {$found_keys{$a} <=> $found_keys{$b} ||
                          length($b) <=> length($a)}
                    keys %found_keys)
   {
      if( DEBUG )
      {
         print $found_keys{$key}, " -> ", ConvertToUppercaseKey($key), "\n";
      }
      last if $found_keys{$key} >= 26;
      print ConvertToUppercaseKey($key), "\n",
            ConvertToLowercaseKey($key), "\n";

      if( DEBUG ) { next; }
      return;
   }
}


# Expected character probabilities, from "Alice in Wonderland".
# http://www.gutenberg.org/ebooks/11
my @expected_distribution = (
   .07969,
   .01419,
   .02439,
   .04446,
   .12517,
   .01936,
   .02392,
   .06414,
   .07019,
   .00191,
   .01048,
   .04236,
   .02005,
   .06546,
   .07705,
   .01600,
   .00178,
   .05374,
   .05909,
   .09919,
   .03234,
   .00782,
   .02400,
   .00143,
   .02101,
   .00065,
);

# Find candidate keys for stdin
my $input = join '', <>;
Crack(\@expected_distribution, ConvertInput($input));