#!/usr/bin/perl -w
# bad_crack0.pl - Don Yang (uguu.org)
#
# Doesn't work, using bigrams results in more noisy choices of shift offsets,
# and also makes the cracking process too slow.
#
# 2015-05-29

use strict;
use constant MINIMUM_TRAINING_SAMPLE_SIZE => 0x1000;
use constant MAX_KEY_SIZE => 8;
use constant DEBUG => 0;

# Convert all alpha characters in string to bytes in [0..25] range, and
# remove all other characters.
sub ConvertInput($)
{
   my ($data) = @_;
   $data = lc $data;
   $data =~ y/a-z//cd;
   $data =~ y/a-z/\0-\31/;
   return $data;
}

# Given a nonempty string containing only bytes in [0..25] range, compute
# frequency distribution for each byte.
sub GetCharDistribution($$)
{
   my ($text, $dist) = @_;

   if( DEBUG )
   {
      die unless defined $text;
      die unless length($text) > 0;
      die unless defined $dist;
   }

   # Get frequencies
   my @freq = ();
   $freq[$_] = 0 foreach (0..25);
   $freq[$_]++ foreach unpack 'C*', $text;

   # Convert frequencies to probabilities
   @$dist = map {$_ / length($text)} @freq;

   if( DEBUG )
   {
      print "GetCharDistribution(", length($text), ")\n";
      die unless $#$dist == 25;
      for(my $i = 0; $i < 26; $i++)
      {
         printf ' %.3f', $$dist[$i];
         print "\n" if( ($i + 1) % 13 == 0 );
      }
   }
}

# Given a nonempty string containing only bytes in [0..25] range, compute
# frequency distribution for each bigram.
sub GetBigramDistribution($$)
{
   my ($text, $bigram) = @_;

   if( DEBUG )
   {
      die unless defined $text;
      die unless length($text) > 0;
      die unless defined $bigram;
   }

   # Get frequencies
   my @freq = ();
   my $last = undef;
   foreach my $current (unpack 'C*', $text)
   {
      if( defined($last) )
      {
         $freq[$last][$current]++;
      }
      $last = $current;
   }

   # Convert frequencies to probabilities
   for(my $i = 0; $i < 26; $i++)
   {
      for(my $j = 0; $j < 26; $j++)
      {
         $$bigram[$i][$j] = ($freq[$i][$j] || 0) / (length($text) - 1);
      }
   }
}

# Load sample data from file
sub LoadSample($$$)
{
   my ($file, $dist, $bigram) = @_;

   open my $handle, "<$file" or die $!;
   my $data = ConvertInput(join '', <$handle>);
   close $handle;
   if( length($data) < MINIMUM_TRAINING_SAMPLE_SIZE )
   {
      die "Insufficient alpha characters in $file\n";
   }

   GetCharDistribution($data, $dist);
   GetBigramDistribution($data, $bigram);
}

# Split string into array of bytes by stride
sub SplitToStrides($$$)
{
   my ($input, $stride, $output) = @_;

   if( DEBUG )
   {
      die unless defined $input;
      die unless length($input) > 0;
      die unless defined $output;
   }

   $#$output = $stride - 1;

   my $index = 0;
   foreach my $c (unpack 'C*', $input)
   {
      $$output[$index++] .= chr($c);
      $index %= $stride;
   }
}

# Given an expected distribution and a nonempty string containing only
# bytes in [0..25] range, output an array of shift offsets, sorted by
# error amount.
sub GetShift($$$)
{
   my ($dist, $input, $output) = @_;

   if( DEBUG )
   {
      die unless defined $input;
      die unless length($input) > 0;
      die unless defined $output;
   }

   my @input_dist;
   GetCharDistribution($input, \@input_dist);

   my @offsets = ();
   for(my $o = 0; $o < 26; $o++)
   {
      # Apply shift offset and compute error amount
      my $error = 0;
      for(my $i = 0; $i < 26; $i++)
      {
         my $diff = $$dist[$i] - $input_dist[($i + $o) % 26];
         $error += $diff * $diff;
      }

      # Record (error, shift offset) pair
      push @offsets, [$error, $o];
   }

   # Add offsets to output, sorted by error.
   foreach my $p (sort {$$a[0] <=> $$b[0]} @offsets)
   {
      push @$output, $$p[1];

      if( DEBUG )
      {
         print "offset=", $$p[1], ", error=", $$p[0], "\n";
      }
   }
}

# Apply shift amounts to input and return the shifted string.  Both
# input and shift amounts are specified as bytes in [0..25] range.
sub ApplyShift($$)
{
   my ($input, $key) = @_;

   if( DEBUG )
   {
      die unless defined $input;
      die unless length($input) > 0;
      die unless defined $key;
      die unless length($key) > 0;
      die unless $key =~ /^[\0-\31]*$/;
   }

   my $index = 0;
   my $output = "";
   foreach my $c (unpack 'C*', $input)
   {
      $output .= chr(($c + ord(substr($key, $index++, 1))) % 26);
      $index %= length($key);
   }

   if( DEBUG )
   {
      print "test:";
      foreach my $c (unpack 'C*', $key)
      {
         printf ' %c%c', ord('A') + $c, ord('a') + 25 - $c;
      }
      print "\n";
   }
   return $output;
}

# Get difference in bigram distribution
sub CompareBigramDistribution($$)
{
   my ($expected_dist, $text) = @_;

   if( DEBUG )
   {
      die unless defined $expected_dist;
      die unless defined $text;
      die unless length($text) > 0;
      die unless $text =~ /^[\0-\31]*$/;
   }

   my @dist;
   GetBigramDistribution($text, \@dist);

   my $error = 0;
   for(my $i = 0; $i < 26; $i++)
   {
      for(my $j = 0; $j < 26; $j++)
      {
         my $diff = $$expected_dist[$i][$j] - $dist[$i][$j];
         $error += $diff * $diff;
      }
   }

   if( DEBUG )
   {
      print "error = $error\n";
   }
   return $error;
}

# Convert binary string with shift amounts to ASCII key
sub ConvertKey($)
{
   my ($key) = @_;

   return join '', map {chr(ord('A') + $_)} unpack 'C*', $key;
}

# Given a nonempty string containing only bytes in [0..25] range, output
# possible keys to stdout.
sub Crack($$$)
{
   my ($dist, $bigram, $input) = @_;

   if( DEBUG )
   {
      die unless defined $dist;
      die unless defined $bigram;
      die unless defined $input;
      die unless length($input) > 0;
      die unless $input =~ /^[\0-\31]*$/;
      print "Input size = ", length($input), "\n";
   }

   # Keep track of minimum error against bigram distribution, and only
   # output new key candidate if it improves upon this error.
   #
   # Maximum possible error is 1.0 difference from all 26*26 character
   # pairs, hence the 26*26 here.
   my $error = 26 * 26;

   # Try key lengths up to 1/5 of original input size.  This is done
   # by simply searching through all key sizes linearly.
   #
   # A more efficient way to do this is to try common divisors for all
   # offsets of repeated strings, but that doesn't work so well with
   # short input strings and we end up having to try all sizes
   # linearly anyways.
   my $max_key_size = length($input) / 5;
   if( $max_key_size > MAX_KEY_SIZE )
   {
      $max_key_size = MAX_KEY_SIZE;
   }
   for(my $key_size = 1; $key_size < $max_key_size; $key_size++)
   {
      if( DEBUG )
      {
         print "Key size = $key_size\n";
      }

      # Split input to strides by key size.  Because key size is much
      # smaller than input, this guarantees that each stride will always
      # have a few characters in it.
      my @strides;
      SplitToStrides($input, $key_size, \@strides);

      # Get candidate shift amounts for each stride
      my @shift = ();
      $#shift = $#strides;
      for(my $i = 0; $i < $key_size; $i++)
      {
         GetShift($dist, $strides[$i], \@{$shift[$i]});
      }

      # Try the top two choices for each position
      for(my $i = 0; $i < 1 << $key_size; $i++)
      {
         my $test_key = '';
         for(my $j = 0; $j < $key_size; $j++)
         {
            $test_key .= chr($shift[$j][($i & (1 << $j)) == 0]);
         }

         my $test_error =
            CompareBigramDistribution($bigram, ApplyShift($input, $test_key));

         # Keep the keys that minimizes errors against bigram distribution.
         #
         # In practice, this actually makes the candidate keys worse :(
         if( $error > $test_error )
         {
            $error = $test_error;
            print ConvertKey($test_key), "\n";
         }
      }
   }
}


if( $#ARGV < 0 )
{
   die "$0 <training_sample.txt>\n";
}

# Load expected distributions from training sample
my $sample = shift @ARGV;
my (@dist, @bigram);
LoadSample($sample, \@dist, \@bigram);

# Get input text to crack
my $input = join '', <>;
Crack(\@dist, \@bigram, ConvertInput($input));