#!/usr/bin/perl
# Count the number of 1 bits and 0 bits at each bit position.
#
# This is useful if we want to create a bitmask that minimizes the
# number of 1 bits:
#
# - Let output_byte = input_byte ^ bitmask.
# - For each bit position where we saw more 1 bits than 0 bits, set
#   the corresponding position in bitmask to 1.  This minimizes the
#   number of 1 bits in output_byte.
#
# Intuitively, if we saw more 1 than 0 on average for a particular bit
# position, then we would see more 0 than 1 if we xor that bit
# position with 1.
#
# Bitmask of 0x60 will minimize 1 bits for ASCII text.  For Japanese
# text in UTF-8, 0xa3 appears to work better.
#
# Input is taken from the files named on the command line, or from
# standard input when no file is named.

use strict;
use warnings;

# Add the bits of every byte in $data to the running tallies.
#
#   $data  - string whose bytes are examined
#   $count - reference to an 8-element array; element $bit is a pair
#            [zeros, ones] for bit position $bit (0 = least significant)
#
# The tallies are updated in place; nothing is returned.
sub _count_bits {
    my ($data, $count) = @_;

    for my $byte (unpack 'C*', $data) {
        for my $bit (0 .. 7) {
            # The extracted bit (0 or 1) doubles as the index of the
            # counter to bump.
            $count->[$bit][ ($byte >> $bit) & 1 ]++;
        }
    }

    return;
}

# Return the bitmask that has a 1 in every bit position where 1 bits were
# strictly more frequent than 0 bits.  A tie leaves the position at 0.
#
#   $count - reference to the [zeros, ones] tallies built by _count_bits
sub _majority_bitmask {
    my ($count) = @_;

    my $bitmask = 0;
    for my $bit (0 .. 7) {
        my ($zeros, $ones) = @{ $count->[$bit] };
        $bitmask |= 1 << $bit if $ones > $zeros;
    }

    return $bitmask;
}

# Tally all input, then print the per-position counts followed by the
# resulting bitmask in decimal, hexadecimal and binary.
sub main {
    my @count = map { [0, 0] } 0 .. 7;

    # Read fixed-size records rather than lines: the input may be binary
    # with few or no newlines, and a line-oriented read would then pull an
    # arbitrarily large "line" into memory.  The totals do not depend on
    # where the input is split.
    local $/ = \65_536;

    while (my $chunk = <>) {
        _count_bits($chunk, \@count);
    }

    for my $bit (0 .. 7) {
        print "count[$bit][0] = $count[$bit][0]\tcount[$bit][1] = $count[$bit][1]\n";
    }

    my $bitmask = _majority_bitmask(\@count);
    printf "bitmask = %d = 0x%02x = 0b%08b\n", $bitmask, $bitmask, $bitmask;

    return;
}

# Run only when executed as a script, so the subs above can be loaded
# (e.g. by a test) without consuming standard input.
main() unless caller;

1;