#!/usr/bin/ruby -w # Bitcode program with text encoder. # Minimum length for repeat instructions, in units of number of input bytes. # A single repeat instruction costs the same amount of space as two output # instructions, so we the repeat length needs to be at least 3 for it to be # worthwhile. MIN_REPEAT = 3 # Minimum offset for repeat targets, in code block units (each output # instruction counts as 1 unit, and each repeat instruction counts as # 2 units). MIN_OFFSET = 1 # Maximum offset for repeat targets. MAX_OFFSET = 255 + MIN_OFFSET # Instruction return states. RS_NORMAL = 0 RS_FIRST_INSTRUCTION_AFTER_BRANCH = RS_NORMAL + 1 RS_SECOND_INSTRUCTION_AFTER_RETURN = 2 RS_FIRST_INSTRUCTION_AFTER_RETURN = RS_SECOND_INSTRUCTION_AFTER_RETURN + 1 # Maximum number of steps to run. MAX_STEPS = 5000 # Size of ring buffer for storing code offsets. # # For each position in input text, we need to keep track of the output code # position to decide whether the code can be referenced inside a repeat # command. The furthest text that might be referenced is: # # furthest address repeat block = (max offset) / (repeat instruction size) # = 256 / 2 = 128 # # (maximum repeat block length) * (furthest addressable repeat block) = # 258 * 128 = 33024 # # But instead of using that odd size, we will round up to next power of 2. # This allows us to use bitwise operations to compute ring offset. RING_BUFFER_SIZE = 0x10000 RING_BUFFER_MASK = RING_BUFFER_SIZE - 1 # True to enable tracing. TRACE = false # Run assembled bitcode, returning output as a string. def run(code) # Control flow stack: # stack[2*N] = return address. # stack[2*N+1] = output bytes remaining. # # Output bytes remaining is decremented each time a byte is printed, and # control flow is returned to parent frame once the counter reaches zero. stack = [0, 100] # Stack pointer. sp = 1 fail unless stack[sp] > 0 # Instruction pointer. ip = -1 # Instruction start. is = nil # Return state. See RS_* constants near the top. rs = RS_FIRST_INSTRUCTION_AFTER_BRANCH # Accumulator. x = nil # Number of steps executed, used to detect runaway loops. steps = 0 # Buffered output text. output_text = "" # Output current states. unless defined?(trace) define_method :trace do |line, msg| return unless TRACE print " " * (sp - 1), "#{line}: steps=#{steps}, ip=#{ip}, ", "x=#{x ? '0x' + x.to_s(16) : 'nil'}, ", "stack[#{sp - 1}]=#{stack[sp - 1]}, ", "stack[#{sp}]=#{stack[sp]}, ", "is=#{is}, rs=#{rs}, #{msg}\n" end end while ip < code.size steps += 1 fail if steps > MAX_STEPS # First instruction after a branch or return is always ignored. # We always return to the end of the previous instruction because # "1" bits are only guaranteed to appear at the end of instructions, # and only "1" bits are valid branch targets. if rs & 1 != 0 trace(__LINE__, "next") fail unless ip >= -1 fail unless (ip + 1) % 9 == 0 rs -= 1 fail unless (rs == 0 || rs == 2) x = 0 is = ip + 1 ip += 1 next end # Instruction execution only happens at "1" bits. fail unless ip >= 0 if code[ip] == "0" ip += 1 next end fail unless is fail unless x fail unless x >= 0 # Check if instruction pointer has reached the end of a complete # instruction. If not, update accumulator and move on. if (ip + 1) % 9 != 0 trace(__LINE__, "accumulate") x |= 1 << (ip - is) ip += 1 next end fail unless (ip - is == 8 || ip - is == 17) # Check for repeat instruction. if ip - is == 17 length = (x & 0xff) + MIN_REPEAT offset = ((x >> 9) + 2 + MIN_OFFSET) * 9 if rs == RS_SECOND_INSTRUCTION_AFTER_RETURN # Recursive return. trace(__LINE__, "recursive_return: length=#{length}") stack[sp] -= length if stack[sp] <= 0 rs = RS_FIRST_INSTRUCTION_AFTER_RETURN ip = stack[sp - 1] sp -= 2 fail unless sp >= 1 next end # Completed return chain. trace(__LINE__, "completed_return") rs = RS_NORMAL is = ip + 1 ip += 1 x = 0 next end trace(__LINE__, "branch: offset=#{offset}, length=#{length}") fail unless rs == RS_NORMAL # Set return address to be start of previous instruction, and # branch to earlier part of the code. fail unless is - 1 == ip - 18 stack[sp + 1] = is - 1 stack[sp + 2] = [stack[sp], length].min ip -= offset rs = RS_FIRST_INSTRUCTION_AFTER_BRANCH sp += 2 next end # Output byte. trace(__LINE__, "output") fail unless ip - is == 8 fail unless rs == 0 if TRACE print " " * (sp - 1), "#{__LINE__}: output(#{x.chr.dump})\n" end output_text += x.chr x = 0 # Return when enough bytes have been printed. stack[sp] -= 1 if stack[sp] == 0 fail unless sp > 1 rs = RS_FIRST_INSTRUCTION_AFTER_RETURN ip = stack[sp - 1] sp -= 2 next end is = ip + 1 ip += 1 end return output_text end # Encode an 8-bit number, least significant bit first. def encode_number(n) fail if n < 0 fail if n > 255 return (sprintf '%08b', n).reverse end # Return number of 1-bits in a number. def bitcount(n) return n.to_s(2).count("1") end # Translate input text into bit encoding. def encode(input) # Output code. code = "" # Ring buffer of code offsets. offsets = [0] * RING_BUFFER_SIZE o_read = 0 o_write = 0 # Offset of next byte to be encoded. read_offset = 0 while read_offset < input.size fail unless o_write == (read_offset & RING_BUFFER_MASK) # Expire old ring buffer entries that are too far away to be reachable. while o_read != o_write if offsets[o_read] && code.size - offsets[o_read] <= MAX_OFFSET * 9 break end o_read = (o_read + 1) & RING_BUFFER_MASK end history_size = o_write >= o_read ? o_write - o_read : o_write + RING_BUFFER_SIZE - o_read if TRACE print "#{__LINE__}: input=#{input[read_offset].dump}, ", "read_offset=#{read_offset}, ", "o_read=#{o_read}, o_write=#{o_write}, ", "history_size=#{history_size}, ", "code.size=#{code.size}\n" end # Find longest match in previous input. repeat_offset = -1 # Offset within code string. repeat_length = 0 # Number of bytes matched. history_size.times{|i| fail unless ((o_read + i) & RING_BUFFER_MASK) != o_write code_offset = offsets[(o_read + i) & RING_BUFFER_MASK] # Skip code offsets that can't be referenced, i.e. the string that # we want to copy is covered by a repeat block. next unless code_offset # Check for matching substring. match_offset = read_offset - history_size + i match_length = 0 fail unless match_offset < read_offset (255 + MIN_REPEAT).times{|j| break if read_offset + j >= input.size break if input[match_offset + j] != input[read_offset + j] match_length += 1 } # A previous substring is a potential repeat candidate if it's # at least 3 bytes long. if match_length >= MIN_REPEAT if TRACE print "#{__LINE__}: match_offset=#{match_offset}, ", "match_length=#{match_length}, ", "code_offset=#{code_offset}\n" end if match_length > repeat_length # Candidates that are longer than what we have matched before # are always accepted, since they improve compression ratio. repeat_offset = code_offset repeat_length = match_length elsif match_length == repeat_length # Candidates that are of same length as what we have matched # before are accepted only if it takes fewer 1-bits to encode # those offsets. fail unless repeat_offset % 9 == 0 fail unless code_offset % 9 == 0 if bitcount(code_offset / 9 - MIN_OFFSET) < bitcount(repeat_offset / 9 - MIN_OFFSET) repeat_offset = code_offset repeat_length = match_length end end end } # Mark current position. This makes it possible for for later code # to copy text starting from current location. This happens # whether we output a literal or a repeat command. offsets[o_write] = code.size o_write = (o_write + 1) & RING_BUFFER_MASK if repeat_offset < 0 then # Did not match a previous substring. Output a literal byte. code += encode_number(input[read_offset].ord) + "1" read_offset += 1 if TRACE print "#{__LINE__}: read_offset=#{read_offset}, ", "o_write=#{o_write}, code.size=#{code.size}\n" end else # Matched a previous substring that's sufficiently long. # Output a repeat command. if TRACE print "#{__LINE__}: repeat_offset=#{repeat_offset}, ", "repeat_length=#{repeat_length}\n" end fail unless (code.size - repeat_offset) % 9 == 0 fail unless repeat_length >= MIN_REPEAT code += encode_number(repeat_length - MIN_REPEAT) + "0" + encode_number((code.size - repeat_offset) / 9 - MIN_OFFSET) + "1" # Remove code positions for region covered by repeat code. (1..(repeat_length - 1)).each{|i| offsets[o_write] = nil o_write = (o_write + 1) & RING_BUFFER_MASK } read_offset += repeat_length if TRACE print "#{__LINE__}: read_offset=#{read_offset}, ", "o_write=#{o_write}, code.size=#{code.size}\n" end end end return code end input_text = "123333123333123333123333" code = encode(input_text) if TRACE i = 0 while i < code.size if code[i + 8] == "1" print code[i, 9], ": output(", code[i, 8].reverse.to_i(2).chr.dump, ")\n" i += 9 else print code[i, 18], ": repeat(", code[i, 8].reverse.to_i(2) + MIN_REPEAT, ", ", -(code[i + 9, 8].reverse.to_i(2) + MIN_OFFSET), ")\n" i += 18 end end end output_text = run(code) if TRACE print " input=#{input_text}\n", "output=#{input_text}\n" end fail unless input_text == output_text