#!/usr/bin/perl -w
# rename_tokens.pl - Don Yang (uguu.org)
#
# Rename identifiers in file.  This is used to estimate an optimistic
# lower bound on code size.
#
#   ./rename_tokens.pl < input.c | wc
#
# I could make the script just add up the token sizes, but outputting
# the renamed tokens to stdout makes it easier to debug the script
# itself.
#
# 01/07/12

use strict;
use warnings;

# Rewrite a single input line and return the result.
#
# Argument: one line of text, with or without its line terminator.
# Returns:  the rewritten line with the original terminator reattached.
#
# Rules, applied repeatedly from the left of the line:
#   - a preprocessor line (first non-blank character is '#') is
#     returned unchanged;
#   - a run of whitespace collapses to its first character;
#   - a C keyword is copied as-is;
#   - an identifier of two or more characters becomes its first
#     character, uppercased;
#   - a number of two or more characters is copied as-is;
#   - anything else is copied one character at a time.
#
# String literals and comments get no special treatment: their contents
# are tokenized like any other text.
sub _rename_line {
    my ($input) = @_;

    # Don't touch lines with preprocessor directives.  C allows
    # whitespace before the '#', so indented directives count too.
    return $input if $input =~ /^\s*#/;

    # Split off the line terminator so it can be restored verbatim.
    # The pattern can match the empty string, so $1 is always defined.
    $input =~ s/([\r\n]*)$//s;
    my $eol = $1;

    my $output = "";
    while ($input ne "") {
        # Collapse a whitespace run to its first character.  Restart the
        # loop afterwards: if the run was at the end of the line, $input
        # is now empty and the catch-all below would call substr() past
        # the end of the string, which warns and yields undef.
        if ($input =~ s/^(\s)\s*//) {
            $output .= $1;
            next;
        }

        # C keywords can not be renamed.  The trailing \b keeps
        # identifiers that merely start with a keyword (e.g. "dozen",
        # "integer") from matching here.
        if ($input =~ s/^(auto | break | case | char | const | continue |
                          default | do | double | else | enum | extern |
                          float | for | goto | if | int | long | register |
                          return | short | signed | sizeof | static |
                          struct | switch | typedef | union | unsigned |
                          void | volatile | while)\b//x) {
            $output .= $1;
            next;
        }

        # Identifiers start with alphabet character or underscore, and
        # count as 1 token.  This is because identifiers can usually be
        # renamed to a single character.  Single-character identifiers
        # do not match \w+ and pass through the catch-all unchanged.
        if ($input =~ s/^([[:alpha:]_])\w+//) {
            $output .= uc $1;
            next;
        }

        # Numeric constants start with a digit.  A lone digit is handled
        # by the catch-all, with the same result.
        if ($input =~ s/^(\d\w+)//) {
            $output .= $1;
            next;
        }

        # All other characters count as 1 token.
        $output .= substr($input, 0, 1);
        $input = substr($input, 1);
    }

    return $output . $eol;
}

# Filter every line from the files named on the command line, or from
# stdin when there are none, to stdout.
while (my $input = <>) {
    print _rename_line($input);
}