#!/usr/bin/perl -w
# Strip C/C++ comments from the input stream and shorten every non-keyword
# identifier to a single character.  The output is only a rough estimate of
# the size of the minified source.
#
# Every non-keyword identifier is assumed to compress to one character,
# including library functions and "main", so a real minifier may well
# produce something larger than what this script prints.
#
# Input is read from the files named on the command line, or from standard
# input when there are none; the result is written to standard output with
# the original line structure preserved.
#
# Known limitations:
#   - C++11 raw string literals are not supported.
#   - Preprocessor lines are copied verbatim from the '#' onwards, so any
#     comment on such a line is kept.

use strict;
use warnings;

# C++ keywords, followed by the keywords that exist only in C (these are
# NOT a subset of the C++ list, e.g. "restrict" and "_Bool").  Identifiers
# found here are printed in full; everything else is cut to one character.
my %keywords = map { $_ => 1 } qw(
    alignas alignof and and_eq asm atomic_cancel atomic_commit
    atomic_noexcept auto bitand bitor bool break case catch char char8_t
    char16_t char32_t class compl concept const consteval constexpr
    constinit const_cast continue contract_assert co_await co_return
    co_yield decltype default delete do double dynamic_cast else enum
    explicit export extern false float for friend goto if inline int long
    mutable namespace new noexcept not not_eq nullptr operator or or_eq
    private protected public reflexpr register reinterpret_cast requires
    return short signed sizeof static static_assert static_cast struct
    switch synchronized template this thread_local throw true try typedef
    typeid typename union unsigned using virtual void volatile wchar_t
    while xor xor_eq

    restrict typeof typeof_unqual _Alignas _Alignof _Atomic _BitInt _Bool
    _Complex _Decimal128 _Decimal32 _Decimal64 _Generic _Imaginary
    _Noreturn _Static_assert _Thread_local
);

# Parser states.
use constant {
    NORMAL        => 0,
    BLOCK_COMMENT => 1,
    QUOTE         => 2,
};

# Lexer state.  It lives outside minify_line() on purpose: block comments
# and unterminated quotes carry over from one input line to the next.
my $state = NORMAL;
my $quote;            # The quote character that opened the current literal.

# Translate one input line and return the text to emit for it.
#
# Parameter: $line - one line of source, without its trailing newline.
# Returns:   the minified text for that line (no newline appended).
#
# The line is scanned left to right with \G//gc matches, so each character
# is visited once instead of re-copying the remainder for every token.
sub minify_line {
    my ($line) = @_;
    my $out = '';

    TOKEN:
    while (1) {
        last TOKEN if $line =~ m{\G \z}gcx;

        if ($state == BLOCK_COMMENT) {
            # Discard everything up to and including the terminator.  The
            # whitespace inside the comment is part of the comment and is
            # dropped too, exactly as for "//" comments.
            if ($line =~ m{\G .*? \*/ }gcxs) {
                $state = NORMAL;
                next TOKEN;
            }

            # The comment continues on the next line.
            last TOKEN;
        }

        if ($state == QUOTE) {
            if ($line =~ m{\G ( \\ . ) }gcxs) {
                # Backslash escape: copy both characters so an escaped
                # quote cannot end the literal.
                $out .= $1;
            }
            elsif ($line =~ m{\G ( ['"] ) }gcx) {
                # Only the character that opened the literal closes it;
                # the other kind of quote is ordinary content.
                my $char = $1;
                $out .= $char;
                $state = NORMAL if $char eq $quote;
            }
            elsif ($line =~ m{\G ( [^\\'"]+ ) }gcx) {
                # Run of ordinary characters, whitespace included.
                $out .= $1;
            }
            else {
                # A lone backslash at the very end of the line.
                $line =~ m{\G ( . ) }gcxs;
                $out .= $1;
            }
            next TOKEN;
        }

        # $state == NORMAL
        if ($line =~ m{\G ( \s+ ) }gcx) {
            $out .= $1;
        }
        elsif ($line =~ m{\G // }gcx) {
            # Single line comment, drop the rest of this line.
            last TOKEN;
        }
        elsif ($line =~ m{\G /\* }gcx) {
            $state = BLOCK_COMMENT;
        }
        elsif ($line =~ m{\G ( ['"] ) }gcx) {
            $quote = $1;
            $out .= $quote;
            $state = QUOTE;
        }
        elsif ($line =~ m{\G ( \# .* ) }gcxs) {
            # Preprocessor text is passed through untouched.
            $out .= $1;
            last TOKEN;
        }
        elsif ($line =~ m{\G ( \d (?: '? \w )* ) }gcx) {
            # Numeric literal.  A single quote between two word characters
            # is a C++14 digit separator (1'000'000), not the start of a
            # character literal.
            $out .= $1;
        }
        elsif ($line =~ m{\G ( \w+ ) }gcx) {
            my $word = $1;
            $out .= exists $keywords{$word} ? $word : substr($word, 0, 1);
        }
        else {
            # Operators and punctuation are copied one character at a time.
            $line =~ m{\G ( . ) }gcxs;
            $out .= $1;
        }
    }

    return $out;
}

# Filter every input line and print the result, restoring the newline that
# chomp removed (a final line without one therefore gains a newline).
sub run {
    while (my $line = <>) {
        chomp $line;
        my $text = minify_line($line);
        print "$text\n";
    }
    return;
}

# Run as a filter only when executed directly, so that the file can also be
# loaded with require/do to exercise minify_line() on its own.
run() unless caller;