/* yuriko0.c - Don Yang (uguu.org)

   Process files on command line:

      ./yuriko file [...]

   Process file from stdin:

      cat source | ./yuriko

   See dot files for more comments on how the DFAs work.

   To detect file content type automatically, we use two observations:

   1. Most programs start with comments (e.g. copyright messages)
   2. Commenting syntax in one language is often syntax errors in other
      languages.

   Thus we can guess, with reasonable accuracy, what language a file is
   by seeing which type of comments appeared.

   This is not foolproof, for example, // is a valid operator in Python,
   but its presence will cause the source to be parsed as C with the
   current ordering (prefer C over Python).  If we reverse the ordering,
   then every C program will be parsed as Python because of #
   preprocessors.  To resolve that, we use a third observation:

   3. Most programs out there are written in C/C++.  Seriously.

   Really, we might need some option to manually override the detected
   language, especially for polyglots.  But this scheme is going to be
   right most of the time.  We should probably assume where it guessed
   wrong, the program wasn't going to be well documented anyways.

   02/08/06
*/
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define READ_BUFFER_SIZE 1024

/* DFAs: each table entry is one state, written as a space separated
   list of edges.  Edges are tried left to right; the first edge whose
   conditions all hold is taken.

   non-space character = edges
      q                = match last quote read
      z                = match on nest level == 1
      ' "              = match and remember quote character
      ( ) # ; * / \\ \n = match character
      +                = increment nest level
      -                = decrement nest level
      !                = output last character(s) read instead of
                         whitespace
      ?                = buffer character
      digits           = next state.

   Not strictly DFAs since they maintain more than just the current
   state, especially for OCaml.  Close enough.
*/
static char *dfa_c[] =
{
    /* 0 = init */             "\"1 \'1 /?3 0",
    /* 1 = string */           "\\2 q0 1",
    /* 2 = escape */           "1",
    /* 3 = start_comment */    "*!4 /!6 0",
    /* 4 = comment */          "*!5 !4",
    /* 5 = end_comment */      "*!5 /!0 !4",
    /* 6 = s_comment */        "\n0 !6"
};
static char *dfa_ml[] =
{
    /* 0 = init */             "\"1 \'1 (?3 0",
    /* 1 = string */           "\\2 q0 1",
    /* 2 = escape */           "1",
    /* 3 = start_comment */    "*!+4 (?3 0",
    /* 4 = comment */          "(!5 *!6 !4",
    /* 5 = nest_comment */     "*!+4 (!5 !4",
    /* 6 = end_comment */      "*!6 )z-!0 )-!4 !4"
};
static char *dfa_py[] =
{
    /* 0 = init */             "\"1 \'1 #!7 0",
    /* 1 = string */           "\\2 q3 1",
    /* 2 = escape */           "1",
    /* 3 = string2 */          "q4 \"1 \'1 #!7 0",
    /* 4 = t_string */         "q5 4",
    /* 5 = t_string_end1 */    "q6 4",
    /* 6 = t_string_end2 */    "q0 4",
    /* 7 = comment */          "\n0 !7"
};
static char *dfa_scm[] =
{
    /* 0 = init */             "\"1 ;!3 0",
    /* 1 = string */           "\\2 q0 1",
    /* 2 = escape */           "1",
    /* 3 = comment */          "\n0 !3"
};

/* Current DFA state */
typedef struct
{
    int state, nest;            /* index into DFA table, comment nest level */
    char quote, buffer_char;    /* last quote matched, pending '?' character
                                   ('\0' when nothing is buffered) */
    int continue_line, line_number; /* nonzero once the current output line
                                       has its number, lines counted so far */
} State;

/* If true, output code instead of comments, and do not print line numbers */
static int InvertOutput = 0;

static int SingleStepDFA(/*@observer@*/char **dfa, State *state, char input);
static int TestDFA(/*@observer@*/char **dfa, char *input, size_t input_size);
static void RunDFA(/*@observer@*/char **dfa,
                   State *state,
                   char *input,
                   size_t input_size);
static /*@observer@*/char **DetectFileType(FILE *infile,
                                           /*@out@*/char *buffer,
                                           /*@out@*/size_t *size);
static void ProcessStream(FILE *infile);

/* Entry point.  With no arguments the program filters stdin, otherwise
   it filters each named file in turn.  Always returns 0; a file that can
   not be opened is reported on stdout and skipped.
*/
int main(int argc, char **argv)
{
    FILE *infile;
    int i;

    /* Select output mode by checking what file name we were compiled
       with (yuriko.c -> output comments with line numbers, nozomi.c ->
       output code without line numbers).  We get the other functionality
       for free just by inverting DFA output.

       Only the low bit of the first character of __FILE__ is examined:
       'y' is odd (InvertOutput = 0), 'n' is even (InvertOutput = 1).
       NOTE(review): this depends on how the compiler spells __FILE__; a
       leading directory component changes the first character.
    */
    InvertOutput = ((int)__FILE__[0] & 1) ^ 1;

    if( argc == 1 )
    {
        /* Process stdin */
        ProcessStream(stdin);
        return 0;
    }

    /* Process files specified on command line arguments */
    for(i = 1; i < argc; i++)
    {
        /* Label each listing when there is more than one file, but only
           in the human readable (comment) mode. */
        if( argc > 2 && InvertOutput == 0 )
            (void)puts(argv[i]);

        if( (infile = fopen(argv[i], "rb")) == NULL )
        {
            (void)puts("can not open file");
            continue;
        }
        ProcessStream(infile);
        (void)fclose(infile);
    }
    return 0;
}

/* Filter one open stream to stdout: detect the language from the first
   block, then run the selected DFA over every block until fread returns
   zero bytes.  The caller keeps ownership of infile.
*/
static void ProcessStream(FILE *infile)
{
    /*@observer@*/char **dfa;
    char buffer[READ_BUFFER_SIZE];
    State state;
    size_t size;

    dfa = DetectFileType(infile, buffer, &size);
    memset(&state, 0, sizeof(State));
    while( size > 0 )
    {
        RunDFA(dfa, &state, buffer, size);
        size = fread(buffer, 1, READ_BUFFER_SIZE, infile);
    }
}

/* Run DFA for one iteration, return a combination of bit values: */
#define CODE_STATE      0   /* Currently inside code state,
                               output whitespace */
#define COMMENT_STATE   1   /* Currently inside comment state,
                               output text */
#define BUFFER_CHAR     2   /* Process the current character at
                               next state */
#define FLUSH_CHAR      4   /* Need to flush existing buffered
                               character */

/* Abandon the edge being evaluated: move p to that edge's next-state
   digit, so the loop increment in SingleStepDFA lands on the following
   edge.  Every edge must end in a digit, which is what the assert
   checks (before p is used again).
*/
#define SKIP_EDGE() \
    do \
    { \
        p = strpbrk(p, "0123456789"); \
        assert(p != NULL); \
    } while( 0 )

/* Feed one input character to the DFA.

   dfa:   state table (one edge string per state)
   state: updated in place (state index, nest level, quote, buffered
          character)
   input: character to consume

   Returns the action bits collected from the edge that was taken.
*/
static int SingleStepDFA(/*@observer@*/char **dfa, State *state, char input)
{
    int action = CODE_STATE;
    char *p;

    /* Walk the edge list until a next-state digit is reached without
       having been skipped over.  The cast keeps isdigit defined for
       any byte value. */
    for(p = dfa[state->state]; !isdigit((unsigned char)*p); p++)
    {
        if( *p == ' ' )
            continue;

        switch( *p )
        {
            case 'q':
                if( input != state->quote )
                    SKIP_EDGE();
                break;
            case 'z':
                if( state->nest != 1 )
                    SKIP_EDGE();
                break;
            case '+':
                state->nest++;
                break;
            case '-':
                state->nest--;
                break;
            case '!':
                action |= COMMENT_STATE;
                break;
            case '?':
                action |= BUFFER_CHAR;
                if( state->buffer_char != '\0' )
                {
                    /* Each table buffers only one kind of character */
                    assert(state->buffer_char == input);
                    action |= FLUSH_CHAR;
                }
                else
                {
                    state->buffer_char = input;
                }
                break;
            case '\'':
            case '\"':
                if( input == *p )
                    state->quote = *p;
                else
                    SKIP_EDGE();
                break;
            default:
                if( input != *p )
                    SKIP_EDGE();
                break;
        }
    }

    state->state = (int)*p - (int)'0';
    return action;
}

/* Test DFA on input text, return 1 if input contains comments */
static int TestDFA(/*@observer@*/char **dfa, char *input, size_t input_size)
{
    State state;
    size_t i;

    memset(&state, 0, sizeof(State));
    for(i = 0; i < input_size; i++)
    {
        /* Count lines and ignore everything after the 5th line.  While
           the first few lines are usually comments, the length of those
           lines vary among programmers and languages, and we don't want
           to be checking for comments when we are inside the code
           region.
        */
        if( *input == '\n' )
        {
            if( ++(state.line_number) >= 5 )
                return 0;
        }

        if( (SingleStepDFA(dfa, &state, *input++) & COMMENT_STATE) != 0 )
            return 1;
    }
    return 0;
}

/* Run DFA on input text, writing the filtered text to stdout.  state
   carries over between calls so that a file can be fed block by block.
*/
static void RunDFA(/*@observer@*/char **dfa,
                   State *state,
                   char *input,
                   size_t input_size)
{
    int action, show;
    size_t i;

    for(i = 0; i < input_size; i++, input++)
    {
        /* Print leading line number.  This helps matching each line back
           to the original file more easily.

           We don't print line numbers when we are extracting code, since
           the output in that case is usually meant to be machine
           readable rather than human readable.
        */
        if( InvertOutput == 0 )
        {
            if( state->continue_line == 0 )
            {
                (void)printf("%7d ", ++(state->line_number));
                state->continue_line = 1;
            }
            if( *input == '\n' )
                state->continue_line = 0;
        }

        /* Process input */
        action = SingleStepDFA(dfa, state, *input);

        /* Nonzero when the text seen at this step belongs to the part of
           the file being extracted (comments normally, code when
           InvertOutput is set). */
        show = (action & COMMENT_STATE) != InvertOutput;

        /* Flush previously buffered character.  For the current selected
           set of languages, this is guaranteed to be same as currently
           buffered character.  The flushed character follows the same
           visibility rule as everything else, so it is kept rather than
           blanked when code is being extracted.
        */
        if( (action & FLUSH_CHAR) != 0 )
            (void)putchar(show ? state->buffer_char : ' ');

        /* Buffer character until next iteration.  This means if the file
           ends with a partial comment terminator, the output will be one
           character short.  Garbage in, garbage out.
        */
        if( (action & BUFFER_CHAR) != 0 )
            continue;

        if( show )
        {
            /* Inside comments */
            if( state->buffer_char != '\0' )
            {
                (void)putchar(state->buffer_char);
                state->buffer_char = '\0';
            }
            (void)putchar(*input);
        }
        else
        {
            /* Outside comments: blank out text but keep whitespace so
               the layout is preserved.  The cast keeps isspace defined
               for bytes outside the ASCII range. */
            if( state->buffer_char != '\0' )
            {
                (void)putchar(' ');
                state->buffer_char = '\0';
            }
            if( isspace((unsigned char)*input) )
                (void)putchar(*input);
            else
                (void)putchar(' ');
        }
    }
}

/* Detect file type by testing first block with multiple DFAs.  Note that
   even though the first block is read, only the first few lines are used
   in TestDFA.

   infile: stream to read the first block from
   buffer: receives up to READ_BUFFER_SIZE bytes
   size:   receives the number of bytes actually read

   Returns the first table, in the order C, ML, Python, Scheme, whose
   comment syntax is seen; the C table when none is.
*/
static /*@observer@*/char **DetectFileType(FILE *infile,
                                           /*@out@*/char *buffer,
                                           /*@out@*/size_t *size)
{
    /* Read first block */
    *size = fread(buffer, 1, READ_BUFFER_SIZE, infile);

    if( TestDFA(dfa_c, buffer, *size) != 0 )
        return dfa_c;
    if( TestDFA(dfa_ml, buffer, *size) != 0 )
        return dfa_ml;
    if( TestDFA(dfa_py, buffer, *size) != 0 )
        return dfa_py;
    if( TestDFA(dfa_scm, buffer, *size) != 0 )
        return dfa_scm;

    /* Assume C by default */
    return dfa_c;
}