/* yuriko0.c - Don Yang (uguu.org)

   Process files on command line:
      ./yuriko <infile> [...]
   Process file from stdin:
      cat source | ./yuriko

   See dot files for more comments on how the DFAs work.

   To detect file content type automatically, we use two observations:

   1. Most programs start with comments (e.g. copyright messages)
   2. Commenting syntax in one language is often syntax errors in
      other languages.

   Thus we can guess, with reasonable accuracy, what language a file
   is by seeing which type of comments appeared.  This is not
   foolproof, for example, // is a valid operator in Python, but its
   presence will cause the source to be parsed as C with the current
   ordering (prefer C over Python).  If we reverse the ordering, then
   every C program will be parsed as Python because of #
   preprocessor directives.  To settle that, we use a third observation:

   3. Most programs out there are written in C/C++.  Seriously.

   Really, we might need some option to manually override the detected
   language, especially for polyglots.  But this scheme is going to be
   right most of the time.  We should probably assume where it guessed
   wrong, the program wasn't going to be well documented anyways.

   02/08/06
                                                                           */

#include<assert.h>
#include<ctype.h>
#include<stdio.h>
#include<string.h>

/* Size in bytes of the read buffer.  Every fread call in this file
   requests this many bytes, and language detection only ever looks at
   the first block of this size.                                       */
#define READ_BUFFER_SIZE   1024

/* DFAs:
   non-space character = edges
      q = match last quote read
      z = match on nest level == 1
      ' " = match and remember quote character
      ( ) # ; * / \\ \n = match character
   + = increment nest level
   - = decrement nest level
   ! = output last character(s) read instead of whitespace
   ? = buffer character
   digits = next state.

   Not strictly DFAs since they maintain more than just the current
   state, especially for OCaml.  Close enough.                      */

/* C / C++: block comments and line comments.  Each state is one string;
   the pieces below are adjacent literals that the compiler joins, one
   edge per piece.                                                       */
static char *dfa_c[] =
{
   /* 0 = init (plain code) */
   "\"1  "     /* double quote: remember it, enter string */
   "\'1  "     /* single quote: remember it, enter string */
   "/?3  "     /* slash: hold it back, it may open a comment */
   "0",        /* anything else: stay in code */

   /* 1 = string */
   "\\2  "     /* backslash: next character is escaped */
   "q0  "      /* the remembered quote: string is over */
   "1",        /* anything else: still in the string */

   /* 2 = escape (any character returns to the string) */
   "1",

   /* 3 = start_comment (one slash is being held back) */
   "*!4  "     /* star: block comment opened, output it */
   "/!6  "     /* slash: line comment opened, output it */
   "0",        /* anything else: it was only a slash, back to code */

   /* 4 = comment (inside a block comment) */
   "*!5  "     /* star: might be the start of the terminator */
   "!4",       /* anything else: output it, stay in the comment */

   /* 5 = end_comment (just saw a star inside a block comment) */
   "*!5  "     /* another star: still a candidate terminator */
   "/!0  "     /* slash: comment closed, output it, back to code */
   "!4",       /* anything else: false alarm, stay in the comment */

   /* 6 = s_comment (inside a line comment) */
   "\n0  "     /* newline: comment over; the newline counts as code */
   "!6"        /* anything else: output it, stay in the comment */
};
/* OCaml: paren-star comments, which may nest.  The nest counter in the
   DFA state tracks the depth.  Each state is one string, written as
   adjacent literals, one edge per piece.                              */
static char *dfa_ml[] =
{
   /* 0 = init (plain code) */
   "\"1  "     /* double quote: remember it, enter string */
   "\'1  "     /* single quote: remember it, enter string */
   "(?3  "     /* open paren: hold it back, it may open a comment */
   "0",        /* anything else: stay in code */

   /* 1 = string */
   "\\2  "     /* backslash: next character is escaped */
   "q0  "      /* the remembered quote: string is over */
   "1",        /* anything else: still in the string */

   /* 2 = escape (any character returns to the string) */
   "1",

   /* 3 = start_comment (one open paren is being held back) */
   "*!+4  "    /* star: comment opened, output it, depth goes up */
   "(?3  "     /* another paren: hold this one instead, stay here */
   "0",        /* anything else: it was only a paren, back to code */

   /* 4 = comment (inside a comment) */
   "(!5  "     /* open paren: a nested comment may be starting */
   "*!6  "     /* star: might be the start of the terminator */
   "!4",       /* anything else: output it, stay in the comment */

   /* 5 = nest_comment (just saw an open paren inside a comment) */
   "*!+4  "    /* star: nested comment opened, depth goes up */
   "(!5  "     /* another paren: still a candidate opener */
   "!4",       /* anything else: false alarm, stay in the comment */

   /* 6 = end_comment (just saw a star inside a comment) */
   "*!6  "     /* another star: still a candidate terminator */
   ")z-!0  "   /* close paren at depth 1: outermost comment closed */
   ")-!4  "    /* close paren otherwise: one nested level closed */
   "!4"        /* anything else: false alarm, stay in the comment */
};
/* Python: hash comments, plus enough string handling to step over
   triple-quoted strings.  Each state is one string, written as adjacent
   literals, one edge per piece.                                         */
static char *dfa_py[] =
{
   /* 0 = init (plain code) */
   "\"1  "     /* double quote: remember it, enter string */
   "\'1  "     /* single quote: remember it, enter string */
   "#!7  "     /* hash: comment opened, output it */
   "0",        /* anything else: stay in code */

   /* 1 = string */
   "\\2  "     /* backslash: next character is escaped */
   "q3  "      /* the remembered quote: string closed, but see state 3 */
   "1",        /* anything else: still in the string */

   /* 2 = escape (any character returns to the string) */
   "1",

   /* 3 = string2 (a string was just closed) */
   "q4  "      /* same quote again: this is a triple-quoted string */
   "\"1  "     /* otherwise behave exactly like state 0 ... */
   "\'1  "
   "#!7  "
   "0",

   /* 4 = t_string (inside a triple-quoted string) */
   "q5  "      /* first closing quote */
   "4",

   /* 5 = t_string_end1 */
   "q6  "      /* second closing quote */
   "4",        /* anything else: not the terminator, keep going */

   /* 6 = t_string_end2 */
   "q0  "      /* third closing quote: back to code */
   "4",        /* anything else: not the terminator, keep going */

   /* 7 = comment */
   "\n0  "     /* newline: comment over; the newline counts as code */
   "!7"        /* anything else: output it, stay in the comment */
};
/* Scheme: semicolon comments running to end of line.  Only double
   quotes start a string here.  Each state is one string, written as
   adjacent literals, one edge per piece.                            */
static char *dfa_scm[] =
{
   /* 0 = init (plain code) */
   "\"1  "     /* double quote: remember it, enter string */
   ";!3  "     /* semicolon: comment opened, output it */
   "0",        /* anything else: stay in code */

   /* 1 = string */
   "\\2  "     /* backslash: next character is escaped */
   "q0  "      /* the remembered quote: string is over */
   "1",        /* anything else: still in the string */

   /* 2 = escape (any character returns to the string) */
   "1",

   /* 3 = comment */
   "\n0  "     /* newline: comment over; the newline counts as code */
   "!3"        /* anything else: output it, stay in the comment */
};

/* Everything a DFA run carries from one input character to the next.
   A zero-filled State (memset to 0) is the starting configuration.   */
typedef struct
{
   int state;           /* index of the current row in the DFA table */
   int nest;            /* comment nesting depth, moved by + and - edges */
   char quote;          /* quote character that opened the current string */
   char buffer_char;    /* character held back by a ? edge, '\0' if none */
   int continue_line;   /* nonzero once this line's number has been printed */
   int line_number;     /* lines seen so far */
} State;

/* If true, output code instead of comments, and do not print line numbers.
   Assigned once at the top of main, from the first character of __FILE__;
   read by main (file name headers) and RunDFA (what to print).            */
static int InvertOutput = 0;

/* Forward declarations, so that main can come first in the file.  Each
   function carries its own description at its definition.
   NOTE(review): the @observer@ and @out@ comments look like annotations
   for a static checker (Splint style) -- leave them in place.            */
static int SingleStepDFA(/*@observer@*/char **dfa, State *state, char input);
static int TestDFA(/*@observer@*/char **dfa, char *input, size_t input_size);
static void RunDFA(/*@observer@*/char **dfa,
                   State *state, char *input, size_t input_size);
static /*@observer@*/char **DetectFileType(FILE *infile,
                                           /*@out@*/char *buffer,
                                           /*@out@*/size_t *size);

/* Filter one already-open stream: pick a DFA from its first block, then
   feed the stream through that DFA block by block until fread returns
   nothing.  The stream is left open for the caller to close.            */
static void ProcessStream(FILE *infile)
{
   /*@observer@*/char **dfa;
   char buffer[READ_BUFFER_SIZE];
   State state;
   size_t size;

   /* Detection reads the first block, so it is already in buffer */
   dfa = DetectFileType(infile, buffer, &size);
   memset(&state, 0, sizeof(State));
   for(; size > 0; size = fread(buffer, 1, READ_BUFFER_SIZE, infile))
      RunDFA(dfa, &state, buffer, size);
}

/* Entry point.  With no arguments, filter stdin; otherwise filter every
   file named on the command line in order.  Always returns 0, even when
   some file could not be opened.                                        */
int main(int argc, char **argv)
{
   const char *source_name = __FILE__;
   FILE *infile;
   int arg;

   /* The output mode is baked in at compile time from the source file
      name: an odd first character (the 'y' of yuriko.c) selects comments
      with line numbers, an even one (the 'n' of nozomi.c) selects code
      without line numbers.  NOTE(review): this relies on __FILE__
      starting with the base name, i.e. on how the file was named when
      it was handed to the compiler.                                     */
   InvertOutput = ((int)*source_name & 1) ^ 1;

   if( argc == 1 )
   {
      ProcessStream(stdin);
      return 0;
   }

   for(arg = 1; arg < argc; arg++)
   {
      /* Label each file's output, but only when there are several
         files and the output is the human-readable comment listing */
      if( argc > 2 && InvertOutput == 0 )
         (void)puts(argv[arg]);

      infile = fopen(argv[arg], "rb");
      if( infile == NULL )
      {
         /* Report and move on to the next file */
         (void)puts("can not open file");
         continue;
      }

      ProcessStream(infile);
      (void)fclose(infile);
   }
   return 0;
}


/* Run DFA for one iteration, return a combination of bit values: */
#define CODE_STATE      0  /* Currently inside code state, output whitespace */
#define COMMENT_STATE   1  /* Currently inside comment state, output text */
#define BUFFER_CHAR     2  /* Process the current character at next state */
#define FLUSH_CHAR      4  /* Need to flush existing buffered character */

/* Advance the DFA by one input character.

   dfa    table of state strings; dfa[state->state] is the row to walk
   state  current DFA state, updated in place (state number, nest
          level, remembered quote, buffered character)
   input  the character to consume

   A row is a list of edges separated by spaces.  An edge is a run of
   condition and action characters ending in one digit, the target
   state.  Edges are tried left to right: a failed condition skips to
   that edge's digit and the scan resumes after it; reaching a digit
   without having skipped takes the edge.  Actions that appear before a
   failing condition in the same edge have already been applied.

   Returns CODE_STATE, possibly OR-ed with COMMENT_STATE, BUFFER_CHAR
   and FLUSH_CHAR as set by the edge that was taken.                    */
static int SingleStepDFA(/*@observer@*/char **dfa, State *state, char input)
{
   int action = CODE_STATE;
   char *p;

   /* Abandon the current edge: jump to its target digit, which the loop
      increment then steps over.  The assertion lives here because this
      is the one place a malformed row (a condition with no digit after
      it) can be detected before the pointer is used again.             */
   #define SKIP_EDGE() \
      do \
      { \
         p = strpbrk(p, "0123456789"); \
         assert(p != NULL); \
      } while( 0 )

   /* <ctype.h> functions need an unsigned char value, hence the cast */
   for(p = dfa[state->state]; !isdigit((unsigned char)*p); p++)
   {
      /* A row must end in an unconditional edge; never walk off the end */
      assert(*p != '\0');
      if( *p == ' ' )
         continue;
      switch( *p )
      {
         /* Conditions on the DFA's own memory */
         case 'q': if( input != state->quote ) SKIP_EDGE(); break;
         case 'z': if( state->nest != 1 ) SKIP_EDGE(); break;

         /* Actions */
         case '+': state->nest++; break;
         case '-': state->nest--; break;
         case '!': action |= COMMENT_STATE; break;
         case '?':
            action |= BUFFER_CHAR;
            if( state->buffer_char != '\0' )
            {
               /* Something is already held back.  The tables only ever
                  re-buffer the same character, so the caller can flush
                  the old one and keep holding an identical new one.    */
               assert(state->buffer_char == input);
               action |= FLUSH_CHAR;
            }
            else
            {
               state->buffer_char = input;
            }
            break;

         /* Quote characters match literally and are remembered, so that
            a later 'q' condition can find the matching closing quote   */
         case '\'':
         case '\"':
            if( input == *p )
               state->quote = *p;
            else
               SKIP_EDGE();
            break;

         /* Any other character is a literal match on the input */
         default:
            if( input != *p )
               SKIP_EDGE();
            break;
      }
   }

   /* p rests on the digit of the edge that was taken */
   state->state = (int)*p - (int)'0';
   return action;
}

/* Test DFA on input text, return 1 if input contains comments */
static int TestDFA(/*@observer@*/char **dfa, char *input, size_t input_size)
{
   State state;
   size_t i;

   memset(&state, 0, sizeof(State));
   for(i = 0; i < input_size; i++)
   {
      /* Count lines and ignore everything after the 5th line.  While
         the first few lines are usually comments, the length of those
         lines vary among programmers and languages, and we don't want
         to be checking for comments when we are inside the code region. */
      if( *input == '\n' )
      {
         if( ++(state.line_number) >= 5 )
            return 0;
      }
      if( (SingleStepDFA(dfa, &state, *input++) & COMMENT_STATE) != 0 )
         return 1;
   }
   return 0;
}

/* Run DFA on input text, writing the filtered text to stdout.

   dfa         DFA table selected for this file
   state       DFA state carried across calls, so that a file can be fed
               in several blocks; zero it before the first block
   input       block of text to process
   input_size  number of bytes at input

   Every input character produces exactly one output character, except
   for a character that is still being held back when the input ends.
   Characters that are not selected are replaced by a space, or kept
   as-is if they are whitespace, so the layout of the original file is
   preserved.  With InvertOutput == 0 the selected characters are the
   comments and each line is prefixed with its line number; otherwise
   the selected characters are the code and no line numbers are printed. */
static void RunDFA(
   /*@observer@*/char **dfa, State *state, char *input, size_t input_size)
{
   int action;
   int selected;
   size_t i;

   for(i = 0; i < input_size; i++, input++)
   {
      /* Print leading line number.  This helps matching each line
         back to the original file more easily.  We don't print line
         numbers when we are extracting code, since the output in that
         case is usually meant to be machine readable rather than
         human readable.                                               */
      if( InvertOutput == 0 )
      {
         if( state->continue_line == 0 )
         {
            printf("%7d ", ++(state->line_number));
            state->continue_line = 1;
         }
         if( *input == '\n' )
            state->continue_line = 0;
      }

      /* Process input */
      action = SingleStepDFA(dfa, state, *input);

      /* Does this step's text belong to the kind we are extracting? */
      selected = ((action & COMMENT_STATE) != InvertOutput);

      /* Flush previously buffered character.  For the current selected
         set of languages, this is guaranteed to be same as currently
         buffered character.  The flushed character obeys the output
         mode like any other: when extracting code, a held-back
         character that turned out to be code must be printed, not
         blanked.                                                       */
      if( (action & FLUSH_CHAR) != 0 )
      {
         if( selected )
            (void)putchar(state->buffer_char);
         else
            (void)putchar(' ');
      }

      /* Buffer character until next iteration.  This means if the
         file ends with a partial comment terminator, the output will
         be one character short.  Garbage in, garbage out.            */
      if( (action & BUFFER_CHAR) != 0 )
         continue;

      if( selected )
      {
         /* Text we want: emit the held-back character, then this one */
         if( state->buffer_char != '\0' )
         {
            (void)putchar(state->buffer_char);
            state->buffer_char = '\0';
         }
         (void)putchar(*input);
      }
      else
      {
         /* Text we do not want: blank it out but keep the layout */
         if( state->buffer_char != '\0' )
         {
            (void)putchar(' ');
            state->buffer_char = '\0';
         }
         /* Input bytes may be >= 0x80; <ctype.h> functions require a
            value representable as unsigned char                      */
         if( isspace((unsigned char)*input) )
            (void)putchar(*input);
         else
            (void)putchar(' ');
      }
   }
}

/* Guess the language of a stream from its opening comments.

   infile  stream to read from
   buffer  receives the first block; must hold READ_BUFFER_SIZE bytes
   size    receives the number of bytes actually read into buffer

   The first block is read and offered to each DFA in a fixed order of
   preference; the first DFA that finds a comment wins.  Although a
   whole block is read, TestDFA only looks at its first few lines.  The
   caller still has to process the block left in buffer.

   Returns the chosen DFA table, or the C table if none matched.        */
static /*@observer@*/char **DetectFileType(
   FILE *infile, /*@out@*/char *buffer, /*@out@*/size_t *size)
{
   /* Order of preference: earlier entries win when several would match */
   char **const candidates[] = { dfa_c, dfa_ml, dfa_py, dfa_scm };
   const size_t candidate_count = sizeof(candidates) / sizeof(candidates[0]);
   size_t k;

   /* Read first block */
   *size = fread(buffer, 1, READ_BUFFER_SIZE, infile);

   for(k = 0; k < candidate_count; k++)
   {
      if( TestDFA(candidates[k], buffer, *size) != 0 )
         return candidates[k];
   }

   /* Assume C by default */
   return dfa_c;
}