#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "config.h"
#include "cdetect.h"
#include "debug.h"

/*
 * Per-byte classification flags, kept in scan()'s ch_state / ch_pre_state.
 *
 *   SJIS_LEAD        byte accepted by IsSJISLead()
 *   SJIS_TRAIL       byte accepted by IsSJISTrail() and preceded by SJIS_LEAD
 *   JISX0201         byte in 0xA0..0xDF
 *   EUC_DBCS         byte in 0xA1..0xFE
 *   EUC_X0201PREFIX  byte 0x8E
 *   JIS_ESCAPE       byte matched the next step of an ESC sequence
 *   UCS2_NUL         byte 0x00
 *   UTF8_LEAD        not referenced in this file's visible code
 */
#define SJIS_LEAD		(1)
#define SJIS_TRAIL		(1 << 1)
#define JISX0201		(1 << 2)
#define EUC_DBCS		(1 << 3)
#define EUC_X0201PREFIX		(1 << 4)
#define JIS_ESCAPE		(1 << 5)
#define UCS2_NUL		(1 << 6)
#define UTF8_LEAD		(1 << 7)

/*
 * Running-mode flags, kept in scan()'s state variable.  A mode is set once
 * the matching kind of byte or byte pair has been seen, so that the next
 * consecutive occurrence can be scored higher.
 * UTF8_MODE is not referenced in this file's visible code.
 */
#define ASCII_MODE		(1)
#define SJIS_X0208MODE		(1 << 1)
#define SJIS_X0201MODE		(1 << 2)
#define EUC_MODE		(1 << 3)
#define EUC_X0201MODE		(1 << 4)
#define UTF8_MODE		(1 << 5)

/* Score increments used by scan(): SCORE_A is the largest, SCORE_C the smallest. */
#define SCORE_A 4
#define SCORE_B 2
#define SCORE_C 1

/*
 * Names of the character sets this module knows about.
 * NOTE(review): presumably indexed by enum CHARSET (declared outside this
 * file) -- confirm that the order here matches the enum.
 */
char *code_sets[] =
{
  "US-ASCII",
  "EUC-JP",
  "Shift_JIS",
  "ISO-2022-JP",
  "UTF-16",
  "UTF16BE",
  "UTF-8",
  "BINARY",
};


/* Forward declarations of the file-local helpers defined further down. */
static int IsSJISLead(char ch);
static int IsSJISTrail(char ch);

static int scan(char *ptr, int len, int *score, int *unscore, int *newline);

/*
 * Guesses the character set and the dominant newline style of a buffer.
 *
 * ptr: buffer to examine.
 * len: number of bytes in ptr.
 * nl:  optional output; when non-NULL it receives the index (into the
 *      newline counters filled by scan()) of the most frequent newline
 *      style, or 0 when no newline was counted.
 *
 * Returns the charset with the highest score.  On the first pass only
 * charsets with no recorded violation ("unscore") are eligible.  If that
 * pass leaves the answer at CS_BINARY, a second pass considers every charset
 * regardless of violations, but a candidate must still beat the best score
 * found on the first pass.  CS_BINARY is returned when nothing qualifies.
 */
enum CHARSET CodeDetect(char *ptr, int len, int *nl)
{
  int score[CODE_MAX] = {0};
  int unscore[CODE_MAX] = {0};
  int newline[3] = {0};
  int best_newline = 0;
  int best_newline_count = 0;
  int best_code = CS_BINARY;
  int best_score = 0;
  int pass;
  int i;

  scan(ptr, len, score, unscore, newline);

  /* Most frequent newline style; the first one wins a tie. */
  for(i = 0; i < 3; i++){
    if(newline[i] > best_newline_count){
      best_newline_count = newline[i];
      best_newline = i;
    }
  }
  if(nl != NULL){
    *nl = best_newline;
  }

  /* Each pass clears unscore[], which is what relaxes the second pass. */
  for(pass = 0; pass < 2 && best_code == CS_BINARY; pass++){
    for(i = CS_US_ASCII; i < CODE_MAX; i++){
      if(unscore[i] == 0 && score[i] > best_score){
        best_score = score[i];
        best_code = i;
      }
      unscore[i] = 0;
    }
  }

  return best_code;
}

/*
 * Optional instrumentation for scan().  When CHECK_SCORE is non-zero,
 * CHECK(A) increments slot A of the checkpoints[] array that scan() declares
 * locally; otherwise CHECK(A) expands to nothing.
 */
#define CHECK_SCORE 0
#if CHECK_SCORE
#define CHECK(A) (checkpoints[A]++)
#else
#define CHECK(A)
#endif
/*
 * Walks a byte buffer once and accumulates evidence for each charset.
 *
 * ptr:     buffer to examine.
 * len:     number of bytes in ptr; nothing is scanned when len <= 0.
 * score:   counters indexed by the CS_* constants, raised when the bytes
 *          look valid for that charset.
 * unscore: counters indexed the same way, raised when the bytes are invalid
 *          for that charset.
 * newline: counters indexed by NEWLINE_CR, NEWLINE_LF and NEWLINE_CRLF.
 *
 * The three arrays are only incremented/decremented here, never reset, so
 * the caller is expected to zero them first.
 *
 * Always returns 0.
 */
static int scan(char *ptr, int len, int *score, int *unscore, int *newline)
{
  int state = 0;            /* running *_MODE flags */
  int ch_state = 0;         /* classification flags of the current byte */
  int ch_pre_state = 0;     /* classification flags of the previous byte */
  int jis_step = 0;         /* bytes of an ESC sequence matched so far */
  int utf8_step = 0;        /* UTF-8 continuation bytes still expected */
  int utf8_len = 0;         /* continuation bytes in the current UTF-8 sequence */
  unsigned char pre_ch = 0; /* previous byte, used for CR LF pairing */
  unsigned int  eval;       /* current byte as an unsigned value */
  /*
   * Score granted when a UTF-8 sequence completes, looked up as
   * utf8_score[utf8_len - 1].
   * NOTE(review): with this indexing a 2-byte sequence scores 0 and the
   * last table entry is never used -- confirm whether utf8_score[utf8_len]
   * was intended.
   */
  static const int utf8_score[] = {0, SCORE_A, SCORE_A, SCORE_A, SCORE_A, SCORE_A+SCORE_A};
#if CHECK_SCORE
  int checkpoints[32] = {0};
#endif

  /* "len > 0" rather than "len" so a negative length cannot run away. */
  while(len > 0){
    ch_pre_state = ch_state;
    ch_state = 0;

    eval = (unsigned char)(*ptr);

    switch(eval){
    case 0:
      ch_state = UCS2_NUL;
      break;
    case '\r':
      newline[NEWLINE_CR]++;
      break;
    case '\n':
      if(pre_ch == '\r'){
        /* The CR was already counted on its own; re-book it as CR LF. */
        newline[NEWLINE_CR]--;
        newline[NEWLINE_CRLF]++;
      }else{
        newline[NEWLINE_LF]++;
      }
      break;
    case '\t':
    case 0x1A:/* EOF */
      break;

    case 0x1B:
      /* ESC: possible first byte of an ISO-2022-JP escape sequence. */
      if(jis_step == 0){
        ch_state |= JIS_ESCAPE;
        jis_step++;
      }
      CHECK(0);
      unscore[CS_US_ASCII]++;
      break;

    case '(':
    case '$':
      /* Second byte of an escape sequence. */
      if(jis_step == 1){
        ch_state |= JIS_ESCAPE;
        jis_step++;
      }
      break;

    case '@':
    case 'B':
    case 'H':
    case 'I':
    case 'J':
      /* Final byte of an escape sequence. */
      if(jis_step == 2){
        ch_state |= JIS_ESCAPE;
        score[CS_ISO2022_JP] += SCORE_A;
        /*
         * The sequence is complete.  Reset the matcher so that the byte
         * that follows is neither booked as a broken escape (binary) nor,
         * if it happens to be one of "@BHIJ", scored as another escape.
         */
        jis_step = 0;
        CHECK(1);
      }
      break;

    case 0x8E:
      ch_state |= EUC_X0201PREFIX;
      break;

    default:
      if(eval < 0x20){
        /* Any other control character counts as binary evidence. */
        score[CS_BINARY]++;
        CHECK(2);
      }
      break;
    }

    /* An escape sequence that was started but broken off counts as binary. */
    if(!(ch_state & JIS_ESCAPE)){
      if(jis_step){
        /* binary? */
        score[CS_BINARY]++;
        CHECK(6);
      }
      jis_step = 0;
    }

    /* Classify the byte for the double-byte encodings. */
    if(IsSJISLead((char)eval)){
      ch_state |= SJIS_LEAD;
    }
    if(IsSJISTrail((char)eval) && (ch_pre_state & SJIS_LEAD)){
      ch_state |= SJIS_TRAIL;
    }

    if(0xa1 <= eval && eval <= 0xfe){
      ch_state |= EUC_DBCS;
    }else{
      state &= ~EUC_MODE;
    }

    if(0xa0 <= eval && eval <= 0xdf){
      ch_state |= JISX0201;
    }

    /* Shift_JIS */
    if(ch_pre_state & SJIS_LEAD){
      if(ch_state & SJIS_TRAIL){
        /*
         * NOTE(review): SJIS_X0208MODE is cleared by the else-branch below
         * on the byte right after every completed pair, so this SCORE_A
         * branch looks unreachable -- confirm the intended weighting before
         * changing it.
         */
        if(state & SJIS_X0208MODE){
          score[CS_Shift_JIS] += SCORE_A;
          CHECK(7);
        }else{
          state |= SJIS_X0208MODE;
          score[CS_Shift_JIS] += SCORE_C;
          CHECK(8);
        }
        /* A trail byte must not double as the lead of the next pair. */
        ch_state &= ~SJIS_LEAD;
      }else{
        state &= ~SJIS_X0208MODE;
        score[CS_BINARY] += SCORE_C;
        unscore[CS_Shift_JIS]++;
        CHECK(9);
      }
    }else{
      state &= ~SJIS_X0208MODE;
    }

    /* EUC */
    if(ch_pre_state & EUC_DBCS){
      if(ch_state & EUC_DBCS){
        if(state & (EUC_MODE | EUC_X0201MODE)){
          score[CS_EUC_JP] += SCORE_A;
          CHECK(10);
        }else{
          state |= EUC_MODE;
          score[CS_EUC_JP] += SCORE_C;
          CHECK(11);
        }
        /* Second byte of a pair: do not treat it as the start of another. */
        ch_state &= ~EUC_DBCS;
      }else{
        state &= ~EUC_MODE;
        unscore[CS_EUC_JP]++;
        CHECK(12);
      }
    }

    /* JIS X 0201 */
    if(ch_state & JISX0201){
      if(ch_pre_state & EUC_X0201PREFIX){
        /* 0x8E followed by a byte in 0xA0..0xDF: scored for EUC-JP. */
        if(state & (EUC_MODE | EUC_X0201MODE)){
          score[CS_EUC_JP] += SCORE_B;
          CHECK(13);
        }else{
          score[CS_EUC_JP] += SCORE_C;
          state |= (EUC_MODE | EUC_X0201MODE);
        }
        ch_state &= ~EUC_DBCS;
      }else{
        /* Bare byte in 0xA0..0xDF: scored for Shift_JIS once repeated. */
        state &= ~EUC_X0201MODE;
        if(state & SJIS_X0201MODE){
          score[CS_Shift_JIS] += SCORE_B;
          CHECK(14);
        }else{
          state |= SJIS_X0201MODE;
          score[CS_BINARY] += SCORE_C;
        }
        CHECK(15);
      }
    }else{
      state &= ~(EUC_X0201MODE | SJIS_X0201MODE);
    }

    /* UTF-8 */
    if((eval & 0x80) == 0){           /* 0xxx xxxx */
      if(utf8_step){
        /*
         * A 7-bit byte arrived while continuation bytes were still due:
         * the multi-byte sequence is malformed, so it must not be
         * completed (and scored) by continuation bytes that come later.
         */
        unscore[CS_UTF8]++;
        utf8_step = 0;
      }
    }else{
      unscore[CS_US_ASCII]++;

      if((eval & 0xc0) == 0x80){  /* 10xx xxxx */
        if(utf8_step){
          utf8_step--;
          if(utf8_step == 0){
            score[CS_UTF8] += utf8_score[utf8_len - 1];
            CHECK(3);
          }
        }else{
          /* Continuation byte without a lead byte. */
          unscore[CS_UTF8]++;
          CHECK(4);
        }
      }else if((eval & 0xe0) == 0xc0){  /* 110x xxxx */
        utf8_len = 1;
        goto DETERMIN_TRAILBYTES;
      }else if((eval & 0xf0) == 0xe0){  /* 1110 xxxx */
        utf8_len = 2;
        goto DETERMIN_TRAILBYTES;
      }else if((eval & 0xf8) == 0xf0){  /* 1111 0xxx */
        utf8_len = 3;
        goto DETERMIN_TRAILBYTES;
      }else if((eval & 0xfc) == 0xf8){  /* 1111 10xx */
        utf8_len = 4;
        goto DETERMIN_TRAILBYTES;
      }else if((eval & 0xfe) == 0xfc){  /* 1111 110x */
        utf8_len = 5;
      DETERMIN_TRAILBYTES:
        if(utf8_step){
          /* New lead byte before the previous sequence finished. */
          unscore[CS_UTF8]++;
          CHECK(5);
          utf8_step = 0;
        }else{
          utf8_step = utf8_len;
        }
      }else{
        /* 0xFE and 0xFF never occur in UTF-8. */
        unscore[CS_UTF8]++;
        utf8_step = 0;
      }
    }


    /* Printable ASCII: evidence for UTF-16 (after a NUL) and for US-ASCII. */
    if(0x20 <= eval && eval < 0x7f){
      if(ch_pre_state & UCS2_NUL){
        score[CS_UTF16] += SCORE_A;
        CHECK(16);
      }
      if(state & ASCII_MODE){
        score[CS_US_ASCII] += SCORE_C;
        CHECK(17);
      }else{
        state |= ASCII_MODE;
      }
    }else{
      state &= ~ ASCII_MODE;
    }

    pre_ch = eval;
    ptr++;
    len--;
  }

  /* Disabled dump of the CHECK() counters (needs CHECK_SCORE to compile). */
#if 0
  for(len = 0; len <= 16; len++){
    dprintf("check %d: %d\n", len, checkpoints[len]);
  }
#endif

  return 0;
}

/*
 * Reports whether a byte can be the first (lead) byte of a two-byte
 * Shift_JIS character, i.e. whether it lies in 0x81-0x9F or 0xE0-0xFC.
 *
 * ch: byte to test; it is examined as an unsigned value, so the result does
 *     not depend on whether plain char is signed.
 *
 * Returns 1 for a lead byte, 0 otherwise.
 */
static int IsSJISLead(char ch)
{
  const unsigned char byte = (unsigned char)ch;
  const int in_low_range = (0x81 <= byte && byte <= 0x9f);
  const int in_high_range = (0xe0 <= byte && byte <= 0xfc);

  return (in_low_range || in_high_range) ? 1 : 0;
}

/*
 * Reports whether a byte can be the second (trail) byte of a two-byte
 * Shift_JIS character.
 *
 * Valid trail bytes are 0x40-0x7E and 0x80-0xFC; 0x7F is not a trail byte.
 * The ranges are tested explicitly on the unsigned value of the byte, so
 * the answer is the same whether plain char is signed or unsigned.
 *
 * ch: byte to test.
 *
 * Returns 1 for a valid trail byte, 0 otherwise.
 */
static int IsSJISTrail(char ch)
{
  const unsigned char byte = (unsigned char)ch;

  if(byte < 0x40 || byte == 0x7f || byte > 0xfc){
    return 0;
  }
  return 1;
}


/*
 * Returns the next selectable charset after `charset`.
 *
 * The value is advanced one step at a time.  CS_UTF16 and CS_UTF16BE are
 * skipped.  On reaching CODE_MAX (or any value not listed below) the search
 * restarts from 0; because the value is incremented again before it is
 * examined, 0 itself is never returned after such a wrap.
 * NOTE(review): if CS_US_ASCII is 0, US-ASCII can therefore not be reached
 * by wrapping around -- confirm against the enum definition.
 */
enum CHARSET NextCharset(enum CHARSET charset)
{
  for(;;){
    charset++;
    switch(charset){
    case CS_US_ASCII:
    case CS_EUC_JP:
    case CS_Shift_JIS:
    case CS_ISO2022_JP:
    case CS_UTF8:
    case CS_BINARY:
      /* Selectable charset found. */
      return charset;

    case CS_UTF16:
    case CS_UTF16BE:
      /* Not selectable: keep advancing. */
      break;

    case CODE_MAX:
    default:
      /* Ran off the end: restart the search. */
      charset = 0;
      break;
    }
  }
}

