/* 
 * Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
 *
 * This file is part of Rast.
 * See the file COPYING for redistribution information.
 *
 */

#include <ctype.h>

#include <apr_strings.h>

#include "rast/config.h"
#include "rast/encoding.h"
#include "rast/string.h"

/*
 * Byte length of an EUC-JP character, indexed by the value of its first
 * byte (see get_char_len()).
 *
 * 0x8E starts a 2-byte sequence, 0x8F a 3-byte sequence and 0xA1-0xFE a
 * 2-byte sequence; every other byte value is counted as a 1-byte character.
 * Each row below covers 16 consecutive byte values, starting at 0x00.
 */
static const unsigned char euc_jp_char_size_table[] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};

/*
 * Replacement strings for a half-width (JIS X 0201) kana character, i.e. a
 * 2-byte sequence that starts with 0x8E.  Each entry is the 2-byte EUC-JP
 * encoding of the corresponding full-width (JIS X 0208) character.
 *
 * Indexed by (second byte - 0xA0).  The table holds 64 entries, so it only
 * covers second bytes 0xA0-0xDF.
 */
static const char *x0201kana_to_x0208[] = {
    "\xA1\xA1", "\xA1\xA3", "\xA1\xD6", "\xA1\xD7", "\xA1\xA2", "\xA1\xA6",
    "\xA5\xF2", "\xA5\xA1", "\xA5\xA3", "\xA5\xA5", "\xA5\xA7", "\xA5\xA9",
    "\xA5\xE3", "\xA5\xE5", "\xA5\xE7", "\xA5\xC3", "\xA1\xBC", "\xA5\xA2",
    "\xA5\xA4", "\xA5\xA6", "\xA5\xA8", "\xA5\xAA", "\xA5\xAB", "\xA5\xAD",
    "\xA5\xAF", "\xA5\xB1", "\xA5\xB3", "\xA5\xB5", "\xA5\xB7", "\xA5\xB9",
    "\xA5\xBB", "\xA5\xBD", "\xA5\xBF", "\xA5\xC1", "\xA5\xC4", "\xA5\xC6",
    "\xA5\xC8", "\xA5\xCA", "\xA5\xCB", "\xA5\xCC", "\xA5\xCD", "\xA5\xCE",
    "\xA5\xCF", "\xA5\xD2", "\xA5\xD5", "\xA5\xD8", "\xA5\xDB", "\xA5\xDE",
    "\xA5\xDF", "\xA5\xE0", "\xA5\xE1", "\xA5\xE2", "\xA5\xE4", "\xA5\xE6",
    "\xA5\xE8", "\xA5\xE9", "\xA5\xEA", "\xA5\xEB", "\xA5\xEC", "\xA5\xED",
    "\xA5\xEF", "\xA5\xF3", "\xA1\xAB", "\xA1\xAC",
};

/*
 * Replacement strings for a half-width kana character that is immediately
 * followed by the half-width voiced sound mark (0x8E 0xDE).
 *
 * Indexed by (second byte - 0xA0), parallel to x0201kana_to_x0208 (64
 * entries).  Where a single full-width voiced character exists the entry is
 * that 2-byte character; otherwise it is the plain full-width character
 * followed by the full-width voiced sound mark "\xA1\xAB".
 */
static const char *voiced_x0201kana_to_x0208[] = {
    "\xA1\xA1\xA1\xAB", "\xA1\xA3\xA1\xAB", "\xA1\xD6\xA1\xAB",
    "\xA1\xD7\xA1\xAB", "\xA1\xA2\xA1\xAB", "\xA1\xA6\xA1\xAB",
    "\xA5\xF2\xA1\xAB", "\xA5\xA1\xA1\xAB", "\xA5\xA3\xA1\xAB",
    "\xA5\xA5\xA1\xAB", "\xA5\xA7\xA1\xAB", "\xA5\xA9\xA1\xAB",
    "\xA5\xE3\xA1\xAB", "\xA5\xE5\xA1\xAB", "\xA5\xE7\xA1\xAB",
    "\xA5\xC3\xA1\xAB", "\xA1\xBC\xA1\xAB", "\xA5\xA2\xA1\xAB",
    "\xA5\xA4\xA1\xAB", "\xA5\xF4", "\xA5\xA8\xA1\xAB",
    "\xA5\xAA\xA1\xAB", "\xA5\xAC", "\xA5\xAE",
    "\xA5\xB0", "\xA5\xB2", "\xA5\xB4",
    "\xA5\xB6", "\xA5\xB8", "\xA5\xBA",
    "\xA5\xBC", "\xA5\xBE", "\xA5\xC0",
    "\xA5\xC2", "\xA5\xC5", "\xA5\xC7",
    "\xA5\xC9", "\xA5\xCA\xA1\xAB", "\xA5\xCB\xA1\xAB",
    "\xA5\xCC\xA1\xAB", "\xA5\xCD\xA1\xAB", "\xA5\xCE\xA1\xAB",
    "\xA5\xD0", "\xA5\xD3", "\xA5\xD6",
    "\xA5\xD9", "\xA5\xDC", "\xA5\xDE\xA1\xAB",
    "\xA5\xDF\xA1\xAB", "\xA5\xE0\xA1\xAB", "\xA5\xE1\xA1\xAB",
    "\xA5\xE2\xA1\xAB", "\xA5\xE4\xA1\xAB", "\xA5\xE6\xA1\xAB",
    "\xA5\xE8\xA1\xAB", "\xA5\xE9\xA1\xAB", "\xA5\xEA\xA1\xAB",
    "\xA5\xEB\xA1\xAB", "\xA5\xEC\xA1\xAB", "\xA5\xED\xA1\xAB",
    "\xA5\xEF\xA1\xAB", "\xA5\xF3\xA1\xAB", "\xA1\xAB\xA1\xAB",
    "\xA1\xAC\xA1\xAB",
};

/*
 * Replacement strings for a half-width kana character that is immediately
 * followed by the half-width semi-voiced sound mark (0x8E 0xDF).
 *
 * Indexed by (second byte - 0xA0), parallel to x0201kana_to_x0208 (64
 * entries).  Where a single full-width semi-voiced character exists the
 * entry is that 2-byte character; otherwise it is the plain full-width
 * character followed by the full-width semi-voiced sound mark "\xA1\xAC".
 */
static const char *semi_voiced_x0201kana_to_x0208[] = {
    "\xA1\xA1\xA1\xAC", "\xA1\xA3\xA1\xAC", "\xA1\xD6\xA1\xAC",
    "\xA1\xD7\xA1\xAC", "\xA1\xA2\xA1\xAC", "\xA1\xA6\xA1\xAC",
    "\xA5\xF2\xA1\xAC", "\xA5\xA1\xA1\xAC", "\xA5\xA3\xA1\xAC",
    "\xA5\xA5\xA1\xAC", "\xA5\xA7\xA1\xAC", "\xA5\xA9\xA1\xAC",
    "\xA5\xE3\xA1\xAC", "\xA5\xE5\xA1\xAC", "\xA5\xE7\xA1\xAC",
    "\xA5\xC3\xA1\xAC", "\xA1\xBC\xA1\xAC", "\xA5\xA2\xA1\xAC",
    "\xA5\xA4\xA1\xAC", "\xA5\xA6\xA1\xAC", "\xA5\xA8\xA1\xAC",
    "\xA5\xAA\xA1\xAC", "\xA5\xAB\xA1\xAC", "\xA5\xAD\xA1\xAC",
    "\xA5\xAF\xA1\xAC", "\xA5\xB1\xA1\xAC", "\xA5\xB3\xA1\xAC",
    "\xA5\xB5\xA1\xAC", "\xA5\xB7\xA1\xAC", "\xA5\xB9\xA1\xAC",
    "\xA5\xBB\xA1\xAC", "\xA5\xBD\xA1\xAC", "\xA5\xBF\xA1\xAC",
    "\xA5\xC1\xA1\xAC", "\xA5\xC4\xA1\xAC", "\xA5\xC6\xA1\xAC",
    "\xA5\xC8\xA1\xAC", "\xA5\xCA\xA1\xAC", "\xA5\xCB\xA1\xAC",
    "\xA5\xCC\xA1\xAC", "\xA5\xCD\xA1\xAC", "\xA5\xCE\xA1\xAC",
    "\xA5\xD1", "\xA5\xD4", "\xA5\xD7",
    "\xA5\xDA", "\xA5\xDD", "\xA5\xDE\xA1\xAC",
    "\xA5\xDF\xA1\xAC", "\xA5\xE0\xA1\xAC", "\xA5\xE1\xA1\xAC",
    "\xA5\xE2\xA1\xAC", "\xA5\xE4\xA1\xAC", "\xA5\xE6\xA1\xAC",
    "\xA5\xE8\xA1\xAC", "\xA5\xE9\xA1\xAC", "\xA5\xEA\xA1\xAC",
    "\xA5\xEB\xA1\xAC", "\xA5\xEC\xA1\xAC", "\xA5\xED\xA1\xAC",
    "\xA5\xEF\xA1\xAC", "\xA5\xF3\xA1\xAC", "\xA1\xAB\xA1\xAC",
    "\xA1\xAC\xA1\xAC",
};

/*
 * Character classes assigned by get_char_type().  euc_jp_get_token() uses
 * them to pick the token length (decide_n()) and to detect where a run of
 * same-class characters ends.
 */
typedef enum {
    /* Anything not classified as one of the classes below. */
    CHAR_TYPE_OTHER,
    /* 1-byte alphanumerics and whitespace; 2-byte characters led by 0xA3. */
    CHAR_TYPE_ALPHABET,
    /* 2-byte characters led by 0xA5 or 0x8E, and the character 0xA1 0xBC. */
    CHAR_TYPE_KATAKANA,
    /* 2-byte characters led by 0xA4. */
    CHAR_TYPE_HIRAGANA,
} char_type_e;

/*
 * Return the number of bytes occupied by the EUC-JP character at ptr.
 *
 * The length is looked up from the first byte in euc_jp_char_size_table and
 * clamped to the bytes that remain before ptr_end, so a multi-byte character
 * that is cut off by the end of the buffer yields only the remaining bytes.
 *
 * Returns 0 when ptr is at or past ptr_end; in that case ptr is not
 * dereferenced.
 */
static int
get_char_len(const char *ptr, const char *ptr_end)
{
    int len;

    /* Nothing left to read: looking at *ptr would be an over-read. */
    if (ptr >= ptr_end) {
        return 0;
    }
    len = euc_jp_char_size_table[(unsigned char) *ptr];
    /*
     * Compare lengths rather than pointers so that no pointer beyond
     * ptr_end is ever formed.
     */
    if (len > ptr_end - ptr) {
        len = (int) (ptr_end - ptr);
    }
    return len;
}

/*
 * Encoding-module callback: store in *char_len the byte length of the
 * character at the tokenizer's current position, as computed by
 * get_char_len().  The tokenizer is not modified.  Always returns RAST_OK.
 */
static rast_error_t *
euc_jp_get_char_len(rast_tokenizer_t *tokenizer, rast_size_t *char_len)
{
    const char *cur = tokenizer->ptr;
    const char *end = tokenizer->ptr_end;

    *char_len = get_char_len(cur, end);
    return RAST_OK;
}

/*
 * Classify the character that starts at ch and is ch_nbytes bytes long.
 *
 *   1 byte : CHAR_TYPE_ALPHABET if isalnum() or isspace() accepts it.
 *   2 bytes: 0xA1 0xBC                          -> CHAR_TYPE_KATAKANA
 *            lead 0xA3, second byte >= 0xA0     -> CHAR_TYPE_ALPHABET
 *            lead 0xA4, second byte >= 0xA0     -> CHAR_TYPE_HIRAGANA
 *            lead 0xA5/0x8E, second byte >= 0xA0 -> CHAR_TYPE_KATAKANA
 *
 * Everything else, including any other byte count, is CHAR_TYPE_OTHER.
 */
static char_type_e
get_char_type(const char *ch, int ch_nbytes)
{
    const unsigned char *bytes = (const unsigned char *) ch;
    char_type_e kind;

    if (ch_nbytes == 1) {
        if (isalnum(bytes[0]) || isspace(bytes[0])) {
            return CHAR_TYPE_ALPHABET;
        }
        return CHAR_TYPE_OTHER;
    }
    if (ch_nbytes != 2) {
        return CHAR_TYPE_OTHER;
    }

    /* In row 0xA1 only the character 0xA1 0xBC is given a class. */
    if (bytes[0] == 0xA1) {
        return bytes[1] == 0xBC ? CHAR_TYPE_KATAKANA : CHAR_TYPE_OTHER;
    }

    /* The remaining classes depend on the lead byte alone ... */
    if (bytes[0] == 0xA3) {
        kind = CHAR_TYPE_ALPHABET;
    }
    else if (bytes[0] == 0xA4) {
        kind = CHAR_TYPE_HIRAGANA;
    }
    else if (bytes[0] == 0xA5 || bytes[0] == 0x8E) {
        kind = CHAR_TYPE_KATAKANA;
    }
    else {
        return CHAR_TYPE_OTHER;
    }

    /* ... provided the second byte is at least 0xA0. */
    return bytes[1] >= 0xA0 ? kind : CHAR_TYPE_OTHER;
}

/*
 * Return the maximum number of characters in a token whose first character
 * has the given class: 3 for alphabet, hiragana and katakana, 2 for
 * everything else.
 */
static int
decide_n(char_type_e type)
{
    if (type == CHAR_TYPE_ALPHABET ||
        type == CHAR_TYPE_HIRAGANA ||
        type == CHAR_TYPE_KATAKANA) {
        return 3;
    }
    return 2;
}

/*
 * Encoding-module callback: measure the token that starts at the
 * tokenizer's current position.  The tokenizer itself is not modified.
 *
 * The token is built as follows:
 *   - the first character fixes the class and the maximum length n
 *     (decide_n());
 *   - the second character, when present, is always included; if its class
 *     differs from the first one the token ends there;
 *   - further characters are added only while they have the same class as
 *     the first one and fewer than n characters have been collected.
 *
 * On success token->nchars and token->nbytes describe the token, and
 * token->is_complete is 0 when the end of the input was reached before the
 * token could be finished (after the first character, or while extending
 * towards n characters) and 1 otherwise.
 *
 * Returns a RAST_ERROR_CURSOR error, leaving *token untouched, when the
 * current position is already at or past the end of the input.
 */
static rast_error_t *
euc_jp_get_token(rast_tokenizer_t *tokenizer, rast_token_t *token)
{
    const char *start = tokenizer->ptr;
    const char *end = tokenizer->ptr_end;
    const char *cur = start;
    char_type_e first_type;
    int width, max_chars;
    int complete = 1;

    if (start >= end) {
        return rast_error(RAST_ERROR_CURSOR, "out of cursor");
    }

    /* First character: determines the class and the token length. */
    width = get_char_len(cur, end);
    first_type = get_char_type(cur, width);
    max_chars = decide_n(first_type);
    cur += width;
    token->nchars = 1;

    if (cur >= end) {
        /* Input ended after a single character. */
        complete = 0;
    }
    else {
        /* Second character: consumed whatever its class is. */
        width = get_char_len(cur, end);
        if (get_char_type(cur, width) == first_type) {
            cur += width;
            token->nchars++;
            /* Same class: keep extending up to max_chars characters. */
            while (token->nchars < max_chars) {
                if (cur >= end) {
                    complete = 0;
                    break;
                }
                width = get_char_len(cur, end);
                if (get_char_type(cur, width) != first_type) {
                    break;
                }
                cur += width;
                token->nchars++;
            }
        }
        else {
            /* Class changed: the token is exactly these two characters. */
            cur += width;
            token->nchars++;
        }
    }

    token->nbytes = cur - start;
    token->is_complete = complete;
    return RAST_OK;
}

/*
 * Encoding-module callback: report how far the next position is from the
 * current one.  *char_offset is always set to 1 and *byte_offset receives
 * the byte length of the current character (see euc_jp_get_char_len()).
 * Returns whatever euc_jp_get_char_len() returns.
 */
static rast_error_t *
euc_jp_get_next_offset(rast_tokenizer_t *tokenizer,
                       rast_size_t *byte_offset, rast_size_t *char_offset)
{
    rast_error_t *error;

    *char_offset = 1;
    error = euc_jp_get_char_len(tokenizer, byte_offset);
    return error;
}

/*
 * Convert the half-width kana character at sp (0x8E followed by one kana
 * byte) to its full-width equivalent and append the result to dst.
 *
 * If the character is immediately followed by a half-width voiced
 * (0x8E 0xDE) or semi-voiced (0x8E 0xDF) sound mark, the pair is converted
 * together using the corresponding table and 4 bytes are consumed.
 *
 * The conversion tables only cover second bytes 0xA0-0xDF.  Any other
 * second byte is copied to dst unchanged together with its 0x8E lead byte
 * instead of being used as an (out-of-range) table index.
 *
 * The caller must guarantee that at least two bytes are readable at sp.
 *
 * Returns the number of source bytes consumed: 2 or 4.
 */
static int
convert_x0201kana_to_x0208(const unsigned char *sp,
                           const unsigned char *sp_end, rast_string_t *dst)
{
    /* The three tables are parallel, so one bound serves all of them. */
    const int table_len = (int) (sizeof(x0201kana_to_x0208) /
                                 sizeof(x0201kana_to_x0208[0]));
    const char **table = x0201kana_to_x0208;
    const char *s;
    int index = sp[1] - 0xA0;
    int consumed = 2;

    if (index < 0 || index >= table_len) {
        /* Not a byte the tables know about: pass it through untouched. */
        rast_string_append(dst, (const char *) sp, 2);
        return 2;
    }

    /* Compare lengths so that no pointer beyond sp_end is formed. */
    if (sp_end - sp >= 4 && sp[2] == 0x8E) {
        if (sp[3] == 0xDE) {
            table = voiced_x0201kana_to_x0208;
            consumed = 4;
        }
        else if (sp[3] == 0xDF) {
            table = semi_voiced_x0201kana_to_x0208;
            consumed = 4;
        }
    }

    s = table[index];
    rast_string_append(dst, s, strlen(s));
    return consumed;
}

/*
 * Convert the 2-byte character at sp to a single byte by clearing the top
 * bit of its second byte, and append that byte to dst.  sp_end is not used.
 *
 * NOTE(review): the second byte is not range-checked here, so every value
 * the caller lets through is converted, not only digits and letters --
 * confirm this is intended.
 *
 * Always returns 2, the number of source bytes consumed.
 */
static int
convert_x0208alnum_to_ascii(const unsigned char *sp,
                            const unsigned char *sp_end, rast_string_t *dst)
{
    unsigned char ascii = sp[1] & 0x7F;

    rast_string_append(dst, &ascii, 1);
    return 2;
}

/*
 * Encoding-module callback: build a normalized copy of the EUC-JP text
 * src[0..src_len) in a rast_string_t created from pool.
 *
 *   - every run of bytes accepted by isspace() becomes a single " ";
 *   - a 2-byte character led by 0x8E whose second byte is >= 0xA0 goes
 *     through convert_x0201kana_to_x0208();
 *   - a 2-byte character led by 0xA3 whose second byte is >= 0xA0 goes
 *     through convert_x0208alnum_to_ascii();
 *   - every other character is copied unchanged.
 *
 * On return *dst and *dst_len are the pointer and length of the resulting
 * string.
 */
static void
euc_jp_normalize_text(apr_pool_t *pool,
                      const char *src, rast_size_t src_len,
                      char **dst, rast_size_t *dst_len)
{
    const unsigned char *sp = (unsigned char *) src;
    const unsigned char *sp_end = sp + src_len;
    rast_string_t *string = rast_string_create(pool, "", 0, src_len + 1);
    int len;

    while (sp < sp_end) {
        if (isspace(*sp)) {
            /* Collapse the whole whitespace run into one blank. */
            rast_string_append(string, " ", 1);
            do {
                sp++;
            } while (sp < sp_end && isspace(*sp));
            continue;
        }

        len = get_char_len((const char *) sp, (const char *) sp_end);
        if (len == 2 && sp[1] >= 0xA0) {
            if (sp[0] == 0x8E) {
                sp += convert_x0201kana_to_x0208(sp, sp_end, string);
                continue;
            }
            if (sp[0] == 0xA3) {
                sp += convert_x0208alnum_to_ascii(sp, sp_end, string);
                continue;
            }
        }

        /* No conversion applies: copy the character as it is. */
        rast_string_append(string, sp, len);
        sp += len;
    }

    *dst = string->ptr;
    *dst_len = string->len;
}

/*
 * Encoding-module callback: copy the EUC-JP text src[0..src_len) into a
 * buffer allocated from pool, converting ASCII upper-case letters to lower
 * case.  All other characters are copied byte for byte.
 *
 * The output has the same length as the input and is NUL-terminated;
 * *dst receives the buffer and *dst_len its length without the terminator.
 *
 * Upper-case detection is done with an explicit 'A'-'Z' range test instead
 * of isupper()/tolower(): those depend on the current locale, and the test
 * is applied to the first byte of every character, so under a single-byte
 * 8-bit locale a multi-byte lead byte could be treated as a letter and be
 * altered.
 */
static void
euc_jp_normalize_chars(apr_pool_t *pool,
                       const char *src, rast_size_t src_len,
                       char **dst, rast_size_t *dst_len)
{
    const unsigned char *sp, *sp_end;
    unsigned char *dp;
    int len;

    sp = (const unsigned char *) src;
    sp_end = sp + src_len;
    *dst = (char *) apr_palloc(pool, src_len + 1);
    dp = (unsigned char *) *dst;
    while (sp < sp_end) {
        if (*sp >= 'A' && *sp <= 'Z') {
            *dp = (unsigned char) (*sp - 'A' + 'a');
            sp++;
            dp++;
        }
        else {
            /* Copy a whole character so trailing bytes are never inspected. */
            len = get_char_len((const char *) sp, (const char *) sp_end);
            memcpy(dp, sp, len);
            sp += len;
            dp += len;
        }
    }
    *dp = '\0';
    *dst_len = dp - (unsigned char *) *dst;
}

/*
 * Encoding-module callback: tell whether ch is a space character.
 *
 * A 2-byte character is a space only if it is 0xA1 0xA1; for any other
 * byte count the result is isspace() applied to the first byte.
 * Returns non-zero for a space, 0 otherwise.
 */
static int
euc_jp_char_is_space(rast_char_t *ch)
{
    const unsigned char *bytes = (const unsigned char *) ch->ptr;

    if (ch->nbytes != 2) {
        return isspace(bytes[0]);
    }
    return bytes[0] == 0xA1 && bytes[1] == 0xA1;
}

/*
 * Encoding module descriptor for EUC-JP: the encoding name followed by the
 * callbacks implemented in this file.
 *
 * The initializer is positional, so the entries must stay in the member
 * order of rast_encoding_module_t (not declared in this file; presumably in
 * rast/encoding.h -- confirm before reordering).
 */
rast_encoding_module_t rast_encoding_euc_jp = {
    "EUC-JP",
    euc_jp_get_char_len,
    euc_jp_get_token,
    euc_jp_get_next_offset,
    euc_jp_normalize_text,
    euc_jp_normalize_chars,
    euc_jp_char_is_space,
};

/* vim: set filetype=c sw=4 expandtab : */
