/* 
 * Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
 *
 * This file is part of Rast.
 * See the file COPYING for redistribution information.
 *
 */

#include <apr_strings.h>

#include <unicode/utypes.h>
#include <unicode/ustring.h>
#include <unicode/uchar.h>
#include <unicode/unorm.h>

#include <ctype.h>

#include "rast/config.h"
#include "rast/encoding.h"
#include "rast/string.h"

/*
 * Byte length of a UTF-8 sequence, indexed by the value of its first byte
 * (0x00-0xFF, sixteen entries per row):
 *
 *   0x00-0xBF -> 1    0xC0-0xDF -> 2    0xE0-0xEF -> 3
 *   0xF0-0xF7 -> 4    0xF8-0xFB -> 5    0xFC-0xFD -> 6
 *   0xFE-0xFF -> 1
 *
 * Note that continuation bytes (0x80-0xBF) and 0xFE/0xFF, which cannot
 * begin a sequence, are given a length of 1.
 */
static const unsigned char utf8_char_size_table[] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,
};

/* Code points used in this file: U+0020 is what whitespace runs collapse
 * to and is accepted inside tokens; U+003F ('?') is substituted for
 * byte sequences that cannot be decoded. */
#define UNICODE_SPACE         0x00000020
#define UNICODE_QUESTION_MARK 0x0000003F

/*
 * Store in *char_len the byte length of the UTF-8 character at the
 * tokenizer's cursor (tokenizer->ptr).
 *
 * The length comes from utf8_char_size_table, indexed by the byte at the
 * cursor, and is clipped to the number of bytes left before
 * tokenizer->ptr_end so that a sequence cut off by the end of the input
 * never reports more bytes than exist.  When the cursor is already at or
 * past ptr_end, 0 is stored and the byte at the cursor is not read.
 *
 * Always returns RAST_OK.
 */
static rast_error_t *
utf8_get_char_len(rast_tokenizer_t *tokenizer, rast_size_t *char_len)
{
    const char *ptr = tokenizer->ptr;
    const char *ptr_end = tokenizer->ptr_end;
    int len;

    if (ptr >= ptr_end) {
        /* Nothing left to measure; avoid reading beyond the buffer. */
        *char_len = 0;
        return RAST_OK;
    }
    len = utf8_char_size_table[(unsigned char) *ptr];
    /* Compare lengths instead of computing ptr + len, which could point
     * more than one element past the end of the buffer. */
    if (len > ptr_end - ptr) {
        len = (int) (ptr_end - ptr);
    }
    *char_len = len;
    return RAST_OK;
}

/*
 * Decode the UTF-8 sequence beginning at p into one code point.
 *
 * p   - first byte of the sequence
 * end - one past the last readable byte
 * ch  - receives the decoded code point, or UNICODE_QUESTION_MARK when
 *       the sequence is truncated or ICU rejects it
 *
 * Returns the number of bytes consumed: the length given by
 * utf8_char_size_table for the first byte, or the number of bytes that
 * remain when the sequence runs past end.  When p is already at or past
 * end, nothing is read and 0 is returned.
 */
static int
utf8_get_uchar32(const unsigned char *p, const unsigned char *end, UChar32 *ch)
{
    UErrorCode err;
    int32_t len, buf_len;
    UChar buf[2];   /* room for one code point as a UTF-16 surrogate pair */

    if (p >= end) {
        /* Empty range: do not dereference p. */
        *ch = UNICODE_QUESTION_MARK;
        return 0;
    }
    len = utf8_char_size_table[*p];
    if (len > end - p) {
        *ch = UNICODE_QUESTION_MARK;
        return end - p;
    }
    err = U_ZERO_ERROR;
    /* ICU takes UTF-8 input as const char *; the bytes are unchanged. */
    u_strFromUTF8(buf, 2, &buf_len, (const char *) p, len, &err);
    if (err != U_ZERO_ERROR && err != U_STRING_NOT_TERMINATED_WARNING) {
        *ch = UNICODE_QUESTION_MARK;
        return len;
    }
    err = U_ZERO_ERROR;
    /* The destination holds exactly one UChar32, so an unterminated
     * result is the expected outcome here, not a failure. */
    u_strToUTF32(ch, 1, NULL, buf, buf_len, &err);
    if (err != U_ZERO_ERROR && err != U_STRING_NOT_TERMINATED_WARNING) {
        *ch = UNICODE_QUESTION_MARK;
    }
    return len;
}

static UBlockCode
get_ublock_code(UChar ch)
{
    UBlockCode code;

    code = ublock_getCode(ch);
    switch (code) {
    case UBLOCK_BASIC_LATIN:
    case UBLOCK_LATIN_1_SUPPLEMENT:
    case UBLOCK_LATIN_EXTENDED_A:
    case UBLOCK_LATIN_EXTENDED_B:
    case UBLOCK_LATIN_EXTENDED_ADDITIONAL:
        return UBLOCK_BASIC_LATIN;
    case UBLOCK_GREEK:
    case UBLOCK_GREEK_EXTENDED:
        return UBLOCK_GREEK;
    case UBLOCK_CYRILLIC:
    case UBLOCK_CYRILLIC_SUPPLEMENTARY:
        return UBLOCK_CYRILLIC;
    default:
        return code;
    }
}

/*
 * Maximum number of characters per token for a given (folded) block:
 * 3 for Basic Latin, Greek, Cyrillic, Hiragana and Katakana; 2 for
 * everything else.
 */
static int
decide_n(UBlockCode block_code)
{
    const int wants_three =
        block_code == UBLOCK_BASIC_LATIN ||
        block_code == UBLOCK_GREEK ||
        block_code == UBLOCK_CYRILLIC ||
        block_code == UBLOCK_HIRAGANA ||
        block_code == UBLOCK_KATAKANA;

    return wants_three ? 3 : 2;
}

static rast_error_t *
utf8_get_token(rast_tokenizer_t *tokenizer, rast_token_t *token)
{
    const char *ptr = tokenizer->ptr;
    const char *ptr_end = tokenizer->ptr_end;
    const char *p;
    UChar32 ch, ch2, ch3;
    UBlockCode block_code;
    int nbytes, n;

    if (ptr >= ptr_end) {
        return rast_error(RAST_ERROR_CURSOR, "out of cursor");
    }
    token->nchars = 0;
    p = ptr;
    p += utf8_get_uchar32(p, ptr_end, &ch);
    token->nchars++;
    if (p >= ptr_end) {
        token->nbytes = p - ptr;
        token->is_complete = 0;
        return RAST_OK;
    }
    block_code = get_ublock_code(ch);
    n = decide_n(block_code);
    p += utf8_get_uchar32(p, ptr_end, &ch2);
    token->nchars++;
    if (!(u_isalnum(ch) || ch == UNICODE_SPACE) ||
        !(u_isalnum(ch2) || ch2 == UNICODE_SPACE) ||
        get_ublock_code(ch2) != block_code) {
        token->nbytes = p - ptr;
        token->is_complete = 1;
        return RAST_OK;
    }
    while (token->nchars < n) {
        if (p >= ptr_end) {
            token->nbytes = p - ptr;
            token->is_complete = 0;
            return RAST_OK;
        }
        nbytes = utf8_get_uchar32(p, ptr_end, &ch3);
        if (!(u_isalnum(ch3) || ch3 == UNICODE_SPACE) ||
            get_ublock_code(ch3) != block_code) {
            token->nbytes = p - ptr;
            token->is_complete = 1;
            return RAST_OK;
        }
        p += nbytes;
        token->nchars++;
    }
    token->nbytes = p - ptr;
    token->is_complete = 1;
    return RAST_OK;
}

/*
 * Report how far to advance to reach the next character: always one
 * character, and as many bytes as utf8_get_char_len() reports for the
 * character at the cursor.  Returns whatever utf8_get_char_len() returns.
 */
static rast_error_t *
utf8_get_next_offset(rast_tokenizer_t *tokenizer,
                     rast_size_t *byte_offset, rast_size_t *char_offset)
{
    rast_error_t *result;

    *char_offset = 1;
    result = utf8_get_char_len(tokenizer, byte_offset);
    return result;
}

/*
 * Convert src_len bytes of UTF-8 at src to UTF-16 in a buffer allocated
 * from pool.
 *
 * ICU is called twice: once with no buffer to learn the required length,
 * then again to fill a buffer of exactly that many UChars (so the result
 * is not NUL-terminated).  On success *dst and *dst_len are set and
 * U_ZERO_ERROR is returned; otherwise the ICU status is returned and the
 * outputs are left untouched.
 *
 * NOTE(review): any status other than the ones explicitly accepted below
 * is treated as an error.  An empty input presumably makes the sizing
 * call report U_STRING_NOT_TERMINATED_WARNING, which would be returned
 * as a failure here - confirm against the ICU documentation.
 */
static UErrorCode
convert_utf8_to_utf16(apr_pool_t *pool,
                      const char *src, rast_size_t src_len,
                      UChar **dst, int32_t *dst_len)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t needed;
    UChar *buf;

    /* Sizing pass: overflow is the expected way to learn the length. */
    u_strFromUTF8(NULL, 0, &needed, src, src_len, &status);
    if (!(status == U_ZERO_ERROR || status == U_BUFFER_OVERFLOW_ERROR)) {
        return status;
    }

    buf = (UChar *) apr_palloc(pool, sizeof(UChar) * needed);
    status = U_ZERO_ERROR;
    u_strFromUTF8(buf, needed, NULL, src, src_len, &status);
    if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
        *dst = buf;
        *dst_len = needed;
        return U_ZERO_ERROR;
    }
    return status;
}

/*
 * Convert src_len UChars of UTF-16 at src to UTF-8 in a buffer allocated
 * from pool.
 *
 * ICU is called once with no buffer to learn the required length, then
 * again with a buffer one byte larger so that the result is
 * NUL-terminated.  On success *dst receives the buffer, *dst_len the
 * length without the terminator, and U_ZERO_ERROR is returned; otherwise
 * the ICU status is returned and the outputs are left untouched.
 */
static UErrorCode
convert_utf16_to_utf8(apr_pool_t *pool,
                      const UChar *src, int32_t src_len,
                      char **dst, rast_size_t *dst_len)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t needed;
    char *buf;

    /* Sizing pass: overflow is the expected way to learn the length. */
    u_strToUTF8(NULL, 0, &needed, src, src_len, &status);
    if (!(status == U_ZERO_ERROR || status == U_BUFFER_OVERFLOW_ERROR)) {
        return status;
    }

    buf = (char *) apr_palloc(pool, needed + 1);
    status = U_ZERO_ERROR;
    u_strToUTF8(buf, needed + 1, NULL, src, src_len, &status);
    if (status == U_ZERO_ERROR) {
        *dst = buf;
        *dst_len = needed;
    }
    return status;
}

/*
 * Apply NFKC normalization to src_len UChars at src, writing the result
 * to a buffer allocated from pool.
 *
 * unorm_normalize() is called once with no buffer to learn the required
 * length, then again to fill a buffer of exactly that size (so the
 * result is not NUL-terminated).  On success *dst and *dst_len are set
 * and U_ZERO_ERROR is returned; otherwise the ICU status is returned and
 * the outputs are left untouched.
 */
static UErrorCode
normalize_utf16(apr_pool_t *pool,
                const UChar *src, int32_t src_len,
                UChar **dst, int32_t *dst_len)
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t needed;
    UChar *buf;

    /* Sizing pass: overflow is the expected way to learn the length. */
    needed = unorm_normalize(src, src_len, UNORM_NFKC, 0, NULL, 0, &status);
    if (!(status == U_ZERO_ERROR || status == U_BUFFER_OVERFLOW_ERROR)) {
        return status;
    }

    buf = (UChar *) apr_palloc(pool, sizeof(UChar) * needed);
    status = U_ZERO_ERROR;
    unorm_normalize(src, src_len, UNORM_NFKC, 0, buf, needed, &status);
    if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING) {
        *dst = buf;
        *dst_len = needed;
        return U_ZERO_ERROR;
    }
    return status;
}

/*
 * Copy src to a new buffer allocated from pool, replacing every run of
 * one or more whitespace code units (per u_isWhitespace) with a single
 * U+0020.  All other code units are copied unchanged.
 *
 * The buffer is sized for src_len UChars, which is always enough because
 * the output is never longer than the input.  *dst receives the buffer
 * and *dst_len the number of UChars written.
 */
static void
cut_extra_spaces(apr_pool_t *pool,
                 const UChar *src, int32_t src_len,
                 UChar **dst, int32_t *dst_len)
{
    UChar *out, *wp;
    int32_t i;
    int in_space_run = 0;

    out = (UChar *) apr_palloc(pool, sizeof(UChar) * src_len);
    wp = out;
    for (i = 0; i < src_len; i++) {
        if (u_isWhitespace(src[i])) {
            /* Emit one space for the first unit of a run, skip the rest. */
            if (!in_space_run) {
                *wp++ = UNICODE_SPACE;
            }
            in_space_run = 1;
        }
        else {
            *wp++ = src[i];
            in_space_run = 0;
        }
    }
    *dst = out;
    *dst_len = wp - out;
}

/*
 * Normalize UTF-8 text: convert it to UTF-16, run normalize_utf16() and
 * cut_extra_spaces() on it, and convert the result back to UTF-8.
 *
 * pool     - owns the returned string
 * src      - input bytes (src_len of them)
 * dst      - receives the normalized string, allocated from pool
 * dst_len  - receives its length in bytes
 *
 * If any step fails, the output is an unmodified copy of the input.
 *
 * All intermediate UTF-16 buffers live in a sub-pool that is destroyed
 * before returning, so only the final string is charged to pool.
 */
static void
utf8_normalize_text(apr_pool_t *pool,
                    const char *src, rast_size_t src_len,
                    char **dst, rast_size_t *dst_len)
{
    apr_pool_t *sub_pool;
    UChar *usrc, *udst;
    int32_t usrc_len, udst_len;
    UErrorCode err;

    apr_pool_create(&sub_pool, pool);
    err = convert_utf8_to_utf16(sub_pool, src, src_len, &usrc, &usrc_len);
    if (err != U_ZERO_ERROR) {
        goto fallback;
    }
    /* The normalized buffer is only an intermediate result, so it belongs
     * in the scratch pool rather than in the caller's pool. */
    err = normalize_utf16(sub_pool, usrc, usrc_len, &udst, &udst_len);
    if (err != U_ZERO_ERROR) {
        goto fallback;
    }
    cut_extra_spaces(sub_pool, udst, udst_len, &udst, &udst_len);
    err = convert_utf16_to_utf8(pool, udst, udst_len, dst, dst_len);
    if (err != U_ZERO_ERROR) {
        goto fallback;
    }
    apr_pool_destroy(sub_pool);
    return;

fallback:
    /* Best effort: hand back the input unchanged. */
    apr_pool_destroy(sub_pool);
    *dst = apr_pstrndup(pool, src, src_len);
    *dst_len = src_len;
}

/*
 * Lower-case UTF-8 text: convert it to UTF-16, apply u_strToLower(), and
 * convert the result back to UTF-8.
 *
 * pool     - owns the returned string
 * src      - input bytes (src_len of them)
 * dst      - receives the lower-cased string, allocated from pool
 * dst_len  - receives its length in bytes
 *
 * If any step fails, the output is an unmodified copy of the input.
 * Intermediate UTF-16 buffers live in a sub-pool destroyed before return.
 *
 * NOTE(review): u_strToLower() is called with a NULL locale, which per
 * the ICU documentation selects the default locale, so the result may
 * depend on the environment - confirm this is intended.
 */
static void
utf8_normalize_chars(apr_pool_t *pool,
                     const char *src, rast_size_t src_len,
                     char **dst, rast_size_t *dst_len)
{
    apr_pool_t *scratch;
    UChar *wide, *lower;
    int32_t wide_len, lower_len;
    UErrorCode status;
    int converted = 0;

    apr_pool_create(&scratch, pool);
    status = convert_utf8_to_utf16(scratch, src, src_len, &wide, &wide_len);
    if (status == U_ZERO_ERROR) {
        /* Sizing pass: overflow is the expected way to learn the length. */
        lower_len = u_strToLower(NULL, 0, wide, wide_len, NULL, &status);
        if (status == U_ZERO_ERROR || status == U_BUFFER_OVERFLOW_ERROR) {
            lower = apr_palloc(scratch, sizeof(UChar) * lower_len);
            status = U_ZERO_ERROR;
            u_strToLower(lower, lower_len, wide, wide_len, NULL, &status);
            if (status == U_ZERO_ERROR ||
                status == U_STRING_NOT_TERMINATED_WARNING) {
                status = convert_utf16_to_utf8(pool, lower, lower_len,
                                               dst, dst_len);
                converted = (status == U_ZERO_ERROR);
            }
        }
    }
    apr_pool_destroy(scratch);

    if (!converted) {
        /* Best effort: hand back the input unchanged. */
        *dst = apr_pstrndup(pool, src, src_len);
        *dst_len = src_len;
    }
}

/*
 * Decode the character described by ch (ch->nbytes bytes at ch->ptr) and
 * return the result of u_isspace() for it.  Undecodable input is treated
 * as '?' by utf8_get_uchar32().
 */
static int
utf8_char_is_space(rast_char_t *ch)
{
    const unsigned char *first = (const unsigned char *) ch->ptr;
    const unsigned char *limit = first + ch->nbytes;
    UChar32 code_point;

    utf8_get_uchar32(first, limit, &code_point);
    return u_isspace(code_point);
}

/*
 * The UTF-8 encoding module: the encoding name followed by the handlers
 * defined in this file.  The initializer is positional, so the order must
 * match the member order of rast_encoding_module_t (declared elsewhere).
 */
rast_encoding_module_t rast_encoding_utf8 = {
    "UTF-8",
    utf8_get_char_len,
    utf8_get_token,
    utf8_get_next_offset,
    utf8_normalize_text,
    utf8_normalize_chars,
    utf8_char_is_space,
};

/* vim: set filetype=c sw=4 expandtab : */
