/*
 * Conversion from JUNET encoding according to RFC 1468
 *
 * SPDX-FileType: SOURCE
 * SPDX-FileCopyrightText: Michael Bäuerle
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <assert.h>
#include <stddef.h>
#include <string.h>

#include "libjpiconv-0/iconv.h"  /* Always include main header file first */
#include "libjpiconv-0/iconv_bool.h"
#include "libjpiconv-0/iconv_errno.h"
#include "libjpiconv-0/iconv_name.h"
#include "libjpiconv-0/iconv_rfc1468.h"
#include "libjpiconv-0/iconv_table.h"
#include "libjpiconv-0/iconv_table_ni.h"
#include "libjpiconv-0/iconv_utf8.h"


/*
 * Encodings for ISO 2022 decoder
 *
 * The numbers for valid encodings match the ones in the ISO registry.
 *
 * Attention:
 * A C90 compiler is not required to support more than 31 significant
 * characters for an internal identifier.
 */
/*
 * Data type for encodings
 *
 * Every valid enumerator carries the number of the corresponding entry in the
 * ISO registry. Zero is reserved for "no valid encoding".
 */
typedef enum jpic0_i_enc
{
    /* Placeholder for an unknown or unsupported encoding */
    jpic0_i_state_invalid         = 0,

    /* Single octet encodings (handled by the ISO 646 decoder) */
    jpic0_i_state_ascii           = 6,   /* Selected with ESC ( B */
    jpic0_i_state_iso_646_jp      = 14,  /* Selected with ESC ( J */

    /* Two octet encodings (handled by the JIS X 208 decoder) */
    jpic0_i_state_jis_x_208_1978  = 42,  /* Selected with ESC $ @ */
    jpic0_i_state_jis_x_208_1983  = 87   /* Selected with ESC $ B */
}
jpic0_i_enc;


/* State of ISO 2022 decoder */
/*
 * Data type for state
 *
 * Attention:
 * The order of the members must not be changed, an aggregate initializer
 * with positional values is used for objects of this type.
 */
typedef struct jpic0_i_state
{
    /* Buffers and remaining sizes (the sizes are owned by the caller) */
    char         *inarray;       /* Input data */
    size_t       *inlen;         /* Number of octets not yet consumed */
    char         *outarray;      /* Output buffer */
    size_t       *outlen;        /* Number of octets still free */

    /* Snapshots used to calculate the current buffer positions */
    size_t        inlen_start;   /* Initial value of *inlen */
    size_t        outlen_start;  /* Initial value of *outlen */

    /* Bookkeeping of the decoder */
    size_t        nonident;      /* Number of non-identical conversions */
    int           flag;          /* Conversion flags (JPIC0_ICONV_* bits) */
    jpic0_i_enc   encoding;      /* Currently selected encoding */
    jpic0_i_bool  resync;        /* Lost synchronization */
    jpic0_i_bool  abort;         /* Abort conversion before NUL */
}
jpic0_i_state;


/* ========================================================================== */
/*
 * Check whether character 'c' is a NUL control character
 *
 * Abort conversion if this is the case (or continue on user request).
 * Input and output buffers are never modified.
 *
 * Returns zero (false / no error) for accept.
 */
static jpic0_i_bool jpic0_i_check_nul(jpic0_i_state *state, const char c)
{
    /* Everything except NUL is accepted unconditionally */
    if ((const char)0x00 != c)
        return 0;

    /* NUL is accepted too, if the caller requested to ignore it */
    if (JPIC0_ICONV_IGNORE_NULL & state->flag)
        return 0;

    /* Reject NUL and request termination of the conversion */
    state->abort = 1;
    return 1;
}


/* ========================================================================== */
/*
 * Nonidentical conversion
 *
 * Used for invalid or unknown input data on request.
 * Consumes one octet from input buffer on success (the caller must ensure it
 * is present).
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_nonident(jpic0_i_state *state)
{
    /* Size of the replacement sequence (terminating NUL is not copied) */
    static const size_t ni_size = sizeof JPIC0_I_NONIDENT - 1U;
    const size_t        out_pos = state->outlen_start - *(state->outlen);

    /* Without permission for replacements the input is an illegal sequence */
    if ( !(JPIC0_ICONV_REPLACE_INVALID & state->flag) )
    {
        errno = JPIC0_I_EILSEQ;
        return 1;
    }

    /* The replacement must fit completely into the remaining output space */
    if (*(state->outlen) < ni_size)
    {
        errno = JPIC0_I_E2BIG;
        return 1;
    }

    /* Consume one input octet (presence must be ensured by the caller) */
    assert(0U < *(state->inlen));
    --(*(state->inlen));

    /* Emit replacement and account for it */
    (void)memcpy(&(state->outarray)[out_pos], JPIC0_I_NONIDENT, ni_size);
    *(state->outlen) -= ni_size;
    ++(state->nonident);

    return 0;
}


/* ========================================================================== */
/*
 * Replacement conversion
 *
 * The parameter "repl" must be the UTF-8 sequence used as replacement.
 * Consumes one octet from input buffer on success (the caller must ensure it
 * is present).
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_replace(jpic0_i_state *state, const char *repl)
{
    const size_t repl_size = strlen(repl);
    const size_t out_pos   = state->outlen_start - *(state->outlen);

    /* Refuse to write a partial replacement */
    if (*(state->outlen) < repl_size)
    {
        errno = JPIC0_I_E2BIG;
        return 1;
    }

    /* Consume one input octet (presence must be ensured by the caller) */
    assert(0U < *(state->inlen));
    --(*(state->inlen));

    /* Append replacement (without terminating NUL) to the output */
    (void)memcpy(&(state->outarray)[out_pos], repl, repl_size);
    *(state->outlen) -= repl_size;

    return 0;
}


/* ========================================================================== */
/*
 * Decode "kuten" with mapping table
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_decode_jis_x_208_kuten(jpic0_i_state *state,
                                                   const long int kuten)
{
    char     utf8[JPIC0_I_UTF8_BUFLEN] = { 0 };
    long int codepoint                 = -1L;  /* -1 means "not found" */
    size_t   row;

    /* Linear search, the table ends with an entry that has a kuten of -1 */
    for (row = 0; -1L != jpic0_i_iso2022_jp_table[row].jis; ++row)
    {
        if (kuten == jpic0_i_iso2022_jp_table[row].jis)
        {
            codepoint = jpic0_i_iso2022_jp_table[row].uc;
            break;
        }
    }

    /* Without mapping a nonidentical conversion is used (on request) */
    if (-1L == codepoint)
        return jpic0_i_nonident(state);

    /* A failure here indicates an invalid codepoint in the mapping table */
    if (jpic0_i_encode_utf8(utf8, codepoint))
    {
        errno = JPIC0_I_EILSEQ;
        return 1;
    }

    /* Write the UTF-8 sequence (consumes one octet of the input) */
    return jpic0_i_replace(state, utf8);
}


/* ========================================================================== */
/*
 * Conversion from JIS X 208-1978 or JIS X 208-1983 to Unicode (UTF-8)
 *
 * For better error tolerance, the codepoints from JIS X 208-1983 are not
 * rejected if the escape sequence for JIS X 208-1978 is used.
 *
 * The codepoints are two-octet sequences. They form a 94x94 character table:
 * <https://www.sljfaq.org/afaq/encodings.html#encodings-JIS-X-0208>
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_decode_jis_x_208(jpic0_i_state *state)
{
    /*
     * Process octet pairs until the input is exhausted, an escape sequence
     * is found, synchronization is lost or the conversion is aborted.
     * Leaving the loop with "break" reports success, the caller must check
     * the "resync" and "abort" members of the state afterwards.
     */
    while (*(state->inlen))
    {
        /* Position of the next octet that was not consumed yet */
        const size_t index_in = state->inlen_start - *(state->inlen);

        if (1U == *(state->inlen))
        {
            /* Truncated kuten at end of input data => Check for 0x00 (NUL) */
            if (jpic0_i_check_nul(state, state->inarray[index_in]))
                break;

            /*
             * A single octet cannot form a kuten. Report an incomplete
             * sequence (EINVAL) instead of an illegal sequence (EILSEQ) if
             * replacement was not requested. Other error codes are passed
             * through unchanged.
             */
            if (jpic0_i_nonident(state))
            {
                if (JPIC0_I_EILSEQ == errno)
                    errno = JPIC0_I_EINVAL;
                return 1;
            }
        }
        else
        {
            /* At least two octets are available at this point */
            const unsigned char ku  = state->inarray[index_in];
            const unsigned char ten = state->inarray[index_in + 1U];

            /* Both octets of a valid kuten are in the range 0x21 to 0x7E */
            if ( ((const unsigned char)0x21 > ku)  ||
                 ((const unsigned char)0x7E < ku)  ||
                 ((const unsigned char)0x21 > ten) ||
                 ((const unsigned char)0x7E < ten) )
            {
                /* Invalid kuten => Check for escape sequence */
                /* Nothing is consumed, ESC is left in place for the caller */
                if ((const unsigned char)0x1B == ku)
                    break;

                /* Check for garbage octet before escape sequence */
                /* Nothing is consumed here, only resync is requested */
                if ((const unsigned char)0x1B == ten)
                {
                    state->resync = 1;
                    break;
                }

                /* Check for 0x00 (NUL) octet */
                /*
                 * NOTE(review): This check is only executed if both octets
                 * are NUL. A pair with NUL in only one position is not
                 * checked here and handled by the nonidentical conversion
                 * below. Confirm that this is intended.
                 */
                if ( ((const unsigned char)0x00 == ku) &&
                     ((const unsigned char)0x00 == ten) )
                {
                    if (jpic0_i_check_nul(state, 0x00))
                        break;
                }

                /* Check for CRLF (missing switch to single byte encoding) */
                /* Nothing is consumed here, only resync is requested */
                if ( ((const unsigned char)0x0D == ku) &&
                     ((const unsigned char)0x0A == ten) )
                {
                    state->resync = 1;
                    break;
                }

                /* One replacement for the pair (consumes ku on success) */
                if (jpic0_i_nonident(state))
                    return 1;
            }
            else
            {
                /* Combine both octets to the key used by the mapping table */
                long int kuten = ((long int)ku << 8) | (long int)ten;

                if (jpic0_i_decode_jis_x_208_kuten(state, kuten))
                    return 1;
            }
            /* Conversion has already consumed ku, consume ten too */
            assert(0U < *(state->inlen));
            --(*(state->inlen));
        }
    }

    return 0;
}


/* ========================================================================== */
/*
 * Conversion from ISO 646-US or ISO 646-JP to Unicode (UTF-8)
 *
 * If parameter "jp" is true, then the Japanese variant of ISO 646 is used.
 * Otherwise the US variant of ISO 646 is used.
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_decode_iso_646(jpic0_i_state *state,
                                           jpic0_i_bool jp)
{
    while (0U != *(state->inlen))
    {
        const size_t pos_in  = state->inlen_start  - *(state->inlen);
        const size_t pos_out = state->outlen_start - *(state->outlen);
        const char   octet   = state->inarray[pos_in];
        const char  *repl    = NULL;  /* Replacement for japanese variant */

        /* ESC (0x1B) is not consumed, it is left in place for the caller */
        if ((const char)0x1B == octet)
            break;

        /* Octets with the most significant bit set are not part of ISO 646 */
        if ((const unsigned char)0x80 & (const unsigned char)octet)
        {
            if (jpic0_i_nonident(state))
                return 1;
            continue;
        }

        /* The japanese variant differs at the codepoints 0x5C and 0x7E */
        if (jp)
        {
            if ((const char)0x5C == octet)
                repl = JPIC0_I_ISO646JP_YEN_SIGN;
            else if ((const char)0x7E == octet)
                repl = JPIC0_I_ISO646JP_OVERLINE;
        }
        if (NULL != repl)
        {
            if (jpic0_i_replace(state, repl))
                return 1;
            continue;
        }

        /* All remaining codepoints are copied literally, NUL is checked */
        if (jpic0_i_check_nul(state, octet))
            break;

        if (0U == *(state->outlen))
        {
            /* Output buffer is full */
            errno = JPIC0_I_E2BIG;
            return 1;
        }

        assert(0U < *(state->inlen));
        --(*(state->inlen));

        state->outarray[pos_out] = octet;
        --(*(state->outlen));
    }

    return 0;
}


/* ========================================================================== */
/*
 * Conversion to Unicode (UTF-8).
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_decode(jpic0_i_state *state)
{
    /* Dispatch to the decoder for the currently selected encoding */
    switch (state->encoding)
    {
        case jpic0_i_state_ascii:
            return jpic0_i_decode_iso_646(state, 0);

        case jpic0_i_state_iso_646_jp:
            return jpic0_i_decode_iso_646(state, 1);

        /* Both revisions of JIS X 208 share the same decoder */
        case jpic0_i_state_jis_x_208_1978:
        case jpic0_i_state_jis_x_208_1983:
            return jpic0_i_decode_jis_x_208(state);

        case jpic0_i_state_invalid:
        default:
            break;
    }

    /* Character set not supported */
    errno = JPIC0_I_EBADF;
    return 1;
}


/* ========================================================================== */
/*
 * Synchronize to next escape sequence.
 *
 * A nonidentical conversion is executed for every input octet that is not ESC
 * (but always for the first octet), if requested by caller.
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
static jpic0_i_bool jpic0_i_resync(jpic0_i_state *state)
{
    jpic0_i_bool skipped = 0;  /* True after the first octet was replaced */

    while (0U != *(state->inlen))
    {
        const char octet = state->inarray[state->inlen_start
                                          - *(state->inlen)];

        /* Stop in front of ESC, except if it is the very first octet */
        if (skipped && ((const char)0x1B == octet))
            break;

        /* Stop in front of NUL (unless the caller requested to ignore it) */
        if (jpic0_i_check_nul(state, octet))
            break;

        /* Replace the current octet with a nonidentical conversion */
        skipped = 1;
        if (jpic0_i_nonident(state))
            return 1;
    }

    return 0;
}


/* ========================================================================== */
/*
 * Decode escape sequence and switch encoding.
 *
 * Returns zero on success (nothing to do or successful switch to new encoding).
 * Returns nonzero on error (and sets 'resync' flag in '*state').
 */
static jpic0_i_bool jpic0_i_switch_encoding(jpic0_i_state *state)
{
    size_t        pos;
    unsigned int  designator;

    /* Nothing to do without input data */
    if (0U == *(state->inlen))
        return 0;

    /* Nothing to do if the next octet is not ESC */
    pos = state->inlen_start - *(state->inlen);
    if (0x1B != (unsigned char)state->inarray[pos])
        return 0;

    /* All supported escape sequences have a length of three octets */
    if (3U > *(state->inlen))
    {
        /* Invalid (truncated) escape sequence at end of input data */
        state->resync = 1;
        return 1;
    }

    /* Combine second and third octet for a single comparison */
    designator
        = ((unsigned int)(unsigned char)state->inarray[pos + 1U] << 8)
          | (unsigned int)(unsigned char)state->inarray[pos + 2U];

    switch (designator)
    {
        case 0x2842U:  /* ESC ( B */
            state->encoding = jpic0_i_state_ascii;
            break;
        case 0x284AU:  /* ESC ( J */
            state->encoding = jpic0_i_state_iso_646_jp;
            break;
        case 0x2440U:  /* ESC $ @ */
            state->encoding = jpic0_i_state_jis_x_208_1978;
            break;
        case 0x2442U:  /* ESC $ B */
            state->encoding = jpic0_i_state_jis_x_208_1983;
            break;
        default:
            /* Unknown escape sequence, nothing is consumed */
            state->resync = 1;
            return 1;
    }

    /* Consume the escape sequence */
    *(state->inlen) -= 3U;

    return 0;
}


/* ========================================================================== */
/*
 * Conversion from JUNET (RFC 1468) encoding to Unicode (UTF-8).
 *
 * See API documentation for parameters other than 'nonident'.
 * On success the number of non-identical conversions is written at location
 * 'nonident'.
 *
 * Returns zero (false / no error) on success, 'errno' will be set otherwise.
 */
jpic0_i_bool jpic0_i_rfc1468(char *inarray, size_t *inlen,
                             char *outarray, size_t *outlen,
                             int flag, size_t *nonident)
{
    /*
     * All members are listed explicitly (clang complains about { NULL }).
     * The real values are assigned below, because C90 does not allow
     * non-constant expressions in an aggregate initializer.
     */
    jpic0_i_state state = { NULL, NULL, NULL, NULL, 0, 0, 0, 0, 0, 0, 0 };

    /* Buffers and sizes provided by the caller */
    state.inarray      = inarray;
    state.inlen        = inlen;
    state.outarray     = outarray;
    state.outlen       = outlen;
    state.inlen_start  = *inlen;
    state.outlen_start = *outlen;

    /* The decoder starts synchronized and with ASCII selected */
    state.nonident     = 0;
    state.flag         = flag;
    state.encoding     = jpic0_i_state_ascii;
    state.resync       = 0;
    state.abort        = 0;

    while (0U != *(state.inlen))
    {
        if (state.resync)
        {
            /* Lost synchronization, skip to the next escape sequence */
            if (jpic0_i_resync(&state))
                return 1;
            state.resync = 0;
        }
        else if (!jpic0_i_switch_encoding(&state))
        {
            /* No (or a valid) escape sequence, decode payload */
            if (jpic0_i_decode(&state))
                return 1;
        }
        /* Otherwise the escape sequence was invalid, resync on next pass */

        /* Stop if a NUL control character was rejected */
        if (state.abort)
            break;
    }

    /*
     * Quoted from RFC 1468: "[...], the text must end in ASCII"
     * This is only enforced if replacement of invalid data is not requested.
     */
    if ( !(JPIC0_ICONV_REPLACE_INVALID & state.flag) &&
         (jpic0_i_state_ascii != state.encoding) )
    {
        /* Invalid decoder state at end of data */
        errno = JPIC0_I_EINVAL;
        return 1;
    }

    *nonident = state.nonident;
    return 0;
}
