/*
 * parse.c - parse a sentence
 *
 * Copyright (C) 1996, 1997, 2000, 2001, 
 *                            Nara Institute of Science and Technology
 *                           
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by Nara Institute of 
 *      Science and Technology.
 * 4. The name Nara Institute of Science and Technology may not be used to
 *    endorse or promote products derived from this software without specific
 *    prior written permission.
 *    
 *
 * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology 
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE Nara Institute
 * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * Modified by: A.Kitauchi <akira-k@is.aist-nara.ac.jp>, Oct. 1996
 * $Id: parse.c,v 1.49 2003/06/10 07:29:45 kazuma-t Exp $
 */

#include "chalib.h"
#include "literal.h"
#include "tokenizer.h"
#include "dartsdic.h"

/*
 * Capacity of the fixed-size work arrays used in this file (the path index
 * list handed to check_connect() and the per-state tables inside it).
 */
#define PATH1_NUM		256

/* True when c is a space or a horizontal tab. */
#define is_spc(c)    ((c)==' '||(c)=='\t')

/* Block holding the mrph2_t entries created while a sentence is parsed. */
cha_block_t *Cha_mrph_block;
/* Array of lattice paths; sized through malloc_free_path(). */
path_t *Cha_path = NULL;
/* Number of entries of Cha_path currently in use. */
int Cha_path_num;

/*
 * Shorthands for working on Cha_mrph_block:
 *   new_mrph()      - obtain a new item via cha_block_new_item()
 *   nth_mrph(n)     - item n, viewed as a mrph2_t pointer
 *   mrph_last_idx() - item count minus one, i.e. index of the last item
 *   pop_mrph()      - cha_block_pop() on the block
 */
#define new_mrph() cha_block_new_item(Cha_mrph_block)
#define nth_mrph(n) ((mrph2_t*)cha_block_get_item(Cha_mrph_block,(n)))
#define mrph_last_idx() (cha_block_num(Cha_mrph_block)-1)
#define pop_mrph() cha_block_pop(Cha_mrph_block)

/*
 * malloc_chars - bump allocator for small per-sentence scratch arrays
 *
 * Memory is carved out of chunks of CHA_MALLOC_SIZE bytes obtained with
 * cha_malloc(); at most CHUNK_SIZE chunks are kept.  Single allocations
 * are never released: free_chars() frees every chunk except the first one
 * and rewinds the allocator to the start of that chunk.
 *
 *   size   - element size in bytes; a size greater than 1 is also used as
 *            the alignment of the result and must be a power of two
 *   nitems - number of elements; 0 is the "free everything" request
 *
 * Returns a pointer to size * nitems bytes, or NULL for the free request.
 * Calls cha_exit() when the request cannot be satisfied.
 */
#define CHUNK_SIZE 512
#define CHA_MALLOC_SIZE (1024 * 64)
#define malloc_char(n)     malloc_chars(1, n)
#define malloc_short(n)    malloc_chars(2, n)
#define malloc_int(n)      malloc_chars(4, n)
#define free_chars()       malloc_chars(0, 0)
static void *
malloc_chars(int size, int nitems)
{
    static char *buffer_ptr[CHUNK_SIZE];
    static int buffer_ptr_num = 0;
    /* starts "full" so that the first request allocates a chunk */
    static int buffer_idx = CHA_MALLOC_SIZE;

    if (nitems == 0) {
        /*
         * free: keep the first chunk for reuse, drop the others
         */
        if (buffer_ptr_num > 0) {
            while (buffer_ptr_num > 1)
                free(buffer_ptr[--buffer_ptr_num]);
            buffer_idx = 0;
        }
        return NULL;
    }

    /*
     * A request that does not fit into one chunk can never be served;
     * reject it instead of writing past the end of a fresh chunk.
     * cha_exit() is relied on not to return here, exactly as the
     * chunk-limit check below already does.
     */
    if (size < 1 || nitems < 0 || nitems > CHA_MALLOC_SIZE / size)
        cha_exit(1, "Can't allocate memory");

    if (size > 1) {
        /*
         * Round the offset up to a multiple of size.  An offset that is
         * already aligned is left alone, so no bytes are skipped needlessly.
         */
        buffer_idx = (buffer_idx + size - 1) & ~(size - 1);
        nitems *= size;
    }

    /* from here on nitems is a byte count */
    if (buffer_idx + nitems >= CHA_MALLOC_SIZE) {
        if (buffer_ptr_num == CHUNK_SIZE)
            cha_exit(1, "Can't allocate memory");
        buffer_ptr[buffer_ptr_num++] = cha_malloc(CHA_MALLOC_SIZE);
        buffer_idx = 0;
    }

    buffer_idx += nitems;
    return buffer_ptr[buffer_ptr_num - 1] + buffer_idx - nitems;
}

/*
 * malloc_free_block - grow or reset a heap array measured in blocks
 *
 *   ptr     - the current array (ignored while *nblockp is 0)
 *   nblockp - number of blocks currently allocated; updated in place
 *   size    - size of one block in bytes
 *   do_free - non-zero: shrink back to exactly one block
 *             zero:     enlarge the array by one block
 *
 * Returns the array (it may have moved), or NULL when allocation fails.
 * After a failure nothing stays allocated and *nblockp is 0, so the next
 * call simply starts again from an empty array.
 */
static void *
malloc_free_block(void *ptr, int *nblockp, int size, int do_free)
{
    void *grown;

    if (do_free) {
        /*
         * More than one block: release everything and fall through to a
         * fresh single-block allocation.  Exactly one block: keep it.
         */
        if (*nblockp > 1) {
            free(ptr);
            ptr = NULL;
            *nblockp = 0;
        }
        if (*nblockp == 0)
            ptr = malloc_free_block(NULL, nblockp, size, 0);
        return ptr;
    }

    if (*nblockp == 0) {
        /* first block; count it only once it really exists */
        ptr = malloc((size_t) size);
        if (ptr != NULL)
            *nblockp = 1;
        return ptr;
    }

    /*
     * One block larger.  realloc() leaves the old array allocated when it
     * fails, so go through a temporary and release the old array
     * explicitly instead of losing the only pointer to it.
     */
    grown = realloc(ptr, (size_t) size * ((size_t) *nblockp + 1));
    if (grown == NULL) {
        free(ptr);
        *nblockp = 0;
        return NULL;
    }
    ++*nblockp;

    return grown;
}

/*
 * malloc_free_path - resize the global Cha_path array
 *
 * malloc_path() enlarges Cha_path by one block of CHA_PATH_NUM entries,
 * free_path() brings it back to a single block.
 * Returns non-zero when Cha_path ended up NULL.
 */
#define malloc_path()  malloc_free_path(0)
#define free_path()    malloc_free_path(1)
static int
malloc_free_path(int do_free)
{
    static int nblock = 0;
    const int block_bytes = sizeof(path_t) * CHA_PATH_NUM;
    void *resized;

    resized = malloc_free_block(Cha_path, &nblock, block_bytes, do_free);
    Cha_path = resized;

    if (resized == NULL)
        return 1;
    return 0;
}

/*
 * register_undef_mrph1 - append one undefined-word entry to the morpheme
 *                        buffer
 *
 * The entry starts at target, has length undef_len and takes its part of
 * speech and connection table from Cha_undef_info[no].
 * Returns the index of the last entry in the buffer.
 */
static int
register_undef_mrph1(char *target, int undef_len, int no)
{
    mrph2_t *undef = new_mrph();

    /* surface string and its length */
    undef->midasi = target;
    undef->length = undef_len;
    undef->base_length = undef_len;

    /* an undefined word carries no reading, base form or extra info */
    undef->yomi = undef->info = "";
    undef->base = undef->pron = "";
    undef->compound = "\n";

    /* no conjugation */
    undef->kform = 0;
    undef->ktype = 0;

    undef->weight = MRPH_DEFAULT_WEIGHT;
    /* non-zero marks the entry as undefined; the value is no + 1 */
    undef->is_undef = no + 1;
    undef->con_tbl = Cha_undef_info[no].con_tbl;
    undef->hinsi = Cha_undef_info[no].hinsi;

    return mrph_last_idx();
}

/*
 * register_mrph - finish a dictionary entry, expanding conjugated forms
 *
 * new_mrph is expected to be the entry most recently obtained from
 * new_mrph() (pop_mrph() is used to discard it).
 *
 *   ktype == 0              the entry is left untouched
 *   ktype != 0, kform != 0  base_length is cleared and yomi/pron emptied
 *   ktype != 0, kform == 0  the entry is treated as a stem: one entry is
 *                           produced for every form f >= 1 of
 *                           Cha_form[ktype] whose ending (gobi) is empty
 *                           or matches the text that follows the stem.
 *                           The first match reuses new_mrph, later ones
 *                           are appended.  If no form matches, the entry
 *                           is removed again.
 *
 * Returns the index of the last entry in the buffer.
 */
static int
register_mrph(mrph2_t *new_mrph)
{
    if (!new_mrph->ktype) {
        /* not conjugated: nothing to do */
        return mrph_last_idx();
    }

    if (new_mrph->kform) {
        /* conjugated entry registered with an explicit form */
        new_mrph->base_length = 0;
        new_mrph->yomi = "";
        new_mrph->pron = "";
        return mrph_last_idx();
    }

    {
        /*
         * Stem entry.  Keep a private copy of it as the template for the
         * additional entries: new_mrph() may have to enlarge the block
         * (NOTE(review): presumably by reallocating it - confirm in
         * cha_block_new_item), after which a pointer to the first entry
         * could no longer be trusted.  Every field that differs between
         * the forms is overwritten below, so copying the pristine stem
         * gives the same entries as copying the first expanded one.
         */
        mrph2_t stem = *new_mrph;
        int ktype = stem.ktype;
        int baselen = stem.length;
        int con_tbl = stem.con_tbl;
        char *follows = stem.midasi + baselen;
        int f;

        for (f = 1; Cha_form[ktype][f].name; f++) {
            if (Cha_form[ktype][f].gobi[0] &&
                (follows[0] != Cha_form[ktype][f].gobi[0] ||
                 memcmp(follows, Cha_form[ktype][f].gobi,
                        Cha_form[ktype][f].gobi_len)))
                continue;

            /* the current entry is already taken by an earlier form */
            if (new_mrph->kform > 0) {
                new_mrph = new_mrph();
                *new_mrph = stem;
            }
            new_mrph->kform = f;
            new_mrph->length = baselen + Cha_form[ktype][f].gobi_len;
            new_mrph->con_tbl = con_tbl + f - 1;
        }

        /* no form matched: drop the stem entry */
        if (new_mrph->kform == 0)
            pop_mrph();
    }

    return mrph_last_idx();
}

/*
 * da_get_mrph_data - fill a morpheme entry from a dictionary record
 *
 * The surface string and length come from str/str_len, the grammatical
 * attributes from lex_data, and the string fields (yomi, pron, base, info,
 * compound) are fetched with da_get_data() using lex_data->dat_index.
 */
static void
da_get_mrph_data(darts_t* da, mrph2_t *mrph, da_lex_t *lex_data,
		 char *str, int str_len)
{
    /* a dictionary word, not an undefined one */
    mrph->is_undef = 0;

    mrph->midasi = str;
    mrph->length = str_len;
    mrph->base_length = str_len;

    mrph->con_tbl = lex_data->con_tbl;
    mrph->weight = lex_data->weight;
    mrph->hinsi = lex_data->posid;
    mrph->ktype = lex_data->inf_type;
    mrph->kform = lex_data->inf_form;

    da_get_data(da, lex_data->dat_index,
                &mrph->yomi, &mrph->pron, &mrph->base, &mrph->info,
                &mrph->compound);
}

/*
 * da_convert_mrphs - turn dictionary hits into morpheme entries
 *
 * indecies is a list of dictionary indices terminated by a negative value.
 * For each index the records are read with da_get_lex() and every record
 * is appended to the morpheme buffer and passed through register_mrph().
 * Returns the index of the last entry in the buffer.
 */
static int
da_convert_mrphs(darts_t* da, char *target, long *indecies)
{
    da_lex_t lex_data[256]; /* XXX */
    long *cur = indecies;

    while (*cur >= 0) {
        int matched_len;
        int nlex = da_get_lex(da, *cur, lex_data, &matched_len);
        int k = 0;

        while (k < nlex) {
            mrph2_t *entry = new_mrph();

            da_get_mrph_data(da, entry, &lex_data[k], target, matched_len);
            register_mrph(entry);
            k++;
        }
        cur++;
    }

    return mrph_last_idx();
}

/*
 * collect_mrphs_for_pos() - list the paths that end exactly at pos
 *
 *   pos   - position in the sentence; 0 starts a new sentence and yields
 *           only path 0
 *   p_idx - output: indices into Cha_path, terminated by -1.  It must have
 *           room for PATH1_NUM ints, as the callers' arrays do.
 *
 * Returns the number of indices stored, or -1 when more paths end at pos
 * than fit into p_idx (the list is then truncated but still terminated).
 *
 * The static p_start remembers where the scan may begin: it is advanced
 * past the leading run of paths ending at or before pos, which are not
 * looked at again until the next sentence resets it.
 */
static int
collect_mrphs_for_pos(int pos, int *p_idx)
{
    static int p_start;
    int i;
    int j = 0;

    if (pos == 0) {
        /*
         * new sentence
         */
        p_idx[j++] = 0;
        p_start = 1;
    } else {
        for (i = p_start; i < Cha_path_num; i++) {
            if (Cha_path[i].end > pos)
                continue;
            if (i == p_start)
                p_start++;
            if (Cha_path[i].end != pos)
                continue;
            /* keep one slot free for the -1 terminator */
            if (j >= PATH1_NUM - 1) {
                p_idx[j] = -1;
                return -1;
            }
            p_idx[j++] = i;
        }
    }
    p_idx[j] = -1;

    return j;
}


/*
 * check_connect() - connect one morpheme to the paths that precede it
 *
 *   pos   - start position of the morpheme
 *   m_num - index of the morpheme in the morpheme buffer
 *   p_idx - indices of candidate predecessor paths, terminated by -1
 *
 * Every predecessor that the connection automaton accepts is grouped by
 * the automaton state it leads to.  For each group one new entry is
 * appended to Cha_path; its predecessor list holds the cheapest
 * predecessor first, followed by those whose cost is within
 * Cha_cost_width of it (only the cheapest one when Cha_cost_width is
 * negative).
 *
 * Returns TRUE on success, also when nothing connects.  Returns FALSE when
 * Cha_path cannot be enlarged or when the fixed-size work tables
 * (PATH1_NUM entries) would overflow.
 */
static int
check_connect(int pos, int m_num, int *p_idx)
{
    /*
     * candidate predecessors, classified by resulting state
     */
    typedef struct _path_cost_t {
        int min_cost;           /* lowest cost seen in this group */
        short min_cost_no;      /* position of that entry in cost[]/pno[] */
        short state;            /* state shared by the group */
        short num;              /* entries used in cost[]/pno[] */
        int cost[PATH1_NUM];
        int pno[PATH1_NUM];
    } path_cost_t;

    static path_cost_t pcost[PATH1_NUM];
    int pcost_num;
    mrph2_t *new_mrph;
    int i, pno, pcostno;
    int haba_cost, con_cost, cost, mrph_cost;
    int con_tbl, next_state;

#ifdef DEBUG
    printf("[m:%d] ", m_num);
#endif
    new_mrph = nth_mrph(m_num);
    con_tbl = new_mrph->con_tbl;

    pcost[0].state = -1;
    pcost_num = 0;

    for (i = 0; (pno = p_idx[i]) >= 0; i++) {
        /*
         * ask the automaton for the next state and the connection cost;
         * a cost of -1 means the two cannot be connected
         */
        next_state = cha_check_automaton
            (Cha_path[pno].state, con_tbl, Cha_con_cost_undef, &con_cost);

        if (con_cost == -1)
            continue;

        /*
         * accumulated cost up to (not including) the new morpheme
         */
        cost = Cha_path[pno].cost + con_cost * Cha_con_cost_weight;

#ifdef DEBUG
        /* printed after cost has been computed, not before */
        printf
            ("[%3d, %3d, pos:%d, len:%d, state:%5d,%5d, cost:%d, undef:%d]\n",
             Cha_path[pno].mrph_p, m_num, pos, new_mrph->length,
             Cha_path[pno].state, next_state, cost, new_mrph->is_undef);
#endif

        /*
         * find the group this predecessor belongs to
         */
        for (pcostno = 0; pcostno < pcost_num; pcostno++)
            if (next_state == pcost[pcostno].state)
                break;
        if (pcostno < pcost_num) {
            /*
             * tricky: when Cha_cost_width is -1, ">-1" means ">=0"
             */
            if (cost - pcost[pcostno].min_cost > Cha_cost_width)
                continue;
        } else {
            /*
             * open a new group
             */
            if (pcost_num >= PATH1_NUM)
                return FALSE;
            pcost_num++;
            pcost[pcostno].num = 0;
            pcost[pcostno].state = next_state;
            pcost[pcostno].min_cost = INT_MAX;
        }

        /*
         * record the predecessor in its group
         */
        if (Cha_cost_width < 0) {
            /* only the single best predecessor is kept */
            pcost[pcostno].min_cost = cost;
            pcost[pcostno].pno[0] = pno;
        } else {
            /*
             * The local path[] built below needs room for all entries of
             * a group plus the -1 terminator, hence PATH1_NUM - 1.
             */
            if (pcost[pcostno].num >= PATH1_NUM - 1)
                return FALSE;
            pcost[pcostno].cost[pcost[pcostno].num] = cost;
            pcost[pcostno].pno[pcost[pcostno].num] = pno;
            if (cost < pcost[pcostno].min_cost) {
                pcost[pcostno].min_cost = cost;
                pcost[pcostno].min_cost_no = pcost[pcostno].num;
            }
            pcost[pcostno].num++;
        }
    }

    if (pcost_num == 0)
        return TRUE;

    /*
     * cost of the morpheme itself
     */
    if (new_mrph->is_undef) {
        mrph_cost = Cha_undef_info[new_mrph->is_undef - 1].cost
            + Cha_undef_info[new_mrph->is_undef -
                             1].cost_step * new_mrph->length / 2;
    } else {
        mrph_cost = Cha_hinsi[new_mrph->hinsi].cost;
    }
    mrph_cost *= new_mrph->weight * Cha_mrph_cost_weight;

    for (pcostno = 0; pcostno < pcost_num; pcostno++) {
        /*
         * build the predecessor list: the paths that lie within the
         * cost width, cheapest first
         */
        if (Cha_cost_width < 0) {
            Cha_path[Cha_path_num].path = malloc_int(2);
            Cha_path[Cha_path_num].path[0] = pcost[pcostno].pno[0];
            Cha_path[Cha_path_num].path[1] = -1;
        } else {
            int npath = 0;
            int path[PATH1_NUM];
            haba_cost = pcost[pcostno].min_cost + Cha_cost_width;
            path[npath++] = pcost[pcostno].pno[pcost[pcostno].min_cost_no];
            for (i = 0; i < pcost[pcostno].num; i++)
                if (pcost[pcostno].cost[i] <= haba_cost
                    && i != pcost[pcostno].min_cost_no)
                    path[npath++] = pcost[pcostno].pno[i];
            path[npath++] = -1;
            memcpy(Cha_path[Cha_path_num].path = malloc_int(npath),
                   path, sizeof(int) * npath);
        }

        /*
         * register the new path in Cha_path
         */
        Cha_path[Cha_path_num].cost = pcost[pcostno].min_cost + mrph_cost;
        Cha_path[Cha_path_num].mrph_p = m_num;
        Cha_path[Cha_path_num].state = pcost[pcostno].state;
        Cha_path[Cha_path_num].start = pos;
        Cha_path[Cha_path_num].end = pos + new_mrph->length;
#ifdef DEBUG
        /* c: best predecessor cost of this group, pc: cost of the new path */
        printf("%3d %3d %5d [p:%d,prev:%d,m:%d,c:%d,pc:%d]\n",
               Cha_path[Cha_path_num].start, Cha_path[Cha_path_num].end,
               Cha_path[Cha_path_num].state,
               Cha_path_num, Cha_path[Cha_path_num].path[0], m_num,
               pcost[pcostno].min_cost, Cha_path[Cha_path_num].cost);
#endif
        /* enlarge Cha_path each time a block of CHA_PATH_NUM is used up */
        if (++Cha_path_num % CHA_PATH_NUM == 0 && malloc_path())
            return FALSE;
    }

    return TRUE;
}

/*
 * set_mrph_end - initialise an entry as the empty sentence-boundary
 *                morpheme: zero length, empty strings, all attributes 0
 *                and the default weight
 */
static void
set_mrph_end(mrph2_t *mrph)
{
    /* attributes */
    mrph->is_undef = 0;
    mrph->con_tbl = 0;
    mrph->kform = 0;
    mrph->ktype = 0;
    mrph->hinsi = 0;
    mrph->weight = MRPH_DEFAULT_WEIGHT;

    /* lengths */
    mrph->length = 0;
    mrph->base_length = 0;

    /* strings */
    mrph->info = "";
    mrph->yomi = "";
    mrph->midasi = "";
    mrph->pron = "";
    mrph->base = "";
    mrph->compound = "\n";
}

/*
 * lookup_dic - look up the text at the cursor in every dictionary
 *
 * For each of the Da_ndicfile dictionaries the text starting at
 * target + cursor is looked up with da_lookup() and the hits are appended
 * to the morpheme buffer by da_convert_mrphs().
 * Returns the index of the last entry in the buffer.
 */
static int
lookup_dic(char *target, int target_len, int cursor)
{
    long index_buffer[256]; /* for Darts */
    char *head = target + cursor;
    int remain = target_len - cursor;
    int dic_no = 0;

    while (dic_no < Da_ndicfile) {
        da_lookup(Da_dicfile[dic_no], head, remain, index_buffer);
        da_convert_mrphs(Da_dicfile[dic_no], head, index_buffer);
        dic_no++;
    }

    return mrph_last_idx();
}


/*
 * set_undefword - add undefined-word entries for the text at the cursor
 *
 * The length of the undefined word is taken from
 * cha_tok_char_type_len().  When Cha_con_cost_undef is positive and one
 * of the entries head_mrph_idx..tail_mrph_idx already has that length,
 * nothing is added.  Otherwise one entry per Cha_undef_info element is
 * registered.
 * Returns the index of the last entry in the buffer.
 */
static int
set_undefword(char *target, int cursor, int head_mrph_idx, int tail_mrph_idx)
{
    int undef_len = cha_tok_char_type_len(Cha_tokenizer, cursor);
    int covered = 0;
    int k, no;

    /*
     * a dictionary word of the same length makes the undefined word
     * unnecessary
     */
    if (Cha_con_cost_undef > 0) {
        for (k = head_mrph_idx; k <= tail_mrph_idx && !covered; k++)
            covered = (nth_mrph(k)->length == undef_len);
    }

    /*
     * add the undefined word, once per undefined-word type
     */
    if (!covered && undef_len > 0) {
        for (no = 0; no < Cha_undef_info_num; no++)
            register_undef_mrph1(target + cursor, undef_len, no);
    }

    return mrph_last_idx();
}

/*
 * cursor_sep(c, l) - distance from cursor c to the next lookup position:
 * the length reported by cha_tok_char_type_len() when
 * cha_tok_is_jisx0208_latin() is true for (c, l), otherwise the length
 * reported by cha_tok_mblen_on_cursor().
 */
#define cursor_sep(c, l) \
     ((!cha_tok_is_jisx0208_latin(Cha_tokenizer,(c), (l))) ? \
        cha_tok_mblen_on_cursor(Cha_tokenizer, (c)) : \
        cha_tok_char_type_len(Cha_tokenizer, (c)))

/*
 * cha_parse_sentence() - build the morpheme lattice for one sentence
 *
 * The morpheme buffer (Cha_mrph_block) and the path array (Cha_path,
 * Cha_path_num) are reset and then filled for target.
 *
 * return value:
 *     0 - ok
 *     1 - no result / too many morphs (also returned when the path array
 *         cannot be allocated)
 */
int
cha_parse_sentence(char *target, int target_len)
{
    int cursor, prev_cursor;
    int path_idx[PATH1_NUM], path_idx_num;
    /* empty predecessor list of the sentence-start path */
    static int path0 = -1;

    cha_tok_parse(Cha_tokenizer, target, target_len + 1);
    cha_block_clear(Cha_mrph_block);

    free_chars();
    /*
     * Cha_path is dereferenced right below, so an allocation failure must
     * be caught here.  It takes the same error exit that a failed
     * malloc_path() inside check_connect() already takes.
     */
    if (free_path())
        goto error_end;

    /*
     * beginning of the sentence
     */
    Cha_path[0].start = Cha_path[0].end = 0;
    Cha_path[0].path = &path0;
    Cha_path[0].cost = 0;
    Cha_path[0].mrph_p = 0;
    Cha_path[0].state = 0;

    Cha_path_num = 1;
    set_mrph_end(new_mrph());

    /*
     * main loop: prev_cursor is where the preceding paths end, cursor is
     * where the next morphemes start (they differ when annotations were
     * skipped in between)
     */
    for (cursor = prev_cursor = 0; cursor < target_len;
         cursor += cursor_sep(cursor, target_len - cursor),
         prev_cursor = cursor) {
        int mrph_idx;
        int i;

        /* skip annotations and white space */
        while (cha_tok_anno_type(Cha_tokenizer, cursor) != 0)
            cursor += cha_tok_char_type_len(Cha_tokenizer, cursor);
        if (cursor >= target_len)
            break;

        path_idx_num = collect_mrphs_for_pos(prev_cursor, path_idx);
        if (path_idx_num == 0)
            continue;
        else if (path_idx_num < 0)
            goto error_end;

        mrph_idx = mrph_last_idx() + 1;
        /* pick up possible words from dictionary */
        lookup_dic(target, target_len, cursor);

        /* set undefined word */
        set_undefword(target, cursor, mrph_idx, mrph_last_idx());

        /* check the path between the preceding and current position  */
        for (i = mrph_idx; i <= mrph_last_idx(); i++) {
            if (check_connect(cursor, i, path_idx) == FALSE)
                goto error_end;
        }
    }

    /*
     * end of the sentence: connect the closing boundary morpheme
     */
    set_mrph_end(new_mrph());
    path_idx_num = collect_mrphs_for_pos(prev_cursor, path_idx);
    /* same check as inside the loop */
    if (path_idx_num < 0)
        goto error_end;
    if (check_connect(cursor, mrph_last_idx(), path_idx) == FALSE)
        goto error_end;

    return 0;

    /*
     * error exit
     */
  error_end:
    printf("Error: Too many morphs: %s\n", target);

    return 1;
}
