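/*
 * Splitter that splits a sentence into segments (bunsetsu).
 *
 * Detects the boundaries between segments.
 *  anthy_init_split_context() creates a context for splitting,
 *  anthy_mark_border() performs the split, and
 *  anthy_release_split_context() releases the context.
 *
 *  anthy_commit_border() learns from the committed result.
 *
 *  anthy_get_nr_seginfo()
 *  anthy_get_nth_seginfo() return the structure of an analysed segment.
 *
 * Funded by IPA Exploratory Software Project (Mitou) 2001 9/22
 * Copyright (C) 2000-2001 TABATA Yusuke, UGAWA Tomoharu
 *
 * $Id: splitter.c,v 1.48 2002/11/18 11:39:18 yusuke Exp $
 */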

#include <stdlib.h>

#include <alloc.h>
#include <record.h>
#include <splitter.h>
#include <logger.h>
#include "wordborder.h"

/* Limit passed to anthy_truncate_section() after proc_expanded_segment()
 * adds an entry to the "EXPANDPAIR" section. */
#define MAX_EXPAND_PAIR_ENTRY_COUNT 1000
/* Limit passed to anthy_truncate_section() after proc_unknown_depword()
 * adds an entry to the "EXPAND_DEP" section. */
#define MAX_EXPAND_DEP_ENTRY_COUNT 1000

/* Allocator for struct seg_info objects; created in anthy_init_splitter()
 * with seginfo_dtor() as its destructor. */
static allocator seginfo_ator;

/* Word types looked up by name in anthy_init_splitter(). */
wtype_t anthy_wtype_noun;
wtype_t anthy_wtype_prefix, anthy_wtype_postfix;
wtype_t anthy_wtype_a_tail_of_v_renyou;
wtype_t anthy_wtype_v_renyou;
wtype_t anthy_wtype_noun_tail;/* e.g. the "tate" in "ire-tate" */
wtype_t anthy_wtype_n1;
wtype_t anthy_wtype_n10;


/*
 * Destructor callback for meta_word objects (registered with MwAllocator
 * in make_word_cache()).  If the meta_word has a cached seg_info, it is
 * handed back to seginfo_ator.
 */
static void
metaword_dtor(void *p)
{
  struct meta_word *word = p;

  if (!word->si) {
    /* no seg_info was ever built for this meta_word */
    return;
  }
  anthy_sfree(seginfo_ator, word->si);
}

/*
 * Destructor callback for seg_info objects (registered with seginfo_ator
 * in anthy_init_splitter()).  Frees the candidate string and the
 * word_info array owned by the seg_info.
 */
static void
seginfo_dtor(void *p)
{
  struct seg_info *seg = p;

  /* free(NULL) is a no-op, so members that were never set need no guard */
  free(seg->cand.str);
  free(seg->word_info);
}

/*
 * Release the segment information built by make_word_cache(): the three
 * allocators, the per-character `ex` buffers, the cnode and sequence-length
 * arrays, and finally the cache structure itself.
 * sc->word_split_info is not cleared here.
 */
static void
release_info_cache(struct splitter_context *sc)
{
  struct word_split_info_cache *cache = sc->word_split_info;
  int pos = 0;

  anthy_free_allocator(cache->MwAllocator);
  anthy_free_allocator(cache->WlAllocator);
  anthy_free_allocator(cache->ExAllocator);

  /* only the entries below char_count are visited */
  while (pos < sc->char_count) {
    free(cache->cnode[pos].ex);
    pos++;
  }

  free(cache->cnode);
  free(cache->seq_len);
  free(cache->rev_seq_len);
  free(cache);
}

/*
 * Ƥʬʸåơʸθ󤹤
 * ǳݤƤrelease_info_cacheǲ 
 */
static void
make_word_cache(struct splitter_context *sc)
{
  int i;
  struct word_split_info_cache *info;

  /* åΥǡ */
  sc->word_split_info = malloc(sizeof(struct word_split_info_cache));
  info = sc->word_split_info;
  info->MwAllocator = anthy_create_allocator(sizeof(struct meta_word),
					     metaword_dtor);
  info->WlAllocator = anthy_create_allocator(sizeof(struct word_list), 0);
  info->ExAllocator = anthy_create_allocator(sizeof(struct extent), 0);
  info->cnode =
    malloc(sizeof(struct char_node) * (sc->char_count + 1));

  info->seq_len = malloc(sizeof(int) * (sc->char_count + 1));
  info->rev_seq_len = malloc(sizeof(int) * (sc->char_count + 1));

  /* ХꥢΤȤޤǽ */
  for (i = 0; i <= sc->char_count; i++) {
    info->seq_len[i] = 0;
    info->rev_seq_len[i] = 0;
    info->cnode[i].wl = 0;
    info->cnode[i].mw = 0;
    info->cnode[i].ex = 0;
    info->cnode[i].max_len = 0;
  }

  /* word_listƤmetaword */
  anthy_make_word_list_all(sc);
  anthy_make_metaword_all(sc);
}

/*
 * Put a freshly allocated seg_info into its empty default state:
 * no candidate string, no word_info entries, zero lengths and score,
 * and a structure ratio of RATIO_BASE.
 */
static void
init_seginfo(struct seg_info *si)
{
  /* owned pointers */
  si->word_info = 0;
  si->cand.str = 0;

  /* counters and lengths */
  si->nr_word_info = 0;
  si->dep_len = 0;
  si->info_len = 0;

  /* scoring */
  si->score = 0;
  si->struct_ratio = RATIO_BASE;
}

/** Append one word_info entry to a seg_info.
 *
 * The word_info array grows by one element, the new element is filled
 * from the arguments, and `len` is added to si->info_len.
 *
 * If the array cannot be grown, the seg_info is left exactly as it was
 * (the old array stays valid and owned by `si`) and the entry is dropped.
 * The previous code assigned realloc()'s result straight to si->word_info,
 * which leaked the old array and then dereferenced NULL.
 */
static void
seginfo_pushback_wordinfo(struct seg_info *si, wtype_t wt,
			  int len, int core_len, int ratio)
{
  void *grown;

  grown = realloc(si->word_info,
                  (si->nr_word_info + 1) * sizeof(struct word_info));
  if (!grown) {
    /* out of memory: keep the existing entries untouched */
    return;
  }
  si->word_info = grown;

  si->word_info[si->nr_word_info].wt = wt;
  si->word_info[si->nr_word_info].len = len;
  si->word_info[si->nr_word_info].core_len = core_len;
  si->word_info[si->nr_word_info].ratio = ratio;
  si->info_len += len;
  si->nr_word_info++;
}


/*
 * Adjust si->struct_ratio according to the structure of the seg_info.
 *
 * Nothing happens when the seg_info has no word_info entries.  Otherwise
 * the part of the segment not covered by word_info (capped at 7, with the
 * first 2 characters free) lowers the ratio in steps of RATIO_BASE/16,
 * a seg_info with more than one word_info is divided further by
 * (nr_word_info + 1), and the result is never allowed to become 0.
 */
static void
eval_seginfo_by_struct(struct seg_info *si)
{
  int total = si->info_len + si->dep_len;
  int uncovered = total - si->info_len;

  if (si->nr_word_info == 0) {
    /* no word information: keep the default ratio */
    return;
  }

  /* cap long uncovered parts at 7 ... */
  if (uncovered > 7) {
    uncovered = 7;
  }
  /* ... and do not count the first two characters */
  uncovered = (uncovered > 2) ? uncovered - 2 : 0;

  si->struct_ratio *= RATIO_BASE - uncovered * (RATIO_BASE / 16);
  si->struct_ratio /= RATIO_BASE;

  if (si->nr_word_info > 1) {
    si->struct_ratio /= si->nr_word_info + 1;
  }

  /* keep the ratio non-zero */
  if (!si->struct_ratio) {
    si->struct_ratio = 1;
  }
}

/*
 * Return the seg_info describing a meta_word, building it on first use
 * and caching it in mw->si.
 *
 * A meta_word with a non-empty word_list is described directly from that
 * word_list.  Otherwise the result depends on mw->type; types that carry
 * no seg_info yield 0.
 */
static struct seg_info *
get_seginfo_from_metaword(struct meta_word *mw)
{
  struct seg_info *si;

  /* already built: return the cached one */
  if (mw->si) {
    return mw->si;
  }

  /* not built yet, so create it */

  /* first case: the meta_word has a non-empty word_list, whose
     information is taken over as it is */
  if (mw->wl && mw->wl->len) {
    struct word_list *wl = mw->wl;
    int tail_len;
    mw->si = anthy_smalloc(seginfo_ator);
    si = mw->si;
    init_seginfo(si);
    si->type = SI_NORMAL;
    si->dep_len = mw->wl->dep_len;
    /* prefix */
    if (wl->prefix_len) {
      seginfo_pushback_wordinfo(si, wl->prefix_wt,
				wl->prefix_len, wl->prefix_len,
				25);
    }
    /* core word */
    if (wl->postfix_len) {
      /* with a postfix, the tail is added to the postfix entry below */
      tail_len = 0;
    } else {
      tail_len = wl->tail_len;
    }
    seginfo_pushback_wordinfo(si, wl->core_wt, 
			      wl->core_len + tail_len, wl->core_len,
			      wl->conn_ratio);
    /* postfix */
    if (wl->postfix_len) {
      seginfo_pushback_wordinfo(si, wl->postfix_wt,
				wl->postfix_len + wl->tail_len,
				wl->postfix_len,
				25);
    }

    eval_seginfo_by_struct(mw->si);

    /* scale by the word_list's conn_ratio */
    si->struct_ratio *= wl->conn_ratio;
    si->struct_ratio /= RATIO_BASE;
    /* scale by the word_list's indep_ratio */
    si->struct_ratio *= wl->indep_ratio;
    si->struct_ratio /= RATIO_BASE;
    return si;
  }

  /* otherwise the handling depends on the meta_word type */
  switch (mw->type) {
  case MW_WRAP:
    /* use the seg_info of the wrapped meta_word (not cached in mw->si) */
    return get_seginfo_from_metaword(mw->mw1);
  case MW_V_RENYOU_A:
  case MW_V_RENYOU_T:
    /* two entries: one for mw1, one for mw2 without its dep_len part */
    mw->si = anthy_smalloc(seginfo_ator);
    si = mw->si;
    init_seginfo(si);
    si->type = SI_NORMAL;
    seginfo_pushback_wordinfo(si, anthy_wtype_v_renyou,
			      mw->mw1->len, mw->mw1->len, RATIO_BASE);
    if (mw->type == MW_V_RENYOU_A) {
      seginfo_pushback_wordinfo(si, anthy_wtype_a_tail_of_v_renyou,
				mw->mw2->len - mw->mw2->wl->dep_len,
				mw->mw2->len - mw->mw2->wl->dep_len,
				RATIO_BASE);
    } else {
      seginfo_pushback_wordinfo(si, anthy_wtype_noun_tail, 
				mw->mw2->len - mw->mw2->wl->dep_len,
				mw->mw2->len - mw->mw2->wl->dep_len,
				RATIO_BASE);
    }
    /* start from mw2's conn_ratio, then apply the structural evaluation */
    si->struct_ratio = mw->mw2->wl->conn_ratio;
    eval_seginfo_by_struct(mw->si);
    return si;
  case MW_OCHAIRE_LEAF:
    /* candidate-type seg_info: carries a copy of mw->cand_hint */
    mw->si = anthy_smalloc(seginfo_ator);
    si = mw->si;
    init_seginfo(si);
    si->type = SI_CAND;
    si->cand.str = anthy_xstr_dup_str(mw->cand_hint);
    si->cand.len = mw->cand_hint->len;
    /* the seg_info length is the length of the meta_word */
    si->info_len = mw->len;
    eval_seginfo_by_struct(mw->si);
    return si;
  case MW_NAMEPAIR:
  case MW_DUMMY:
  case MW_SINGLE:
  case MW_OCHAIRE:
    /* no seg_info for these types */
  default:
    break;
  }
  return 0;
}

/*
 * Count the meta_words that start at character `from`, have length `len`
 * and yield a seg_info.
 */
int
anthy_get_nr_seginfo(struct splitter_context *sc, int from, int len)
{
  struct meta_word *cur = sc->word_split_info->cnode[from].mw;
  int count = 0;

  while (cur) {
    if (cur->len == len && get_seginfo_from_metaword(cur)) {
      count++;
    }
    cur = cur->next;
  }
  return count;
}

/*
 * Return the nth (0-based) seg_info among the meta_words that start at
 * character `from` and have length `len`; meta_words that yield no
 * seg_info are not counted.  Returns NULL when there is no such entry.
 */
struct seg_info *
anthy_get_nth_seginfo(struct splitter_context *sc,
		      int from, int len, int nth)
{
  struct meta_word *cur;
  int seen = 0;

  for (cur = sc->word_split_info->cnode[from].mw; cur; cur = cur->next) {
    struct seg_info *found;

    if (cur->len != len) {
      continue;
    }
    found = get_seginfo_from_metaword(cur);
    if (!found) {
      continue;
    }
    if (seen == nth) {
      return found;
    }
    seen++;
  }
  return NULL;
}

/** Top-level entry point of the word splitter.
 *
 * Copies the current border marks of every character into a scratch
 * array, lets anthy_eval_border() decide the borders for [from, to),
 * clears the marks strictly between `from` and `from2`, and writes the
 * marks for [from, to) back into sc->ce.
 *
 * Does nothing when the range is empty or the scratch array cannot be
 * allocated.
 *
 * The scratch array is heap-allocated and released before returning.
 * The previous code used alloca(), which put a buffer sized by the input
 * length on the stack and left info->seg_border dangling after return.
 */
void
anthy_mark_border(struct splitter_context *sc,
		  int from, int from2, int to)
{
  int i;
  void *scratch;
  struct word_split_info_cache *info;

  /* sanity check */
  if ((to - from) <= 0) {
    return;
  }

  /* scratch area for the marks, one slot per character plus one */
  scratch = malloc(sizeof(int) * (sc->char_count + 1));
  if (!scratch) {
    return;
  }
  info = sc->word_split_info;
  info->seg_border = scratch;
  for (i = 0; i < sc->char_count + 1; i++) {
    info->seg_border[i] = sc->ce[i].seg_border;
  }

  /* decide the borders */
  anthy_eval_border(sc, from, to);

  /* copy the result back */
  for (i = from + 1; i < from2; i++) {
    info->seg_border[i] = 0;
  }
  for (i = from; i < to; i++) {
    sc->ce[i].seg_border = info->seg_border[i];
  }

  /* the scratch area must not outlive this call */
  info->seg_border = NULL;
  free(scratch);
}

/*
 * A segment was committed longer than its initial length: record the
 * pair (initial segment -> expanded segment) in the "EXPANDPAIR" section.
 *
 * The key is the text at `from` with the initial segment length and the
 * value is the same text with length `len`.  Nothing is recorded when the
 * section or column cannot be selected, when a stored value is missing,
 * or when the value is already stored.
 */
static void
proc_expanded_segment(struct splitter_context *sc,
		      int from, int len)
{
  int initial_len = sc->ce[from].initial_seg_len;
  xstr before, after;
  int count, idx;

  before.str = sc->ce[from].c;
  before.len = initial_len;
  after.str = sc->ce[from].c;
  after.len = len;

  if (anthy_select_section("EXPANDPAIR", 1) == -1 ||
      anthy_select_column(&before, 1) == -1) {
    return;
  }

  count = anthy_get_nr_values();
  for (idx = 0; idx < count; idx++) {
    xstr *stored = anthy_get_nth_xstr(idx);

    if (!stored) {
      return;
    }
    if (anthy_xstrcmp(stored, &after) == 0) {
      /* already recorded */
      return;
    }
  }

  /* append as a new value and keep the section bounded */
  anthy_set_nth_xstr(count, &after);
  anthy_truncate_section(MAX_EXPAND_PAIR_ENTRY_COUNT);
}

/*
 * The committed segment extends beyond what the seg_info covers: record
 * the uncovered remainder in the "EXPAND_DEP" section.
 *
 * The key is the dep_len characters following the first info_len
 * characters of `seg`; the value is whatever follows that key up to the
 * end of `seg`.  Nothing is recorded when the seg_info has dep_len 0,
 * when the section or column cannot be selected, or when the value is
 * already stored.
 */
static void
proc_unknown_depword(struct seg_info *si, xstr *seg)
{
  xstr dep;
  xstr tail;
  int count, idx;

  if (!si->dep_len) {
    /* this seg_info has no dependent part */
    return;
  }

  if (anthy_select_section("EXPAND_DEP", 1) == -1) {
    return;
  }

  /* look up the dependent part as the key */
  dep.len = si->dep_len;
  dep.str = seg->str + si->info_len;
  if (anthy_select_column(&dep, 1) == -1) {
    return;
  }

  /* the remainder after the dependent part */
  tail.len = seg->len - si->dep_len - si->info_len;
  tail.str = dep.str + dep.len;

  count = anthy_get_nr_values();
  for (idx = 0; idx < count; idx++) {
    xstr *stored = anthy_get_nth_xstr(idx);

    if (stored && anthy_xstrcmp(stored, &tail) == 0) {
      /* already recorded */
      return;
    }
  }

  /* append as a new value and keep the section bounded */
  anthy_set_nth_xstr(count, &tail);
  anthy_truncate_section(MAX_EXPAND_DEP_ENTRY_COUNT);
}

/*
 * Learn from the committed segmentation.
 *
 * For each of the nr_segments committed segments (lengths in seg_len[],
 * optional structure in info[]):
 *  - if the seg_info covers less than the committed length, the remainder
 *    is recorded via proc_unknown_depword();
 *  - if the segment started at an initial border, was committed at least
 *    as long as the initial segment plus the following initial segment,
 *    and its seg_info is longer than the initial segment, the expansion
 *    is recorded via proc_expanded_segment().
 */
void
anthy_commit_border(struct splitter_context *sc, int nr_segments,
		    struct seg_info **info, int *seg_len)
{
  int from = 0;
  int seg;

  for (seg = 0; seg < nr_segments; seg++) {
    struct seg_info *si = info[seg];
    int len = seg_len[seg];
    int initial_len = sc->ce[from].initial_seg_len;

    if (si && si->info_len + si->dep_len < len) {
      xstr committed;

      committed.str = sc->ce[from].c;
      committed.len = len;
      proc_unknown_depword(si, &committed);
    }

    /* only segments that began at an initial border, and whose initial
       segment did not end the sentence, can count as expanded */
    if (initial_len && from + initial_len != sc->char_count) {
      int next_len = sc->ce[from + initial_len].initial_seg_len;
      int real_len = si ? si->info_len + si->dep_len : 0;

      /* committed length must reach over the following initial segment,
         and the analysed length must exceed the initial segment */
      if (initial_len + next_len <= len && real_len > initial_len) {
        proc_expanded_segment(sc, from, real_len);
      }
    }

    from += len;
  }
}

/*
 * Initialise a splitter context for the string `xs`.
 *
 * Allocates one char_ent per character plus one extra entry at the end,
 * each pointing into xs->str, marks both ends of the string as segment
 * borders, and builds the word cache.
 *
 * NOTE(review): the malloc() result is not checked.
 */
void
anthy_init_split_context(xstr *xs, struct splitter_context *sc)
{
  int pos;

  sc->char_count = xs->len;
  sc->ce = malloc(sizeof(struct char_ent) * (xs->len + 1));

  for (pos = 0; pos <= xs->len; pos++) {
    struct char_ent *ent = &sc->ce[pos];

    ent->c = &xs->str[pos];
    ent->seg_border = 0;
    ent->initial_seg_len = 0;
  }

  /* both ends of the string are always segment borders */
  sc->ce[xs->len].seg_border = 1;
  sc->ce[0].seg_border = 1;

  make_word_cache(sc);
}

/*
 * Release everything anthy_init_split_context() attached to the context
 * and reset the pointers, so calling it again on the same context is
 * harmless.
 */
void
anthy_release_split_context(struct splitter_context *sc)
{
  /* the word cache needs its own teardown, so only call it when present */
  if (sc->word_split_info) {
    release_info_cache(sc);
    sc->word_split_info = 0;
  }

  /* free(NULL) is a no-op, so the char_ent array needs no guard */
  free(sc->ce);
  sc->ce = 0;
}

/*
 * One-time initialisation of the splitter.
 *
 * Initialises the dependent-word table, creates the seg_info allocator,
 * resolves the word types used by this file by name, and finally
 * initialises the word list module.
 *
 * Returns -1 (after logging) when the dependent-word table cannot be
 * initialised; otherwise returns whatever anthy_init_wordlist() returns.
 *
 * NOTE(review): the name literals below are non-ASCII and must stay in the
 * encoding anthy_name_to_wtype() expects -- confirm the file encoding
 * before editing them.
 */
int
anthy_init_splitter(void)
{
  if (anthy_init_depword_tab()) {
    anthy_log(0, "Failed to init dependent word table.\n");
    return -1;
  }
  /* seg_info objects are cleaned up by seginfo_dtor() */
  seginfo_ator = anthy_create_allocator(sizeof(struct seg_info), seginfo_dtor);
  anthy_name_to_wtype("̾", &anthy_wtype_noun);
  anthy_name_to_wtype("ƻ첽", &anthy_wtype_a_tail_of_v_renyou);
  anthy_name_to_wtype("ưϢѷ", &anthy_wtype_v_renyou);
  anthy_name_to_wtype("̾첽", &anthy_wtype_noun_tail);
  anthy_name_to_wtype("̾Ƭ", &anthy_wtype_prefix);
  anthy_name_to_wtype("̾", &anthy_wtype_postfix);
  anthy_name_to_wtype("1", &anthy_wtype_n1);
  anthy_name_to_wtype("10", &anthy_wtype_n10);
  return anthy_init_wordlist();
}

/*
 * Shut the splitter down by releasing the dependent-word table.
 * NOTE(review): seginfo_ator, created in anthy_init_splitter(), is not
 * released here -- confirm whether it is freed elsewhere.
 */
void
anthy_quit_splitter(void)
{
  anthy_release_depword_tab();
}
