# Copyright (C) 2005  Network Applied Communication Laboratory Co., Ltd.
#
# This file is part of Rast.
# See the file COPYING for redistribution information.
#

# -*- mode: Ruby; coding: euc-japan; -*-

require "test/unit"

require "rast_test"

module Rast
  class Encoding
    class EucJpTest < Test::Unit::TestCase
      # Test::Unit fixture hook: looks up Encoding["euc_jp"] and keeps the
      # result in @encoding for the test methods of this class.
      def setup
        encoding_name = "euc_jp"
        @encoding = Encoding[encoding_name]
      end

      # Feeds several inputs to register_tokenize, collects every
      # [ngram, pos, complete] triple yielded to the block, and compares the
      # collected triples with the expected token sequence.  The last section
      # calls search_tokenize instead of register_tokenize.
      #
      # NOTE(review): the non-ASCII literals are meant to be read in the
      # file's declared euc-japan coding; their bytes must be left untouched.
      def test_register_tokenize
        # ASCII sentence: 14 tokens at positions 0..13; only the final token
        # is expected with complete == false.
        result = []
        @encoding.register_tokenize("ruby is great.") do |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal(["rub", 0, true], result[0])
        assert_equal(["uby", 1, true], result[1])
        assert_equal(["by ", 2, true], result[2])
        assert_equal(["y i", 3, true], result[3])
        assert_equal([" is", 4, true], result[4])
        assert_equal(["is ", 5, true], result[5])
        assert_equal(["s g", 6, true], result[6])
        assert_equal([" gr", 7, true], result[7])
        assert_equal(["gre", 8, true], result[8])
        assert_equal(["rea", 9, true], result[9])
        assert_equal(["eat", 10, true], result[10])
        assert_equal(["at", 11, true], result[11])
        assert_equal(["t.", 12, true], result[12])
        assert_equal([".", 13, false], result[13])
        assert_equal(14, result.length)

        # Five tokens expected at positions 0..4: the first three complete,
        # the last two not.
        result = []
        @encoding.register_tokenize("") do
          |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal("", result[0][0])
        assert_equal(0, result[0][1])
        assert_equal(true, result[0][2])
        assert_equal("", result[1][0])
        assert_equal(1, result[1][1])
        assert_equal(true, result[1][2])
        assert_equal("", result[2][0])
        assert_equal(2, result[2][1])
        assert_equal(true, result[2][2])
        assert_equal("", result[3][0])
        assert_equal(3, result[3][1])
        assert_equal(false, result[3][2])
        assert_equal("", result[4][0])
        assert_equal(4, result[4][1])
        assert_equal(false, result[4][2])
        assert_equal(5, result.length)

        # Same shape of expectation as the previous section (five tokens,
        # three complete followed by two incomplete) for another literal.
        result = []
        @encoding.register_tokenize("") do
          |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal("", result[0][0])
        assert_equal(0, result[0][1])
        assert_equal(true, result[0][2])
        assert_equal("", result[1][0])
        assert_equal(1, result[1][1])
        assert_equal(true, result[1][2])
        assert_equal("", result[2][0])
        assert_equal(2, result[2][1])
        assert_equal(true, result[2][2])
        assert_equal("", result[3][0])
        assert_equal(3, result[3][1])
        assert_equal(false, result[3][2])
        assert_equal("", result[4][0])
        assert_equal(4, result[4][1])
        assert_equal(false, result[4][2])
        assert_equal(5, result.length)

        # Three tokens expected: two complete, then one incomplete.
        result = []
        @encoding.register_tokenize("ܸ") do
          |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal("", result[0][0])
        assert_equal(0, result[0][1])
        assert_equal(true, result[0][2])
        assert_equal("ܸ", result[1][0])
        assert_equal(1, result[1][1])
        assert_equal(true, result[1][2])
        assert_equal("", result[2][0])
        assert_equal(2, result[2][1])
        assert_equal(false, result[2][2])
        assert_equal(3, result.length)

        # "Ruby" followed by non-ASCII text: seven tokens.  The ASCII n-grams
        # shrink ("Rub", "uby", "by", "y") as the position approaches the
        # non-ASCII part; positions 0..4 are complete, 5 and 6 are not.
        result = []
        @encoding.register_tokenize("RubyǤ") do
          |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal("Rub", result[0][0])
        assert_equal(0, result[0][1])
        assert_equal(true, result[0][2])
        assert_equal("uby", result[1][0])
        assert_equal(1, result[1][1])
        assert_equal(true, result[1][2])
        assert_equal("by", result[2][0])
        assert_equal(2, result[2][1])
        assert_equal(true, result[2][2])
        assert_equal("y", result[3][0])
        assert_equal(3, result[3][1])
        assert_equal(true, result[3][2])
        assert_equal("Ǥ", result[4][0])
        assert_equal(4, result[4][1])
        assert_equal(true, result[4][2])
        assert_equal("", result[5][0])
        assert_equal(5, result[5][1])
        assert_equal(false, result[5][2])
        assert_equal("", result[6][0])
        assert_equal(6, result[6][1])
        assert_equal(false, result[6][2])
        assert_equal(7, result.length)

        # Input for which the block must never be called.
        result = []
        @encoding.register_tokenize("") do |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal(0, result.length)

        # Single ASCII character: exactly one token, reported as incomplete.
        result = []
        @encoding.register_tokenize("a") do |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal(["a", 0, false], result[0])
        assert_equal(1, result.length)

        # Exactly one token expected at position 0, reported as incomplete.
        result = []
        @encoding.register_tokenize("") do |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal(["", 0, false], result[0])
        assert_equal(1, result.length)

        # search_tokenize (not register_tokenize): exactly one token at
        # position 0, and here it is reported as complete.
        result = []
        @encoding.search_tokenize("") do
          |ngram, pos, complete|
          result.push([ngram, pos, complete])
        end
        assert_equal(["", 0, true], result[0])
        assert_equal(1, result.length)
      end

      # Checks normalize_text against a list of input/expected-output pairs.
      def test_normalize_text
        # Each run of whitespace (spaces, tabs, "\n", "\r") is expected to
        # come back as a single space; leading/trailing runs are kept as one
        # space rather than stripped.
        assert_equal(" abc ", @encoding.normalize_text("  abc  "))
        assert_equal(" abc abc", @encoding.normalize_text(" abc\nabc"))
        assert_equal("a b c d e ",
                     @encoding.normalize_text("a\n \t b\nc\r\rd \ne "))

        # The remaining cases feed non-ASCII literals (file coding:
        # euc-japan) and compare against the expected normalized form.
        # NOTE(review): presumably these cover width/kana folding, e.g. the
        # first case expects the ASCII string "012ABC" -- confirm against the
        # original bytes of the file.
        s = @encoding.normalize_text("£")
        assert_equal("012ABC", s)
        s = @encoding.normalize_text("")
        assert_equal("", s)
        s = @encoding.normalize_text("")
        assert_equal("", s)
        s = @encoding.normalize_text("Î")
        assert_equal("ĥƥ", s)
        s = @encoding.normalize_text("Î")
        assert_equal("ĥƥ", s)
        s = @encoding.normalize_text("ގގގގ")
        assert_equal("", s)
        s = @encoding.normalize_text("ގގގÎގĎ")
        assert_equal("¥ťǥ", s)
        s = @encoding.normalize_text("ʎߎˎߎ̎ߎ͎ߎΎ")
        assert_equal("ѥԥץڥ", s)
        s = @encoding.normalize_text("")
        assert_equal("", s)
      end

      # normalize_chars is expected to turn the upper-case ASCII string "ABC"
      # into "abc".
      def test_normalize_chars
        assert_equal("abc", @encoding.normalize_chars("ABC"))
      end
    end
  end
end
