#!/usr/bin/ruby
# -*- coding:utf-8 -*-

require 'kconv'


# ==============================================================================
# convert_to_utf8
# ==============================================================================

convert_to_utf8 = Proc.new do
	file = File.new($filename, "r")
		lines = file.readlines
	file.close

	dicfile = File.new($dicname, "w")

	lines.length.times do |i|
		s = lines[i].kconv(Kconv::UTF8, Kconv::EUC)

		# コードポイントを含まないエントリを処理
		if s.index("&#") == nil
			dicfile.puts s
			next
		end

		# コードポイントを含むエントリを処理
		# （例）あしゅくにょらい 阿&#38310;如来阿&#38310;
		s = s.split("&#")
		# ["あしゅくにょらい 阿", "38310;如来阿", "38310;"]

		t = ""

		s.length.times do |c|
			if s[c].index(";") == nil
				t = t + s[c]
				next
			end

			s[c] = s[c].split(";")
			codepoint = s[c][0]
			codepoint = codepoint.to_i

			# CJK統合漢字の範囲内か確認
			if codepoint >= 19968 &&
			codepoint <= 40959
				# [38310].pack("U*") => "閦"
				t = t + [codepoint].pack("U*")
			else
				t = ""
				break
			end

			if s[c][1] != nil
				t = t + s[c][1]
			end
		end

		if t != ""
			dicfile.puts t
		end
	end

	dicfile.close
end


# ==============================================================================
# main
# ==============================================================================

# Every command-line argument names one dictionary file to convert.
targetfiles = ARGV

# No arguments given: show the usage line and stop.
if targetfiles.empty?
	puts "Usage: ruby script.rb [FILE]"
	exit
end

# The converter takes its input and output paths from the globals $filename
# and $dicname, so both are set before each call. The output is written next
# to the input, with ".utf8" appended to the name.
targetfiles.each do |path|
	$filename = path
	$dicname = path + ".utf8"

	convert_to_utf8.call
end
