/*
 * Copyright (C) 2010-2011 Mtzky.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *         http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.mtzky.lucene.filter;

import org.apache.lucene.analysis.TokenStream;

/**
 * <p>
 * Normalizes <a href="http://www.unicode.org/charts/PDF/UFF00.pdf">Halfwidth
 * and Fullwidth Forms</a>: every character from U+FF01 to U+FFEE is replaced
 * by its ordinary-width counterpart through a fixed lookup table. Characters
 * outside that range are returned unchanged.
 * </p>
 * <p>
 * U+FFE3 (&#xFFE3;) is converted to U+00AF (&#x00AF;) but not U+203E
 * (&#x203E;).
 * </p>
 * <p>
 * Code points inside the range that have no counterpart (the gaps in the
 * chart, such as U+FFBF or U+FFE7) are mapped to themselves, so they pass
 * through unchanged.
 * </p>
 * 
 * @author mtzky
 */
public class HalfFullwidthFormsFilter extends CharToCharMappingTokenFilter {

	/** First code point covered by {@link #MAP} (FULLWIDTH EXCLAMATION MARK). */
	private static final char FIRST = '\uFF01';

	/**
	 * Replacement characters, indexed by <code>c - FIRST</code>. The table
	 * has one entry for every code point from U+FF01 to U+FFEE, in code point
	 * order; an entry equal to its own code point means "leave unchanged".
	 */
	private static final char[] MAP = {
			/* Fullwidth ASCII variants (U+FF01..U+FF5E) */
			'!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-',
			'.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',
			';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G',
			'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
			'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a',
			'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
			'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
			'|', '}', '~',
			/* Fullwidth brackets (U+FF5F..U+FF60) */
			'\u2985', '\u2986',
			/* Halfwidth CJK punctuation (U+FF61..U+FF64) */
			'\u3002', '\u300C', '\u300D', '\u3001',
			/* Halfwidth Katakana variants (U+FF65..U+FF9F) */
			'\u30FB', '\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7',
			'\u30A9', '\u30E3', '\u30E5', '\u30E7', '\u30C3', '\u30FC',
			'\u30A2', '\u30A4', '\u30A6', '\u30A8', '\u30AA', '\u30AB',
			'\u30AD', '\u30AF', '\u30B1', '\u30B3', '\u30B5', '\u30B7',
			'\u30B9', '\u30BB', '\u30BD', '\u30BF', '\u30C1', '\u30C4',
			'\u30C6', '\u30C8', '\u30CA', '\u30CB', '\u30CC', '\u30CD',
			'\u30CE', '\u30CF', '\u30D2', '\u30D5', '\u30D8', '\u30DB',
			'\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2', '\u30E4',
			'\u30E6', '\u30E8', '\u30E9', '\u30EA', '\u30EB', '\u30EC',
			'\u30ED', '\u30EF', '\u30F3', '\u3099', '\u309A',
			/*
			 * Halfwidth Hangul variants (U+FFA0..U+FFDF); the entries in the
			 * U+FFxx range are gaps in the chart and map to themselves.
			 */
			'\u3164', '\u3131', '\u3132', '\u3133', '\u3134', '\u3135',
			'\u3136', '\u3137', '\u3138', '\u3139', '\u313A', '\u313B',
			'\u313C', '\u313D', '\u313E', '\u313F', '\u3140', '\u3141',
			'\u3142', '\u3143', '\u3144', '\u3145', '\u3146', '\u3147',
			'\u3148', '\u3149', '\u314A', '\u314B', '\u314C', '\u314D',
			'\u314E', '\uFFBF', '\uFFC0', '\uFFC1', '\u314F', '\u3150',
			'\u3151', '\u3152', '\u3153', '\u3154', '\uFFC8', '\uFFC9',
			'\u3155', '\u3156', '\u3157', '\u3158', '\u3159', '\u315A',
			'\uFFD0', '\uFFD1', '\u315B', '\u315C', '\u315D', '\u315E',
			'\u315F', '\u3160', '\uFFD8', '\uFFD9', '\u3161', '\u3162',
			'\u3163', '\uFFDD', '\uFFDE', '\uFFDF',
			/*
			 * Fullwidth symbol variants (U+FFE0..U+FFE6), followed by the gap
			 * at U+FFE7, which maps to itself like every other gap instead of
			 * being turned into a space.
			 */
			'\u00A2', '\u00A3', '\u00AC', '\u00AF', '\u00A6', '\u00A5',
			'\u20A9', '\uFFE7',
			/* Halfwidth symbol variants (U+FFE8..U+FFEE) */
			'\u2502', '\u2190', '\u2191', '\u2192', '\u2193', '\u25A0',
			'\u25CB' };

	/**
	 * Creates a filter that normalizes the characters of the given stream.
	 * 
	 * @param input
	 *            the token stream to wrap; passed to the superclass
	 */
	public HalfFullwidthFormsFilter(final TokenStream input) {
		super(input);
	}

	/**
	 * Returns the normalized form of a single character.
	 * 
	 * @param c
	 *            the character to convert
	 * @return the table entry for <code>c</code> when it lies between U+FF01
	 *         and U+FFEE (inclusive), otherwise <code>c</code> itself
	 */
	@Override
	protected char convert(final char c) {
		// Bound the lookup by the table itself so the covered range and the
		// table contents cannot drift apart.
		final int index = c - FIRST;
		if (index < 0 || MAP.length <= index) {
			return c;
		}
		return MAP[index];
	}

}
