/*
 * Copyright (C) 2010-2011 Mtzky.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *         http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.mtzky.lucene.tokenizer;

import static java.lang.Character.*;
import static java.lang.Character.UnicodeBlock.*;
import static org.mtzky.io.IOUtils.*;

import java.io.IOException;
import java.io.PushbackReader;
import java.io.Reader;
import java.lang.Character.UnicodeBlock;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * <p>
 * Tokenizes the input into unigrams: every Unicode code point becomes one
 * token, and the token type is set to the name of the code point's
 * {@link UnicodeBlock}.
 * </p>
 * <p>
 * Offsets are counted in UTF-16 {@code char} units, so a token made of a
 * surrogate pair advances the offset by two.
 * </p>
 * 
 * @see org.mtzky.lucene.filter.StopTermFilter StopTermFilter
 * @author mtzky
 */
public class UnicodeBlockTokenizer extends Tokenizer {

	private final CharTermAttribute termAtt;
	private final OffsetAttribute offsetAtt;
	private final TypeAttribute typeAttribute;

	/*
	 * Not final: #reset(Reader) has to swap in a wrapper around the new
	 * reader, otherwise a reused tokenizer keeps reading the old stream.
	 */
	private PushbackReader in;

	/* Offset (in chars) of the next token, before offset correction. */
	private int pos = 0;

	/**
	 * Creates a tokenizer over the given reader, wrapping it with
	 * {@code toPushbackReader(Reader)} so that characters can be pushed back.
	 * 
	 * @param in
	 *            the source of characters to tokenize
	 */
	public UnicodeBlockTokenizer(final Reader in) {
		this(toPushbackReader(in));
	}

	/**
	 * Creates a tokenizer that reads directly from the given push back
	 * reader.
	 * 
	 * @param in
	 *            the source of characters to tokenize
	 */
	public UnicodeBlockTokenizer(final PushbackReader in) {
		super(in);
		this.in = in;
		this.termAtt = addAttribute(CharTermAttribute.class);
		this.offsetAtt = addAttribute(OffsetAttribute.class);
		this.typeAttribute = addAttribute(TypeAttribute.class);
	}

	/**
	 * Advances to the next token: clears all attributes, obtains the token
	 * from {@link #readToken()}, then fills in the term and offset
	 * attributes.
	 * 
	 * @return {@code false} if {@link #readToken()} returned {@code null} or
	 *         an empty array, otherwise {@code true}
	 * @throws IOException
	 *             if reading the input fails
	 */
	@Override
	public final boolean incrementToken() throws IOException {
		clearAttributes();
		final char[] token = readToken();
		if (token == null || token.length < 1) {
			/* End of the stream */
			return false;
		}
		final int nextPos = pos + token.length;
		/* XXX 2011/04/10 mtzky CharTermAttribute#copyBuffer() - #readToken() */
		termAtt.copyBuffer(token, 0, token.length);
		offsetAtt.setOffset(correctOffset(pos), correctOffset(nextPos));
		pos = nextPos;
		return true;
	}

	/**
	 * <p>
	 * Returns a character array of the token, or {@code null} if the end of the
	 * stream has been reached.
	 * </p>
	 * <p>
	 * The default implementation reads one code point and sets the
	 * {@link TypeAttribute} to the name of its {@link UnicodeBlock}. When the
	 * code point belongs to no defined block, the type attribute is left as
	 * it was after {@code clearAttributes()}.
	 * </p>
	 * <p>
	 * Override to change the behavior.
	 * </p>
	 * 
	 * @return a character array of the token, or {@code null} if the end of the
	 *         stream has been reached
	 * @throws IOException
	 *             if reading the input fails
	 * @see #read()
	 * @see #unread(int)
	 * @see TypeAttribute
	 */
	protected char[] readToken() throws IOException {
		final int cp = read();
		if (cp < 0) {
			return null;
		}
		/* UnicodeBlock.of() returns null for code points outside any block. */
		final UnicodeBlock block = UnicodeBlock.of(cp);
		if (block != null) {
			typeAttribute.setType(block.toString());
		}
		return toChars(cp);
	}

	/**
	 * <p>
	 * Reads a single Unicode code point.
	 * </p>
	 * <p>
	 * A high surrogate immediately followed by a low surrogate is combined
	 * into one supplementary code point. An unpaired surrogate is returned
	 * as is.
	 * </p>
	 * 
	 * @return The code point read, or -1 if the end of the stream has been
	 *         reached
	 * @throws IOException
	 *             if reading the input fails
	 */
	protected final int read() throws IOException {
		final int c = in.read();
		if (c < 0) {
			return c;
		}
		/*
		 * Test the full surrogate range (U+D800..U+DBFF). Comparing against
		 * the HIGH_SURROGATES block alone would miss U+DB80..U+DBFF, which
		 * belongs to the HIGH_PRIVATE_USE_SURROGATES block.
		 */
		if (isHighSurrogate((char) c)) {
			final int low = in.read();
			if (low < 0) {
				/* Input ends on an unpaired high surrogate. */
				return c;
			}
			if (isLowSurrogate((char) low)) {
				return toCodePoint((char) c, (char) low);
			}
			/* Not a pair: give the following char back for the next read. */
			in.unread(low);
		}
		return c;
	}

	/**
	 * <p>
	 * Pushes back a single Unicode code point to the front of the
	 * {@link PushbackReader push back} buffer.
	 * </p>
	 * 
	 * @param codePoint
	 *            Unicode code point to push back
	 * @throws IOException
	 *             if the push back fails, e.g. the push back buffer is full
	 */
	protected final void unread(final int codePoint) throws IOException {
		in.unread(toChars(codePoint));
	}

	/**
	 * Sets both offsets to the corrected offset just past the last token
	 * returned.
	 */
	@Override
	public void end() {
		final int finalOffset = correctOffset(pos);
		offsetAtt.setOffset(finalOffset, finalOffset);
	}

	/**
	 * Rebinds this tokenizer to a new reader and restarts the offset count.
	 * The reader is wrapped the same way as in
	 * {@link #UnicodeBlockTokenizer(Reader)}.
	 * 
	 * @param input
	 *            the new source of characters to tokenize
	 * @throws IOException
	 *             if resetting fails
	 */
	@Override
	public void reset(final Reader input) throws IOException {
		final PushbackReader pushback = toPushbackReader(input);
		super.reset(pushback);
		/* Keep the reader used by #read() in sync with the superclass. */
		this.in = pushback;
		reset();
	}

	/**
	 * Restarts the offset count from zero.
	 * 
	 * @throws IOException
	 *             if resetting fails
	 */
	@Override
	public void reset() throws IOException {
		super.reset();
		pos = 0;
	}

}
