/*
 * Copyright (C) 2010-2011 Mtzky.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *         http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.mtzky.lucene.filter;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * <p>
 * Token filter that drops stop terms from the wrapped token stream.
 * Subclasses decide which terms are stop terms by implementing
 * {@link #contains(char[], int)}.
 * </p>
 * <p>
 * NOTE(review): Lucene's {@code TokenStream} documentation asks non-abstract
 * subclasses to be final or to have a final {@code incrementToken()}. As
 * {@link #incrementToken()} is not final here, concrete subclasses should
 * presumably be declared final -- confirm against the Lucene version in use.
 * </p>
 * 
 * @see org.apache.lucene.analysis.LowerCaseFilter LowerCaseFilter
 * @author mtzky
 */
public abstract class StopTermFilter extends TokenFilter {

	/** Term text of the token currently held by the stream. */
	private final CharTermAttribute termAttribute;

	/** Position increment of the token currently held by the stream. */
	private final PositionIncrementAttribute positionAttribute;

	/**
	 * Wraps the given stream and registers the term and position increment
	 * attributes this filter reads and writes.
	 * 
	 * @param in
	 *            the token stream to filter
	 */
	public StopTermFilter(final TokenStream in) {
		super(in);
		this.termAttribute = addAttribute(CharTermAttribute.class);
		this.positionAttribute = addAttribute(PositionIncrementAttribute.class);
	}

	/**
	 * <p>
	 * Advances the wrapped stream to the next token for which
	 * {@link #contains(char[], int)} answers {@link TokenProcess#KEEP}.
	 * </p>
	 * <p>
	 * The position increments of all tokens dropped with
	 * {@link TokenProcess#IGNORE} since the previous kept token are added to
	 * the position increment of the token that is returned. Tokens dropped
	 * with {@link TokenProcess#IGNORE_KEEP_POSITION} contribute nothing.
	 * </p>
	 * 
	 * @return {@code true} if a token was kept, {@code false} once the wrapped
	 *         stream is exhausted
	 * @throws IOException
	 *             if the wrapped stream fails while advancing
	 */
	@Override
	public boolean incrementToken() throws IOException {
		/* position increments of dropped tokens, carried to the next kept one */
		int pendingIncrement = 0;
		while (input.incrementToken()) {
			final TokenProcess verdict = contains(termAttribute.buffer(),
					termAttribute.length());
			switch (verdict) {
			case IGNORE:
				/* drop the token but remember the gap it leaves behind */
				pendingIncrement += positionAttribute.getPositionIncrement();
				break;
			case IGNORE_KEEP_POSITION:
				/* drop the token and its position increment altogether */
				break;
			case KEEP:
				positionAttribute.setPositionIncrement(positionAttribute
						.getPositionIncrement() + pendingIncrement);
				return true;
			}
		}
		/* the wrapped stream has no more tokens */
		return false;
	}

	/**
	 * <p>
	 * Decides what happens to the current term: return
	 * {@link TokenProcess#IGNORE} or {@link TokenProcess#IGNORE_KEEP_POSITION}
	 * to skip it, or {@link TokenProcess#KEEP} to pass it on.
	 * </p>
	 * <p>
	 * In most cases {@link TokenProcess#IGNORE} is the right answer for a
	 * skipped term, because the positions of the original tokens are not lost
	 * during indexing. With {@link TokenProcess#IGNORE_KEEP_POSITION} the
	 * {@link PositionIncrementAttribute position} is NOT advanced for the
	 * skipped term, which is useful, for example, to skip an IVS token.
	 * </p>
	 * <p>
	 * The result must not be {@code null}; {@link #incrementToken()} switches
	 * on it, so a {@code null} result ends in a {@link NullPointerException}.
	 * </p>
	 * 
	 * @param term
	 *            the internal termBuffer character array
	 * @param len
	 *            the number of valid characters (length of the term) in the
	 *            termBuffer array
	 * @return how the term is to be processed
	 * @see PositionIncrementAttribute
	 * @see org.apache.lucene.analysis.tokenattributes.TypeAttribute
	 *      TypeAttribute
	 */
	protected abstract TokenProcess contains(char[] term, int len);

	/**
	 * <p>
	 * The ways a token can be processed by this filter.
	 * </p>
	 * 
	 * @see StopTermFilter#contains(char[], int)
	 * @author mtzky
	 */
	protected enum TokenProcess {

		/**
		 * <p>
		 * The token is dropped, and its position increment is carried over to
		 * the next kept token.
		 * </p>
		 * 
		 * @see PositionIncrementAttribute#setPositionIncrement(int)
		 */
		IGNORE,

		/**
		 * <p>
		 * The token is dropped, and its position increment is NOT carried
		 * over to the next kept token.
		 * </p>
		 * 
		 * @see PositionIncrementAttribute#setPositionIncrement(int)
		 */
		IGNORE_KEEP_POSITION,

		/**
		 * <p>
		 * The token is NOT dropped.
		 * </p>
		 */
		KEEP;

	}

}
