/*
 * Decompiled with CFR 0.152.
 */
package net.loomchild.maligna.model.vocabulary;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import net.loomchild.maligna.coretypes.Alignment;
import net.loomchild.maligna.filter.modifier.modify.split.FilterNonWordsSplitAlgorithmDecorator;
import net.loomchild.maligna.filter.modifier.modify.split.SplitAlgorithm;
import net.loomchild.maligna.filter.modifier.modify.split.WordSplitAlgorithm;
import net.loomchild.maligna.model.ModelParseException;
import net.loomchild.maligna.model.vocabulary.Vocabulary;

/**
 * Static helpers for reading vocabularies, converting segments into lists of
 * word ids (wids) and pruning a vocabulary down to its most frequent words.
 */
public class VocabularyUtil {
    /** Tokenizer used by default: a {@link WordSplitAlgorithm} wrapped in a {@link FilterNonWordsSplitAlgorithmDecorator}. */
    public static final SplitAlgorithm DEFAULT_TOKENIZE_ALGORITHM = new FilterNonWordsSplitAlgorithmDecorator(new WordSplitAlgorithm());
    /** Default upper bound on the number of words kept by {@link #createTruncatedVocabulary(List, Vocabulary)}. */
    public static final int DEFAULT_MAX_WORD_COUNT = 5000;
    /** Default minimum number of occurrences a word needs to be kept by {@link #createTruncatedVocabulary(List, Vocabulary)}. */
    public static final int DEFAULT_MIN_OCCURRENCE_COUNT = 2;

    /**
     * Parses a vocabulary from text in which every non-blank line has the form
     * {@code <wid><whitespace><word>}.
     * <p>
     * The wids must be consecutive and start at one more than the word count of
     * a freshly created {@link Vocabulary}. Blank lines (empty or consisting
     * only of whitespace) are skipped. The reader is wrapped in a
     * {@link BufferedReader} but is not closed; that stays the caller's
     * responsibility.
     *
     * @param reader source of the vocabulary text
     * @return vocabulary containing the parsed words in file order
     * @throws ModelParseException if a line does not have exactly two parts,
     *         a wid is not a number, a wid is out of sequence, or reading fails
     */
    public static Vocabulary parse(Reader reader) {
        try {
            BufferedReader bufferedReader = new BufferedReader(reader);
            Vocabulary vocabulary = new Vocabulary();
            int expectedWid = vocabulary.getWordCount() + 1;
            String line;
            while ((line = bufferedReader.readLine()) != null) {
                // "".split(...) yields a one-element array holding the empty
                // string, so an empty line has to be skipped explicitly;
                // otherwise it would be rejected as malformed.
                if (line.isEmpty()) {
                    continue;
                }
                String[] parts = line.split("\\s");
                // A whitespace-only line splits into zero parts (trailing
                // empty strings are dropped) and is skipped as well.
                if (parts.length == 0) {
                    continue;
                }
                if (parts.length != 2) {
                    throw new ModelParseException("Bad number of line parts.");
                }
                int wid = Integer.parseInt(parts[0]);
                if (wid != expectedWid) {
                    throw new ModelParseException("Word ordering error");
                }
                vocabulary.putWord(parts[1]);
                ++expectedWid;
            }
            return vocabulary;
        }
        catch (NumberFormatException e) {
            throw new ModelParseException("Part format error", e);
        }
        catch (IOException e) {
            throw new ModelParseException("IO error", e);
        }
    }

    /**
     * Tokenizes both sides of every alignment, registers the resulting words in
     * the matching vocabulary and appends one wid list per alignment to each of
     * the output lists.
     *
     * @param splitAlgorithm algorithm used to split segments into words
     * @param alignmentList alignments to process
     * @param sourceVocabulary vocabulary that receives the source words
     * @param targetVocabulary vocabulary that receives the target words
     * @param sourceWidList output list; one source wid list is appended per alignment
     * @param targetWidList output list; one target wid list is appended per alignment
     */
    public static void tokenize(SplitAlgorithm splitAlgorithm, List<Alignment> alignmentList, Vocabulary sourceVocabulary, Vocabulary targetVocabulary, List<List<Integer>> sourceWidList, List<List<Integer>> targetWidList) {
        for (Alignment alignment : alignmentList) {
            sourceWidList.add(VocabularyUtil.tokenizePutGet(splitAlgorithm, alignment.getSourceSegmentList(), sourceVocabulary));
            targetWidList.add(VocabularyUtil.tokenizePutGet(splitAlgorithm, alignment.getTargetSegmentList(), targetVocabulary));
        }
    }

    /**
     * Splits the segments into words, puts those words into the vocabulary and
     * returns the wids the vocabulary reports for them.
     */
    private static List<Integer> tokenizePutGet(SplitAlgorithm splitAlgorithm, List<String> segmentList, Vocabulary vocabulary) {
        List<String> wordList = splitAlgorithm.modify(segmentList);
        vocabulary.putWordList(wordList);
        return vocabulary.getWidList(wordList);
    }

    /**
     * Splits the segments into words and looks their wids up in the given
     * vocabulary. Unlike the alignment-based overload, this method does not add
     * anything to the vocabulary.
     *
     * @param splitAlgorithm algorithm used to split segments into words
     * @param segmentList segments to tokenize
     * @param vocabulary vocabulary used for the lookup
     * @return result of {@code vocabulary.getWidList} for the split words
     */
    public static List<Integer> tokenize(SplitAlgorithm splitAlgorithm, List<String> segmentList, Vocabulary vocabulary) {
        List<String> wordList = splitAlgorithm.modify(segmentList);
        return vocabulary.getWidList(wordList);
    }

    /**
     * Builds a new vocabulary holding at most {@code maxWordCount} words of
     * {@code vocabulary}, keeping only words that occur at least
     * {@code minOccurrenceCount} times in {@code widList}.
     * <p>
     * When more words qualify than fit, the most frequent ones are kept; among
     * words tied at the cut-off frequency, those with the lowest wids win.
     * Kept words are inserted into the result in ascending order of their
     * original wid.
     *
     * @param widList wid lists to count occurrences in; every wid is used as an
     *        index into an array of size {@code vocabulary.getWordCount() + 1}
     * @param vocabulary vocabulary the wids refer to
     * @param maxWordCount maximum number of words in the result; must not be negative
     * @param minOccurrenceCount minimum occurrence count for a word to be kept
     * @return the truncated vocabulary
     * @throws IllegalArgumentException if {@code maxWordCount} is negative
     */
    public static Vocabulary createTruncatedVocabulary(List<List<Integer>> widList, Vocabulary vocabulary, int maxWordCount, int minOccurrenceCount) {
        if (maxWordCount < 0) {
            // A negative limit would index past the end of the sorted counts below.
            throw new IllegalArgumentException("maxWordCount must not be negative: " + maxWordCount);
        }
        int wordCount = vocabulary.getWordCount();
        // Words occurring more often than occurrenceThreshold are always kept.
        int occurrenceThreshold = minOccurrenceCount;
        // How many words occurring exactly occurrenceThreshold times may be kept.
        int occurrenceCountThreshold = Integer.MAX_VALUE;

        // Slot 0 is not a real wid (the copy loop below starts at 1); marking
        // it -1 makes it sort before every real count.
        int[] occurrenceCountArray = new int[wordCount + 1];
        occurrenceCountArray[0] = -1;
        for (List<Integer> widSegment : widList) {
            for (int wid : widSegment) {
                occurrenceCountArray[wid]++;
            }
        }

        if (wordCount > maxWordCount) {
            if (maxWordCount == 0) {
                // No count can exceed this threshold, so nothing is kept for
                // any realistic input.
                occurrenceThreshold = Integer.MAX_VALUE;
            } else {
                int[] sortedCounts = Arrays.copyOf(occurrenceCountArray, occurrenceCountArray.length);
                Arrays.sort(sortedCounts);
                // Position of the least frequent word that still fits the limit.
                int cutoffIndex = sortedCounts.length - maxWordCount;
                if (sortedCounts[cutoffIndex] >= minOccurrenceCount) {
                    occurrenceThreshold = sortedCounts[cutoffIndex];
                    // Count how many of the top maxWordCount entries sit
                    // exactly at the cut-off frequency.
                    int index = cutoffIndex + 1;
                    while (index < sortedCounts.length && sortedCounts[index] == occurrenceThreshold) {
                        ++index;
                    }
                    occurrenceCountThreshold = index - cutoffIndex;
                }
                // Otherwise fewer than maxWordCount words reach
                // minOccurrenceCount, so the plain minimum filter suffices.
            }
        }

        Vocabulary resultVocabulary = new Vocabulary();
        for (int wid = 1; wid < occurrenceCountArray.length; ++wid) {
            int occurrenceCount = occurrenceCountArray[wid];
            String word = vocabulary.getWord(wid);
            if (occurrenceCount > occurrenceThreshold) {
                resultVocabulary.putWord(word);
            } else if (occurrenceCount == occurrenceThreshold && occurrenceCountThreshold > 0) {
                resultVocabulary.putWord(word);
                --occurrenceCountThreshold;
            }
        }
        return resultVocabulary;
    }

    /**
     * Truncates the vocabulary using {@link #DEFAULT_MAX_WORD_COUNT} and
     * {@link #DEFAULT_MIN_OCCURRENCE_COUNT}.
     *
     * @param widList wid lists to count occurrences in
     * @param vocabulary vocabulary the wids refer to
     * @return the truncated vocabulary
     */
    public static Vocabulary createTruncatedVocabulary(List<List<Integer>> widList, Vocabulary vocabulary) {
        return VocabularyUtil.createTruncatedVocabulary(widList, vocabulary, DEFAULT_MAX_WORD_COUNT, DEFAULT_MIN_OCCURRENCE_COUNT);
    }
}

