/*
 * Decompiled with CFR 0.152.
 */
package edu.northwestern.at.morphadorner.tools.mergebrilllexicon;

import edu.northwestern.at.morphadorner.corpuslinguistics.lemmatizer.DefaultLemmatizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.lemmatizer.Lemmatizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.BaseLexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.BrillLexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.LexiconEntry;
import edu.northwestern.at.morphadorner.corpuslinguistics.partsofspeech.NUPOSPartOfSpeechTags;
import edu.northwestern.at.morphadorner.corpuslinguistics.partsofspeech.PartOfSpeechTags;
import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.PennTreebankTokenizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.WordTokenizer;
import edu.northwestern.at.utils.Formatters;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;

/**
 * Command-line tool that merges the entries of a Brill lexicon file into a
 * MorphAdorner lexicon file and writes the combined lexicon to a new file.
 *
 * <p>All state is held in static fields; the class is not designed for
 * concurrent use.
 */
public class MergeBrillLexicon {
    /** Count given to a (word, tag) pair added for a word's first listed tag. */
    private static final int FIRST_TAG_COUNT = 2;

    /**
     * Count given to a (word, tag) pair added for any later tag, and for the
     * only tag of a word that is tagged solely as a proper noun.
     */
    private static final int OTHER_TAG_COUNT = 1;

    /** Tokenizer used to split a spelling into its word parts. */
    protected static WordTokenizer spellingTokenizer = new PennTreebankTokenizer();

    /** Lemmatizer; assigned by {@link #mergeBrillLexicon} or lazily by {@link #getLemma}. */
    protected static Lemmatizer lemmatizer;

    /** Part of speech tag set; assigned by {@link #mergeBrillLexicon} or lazily by {@link #getLemma}. */
    protected static PartOfSpeechTags partOfSpeechTags;

    /** Separator placed between the lemma pieces of a compound lemma. */
    protected static String lemmaSeparator = "|";

    /**
     * Merges a Brill lexicon into a MorphAdorner lexicon and saves the result.
     *
     * <p>For each Brill word, every part of speech tag that has a zero
     * category count in the MorphAdorner lexicon is added with a lemma
     * obtained from {@link #getLemma}. A word whose only tag is a proper noun
     * tag additionally receives a copy of the entry of its lower-cased form,
     * when the lexicon contains one.
     *
     * @param lexiconFileName       path of the MorphAdorner lexicon to read (utf-8)
     * @param brillLexiconFileName  path of the Brill lexicon to read (utf-8)
     * @param mergedLexiconFileName path of the merged lexicon to write (utf-8)
     * @throws IOException           if a lexicon cannot be read or the merged lexicon cannot be written
     * @throws IllegalStateException if the lemmatizer cannot be created
     */
    public static void mergeBrillLexicon(String lexiconFileName, String brillLexiconFileName, String mergedLexiconFileName) throws IOException {
        BaseLexicon lexicon = new BaseLexicon();
        lexicon.loadLexicon(new File(lexiconFileName).toURI().toURL(), "utf-8");

        // Fresh instances on every merge, as before.
        partOfSpeechTags = new NUPOSPartOfSpeechTags();
        lemmatizer = createLemmatizer();

        System.out.println("MorphAdorner lexicon has " + Formatters.formatIntegerWithCommas(lexicon.getLexiconSize()) + " entries.");
        BrillLexicon brillLexicon = new BrillLexicon(new File(brillLexiconFileName).toURI().toURL(), "utf-8");
        System.out.println("Brill lexicon has " + Formatters.formatIntegerWithCommas(brillLexicon.size()) + " entries.");

        for (String word : brillLexicon.keySet()) {
            @SuppressWarnings("unchecked") // the Brill lexicon stores its tag lists as lists of strings
            List<String> posTags = (List<String>) brillLexicon.get(word);
            if (posTags == null || posTags.isEmpty()) {
                // Nothing to merge for a word without tags.
                continue;
            }

            int firstTagCount = FIRST_TAG_COUNT;
            if (posTags.size() == 1 && partOfSpeechTags.isProperNounTag(posTags.get(0))) {
                // Locale.ROOT keeps the lookup key independent of the platform locale.
                String lowerCaseWord = word.toLowerCase(Locale.ROOT);
                if (lexicon.containsEntry(lowerCaseWord)) {
                    // Copy the lower-case entry under the Brill spelling.
                    LexiconEntry lexEntry = lexicon.getLexiconEntry(lowerCaseWord).deepClone();
                    lexEntry.entry = word;
                    lexicon.setLexiconEntry(word, lexEntry);
                }
                firstTagCount = OTHER_TAG_COUNT;
            }

            for (int i = 0; i < posTags.size(); i++) {
                String posTag = posTags.get(i);
                if (lexicon.getCategoryCount(word, posTag) != 0) {
                    // The lexicon already has a count for this word and tag.
                    continue;
                }
                String lemma = getLemma(word, posTag);
                lexicon.updateEntryCount(word, posTag, lemma, i == 0 ? firstTagCount : OTHER_TAG_COUNT);
            }
        }

        lexicon.saveLexiconToTextFile(mergedLexiconFileName, "utf-8");
        System.out.println("Merged lexicon has " + Formatters.formatIntegerWithCommas(lexicon.getLexiconSize()) + " entries.");
    }

    /**
     * Returns the lemma for a spelling given its part of speech tag.
     *
     * <p>The spelling itself is returned when the lemmatizer reports that it
     * cannot lemmatize it or when the tag's lemma word class is
     * {@code "none"}. Otherwise the {@code "compound"} lemmatization is tried
     * first; if that leaves the spelling unchanged, the spelling is lemmatized
     * as a single word or, for a compound tag with several word parts, piece
     * by piece with the pieces joined by {@link #lemmaSeparator}.
     *
     * <p>When the number of tag pieces differs from the number of word parts,
     * the spelling is returned unchanged rather than an empty lemma.
     *
     * @param spelling     the spelling to lemmatize
     * @param partOfSpeech the part of speech tag for the spelling
     * @return the lemma, or the spelling when no lemma can be produced
     * @throws IllegalStateException if the lemmatizer has to be created and cannot be
     */
    public static String getLemma(String spelling, String partOfSpeech) {
        // Allow use without a prior call to mergeBrillLexicon.
        if (partOfSpeechTags == null) {
            partOfSpeechTags = new NUPOSPartOfSpeechTags();
        }
        if (lemmatizer == null) {
            lemmatizer = createLemmatizer();
        }

        String lemmaClass = partOfSpeechTags.getLemmaWordClass(partOfSpeech);
        if (lemmatizer.cantLemmatize(spelling) || lemmaClass.equals("none")) {
            return spelling;
        }

        String compoundLemma = lemmatizer.lemmatize(spelling, "compound");
        if (!compoundLemma.equals(spelling)) {
            return compoundLemma;
        }

        List<String> wordList = spellingTokenizer.extractWords(spelling);
        if (!partOfSpeechTags.isCompoundTag(partOfSpeech) || wordList.size() == 1) {
            return lemmaClass.isEmpty()
                ? lemmatizer.lemmatize(spelling)
                : lemmatizer.lemmatize(spelling, lemmaClass);
        }

        String[] posTags = partOfSpeechTags.splitTag(partOfSpeech);
        if (posTags.length != wordList.size()) {
            // Tag pieces and word parts cannot be paired up; keep the
            // spelling instead of producing an empty lemma.
            return spelling;
        }

        StringBuilder lemmata = new StringBuilder();
        for (int i = 0; i < wordList.size(); i++) {
            if (i > 0) {
                lemmata.append(lemmaSeparator);
            }
            String pieceClass = partOfSpeechTags.getLemmaWordClass(posTags[i]);
            lemmata.append(lemmatizer.lemmatize(wordList.get(i), pieceClass));
        }
        return lemmata.toString();
    }

    /**
     * Creates the default lemmatizer.
     *
     * @return a new lemmatizer
     * @throws IllegalStateException wrapping the cause if construction fails
     */
    private static Lemmatizer createLemmatizer() {
        try {
            return new DefaultLemmatizer();
        }
        catch (Exception e) {
            throw new IllegalStateException("Unable to create lemmatizer.", e);
        }
    }

    /**
     * Command-line entry point.
     *
     * @param args the MorphAdorner lexicon file, the Brill lexicon file and
     *             the merged lexicon output file, in that order
     */
    public static void main(String[] args) {
        if (args == null || args.length < 3) {
            System.err.println("Usage: MergeBrillLexicon <lexicon file> <Brill lexicon file> <merged lexicon file>");
            System.exit(1);
            return;
        }
        try {
            MergeBrillLexicon.mergeBrillLexicon(args[0], args[1], args[2]);
        }
        catch (Exception e) {
            e.printStackTrace();
            // Report the failure to the calling process.
            System.exit(1);
        }
    }

    /** Not instantiable by clients; all members are static. */
    protected MergeBrillLexicon() {
    }
}

