/*
 * Decompiled with CFR 0.152.
 */
package edu.northwestern.at.morphadorner.tools.taggertrainer.ngram;

import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.BaseLexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.lexicon.Lexicon;
import edu.northwestern.at.morphadorner.corpuslinguistics.postagger.transitionmatrix.TransitionMatrix;
import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.UnicodeReader;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.StringTokenizer;

/**
 * Command-line tool that builds a part-of-speech tag transition matrix
 * from tagged training data.
 *
 * <p>Usage: {@code NGramTaggerTrainer trainingdata wordlexicon outputtransitionmatrix}
 *
 * <p>Each non-blank line of the training data is split on {@link #sepChars};
 * the first token is taken as the spelling and the second as its
 * part-of-speech tag.  For every line the single-tag, tag-pair and
 * tag-triple counts are incremented in {@link #transitionMatrix}, which is
 * then saved to the output file.
 *
 * <p>All state is held in static fields, so the class is intended to be run
 * once per JVM via {@link #main(String[])}.
 */
public class NGramTaggerTrainer {
    /** Word lexicon loaded from the second command-line argument. */
    protected static Lexicon wordLexicon;

    /** Name of the training data file (first command-line argument). */
    protected static String trainingDataFileName;

    /** Number of training lines counted so far. */
    protected static int trainingDataCount = 0;

    /** Accumulates the tag, tag-pair and tag-triple counts. */
    protected static TransitionMatrix transitionMatrix = new TransitionMatrix();

    /** Name of the output transition matrix file (third command-line argument). */
    protected static String transitionMatrixFileName = null;

    /** Delimiter characters separating the fields of a training line. */
    protected static String sepChars = "\t";

    /**
     * Reads the command-line arguments and loads the word lexicon.
     *
     * <p>Prints the usage text and exits when fewer than three arguments
     * are supplied.
     *
     * @param args command-line arguments: training data file, word lexicon
     *             file, output transition matrix file
     * @throws IOException if the word lexicon cannot be loaded
     */
    protected static void getProgramParameters(String[] args) throws IOException {
        if (args.length < 3) {
            // help() terminates the JVM, so execution does not continue past here.
            NGramTaggerTrainer.help();
        }
        trainingDataFileName = args[0];
        File lexiconFile = new File(args[1]);
        wordLexicon = new BaseLexicon();
        wordLexicon.loadLexicon(lexiconFile.toURI().toURL(), "utf-8");
        transitionMatrixFileName = args[2];
    }

    /**
     * Reads the training data file and accumulates the tag counts.
     *
     * <p>Blank lines are skipped.  For every other line the tag count, the
     * (previous tag, tag) count and the (tag before previous, previous tag,
     * tag) count are each incremented by one.  Both history tags start out
     * as {@code "."}.  The reader is always closed, including when reading
     * fails.
     *
     * @throws IOException if the training data cannot be opened or read
     */
    protected static void loadTrainingData() throws IOException {
        long startTime = System.currentTimeMillis();
        trainingDataCount = 0;
        String previousPOS = ".";
        String previousPOSM1 = ".";
        // The stream is declared separately so that it is closed even if
        // wrapping it in a reader fails.
        try (FileInputStream stream = new FileInputStream(trainingDataFileName);
             BufferedReader input = new BufferedReader(new UnicodeReader(stream, "utf-8"))) {
            String line;
            while ((line = input.readLine()) != null) {
                line = line.trim();
                if (line.length() == 0) {
                    continue;
                }
                String currentPOS = readPartOfSpeech(line);
                transitionMatrix.incrementCount(currentPOS, 1);
                transitionMatrix.incrementCount(previousPOS, currentPOS, 1);
                transitionMatrix.incrementCount(previousPOSM1, previousPOS, currentPOS, 1);
                ++trainingDataCount;
                previousPOSM1 = previousPOS;
                previousPOS = currentPOS;
            }
        }
        long endTime = System.currentTimeMillis();
        // Adding 999 rounds the elapsed time up to whole seconds.
        long secs = (endTime - startTime + 999L) / 1000L;
        System.out.println("Training data loaded in " + secs + " seconds.");
    }

    /**
     * Extracts the part-of-speech tag from one trimmed training line.
     *
     * <p>When the line has no tag token, the problem is reported and the
     * spelling itself is used as the tag if it is punctuation or a symbol;
     * otherwise the empty string is used.
     *
     * @param line a trimmed, non-empty training line
     * @return the tag to count for this line, possibly empty
     */
    private static String readPartOfSpeech(String line) {
        StringTokenizer tokenizer = new StringTokenizer(line, sepChars);
        String spelling = tokenizer.hasMoreTokens() ? tokenizer.nextToken().trim() : "";
        if (tokenizer.hasMoreTokens()) {
            return tokenizer.nextToken().trim();
        }
        System.err.println("Training data line has no part of speech tag.");
        System.out.println("line=" + line);
        return CharUtils.isPunctuationOrSymbol(spelling) ? spelling : "";
    }

    /**
     * Program entry point: reads the parameters, loads the training data
     * and saves the resulting transition matrix.
     *
     * <p>Any exception is reported with a stack trace rather than propagated.
     *
     * @param args command-line arguments, see {@link #getProgramParameters(String[])}
     */
    public static void main(String[] args) {
        try {
            NGramTaggerTrainer.getProgramParameters(args);
            NGramTaggerTrainer.loadTrainingData();
            transitionMatrix.saveTransitionMatrix(transitionMatrixFileName, "utf-8", '\t');
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts a string to a URL, treating it as a file name when it is not
     * a well-formed URL.
     *
     * @param fileNameOrURL a URL string or a file name
     * @return the corresponding URL, or {@code null} if the string can be
     *         converted neither as a URL nor as a file name
     */
    protected static URL getURL(String fileNameOrURL) {
        try {
            return new URL(fileNameOrURL);
        }
        catch (MalformedURLException notAUrl) {
            // Not a URL: fall back to interpreting the string as a file name.
            try {
                return new File(fileNameOrURL).toURI().toURL();
            }
            catch (Exception ignored) {
                // Callers receive null when neither interpretation works.
                return null;
            }
        }
    }

    /**
     * Prints the command-line usage and terminates the JVM with exit status 1.
     */
    protected static void help() {
        System.out.println("java edu.northwestern.at.morphadorner.tools.taggertrainer.ngram.NGramTaggerTrainer trainingdata wordlexicon outputtransitionmatrix");
        System.exit(1);
    }
}

