/*
 * Decompiled with CFR 0.152.
 */
package edu.northwestern.at.morphadorner.tools.punktabbreviationdetector;

import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.ICU4JBreakIteratorWordTokenizer;
import edu.northwestern.at.morphadorner.tools.punktabbreviationdetector.PunktToken;
import edu.northwestern.at.morphadorner.tools.punktabbreviationdetector.PunktTokenCounter;
import edu.northwestern.at.morphadorner.tools.punktabbreviationdetector.PunktTokenType;
import edu.northwestern.at.utils.CharUtils;
import edu.northwestern.at.utils.FileNameUtils;
import edu.northwestern.at.utils.FileUtils;
import edu.northwestern.at.utils.SetUtils;
import edu.northwestern.at.utils.StringUtils;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.List;
import java.util.Locale;
import java.util.TreeSet;

/**
 * Command-line tool that tokenizes a set of UTF-8 text files, feeds every
 * token to a {@link PunktTokenCounter}, and saves the abbreviations the
 * counter reports to a file.
 *
 * <p>Usage:</p>
 * <pre>
 * PunktAbbreviationDetector languagecode abbrevsfile [inputfile ...]
 * </pre>
 */
public class PunktAbbreviationDetector {
    /** Number of leading command-line arguments that precede the input file names. */
    protected static final int INITPARAMS = 2;

    /** UTF-8 wrapper around standard output; assigned in {@link #main(String[])}. */
    protected static PrintStream printStream;

    /**
     * Runs the abbreviation detector.
     *
     * @param args {@code args[0]} is the language code, {@code args[1]} is the
     *             name of the file the abbreviations are saved to, and any
     *             remaining entries are input file names or wildcards.
     * @throws IOException declared because the output stream setup and the
     *                     file helpers called here may fail on I/O.
     */
    public static void main(String[] args) throws IOException {
        // Without the two leading parameters there is nothing to do; report
        // the expected usage instead of failing on an array index.
        if (args == null || args.length < INITPARAMS) {
            System.err.println(
                "Usage: PunktAbbreviationDetector languagecode abbrevsfile [inputfile ...]");
            return;
        }

        long startTime = System.currentTimeMillis();
        printStream = new PrintStream(new BufferedOutputStream(System.out), true, "utf-8");

        String languageCode = args[0];
        printStream.println("Language code: " + languageCode);
        Locale locale = PunktAbbreviationDetector.languageCodeToLocale(languageCode);
        printStream.println("Language: " + locale.getDisplayLanguage());

        // Whitespace tokens are kept (and merged), and periods are not split
        // off, so the counter sees tokens with their trailing periods intact.
        ICU4JBreakIteratorWordTokenizer tokenizer = new ICU4JBreakIteratorWordTokenizer(locale);
        tokenizer.setStoreWhitespaceTokens(true);
        tokenizer.setMergeWhitespaceTokens(true);
        tokenizer.setSplitAroundPeriods(false);

        String abbrevsFileName = args[1];

        // NOTE(review): the meaning of the 0.3 / false constructor arguments is
        // defined by PunktTokenCounter, which is not visible here.
        PunktTokenCounter tokenCounter = new PunktTokenCounter(0.3, false);

        // Everything after the leading parameters is an input file specification.
        String[] wildCards = new String[args.length - INITPARAMS];
        System.arraycopy(args, INITPARAMS, wildCards, 0, wildCards.length);

        String[] fileNames = FileNameUtils.expandFileNameWildcards(wildCards);
        printStream.println("There are " + StringUtils.formatNumberWithCommas(fileNames.length) + " files to process.");

        long tokensRead = 0L;
        for (String fileName : fileNames) {
            String text = FileUtils.readTextFile(fileName, "utf-8");
            List<String> tokens = tokenizer.extractWords(text);
            for (String token : tokens) {
                // makePunktToken inspects the first character, so an empty
                // token cannot be classified; skip it rather than abort.
                if (token == null || token.isEmpty()) {
                    continue;
                }
                tokenCounter.count(PunktAbbreviationDetector.makePunktToken(token));
                ++tokensRead;
            }
        }

        // Elapsed time in whole seconds, rounded up.
        long processingTime = (System.currentTimeMillis() - startTime + 999L) / 1000L;
        printStream.println("\nProcessing completed in " + StringUtils.formatNumberWithCommas(processingTime) + " seconds.");
        printStream.println("\n" + StringUtils.formatNumberWithCommas(tokensRead) + " tokens extracted.");
        printStream.println();
        printStream.println("There were " + StringUtils.formatNumberWithCommas(tokenCounter.getCandidates().size()) + " candidates.");
        printStream.println("There are " + StringUtils.formatNumberWithCommas(tokenCounter.getAbbreviations().size()) + " abbreviations.");

        // Copy into a TreeSet so the abbreviations are handed to saveSet in sorted order.
        SetUtils.saveSet(new TreeSet<String>(tokenCounter.getAbbreviations()), abbrevsFileName, "utf-8");
    }

    /**
     * Creates a locale from a language code.
     *
     * @param languageCode the language code passed to the {@link Locale} constructor.
     * @return a locale constructed from {@code languageCode} alone.
     */
    public static Locale languageCodeToLocale(String languageCode) {
        return new Locale(languageCode);
    }

    /**
     * Wraps a token string in a {@link PunktToken} tagged with a token type.
     *
     * <p>The type is chosen by the first matching rule: the first character is
     * whitespace, the first character is a digit, the whole token satisfies
     * {@code CharUtils.isPunctuationOrSymbol}, otherwise the token is a word.</p>
     *
     * @param token the token text; must be non-empty because its first
     *              character is examined.
     * @return the token paired with its type.
     */
    public static PunktToken makePunktToken(String token) {
        char ch = token.charAt(0);
        if (Character.isWhitespace(ch)) {
            return new PunktToken(token, PunktTokenType.WHITESPACE);
        }
        if (Character.isDigit(ch)) {
            return new PunktToken(token, PunktTokenType.NUMBER);
        }
        if (CharUtils.isPunctuationOrSymbol(token)) {
            return new PunktToken(token, PunktTokenType.NONWORD);
        }
        return new PunktToken(token, PunktTokenType.WORD);
    }

    /** Non-public constructor; the class only exposes static members. */
    protected PunktAbbreviationDetector() {
    }
}

