/*
 * Decompiled with CFR 0.152.
 */
package edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer;

import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.AbstractPreTokenizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.PreTokenizer;
import edu.northwestern.at.utils.PatternReplacer;
import java.util.regex.Pattern;

public class EccoPreTokenizer
extends AbstractPreTokenizer
implements PreTokenizer {
    protected static final String EccoAlwaysSeparators = "((-{2,})|(\\.{3,})|[\\(\\)\\[\\]\";:/=`\u00b6<>\u00a1\u00bf\u00ab\u00bb\u201c\u201d\u00a6\u2758[\\p{InGeneralPunctuation}&&[^\\{\\}\\|\u2022\u2032\u2033\u2034\u2018\u2019\u2010\u2011\u2026\u2042\u2020\u2021\u00a7]]\\p{InLetterlikeSymbols}\\p{InMathematicalOperators}\\p{InMiscellaneousTechnical}[\\p{InGeometricShapes}&&[^\u25cf\u25ca]]\\p{InMiscellaneousSymbols}\\p{InDingbats}\\p{InAlphabeticPresentationForms}])";
    protected static final PatternReplacer wordOrSpanGapReplacer = new PatternReplacer("(\u3008[\u25ca|\u2026]+\u3009)", " $1 ");
    protected static final PatternReplacer doubleBackTicksReplacer = new PatternReplacer("(``)", " $1 ");
    protected static final PatternReplacer singleBackTicksReplacer = new PatternReplacer("`([A-Z])", "` $1");

    public EccoPreTokenizer() {
        alwaysSeparatorsReplacer = new PatternReplacer(EccoAlwaysSeparators, " $1 ");
    }

    @Override
    public String pretokenize(String line) {
        String result = super.pretokenize(line);
        result = wordOrSpanGapReplacer.replace(result);
        result = result.replaceAll("(\\s|\\.|\\?|!)\u2014", "$1 \u2014");
        result = result.replaceAll("([\\p{L}\\-0-9\\'\u2011\u25cf]{3,})\u2014", "$1 \u2014");
        result = result.replaceAll("\u2014([\\p{L}\\-0-9\\'\u2011\u25cf]{3,})", "\u2014 $1");
        result = result.replaceAll("([0-9]+)\u2014", "$1 \u2014");
        result = doubleBackTicksReplacer.replace(result);
        result = singleBackTicksReplacer.replace(result);
        return result;
    }
}

