/*
 * Decompiled with CFR 0.152.
 */
package edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer;

import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.AbstractPreTokenizer;
import edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer.PreTokenizer;
import edu.northwestern.at.utils.PatternReplacer;

/**
 * Pretokenizer variant that installs its own "always separator" pattern and
 * applies three extra replacement passes after the superclass pretokenization.
 *
 * <p>The class name suggests it targets EEBO (Early English Books Online)
 * texts; that is an inference from the name only.
 *
 * <p>NOTE(review): all replacement strings use {@code $1}; this documentation
 * assumes {@code PatternReplacer} follows {@code java.util.regex} group
 * substitution semantics - confirm against {@code PatternReplacer}.
 */
public class EEBOPreTokenizer extends AbstractPreTokenizer implements PreTokenizer {

    /**
     * Regular expression for text that is always padded with spaces.
     *
     * <p>It matches any one of:
     * <ul>
     *   <li>a run of two or more hyphens;</li>
     *   <li>a run of three or more periods;</li>
     *   <li>a single character from a class containing parentheses, square
     *       brackets, the double quote, {@code ; : / = < >}, U+00B6, U+201C,
     *       U+201D, U+2014, U+00A6, U+2758, the General Punctuation block
     *       (minus <code>{ } |</code>, U+2022, U+2032, U+2033, U+2034, U+2018,
     *       U+2019, U+2010, U+2011 and U+2026), the Letterlike Symbols,
     *       Mathematical Operators and Miscellaneous Technical blocks, the
     *       Geometric Shapes block (minus U+25CF and U+25CA), and the
     *       Miscellaneous Symbols, Dingbats and Alphabetic Presentation Forms
     *       blocks.</li>
     * </ul>
     */
    protected static final String EEBOAlwaysSeparators = "((-{2,})|(\\.{3,})|[\\(\\)\\[\\]\";:/=\u00b6<>\u201c\u201d\u2014\u00a6\u2758[\\p{InGeneralPunctuation}&&[^\\{\\}\\|\u2022\u2032\u2033\u2034\u2018\u2019\u2010\u2011\u2026]]\\p{InLetterlikeSymbols}\\p{InMathematicalOperators}\\p{InMiscellaneousTechnical}[\\p{InGeometricShapes}&&[^\u25cf\u25ca]]\\p{InMiscellaneousSymbols}\\p{InDingbats}\\p{InAlphabeticPresentationForms}])";

    /**
     * Pads a gap marker with a space on each side. A gap marker here is
     * U+3008, followed by one or more of U+25CA, U+2026 or a vertical bar,
     * followed by U+3009.
     *
     * <p>NOTE(review): the vertical bar sits inside a character class, so it
     * is matched literally rather than acting as alternation. This may be
     * unintentional - confirm before changing, since removing it alters what
     * is matched.
     */
    protected static final PatternReplacer wordOrSpanGapReplacer =
        new PatternReplacer("(\u3008[\u25ca|\u2026]+\u3009)", " $1 ");

    /** Pads each pair of consecutive backticks with a space on each side. */
    protected static final PatternReplacer doubleBackTicksReplacer =
        new PatternReplacer("(``)", " $1 ");

    /**
     * Inserts a space between a backtick and an immediately following
     * uppercase ASCII letter ({@code A-Z}).
     */
    protected static final PatternReplacer singleBackTicksReplacer =
        new PatternReplacer("`([A-Z])", "` $1");

    /**
     * Creates the pretokenizer and points {@code alwaysSeparatorsReplacer} at
     * a replacer built from {@link #EEBOAlwaysSeparators}, which pads each
     * match with a space on each side.
     *
     * <p>{@code alwaysSeparatorsReplacer} is not declared in this class;
     * presumably it is inherited from {@code AbstractPreTokenizer} - verify
     * there.
     */
    public EEBOPreTokenizer() {
        alwaysSeparatorsReplacer = new PatternReplacer(EEBOAlwaysSeparators, " $1 ");
    }

    /**
     * Pretokenizes a line of text.
     *
     * <p>The superclass pretokenization runs first. Its output is then passed,
     * in order, through the gap-marker, double-backtick and single-backtick
     * replacers. The order is preserved because each pass sees the output of
     * the previous one.
     *
     * @param line the text to pretokenize
     * @return the text after the superclass pass and the three replacer passes
     */
    @Override
    public String pretokenize(String line) {
        String spaced = wordOrSpanGapReplacer.replace(super.pretokenize(line));
        spaced = doubleBackTicksReplacer.replace(spaced);
        return singleBackTicksReplacer.replace(spaced);
    }
}

