/*
 * Decompiled with CFR 0.152.
 */
package gate.corpora;

import gate.Document;
import gate.Resource;
import gate.TextualDocument;
import gate.corpora.DocumentImpl;
import gate.corpora.HtmlDocumentFormat;
import gate.corpora.MimeType;
import gate.corpora.RepositioningInfo;
import gate.creole.ResourceInstantiationException;
import gate.event.StatusListener;
import gate.html.NekoHtmlDocumentHandler;
import gate.util.DocumentFormatException;
import gate.util.Out;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLErrorHandler;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.html.HTMLConfiguration;

/*
 * This class specifies class file version 49.0 but uses Java 6 signatures.  Assumed Java 6.
 */
public class NekoHtmlDocumentFormat
extends HtmlDocumentFormat {
    private static final boolean DEBUG = false;
    private Set<String> ignorableTags = null;
    private static Pattern afterNewlinePattern = Pattern.compile("^", 8);

    public void setIgnorableTags(Set<String> set) {
        this.ignorableTags = set;
    }

    public Set<String> getIgnorableTags() {
        return this.ignorableTags;
    }

    @Override
    public Boolean supportsRepositioning() {
        return Boolean.TRUE;
    }

    @Override
    public void unpackMarkup(Document document) throws DocumentFormatException {
        this.unpackMarkup(document, null, null);
    }

    @Override
    public void unpackMarkup(Document document, RepositioningInfo repositioningInfo, RepositioningInfo repositioningInfo2) throws DocumentFormatException {
        if (document == null || document.getSourceUrl() == null && document.getContent() == null) {
            throw new DocumentFormatException("GATE document is null or no content found. Nothing to parse!");
        }
        StatusListener statusListener = new StatusListener(){

            public void statusChanged(String string) {
                NekoHtmlDocumentFormat.this.fireStatusChanged(string);
            }
        };
        boolean bl = NekoHtmlDocumentFormat.hasContentButNoValidUrl(document);
        NekoHtmlDocumentHandler nekoHtmlDocumentHandler = null;
        try {
            XMLInputSource xMLInputSource;
            HTMLConfiguration hTMLConfiguration = new HTMLConfiguration();
            hTMLConfiguration.setProperty("http://cyberneko.org/html/properties/names/elems", (Object)"lower");
            hTMLConfiguration.setProperty("http://cyberneko.org/html/properties/names/attrs", (Object)"lower");
            hTMLConfiguration.setFeature("http://cyberneko.org/html/features/augmentations", true);
            nekoHtmlDocumentHandler = new NekoHtmlDocumentHandler(document, null, this.ignorableTags);
            nekoHtmlDocumentHandler.addStatusListener(statusListener);
            nekoHtmlDocumentHandler.setRepositioningInfo(repositioningInfo);
            nekoHtmlDocumentHandler.setAmpCodingInfo(repositioningInfo2);
            int[] nArray = this.buildLineOffsets(document.getContent().toString());
            nekoHtmlDocumentHandler.setLineOffsets(nArray);
            hTMLConfiguration.setDocumentHandler((XMLDocumentHandler)nekoHtmlDocumentHandler);
            hTMLConfiguration.setErrorHandler((XMLErrorHandler)nekoHtmlDocumentHandler);
            if (bl) {
                xMLInputSource = new XMLInputSource(null, null, null, (Reader)new StringReader(document.getContent().toString()), null);
            } else if (document instanceof TextualDocument) {
                String string = ((TextualDocument)document).getEncoding();
                InputStreamReader inputStreamReader = new InputStreamReader(document.getSourceUrl().openStream(), string);
                xMLInputSource = new XMLInputSource(null, document.getSourceUrl().toString(), document.getSourceUrl().toString(), (Reader)inputStreamReader, string);
                hTMLConfiguration.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
            } else {
                xMLInputSource = new XMLInputSource(null, document.getSourceUrl().toString(), document.getSourceUrl().toString());
            }
            hTMLConfiguration.parse(xMLInputSource);
            ((DocumentImpl)document).setNextAnnotationId(nekoHtmlDocumentHandler.getCustomObjectsId());
        }
        catch (XNIException xNIException) {
            document.getFeatures().put("parsingError", Boolean.TRUE);
            Boolean bl2 = (Boolean)document.getFeatures().get("throwExceptionOnFormatError");
            if (bl2 != null && bl2.booleanValue()) {
                throw new DocumentFormatException((Exception)((Object)xNIException));
            }
            Out.println("Warning: Document remains unparsed. \n\n  Stack Dump: ");
            xNIException.printStackTrace(Out.getPrintWriter());
        }
        catch (IOException iOException) {
            throw new DocumentFormatException("I/O exception for " + document.getSourceUrl().toString(), iOException);
        }
        finally {
            if (nekoHtmlDocumentHandler != null) {
                nekoHtmlDocumentHandler.removeStatusListener(statusListener);
            }
        }
    }

    private int[] buildLineOffsets(String string) {
        Matcher matcher = afterNewlinePattern.matcher(string);
        int n = 0;
        while (matcher.find()) {
            ++n;
        }
        int[] nArray = new int[n];
        matcher.reset();
        for (int i = 0; i < nArray.length; ++i) {
            matcher.find();
            nArray[i] = matcher.start();
        }
        return nArray;
    }

    @Override
    public Resource init() throws ResourceInstantiationException {
        MimeType mimeType = new MimeType("text", "html");
        mimeString2ClassHandlerMap.put(mimeType.getType() + "/" + mimeType.getSubtype(), this);
        mimeString2mimeTypeMap.put(mimeType.getType() + "/" + mimeType.getSubtype(), mimeType);
        suffixes2mimeTypeMap.put("html", mimeType);
        suffixes2mimeTypeMap.put("htm", mimeType);
        magic2mimeTypeMap.put("<html", mimeType);
        this.setMimeType(mimeType);
        return this;
    }
}

