/*
 * Decompiled with CFR 0.152.
 */
package gate.corpora;

import gate.Document;
import gate.DocumentFormat;
import gate.Resource;
import gate.corpora.DocumentContentImpl;
import gate.corpora.MimeType;
import gate.corpora.RepositioningInfo;
import gate.creole.ResourceInstantiationException;
import gate.util.DocumentFormatException;
import java.io.InputStream;
import java.net.URL;
import org.textmining.text.extraction.WordExtractor;

/**
 * Document format handler for Microsoft Word ({@code .doc}) files.
 *
 * <p>The text of the Word file is extracted with {@link WordExtractor} and
 * installed as the content of the document. Repositioning information is not
 * supported by this format.
 */
public class MSWordDocumentFormat
extends DocumentFormat {
    /**
     * Registers this handler for the {@code application/msword} MIME type and
     * the {@code doc} file suffix, and records that MIME type on this instance.
     *
     * @return this format instance
     * @throws ResourceInstantiationException declared by the signature; not
     *         thrown by the code in this method itself
     */
    public Resource init() throws ResourceInstantiationException {
        // The subtype must not carry leading whitespace: the registry key is
        // built as type + "/" + subtype, and a key of "application/ msword"
        // would not match a lookup for the standard "application/msword".
        MimeType mimeType = new MimeType("application", "msword");
        String mimeKey = mimeType.getType() + "/" + mimeType.getSubtype();

        // Registry maps are not declared in this class (presumably inherited
        // from DocumentFormat).
        mimeString2ClassHandlerMap.put(mimeKey, this);
        mimeString2mimeTypeMap.put(mimeKey, mimeType);
        suffixes2mimeTypeMap.put("doc", mimeType);

        this.setMimeType(mimeType);
        return this;
    }

    /**
     * Reports whether this format can produce repositioning information.
     *
     * @return {@link Boolean#FALSE} always
     */
    public Boolean supportsRepositioning() {
        // Use the shared constant rather than allocating a new Boolean.
        return Boolean.FALSE;
    }

    /**
     * Reads the Word file behind the document's source URL, extracts its text
     * and sets that text as the document content.
     *
     * @param document the document whose content is to be replaced; it must
     *        have a source URL
     * @throws DocumentFormatException if the document has no source URL, or if
     *         opening, reading or closing the stream or extracting the text
     *         fails (the underlying exception is attached as the cause)
     */
    public void unpackMarkup(Document document) throws DocumentFormatException {
        URL sourceUrl = document.getSourceUrl();
        if (sourceUrl == null) {
            throw new DocumentFormatException("Unpacking MS Word files requires an URL to the original content!");
        }
        String extractedText = null;
        try {
            InputStream inputStream = sourceUrl.openStream();
            try {
                WordExtractor wordExtractor = new WordExtractor();
                extractedText = wordExtractor.extractText(inputStream);
            }
            finally {
                // Always release the stream, even when extraction fails.
                inputStream.close();
            }
        }
        catch (Exception exception) {
            throw new DocumentFormatException("Exception for " + sourceUrl.toExternalForm(), exception);
        }
        document.setContent(new DocumentContentImpl(extractedText));
    }

    /**
     * Unpacks the document exactly as {@link #unpackMarkup(Document)} does.
     * Both repositioning arguments are ignored.
     *
     * @param document the document whose content is to be replaced
     * @param repositioningInfo ignored
     * @param repositioningInfo2 ignored
     * @throws DocumentFormatException under the same conditions as
     *         {@link #unpackMarkup(Document)}
     */
    public void unpackMarkup(Document document, RepositioningInfo repositioningInfo, RepositioningInfo repositioningInfo2) throws DocumentFormatException {
        this.unpackMarkup(document);
    }
}

