/*
 * Decompiled with CFR 0.152.
 */
package ru.ispras.texterra.core.nlp.datasets.lemma.penn;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.io.FileUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import ru.ispras.modis.utils.concurrent.StreamProcessor;
import ru.ispras.texterra.core.exceptions.TexterraSystemException;
import ru.ispras.texterra.core.nlp.datamodel.IAnnotation;
import ru.ispras.texterra.core.nlp.datamodel.INLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.Lemma;
import ru.ispras.texterra.core.nlp.datamodel.NLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.Token;
import ru.ispras.texterra.core.nlp.datamodel.pos.IPOSTagFactory;
import ru.ispras.texterra.core.nlp.datamodel.pos.POSToken;
import ru.ispras.texterra.core.nlp.datamodel.pos.morph.IMorphProperties;
import ru.ispras.texterra.core.nlp.datasets.IDatasetReader;

/**
 * Dataset reader that turns every {@code *.xml} file found (recursively) under a
 * corpus root directory into an {@link INLPDocument} annotated with tokens,
 * part-of-speech tags and lemmas.
 *
 * <p>Each {@code <fs>} element of an input file is treated as one word. Its element
 * children are read as features through their {@code name} and {@code value}
 * attributes; the features named {@code string}, {@code msd} and {@code base} supply
 * the word form, the POS tag and the lemma respectively. Other features are ignored.
 */
public class PennLemmaDatasetReader
implements IDatasetReader {
    /** Tag name of the XML element that describes a single word. */
    private static final String WORD_ELEMENT_TAG = "fs";
    /** Attribute holding the feature name on a child of a word element. */
    private static final String NAME_ATTRIBUTE = "name";
    /** Attribute holding the feature value on a child of a word element. */
    private static final String VALUE_ATTRIBUTE = "value";

    private final File base;
    private final IPOSTagFactory<? extends IMorphProperties> posTagFactory;

    /**
     * Creates a reader.
     *
     * @param corpusRoot directory that is searched recursively for {@code .xml} files
     * @param posTagFactory factory used to convert the {@code msd} feature values into POS tags
     */
    public PennLemmaDatasetReader(File corpusRoot, IPOSTagFactory<? extends IMorphProperties> posTagFactory) {
        this.base = corpusRoot;
        this.posTagFactory = posTagFactory;
    }

    /**
     * Lists all {@code .xml} files below the corpus root and hands them to a
     * {@link StreamProcessor}, which maps each file to a parsed document.
     *
     * @return the documents produced by the stream processor, one per corpus file
     */
    @Override
    public Iterable<INLPDocument> read() {
        Collection<File> files = FileUtils.listFiles(this.base, new String[]{"xml"}, true);
        // The raw casts are kept on purpose: the generic signature of
        // StreamProcessor.transform is not visible from this file.
        return new StreamProcessor().transform((Iterable)files, (StreamProcessor.IElementTransformer)new StreamProcessor.IElementTransformer<File, INLPDocument>(){

            @Override
            public INLPDocument transformElement(File input) {
                return PennLemmaDatasetReader.this.parseFile(input);
            }
        });
    }

    /**
     * Parses one corpus file into a document.
     *
     * @param file XML file to parse
     * @return the document built from the file content
     * @throws TexterraSystemException if the file cannot be read or is not well-formed XML
     * @throws IllegalStateException if the XML content does not have the expected structure
     */
    private INLPDocument parseFile(File file) {
        try {
            // NOTE(review): the parser is created with default settings. If corpus files can
            // come from an untrusted source, consider disabling DTDs / external entities here.
            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = factory.newDocumentBuilder();
            Document doc = builder.parse(file);
            return this.parseDocument(doc);
        }
        catch (IOException | ParserConfigurationException | SAXException e) {
            throw new TexterraSystemException(e);
        }
    }

    /**
     * Collects word forms, POS tags and lemmas from all word elements of the DOM tree
     * and builds the annotated document from them.
     *
     * @param doc parsed XML document
     * @return the annotated document
     * @throws IllegalStateException if a feature element lacks its {@code name} or
     *         {@code value} attribute, or if after any word element the numbers of
     *         collected word forms, POS tags and lemmas differ
     */
    private INLPDocument parseDocument(Document doc) {
        NodeList wordNodes = doc.getElementsByTagName(WORD_ELEMENT_TAG);
        int wordCount = wordNodes.getLength();
        List<String> wordForms = new ArrayList<String>(wordCount);
        List<IMorphProperties> posTags = new ArrayList<IMorphProperties>(wordCount);
        List<String> lemmas = new ArrayList<String>(wordCount);
        for (int i = 0; i < wordCount; ++i) {
            NodeList features = wordNodes.item(i).getChildNodes();
            for (int j = 0; j < features.getLength(); ++j) {
                Node feature = features.item(j);
                // Skip whitespace text, comments and other non-element children.
                if (feature.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }
                String value = PennLemmaDatasetReader.requireAttribute(feature, VALUE_ATTRIBUTE);
                String name = PennLemmaDatasetReader.requireAttribute(feature, NAME_ATTRIBUTE);
                switch (name) {
                    case "string": {
                        wordForms.add(value);
                        break;
                    }
                    case "msd": {
                        posTags.add(this.posTagFactory.getPOSTag(value));
                        break;
                    }
                    case "base": {
                        lemmas.add(value);
                        break;
                    }
                    default: {
                        // Features with any other name are not used by this reader.
                        break;
                    }
                }
            }
            // The three lists must grow in lock-step; check after every word so that the
            // first inconsistent word element is the one that gets reported.
            if (wordForms.size() != posTags.size() || wordForms.size() != lemmas.size()) {
                throw new IllegalStateException("Broken input file. After word element " + i
                        + " the feature counts differ: string=" + wordForms.size()
                        + ", msd=" + posTags.size() + ", base=" + lemmas.size());
            }
        }
        return this.generateDocument(wordForms, posTags, lemmas);
    }

    /**
     * Returns the value of a mandatory attribute of an element node.
     *
     * @param element element node to read from
     * @param attributeName name of the attribute
     * @return the attribute value
     * @throws IllegalStateException if the element has no such attribute
     */
    private static String requireAttribute(Node element, String attributeName) {
        // getNamedItem returns null for a missing attribute; report it as a broken
        // input file instead of failing later with a NullPointerException.
        Node attribute = element.getAttributes().getNamedItem(attributeName);
        if (attribute == null) {
            throw new IllegalStateException("Broken input file. Element <" + element.getNodeName()
                    + "> has no '" + attributeName + "' attribute.");
        }
        return attribute.getNodeValue();
    }

    /**
     * Builds a document whose text is the space-separated word forms and attaches one
     * {@link Token}, {@link POSToken} and {@link Lemma} annotation per word.
     *
     * @param wordForms surface forms of the words, in document order
     * @param posTags POS tags, parallel to {@code wordForms}
     * @param lemmas lemmas, parallel to {@code wordForms}
     * @return the annotated document
     * @throws IllegalArgumentException if the three lists differ in size
     * @throws IllegalStateException if a created token does not yield its word form
     */
    private INLPDocument generateDocument(List<String> wordForms, List<IMorphProperties> posTags, List<String> lemmas) {
        if (wordForms.size() != posTags.size() || wordForms.size() != lemmas.size()) {
            throw new IllegalArgumentException("Word forms, POS tags and lemmas must have equal sizes but were "
                    + wordForms.size() + ", " + posTags.size() + " and " + lemmas.size());
        }
        String text = this.generateDocumentText(wordForms);
        NLPDocument doc = new NLPDocument(text);
        List<Token> tokens = new ArrayList<Token>(wordForms.size());
        List<POSToken> posTokens = new ArrayList<POSToken>(posTags.size());
        List<Lemma> lemmaAnnotations = new ArrayList<Lemma>(lemmas.size());
        int start = 0;
        for (int i = 0; i < wordForms.size(); ++i) {
            String wordForm = wordForms.get(i);
            int end = start + wordForm.length();
            Token token = new Token(doc, start, end);
            // Sanity check that the computed offsets line up with the generated text.
            if (!((String)token.getValue()).equals(wordForm)) {
                throw new IllegalStateException("Token at [" + start + ", " + end
                        + ") does not match word form '" + wordForm + "'");
            }
            tokens.add(token);
            posTokens.add(new POSToken((IAnnotation)token, posTags.get(i)));
            lemmaAnnotations.add(new Lemma((IAnnotation)token, lemmas.get(i)));
            // Skip the word and the single space appended after it in the text.
            start = end + 1;
        }
        return doc.withAnnotations(tokens).withAnnotations(posTokens).withAnnotations(lemmaAnnotations);
    }

    /**
     * Concatenates the tokens, appending a single space after each one (including the
     * last, so a non-empty result ends with a space).
     *
     * @param tokens word forms in document order
     * @return the document text
     */
    private String generateDocumentText(List<String> tokens) {
        StringBuilder builder = new StringBuilder();
        for (String token : tokens) {
            builder.append(token).append(' ');
        }
        return builder.toString();
    }
}

