/*
 * Decompiled with CFR 0.152.
 */
package ru.ispras.texterra.core.nlp.datasets.common.synTagRus;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import ru.ispras.texterra.core.nlp.datamodel.IAnnotation;
import ru.ispras.texterra.core.nlp.datamodel.INLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.Lemma;
import ru.ispras.texterra.core.nlp.datamodel.NLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.Sentence;
import ru.ispras.texterra.core.nlp.datamodel.Token;
import ru.ispras.texterra.core.nlp.datamodel.pos.IPOSTagFactory;
import ru.ispras.texterra.core.nlp.datamodel.pos.POSToken;
import ru.ispras.texterra.core.nlp.datamodel.pos.POSTokenFactory;
import ru.ispras.texterra.core.nlp.datasets.IDatasetReader;

/**
 * Reads an XML file and turns every {@code S} element found directly under the
 * first {@code body} element into a separate {@link INLPDocument}.
 *
 * <p>Each produced document carries one {@link Sentence} annotation plus
 * {@link Token}, {@link POSToken} and {@link Lemma} annotations for every
 * non-whitespace token. Tokens come from two sources:
 * <ul>
 *   <li>{@code W} child elements (unless their {@code NODETYPE} attribute is
 *       {@code FANTOM}), whose text, {@code FEAT} and {@code LEMMA} attributes
 *       are used;</li>
 *   <li>text nodes between {@code W} elements, where every non-whitespace
 *       character becomes its own token with the feature string
 *       {@code "PUNCT"} and an empty lemma.</li>
 * </ul>
 */
public class SynTagRusPerSentenceFileParser implements IDatasetReader {
    private static final String BODY_TAG_NAME = "body";
    private static final String SENTENCE_TAG_NAME = "S";
    private static final String WORD_TAG_NAME = "W";
    private static final String FEATURES_ATTRIBUTE = "FEAT";
    private static final String LEMMA_ATTRIBUTE = "LEMMA";
    private static final String NODE_TYPE_ATTRIBUTE = "NODETYPE";
    private static final String FANTOM_NODE_TYPE = "FANTOM";
    private static final String PUNCTUATION_FEATURE = "PUNCT";
    private static final String SPACE = " ";

    private final File file;
    private final POSTokenFactory posTokenFactory;

    /**
     * @param file          the XML file to read
     * @param posTagFactory factory handed to the {@link POSTokenFactory} that
     *                      builds the POS annotations
     */
    public SynTagRusPerSentenceFileParser(File file, IPOSTagFactory<?> posTagFactory) {
        this.file = file;
        this.posTokenFactory = new POSTokenFactory(posTagFactory);
    }

    /**
     * Parses the file and returns one document per sentence element.
     *
     * @return the parsed documents, in file order
     * @throws RuntimeException      if the file cannot be read or parsed as XML
     * @throws IllegalStateException if the file has no {@code body} element or
     *                               token offsets cannot be reconciled
     */
    @Override
    public Iterable<INLPDocument> read() {
        try {
            // NOTE(review): the parser runs with the factory's default settings
            // (DTDs / external entities are not restricted here) - confirm the
            // input files are trusted.
            DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
            DocumentBuilder builder = dbf.newDocumentBuilder();
            Document doc = builder.parse(this.file);
            return this.parseDocument(doc);
        }
        catch (IOException | ParserConfigurationException | SAXException e) {
            throw new RuntimeException("Failed to read dataset file " + this.file, e);
        }
    }

    /** Locates the body element and converts each of its sentence children. */
    private Iterable<INLPDocument> parseDocument(Document doc) {
        Node body = this.getDocumentBody(doc.getChildNodes());
        if (body == null) {
            // Previously this surfaced as a bare NullPointerException.
            throw new IllegalStateException(
                    "No <" + BODY_TAG_NAME + "> element found in " + this.file);
        }
        List<Node> sentences = this.getSentences(body.getChildNodes());
        List<INLPDocument> result = new ArrayList<>(sentences.size());
        for (Node sentence : sentences) {
            result.add(this.parseSentence(sentence));
        }
        return result;
    }

    /** Returns the element nodes named {@code S} among {@code nodes} (no recursion). */
    private List<Node> getSentences(NodeList nodes) {
        List<Node> result = new ArrayList<>();
        for (int i = 0; i < nodes.getLength(); ++i) {
            Node node = nodes.item(i);
            if (node.getNodeType() == Node.ELEMENT_NODE
                    && node.getNodeName().equals(SENTENCE_TAG_NAME)) {
                result.add(node);
            }
        }
        return result;
    }

    /**
     * Converts one sentence element into a document.
     *
     * <p>The three lists built here are parallel: index {@code i} of each one
     * describes the same token.
     */
    private INLPDocument parseSentence(Node sentenceNode) {
        List<String> tokens = new ArrayList<>();
        List<String> tokenFeatures = new ArrayList<>();
        List<String> lemmas = new ArrayList<>();
        NodeList children = sentenceNode.getChildNodes();
        for (int i = 0; i < children.getLength(); ++i) {
            Node child = children.item(i);
            short type = child.getNodeType();
            if (type == Node.ELEMENT_NODE) {
                this.collectWord(child, tokens, tokenFeatures, lemmas);
            } else if (type == Node.TEXT_NODE) {
                this.collectInterWordText(child.getNodeValue(), tokens, tokenFeatures, lemmas);
            }
        }
        if (tokens.size() != tokenFeatures.size() || tokens.size() != lemmas.size()) {
            throw new IllegalStateException("Token, feature and lemma lists differ in size: "
                    + tokens.size() + ", " + tokenFeatures.size() + ", " + lemmas.size());
        }
        return this.buildDocument(tokens, tokenFeatures, lemmas);
    }

    /**
     * Appends the text, features and lemma of a {@code W} element to the
     * parallel lists. Other elements, {@code FANTOM} words and words without a
     * text node are ignored.
     */
    private void collectWord(Node element, List<String> tokens,
                             List<String> tokenFeatures, List<String> lemmas) {
        if (!element.getNodeName().equals(WORD_TAG_NAME) || this.isFantom(element)) {
            return;
        }
        String wordText = this.getWordText(element);
        if (wordText == null) {
            // A word with no text contributes nothing to the sentence. Storing
            // null here used to inject the literal "null" into the document
            // text and shift every following token offset.
            return;
        }
        tokens.add(wordText);
        tokenFeatures.add(this.getWordFeatures(element));
        lemmas.add(this.getWordLemma(element));
    }

    /**
     * Tokenises the raw text found between word elements.
     *
     * <p>Leading/trailing whitespace of {@code text} becomes a single space
     * token, unless the sentence is still empty or the previous token already
     * is a space. Every other character becomes a one-character token tagged
     * {@code "PUNCT"} with an empty lemma. Whitespace separating two
     * non-whitespace runs inside the same text node produces no token.
     */
    private void collectInterWordText(String text, List<String> tokens,
                                      List<String> tokenFeatures, List<String> lemmas) {
        // Limit -1 keeps leading and trailing empty strings, which is how
        // whitespace at the edges of the text node is detected.
        for (String textUnit : text.split("\\s+", -1)) {
            if (textUnit.isEmpty()) {
                boolean spaceAllowed = !tokens.isEmpty()
                        && !SPACE.equals(tokens.get(tokens.size() - 1));
                if (spaceAllowed) {
                    tokens.add(SPACE);
                    // Space tokens never get POS/lemma annotations.
                    tokenFeatures.add(null);
                    lemmas.add(null);
                }
                continue;
            }
            for (int j = 0; j < textUnit.length(); ++j) {
                tokens.add(Character.toString(textUnit.charAt(j)));
                tokenFeatures.add(PUNCTUATION_FEATURE);
                lemmas.add("");
            }
        }
    }

    /**
     * Concatenates the tokens into the document text and attaches sentence,
     * token, POS and lemma annotations. Space tokens advance the offset but
     * are not annotated.
     *
     * @throws IllegalStateException if a token annotation's text does not match
     *                               the token it was created for
     */
    private INLPDocument buildDocument(List<String> tokens, List<String> tokenFeatures,
                                       List<String> lemmas) {
        String docText = this.generateText(tokens);
        List<Token> tokenAnnotations = new ArrayList<>();
        List<POSToken> posTokenAnnotations = new ArrayList<>();
        List<Lemma> lemmaAnnotations = new ArrayList<>();
        int start = 0;
        for (int i = 0; i < tokens.size(); ++i) {
            String token = tokens.get(i);
            int end = start + token.length();
            if (!token.equals(SPACE)) {
                Token tokenAnnotation = new Token(docText, start, end);
                if (!tokenAnnotation.getText().equals(token)) {
                    throw new IllegalStateException("Token offsets [" + start + ", " + end
                            + ") do not yield the expected token text '" + token + "'");
                }
                tokenAnnotations.add(tokenAnnotation);
                String tokenFeature = tokenFeatures.get(i);
                String lemma = lemmas.get(i);
                posTokenAnnotations.add(this.posTokenFactory.createPOSToken(tokenAnnotation, tokenFeature));
                lemmaAnnotations.add(new Lemma((IAnnotation)tokenAnnotation, lemma));
            }
            start = end;
        }
        INLPDocument doc = new NLPDocument(docText);
        doc = doc.withAnnotations(Arrays.asList(new Sentence(doc)));
        doc = doc.withAnnotations(tokenAnnotations);
        doc = doc.withAnnotations(posTokenAnnotations);
        doc = doc.withAnnotations(lemmaAnnotations);
        return doc;
    }

    /** Joins the tokens without any separator; null entries are skipped. */
    private String generateText(List<String> tokens) {
        StringBuilder builder = new StringBuilder();
        for (String token : tokens) {
            // StringBuilder.append(null) would write the four characters "null".
            if (token != null) {
                builder.append(token);
            }
        }
        return builder.toString();
    }

    /** Returns the value of the word's first text child, or {@code null} if it has none. */
    private String getWordText(Node word) {
        Node text = this.findElement(word.getChildNodes(), Node.TEXT_NODE);
        return text == null ? null : text.getNodeValue();
    }

    /** Returns the first node of the given DOM node type, or {@code null} if none matches. */
    private Node findElement(NodeList nodes, short nodeType) {
        for (int i = 0; i < nodes.getLength(); ++i) {
            Node child = nodes.item(i);
            if (child.getNodeType() == nodeType) {
                return child;
            }
        }
        return null;
    }

    /** Returns the {@code FEAT} attribute value, or an empty string when absent. */
    private String getWordFeatures(Node word) {
        Node features = this.findAttribute(word, FEATURES_ATTRIBUTE);
        return features == null ? "" : features.getNodeValue();
    }

    /** Returns the {@code LEMMA} attribute value, or an empty string when absent. */
    private String getWordLemma(Node word) {
        Node lemma = this.findAttribute(word, LEMMA_ATTRIBUTE);
        return lemma == null ? "" : lemma.getNodeValue();
    }

    /** Tells whether the word's {@code NODETYPE} attribute equals {@code FANTOM}. */
    private boolean isFantom(Node word) {
        Node nodeType = this.findAttribute(word, NODE_TYPE_ATTRIBUTE);
        return nodeType != null && FANTOM_NODE_TYPE.equals(nodeType.getNodeValue());
    }

    /** Looks up an attribute by name; returns {@code null} when the node lacks it. */
    private Node findAttribute(Node node, String attributeName) {
        NamedNodeMap attributes = node.getAttributes();
        // getAttributes() is null for anything that is not an element.
        return attributes == null ? null : attributes.getNamedItem(attributeName);
    }

    /**
     * Depth-first search for the first element named {@code body}.
     *
     * @return the body element, or {@code null} if the subtree has none
     */
    private Node getDocumentBody(NodeList nodes) {
        for (int i = 0; i < nodes.getLength(); ++i) {
            Node node = nodes.item(i);
            if (node.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            if (node.getNodeName().equals(BODY_TAG_NAME)) {
                return node;
            }
            Node childBodyNode = this.getDocumentBody(node.getChildNodes());
            if (childBodyNode != null) {
                return childBodyNode;
            }
        }
        return null;
    }
}

