/*
 * Decompiled with CFR 0.152.
 */
package ru.ispras.texterra.core.nlp.datasets.lemma.ruscorpora;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import javax.xml.stream.FactoryConfigurationError;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import ru.ispras.modis.utils.concurrent.StreamProcessor;
import ru.ispras.texterra.core.nlp.annotators.lemmatizer.wordProcessors.CompositeProcessor;
import ru.ispras.texterra.core.nlp.annotators.lemmatizer.wordProcessors.PunctuationProcessor;
import ru.ispras.texterra.core.nlp.annotators.lemmatizer.wordProcessors.ToLowerCaseProcessor;
import ru.ispras.texterra.core.nlp.datamodel.INLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.Lemma;
import ru.ispras.texterra.core.nlp.datamodel.NLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.Token;
import ru.ispras.texterra.core.nlp.datamodel.pos.IPOSTagFactory;
import ru.ispras.texterra.core.nlp.datamodel.pos.POSToken;
import ru.ispras.texterra.core.nlp.datamodel.pos.POSTokenFactory;
import ru.ispras.texterra.core.nlp.datasets.IDatasetReader;

/**
 * Reads a directory of XML files and turns every file into one {@link INLPDocument}
 * annotated with tokens, lemmas and part-of-speech tokens.
 *
 * <p>Within each file the reader looks at two element names: {@code ana}, whose
 * {@code lex} attribute supplies the lemma and whose {@code gr} attribute supplies the
 * grammatical tag string, and {@code f}, whose text content supplies the word form.
 * The document text is rebuilt by writing every word followed by a single space.
 */
public class LemmaGenDatasetReader
implements IDatasetReader {
    /** Tag assigned to a word for which no lemma or no tag has been collected. */
    private static final String FALLBACK_TAG = "PUNCT";

    private final File lemmaDataSet;
    private final POSTokenFactory posTokenFactory;

    /**
     * @param lemmaDataSet  directory whose entries are read as dataset files
     * @param posTagFactory factory handed to the {@link POSTokenFactory} used to build POS tokens
     */
    public LemmaGenDatasetReader(File lemmaDataSet, IPOSTagFactory<?> posTagFactory) {
        this.lemmaDataSet = lemmaDataSet;
        this.posTokenFactory = new POSTokenFactory(posTagFactory);
    }

    /**
     * Converts every entry of the dataset directory into a document.
     *
     * @return the documents produced by the {@link StreamProcessor}, one per directory entry
     * @throws IllegalStateException if the dataset path cannot be listed
     * @throws RuntimeException      wrapping any I/O or XML failure raised while reading a file
     */
    @Override
    public Iterable<INLPDocument> read() {
        // File.listFiles() returns null (rather than throwing) when the path is not a
        // directory or an I/O error occurs; report that explicitly instead of an NPE.
        File[] files = this.lemmaDataSet.listFiles();
        if (files == null) {
            throw new IllegalStateException("Cannot list dataset directory: " + this.lemmaDataSet);
        }
        return new StreamProcessor().transform(Arrays.asList(files), (StreamProcessor.IElementTransformer)new StreamProcessor.IElementTransformer<File, INLPDocument>(){

            @Override
            public INLPDocument transformElement(File file) {
                return LemmaGenDatasetReader.this.read(file);
            }
        });
    }

    /**
     * Builds one annotated document from a single dataset file.
     *
     * @param file file to parse
     * @return a document whose text is the space-joined words and which carries one token,
     *         one lemma and one POS token per word
     * @throws RuntimeException wrapping any I/O or XML failure
     */
    private INLPDocument read(File file) {
        List<String> lemmasAsString = new ArrayList<>();
        List<String> tagsAsString = new ArrayList<>();
        List<String> words = new ArrayList<>();
        List<Lemma> lemmas = new ArrayList<>();
        List<Token> tokens = new ArrayList<>();
        List<POSToken> tags = new ArrayList<>();
        try {
            this.collect(words, lemmasAsString, tagsAsString, file);
            INLPDocument doc = new NLPDocument(this.getText(words));
            int start = 0;
            for (int i = 0; i < words.size(); ++i) {
                int end = start + words.get(i).length();
                tokens.add(new Token(doc, start, end));
                lemmas.add(new Lemma(doc, start, end, this.preProcessLemma(lemmasAsString.get(i))));
                tags.add(this.posTokenFactory.createPOSToken(new Token(doc, start, end), tagsAsString.get(i)));
                // Skip the single space that getText() writes after every word.
                start = end + 1;
            }
            doc = doc.withAnnotations(tokens);
            doc = doc.withAnnotations(lemmas);
            doc = doc.withAnnotations(tags);
            return doc;
        }
        catch (IOException | FactoryConfigurationError | XMLStreamException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Runs the lemma through a {@link PunctuationProcessor} followed by a
     * {@link ToLowerCaseProcessor}.
     */
    private String preProcessLemma(String lemma) {
        CompositeProcessor p = new CompositeProcessor(new PunctuationProcessor(), new ToLowerCaseProcessor());
        return p.getLemma(lemma);
    }

    /**
     * Concatenates the words, writing one space after each of them (including the last),
     * which is the layout the offset arithmetic in {@link #read(File)} relies on.
     */
    private String getText(List<String> words) {
        StringBuilder text = new StringBuilder();
        for (String word : words) {
            text.append(word).append(' ');
        }
        return text.toString();
    }

    /**
     * Streams through the XML file and appends one entry to each of the three output
     * lists per collected word, keeping the lists index-aligned.
     *
     * <p>A word is recorded at the first end-element event seen after an {@code f}
     * element has been read. If no lemma or no tag is pending at that point, the word
     * itself is used as lemma and the tag is {@value #FALLBACK_TAG}.
     *
     * @param words  receives the word forms
     * @param lemmas receives the lemma of each word
     * @param tags   receives the tag of each word
     * @param file   file to parse; decoded as UTF-8
     */
    private void collect(List<String> words, List<String> lemmas, List<String> tags, File file) throws IOException, XMLStreamException, FactoryConfigurationError {
        try (BufferedReader in = new BufferedReader(new InputStreamReader((InputStream)new FileInputStream(file), "UTF-8"))) {
            // XMLStreamReader is not AutoCloseable, so it cannot be a try-with-resources
            // resource; its close() also leaves the underlying reader open, which is why
            // the BufferedReader is managed separately above.
            XMLStreamReader reader = XMLInputFactory.newFactory().createXMLStreamReader(in);
            try {
                String lemma = null;
                String word = null;
                String tag = null;
                while (reader.hasNext()) {
                    int event = reader.next();
                    if (event == XMLStreamReader.START_ELEMENT) {
                        String name = reader.getLocalName();
                        if ("ana".equals(name)) {
                            lemma = reader.getAttributeValue(null, "lex");
                            tag = this.getTag(reader.getAttributeValue(null, "gr"));
                        } else if ("f".equals(name)) {
                            word = reader.getElementText();
                        }
                    } else if (event == XMLStreamReader.END_ELEMENT && word != null) {
                        if (lemma == null || tag == null) {
                            lemma = word;
                            tag = FALLBACK_TAG;
                        }
                        words.add(word);
                        lemmas.add(lemma);
                        tags.add(tag);
                        word = null;
                        lemma = null;
                        tag = null;
                    }
                }
            }
            finally {
                reader.close();
            }
        }
    }

    /**
     * Extracts the leading part of a grammatical tag string: everything before the first
     * {@code ','}, further cut at the first {@code '='}.
     *
     * @param fullTag value of the {@code gr} attribute, or {@code null} if it is absent
     * @return the leading tag, or {@code null} when {@code fullTag} is {@code null} so that
     *         the caller's missing-tag fallback applies
     */
    private String getTag(String fullTag) {
        if (fullTag == null) {
            return null;
        }
        String head = fullTag;
        int comma = head.indexOf(',');
        if (comma >= 0) {
            head = head.substring(0, comma);
        }
        int equals = head.indexOf('=');
        if (equals >= 0) {
            head = head.substring(0, equals);
        }
        return head;
    }
}

