/*
 * Decompiled with CFR 0.152.
 */
package ru.ispras.texterra.core.nlp.annotators.token.opennlp;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
import ru.ispras.texterra.core.exceptions.TexterraSystemException;
import ru.ispras.texterra.core.nlp.annotators.IAnnotatorTrainer;
import ru.ispras.texterra.core.nlp.annotators.token.opennlp.OpenNLPTokenizer;
import ru.ispras.texterra.core.nlp.datamodel.FastInRelationNLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.IAnnotation;
import ru.ispras.texterra.core.nlp.datamodel.INLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.IToken;
import ru.ispras.texterra.core.nlp.datamodel.Sentence;
import ru.ispras.texterra.core.nlp.datamodel.relations.ContainsAnnotationRelation;
import ru.ispras.texterra.core.nlp.datamodel.relations.ContainsAnnotationRelationCacheBuilder;
import ru.ispras.texterra.core.nlp.datamodel.relations.IAnnotationRelation;
import ru.ispras.texterra.core.nlp.datamodel.relations.IAnnotationRelationCacheBuilder;
import ru.ispras.texterra.utils.language.ITexterraLanguage;

public class OpenNLPTokenizerTrainer
implements IAnnotatorTrainer<OpenNLPTokenizer> {
    private static IAnnotationRelation contains = new ContainsAnnotationRelation();
    private final ITexterraLanguage language;
    private final Pattern alphanumericPattern;

    public OpenNLPTokenizerTrainer(ITexterraLanguage language) {
        this(language, language.getAlphabet());
    }

    public OpenNLPTokenizerTrainer(ITexterraLanguage language, Pattern alphanumericWordPattern) {
        this.language = language;
        this.alphanumericPattern = alphanumericWordPattern;
    }

    public OpenNLPTokenizer train(Iterable<INLPDocument> documents) {
        return new OpenNLPTokenizer(this.trainModel(documents));
    }

    private TokenizerModel trainModel(Iterable<INLPDocument> documents) {
        TokenizerFactory factory = new TokenizerFactory(this.language.getLanguageTag(), new Dictionary(), true, this.alphanumericPattern);
        try {
            return TokenizerME.train(this.getTokenSamples(documents), (TokenizerFactory)factory, (TrainingParameters)new TrainingParameters());
        }
        catch (IOException e) {
            throw new TexterraSystemException((Throwable)e);
        }
    }

    private ObjectStream<TokenSample> getTokenSamples(final Iterable<INLPDocument> documents) {
        return new ObjectStream<TokenSample>(){
            Iterator<INLPDocument> documentsIt;
            Iterator<TokenSample> samplesIt;
            {
                this.documentsIt = documents.iterator();
                this.samplesIt = null;
            }

            public void close() throws IOException {
            }

            public TokenSample read() throws IOException {
                while (this.samplesIt == null || !this.samplesIt.hasNext()) {
                    if (!this.documentsIt.hasNext()) {
                        return null;
                    }
                    INLPDocument document = this.documentsIt.next();
                    Collection samples = OpenNLPTokenizerTrainer.this.getTokenSamples(document);
                    this.samplesIt = samples.iterator();
                }
                return this.samplesIt.next();
            }

            public void reset() throws IOException, UnsupportedOperationException {
                this.documentsIt = documents.iterator();
                this.samplesIt = null;
            }
        };
    }

    private Collection<TokenSample> getTokenSamples(INLPDocument doc) {
        FastInRelationNLPDocument fastDoc = new FastInRelationNLPDocument(doc, Sentence.class, IToken.class, (IAnnotationRelationCacheBuilder)new ContainsAnnotationRelationCacheBuilder());
        List sentences = fastDoc.getAnnotations(Sentence.class);
        ArrayList<TokenSample> result = new ArrayList<TokenSample>(sentences.size());
        for (Sentence sentence : sentences) {
            List tokens = fastDoc.getInRelationAnnotations((IAnnotation)sentence, contains, IToken.class);
            result.add(this.getTokenSample(sentence.getText(), tokens, sentence.getStart()));
        }
        return result;
    }

    private TokenSample getTokenSample(String text, List<IToken> tokens, int sentenceStart) {
        Span[] tokenSpans = new Span[tokens.size()];
        for (int i = 0; i < tokens.size(); ++i) {
            IToken token = tokens.get(i);
            tokenSpans[i] = new Span(token.getStart() - sentenceStart, token.getEnd() - sentenceStart);
        }
        return new TokenSample(text, tokenSpans);
    }
}

