/*
 * Decompiled with CFR 0.152.
 */
package ru.ispras.texterra.core.nlp.annotators.pos.hierarchical;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.log4j.Logger;
import ru.ispras.ml.classification.IConfidenceClassifier;
import ru.ispras.ml.classification.IConfidenceClassifierTrainer;
import ru.ispras.ml.datamodel.IInstance;
import ru.ispras.ml.datamodel.ILabeledDataset;
import ru.ispras.ml.datamodel.InstanceFactory;
import ru.ispras.ml.datamodel.LabelledInstance;
import ru.ispras.ml.datamodel.identification.HashCodeInstanceIdGeneratingStrategy;
import ru.ispras.ml.datamodel.identification.InstanceIdGeneratingStrategy;
import ru.ispras.texterra.core.nlp.annotators.IAnnotatorTrainer;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.POSTagger;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.classifiers.POSTaggerClassifierTrainer;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.classifiers.morph.IMorphClassifier;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.classifiers.morph.MorphClassifier;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.classifiers.morph.SingleValueMorphClassifier;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.dictionary.IPOSDictionary;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.dictionary.POSDictionary;
import ru.ispras.texterra.core.nlp.annotators.pos.hierarchical.featureextractors.POSTaggerFeatureExtractorFactory;
import ru.ispras.texterra.core.nlp.datamodel.IAnnotation;
import ru.ispras.texterra.core.nlp.datamodel.INLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.IToken;
import ru.ispras.texterra.core.nlp.datamodel.NLPDocumentWithAnnotation;
import ru.ispras.texterra.core.nlp.datamodel.Token;
import ru.ispras.texterra.core.nlp.datamodel.pos.IPOSTag;
import ru.ispras.texterra.core.nlp.datamodel.pos.POSToken;
import ru.ispras.texterra.core.nlp.datamodel.pos.morph.IMorphProperties;

/**
 * Trains a {@link POSTagger} from documents that carry {@link POSToken} annotations.
 *
 * <p>Training produces four artefacts that are handed to the {@link POSTagger} constructor:
 * a POS-tag classifier, one morph classifier per POS tag seen in the corpus, the shared
 * feature {@link InstanceFactory}, and a word-to-morph-properties dictionary.
 *
 * <p>NOTE(review): several raw-typed constructions below are kept exactly as they were in the
 * decompiled source because the generic signatures of the project classes involved are not
 * visible from this file.
 */
public class POSTaggerAnnotatorTrainer
implements IAnnotatorTrainer<POSTagger> {
    private static final Logger LOGGER = Logger.getLogger(POSTaggerAnnotatorTrainer.class);
    private final InstanceFactory<NLPDocumentWithAnnotation<IToken>> instanceFactory = new InstanceFactory(Arrays.asList(new POSTaggerFeatureExtractorFactory().create()), (InstanceIdGeneratingStrategy)new HashCodeInstanceIdGeneratingStrategy());
    private final POSTaggerClassifierTrainer<IPOSTag> posClassifierTrainer = new POSTaggerClassifierTrainer();
    private final POSTaggerClassifierTrainer<IMorphProperties> posMorphClassifierTrainer = new POSTaggerClassifierTrainer();

    /**
     * Collects the distinct POS tags of all {@link POSToken} annotations in the documents.
     *
     * @param documents annotated documents to scan
     * @return the set of POS tags found
     */
    private Set<IPOSTag> getPOSTags(Iterable<INLPDocument> documents) {
        // NOTE(review): the stream is parallel, so getAnnotations() is invoked from several
        // threads at once (on different documents) - confirm the documents tolerate that.
        return StreamSupport.stream(documents.spliterator(), true)
                .flatMap(doc -> doc.getAnnotations(POSToken.class).stream())
                .map(posToken -> ((IMorphProperties)posToken.getValue()).getPOSTag())
                .collect(Collectors.toSet());
    }

    /**
     * Builds the feature instance for a single POS token within its document.
     *
     * @param doc the document containing the token
     * @param posToken the annotated token to extract features for
     * @return the instance produced by the shared instance factory
     */
    private IInstance createFeatureInstance(INLPDocument doc, POSToken posToken) {
        return this.instanceFactory.createInstance((Object)new NLPDocumentWithAnnotation(doc, (IAnnotation)new Token((IAnnotation)posToken)));
    }

    /**
     * Builds the training set for the POS-tag classifier: one labelled instance per
     * {@link POSToken}, labelled with that token's POS tag.
     *
     * @param documents annotated documents to convert
     * @return a dataset that streams the collected labelled instances
     */
    private ILabeledDataset<IPOSTag> getLabeledDatasetForTags(Iterable<INLPDocument> documents) {
        ArrayList<LabelledInstance> labeledInstances = new ArrayList<LabelledInstance>();
        for (INLPDocument doc : documents) {
            for (POSToken posToken : doc.getAnnotations(POSToken.class)) {
                IInstance featureInstance = this.createFeatureInstance(doc, posToken);
                IPOSTag curPosTag = ((IMorphProperties)posToken.getValue()).getPOSTag();
                labeledInstances.add(new LabelledInstance(featureInstance, (Object)curPosTag));
            }
        }
        // Every call on the dataset opens a fresh stream over the collected list.
        return () -> labeledInstances.stream();
    }

    /**
     * Trains the classifier that predicts the POS tag of a token.
     *
     * @param documents annotated training documents
     * @return the trained POS-tag classifier
     */
    private IConfidenceClassifier<IPOSTag> getClassifier(Iterable<INLPDocument> documents) {
        LOGGER.debug("building posTag classifier");
        IConfidenceClassifierTrainer<IPOSTag> trainer = this.posClassifierTrainer.create();
        trainer.train(this.getLabeledDatasetForTags(documents));
        return trainer.getPredictor();
    }

    /**
     * Builds the training set for the morph classifier of one POS tag: one labelled instance
     * per {@link POSToken} whose POS tag equals {@code posTag}, labelled with the token's
     * full morph properties value.
     *
     * @param posTag the POS tag whose tokens are selected
     * @param documents annotated documents to convert
     * @return a dataset that streams the collected labelled instances
     */
    private ILabeledDataset<IMorphProperties> getLabeledDatasetForMorphTags(IPOSTag posTag, Iterable<INLPDocument> documents) {
        ArrayList<LabelledInstance> labeledInstances = new ArrayList<LabelledInstance>();
        for (INLPDocument doc : documents) {
            for (POSToken posToken : doc.getAnnotations(POSToken.class)) {
                IPOSTag curPosTag = ((IMorphProperties)posToken.getValue()).getPOSTag();
                if (!curPosTag.equals(posTag)) {
                    continue;
                }
                IInstance featureInstance = this.createFeatureInstance(doc, posToken);
                labeledInstances.add(new LabelledInstance(featureInstance, posToken.getValue()));
            }
        }
        // Every call on the dataset opens a fresh stream over the collected list.
        return () -> labeledInstances.stream();
    }

    /**
     * Builds the morph classifier for one POS tag. When the tag occurs with exactly one
     * morph-properties value a {@link SingleValueMorphClassifier} is returned and nothing is
     * trained; otherwise a {@link MorphClassifier} backed by a trained classifier is returned.
     *
     * @param posTag the POS tag to build the classifier for
     * @param documents annotated training documents
     * @return the morph classifier for {@code posTag}
     */
    private IMorphClassifier getMorphClassifier(IPOSTag posTag, Iterable<INLPDocument> documents) {
        LOGGER.debug("building morph classifier for tag : " + posTag);
        // Build the dataset once and use it both for label discovery and for training; the
        // corpus pass and feature extraction were previously performed twice per tag.
        ILabeledDataset<IMorphProperties> dataset = this.getLabeledDatasetForMorphTags(posTag, documents);
        Set<IMorphProperties> posMorphTags = dataset.getLabelsSet();
        if (posMorphTags.size() == 1) {
            SingleValueMorphClassifier res = new SingleValueMorphClassifier(posTag, posMorphTags.iterator().next());
            LOGGER.debug("builded singleValue morph classifier for tag : " + res.getPOSTag() + " with morph tag " + res.getPOSMorphTags());
            return res;
        }
        IConfidenceClassifierTrainer<IMorphProperties> trainer = this.posMorphClassifierTrainer.create();
        trainer.train(dataset);
        MorphClassifier res = new MorphClassifier(posTag, (IConfidenceClassifier<IMorphProperties>)trainer.getPredictor(), posMorphTags);
        LOGGER.debug("builded multipleValue morph classifier for tag : " + res.getPOSTag() + " with morph tag " + res.getPOSMorphTags());
        return res;
    }

    /**
     * Builds the dictionary that maps each lower-cased token text to the morph properties it
     * was annotated with.
     *
     * @param documents annotated training documents
     * @return the built dictionary
     */
    private IPOSDictionary getDict(Iterable<INLPDocument> documents) {
        POSDictionary.Builder dictBuilder = new POSDictionary.Builder();
        for (INLPDocument doc : documents) {
            for (POSToken posToken : doc.getAnnotations(POSToken.class)) {
                // NOTE(review): toLowerCase() uses the JVM default locale; whatever looks words
                // up in this dictionary must lower-case the same way - confirm before changing.
                String word = posToken.getText().toLowerCase();
                IMorphProperties postag = (IMorphProperties)posToken.getValue();
                dictBuilder.add(word, postag);
            }
        }
        return dictBuilder.build();
    }

    /**
     * Trains a {@link POSTagger} on the given documents.
     *
     * <p>The documents are first copied into a list because they are traversed several times:
     * once for the dictionary, once for tag discovery, once for the POS-tag classifier and once
     * per detected POS tag for the morph classifiers.
     *
     * @param documents documents carrying {@link POSToken} annotations
     * @return the trained tagger
     */
    public POSTagger train(Iterable<INLPDocument> documents) {
        LOGGER.debug("reading docs");
        List<INLPDocument> docs = StreamSupport.stream(documents.spliterator(), false).collect(Collectors.toList());
        LOGGER.debug("end reading docs");
        LOGGER.debug("building posDictionary");
        IPOSDictionary posDictionary = this.getDict(docs);
        LOGGER.debug("end building posDictionary");
        Set<IPOSTag> posTags = this.getPOSTags(docs);
        // Sorting and joining the tags is only worth doing when the message will be emitted.
        if (LOGGER.isDebugEnabled()) {
            LOGGER.debug("detected postags : " + posTags.stream().map(IPOSTag::getTag).sorted().collect(Collectors.joining(" ")));
        }
        IConfidenceClassifier<IPOSTag> posTokenClassifier = this.getClassifier(docs);
        Map<IPOSTag, IMorphClassifier> morphClassifiers = posTags.stream()
                .map(posTag -> this.getMorphClassifier(posTag, docs))
                .collect(Collectors.toMap(morphClassifier -> morphClassifier.getPOSTag(), morphClassifier -> morphClassifier));
        return new POSTagger(posTokenClassifier, morphClassifiers, this.instanceFactory, posDictionary);
    }
}

