/*
 * Decompiled with CFR 0.152.
 */
package ru.ispras.texterra.core.nlp.annotators;

import java.util.Collection;
import java.util.LinkedList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import ru.ispras.texterra.core.nlp.annotators.ISerializableAnnotator;
import ru.ispras.texterra.core.nlp.datamodel.INLPDocument;
import ru.ispras.texterra.core.nlp.datamodel.IToken;
import ru.ispras.texterra.core.nlp.datamodel.URL;

/**
 * Annotator that emits a {@link URL} annotation for every {@link IToken} of a
 * document whose text looks like a URL.
 *
 * <p>A token is accepted when its whole text matches {@link #urlPattern} and
 * either an explicit scheme ({@code http}, {@code https} or {@code ftp}) is
 * present, or a path part is present together with a dotted domain name or an
 * explicit port.
 */
public class URLAnnotator
implements ISerializableAnnotator<URL> {
    private static final long serialVersionUID = 4580962085750950805L;

    /**
     * Pattern for a complete URL. It exposes the named groups {@code scheme},
     * {@code domain}, {@code port}, {@code path}, {@code query} and
     * {@code fragment}; all of them except {@code domain} are optional.
     */
    public static final Pattern urlPattern = URLAnnotator.getURLPattern();

    /** Pattern for a domain name: two or more non-empty parts separated by dots. */
    public static final Pattern domainNamePattern = Pattern.compile("([^.]+\\.)+([^.]+)");

    /**
     * Collects a {@link URL} annotation for each token of the document that is
     * recognised as a URL.
     *
     * @param doc document whose {@link IToken} annotations are inspected
     * @return the URL annotations, in the iteration order of the tokens
     */
    @Override
    public Collection<URL> annotate(INLPDocument doc) {
        LinkedList<URL> res = new LinkedList<URL>();
        for (IToken token : doc.getAnnotations(IToken.class)) {
            if (this.isURL((String)token.getValue())) {
                res.add(new URL(token));
            }
        }
        return res;
    }

    /**
     * Decides whether the given text should be treated as a URL.
     *
     * @param text token text; may be {@code null}
     * @return {@code true} if the text is accepted as a URL, {@code false}
     *         otherwise (including for {@code null})
     */
    private boolean isURL(String text) {
        // A token without a value cannot be a URL; skip it instead of failing
        // the whole annotation pass with a NullPointerException.
        if (text == null) {
            return false;
        }
        Matcher matcher = urlPattern.matcher(text);
        if (!matcher.matches()) {
            return false;
        }
        // An explicit scheme is sufficient evidence on its own.
        if (matcher.group("scheme") != null) {
            return true;
        }
        // Without a scheme a path part is required. The group is non-null
        // (possibly empty) whenever a '/' follows the domain or port.
        if (matcher.group("path") == null) {
            return false;
        }
        // ...and the host part must look like one: dotted name or explicit port.
        return this.validDomainName(matcher.group("domain")) || matcher.group("port") != null;
    }

    /**
     * Checks a candidate domain against {@link #domainNamePattern}.
     *
     * @param domain candidate domain name; may be {@code null}
     * @return {@code true} if {@code domain} is non-null and matches the pattern
     */
    private boolean validDomainName(String domain) {
        if (domain == null) {
            return false;
        }
        return domainNamePattern.matcher(domain).matches();
    }

    /**
     * Assembles the URL regular expression from its parts and compiles it
     * case-insensitively.
     *
     * @return the compiled URL pattern
     */
    private static Pattern getURLPattern() {
        String schemeNamePattern = "(?<scheme>(?:(https?)|(ftp)))";
        // The "//" after the colon is optional.
        String schemePattern = schemeNamePattern + ":(?://)?";
        String domainPattern = "(?<domain>[^?#/:]+)";
        String portPattern = ":(?<port>\\d+)";
        String pathPattern = "(?<path>[^?#&]*)";
        String hierarchicalPartPattern = domainPattern + "(" + portPattern + ")?(/" + pathPattern + ")?";
        String queryStringPattern = "(?<query>[^:?#]+)";
        String queryPattern = "(\\?" + queryStringPattern + ")?";
        String fragmentIdPattern = "(?<fragment>[^#]+)";
        String fragmentPattern = "(#" + fragmentIdPattern + ")?";
        String urlPattern = "(" + schemePattern + ")?" + hierarchicalPartPattern + queryPattern + fragmentPattern;
        // Same flag value as the former literal 66 (2 | 64), now spelled out.
        return Pattern.compile(urlPattern, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }
}

