/*
 * Decompiled with CFR 0.152.
 */
package org.sing_group.seda.core.ncbi;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.List;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.sing_group.seda.core.ncbi.NcbiAssemblyAccession;
import org.sing_group.seda.core.ncbi.NcbiTaxonomyResolver;

/**
 * Resolves NCBI assembly accessions (strings starting with {@code GCA_} or {@code GCF_}) found
 * inside arbitrary names into {@link NcbiAssemblyAccession} objects.
 *
 * <p>Resolution downloads the assembly page for the accession from {@link #NCBI_URL} and reads
 * the {@code pageData.taxId} and {@code pageData.speciesName} assignments embedded in the page's
 * {@code <script>} elements. Every successful pattern match therefore triggers a network request.
 */
public class NcbiAssemblyAccesionResolver {
    /** Base URL of the NCBI web site; all request URLs are built from it. */
    public static final String NCBI_URL = "https://www.ncbi.nlm.nih.gov";

    /**
     * Accession as it may appear inside a name: {@code GCA_}/{@code GCF_}, at least one digit, a
     * literal dot and the (possibly empty) version digits. Requiring an identifier digit avoids
     * issuing requests for degenerate matches such as {@code "GCA_."}.
     */
    private static final Pattern ACCESSION_PATTERN = Pattern.compile("GC[AF]_[0-9]+\\.[0-9]*");

    /** Matches the {@code pageData.taxId = "<digits>"} assignment; group 1 is the taxonomy id. */
    private static final Pattern TAX_ID_PATTERN = Pattern.compile("pageData\\.taxId\\s*=\\s*\"(\\d+)\"");

    /** Matches the {@code pageData.speciesName = "<text>"} assignment; group 1 is the name. */
    private static final Pattern SPECIES_NAME_PATTERN = Pattern.compile("pageData\\.speciesName\\s*=\\s*\"([^\"]+)\"");

    /** Timeout used by the no-argument constructor, in milliseconds. */
    private static final int DEFAULT_TIMEOUT_MILLIS = 10000;

    /** Timeout, in milliseconds, handed to jsoup when fetching the assembly page. */
    private final int timeoutMillis;

    /**
     * Creates a resolver with the default timeout of 10000 milliseconds.
     */
    public NcbiAssemblyAccesionResolver() {
        this(DEFAULT_TIMEOUT_MILLIS);
    }

    /**
     * Creates a resolver with a custom timeout.
     *
     * @param timeoutMillis timeout in milliseconds passed to jsoup for each page fetch
     */
    public NcbiAssemblyAccesionResolver(int timeoutMillis) {
        this.timeoutMillis = timeoutMillis;
    }

    /**
     * Resolves every given name, keeping only those for which an accession could be resolved.
     *
     * @param names names that may contain an assembly accession; a {@code null} array or
     *        {@code null} entries are treated as unresolvable
     * @return the resolved accessions, in the order of the names that produced them; never
     *         {@code null}, possibly empty
     */
    public List<NcbiAssemblyAccession> resolve(String ... names) {
        List<NcbiAssemblyAccession> toret = new LinkedList<>();
        if (names == null) {
            return toret;
        }
        for (String name : names) {
            this.resolve(name).ifPresent(toret::add);
        }
        return toret;
    }

    /**
     * Looks for the first assembly accession inside {@code name} and resolves it against NCBI.
     *
     * @param name a name that may contain an assembly accession; may be {@code null}
     * @return the resolved accession, or an empty {@code Optional} when {@code name} is
     *         {@code null}, contains no accession, or the accession could not be resolved
     */
    public Optional<NcbiAssemblyAccession> resolve(String name) {
        if (name == null) {
            return Optional.empty();
        }
        Matcher matcher = ACCESSION_PATTERN.matcher(name);
        if (!matcher.find()) {
            return Optional.empty();
        }
        return this.resolveAccession(name, matcher.group(0));
    }

    /**
     * Fetches the assembly page of {@code accession} and extracts the taxonomy id and species
     * name from the first {@code <script>} element that declares both.
     *
     * @param name the original name the accession was extracted from (currently unused)
     * @param accession the accession text matched inside {@code name}
     * @return the resolved accession, or an empty {@code Optional} if the page could not be
     *         fetched or does not contain both values
     */
    private Optional<NcbiAssemblyAccession> resolveAccession(String name, String accession) {
        try {
            Document doc = Jsoup.parse(NcbiAssemblyAccesionResolver.assemblyUrl(accession), this.timeoutMillis);
            for (Element script : doc.select("script")) {
                String scriptContent = script.html();
                // Cheap substring test first so the regexes only run on candidate scripts.
                if (!scriptContent.contains("pageData")) {
                    continue;
                }
                Matcher taxIdMatcher = TAX_ID_PATTERN.matcher(scriptContent);
                Matcher speciesNameMatcher = SPECIES_NAME_PATTERN.matcher(scriptContent);
                // Both values must come from the same script; otherwise keep looking.
                if (!taxIdMatcher.find() || !speciesNameMatcher.find()) {
                    continue;
                }
                String taxId = taxIdMatcher.group(1);
                String speciesName = speciesNameMatcher.group(1);
                return Optional.of(new NcbiAssemblyAccession(accession, speciesName, NcbiAssemblyAccesionResolver.taxonomyUrl(taxId)));
            }
        }
        catch (IOException e) {
            // Best effort: a failed fetch (or malformed URL, which is also an IOException) is
            // reported on stderr and the accession is treated as unresolved.
            e.printStackTrace();
        }
        return Optional.empty();
    }

    /**
     * Builds the URL of the NCBI assembly page for an accession.
     *
     * @param accession the assembly accession appended to the {@code /assembly/} path
     * @return the assembly page URL
     * @throws MalformedURLException if the resulting string is not a valid URL
     */
    protected static URL assemblyUrl(String accession) throws MalformedURLException {
        return new URL(NCBI_URL + "/assembly/" + accession);
    }

    /**
     * Builds the URL of the NCBI taxonomy browser page for a taxonomy id.
     *
     * @param path the taxonomy id placed in the {@code id} query parameter
     * @return the taxonomy browser URL
     * @throws MalformedURLException if the resulting string is not a valid URL
     */
    private static URL taxonomyUrl(String path) throws MalformedURLException {
        return new URL(NCBI_URL + "/Taxonomy/Browser/wwwtax.cgi?id=" + path + "&mode=info");
    }

    /**
     * Ad-hoc manual check: resolves a fixed taxonomy URL with {@link NcbiTaxonomyResolver} and
     * prints the resulting values to standard output.
     *
     * @param args ignored
     * @throws MalformedURLException if the hard-coded URL is rejected
     */
    public static void main(String[] args) throws MalformedURLException {
        System.out.println(new NcbiTaxonomyResolver().resolve(new URL("https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=94439&mode=info")).get().getValues());
    }
}

