package be.ac.vub.bsb.parsers.ncbi;

import be.ac.ulb.bigre.pathwayinference.core.core.PathwayinferenceConstants;
import be.ac.ulb.bigre.pathwayinference.core.io.IOTools;
import be.ac.ulb.bigre.pathwayinference.core.util.DiverseTools;
import cern.colt.matrix.impl.AbstractFormatter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.math3.geometry.VectorFormat;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.util.ParserException;

/* loaded from: input_file:be/ac/vub/bsb/parsers/ncbi/NCBIDBOnlineQueries.class */
/**
 * Static helpers that query NCBI (and, for one method, DDBJ) web pages and scrape the plain-text
 * rendering returned by {@link StringExtractor}.
 *
 * <p>Every query method performs network I/O. A {@link ParserException} raised while fetching a
 * page is printed and the method returns whatever it had collected up to that point (usually an
 * empty result); it is never propagated to the caller.
 *
 * <p>NOTE(review): this file is decompiler output. String literals such as {@code " "} and
 * {@code "; "} were replaced by equally-valued constants of unrelated libraries
 * ({@code AbstractFormatter.DEFAULT_COLUMN_SEPARATOR}, {@code VectorFormat.DEFAULT_SEPARATOR},
 * {@code PathwayinferenceConstants.*}). They are kept as-is to preserve behaviour exactly.
 *
 * <p>NOTE(review): all endpoints are plain {@code http://} URLs and the parsing relies on the
 * page layout at the time the code was written - verify both before relying on the results.
 */
public class NCBIDBOnlineQueries {
    public static String PROTEIN_DB = "protein";
    public static String NUCLEOTIDE_DB = "nucleotide";
    public static String ALTERNATE_GENOMIC_PREFIX = "AC_";
    public static String COMPLETE_GENOMIC_PREFIX = "NC_";
    public static String INCOMPLETE_GENOMIC_REGION_PREFIX = "NG_";
    public static String INTERMEDIATE_GENOMIC_ASSEMBLY_PREFIX = "NT_";
    public static String INTERMEDIATE_GENOMIC_ASSEMBLY2_PREFIX = "NW_";
    public static String WHOLE_GENOME_SHOTGUN_PREFIX = "NZ_";
    public static String NON_REAL_GENOMIC_ASSEMBLY = "NS_";
    public static String GENOME_COMPLETED = "complete";
    public static String GENOME_INCOMPLETE = "incomplete";
    public static String DRAFT = "DRAFT";
    public static String PLASMID = "plasmid";
    public static String MITOCHONDRION = "mitochondrion";
    public static String SRA_SAMPLE_PROTECTED = "universal unknown sample for protected datasets";
    public static String SRA_SAMPLE_DEFAULT = "Default sample for sample pools from HMP pilot.";

    /** Taxon name mapped to a hard-coded lineage; filled by {@link #fillAmbiguousTaxonVsLineageMap()}. */
    public static Map<String, String> AMBIGUOUS_TAXON_NAMES_VS_LINEAGES = new HashMap<String, String>();

    /**
     * Populates {@link #AMBIGUOUS_TAXON_NAMES_VS_LINEAGES} with the hard-coded name-to-lineage
     * entries. Calling it repeatedly is harmless: the same keys are simply overwritten with the
     * same values.
     */
    public static void fillAmbiguousTaxonVsLineageMap() {
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Bacillus", "cellular organisms; Bacteria; Firmicutes; Bacilli; Bacillales; Bacillaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Bosea", "cellular organisms; Bacteria; Proteobacteria; Alphaproteobacteria; Rhizobiales; Bradyrhizobiaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Buchnera", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; Enterobacteriaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Coxiella", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Legionellales; Coxiellaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Edwardsiella", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; Enterobacteriaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Gordonia", "cellular organisms; Bacteria; Actinobacteria; Actinobacteria; Actinobacteridae; Actinomycetales; Corynebacterineae; Gordoniaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Lamprocystis", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Chromatiales; Chromatiaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Leptonema", "cellular organisms; Bacteria; Spirochaetes; Spirochaetia; Spirochaetales; Leptospiraceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Moorea", "cellular organisms; Bacteria; Cyanobacteria; Oscillatoriophycideae; Oscillatoriales");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Moraxella", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Pseudomonadales; Moraxellaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Morganella", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; Enterobacteriaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Paracoccus", "cellular organisms; Bacteria; Proteobacteria; Alphaproteobacteria; Rhodobacterales; Rhodobacteraceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Planococcus", "cellular organisms; Bacteria; Firmicutes; Bacilli; Bacillales; Planococcaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Proteus", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; Enterobacteriaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Rhodobium", "cellular organisms; Bacteria; Proteobacteria; Alphaproteobacteria; Rhizobiales; Rhodobiaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Rothia", "cellular organisms; Bacteria; Actinobacteria; Actinobacteria; Actinobacteridae; Actinomycetales; Micrococcineae; Micrococcaceae");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Spirulina", "cellular organisms; Bacteria; Cyanobacteria; Oscillatoriophycideae; Oscillatoriales");
        AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.put("Yersinia", "cellular organisms; Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales; Enterobacteriaceae");
    }

    /**
     * Scrapes the NCBI genome search result page for the given taxon id and collects, per taxon
     * name found on the page, the summed sequence length, a completeness status and a flag that
     * is set for entries mentioning a plasmid or a mitochondrion organelle.
     *
     * <p>A record is taken to start on the line after one beginning with {@code "Links"} and to
     * end on a line beginning with {@code "Created"}. The first line of a record supplies the
     * taxon name (all tokens before the first one containing "chromosome", "segment",
     * "sequence", "complete" or "whole").
     *
     * @param i NCBI taxonomy identifier used in the {@code txid} search term
     * @return a result carrying the id and, unless fetching the page failed, the three maps
     */
    public static NCBITaxonDBQueryResult getNCBITaxonGenomeSize(int i) {
        NCBITaxonDBQueryResult result = new NCBITaxonDBQueryResult();
        result.setNcbiId(i);
        // Declared as HashMap (not Map) so the calls below compile whichever of the two the
        // setters of NCBITaxonDBQueryResult expect - their signatures are not visible here.
        HashMap<String, Integer> lengthByTaxon = new HashMap<String, Integer>();
        HashMap<String, String> statusByTaxon = new HashMap<String, String>();
        HashMap<String, Boolean> nonMainReplicatorByTaxon = new HashMap<String, Boolean>();
        boolean insideRecord = false;
        boolean nameParsed = false;
        String taxonName = "";
        int length = 0;
        String url = "http://www.ncbi.nlm.nih.gov/sites/entrez?db=genome&cmd=Search&dopt=DocSum&term=txid" + i + "[Organism%3Aexp]";
        try {
            for (String line : new StringExtractor(url).extractStrings(false).split("\n")) {
                if (insideRecord && !nameParsed) {
                    taxonName = extractTaxonName(line);
                    // "DRAFT" overrides "complete": a draft genome is always reported incomplete.
                    boolean completed = line.contains(GENOME_COMPLETED) && !line.contains(DRAFT);
                    statusByTaxon.put(taxonName, completed ? GENOME_COMPLETED : GENOME_INCOMPLETE);
                    nonMainReplicatorByTaxon.put(taxonName, Boolean.valueOf(line.contains(PLASMID)));
                    nameParsed = true;
                }
                if (insideRecord && line.contains("Length: ")) {
                    length = parseLength(line, length);
                }
                if (insideRecord && line.contains("Organelle: ")) {
                    String[] organelleParts = line.split("Organelle: ");
                    if (organelleParts.length > 1 && organelleParts[1].trim().equalsIgnoreCase(MITOCHONDRION)) {
                        nonMainReplicatorByTaxon.put(taxonName, Boolean.TRUE);
                    }
                }
                if (line.startsWith("Links")) {
                    insideRecord = true;
                }
                if (line.startsWith("Created")) {
                    if (!taxonName.isEmpty()) {
                        System.out.println("Placing taxon " + taxonName + " with genome size: " + length);
                        Integer previous = lengthByTaxon.get(taxonName);
                        int total = previous == null ? length : previous.intValue() + length;
                        lengthByTaxon.put(taxonName, Integer.valueOf(total));
                    }
                    insideRecord = false;
                    nameParsed = false;
                    // Reset so a following record without a "Length:" line does not inherit
                    // (and add) the size of this one.
                    length = 0;
                }
            }
            result.setTaxonNameVsGenomeStatus(statusByTaxon);
            result.setTaxonNameVsLengths(lengthByTaxon);
            result.setTaxonNameVsIsNonMainReplicator(nonMainReplicatorByTaxon);
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return result;
    }

    /**
     * Builds the taxon name from the first line of a genome record: joins the leading tokens up
     * to (excluding) the first token that contains one of the stop words, then strips one
     * leading separator and one trailing comma.
     */
    private static String extractTaxonName(String line) {
        String separator = AbstractFormatter.DEFAULT_COLUMN_SEPARATOR;
        StringBuilder joined = new StringBuilder();
        for (String token : line.split(separator)) {
            if (token.contains("chromosome") || token.contains("segment") || token.contains("sequence") || token.contains("complete") || token.contains("whole")) {
                break;
            }
            joined.append(separator).append(token);
        }
        String name = joined.toString();
        if (name.startsWith(separator)) {
            name = name.replaceFirst(separator, "");
        }
        if (name.endsWith(",")) {
            name = name.substring(0, name.length() - 1);
        }
        return name;
    }

    /**
     * Parses the number following {@code "Length: "} (thousands separators and an "nt" unit are
     * removed). Returns {@code fallback} when nothing follows the marker or the remainder is not
     * an integer, instead of aborting the whole page scan.
     */
    private static int parseLength(String line, int fallback) {
        String[] parts = line.split("Length: ");
        if (parts.length < 2) {
            return fallback;
        }
        String digits = parts[1].replace("nt", "").trim().replace(",", "");
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            System.err.println("Could not parse genome length from line: " + line);
            return fallback;
        }
    }

    /**
     * Searches the NCBI protein database for the given accession numbers and pairs each
     * accession found on the result page with the GI printed after it.
     *
     * <p>The page is read as a line stream: the line following one that contains
     * {@code "Accession:"} is taken as the accession, the line following one that contains
     * {@code "GI:"} as its GI.
     *
     * @param list accession numbers to look up; {@code null} or empty yields an empty map
     *     without contacting the server
     * @return accession number mapped to GI (both trimmed); empty when nothing could be parsed
     */
    public static Map<String, String> getGIForAccesionNumber(List<String> list) {
        Map<String, String> giByAccession = new HashMap<String, String>();
        if (list == null || list.isEmpty()) {
            return giByAccession;
        }
        StringBuilder url = new StringBuilder("http://www.ncbi.nlm.nih.gov/sites/entrez?db=protein&cmd=search&term=");
        url.append(list.get(0));
        for (int idx = 1; idx < list.size(); idx++) {
            url.append('+').append(list.get(idx));
        }
        boolean accessionOnThisLine = false;
        boolean giOnThisLine = false;
        String accession = "";
        try {
            for (String line : new StringExtractor(url.toString()).extractStrings(false).split("\n")) {
                // Consume the values announced by the previous line before looking for new markers.
                if (giOnThisLine) {
                    giByAccession.put(accession, line.trim());
                    giOnThisLine = false;
                }
                if (accessionOnThisLine) {
                    accession = line.trim();
                    accessionOnThisLine = false;
                }
                if (line.contains("Accession:")) {
                    accessionOnThisLine = true;
                }
                if (line.contains("GI:")) {
                    giOnThisLine = true;
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return giByAccession;
    }

    /**
     * Searches the NCBI genome database for an organism name and returns text starting with a
     * RefSeq-style prefix ({@code AC_}, {@code NC_}, {@code NG_}, {@code NT_}, {@code NW_},
     * {@code NZ_}, {@code NS_}) taken from the result page.
     *
     * <p>Lines are scanned top to bottom; every line containing a prefix overwrites the current
     * candidate with the prefix plus the text that follows its first occurrence (up to the next
     * occurrence of the same prefix). Scanning stops at the first line that contains the
     * organism name, does not contain {@code "Genome Results"} and does not contain
     * {@code "("}.
     *
     * @param str organism name; occurrences of
     *     {@code PathwayinferenceConstants.REACTION_SUBREACTION_JOINER} and of the column
     *     separator are replaced by {@code "+"} to build the search term
     * @return the last candidate seen before scanning stopped, or {@code ""} if there was none
     */
    public static String getRefSeqGenomeAccessionNumberGivenOrganismName(String str) {
        String accession = "";
        String searchTerm = str.replace(PathwayinferenceConstants.REACTION_SUBREACTION_JOINER, "+")
                .replace(AbstractFormatter.DEFAULT_COLUMN_SEPARATOR, "+");
        String url = "http://www.ncbi.nlm.nih.gov/sites/entrez?db=genome&cmd=search&term=" + searchTerm;
        // The name as it is expected to appear on the page: "+" turned back into the separator.
        String displayedName = searchTerm.replace("+", AbstractFormatter.DEFAULT_COLUMN_SEPARATOR);
        // Priority order: only the first prefix present on a line is considered for that line.
        String[] prefixes = {
            ALTERNATE_GENOMIC_PREFIX,
            COMPLETE_GENOMIC_PREFIX,
            INCOMPLETE_GENOMIC_REGION_PREFIX,
            INTERMEDIATE_GENOMIC_ASSEMBLY_PREFIX,
            INTERMEDIATE_GENOMIC_ASSEMBLY2_PREFIX,
            WHOLE_GENOME_SHOTGUN_PREFIX,
            NON_REAL_GENOMIC_ASSEMBLY
        };
        try {
            for (String line : new StringExtractor(url).extractStrings(false).split("\n")) {
                for (String prefix : prefixes) {
                    if (line.contains(prefix)) {
                        String[] parts = line.split(prefix);
                        // Nothing after the prefix (line ends with it): keep the previous candidate.
                        if (parts.length > 1) {
                            accession = prefix + parts[1];
                        }
                        break;
                    }
                }
                if (line.contains(displayedName) && !line.contains("Genome Results") && !line.contains("(")) {
                    break;
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return accession;
    }

    /**
     * Returns the lineage of an organism given its name.
     *
     * <p>Names listed in {@link #AMBIGUOUS_TAXON_NAMES_VS_LINEAGES} are answered from that map
     * without any network access. For all other names the NCBI Taxonomy browser page is fetched
     * and the text following {@code "root;"} on the first line that contains {@code "Lineage"}
     * and {@code "root;"} is returned, trimmed.
     *
     * <p>This body was rebuilt from the bytecode listing the decompiler emitted in place of
     * source code; the previous body unconditionally threw
     * {@code UnsupportedOperationException}.
     *
     * @param r4 organism (taxon) name
     * @return the lineage, or {@code ""} if none was found or the page could not be fetched
     */
    public static java.lang.String getOrganismLineageGivenName(java.lang.String r4) {
        fillAmbiguousTaxonVsLineageMap();
        String lineage = "";
        if (AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.containsKey(r4)) {
            return AMBIGUOUS_TAXON_NAMES_VS_LINEAGES.get(r4);
        }
        String url = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?name=" + r4;
        StringExtractor extractor = new StringExtractor(url);
        try {
            for (String line : extractor.extractStrings(false).split("\n")) {
                if (line.contains("Lineage")) {
                    String[] parts = line.split("root;");
                    // A "Lineage" line without text after "root;" is skipped rather than
                    // failing with an index error.
                    if (parts.length > 1) {
                        lineage = parts[1].trim();
                        break;
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return lineage;
    }

    /**
     * Fetches the flat-file entry for a nucleotide accession number from DDBJ getentry and
     * derives a lineage string from the text between {@code "ORGANISM"} and
     * {@code ". REFERENCE"}.
     *
     * <p>If that text mentions one of the domains Archaea, Bacteria or Eukarya (checked in that
     * order) it is rearranged so the domain comes first and whatever preceded the domain
     * (presumably the organism name - confirm against a live entry) is appended at the end; see
     * {@link #moveLeadingTextBehindLineage(String, String)}. Text starting with
     * {@code "uncultured organism"} and naming no domain yields {@code "NA"}.
     *
     * @param str nucleotide accession number
     * @return the derived lineage, or {@code ""} if the entry has no ORGANISM section or could
     *     not be fetched
     */
    public static String getOrganismLineageGivenAccessionNumber(String str) {
        String lineage = "";
        String url = "http://getentry.ddbj.nig.ac.jp/getentry/na/" + str + "/?filetype=txt";
        try {
            String entry = new StringExtractor(url).extractStrings(false);
            String[] afterOrganism = entry.split("ORGANISM");
            if (entry.contains("ORGANISM") && afterOrganism.length > 1) {
                String[] beforeReference = afterOrganism[1].trim().split("\\. REFERENCE");
                lineage = beforeReference.length > 0 ? beforeReference[0].trim() : "";
                String[] domains = {"Archaea", "Bacteria", "Eukarya"};
                boolean domainFound = false;
                for (String domain : domains) {
                    if (lineage.contains(domain)) {
                        lineage = moveLeadingTextBehindLineage(lineage, domain);
                        domainFound = true;
                        break;
                    }
                }
                if (!domainFound && lineage.startsWith("uncultured organism")) {
                    lineage = "NA";
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return lineage;
    }

    /**
     * Rearranges {@code "<leading> <domain>; <rest>"} into {@code "<domain>; <rest>; <leading>"}.
     * When the text contains no {@code "; "} at all, just the domain name is returned. When the
     * text cannot be split on {@code "<domain>; "} it is returned unchanged.
     */
    private static String moveLeadingTextBehindLineage(String text, String domain) {
        if (!text.contains(VectorFormat.DEFAULT_SEPARATOR)) {
            return domain;
        }
        String marker = domain + "; ";
        String[] parts = text.split(marker);
        if (parts.length < 2) {
            return text;
        }
        return marker + parts[1] + VectorFormat.DEFAULT_SEPARATOR + parts[0].trim();
    }

    /**
     * Resolves organism names through the NCBI E-utilities {@code esearch} endpoint of the
     * taxonomy database.
     *
     * <p>The i-th usable response line is paired with the i-th name of {@code list}; the value
     * stored is the second separator-delimited token of that line. Lines without a second token
     * are skipped, and lines beyond the number of names are ignored. Every response line is
     * echoed to standard output.
     *
     * @param list organism names; {@code null} or empty yields an empty map without contacting
     *     the server
     * @return name mapped to the token read for it; may contain fewer entries than names, in
     *     which case a message is written to standard error
     */
    public static Map<String, String> getOrganismIdentifiersUsingEUtilities(List<String> list) {
        Map<String, String> idByName = new HashMap<String, String>();
        if (list == null || list.isEmpty()) {
            return idByName;
        }
        int resolved = 0;
        String url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=taxonomy&retmax=" + (list.size() + 1) + "&term=" + DiverseTools.listToString(list, ",");
        try {
            for (String line : new StringExtractor(url).extractStrings(false).split("\n")) {
                System.out.println(line);
                if (resolved >= list.size()) {
                    // More response lines than names: nothing left to pair them with.
                    break;
                }
                String[] tokens = line.split(AbstractFormatter.DEFAULT_COLUMN_SEPARATOR);
                if (tokens.length < 2) {
                    continue;
                }
                idByName.put(list.get(resolved), tokens[1]);
                resolved++;
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        if (resolved < list.size()) {
            System.err.println("Could not obtain unique identifiers for all organism names!");
        }
        return idByName;
    }

    /**
     * Fetches the NCBI protein search page for a taxon id and echoes it line by line to standard
     * output.
     *
     * <p>NOTE(review): the page is not parsed - the returned set is always empty. This looks
     * like an unfinished implementation.
     *
     * @param str NCBI taxonomy identifier used in the {@code txid} search term
     * @return an empty, mutable set
     */
    public static Set<String> getProteinIdsGivenOrganism(String str) {
        Set<String> proteinIds = new HashSet<String>();
        String url = "http://www.ncbi.nlm.nih.gov/protein/?term=txid" + str + "[Organism:exp]";
        try {
            for (String line : new StringExtractor(url).extractStrings(false).split("\n")) {
                System.out.println(line);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return proteinIds;
    }

    /**
     * Downloads FASTA records for a list of identifiers via NCBI E-utilities.
     *
     * <p>Two requests are made: {@code esearch} to translate the identifiers (all tokens after
     * the first on each response line are collected), then {@code efetch} with
     * {@code rettype=fasta}. The fetched text is split on
     * {@code PathwayinferenceConstants.DIRECT_REACTION} (presumably the FASTA header marker
     * {@code ">"} - confirm) and each record is split into header and sequence: at the first
     * {@code "] "} for the protein database, at the first {@code "sequence "} for the nucleotide
     * database. Records that cannot be split this way are reported on standard error and
     * skipped. A record is stored under every requested identifier that occurs in it after
     * {@code "|gb|"}, {@code "gi|"}, {@code "emb|"} or {@code "ref|"}.
     *
     * @param str database name; {@code null} or empty selects {@link #PROTEIN_DB}
     * @param list identifiers to fetch; {@code null} or empty yields an empty map without
     *     contacting the server
     * @return identifier mapped to its FASTA record (header line, newline, sequence with column
     *     separators removed)
     */
    public static Map<String, String> getSequenceUsingEUtilities(String str, List<String> list) {
        String database = (str == null || str.isEmpty()) ? PROTEIN_DB : str;
        System.out.println("Fetching info for identifiers: " + list + " from database " + database);
        Map<String, String> fastaById = new HashMap<String, String>();
        if (list == null || list.isEmpty()) {
            return fastaById;
        }
        String separator = AbstractFormatter.DEFAULT_COLUMN_SEPARATOR;

        // Step 1: esearch - gather the ids reported for the query as a comma-separated list.
        StringBuilder collectedIds = new StringBuilder();
        String searchUrl = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=" + database + "&retmax=" + (list.size() + 1) + "&term=" + DiverseTools.listToString(list, ",");
        try {
            for (String line : new StringExtractor(searchUrl).extractStrings(false).split("\n")) {
                System.out.println(line);
                String[] tokens = line.split(separator);
                for (int t = 1; t < tokens.length; t++) {
                    collectedIds.append(',').append(tokens[t]);
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        String idList = collectedIds.toString();
        if (idList.startsWith(",")) {
            idList = idList.substring(1);
        }

        // Step 2: efetch - everything before the first record marker is discarded.
        List<String> records = new ArrayList<String>();
        String fetchUrl = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=" + database + "&id=" + idList + "&rettype=fasta&retmode=text";
        try {
            String[] chunks = new StringExtractor(fetchUrl).extractStrings(false).split(PathwayinferenceConstants.DIRECT_REACTION);
            for (int c = 1; c < chunks.length; c++) {
                records.add(chunks[c]);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }

        // Step 3: split each record into header and sequence and match it to the requested ids.
        for (String record : records) {
            String header = "";
            String sequence = record;
            if (database.equals(PROTEIN_DB)) {
                String[] parts = record.split("] ");
                if (parts.length < 2) {
                    System.err.println("Skipping protein record without a \"] \" header delimiter: " + record);
                    continue;
                }
                header = parts[0] + "]";
                sequence = parts[1].trim();
            } else if (database.equals(NUCLEOTIDE_DB)) {
                String[] parts = record.split("sequence ");
                if (parts.length < 2) {
                    System.err.println("Skipping nucleotide record without a \"sequence \" header delimiter: " + record);
                    continue;
                }
                header = parts[0].trim();
                sequence = parts[1].trim();
            }
            String fasta = PathwayinferenceConstants.DIRECT_REACTION + header + "\n" + sequence.replace(separator, "");
            for (String id : list) {
                if (fasta.contains("|gb|" + id) || fasta.contains("gi|" + id) || fasta.contains("emb|" + id) || fasta.contains("ref|" + id)) {
                    fastaById.put(id, fasta);
                }
            }
        }
        if (fastaById.size() != list.size()) {
            System.err.println("Could not get sequences (number: " + fastaById.size() + ") for all genbank identifiers (number: " + list.size() + ")!");
        }
        return fastaById;
    }

    /**
     * Fetches the FASTA report page of a protein entry and echoes it line by line to standard
     * output.
     *
     * <p>NOTE(review): nothing is extracted from the page - the method always returns
     * {@code ""}. This looks like an unfinished implementation.
     *
     * @param str GenBank identifier of the protein
     * @return always the empty string
     */
    public static String getFastaSequenceGivenGenBankId(String str) {
        String url = "http://www.ncbi.nlm.nih.gov/protein/" + str + "?report=fasta";
        try {
            for (String line : new StringExtractor(url).extractStrings(false).split("\n")) {
                System.out.println(line);
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return "";
    }

    /**
     * Manual smoke test: fetches two nucleotide records and writes each one to
     * {@code test_<id>.fasta} in the working directory.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        List<String> accessions = new ArrayList<String>();
        accessions.add("GG692713.1");
        accessions.add("GG692714.1");
        Map<String, String> sequences = getSequenceUsingEUtilities(NUCLEOTIDE_DB, accessions);
        for (Map.Entry<String, String> entry : sequences.entrySet()) {
            System.out.println("exporting fasta file for id: " + entry.getKey());
            IOTools.exportStringToFile(entry.getValue(), "test_" + entry.getKey() + ".fasta");
        }
        System.out.println(sequences);
    }
}
