In [1]:
import requests
import datetime
import os
import pickle
import pandas as pd
from ost import io, seq

# from Tara code
def _get_sequence(chn):
    """Get the sequence out of an OST chain."""
    # initialise
    lst_rn = chn.residues[0].number.num
    idx = 1
    sqe = chn.residues[0].one_letter_code
    if lst_rn != 1:
        sqe = "-"
        idx = 0

    for res in chn.residues[idx:]:
        lst_rn += 1
        while lst_rn != res.number.num:
            sqe += "-"
            lst_rn += 1
        sqe += res.one_letter_code

    return sqe

def _check_sequence(up_ac, sequence):
    """Verify sequence to only contain standard olc."""
    for res in sequence:
        if res not in "ACDEFGHIKLMNPQRSTVWY":
            raise RuntimeError(
                "Non-standard aa found in UniProtKB sequence "
                + f"for entry '{up_ac}': {res}"
            )

def _fetch_upkb_entry(up_ac):
    """Fetch data for an UniProtKB entry.

    Downloads the text-format record of ``up_ac`` and extracts a handful of
    fields from it, dispatching on the line prefix of every line.

    :param up_ac: UniProtKB accession; inserted into the download URL.
    :return: ``dict`` with the keys ``up_ac``, ``up_id``, ``up_gn``,
             ``up_organism``, ``up_ncbi_taxid``, ``up_sequence``,
             ``up_seqlen`` (``int``), ``up_crc64``, ``up_last_mod``
             (``datetime.datetime``) and ``up_isoform`` (always ``None``).
    :raises RuntimeError: If a line has an unexpected layout, an isoform is
                          mentioned in an RP line, a required field is
                          missing, the announced and actual sequence lengths
                          differ or the sequence has non-standard letters.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    # This is a simple parser for UniProtKB txt format, instead of breaking it up
    # into multiple functions, we just allow many many branches & statements,
    # here.
    # pylint: disable=too-many-branches,too-many-statements
    data = {}
    data["up_organism"] = ""
    data["up_sequence"] = ""
    data["up_ac"] = up_ac
    rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt")
    # Stop on HTTP errors instead of parsing an error page, which would only
    # surface later as a misleading "No ... found" message.
    rspns.raise_for_status()
    for line in rspns.iter_lines(decode_unicode=True):
        if line.startswith("ID   "):
            sline = line.split()
            if len(sline) != 5:
                raise RuntimeError(
                    f"Unusual UniProtKB ID line found:\n'{line}'"
                )
            data["up_id"] = sline[1]
        elif line.startswith("OX   NCBI_TaxID="):
            # Following strictly the UniProtKB format: 'OX   NCBI_TaxID=<ID>;'
            data["up_ncbi_taxid"] = line[len("OX   NCBI_TaxID=") : -1]
            # drop anything from a '{' on, keeping the bare ID
            data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip()
        elif line.startswith("OS   "):
            if line[-1] == ".":
                # last OS line: drop the terminating '.'
                data["up_organism"] += line[len("OS   ") : -1]
            else:
                # more OS lines follow: keep the whole line (there is no
                # trailing '.' to strip here) and separate it by a space
                data["up_organism"] += line[len("OS   ") :] + " "
        elif line.startswith("SQ   "):
            sline = line.split()
            if len(sline) != 8:
                raise RuntimeError(
                    f"Unusual UniProtKB SQ line found:\n'{line}'"
                )
            data["up_seqlen"] = int(sline[2])
            data["up_crc64"] = sline[6]
        elif line.startswith("     "):
            # sequence data: whitespace separated blocks of residues
            sline = line.split()
            if len(sline) > 6:
                raise RuntimeError(
                    "Unusual UniProtKB sequence data line "
                    + f"found:\n'{line}'"
                )
            data["up_sequence"] += "".join(sline)
        elif line.startswith("RP   "):
            if "ISOFORM" in line.upper():
                # isoforms are not handled below, so refuse to go on
                raise RuntimeError(
                    f"First ISOFORM found for '{up_ac}', needs " + "handling."
                )
        elif line.startswith("DT   "):
            # 2012-10-03
            dt_flds = line[len("DT   ") :].split(", ")
            # only the "sequence version" DT line carries the date we want;
            # the length check guards DT lines without a ', ' separator
            if len(dt_flds) > 1 and dt_flds[1].upper().startswith(
                "SEQUENCE VERSION "
            ):
                # NOTE: '%b' is locale dependent; month names are matched
                # against the current locale's abbreviations
                data["up_last_mod"] = datetime.datetime.strptime(
                    dt_flds[0], "%d-%b-%Y"
                )
        elif line.startswith("GN   Name="):
            data["up_gn"] = line[len("GN   Name=") :].split(";")[0]
            data["up_gn"] = data["up_gn"].split("{")[0].strip()

    # we have not seen isoforms in the data set, yet, so we just set them to
    # None
    data["up_isoform"] = None

    if "up_gn" not in data:
        raise RuntimeError(
            f"No gene name found for UniProtKB entry '{up_ac}'."
        )
    if "up_last_mod" not in data:
        raise RuntimeError(
            f"No sequence version found for UniProtKB entry '{up_ac}'."
        )
    if "up_crc64" not in data:
        # 'up_seqlen' is set together with 'up_crc64' (same SQ line), so this
        # check also guarantees that the length comparison below can run
        raise RuntimeError(
            f"No CRC64 value found for UniProtKB entry '{up_ac}'."
        )
    if len(data["up_sequence"]) == 0:
        raise RuntimeError(f"No sequence found for UniProtKB entry '{up_ac}'.")
    # check that the sequence length announced on the SQ line is correct
    # (the CRC64 value is stored but not re-computed here)
    if data["up_seqlen"] != len(data["up_sequence"]):
        raise RuntimeError(
            "Sequence length of SQ line and sequence data differ for "
            + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
            + f"{len(data['up_sequence'])}"
        )
    _check_sequence(data["up_ac"], data["up_sequence"])

    if "up_id" not in data:
        raise RuntimeError(f"No ID found for UniProtKB entry '{up_ac}'.")
    if "up_ncbi_taxid" not in data:
        raise RuntimeError(
            f"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'."
        )
    if len(data["up_organism"]) == 0:
        raise RuntimeError(
            f"No organism species found for UniProtKB entry '{up_ac}'."
        )

    return data

def _get_upkb_for_sequence(sqe, up_ac):
    """Fetch the UniProtKB entry ``up_ac`` and insist that its sequence is ``sqe``.

    Returns the entry data as delivered by ``_fetch_upkb_entry``; raises a
    ``RuntimeError`` showing both sequences if they are not identical.
    """
    entry = _fetch_upkb_entry(up_ac)
    up_sqe = entry["up_sequence"]
    if sqe != up_sqe:
        msg = f"Sequences not equal from file: {sqe}, from UniProtKB: "
        msg += f"{up_sqe}"
        raise RuntimeError(msg)

    return entry
In [2]:
def _get_ncbi_sequence(ncbi_ac):
    """Fetch OST sequence object from NCBI web service.

    Requests the FASTA record of ``ncbi_ac`` from the ``protein`` database via
    the E-utilities ``efetch`` endpoint and parses the response text.

    :param ncbi_ac: NCBI protein accession; inserted into the query string
                    as is.
    :return: Whatever ``io.SequenceFromString(<text>, "fasta")`` returns.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25500/#_chapter1_Downloading_Full_Records_
    rspns = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        f"efetch.fcgi?db=protein&id={ncbi_ac}"
        "&rettype=fasta&retmode=text"
    )
    # Do not hand an HTTP error body to the FASTA parser.
    rspns.raise_for_status()
    return io.SequenceFromString(rspns.text, "fasta")
In [3]:
# s = _get_ncbi_sequence("CAD2068351.1")
# up_data = _fetch_upkb_entry("A0A485PQD1")
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data
In [4]:
# Input locations for the USDA data check.
# The metadata table and the PDB files are read from the 2022-08-22 update;
# earlier runs used "./OneDrive-Part/ASFV-G_proteome_accessions.csv" and
# "./OneDrive-Part/AlphaFoldSimple" instead.
updates_dir = "./2022-08-22_updates"
metadata_file = updates_dir + "/ASFV-G_proteome_accessions.csv"
pdb_dir = updates_dir + "/AlphaFold-RENAME"
# the pickle files are read from the OneDrive export
pkl_dir = "./OneDrive-Part/PickleFiles"
In [5]:
# Load the metadata table; both name columns must identify a row uniquely.
metadata = pd.read_csv(metadata_file)
n_rows = metadata.shape[0]
for unique_col in ("Protein", "Associated PDB"):
    assert len(set(metadata[unique_col])) == n_rows
metadata
Out[5]:
Protein Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
0 285L 285L.pdb CAD2068351.1 A0A485PQD1 ASFV-G 285L This model was predicted using AlphaFold2 model_1_pred_0 NaN
1 A104R A104R.pdb CAD2068395.1 A0A0A1E0L7 ASFV-G A104R This model was predicted using AlphaFold2 model_2_pred_0 NaN
2 A118R A118R.pdb CAD2068397.1 A0A2X0RVA9 ASFV-G A118R This model was predicted using AlphaFold2 model_1_pred_0 NaN
3 A137R A137R.pdb CAD2068404.1 A0A2X0THQ0 ASFV-G A137R This model was predicted using AlphaFold2 model_3_pred_0 NaN
4 A151R A151R.pdb CAD2068398.1 A0A2X0TC55 ASFV-G A151R This model was predicted using AlphaFold2 model_4_pred_0 NaN
... ... ... ... ... ... ... ... ...
192 QP509L QP509L.pdb CAD2068484.1 A0A2X0THX2 ASFV-G QP509L This model was predicted using AlphaFold2 NaN NaN
193 R298L R298L.pdb CAD2068482.1 A0A2X0SE42 ASFV-G R298L This model was predicted using AlphaFold2 model_3_pred_0 NaN
194 S183L S183L.pdb CAD2068472.1 A0A2X0SE34 ASFV-G S183L This model was predicted using AlphaFold2 model_4_pred_0 NaN
195 S273R S273R.pdb CAD2068473.1 A0A2X0TKM5 ASFV-G S273R This model was predicted using AlphaFold2 model_2_pred_0 NaN
196 X69R X69R.pdb CAD2068372.1 A0A2X0TKC7 ASFV-G X69R This model was predicted using AlphaFold2 model_2_pred_0 NaN

197 rows × 8 columns

In [10]:
# Compare the new metadata table with the old one: for each listed column,
# report the values that occur in only one of the two tables.
metadata_old = pd.read_csv("./OneDrive-Part/ASFV-G_proteome_accessions.csv")
for key in ("NCBI_Accession", "UniProt_ID", "_struct.title "):
    print(key)
    old_values, new_values = set(metadata_old[key]), set(metadata[key])
    print("ONLY IN OLD:", sorted(old_values.difference(new_values)))
    print("ONLY IN NEW:", sorted(new_values.difference(old_values)))
NCBI_Accession
ONLY IN OLD: []
ONLY IN NEW: []
UniProt_ID
ONLY IN OLD: []
ONLY IN NEW: []
_struct.title 
ONLY IN OLD: []
ONLY IN NEW: []
In [7]:
# sorted names of all files in pdb_dir that end in ".pdb"
pdb_files = sorted(
    name for name in os.listdir(pdb_dir) if name.endswith(".pdb")
)
In [19]:
# check names: split every file name into (stem, extension)
pdb_file_split = list(map(os.path.splitext, pdb_files))
In [9]:
# PDB file names on disk vs. the "Associated PDB" column of the metadata
pdb_names = set(pdb_files)
listed_names = set(metadata["Associated PDB"])
print("ONLY AS PDB:", sorted(pdb_names.difference(listed_names)))
print("ONLY IN METADATA:", sorted(listed_names.difference(pdb_names)))
ONLY AS PDB: []
ONLY IN METADATA: []
In [10]:
# PDB file names without extension vs. the Protein column of the metadata
pdb_stems = {stem for stem, _ext in pdb_file_split}
listed_proteins = set(metadata.Protein)
print("ONLY AS PDB:", sorted(pdb_stems.difference(listed_proteins)))
print("ONLY IN METADATA:", sorted(listed_proteins.difference(pdb_stems)))
ONLY AS PDB: []
ONLY IN METADATA: []
In [11]:
# can use either Protein or PDB name as index
# Guarded so the cell can be re-run: once "Protein" is the index it is no
# longer a column and a second set_index("Protein") would raise a KeyError.
if metadata.index.name != "Protein":
    metadata = metadata.set_index("Protein")
In [13]:
# Rows without a title.
# NOTE: the column name really ends with a space ("_struct.title ").
metadata.loc[metadata["_struct.title "].isna()]
Out[13]:
Protein Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
In [14]:
# rows without a model detail text
metadata.loc[metadata["_struct.pdbx_model_detail"].isna()]
Out[14]:
Protein Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
In [15]:
# rows that carry a note
metadata.loc[metadata.notes.notna()]
Out[15]:
Protein Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
58 CP2475L_p14 CP2475L_p14.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p14 This model was predicted using AlphaFold2 model_4_pred_0 protein p14 from the pp220 polyprotein encoded...
59 CP2475L_p34 CP2475L_p34.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p34 This model was predicted using AlphaFold2 model_4_pred_0 protein p34 from the pp220 polyprotein encoded...
60 CP2475L_p37 CP2475L_p37.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p37 This model was predicted using AlphaFold2 model_4_pred_0 protein p37 from the pp220 polyprotein encoded...
61 CP2475L_p150 CP2475L_p150.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p150 This model was predicted using AlphaFold2 model_1_pred_0 protein p150 from the pp220 polyprotein encode...
62 CP2475L_p5 CP2475L_p5.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p5 This model was predicted using AlphaFold2 model_3_pred_0 protein p5 from the pp220 polyprotein encoded ...
69 D250R D250R.pdb CAD2068464.1 A0A2X0THV3 ASFV-G D250R This model was predicted using AlphaFold2 model_3_pred_0 Mislabled on NCBI and Uniport as D205R
76 DP79L DP79L.pdb CAD2068466.1 A0A0A1E158 ASFV-G DP79L This model was predicted using AlphaFold2 model_3_pred_0 Mislabled on Uniprot as D79L
111 hypothetical_01 hypothetical_01.pdb CAD2068367.1 A0A485PU43 ASFV-G hypothetical_01 This model was predicted using AlphaFold2 model_5_pred_0 labeled as hypothetical on NCBI and Uniprot
112 hypothetical_02 hypothetical_02.pdb CAD2068400.1 A0A485PQI3 ASFV-G hypothetical_02 This model was predicted using AlphaFold2 model_3_pred_0 labeled as hypthetical on NCBI and Uniprot
113 hypothetical_03 hypothetical_03.pdb CAD2068512.1 A0A485PZB7 ASFV-G hypothetical_03 This model was predicted using AlphaFold2 model_3_pred_0 labeled as hypthetical on NCBI and Uniprot
In [13]:
# tst = metadata.loc["hypothetical_03"]
# s = _get_ncbi_sequence(tst.NCBI_Accession)
# up_data = _fetch_upkb_entry(tst.UniProt_ID)
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data

# checked all the ones above manually and they are OK as stated (best to add to model_detail!)
In [44]:
# check model numbers: report every entry whose model ID is not a string of
# four '_'-separated fields, or whose second field is not a number in 1..5
for protein, mdl_id in metadata["ranking debugg model ID"].items():
    fields = mdl_id.split('_') if type(mdl_id) is str else []
    mdl_num = int(fields[1]) if len(fields) == 4 else None
    if not mdl_num:
        print(protein, mdl_id)
    elif not 1 <= mdl_num <= 5:
        print(protein, mdl_id, mdl_num)
QP509L nan
In [14]:
# Every pickle file name starts with "<protein>_"; that protein must be
# listed in the metadata and have a PDB file.
pkl_files = os.listdir(pkl_dir)
for pkl_file in pkl_files:
    protein = pkl_file.partition('_')[0]
    assert protein in metadata.index
    pdb_path = os.path.join(pdb_dir, f"{protein}.pdb")
    assert os.path.exists(pdb_path)
# keep the first listed file as an example for the next cells
pkl_file = pkl_files[0]
In [15]:
# Load the example pickle file; the context manager closes the file handle
# again (the former bare open() call left that to the garbage collector).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(os.path.join(pkl_dir, pkl_file), "rb") as pkl_fh:
    model_data = pickle.load(pkl_fh)
In [16]:
# Show which top-level entries the unpickled model data provides.
model_data.keys()
# NOTE: only pLDDT in here; may as well use the ones from the PDB file...
Out[16]:
dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'predicted_lddt', 'structure_module', 'plddt', 'ranking_confidence'])
In [17]:
import numpy as np

# Cross-check per-residue pLDDT values: B-factors stored in each PDB file
# against the "plddt" entry of the matching pickle file. For every protein
# with differences (after rounding to 2 decimals) print the protein, the
# number of differing residues and, for the first five of them, the residue
# numbers, the differences and both values.
for pkl_file in pkl_files:
    # GET pLDDT from PDB file
    protein = pkl_file.split('_', 1)[0]
    ent = io.LoadPDB(os.path.join(pdb_dir, protein + ".pdb"))
    bfactors = []
    for a in ent.atoms:
        res_idx = a.residue.number.num - 1
        # Residues must be numbered 1, 2, ... without gaps; the lower bound
        # keeps a residue number < 1 from silently indexing from the end.
        assert 0 <= res_idx <= len(bfactors)
        if res_idx < len(bfactors):
            # all atoms of one residue must share the same value
            assert a.b_factor == bfactors[res_idx]
        else:
            bfactors.append(a.b_factor)
    # COMPARE with pickled one
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(os.path.join(pkl_dir, pkl_file), "rb") as pkl_fh:
        model_data = pickle.load(pkl_fh)
    assert len(bfactors) == len(model_data["plddt"])
    b1 = np.round(np.asarray(bfactors), 2)
    b2 = np.round(model_data["plddt"], 2)
    d_bf = b1 - b2
    if d_bf.any():
        nz_idx = np.nonzero(d_bf)[0]
        len_nz_idx = len(nz_idx)
        nz_idx = nz_idx[:5]
        print(protein, len_nz_idx, nz_idx + 1, d_bf[nz_idx], b1[nz_idx], b2[nz_idx])
A137R 137 [1 2 3 4 5] [-15.18  -6.81  -4.82 -12.5   -3.61] [27.34 27.25 34.49 28.27 31.02] [42.52 34.06 39.31 40.77 34.63]
A151R 151 [1 2 3 4 5] [-1.57 -5.29 -3.63 -4.76 -6.1 ] [34.45 31.08 34.97 41.78 52.73] [36.02 36.37 38.6  46.54 58.83]
In [20]:
def _check_subset(s1, s2):
    # check if s2 is uniquely contained in s1
    # (and if so, returns values for seq_db_align_begin & seq_db_align_end)
    if s1.count(s2) == 1:
        align_begin = s1.find(s2) + 1
        align_end = align_begin + len(s2) - 1
        return align_begin, align_end
    else:
        return None
In [21]:
# check shared ones: for every PDB file listed in the metadata, compare the
# sequence of its single chain with the UniProtKB and the NCBI sequence
for protein, pdb_ext in sorted(pdb_file_split):
    if protein not in metadata.index:
        print("SKIPPING", protein)
        continue
    row = metadata.loc[protein]
    ent = io.LoadPDB(os.path.join(pdb_dir, protein + pdb_ext))
    assert ent.chain_count == 1
    pdb_seq = _get_sequence(ent.chains[0])
    ncbi_seq = str(_get_ncbi_sequence(row.NCBI_Accession))
    up_data = _fetch_upkb_entry(row.UniProt_ID)
    up_seq = up_data["up_sequence"]
    if up_seq != ncbi_seq:
        print(protein, "inconsistent UP/NCBI sequences", up_seq, ncbi_seq)
    # same report for both references: either the PDB sequence is a unique
    # sub-sequence of the reference or the two are inconsistent
    for ref_seq, subset_msg, mismatch_msg in (
        (up_seq, "PDB seq. is subset of UP", "inconsistent UP/PDB sequences"),
        (ncbi_seq, "PDB seq. is subset of NCBI", "inconsistent NCBI/PDB sequences"),
    ):
        if ref_seq == pdb_seq:
            continue
        span = _check_subset(ref_seq, pdb_seq)
        if span:
            print(protein, subset_msg, span)
        else:
            print(protein, mismatch_msg, ref_seq, pdb_seq)
CP2475L_p14 PDB seq. is subset of UP (369, 522)
CP2475L_p14 PDB seq. is subset of NCBI (369, 522)
CP2475L_p150 PDB seq. is subset of UP (894, 2476)
CP2475L_p150 PDB seq. is subset of NCBI (894, 2476)
CP2475L_p34 PDB seq. is subset of UP (45, 368)
CP2475L_p34 PDB seq. is subset of NCBI (45, 368)
CP2475L_p37 PDB seq. is subset of UP (523, 893)
CP2475L_p37 PDB seq. is subset of NCBI (523, 893)
CP2475L_p5 PDB seq. is subset of UP (2, 39)
CP2475L_p5 PDB seq. is subset of NCBI (2, 39)
In [22]:
# check QP509L (take pLDDT from unrelaxed)
ent_unr = io.LoadPDB("./OneDrive-Part/QP509L-unrelaxed.pdb")
ent_rel = io.LoadPDB("./OneDrive-Part/AlphaFoldSimple/QP509L.pdb")
# NOTE: ent_rel has hydrogens and no b_factors and no chain name!
occupancies = {atom.occupancy for atom in ent_rel.atoms}
b_factors = {atom.b_factor for atom in ent_rel.atoms}
print(occupancies, b_factors)
# compare atom names, leaving the hydrogens of the relaxed model aside
heavy_rel = ent_rel.Select("ele!=H")
names_rel = {atom.qualified_name for atom in heavy_rel.atoms}
# the first two characters of the unrelaxed names are cut off to make them
# comparable -- presumably the chain name prefix the relaxed model lacks
names_unr = {atom.qualified_name[2:] for atom in ent_unr.atoms}
print("IN UNRELAXED:", sorted(names_unr - names_rel))
print("IN RELAXED:", sorted(names_rel - names_unr))
{1.0} {0.0}
IN UNRELAXED: ['ILE10.CD1', 'ILE108.CD1', 'ILE12.CD1', 'ILE132.CD1', 'ILE134.CD1', 'ILE136.CD1', 'ILE137.CD1', 'ILE147.CD1', 'ILE153.CD1', 'ILE155.CD1', 'ILE158.CD1', 'ILE191.CD1', 'ILE194.CD1', 'ILE195.CD1', 'ILE228.CD1', 'ILE232.CD1', 'ILE237.CD1', 'ILE254.CD1', 'ILE255.CD1', 'ILE256.CD1', 'ILE279.CD1', 'ILE293.CD1', 'ILE300.CD1', 'ILE303.CD1', 'ILE313.CD1', 'ILE342.CD1', 'ILE343.CD1', 'ILE344.CD1', 'ILE354.CD1', 'ILE368.CD1', 'ILE369.CD1', 'ILE393.CD1', 'ILE394.CD1', 'ILE4.CD1', 'ILE405.CD1', 'ILE413.CD1', 'ILE414.CD1', 'ILE426.CD1', 'ILE430.CD1', 'ILE442.CD1', 'ILE446.CD1', 'ILE46.CD1', 'ILE463.CD1', 'ILE471.CD1', 'ILE48.CD1', 'ILE489.CD1', 'ILE5.CD1', 'ILE61.CD1', 'ILE65.CD1', 'LYS509.O']
IN RELAXED: ['ILE10.CD', 'ILE108.CD', 'ILE12.CD', 'ILE132.CD', 'ILE134.CD', 'ILE136.CD', 'ILE137.CD', 'ILE147.CD', 'ILE153.CD', 'ILE155.CD', 'ILE158.CD', 'ILE191.CD', 'ILE194.CD', 'ILE195.CD', 'ILE228.CD', 'ILE232.CD', 'ILE237.CD', 'ILE254.CD', 'ILE255.CD', 'ILE256.CD', 'ILE279.CD', 'ILE293.CD', 'ILE300.CD', 'ILE303.CD', 'ILE313.CD', 'ILE342.CD', 'ILE343.CD', 'ILE344.CD', 'ILE354.CD', 'ILE368.CD', 'ILE369.CD', 'ILE393.CD', 'ILE394.CD', 'ILE4.CD', 'ILE405.CD', 'ILE413.CD', 'ILE414.CD', 'ILE426.CD', 'ILE430.CD', 'ILE442.CD', 'ILE446.CD', 'ILE46.CD', 'ILE463.CD', 'ILE471.CD', 'ILE48.CD', 'ILE489.CD', 'ILE5.CD', 'ILE61.CD', 'ILE65.CD', 'LYS509.O1', 'LYS509.O2']
In [23]:
# can we fix it internally?
# Experiment on the relaxed QP509L model (ent_rel): rename atoms with the MM
# helper, then run the rule-based processor verbosely to see what is still
# reported as unknown.
import ost
from ost import testutils, conop
from ost.mol import mm
# setup conop
testutils.SetDefaultCompoundLib()
processor = conop.RuleBasedProcessor(conop.GetDefaultLib())
# map with MM function
# NOTE(review): presumably renames atoms of ent_rel in place -- confirm
mm.MMModeller.AssignPDBNaming(ent_rel)
# check processing
# verbosity is raised only around Process() so that its messages get shown
ost.PushVerbosityLevel(2)
processor.Process(ent_rel)
ost.PopVerbosityLevel()
residue LYS509 contains unknown atom O2
residue LYS509 contains unknown atom O1
residue LYS509 contains unknown atom HE1
residue LYS509 contains unknown atom HD1
residue LYS509 contains unknown atom HG1
residue LYS509 contains unknown atom HB1
In [24]:
import xml.dom.minidom
def _get_ncbi_info(ncbi_ac):
    """Fetch dict with info from NCBI web service.

    Queries the E-utilities ``esummary`` endpoint of the ``protein`` database
    for ``ncbi_ac`` and converts the ``Item`` children of the single
    ``DocSum`` element of the XML response into a dictionary.

    :param ncbi_ac: NCBI protein accession; inserted into the query string
                    as is.
    :return: ``dict`` mapping each item's ``Name`` attribute to its content:
             ``str`` for type ``String``, ``int`` for type ``Integer`` and
             ``None`` for items without content.
    :raises requests.HTTPError: If the server answers with an error status.
    :raises AssertionError: If the response does not hold exactly one
                            ``DocSum`` element.
    :raises RuntimeError: If an item with content has any other type.
    """
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
    rspns = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        f"esummary.fcgi?db=protein&id={ncbi_ac}"
    )
    # Do not hand an HTTP error body to the XML parser.
    rspns.raise_for_status()
    dom = xml.dom.minidom.parseString(rspns.text)
    docsums = dom.getElementsByTagName("DocSum")
    assert len(docsums) == 1, (
        f"Expected exactly one DocSum for {ncbi_ac}, found {len(docsums)}"
    )
    docsum = docsums[0]
    ncbi_dict = {}
    for cn in docsum.childNodes:
        # skip everything that is not an <Item> element
        if cn.nodeName != "Item":
            continue
        cn_name = cn.getAttribute("Name")
        cn_type = cn.getAttribute("Type")
        if not cn.childNodes:
            # empty item: record the key without a value
            ncbi_dict[cn_name] = None
            continue
        d = cn.childNodes[0].data
        if cn_type == "String":
            ncbi_dict[cn_name] = d
        elif cn_type == "Integer":
            ncbi_dict[cn_name] = int(d)
        else:
            raise RuntimeError(f"Unknown type {cn_type} for {ncbi_ac}")
    return ncbi_dict
In [25]:
# fetch some extra info from NCBI and store it in new NCBI_* columns
for protein, entry in metadata.iterrows():
    info = _get_ncbi_info(entry.NCBI_Accession)
    # Gi is some numerical sequence identifier used internally by NCBI
    metadata.loc[protein, "NCBI_Gi"] = str(info["Gi"])
    # UpdateDate is to be stored as the version date in ModelCIF
    metadata.loc[protein, "NCBI_UpdateDate"] = info["UpdateDate"]
    # TaxId should be same as one from UP
    metadata.loc[protein, "NCBI_TaxId"] = str(info["TaxId"])
    # Report anything unexpected: Status is expected to be live, ReplacedBy
    # to be empty and AccessionVersion to equal NCBI_Accession
    expectations = (
        ("Status", lambda value: value != "live"),
        ("ReplacedBy", bool),
        ("AccessionVersion", lambda value: value != entry.NCBI_Accession),
    )
    for field, is_unexpected in expectations:
        if is_unexpected(info[field]):
            print(protein, entry.NCBI_Accession, field, info[field])
In [26]:
# Show the table again, now with the NCBI_Gi, NCBI_UpdateDate and NCBI_TaxId
# columns filled in by the loop over all rows.
metadata
Out[26]:
Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes NCBI_Gi NCBI_UpdateDate NCBI_TaxId
Protein
285L 285L.pdb CAD2068351.1 A0A485PQD1 ASFV-G 285L This model was predicted using AlphaFold2 model_1_pred_0 NaN 1886136876 2020/08/05 10497
A104R A104R.pdb CAD2068395.1 A0A0A1E0L7 ASFV-G A104R This model was predicted using AlphaFold2 model_2_pred_0 NaN 1886136920 2020/08/05 10497
A118R A118R.pdb CAD2068397.1 A0A2X0RVA9 ASFV-G A118R This model was predicted using AlphaFold2 model_1_pred_0 NaN 1886136922 2020/08/05 10497
A137R A137R.pdb CAD2068404.1 A0A2X0THQ0 ASFV-G A137R This model was predicted using AlphaFold2 model_3_pred_0 NaN 1886136929 2020/08/05 10497
A151R A151R.pdb CAD2068398.1 A0A2X0TC55 ASFV-G A151R This model was predicted using AlphaFold2 model_4_pred_0 NaN 1886136923 2020/08/05 10497
... ... ... ... ... ... ... ... ... ... ...
QP509L QP509L.pdb CAD2068484.1 A0A2X0THX2 ASFV-G QP509L This model was predicted using AlphaFold2 NaN NaN 1886137009 2020/08/05 10497
R298L R298L.pdb CAD2068482.1 A0A2X0SE42 ASFV-G R298L This model was predicted using AlphaFold2 model_3_pred_0 NaN 1886137007 2020/08/05 10497
S183L S183L.pdb CAD2068472.1 A0A2X0SE34 ASFV-G S183L This model was predicted using AlphaFold2 model_4_pred_0 NaN 1886136997 2020/08/05 10497
S273R S273R.pdb CAD2068473.1 A0A2X0TKM5 ASFV-G S273R This model was predicted using AlphaFold2 model_2_pred_0 NaN 1886136998 2020/08/05 10497
X69R X69R.pdb CAD2068372.1 A0A2X0TKC7 ASFV-G X69R This model was predicted using AlphaFold2 model_2_pred_0 NaN 1886136897 2020/08/05 10497

197 rows × 10 columns

In [27]:
# number of distinct accessions / all tax IDs / number of distinct Gi values /
# all update dates
(
    len(set(metadata["NCBI_Accession"])),
    set(metadata["NCBI_TaxId"]),
    len(set(metadata["NCBI_Gi"])),
    set(metadata["NCBI_UpdateDate"]),
)
Out[27]:
(193, {'10497'}, 193, {'2020/08/05'})
In [28]:
# Example of the parsed UniProtKB data.
# NOTE: `up_data` is not assigned in this cell; it is whatever the last
# executed assignment left behind (presumably the final iteration of the
# sequence-check loop) -- fetch an entry explicitly if a specific one is needed.
up_data
Out[28]:
{'up_organism': 'African swine fever virus (ASFV)',
 'up_sequence': 'MMNIIKIRGKIFFAVLLEEDISLNTLSPNAVIRKI',
 'up_ac': 'A0A485PZB7',
 'up_id': 'A0A485PZB7_ASF',
 'up_last_mod': datetime.datetime(2019, 6, 5, 0, 0),
 'up_gn': 'hypthetical',
 'up_ncbi_taxid': '10497',
 'up_seqlen': 35,
 'up_crc64': '5CA27C9509292409',
 'up_isoform': None}
In [ ]: