import requests
import datetime
import os
import pickle
import pandas as pd
from ost import io, seq
# from Tara code
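# NOTE: _abort_msg() is used by the helpers below but is defined elsewhere in the
# original scripts; a minimal stand-in (an assumption, not the original
# implementation, which may print and sys.exit) so this section is self-contained:
def _abort_msg(msg):
    """Report a fatal error (hypothetical stand-in for the original helper)."""
    raise RuntimeError(msg)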
def _get_sequence(chn):
    """Get the sequence out of an OST chain, filling numbering gaps with '-'."""
    # initialise with the first residue
    lst_rn = chn.residues[0].number.num
    idx = 1
    sqe = chn.residues[0].one_letter_code
    if lst_rn != 1:
        # chain does not start at residue 1: begin with a gap and let the loop
        # below (re)process the first residue
        sqe = "-"
        lst_rn = 1
        idx = 0
    for res in chn.residues[idx:]:
        lst_rn += 1
        while lst_rn != res.number.num:
            sqe += "-"
            lst_rn += 1
        sqe += res.one_letter_code
    return sqe
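# Minimal sketch (not part of the original notebook) illustrating the gap-filling
# of _get_sequence() using lightweight stand-ins for OST chains/residues;
# residues 1-3 and 6 present, 4-5 missing -> "MKV--A":
from types import SimpleNamespace
def _mock_chain(numbered_olcs):
    """Build a chain-like object from (residue number, one-letter code) tuples."""
    return SimpleNamespace(residues=[
        SimpleNamespace(number=SimpleNamespace(num=n), one_letter_code=olc)
        for n, olc in numbered_olcs
    ])
print(_get_sequence(_mock_chain([(1, "M"), (2, "K"), (3, "V"), (6, "A")])))  # MKV--A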
def _check_sequence(up_ac, sequence):
    """Verify that the sequence only contains standard one-letter codes."""
    for res in sequence:
        if res not in "ACDEFGHIKLMNPQRSTVWY":
            raise RuntimeError(
                "Non-standard aa found in UniProtKB sequence "
                + f"for entry '{up_ac}': {res}"
            )
def _fetch_upkb_entry(up_ac):
    """Fetch data for a UniProtKB entry."""
    # This is a simple parser for the UniProtKB txt format; instead of breaking
    # it up into multiple functions, we just allow many branches & statements
    # here.
    # pylint: disable=too-many-branches,too-many-statements
    data = {}
    data["up_organism"] = ""
    data["up_sequence"] = ""
    data["up_ac"] = up_ac
    rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt")
    for line in rspns.iter_lines(decode_unicode=True):
        if line.startswith("ID   "):
            sline = line.split()
            if len(sline) != 5:
                _abort_msg(f"Unusual UniProtKB ID line found:\n'{line}'")
            data["up_id"] = sline[1]
        elif line.startswith("OX   NCBI_TaxID="):
            # Following strictly the UniProtKB format: 'OX   NCBI_TaxID=<ID>;'
            data["up_ncbi_taxid"] = line[len("OX   NCBI_TaxID=") : -1]
            data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip()
        elif line.startswith("OS   "):
            # organism name may span multiple OS lines; the last one ends in '.'
            if line[-1] == ".":
                data["up_organism"] += line[len("OS   ") : -1]
            else:
                data["up_organism"] += line[len("OS   ") : -1] + " "
        elif line.startswith("SQ   "):
            sline = line.split()
            if len(sline) != 8:
                _abort_msg(f"Unusual UniProtKB SQ line found:\n'{line}'")
            data["up_seqlen"] = int(sline[2])
            data["up_crc64"] = sline[6]
        elif line.startswith("     "):
            # sequence data lines: indented blocks of residues
            sline = line.split()
            if len(sline) > 6:
                _abort_msg(
                    "Unusual UniProtKB sequence data line "
                    + f"found:\n'{line}'"
                )
            data["up_sequence"] += "".join(sline)
        elif line.startswith("RP   "):
            if "ISOFORM" in line.upper():
                raise RuntimeError(
                    f"First ISOFORM found for '{up_ac}', needs handling."
                )
        elif line.startswith("DT   "):
            # e.g. 'DT   05-JUN-2019, sequence version 1.'
            dt_flds = line[len("DT   ") :].split(", ")
            if dt_flds[1].upper().startswith("SEQUENCE VERSION "):
                data["up_last_mod"] = datetime.datetime.strptime(
                    dt_flds[0], "%d-%b-%Y"
                )
        elif line.startswith("GN   Name="):
            data["up_gn"] = line[len("GN   Name=") :].split(";")[0]
            data["up_gn"] = data["up_gn"].split("{")[0].strip()
    # we have not seen isoforms in the data set yet, so we just set it to None
    data["up_isoform"] = None
    if "up_gn" not in data:
        _abort_msg(f"No gene name found for UniProtKB entry '{up_ac}'.")
    if "up_last_mod" not in data:
        _abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.")
    if "up_crc64" not in data:
        _abort_msg(f"No CRC64 value found for UniProtKB entry '{up_ac}'.")
    if len(data["up_sequence"]) == 0:
        _abort_msg(f"No sequence found for UniProtKB entry '{up_ac}'.")
    # check that the sequence length from the SQ line matches the sequence data
    # (the CRC64 value is parsed but not verified here; see sketch below)
    if data["up_seqlen"] != len(data["up_sequence"]):
        _abort_msg(
            "Sequence length of SQ line and sequence data differ for "
            + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
            + f"{len(data['up_sequence'])}"
        )
    _check_sequence(data["up_ac"], data["up_sequence"])
    if "up_id" not in data:
        _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.")
    if "up_ncbi_taxid" not in data:
        _abort_msg(f"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.")
    if len(data["up_organism"]) == 0:
        _abort_msg(f"No organism species found for UniProtKB entry '{up_ac}'.")
    return data
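# The SQ line carries a CRC64 checksum (stored above as "up_crc64") which the
# parser does not verify. A sketch of such a check, assuming UniProtKB uses the
# ISO-3309 CRC-64 (generator x^64 + x^4 + x^3 + x + 1, reflected, initial value 0)
# in its common table-driven form; validate against a known entry before relying
# on it:
def _crc64(sequence):
    """Hex digest of the CRC-64 (ISO-3309, reflected, init 0) of a string."""
    poly64_rev = 0xD800000000000000  # bit-reversed generator polynomial
    table = []
    for i in range(256):
        part = i
        for _ in range(8):
            part = (part >> 1) ^ poly64_rev if part & 1 else part >> 1
        table.append(part)
    crc = 0
    for byte in sequence.encode("ascii"):
        crc = table[(crc ^ byte) & 0xFF] ^ (crc >> 8)
    return f"{crc:016X}"
# e.g., it could be used inside _fetch_upkb_entry(), after the length check:
# if _crc64(data["up_sequence"]) != data["up_crc64"]:
#     _abort_msg(f"CRC64 mismatch for UniProtKB entry '{up_ac}'.")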
def _get_upkb_for_sequence(sqe, up_ac):
    """Get UniProtKB entry data for a given sequence."""
    up_data = _fetch_upkb_entry(up_ac)
    if sqe != up_data["up_sequence"]:
        raise RuntimeError(
            f"Sequences not equal from file: {sqe}, from UniProtKB: "
            + f"{up_data['up_sequence']}"
        )
    return up_data
def _get_ncbi_sequence(ncbi_ac):
    """Fetch OST sequence object from NCBI web service."""
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25500/#_chapter1_Downloading_Full_Records_
    rspns = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
                         f"efetch.fcgi?db=protein&id={ncbi_ac}"
                         f"&rettype=fasta&retmode=text")
    return io.SequenceFromString(rspns.text, "fasta")
# s = _get_ncbi_sequence("CAD2068351.1")
# up_data = _fetch_upkb_entry("A0A485PQD1")
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data
# check USDA data
# metadata_file = "./OneDrive-Part/ASFV-G_proteome_accessions.csv"
# pdb_dir = "./OneDrive-Part/AlphaFoldSimple"
metadata_file = "./2022-08-22_updates/ASFV-G_proteome_accessions.csv"
pdb_dir = "./2022-08-22_updates/AlphaFold-RENAME"
pkl_dir = "./OneDrive-Part/PickleFiles"
metadata = pd.read_csv(metadata_file)
assert len(set(metadata.Protein)) == metadata.shape[0]
assert len(set(metadata["Associated PDB"])) == metadata.shape[0]
metadata
 | Protein | Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes
---|---|---|---|---|---|---|---|---
0 | 285L | 285L.pdb | CAD2068351.1 | A0A485PQD1 | ASFV-G 285L | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN |
1 | A104R | A104R.pdb | CAD2068395.1 | A0A0A1E0L7 | ASFV-G A104R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN |
2 | A118R | A118R.pdb | CAD2068397.1 | A0A2X0RVA9 | ASFV-G A118R | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN |
3 | A137R | A137R.pdb | CAD2068404.1 | A0A2X0THQ0 | ASFV-G A137R | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN |
4 | A151R | A151R.pdb | CAD2068398.1 | A0A2X0TC55 | ASFV-G A151R | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... |
192 | QP509L | QP509L.pdb | CAD2068484.1 | A0A2X0THX2 | ASFV-G QP509L | This model was predicted using AlphaFold2 | NaN | NaN |
193 | R298L | R298L.pdb | CAD2068482.1 | A0A2X0SE42 | ASFV-G R298L | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN |
194 | S183L | S183L.pdb | CAD2068472.1 | A0A2X0SE34 | ASFV-G S183L | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN |
195 | S273R | S273R.pdb | CAD2068473.1 | A0A2X0TKM5 | ASFV-G S273R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN |
196 | X69R | X69R.pdb | CAD2068372.1 | A0A2X0TKC7 | ASFV-G X69R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN |
197 rows × 8 columns
# compare new with old
metadata_old = pd.read_csv("./OneDrive-Part/ASFV-G_proteome_accessions.csv")
for key in ["NCBI_Accession", "UniProt_ID", "_struct.title "]:
    print(key)
    tsto = set(metadata_old[key])
    tstm = set(metadata[key])
    print("ONLY IN OLD:", sorted(tsto - tstm))
    print("ONLY IN NEW:", sorted(tstm - tsto))
NCBI_Accession
ONLY IN OLD: []
ONLY IN NEW: []
UniProt_ID
ONLY IN OLD: []
ONLY IN NEW: []
_struct.title 
ONLY IN OLD: []
ONLY IN NEW: []
pdb_files = [f for f in sorted(os.listdir(pdb_dir)) if f.endswith(".pdb")]
# check names
pdb_file_split = [os.path.splitext(f) for f in pdb_files]
tstp = set(pdb_files)
tstm = set(metadata["Associated PDB"])
print("ONLY AS PDB:", sorted(tstp - tstm))
print("ONLY IN METADATA:", sorted(tstm - tstp))
ONLY AS PDB: []
ONLY IN METADATA: []
tstp = set(fs[0] for fs in pdb_file_split)
tstm = set(metadata.Protein)
print("ONLY AS PDB:", sorted(tstp - tstm))
print("ONLY IN METADATA:", sorted(tstm - tstp))
ONLY AS PDB: []
ONLY IN METADATA: []
# can use either Protein or PDB name as index
metadata = metadata.set_index("Protein")
# NOTE: stupid space there...
metadata[metadata["_struct.title "].isna()]
Protein | Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes
---|---|---|---|---|---|---|---
metadata[metadata["_struct.pdbx_model_detail"].isna()]
Protein | Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes
---|---|---|---|---|---|---|---
metadata[~metadata.notes.isna()]
 | Protein | Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes
---|---|---|---|---|---|---|---|---
58 | CP2475L_p14 | CP2475L_p14.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p14 | This model was predicted using AlphaFold2 | model_4_pred_0 | protein p14 from the pp220 polyprotein encoded... |
59 | CP2475L_p34 | CP2475L_p34.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p34 | This model was predicted using AlphaFold2 | model_4_pred_0 | protein p34 from the pp220 polyprotein encoded... |
60 | CP2475L_p37 | CP2475L_p37.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p37 | This model was predicted using AlphaFold2 | model_4_pred_0 | protein p37 from the pp220 polyprotein encoded... |
61 | CP2475L_p150 | CP2475L_p150.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p150 | This model was predicted using AlphaFold2 | model_1_pred_0 | protein p150 from the pp220 polyprotein encode... |
62 | CP2475L_p5 | CP2475L_p5.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p5 | This model was predicted using AlphaFold2 | model_3_pred_0 | protein p5 from the pp220 polyprotein encoded ... |
69 | D250R | D250R.pdb | CAD2068464.1 | A0A2X0THV3 | ASFV-G D250R | This model was predicted using AlphaFold2 | model_3_pred_0 | Mislabled on NCBI and Uniport as D205R |
76 | DP79L | DP79L.pdb | CAD2068466.1 | A0A0A1E158 | ASFV-G DP79L | This model was predicted using AlphaFold2 | model_3_pred_0 | Mislabled on Uniprot as D79L |
111 | hypothetical_01 | hypothetical_01.pdb | CAD2068367.1 | A0A485PU43 | ASFV-G hypothetical_01 | This model was predicted using AlphaFold2 | model_5_pred_0 | labeled as hypothetical on NCBI and Uniprot |
112 | hypothetical_02 | hypothetical_02.pdb | CAD2068400.1 | A0A485PQI3 | ASFV-G hypothetical_02 | This model was predicted using AlphaFold2 | model_3_pred_0 | labeled as hypthetical on NCBI and Uniprot |
113 | hypothetical_03 | hypothetical_03.pdb | CAD2068512.1 | A0A485PZB7 | ASFV-G hypothetical_03 | This model was predicted using AlphaFold2 | model_3_pred_0 | labeled as hypthetical on NCBI and Uniprot |
# tst = metadata.loc["hypothetical_03"]
# s = _get_ncbi_sequence(tst.NCBI_Accession)
# up_data = _fetch_upkb_entry(tst.UniProt_ID)
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data
# checked all of the ones above manually and they are OK as stated (best to add this info to _struct.pdbx_model_detail!)
# check model numbers
tst = metadata["ranking debugg model ID"]
for idx, mdl_id in tst.items():
    mdl_num = None
    if isinstance(mdl_id, str):
        mdl_id_split = mdl_id.split('_')
        if len(mdl_id_split) == 4:
            mdl_num = int(mdl_id_split[1])
    if not mdl_num:
        print(idx, mdl_id)
    elif mdl_num not in range(1, 6):
        print(idx, mdl_id, mdl_num)
QP509L nan
pkl_files = os.listdir(pkl_dir)
for pkl_file in pkl_files:
    protein = pkl_file.split('_', 1)[0]
    assert protein in metadata.index
    assert os.path.exists(os.path.join(pdb_dir, protein + ".pdb"))
pkl_file = pkl_files[0]
model_data = pickle.load(open(os.path.join(pkl_dir, pkl_file), "rb"))
model_data.keys()
# NOTE: only pLDDT in here; may as well use the ones from the PDB file...
dict_keys(['distogram', 'experimentally_resolved', 'masked_msa', 'predicted_lddt', 'structure_module', 'plddt', 'ranking_confidence'])
import numpy as np
for pkl_file in pkl_files:
    # GET pLDDT from PDB file
    protein = pkl_file.split('_', 1)[0]
    ent = io.LoadPDB(os.path.join(pdb_dir, protein + ".pdb"))
    bfactors = []
    for a in ent.atoms:
        res_idx = a.residue.number.num - 1
        assert res_idx <= len(bfactors)
        if res_idx < len(bfactors):
            assert a.b_factor == bfactors[res_idx]
        else:
            bfactors.append(a.b_factor)
    # COMPARE with pickled one
    model_data = pickle.load(open(os.path.join(pkl_dir, pkl_file), "rb"))
    assert len(bfactors) == len(model_data["plddt"])
    b1 = np.round(np.asarray(bfactors), 2)
    b2 = np.round(model_data["plddt"], 2)
    d_bf = b1 - b2
    if any(d_bf):
        nz_idx = np.nonzero(d_bf)[0]
        len_nz_idx = len(nz_idx)
        nz_idx = nz_idx[:5]
        print(protein, len_nz_idx, nz_idx + 1, d_bf[nz_idx], b1[nz_idx], b2[nz_idx])
A137R 137 [1 2 3 4 5] [-15.18 -6.81 -4.82 -12.5 -3.61] [27.34 27.25 34.49 28.27 31.02] [42.52 34.06 39.31 40.77 34.63]
A151R 151 [1 2 3 4 5] [-1.57 -5.29 -3.63 -4.76 -6.1 ] [34.45 31.08 34.97 41.78 52.73] [36.02 36.37 38.6 46.54 58.83]
def _check_subset(s1, s2):
    # check if s2 is uniquely contained in s1
    # (and if so, returns values for seq_db_align_begin & seq_db_align_end)
    if s1.count(s2) == 1:
        align_begin = s1.find(s2) + 1
        align_end = align_begin + len(s2) - 1
        return align_begin, align_end
    else:
        return None
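# Quick illustration (not in the original notebook) of the 1-based, inclusive
# coordinates returned by _check_subset(), matching how seq_db_align_begin /
# seq_db_align_end are used below:
print(_check_subset("ABCDEFG", "CDE"))  # (3, 5): unique match at positions 3-5
print(_check_subset("ABABAB", "AB"))    # None: occurs more than once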
# check shared ones
for protein, pdb_ext in sorted(pdb_file_split):
    if protein not in metadata.index:
        print("SKIPPING", protein)
        continue
    row = metadata.loc[protein]
    pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
    ent = io.LoadPDB(pdb_path)
    assert ent.chain_count == 1
    sqe = _get_sequence(ent.chains[0])
    s_ncbi = _get_ncbi_sequence(row.NCBI_Accession)
    up_data = _fetch_upkb_entry(row.UniProt_ID)
    if up_data["up_sequence"] != str(s_ncbi):
        print(protein, "inconsistent UP/NCBI sequences", up_data["up_sequence"], str(s_ncbi))
    if up_data["up_sequence"] != sqe:
        tst = _check_subset(up_data["up_sequence"], sqe)
        if tst:
            print(protein, "PDB seq. is subset of UP", tst)
        else:
            print(protein, "inconsistent UP/PDB sequences", up_data["up_sequence"], sqe)
    if str(s_ncbi) != sqe:
        tst = _check_subset(str(s_ncbi), sqe)
        if tst:
            print(protein, "PDB seq. is subset of NCBI", tst)
        else:
            print(protein, "inconsistent NCBI/PDB sequences", str(s_ncbi), sqe)
CP2475L_p14 PDB seq. is subset of UP (369, 522)
CP2475L_p14 PDB seq. is subset of NCBI (369, 522)
CP2475L_p150 PDB seq. is subset of UP (894, 2476)
CP2475L_p150 PDB seq. is subset of NCBI (894, 2476)
CP2475L_p34 PDB seq. is subset of UP (45, 368)
CP2475L_p34 PDB seq. is subset of NCBI (45, 368)
CP2475L_p37 PDB seq. is subset of UP (523, 893)
CP2475L_p37 PDB seq. is subset of NCBI (523, 893)
CP2475L_p5 PDB seq. is subset of UP (2, 39)
CP2475L_p5 PDB seq. is subset of NCBI (2, 39)
# check QP509L (take pLDDT from unrelaxed)
ent_unr = io.LoadPDB("./OneDrive-Part/QP509L-unrelaxed.pdb")
ent_rel = io.LoadPDB("./OneDrive-Part/AlphaFoldSimple/QP509L.pdb")
# NOTE: ent_rel has hydrogens and no b_factors and no chain name!
print(set(a.occupancy for a in ent_rel.atoms), set(a.b_factor for a in ent_rel.atoms))
ev_rel = ent_rel.Select("ele!=H")
ev_atoms = set(a.qualified_name for a in ev_rel.atoms)
eu_atoms = set(a.qualified_name[2:] for a in ent_unr.atoms)
print("IN UNRELAXED:", sorted(eu_atoms - ev_atoms))
print("IN RELAXED:", sorted(ev_atoms - eu_atoms))
{1.0} {0.0}
IN UNRELAXED: ['ILE10.CD1', 'ILE108.CD1', 'ILE12.CD1', 'ILE132.CD1', 'ILE134.CD1', 'ILE136.CD1', 'ILE137.CD1', 'ILE147.CD1', 'ILE153.CD1', 'ILE155.CD1', 'ILE158.CD1', 'ILE191.CD1', 'ILE194.CD1', 'ILE195.CD1', 'ILE228.CD1', 'ILE232.CD1', 'ILE237.CD1', 'ILE254.CD1', 'ILE255.CD1', 'ILE256.CD1', 'ILE279.CD1', 'ILE293.CD1', 'ILE300.CD1', 'ILE303.CD1', 'ILE313.CD1', 'ILE342.CD1', 'ILE343.CD1', 'ILE344.CD1', 'ILE354.CD1', 'ILE368.CD1', 'ILE369.CD1', 'ILE393.CD1', 'ILE394.CD1', 'ILE4.CD1', 'ILE405.CD1', 'ILE413.CD1', 'ILE414.CD1', 'ILE426.CD1', 'ILE430.CD1', 'ILE442.CD1', 'ILE446.CD1', 'ILE46.CD1', 'ILE463.CD1', 'ILE471.CD1', 'ILE48.CD1', 'ILE489.CD1', 'ILE5.CD1', 'ILE61.CD1', 'ILE65.CD1', 'LYS509.O']
IN RELAXED: ['ILE10.CD', 'ILE108.CD', 'ILE12.CD', 'ILE132.CD', 'ILE134.CD', 'ILE136.CD', 'ILE137.CD', 'ILE147.CD', 'ILE153.CD', 'ILE155.CD', 'ILE158.CD', 'ILE191.CD', 'ILE194.CD', 'ILE195.CD', 'ILE228.CD', 'ILE232.CD', 'ILE237.CD', 'ILE254.CD', 'ILE255.CD', 'ILE256.CD', 'ILE279.CD', 'ILE293.CD', 'ILE300.CD', 'ILE303.CD', 'ILE313.CD', 'ILE342.CD', 'ILE343.CD', 'ILE344.CD', 'ILE354.CD', 'ILE368.CD', 'ILE369.CD', 'ILE393.CD', 'ILE394.CD', 'ILE4.CD', 'ILE405.CD', 'ILE413.CD', 'ILE414.CD', 'ILE426.CD', 'ILE430.CD', 'ILE442.CD', 'ILE446.CD', 'ILE46.CD', 'ILE463.CD', 'ILE471.CD', 'ILE48.CD', 'ILE489.CD', 'ILE5.CD', 'ILE61.CD', 'ILE65.CD', 'LYS509.O1', 'LYS509.O2']
# can we fix it internally?
import ost
from ost import testutils, conop
from ost.mol import mm
# setup conop
testutils.SetDefaultCompoundLib()
processor = conop.RuleBasedProcessor(conop.GetDefaultLib())
# map with MM function
mm.MMModeller.AssignPDBNaming(ent_rel)
# check processing
ost.PushVerbosityLevel(2)
processor.Process(ent_rel)
ost.PopVerbosityLevel()
residue LYS509 contains unknown atom O2
residue LYS509 contains unknown atom O1
residue LYS509 contains unknown atom HE1
residue LYS509 contains unknown atom HD1
residue LYS509 contains unknown atom HG1
residue LYS509 contains unknown atom HB1
import xml.dom.minidom
def _get_ncbi_info(ncbi_ac):
    """Fetch dict with info from NCBI web service."""
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
    rspns = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
                         f"esummary.fcgi?db=protein&id={ncbi_ac}")
    dom = xml.dom.minidom.parseString(rspns.text)
    docsums = dom.getElementsByTagName("DocSum")
    assert len(docsums) == 1
    docsum = docsums[0]
    ncbi_dict = {}
    for cn in docsum.childNodes:
        if cn.nodeName == "Item":
            cn_name = cn.getAttribute("Name")
            cn_type = cn.getAttribute("Type")
            if cn.childNodes:
                d = cn.childNodes[0].data
                if cn_type == "String":
                    ncbi_dict[cn_name] = d
                elif cn_type == "Integer":
                    ncbi_dict[cn_name] = int(d)
                else:
                    raise RuntimeError(f"Unknown type {cn_type} for {ncbi_ac}")
            else:
                ncbi_dict[cn_name] = None
    return ncbi_dict
# fetch some extra info from NCBI
for idx, row in metadata.iterrows():
    ncbi_info = _get_ncbi_info(row.NCBI_Accession)
    # Gi is a numerical sequence identifier used internally by NCBI
    metadata.loc[idx, "NCBI_Gi"] = str(ncbi_info["Gi"])
    # UpdateDate is to be stored as the version date in ModelCIF
    metadata.loc[idx, "NCBI_UpdateDate"] = ncbi_info["UpdateDate"]
    # TaxId should be the same as the one from UP
    metadata.loc[idx, "NCBI_TaxId"] = str(ncbi_info["TaxId"])
    # Status expected to be live
    if ncbi_info["Status"] != "live":
        print(idx, row.NCBI_Accession, "Status", ncbi_info["Status"])
    # ReplacedBy expected to be empty
    if ncbi_info["ReplacedBy"]:
        print(idx, row.NCBI_Accession, "ReplacedBy", ncbi_info["ReplacedBy"])
    # AccessionVersion expected to be NCBI_Accession
    if ncbi_info["AccessionVersion"] != row.NCBI_Accession:
        print(idx, row.NCBI_Accession, "AccessionVersion", ncbi_info["AccessionVersion"])
metadata
Protein | Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes | NCBI_Gi | NCBI_UpdateDate | NCBI_TaxId
---|---|---|---|---|---|---|---|---|---|---
285L | 285L.pdb | CAD2068351.1 | A0A485PQD1 | ASFV-G 285L | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN | 1886136876 | 2020/08/05 | 10497 |
A104R | A104R.pdb | CAD2068395.1 | A0A0A1E0L7 | ASFV-G A104R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN | 1886136920 | 2020/08/05 | 10497 |
A118R | A118R.pdb | CAD2068397.1 | A0A2X0RVA9 | ASFV-G A118R | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN | 1886136922 | 2020/08/05 | 10497 |
A137R | A137R.pdb | CAD2068404.1 | A0A2X0THQ0 | ASFV-G A137R | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN | 1886136929 | 2020/08/05 | 10497 |
A151R | A151R.pdb | CAD2068398.1 | A0A2X0TC55 | ASFV-G A151R | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN | 1886136923 | 2020/08/05 | 10497 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
QP509L | QP509L.pdb | CAD2068484.1 | A0A2X0THX2 | ASFV-G QP509L | This model was predicted using AlphaFold2 | NaN | NaN | 1886137009 | 2020/08/05 | 10497 |
R298L | R298L.pdb | CAD2068482.1 | A0A2X0SE42 | ASFV-G R298L | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN | 1886137007 | 2020/08/05 | 10497 |
S183L | S183L.pdb | CAD2068472.1 | A0A2X0SE34 | ASFV-G S183L | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN | 1886136997 | 2020/08/05 | 10497 |
S273R | S273R.pdb | CAD2068473.1 | A0A2X0TKM5 | ASFV-G S273R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN | 1886136998 | 2020/08/05 | 10497 |
X69R | X69R.pdb | CAD2068372.1 | A0A2X0TKC7 | ASFV-G X69R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN | 1886136897 | 2020/08/05 | 10497 |
197 rows × 10 columns
# NOTE: only 193 unique NCBI accessions/Gi numbers for 197 rows because the
# five CP2475L_* fragments share the same accession (CAD2068454.1)
len(set(metadata.NCBI_Accession)), set(metadata.NCBI_TaxId), \
    len(set(metadata.NCBI_Gi)), set(metadata.NCBI_UpdateDate)
(193, {'10497'}, 193, {'2020/08/05'})
up_data
{'up_organism': 'African swine fever virus (ASFV)', 'up_sequence': 'MMNIIKIRGKIFFAVLLEEDISLNTLSPNAVIRKI', 'up_ac': 'A0A485PZB7', 'up_id': 'A0A485PZB7_ASF', 'up_last_mod': datetime.datetime(2019, 6, 5, 0, 0), 'up_gn': 'hypthetical', 'up_ncbi_taxid': '10497', 'up_seqlen': 35, 'up_crc64': '5CA27C9509292409', 'up_isoform': None}