import requests
import datetime
import os
import pandas as pd
import xml.dom.minidom
from ost import io
# from Tara code
def _get_sequence(chn):
"""Get the sequence out of an OST chain."""
# initialise
lst_rn = chn.residues[0].number.num
idx = 1
sqe = chn.residues[0].one_letter_code
if lst_rn != 1:
sqe = "-"
idx = 0
for res in chn.residues[idx:]:
lst_rn += 1
while lst_rn != res.number.num:
sqe += "-"
lst_rn += 1
sqe += res.one_letter_code
return sqe
def _check_sequence(up_ac, sequence):
"""Verify sequence to only contain standard olc."""
for res in sequence:
if res not in "ACDEFGHIKLMNPQRSTVWY":
raise RuntimeError(
"Non-standard aa found in UniProtKB sequence "
+ f"for entry '{up_ac}': {res}"
)
def _fetch_upkb_entry(up_ac):
"""Fetch data for an UniProtKB entry."""
# This is a simple parser for UniProtKB txt format, instead of breaking it up
# into multiple functions, we just allow many many branches & statements,
# here.
# pylint: disable=too-many-branches,too-many-statements
data = {}
data["up_organism"] = ""
data["up_sequence"] = ""
data["up_ac"] = up_ac
rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt")
for line in rspns.iter_lines(decode_unicode=True):
if line.startswith("ID "):
sline = line.split()
if len(sline) != 5:
_abort_msg(f"Unusual UniProtKB ID line found:\n'{line}'")
data["up_id"] = sline[1]
elif line.startswith("OX NCBI_TaxID="):
# Following strictly the UniProtKB format: 'OX NCBI_TaxID=<ID>;'
data["up_ncbi_taxid"] = line[len("OX NCBI_TaxID=") : -1]
data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip()
elif line.startswith("OS "):
if line[-1] == ".":
data["up_organism"] += line[len("OS ") : -1]
else:
data["up_organism"] += line[len("OS ") : -1] + " "
elif line.startswith("SQ "):
sline = line.split()
if len(sline) != 8:
_abort_msg(f"Unusual UniProtKB SQ line found:\n'{line}'")
data["up_seqlen"] = int(sline[2])
data["up_crc64"] = sline[6]
elif line.startswith(" "):
sline = line.split()
if len(sline) > 6:
_abort_msg(
"Unusual UniProtKB sequence data line "
+ f"found:\n'{line}'"
)
data["up_sequence"] += "".join(sline)
elif line.startswith("RP "):
if "ISOFORM" in line.upper():
RuntimeError(
f"First ISOFORM found for '{up_ac}', needs " + "handling."
)
elif line.startswith("DT "):
# 2012-10-03
dt_flds = line[len("DT ") :].split(", ")
if dt_flds[1].upper().startswith("SEQUENCE VERSION "):
data["up_last_mod"] = datetime.datetime.strptime(
dt_flds[0], "%d-%b-%Y"
)
elif line.startswith("GN Name="):
data["up_gn"] = line[len("GN Name=") :].split(";")[0]
data["up_gn"] = data["up_gn"].split("{")[0].strip()
# we have not seen isoforms in the data set, yet, so we just set them to '.'
data["up_isoform"] = None
if "up_gn" not in data:
_abort_msg(f"No gene name found for UniProtKB entry '{up_ac}'.")
if "up_last_mod" not in data:
_abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.")
if "up_crc64" not in data:
_abort_msg(f"No CRC64 value found for UniProtKB entry '{up_ac}'.")
if len(data["up_sequence"]) == 0:
_abort_msg(f"No sequence found for UniProtKB entry '{up_ac}'.")
# check that sequence length and CRC64 is correct
if data["up_seqlen"] != len(data["up_sequence"]):
_abort_msg(
"Sequence length of SQ line and sequence data differ for "
+ f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
+ f"{len(data['up_sequence'])}"
)
_check_sequence(data["up_ac"], data["up_sequence"])
if "up_id" not in data:
_abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.")
if "up_ncbi_taxid" not in data:
_abort_msg(f"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.")
if len(data["up_organism"]) == 0:
_abort_msg(f"No organism species found for UniProtKB entry '{up_ac}'.")
return data
def _get_upkb_for_sequence(sqe, up_ac):
"""Get UniProtKB entry data for given sequence."""
up_data = _fetch_upkb_entry(up_ac)
if sqe != up_data["up_sequence"]:
raise RuntimeError(
f"Sequences not equal from file: {sqe}, from UniProtKB: "
+ f"{up_data['up_sequence']}"
)
return up_data
def _get_ncbi_sequence(ncbi_ac):
"""Fetch OST sequence object from NCBI web service."""
# src: https://www.ncbi.nlm.nih.gov/books/NBK25500/#_chapter1_Downloading_Full_Records_
rspns = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" \
f"efetch.fcgi?db=protein&id={ncbi_ac}" \
f"&rettype=fasta&retmode=text")
return io.SequenceFromString(rspns.text, "fasta")
# s = _get_ncbi_sequence("CAD2068351.1")
# up_data = _fetch_upkb_entry("A0A485PQD1")
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data
# check USDA data
metadata_file = "./InputFiles/ASFV-G_proteome_accessions.csv"
pdb_dir = "./InputFiles/AlphaFold-RENAME"
metadata = pd.read_csv(metadata_file)
assert len(set(metadata.Protein)) == metadata.shape[0]
assert len(set(metadata["Associated PDB"])) == metadata.shape[0]
metadata
Protein | Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes | |
---|---|---|---|---|---|---|---|---|
0 | 285L | 285L.pdb | CAD2068351.1 | A0A485PQD1 | ASFV-G 285L | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN |
1 | A104R | A104R.pdb | CAD2068395.1 | A0A0A1E0L7 | ASFV-G A104R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN |
2 | A118R | A118R.pdb | CAD2068397.1 | A0A2X0RVA9 | ASFV-G A118R | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN |
3 | A137R | A137R.pdb | CAD2068404.1 | A0A2X0THQ0 | ASFV-G A137R | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN |
4 | A151R | A151R.pdb | CAD2068398.1 | A0A2X0TC55 | ASFV-G A151R | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... |
192 | QP509L | QP509L.pdb | CAD2068484.1 | A0A2X0THX2 | ASFV-G QP509L | This model was predicted using AlphaFold2 | NaN | NaN |
193 | R298L | R298L.pdb | CAD2068482.1 | A0A2X0SE42 | ASFV-G R298L | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN |
194 | S183L | S183L.pdb | CAD2068472.1 | A0A2X0SE34 | ASFV-G S183L | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN |
195 | S273R | S273R.pdb | CAD2068473.1 | A0A2X0TKM5 | ASFV-G S273R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN |
196 | X69R | X69R.pdb | CAD2068372.1 | A0A2X0TKC7 | ASFV-G X69R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN |
197 rows × 8 columns
pdb_files = [f for f in sorted(os.listdir(pdb_dir)) if f.endswith(".pdb")]
# check names
pdb_file_split = [os.path.splitext(f) for f in pdb_files]
# CHECK THAT PDB FILES MATCH EXISTING ONES
# -> extra QP509L-unrelaxed expected
tstp = set(pdb_files)
tstm = set(metadata["Associated PDB"])
print("ONLY AS PDB:", sorted(tstp - tstm))
print("ONLY IN METADATA:", sorted(tstm - tstp))
ONLY AS PDB: ['QP509L-unrelaxed.pdb'] ONLY IN METADATA: []
# CHECK THAT FILE NAMES MATCH PROTEIN NAMES
tstp = set(fs[0] for fs in pdb_file_split)
tstm = set(metadata.Protein)
print("ONLY AS PDB:", sorted(tstp - tstm))
print("ONLY IN METADATA:", sorted(tstm - tstp))
ONLY AS PDB: ['QP509L-unrelaxed'] ONLY IN METADATA: []
# can use either Protein or PDB name as index
metadata = metadata.set_index("Protein")
# NOTE: stupid space there...
metadata[metadata["_struct.title "].isna()]
Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes | |
---|---|---|---|---|---|---|---|
Protein |
metadata[metadata["_struct.pdbx_model_detail"].isna()]
Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes | |
---|---|---|---|---|---|---|---|
Protein |
metadata[~metadata.notes.isna()]
Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes | |
---|---|---|---|---|---|---|---|
Protein | |||||||
CP2475L_p14 | CP2475L_p14.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p14 | This model was predicted using AlphaFold2 | model_4_pred_0 | protein p14 from the pp220 polyprotein encoded... |
CP2475L_p34 | CP2475L_p34.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p34 | This model was predicted using AlphaFold2 | model_4_pred_0 | protein p34 from the pp220 polyprotein encoded... |
CP2475L_p37 | CP2475L_p37.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p37 | This model was predicted using AlphaFold2 | model_4_pred_0 | protein p37 from the pp220 polyprotein encoded... |
CP2475L_p150 | CP2475L_p150.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p150 | This model was predicted using AlphaFold2 | model_1_pred_0 | protein p150 from the pp220 polyprotein encode... |
CP2475L_p5 | CP2475L_p5.pdb | CAD2068454.1 | A0A2X0THU5 | ASFV-G CP2475L p5 | This model was predicted using AlphaFold2 | model_3_pred_0 | protein p5 from the pp220 polyprotein encoded ... |
D250R | D250R.pdb | CAD2068464.1 | A0A2X0THV3 | ASFV-G D250R | This model was predicted using AlphaFold2 | model_3_pred_0 | Mislabled on NCBI and Uniport as D205R |
DP79L | DP79L.pdb | CAD2068466.1 | A0A0A1E158 | ASFV-G DP79L | This model was predicted using AlphaFold2 | model_3_pred_0 | Mislabled on Uniprot as D79L |
hypothetical_01 | hypothetical_01.pdb | CAD2068367.1 | A0A485PU43 | ASFV-G hypothetical_01 | This model was predicted using AlphaFold2 | model_5_pred_0 | labeled as hypothetical on NCBI and Uniprot |
hypothetical_02 | hypothetical_02.pdb | CAD2068400.1 | A0A485PQI3 | ASFV-G hypothetical_02 | This model was predicted using AlphaFold2 | model_3_pred_0 | labeled as hypthetical on NCBI and Uniprot |
hypothetical_03 | hypothetical_03.pdb | CAD2068512.1 | A0A485PZB7 | ASFV-G hypothetical_03 | This model was predicted using AlphaFold2 | model_3_pred_0 | labeled as hypthetical on NCBI and Uniprot |
for mdl_notes in metadata[~metadata.notes.isna()].notes:
mdl_notes = mdl_notes.replace("hypthetical", "hypothetical") \
.replace("Uniport", "UniProt") \
.replace("Uniprot", "UniProt") \
.replace("Mislabled", "mislabeled")
print(mdl_notes)
protein p14 from the pp220 polyprotein encoded by CP2475L protein p34 from the pp220 polyprotein encoded by CP2475L protein p37 from the pp220 polyprotein encoded by CP2475L protein p150 from the pp220 polyprotein encoded by CP2475L protein p5 from the pp220 polyprotein encoded by CP2475L mislabeled on NCBI and UniProt as D205R mislabeled on UniProt as D79L labeled as hypothetical on NCBI and UniProt labeled as hypothetical on NCBI and UniProt labeled as hypothetical on NCBI and UniProt
mdl_notes = 'labeled as hypthetical on NCBI and Uniprot'
'labeled as hypothetical on NCBI and Uniprot'
# tst = metadata.loc["hypothetical_03"]
# s = _get_ncbi_sequence(tst.NCBI_Accession)
# up_data = _fetch_upkb_entry(tst.UniProt_ID)
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data
# checked all the one above manually and ok as stated (best to add to model_detail!)
# special one...
metadata.loc["QP509L"]
Associated PDB QP509L.pdb NCBI_Accession CAD2068484.1 UniProt_ID A0A2X0THX2 _struct.title ASFV-G QP509L _struct.pdbx_model_detail This model was predicted using AlphaFold2 ranking debugg model ID NaN notes NaN NCBI_Gi 1886137009 NCBI_UpdateDate 2020/08/05 NCBI_TaxId 10497 Name: QP509L, dtype: object
set(metadata["_struct.pdbx_model_detail"])
{'This model was predicted using AlphaFold2'}
len(set(metadata["_struct.title "]))
197
# check model numbers
tst = metadata["ranking debugg model ID"]
for idx, mdl_id in tst.items():
mdl_num = None
if type(mdl_id) == str:
mdl_id_split = mdl_id.split('_')
if len(mdl_id_split) == 4:
mdl_num = int(mdl_id_split[1])
if not mdl_num:
print(idx, mdl_id)
elif mdl_num not in range(1, 6):
print(idx, mdl_id, mdl_num)
QP509L nan
def _check_subset(s1, s2):
# check if s2 is uniquely contained in s1
# (and if so, returns values for seq_db_align_begin & seq_db_align_end)
if s1.count(s2) == 1:
align_begin = s1.find(s2) + 1
align_end = align_begin + len(s2) - 1
return align_begin, align_end
else:
return None
# check shared ones
for protein, pdb_ext in sorted(pdb_file_split):
if protein not in metadata.index:
print("SKIPPING", protein)
continue
else:
row = metadata.loc[protein]
pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
ent = io.LoadPDB(pdb_path)
assert ent.chain_count == 1
sqe = _get_sequence(ent.chains[0])
s_ncbi = _get_ncbi_sequence(row.NCBI_Accession)
up_data = _fetch_upkb_entry(row.UniProt_ID)
if up_data["up_sequence"] != str(s_ncbi):
print(protein, "inconsistent UP/NCBI sequences", up_data["up_sequence"], str(s_ncbi))
if up_data["up_sequence"] != sqe:
tst = _check_subset(up_data["up_sequence"], sqe)
if tst:
print(protein, "PDB seq. is subset of UP", tst)
else:
print(protein, "inconsistent UP/PDB sequences", up_data["up_sequence"], sqe)
if str(s_ncbi) != sqe:
tst = _check_subset(str(s_ncbi), sqe)
if tst:
print(protein, "PDB seq. is subset of NCBI", tst)
else:
print(protein, "inconsistent NCBI/PDB sequences", str(s_ncbi), sqe)
CP2475L_p14 PDB seq. is subset of UP (369, 522) CP2475L_p14 PDB seq. is subset of NCBI (369, 522) CP2475L_p150 PDB seq. is subset of UP (894, 2476) CP2475L_p150 PDB seq. is subset of NCBI (894, 2476) CP2475L_p34 PDB seq. is subset of UP (45, 368) CP2475L_p34 PDB seq. is subset of NCBI (45, 368) CP2475L_p37 PDB seq. is subset of UP (523, 893) CP2475L_p37 PDB seq. is subset of NCBI (523, 893) CP2475L_p5 PDB seq. is subset of UP (2, 39) CP2475L_p5 PDB seq. is subset of NCBI (2, 39)
# check QP509L (take pLDDT from unrelaxed)
ent_unr = io.LoadPDB("./InputFiles/AlphaFold-RENAME/QP509L-unrelaxed.pdb")
ent_rel = io.LoadPDB("./InputFiles/AlphaFold-RENAME/QP509L.pdb")
print(set(a.occupancy for a in ent_rel.atoms), set(a.b_factor for a in ent_rel.atoms))
ev_atoms = set(a.qualified_name for a in ent_rel.atoms)
eu_atoms = set(a.qualified_name for a in ent_unr.atoms)
print("IN UNRELAXED:", sorted(eu_atoms - ev_atoms))
print("IN RELAXED:", sorted(ev_atoms - eu_atoms))
{1.0} {0.0} IN UNRELAXED: [] IN RELAXED: []
def _get_ncbi_info(ncbi_ac):
"""Fetch dict with info from NCBI web service."""
# src: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
rspns = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" \
f"esummary.fcgi?db=protein&id={ncbi_ac}")
dom = xml.dom.minidom.parseString(rspns.text)
docsums = dom.getElementsByTagName("DocSum")
assert len(docsums) == 1
docsum = docsums[0]
ncbi_dict = {}
for cn in docsum.childNodes:
if cn.nodeName == "Item":
cn_name = cn.getAttribute("Name")
cn_type = cn.getAttribute("Type")
if cn.childNodes:
d = cn.childNodes[0].data
if cn_type == "String":
ncbi_dict[cn_name] = d
elif cn_type == "Integer":
ncbi_dict[cn_name] = int(d)
else:
raise RuntimeError(f"Unknown type {cn_type} for {ncbi_ac}")
else:
ncbi_dict[cn_name] = None
return ncbi_dict
# fetch some extra info from NCBI
for idx, row in metadata.iterrows():
ncbi_info = _get_ncbi_info(row.NCBI_Accession)
# Gi is some numerical sequence identifier used internally by NCBI
metadata.loc[idx, "NCBI_Gi"] = str(ncbi_info["Gi"])
# UpdateData is to be stored as the version date in ModelCIF
metadata.loc[idx, "NCBI_UpdateDate"] = ncbi_info["UpdateDate"]
# TaxId should be same as one from UP
metadata.loc[idx, "NCBI_TaxId"] = str(ncbi_info["TaxId"])
# Status expected to be live
if ncbi_info["Status"] != "live":
print(idx, row.NCBI_Accession, "Status", ncbi_info["Status"])
# ReplacedBy expected to be empty
if ncbi_info["ReplacedBy"]:
print(idx, row.NCBI_Accession, "ReplacedBy", ncbi_info["ReplacedBy"])
# AccessionVersion expected to be NCBI_Accession
if ncbi_info["AccessionVersion"] != row.NCBI_Accession:
print(idx, row.NCBI_Accession, "AccessionVersion", ncbi_info["AccessionVersion"])
metadata
Associated PDB | NCBI_Accession | UniProt_ID | _struct.title | _struct.pdbx_model_detail | ranking debugg model ID | notes | NCBI_Gi | NCBI_UpdateDate | NCBI_TaxId | |
---|---|---|---|---|---|---|---|---|---|---|
Protein | ||||||||||
285L | 285L.pdb | CAD2068351.1 | A0A485PQD1 | ASFV-G 285L | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN | 1886136876 | 2020/08/05 | 10497 |
A104R | A104R.pdb | CAD2068395.1 | A0A0A1E0L7 | ASFV-G A104R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN | 1886136920 | 2020/08/05 | 10497 |
A118R | A118R.pdb | CAD2068397.1 | A0A2X0RVA9 | ASFV-G A118R | This model was predicted using AlphaFold2 | model_1_pred_0 | NaN | 1886136922 | 2020/08/05 | 10497 |
A137R | A137R.pdb | CAD2068404.1 | A0A2X0THQ0 | ASFV-G A137R | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN | 1886136929 | 2020/08/05 | 10497 |
A151R | A151R.pdb | CAD2068398.1 | A0A2X0TC55 | ASFV-G A151R | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN | 1886136923 | 2020/08/05 | 10497 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
QP509L | QP509L.pdb | CAD2068484.1 | A0A2X0THX2 | ASFV-G QP509L | This model was predicted using AlphaFold2 | NaN | NaN | 1886137009 | 2020/08/05 | 10497 |
R298L | R298L.pdb | CAD2068482.1 | A0A2X0SE42 | ASFV-G R298L | This model was predicted using AlphaFold2 | model_3_pred_0 | NaN | 1886137007 | 2020/08/05 | 10497 |
S183L | S183L.pdb | CAD2068472.1 | A0A2X0SE34 | ASFV-G S183L | This model was predicted using AlphaFold2 | model_4_pred_0 | NaN | 1886136997 | 2020/08/05 | 10497 |
S273R | S273R.pdb | CAD2068473.1 | A0A2X0TKM5 | ASFV-G S273R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN | 1886136998 | 2020/08/05 | 10497 |
X69R | X69R.pdb | CAD2068372.1 | A0A2X0TKC7 | ASFV-G X69R | This model was predicted using AlphaFold2 | model_2_pred_0 | NaN | 1886136897 | 2020/08/05 | 10497 |
197 rows × 10 columns
len(set(metadata.NCBI_Accession)), set(metadata.NCBI_TaxId), \
len(set(metadata.NCBI_Gi)), set(metadata.NCBI_UpdateDate)
(193, {'10497'}, 193, {'2020/08/05'})
# all matching tax IDs?
for protein, upac in metadata.UniProt_ID.items():
up_info = _fetch_upkb_entry(upac)
if up_info["up_ncbi_taxid"] != "10497":
print(protein, up_info)
# check PDB files
import ost
from ost import testutils, conop
# setup conop
testutils.SetDefaultCompoundLib()
io.profiles['DEFAULT'].processor = conop.RuleBasedProcessor(conop.GetDefaultLib())
# check processing
ost.PushVerbosityLevel(2)
for protein, pdb_ext in sorted(pdb_file_split):
pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
ent = io.LoadPDB(pdb_path)
ost.PopVerbosityLevel()
# NOTE: lack of output means that all atom names are ok
up_info
{'up_organism': 'African swine fever virus (ASFV)', 'up_sequence': 'MLLYIVIIVACIISKLVPNEYWAIHLFFIIMIFMVYMYEKLDIHQKSQFWNYTMSGLSGHNVQVTCKCY', 'up_ac': 'A0A2X0TKC7', 'up_id': 'A0A2X0TKC7_ASF', 'up_last_mod': datetime.datetime(2018, 9, 12, 0, 0), 'up_gn': 'X69R CDS', 'up_ncbi_taxid': '10497', 'up_seqlen': 69, 'up_crc64': '3B92E4DB323A7A74', 'up_isoform': None}
ncbi_info
{'Caption': 'CAD2068372', 'Title': 'X69R CDS [African swine fever virus]', 'Extra': 'gi|1886136897|emb|CAD2068372.1|[1886136897]', 'Gi': 1886136897, 'CreateDate': '2010/08/18', 'UpdateDate': '2020/08/05', 'Flags': 0, 'TaxId': 10497, 'Length': 69, 'Status': 'live', 'ReplacedBy': None, 'Comment': ' ', 'AccessionVersion': 'CAD2068372.1'}
# check files that were converted (input is std out from run cut to only include "translating...")
log_lines = open("./script_out.txt").readlines()
# plenty of assertions in here should also catch any errors...
idx = 1
timings = dict()
while idx < len(log_lines) - 1:
l = log_lines[idx].strip()
if "already done..." in l:
idx += 1
continue
assert l.startswith("translating")
mdl_title = l[len("translating"):-3].strip()
if mdl_title in timings:
print("WEIRD", l)
l = log_lines[idx + 1].strip()
assert l.startswith("preparing data")
assert l.endswith("s)")
t_prep = float(l.split()[-1][1:-2])
l = log_lines[idx + 2].strip()
assert l.startswith("generating ModelCIF objects")
assert l.endswith("s)")
t_cif = float(l.split()[-1][1:-2])
l = log_lines[idx + 3].strip()
assert l.startswith("processing QA scores")
assert l.endswith("s)")
t_qa = float(l.split()[-1][1:-2])
l = log_lines[idx + 4].strip()
assert l.startswith("write to disk")
assert l.endswith("s)")
t_write = float(l.split()[-1][1:-2])
l = log_lines[idx + 5].strip()
assert l.startswith("... done with")
assert l.endswith("s).")
t_all = float(l.split()[-1][1:-3])
timings[mdl_title] = {
"t_prep": t_prep,
"t_cif": t_cif,
"t_qa": t_qa,
"t_write": t_write,
"t_all": t_all
}
idx += 6
print(f"DONE {len(timings)} models")
mdl_titles = set(metadata.index)
assert len(set(timings) - mdl_titles) == 0
missing_ones = mdl_titles - set(timings)
print(f"MISSING {len(missing_ones)} models")
DONE 197 models MISSING 0 models
k_parts = ["t_prep", "t_cif", "t_qa", "t_write"]
totals = {k: sum(v[k] for v in timings.values()) \
for k in k_parts + ["t_all"]}
print(f"TOTAL TIME SPENT: {round(totals['t_all'], 2)} s")
for k in k_parts:
print(f"-> {k}: {round(100 * totals[k] / totals['t_all'], 1)}%")
TOTAL TIME SPENT: 356.38 s -> t_prep: 80.5% -> t_cif: 1.2% -> t_qa: 0.0% -> t_write: 17.8%