In [1]:
import requests
import datetime
import os
import pandas as pd
import xml.dom.minidom
from ost import io

# from Tara code
def _get_sequence(chn):
    """Get the sequence out of an OST chain."""
    # initialise
    lst_rn = chn.residues[0].number.num
    idx = 1
    sqe = chn.residues[0].one_letter_code
    if lst_rn != 1:
        # chain does not start at residue 1: begin with a gap and restart the
        # counter at 1 so the loop below left-pads the missing positions
        sqe = "-"
        idx = 0
        lst_rn = 1

    for res in chn.residues[idx:]:
        lst_rn += 1
        while lst_rn != res.number.num:
            sqe += "-"
            lst_rn += 1
        sqe += res.one_letter_code

    return sqe
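# Worked example (hypothetical numbering): residues numbered 3, 4 and 6 with
# one-letter codes M, K and L would yield the padded sequence "--MK-L".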

def _check_sequence(up_ac, sequence):
    """Verify sequence to only contain standard olc."""
    for res in sequence:
        if res not in "ACDEFGHIKLMNPQRSTVWY":
            raise RuntimeError(
                "Non-standard aa found in UniProtKB sequence "
                + f"for entry '{up_ac}': {res}"
            )

def _fetch_upkb_entry(up_ac):
    """Fetch data for an UniProtKB entry."""
    # This is a simple parser for UniProtKB txt format, instead of breaking it up
    # into multiple functions, we just allow many many branches & statements,
    # here.
    # pylint: disable=too-many-branches,too-many-statements
    data = {}
    data["up_organism"] = ""
    data["up_sequence"] = ""
    data["up_ac"] = up_ac
    rspns = requests.get(f"https://www.uniprot.org/uniprot/{up_ac}.txt")
    for line in rspns.iter_lines(decode_unicode=True):
        if line.startswith("ID   "):
            sline = line.split()
            if len(sline) != 5:
                _abort_msg(f"Unusual UniProtKB ID line found:\n'{line}'")
            data["up_id"] = sline[1]
        elif line.startswith("OX   NCBI_TaxID="):
            # Following strictly the UniProtKB format: 'OX   NCBI_TaxID=<ID>;'
            data["up_ncbi_taxid"] = line[len("OX   NCBI_TaxID=") : -1]
            data["up_ncbi_taxid"] = data["up_ncbi_taxid"].split("{")[0].strip()
        elif line.startswith("OS   "):
            if line[-1] == ".":
                data["up_organism"] += line[len("OS   ") : -1]
            else:
                data["up_organism"] += line[len("OS   ") : -1] + " "
        elif line.startswith("SQ   "):
            sline = line.split()
            if len(sline) != 8:
                _abort_msg(f"Unusual UniProtKB SQ line found:\n'{line}'")
            data["up_seqlen"] = int(sline[2])
            data["up_crc64"] = sline[6]
        elif line.startswith("     "):
            sline = line.split()
            if len(sline) > 6:
                _abort_msg(
                    "Unusual UniProtKB sequence data line "
                    + f"found:\n'{line}'"
                )
            data["up_sequence"] += "".join(sline)
        elif line.startswith("RP   "):
            if "ISOFORM" in line.upper():
                RuntimeError(
                    f"First ISOFORM found for '{up_ac}', needs " + "handling."
                )
        elif line.startswith("DT   "):
            # 2012-10-03
            dt_flds = line[len("DT   ") :].split(", ")
            if dt_flds[1].upper().startswith("SEQUENCE VERSION "):
                data["up_last_mod"] = datetime.datetime.strptime(
                    dt_flds[0], "%d-%b-%Y"
                )
        elif line.startswith("GN   Name="):
            data["up_gn"] = line[len("GN   Name=") :].split(";")[0]
            data["up_gn"] = data["up_gn"].split("{")[0].strip()

    # we have not seen isoforms in this data set yet, so we just set it to None
    data["up_isoform"] = None

    if "up_gn" not in data:
        _abort_msg(f"No gene name found for UniProtKB entry '{up_ac}'.")
    if "up_last_mod" not in data:
        _abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.")
    if "up_crc64" not in data:
        _abort_msg(f"No CRC64 value found for UniProtKB entry '{up_ac}'.")
    if len(data["up_sequence"]) == 0:
        _abort_msg(f"No sequence found for UniProtKB entry '{up_ac}'.")
    # check that the sequence length from the SQ line matches the sequence data
    if data["up_seqlen"] != len(data["up_sequence"]):
        _abort_msg(
            "Sequence length of SQ line and sequence data differ for "
            + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != "
            + f"{len(data['up_sequence'])}"
        )
    _check_sequence(data["up_ac"], data["up_sequence"])

    if "up_id" not in data:
        _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.")
    if "up_ncbi_taxid" not in data:
        _abort_msg(f"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.")
    if len(data["up_organism"]) == 0:
        _abort_msg(f"No organism species found for UniProtKB entry '{up_ac}'.")

    return data

def _get_upkb_for_sequence(sqe, up_ac):
    """Get UniProtKB entry data for given sequence."""
    up_data = _fetch_upkb_entry(up_ac)
    if sqe != up_data["up_sequence"]:
        raise RuntimeError(
            f"Sequences not equal from file: {sqe}, from UniProtKB: "
            + f"{up_data['up_sequence']}"
        )

    return up_data
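The SQ line also carries a CRC64 checksum, which the parser above stores but never
verifies. A minimal sketch of such a check, assuming Biopython is available (its
Bio.SeqUtils.CheckSum.crc64 implements the SWISS-PROT/UniProt checksum and returns
it prefixed with "CRC-"); the helper name is hypothetical:

from Bio.SeqUtils.CheckSum import crc64  # assumption: Biopython is installed

def _check_crc64(up_data):
    """Cross-check the parsed sequence against the CRC64 from the SQ line."""
    # crc64() returns e.g. 'CRC-3B92E4DB323A7A74'; drop the 'CRC-' prefix
    computed = crc64(up_data["up_sequence"])[len("CRC-"):]
    if computed != up_data["up_crc64"]:
        raise RuntimeError(
            f"CRC64 mismatch for '{up_data['up_ac']}': "
            f"{computed} != {up_data['up_crc64']}"
        )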
In [2]:
def _get_ncbi_sequence(ncbi_ac):
    """Fetch OST sequence object from NCBI web service."""
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25500/#_chapter1_Downloading_Full_Records_
    rspns = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" \
                         f"efetch.fcgi?db=protein&id={ncbi_ac}" \
                         f"&rettype=fasta&retmode=text")
    return io.SequenceFromString(rspns.text, "fasta")
In [3]:
# s = _get_ncbi_sequence("CAD2068351.1")
# up_data = _fetch_upkb_entry("A0A485PQD1")
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data
In [5]:
# check USDA data
metadata_file = "./InputFiles/ASFV-G_proteome_accessions.csv"
pdb_dir = "./InputFiles/AlphaFold-RENAME"
In [6]:
metadata = pd.read_csv(metadata_file)
assert len(set(metadata.Protein)) == metadata.shape[0]
assert len(set(metadata["Associated PDB"])) == metadata.shape[0]
metadata
Out[6]:
Protein Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
0 285L 285L.pdb CAD2068351.1 A0A485PQD1 ASFV-G 285L This model was predicted using AlphaFold2 model_1_pred_0 NaN
1 A104R A104R.pdb CAD2068395.1 A0A0A1E0L7 ASFV-G A104R This model was predicted using AlphaFold2 model_2_pred_0 NaN
2 A118R A118R.pdb CAD2068397.1 A0A2X0RVA9 ASFV-G A118R This model was predicted using AlphaFold2 model_1_pred_0 NaN
3 A137R A137R.pdb CAD2068404.1 A0A2X0THQ0 ASFV-G A137R This model was predicted using AlphaFold2 model_3_pred_0 NaN
4 A151R A151R.pdb CAD2068398.1 A0A2X0TC55 ASFV-G A151R This model was predicted using AlphaFold2 model_4_pred_0 NaN
... ... ... ... ... ... ... ... ...
192 QP509L QP509L.pdb CAD2068484.1 A0A2X0THX2 ASFV-G QP509L This model was predicted using AlphaFold2 NaN NaN
193 R298L R298L.pdb CAD2068482.1 A0A2X0SE42 ASFV-G R298L This model was predicted using AlphaFold2 model_3_pred_0 NaN
194 S183L S183L.pdb CAD2068472.1 A0A2X0SE34 ASFV-G S183L This model was predicted using AlphaFold2 model_4_pred_0 NaN
195 S273R S273R.pdb CAD2068473.1 A0A2X0TKM5 ASFV-G S273R This model was predicted using AlphaFold2 model_2_pred_0 NaN
196 X69R X69R.pdb CAD2068372.1 A0A2X0TKC7 ASFV-G X69R This model was predicted using AlphaFold2 model_2_pred_0 NaN

197 rows × 8 columns

In [66]:
pdb_files = [f for f in sorted(os.listdir(pdb_dir)) if f.endswith(".pdb")]
In [67]:
# check names
pdb_file_split = [os.path.splitext(f) for f in pdb_files]
In [68]:
# CHECK THAT PDB FILES MATCH EXISTING ONES
# -> extra QP509L-unrelaxed expected
tstp = set(pdb_files)
tstm = set(metadata["Associated PDB"])
print("ONLY AS PDB:", sorted(tstp - tstm))
print("ONLY IN METADATA:", sorted(tstm - tstp))
ONLY AS PDB: ['QP509L-unrelaxed.pdb']
ONLY IN METADATA: []
In [70]:
# CHECK THAT FILE NAMES MATCH PROTEIN NAMES
tstp = set(fs[0] for fs in pdb_file_split)
tstm = set(metadata.Protein)
print("ONLY AS PDB:", sorted(tstp - tstm))
print("ONLY IN METADATA:", sorted(tstm - tstp))
ONLY AS PDB: ['QP509L-unrelaxed']
ONLY IN METADATA: []
In [14]:
# can use either Protein or PDB name as index
metadata = metadata.set_index("Protein")
In [15]:
# NOTE: annoying trailing space in the column name...
metadata[metadata["_struct.title "].isna()]
Out[15]:
Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
Protein
In [16]:
metadata[metadata["_struct.pdbx_model_detail"].isna()]
Out[16]:
Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
Protein
In [17]:
metadata[~metadata.notes.isna()]
Out[17]:
Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes
Protein
CP2475L_p14 CP2475L_p14.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p14 This model was predicted using AlphaFold2 model_4_pred_0 protein p14 from the pp220 polyprotein encoded...
CP2475L_p34 CP2475L_p34.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p34 This model was predicted using AlphaFold2 model_4_pred_0 protein p34 from the pp220 polyprotein encoded...
CP2475L_p37 CP2475L_p37.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p37 This model was predicted using AlphaFold2 model_4_pred_0 protein p37 from the pp220 polyprotein encoded...
CP2475L_p150 CP2475L_p150.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p150 This model was predicted using AlphaFold2 model_1_pred_0 protein p150 from the pp220 polyprotein encode...
CP2475L_p5 CP2475L_p5.pdb CAD2068454.1 A0A2X0THU5 ASFV-G CP2475L p5 This model was predicted using AlphaFold2 model_3_pred_0 protein p5 from the pp220 polyprotein encoded ...
D250R D250R.pdb CAD2068464.1 A0A2X0THV3 ASFV-G D250R This model was predicted using AlphaFold2 model_3_pred_0 Mislabled on NCBI and Uniport as D205R
DP79L DP79L.pdb CAD2068466.1 A0A0A1E158 ASFV-G DP79L This model was predicted using AlphaFold2 model_3_pred_0 Mislabled on Uniprot as D79L
hypothetical_01 hypothetical_01.pdb CAD2068367.1 A0A485PU43 ASFV-G hypothetical_01 This model was predicted using AlphaFold2 model_5_pred_0 labeled as hypothetical on NCBI and Uniprot
hypothetical_02 hypothetical_02.pdb CAD2068400.1 A0A485PQI3 ASFV-G hypothetical_02 This model was predicted using AlphaFold2 model_3_pred_0 labeled as hypthetical on NCBI and Uniprot
hypothetical_03 hypothetical_03.pdb CAD2068512.1 A0A485PZB7 ASFV-G hypothetical_03 This model was predicted using AlphaFold2 model_3_pred_0 labeled as hypthetical on NCBI and Uniprot
In [126]:
for mdl_notes in metadata[~metadata.notes.isna()].notes:
    mdl_notes = mdl_notes.replace("hypthetical", "hypothetical") \
                         .replace("Uniport", "UniProt") \
                         .replace("Uniprot", "UniProt") \
                         .replace("Mislabled", "mislabeled")
    print(mdl_notes)
protein p14 from the pp220 polyprotein encoded by CP2475L
protein p34 from the pp220 polyprotein encoded by CP2475L
protein p37 from the pp220 polyprotein encoded by CP2475L
protein p150 from the pp220 polyprotein encoded by CP2475L
protein p5 from the pp220 polyprotein encoded by CP2475L
mislabeled on NCBI and UniProt as D205R
mislabeled on UniProt as D79L
labeled as hypothetical on NCBI and UniProt
labeled as hypothetical on NCBI and UniProt
labeled as hypothetical on NCBI and UniProt
In [121]:
mdl_notes = 'labeled as hypthetical on NCBI and Uniprot'
mdl_notes.replace("hypthetical", "hypothetical")
Out[121]:
'labeled as hypothetical on NCBI and Uniprot'
In [31]:
# tst = metadata.loc["hypothetical_03"]
# s = _get_ncbi_sequence(tst.NCBI_Accession)
# up_data = _fetch_upkb_entry(tst.UniProt_ID)
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data

# checked all the ones above manually; they are OK as stated (best to add to model_detail!)
In [113]:
# special one...
metadata.loc["QP509L"]
Out[113]:
Associated PDB                                              QP509L.pdb
NCBI_Accession                                            CAD2068484.1
UniProt_ID                                                  A0A2X0THX2
_struct.title                                            ASFV-G QP509L
_struct.pdbx_model_detail    This model was predicted using AlphaFold2
ranking debugg model ID                                            NaN
notes                                                              NaN
NCBI_Gi                                                     1886137009
NCBI_UpdateDate                                             2020/08/05
NCBI_TaxId                                                       10497
Name: QP509L, dtype: object
In [33]:
set(metadata["_struct.pdbx_model_detail"])
Out[33]:
{'This model was predicted using AlphaFold2'}
In [34]:
len(set(metadata["_struct.title "]))
Out[34]:
197
In [35]:
# check model numbers
tst = metadata["ranking debugg model ID"]
for idx, mdl_id in tst.items():
    mdl_num = None
    if isinstance(mdl_id, str):
        mdl_id_split = mdl_id.split('_')
        if len(mdl_id_split) == 4:
            mdl_num = int(mdl_id_split[1])
    if not mdl_num:
        print(idx, mdl_id)
    elif mdl_num not in range(1, 6):
        print(idx, mdl_id, mdl_num)
QP509L nan
In [36]:
def _check_subset(s1, s2):
    """Check if s2 is uniquely contained in s1.

    If so, return values for seq_db_align_begin & seq_db_align_end (1-based);
    otherwise return None.
    """
    if s1.count(s2) == 1:
        align_begin = s1.find(s2) + 1
        align_end = align_begin + len(s2) - 1
        return align_begin, align_end
    return None
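A quick illustration of the 1-based coordinates it returns (hypothetical strings):

# 'CDE' occurs exactly once in 'ABCDEF', starting at 1-based position 3
assert _check_subset("ABCDEF", "CDE") == (3, 5)
# 'AB' occurs twice in 'ABAB', so the match is not unique and None is returned
assert _check_subset("ABAB", "AB") is None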
In [37]:
# check shared ones
for protein, pdb_ext in sorted(pdb_file_split):
    if protein not in metadata.index:
        print("SKIPPING", protein)
        continue
    row = metadata.loc[protein]
    pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
    ent = io.LoadPDB(pdb_path)
    assert ent.chain_count == 1
    sqe = _get_sequence(ent.chains[0])
    s_ncbi = _get_ncbi_sequence(row.NCBI_Accession)
    up_data = _fetch_upkb_entry(row.UniProt_ID)
    if up_data["up_sequence"] != str(s_ncbi):
        print(protein, "inconsistent UP/NCBI sequences", up_data["up_sequence"], str(s_ncbi))
    if up_data["up_sequence"] != sqe:
        tst = _check_subset(up_data["up_sequence"], sqe)
        if tst:
            print(protein, "PDB seq. is subset of UP", tst)
        else:
            print(protein, "inconsistent UP/PDB sequences", up_data["up_sequence"], sqe)
    if str(s_ncbi) != sqe:
        tst = _check_subset(str(s_ncbi), sqe)
        if tst:
            print(protein, "PDB seq. is subset of NCBI", tst)
        else:
            print(protein, "inconsistent NCBI/PDB sequences", str(s_ncbi), sqe)
CP2475L_p14 PDB seq. is subset of UP (369, 522)
CP2475L_p14 PDB seq. is subset of NCBI (369, 522)
CP2475L_p150 PDB seq. is subset of UP (894, 2476)
CP2475L_p150 PDB seq. is subset of NCBI (894, 2476)
CP2475L_p34 PDB seq. is subset of UP (45, 368)
CP2475L_p34 PDB seq. is subset of NCBI (45, 368)
CP2475L_p37 PDB seq. is subset of UP (523, 893)
CP2475L_p37 PDB seq. is subset of NCBI (523, 893)
CP2475L_p5 PDB seq. is subset of UP (2, 39)
CP2475L_p5 PDB seq. is subset of NCBI (2, 39)
In [65]:
# check QP509L (take pLDDT from unrelaxed)
ent_unr = io.LoadPDB("./InputFiles/AlphaFold-RENAME/QP509L-unrelaxed.pdb")
ent_rel = io.LoadPDB("./InputFiles/AlphaFold-RENAME/QP509L.pdb")
print(set(a.occupancy for a in ent_rel.atoms), set(a.b_factor for a in ent_rel.atoms))
ev_atoms = set(a.qualified_name for a in ent_rel.atoms)
eu_atoms = set(a.qualified_name for a in ent_unr.atoms)
print("IN UNRELAXED:", sorted(eu_atoms - ev_atoms))
print("IN RELAXED:", sorted(ev_atoms - eu_atoms))
{1.0} {0.0}
IN UNRELAXED: []
IN RELAXED: []
In [43]:
def _get_ncbi_info(ncbi_ac):
    """Fetch dict with info from NCBI web service."""
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
    rspns = requests.get(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" \
                         f"esummary.fcgi?db=protein&id={ncbi_ac}")
    dom = xml.dom.minidom.parseString(rspns.text)
    docsums = dom.getElementsByTagName("DocSum")
    assert len(docsums) == 1
    docsum = docsums[0]
    ncbi_dict = {}
    for cn in docsum.childNodes:
        if cn.nodeName == "Item":
            cn_name = cn.getAttribute("Name")
            cn_type = cn.getAttribute("Type")
            if cn.childNodes:
                d = cn.childNodes[0].data
                if cn_type == "String":
                    ncbi_dict[cn_name] = d
                elif cn_type == "Integer":
                    ncbi_dict[cn_name] = int(d)
                else:
                    raise RuntimeError(f"Unknown type {cn_type} for {ncbi_ac}")
            else:
                ncbi_dict[cn_name] = None
    return ncbi_dict
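For reference, this walks the classic NCBI DocSum layout; a rough sketch of the XML
the code assumes (abridged, values taken from the ESummary result shown further
below) looks like:

<eSummaryResult>
  <DocSum>
    <Id>1886136897</Id>
    <Item Name="Caption" Type="String">CAD2068372</Item>
    <Item Name="TaxId" Type="Integer">10497</Item>
    <Item Name="ReplacedBy" Type="String"></Item>
  </DocSum>
</eSummaryResult>

Only "Item" child nodes are read; empty items (like ReplacedBy above) end up as None.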
In [44]:
# fetch some extra info from NCBI
for idx, row in metadata.iterrows():
    ncbi_info = _get_ncbi_info(row.NCBI_Accession)
    # Gi is some numerical sequence identifier used internally by NCBI
    metadata.loc[idx, "NCBI_Gi"] = str(ncbi_info["Gi"])
    # UpdateDate is to be stored as the version date in ModelCIF
    metadata.loc[idx, "NCBI_UpdateDate"] = ncbi_info["UpdateDate"]
    # TaxId should be the same as the one from UP
    metadata.loc[idx, "NCBI_TaxId"] = str(ncbi_info["TaxId"])
    # Status expected to be live
    if ncbi_info["Status"] != "live":
        print(idx, row.NCBI_Accession, "Status", ncbi_info["Status"])
    # ReplacedBy expected to be empty
    if ncbi_info["ReplacedBy"]:
        print(idx, row.NCBI_Accession, "ReplacedBy", ncbi_info["ReplacedBy"])
    # AccessionVersion expected to be NCBI_Accession
    if ncbi_info["AccessionVersion"] != row.NCBI_Accession:
        print(idx, row.NCBI_Accession, "AccessionVersion", ncbi_info["AccessionVersion"])
In [45]:
metadata
Out[45]:
Associated PDB NCBI_Accession UniProt_ID _struct.title _struct.pdbx_model_detail ranking debugg model ID notes NCBI_Gi NCBI_UpdateDate NCBI_TaxId
Protein
285L 285L.pdb CAD2068351.1 A0A485PQD1 ASFV-G 285L This model was predicted using AlphaFold2 model_1_pred_0 NaN 1886136876 2020/08/05 10497
A104R A104R.pdb CAD2068395.1 A0A0A1E0L7 ASFV-G A104R This model was predicted using AlphaFold2 model_2_pred_0 NaN 1886136920 2020/08/05 10497
A118R A118R.pdb CAD2068397.1 A0A2X0RVA9 ASFV-G A118R This model was predicted using AlphaFold2 model_1_pred_0 NaN 1886136922 2020/08/05 10497
A137R A137R.pdb CAD2068404.1 A0A2X0THQ0 ASFV-G A137R This model was predicted using AlphaFold2 model_3_pred_0 NaN 1886136929 2020/08/05 10497
A151R A151R.pdb CAD2068398.1 A0A2X0TC55 ASFV-G A151R This model was predicted using AlphaFold2 model_4_pred_0 NaN 1886136923 2020/08/05 10497
... ... ... ... ... ... ... ... ... ... ...
QP509L QP509L.pdb CAD2068484.1 A0A2X0THX2 ASFV-G QP509L This model was predicted using AlphaFold2 NaN NaN 1886137009 2020/08/05 10497
R298L R298L.pdb CAD2068482.1 A0A2X0SE42 ASFV-G R298L This model was predicted using AlphaFold2 model_3_pred_0 NaN 1886137007 2020/08/05 10497
S183L S183L.pdb CAD2068472.1 A0A2X0SE34 ASFV-G S183L This model was predicted using AlphaFold2 model_4_pred_0 NaN 1886136997 2020/08/05 10497
S273R S273R.pdb CAD2068473.1 A0A2X0TKM5 ASFV-G S273R This model was predicted using AlphaFold2 model_2_pred_0 NaN 1886136998 2020/08/05 10497
X69R X69R.pdb CAD2068372.1 A0A2X0TKC7 ASFV-G X69R This model was predicted using AlphaFold2 model_2_pred_0 NaN 1886136897 2020/08/05 10497

197 rows × 10 columns

In [46]:
len(set(metadata.NCBI_Accession)), set(metadata.NCBI_TaxId), \
len(set(metadata.NCBI_Gi)), set(metadata.NCBI_UpdateDate)
Out[46]:
(193, {'10497'}, 193, {'2020/08/05'})
In [54]:
# all matching tax IDs?
for protein, upac in metadata.UniProt_ID.items():
    up_info = _fetch_upkb_entry(upac)
    if up_info["up_ncbi_taxid"] != "10497":
        print(protein, up_info)
In [64]:
# check PDB files
import ost
from ost import testutils, conop
# setup conop
testutils.SetDefaultCompoundLib()
io.profiles['DEFAULT'].processor = conop.RuleBasedProcessor(conop.GetDefaultLib())
# check processing
ost.PushVerbosityLevel(2)
for protein, pdb_ext in sorted(pdb_file_split):
    pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
    ent = io.LoadPDB(pdb_path)
ost.PopVerbosityLevel()
# NOTE: lack of output means that all atom names are ok
In [84]:
up_info
Out[84]:
{'up_organism': 'African swine fever virus (ASFV)',
 'up_sequence': 'MLLYIVIIVACIISKLVPNEYWAIHLFFIIMIFMVYMYEKLDIHQKSQFWNYTMSGLSGHNVQVTCKCY',
 'up_ac': 'A0A2X0TKC7',
 'up_id': 'A0A2X0TKC7_ASF',
 'up_last_mod': datetime.datetime(2018, 9, 12, 0, 0),
 'up_gn': 'X69R CDS',
 'up_ncbi_taxid': '10497',
 'up_seqlen': 69,
 'up_crc64': '3B92E4DB323A7A74',
 'up_isoform': None}
In [85]:
ncbi_info
Out[85]:
{'Caption': 'CAD2068372',
 'Title': 'X69R CDS [African swine fever virus]',
 'Extra': 'gi|1886136897|emb|CAD2068372.1|[1886136897]',
 'Gi': 1886136897,
 'CreateDate': '2010/08/18',
 'UpdateDate': '2020/08/05',
 'Flags': 0,
 'TaxId': 10497,
 'Length': 69,
 'Status': 'live',
 'ReplacedBy': None,
 'Comment': '  ',
 'AccessionVersion': 'CAD2068372.1'}
In [154]:
# check files that were converted (input is the stdout of the conversion run, cut down to the "translating..." blocks)
log_lines = open("./script_out.txt").readlines()
In [155]:
# plenty of assertions in here should also catch any errors...
idx = 1
timings = dict()
while idx < len(log_lines) - 1:
    l = log_lines[idx].strip()
    if "already done..." in l:
        idx += 1
        continue
    assert l.startswith("translating")
    mdl_title = l[len("translating"):-3].strip()
    if mdl_title in timings:
        print("WEIRD", l)
    l = log_lines[idx + 1].strip()
    assert l.startswith("preparing data")
    assert l.endswith("s)")
    t_prep = float(l.split()[-1][1:-2])
    l = log_lines[idx + 2].strip()
    assert l.startswith("generating ModelCIF objects")
    assert l.endswith("s)")
    t_cif = float(l.split()[-1][1:-2])
    l = log_lines[idx + 3].strip()
    assert l.startswith("processing QA scores")
    assert l.endswith("s)")
    t_qa = float(l.split()[-1][1:-2])
    l = log_lines[idx + 4].strip()
    assert l.startswith("write to disk")
    assert l.endswith("s)")
    t_write = float(l.split()[-1][1:-2])
    l = log_lines[idx + 5].strip()
    assert l.startswith("... done with")
    assert l.endswith("s).")
    t_all = float(l.split()[-1][1:-3])
    timings[mdl_title] = {
        "t_prep": t_prep,
        "t_cif": t_cif,
        "t_qa": t_qa,
        "t_write": t_write,
        "t_all": t_all
    }
    idx += 6
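For reference, one block of the stdout this loop expects looks roughly like this
(timings and any wording beyond the asserted prefixes/suffixes are illustrative):

translating 285L...
 preparing data... (0.12s)
 generating ModelCIF objects... (0.03s)
 processing QA scores... (0.01s)
 write to disk... (0.25s)
... done with 285L (0.41s).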
In [156]:
print(f"DONE {len(timings)} models")
mdl_titles = set(metadata.index)
assert len(set(timings) - mdl_titles) == 0
missing_ones = mdl_titles - set(timings)
print(f"MISSING {len(missing_ones)} models")
DONE 197 models
MISSING 0 models
In [157]:
k_parts = ["t_prep", "t_cif", "t_qa", "t_write"]
totals = {k: sum(v[k] for v in timings.values()) \
          for k in k_parts + ["t_all"]}
print(f"TOTAL TIME SPENT: {round(totals['t_all'], 2)} s")
for k in k_parts:
    print(f"-> {k}: {round(100 * totals[k] / totals['t_all'], 1)}%")
TOTAL TIME SPENT: 356.38 s
-> t_prep: 80.5%
-> t_cif: 1.2%
-> t_qa: 0.0%
-> t_write: 17.8%
In [ ]: