Commit 084f076f authored by Gerardo Tauriello

Added USDA-ASFVG project
# Modelling of the African Swine Fever proteome from USDA

Main links:
- [Project in ModelArchive (MA)](https://modelarchive.org/doi/10.5452/ma-asfv-asfvg) (incl. background on the project itself)
- [Jira story](https://jira.biozentrum.unibas.ch/browse/SCHWED-5651)

Setup:
- AlphaFold used for monomer predictions with the default CASP14 setup (no PAE, no pTM, templates used, relaxation enabled)
- 196 models were done with the default setup; 1 model (QP509L) was done with the AlphaFold Colab notebook and a separate GROMACS relaxation step
- Input received from the depositors:
  - PDB files for the top-ranked relaxed models
  - CSV file with crosslinks (UniProt and NCBI), title, description and original filename

Special features here:
- Somewhat generic code for the AlphaFold modelling step and the sequence DBs used (can distinguish full_dbs and reduced_dbs, and template search)
- pLDDT extracted from B-factors (simplest setup since there are no other QA scores anyway); see the sketch after this list
- Model file names did not contain information on the AlphaFold model number (hence the info in the CSV file)
- Crosslinks to UniProt and NCBI (with sanity checks on both)
- Dealing with entries which cover a subset of the reference sequence (CP2475L.. for UniProt A0A2X0THU5)
- Special case (QP509L) with a GROMACS model relaxation step (pLDDT fetched from a separate file)
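
A minimal sketch of the B-factor-based pLDDT extraction, assuming the AlphaFold convention of storing per-residue pLDDT in the B-factor column; the file name and exact gemmi calls are illustrative, not the project's actual code:

```python
import gemmi

# "model.pdb" is a placeholder; assumes one pLDDT value per residue,
# stored in the B-factor of the CA atom (AlphaFold convention)
st = gemmi.read_structure("model.pdb")
plddt = {}
for chain in st[0]:
    for res in chain:
        for atom in res:
            if atom.name == "CA":
                plddt[(chain.name, res.seqid.num)] = atom.b_iso
if plddt:
    print(f"mean pLDDT: {sum(plddt.values()) / len(plddt):.2f}")
```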

Content:
- translate2modelcif.py: script to do the conversion, based on the CoFFE-sponge-proteins project (identical Docker setup used)
- tests folder with:
  - test_modelCIF_MA.py to convert ModelCIF to the content displayed in ModelArchive (needs the gemmi library)
  - test.ipynb and .html for tests performed during development
tests/test_modelCIF_MA.py:
# USAGE: python test_modelCIF_MA.py [CIF-FILE]
# OUTPUT: preview of MA entry
import sys

import gemmi

################################################################################
# define converters for printing or for HTML (here markdown is used for display)
def _make_url(link_text, link_url):
    return f"[{link_text}]({link_url})"


def _make_list(list_top, list_items):
    list_texts = [f"- {item}" for item in list_items]
    list_text = "\n".join(list_texts)
    if list_texts:
        return f"{list_top}\n{list_text}"


def _make_multiline_text(lines):
    return "\n".join(lines)


def _make_paragraph(text):
    return f"{text}\n\n"
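
# Quick reference for the converters above (outputs shown as Python literals):
#   _make_url("text", "https://x.y")  -> "[text](https://x.y)"
#   _make_list("Top:", ["a", "b"])    -> "Top:\n- a\n- b"
#   _make_multiline_text(["a", "b"])  -> "a\nb"
#   _make_paragraph("text")           -> "text\n\n"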
################################################################################
################################################################################
# HELPERS
################################################################################
def _get_chain_entity(block):
    # NOTE: adapted from ParseCIF (asym_ids as list and handle optional fields)
    chain_entity = {}
    ma_target_entity = block.find(
        "_ma_target_entity_instance.", ["asym_id", "entity_id"]
    )
    if ma_target_entity:
        for row in ma_target_entity:
            if row["entity_id"] in chain_entity:
                chain_entity[row["entity_id"]]["asym_ids"].append(
                    gemmi.cif.as_string(row["asym_id"])
                )
            else:
                chain_entity[row["entity_id"]] = {
                    "asym_ids": [gemmi.cif.as_string(row["asym_id"])],
                    "ma_target_ref_db_details": [],
                }
    for row in block.find("_entity.", ["id", "pdbx_description"]):
        chain_entity[row["id"]].update(
            {"pdbx_description": gemmi.cif.as_string(row["pdbx_description"])}
        )
    cols = [
        "target_entity_id",
        "db_accession",
        "db_name",
        "?db_name_other_details",
        "?organism_scientific",
        "?seq_db_align_begin",
        "?seq_db_align_end",
    ]
    # NOTE: row[col] doesn't work with '?' in find! Bad crashes happen if you try...
    for row in block.find("_ma_target_ref_db_details.", cols):
        json_obj = {}
        target_entity_id = row[0]  # make sure this stays at idx 0 in cols!
        for idx, cq in enumerate(cols):
            if cq.startswith("?"):
                col = cq[1:]
                if not row.has(idx) or gemmi.cif.is_null(row[idx]):
                    json_obj[col] = None
                    continue
            else:
                col = cq
            if col != "target_entity_id":
                if col.find("seq_db_align") > -1:
                    json_obj[col] = gemmi.cif.as_int(row[idx])
                else:
                    json_obj[col] = gemmi.cif.as_string(row[idx])
        chain_entity[target_entity_id][
            "ma_target_ref_db_details"
        ].append(json_obj)
    for row in block.find(
        "_entity_poly.",
        ["entity_id", "pdbx_strand_id", "pdbx_seq_one_letter_code"],
    ):
        chain_entity[row["entity_id"]]["olc"] = gemmi.cif.as_string(
            row["pdbx_seq_one_letter_code"]
        ).replace("\n", "")
    return chain_entity
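
# For reference, _get_chain_entity returns a dict keyed on entity id; a sketch
# of the expected shape (values are illustrative, not from a real entry):
#   {"1": {"asym_ids": ["A"],
#          "pdbx_description": "some viral protein",
#          "olc": "MTEY...",
#          "ma_target_ref_db_details": [
#              {"db_name": "UNP", "db_accession": "A0A2X0THU5",
#               "db_name_other_details": None, "organism_scientific": None,
#               "seq_db_align_begin": 1, "seq_db_align_end": 100}]}}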
def _fetch_qa_data(block):
    """Get 3 lists of QA scores and extract global scores."""
    # fetch main info
    table = block.find("_ma_qa_metric.", ["id", "name", "mode", "type"])
    qa_dict = dict()  # for easy access: keyed on "id" and rest as dict
    for row in table:
        d = {
            key: row.str(idx + 1)
            for idx, key in enumerate(["name", "mode", "type"])
        }
        qa_dict[row.str(0)] = d
    # fetch global scores
    qa_global = []
    table = block.find("_ma_qa_metric_global.", ["metric_id", "metric_value"])
    for row in table:
        metric_id = row.str(0)
        metric_value = gemmi.cif.as_number(row.get(1))
        if metric_id in qa_dict:
            assert qa_dict[metric_id]["mode"] == "global"
            qa_dict[metric_id]["value"] = metric_value
            qa_global.append(qa_dict[metric_id])
    # fetch local scores
    qa_local = [d for d in qa_dict.values() if d["mode"] == "local"]
    qa_local_pairwise = [
        d for d in qa_dict.values() if d["mode"] == "local-pairwise"
    ]
    return qa_global, qa_local, qa_local_pairwise
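
# _fetch_qa_data returns (qa_global, qa_local, qa_local_pairwise); e.g. for a
# typical AlphaFold entry (illustrative values, not taken from a real file):
#   qa_global         -> [{"name": "pLDDT", "mode": "global",
#                          "type": "pLDDT", "value": 85.3}]
#   qa_local          -> [{"name": "pLDDT", "mode": "local", "type": "pLDDT"}]
#   qa_local_pairwise -> []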
################################################################################
################################################################################
# TEXT GETTERS
# -> each function turns a gemmi-block into a text
# -> bits that could be reused elsewhere are given by helpers above
################################################################################
def _get_title_text(block):
    title = block.find_value("_struct.title")
    if title:
        title = gemmi.cif.as_string(title)
    if title:
        return title
    else:
        return "NO TITLE"


def _get_overview_text(block):
    abstract = block.find_value("_struct.pdbx_model_details")
    if abstract:
        abstract = gemmi.cif.as_string(abstract)
    if abstract:
        text = _make_paragraph(abstract)
    else:
        text = _make_paragraph("NO ABSTRACT")
    # TODO: "TOFILL" below to be filled by MA using info from model set
    text += _make_paragraph('This model is part of the dataset "TOFILL"')
    return text


def _get_entity_text(block):
    # NOTE: reuse (updated!) code from ParseCIF
    chain_entity = _get_chain_entity(block)
    # do it
    item_strings = []
    for ent in chain_entity:
        item_title = (
            f"{chain_entity[ent]['pdbx_description']} "
            f"(chains: {', '.join(chain_entity[ent]['asym_ids'])})"
        )
        db_links = [item_title]
        for ref in chain_entity[ent]["ma_target_ref_db_details"]:
            if ref["db_name"] == "UNP":
                link_text = ref["db_accession"]
                link_url = f"https://www.uniprot.org/uniprot/{ref['db_accession']}"
                db_link = f"UniProt: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "OrthoDB":
                link_text = ref["db_accession"]
                link_url = f"https://www.orthodb.org/?query={ref['db_accession']}"
                db_link = f"OrthoDB: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "NCBI":
                link_text = ref["db_accession"]
                link_url = f"https://www.ncbi.nlm.nih.gov/protein/{ref['db_accession']}"
                db_link = f"NCBI: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "Other" and ref["db_name_other_details"]:
                db_link = f"{ref['db_name_other_details']}: {ref['db_accession']}"
            else:
                db_link = f"{ref['db_name']}: {ref['db_accession']}"
            if ref["seq_db_align_begin"] and ref["seq_db_align_end"]:
                db_link += f" {ref['seq_db_align_begin']}-{ref['seq_db_align_end']}"
            if ref["organism_scientific"]:
                db_link += f"; {ref['organism_scientific']}"
            db_links.append(db_link)
        item_strings.append(_make_multiline_text(db_links))
    # and into a paragraph...
    if item_strings:
        list_top = "The following molecular entities are in the model:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
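
# Example of the rendered entity paragraph (accession, range and description
# are illustrative only; continuation lines of a list item are not indented):
#   The following molecular entities are in the model:
#   - some viral protein (chains: A)
#   UniProt: [A0A2X0THU5](https://www.uniprot.org/uniprot/A0A2X0THU5) 1-100; African swine fever virus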
def _get_sw_text(block):
    # get author names for each citation
    tmp = dict()
    for row in block.find("_citation_author.", ["citation_id", "name"]):
        cid = row.str(0)
        name = row.str(1)
        if cid not in tmp:
            tmp[cid] = {"name": name.split()[0].split(",")[0], "etal": ""}
        else:
            tmp[cid]["etal"] = " et al."
    cit_names = {cid: (d["name"] + d["etal"]) for cid, d in tmp.items()}
    # add year if available
    table = block.find("_citation.", ["id", "?year"])
    if table.has_column(1):
        for row in table:
            cid = row.str(0)
            year = row.str(1)
            if cid in cit_names and year:
                cit_names[cid] += " " + year
    # add URL if available
    cit_urls = {}
    table = block.find(
        "_citation.",
        ["id", "?pdbx_database_id_DOI", "?pdbx_database_id_PubMed"],
    )
    formatters = [
        "https://doi.org/%s",
        "https://www.ncbi.nlm.nih.gov/pubmed/%s",
    ]
    for row in table:
        cid = row.str(0)
        # add whichever URL we find first
        for i in range(1, table.width()):
            if row.has(i) and row.str(i):
                cit_urls[cid] = formatters[i - 1] % row.str(i)
                break
    # now map this to software
    item_strings = []
    table = block.find(
        "_software.", ["name", "?location", "?version", "?citation_id"]
    )
    for row in table:
        sw_name = row.str(0)
        if row.has(1) and row.str(1):
            item = _make_url(sw_name, row.str(1))
        else:
            item = sw_name
        if row.has(2) and row.str(2):
            item += f" ({row.str(2)})"
        if row.has(3) and row.str(3) in cit_names:
            cid = row.str(3)
            if cid in cit_urls:
                item += f" ({_make_url(cit_names[cid], cit_urls[cid])})"
            else:
                item += f" ({cit_names[cid]})"
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        list_top = "The following software was used:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
def _get_ref_db_text(block):
    # look for DBs with version or release date
    item_strings = []
    table = block.find(
        "_ma_data_ref_db.", ["name", "?version", "?release_date"]
    )
    for row in table:
        item = f"{row.str(0)}"
        # add whichever of version / release date we find first
        for i in range(1, 3):
            if row.has(i) and row.str(i):
                item += f" ({row.str(i)})"
                break
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        list_top = "The following reference databases were used:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
def _get_tpl_text(block):
    # collect info per tpl-id
    tpl_dict = {}  # keyed on template_id
    # fetch main info
    cols = ["template_id", "target_asym_id", "template_auth_asym_id",
            "?template_label_asym_id"]
    for row in block.find("_ma_template_details.", cols):
        tid = row.str(0)
        tpl_dict[tid] = {
            "trg_asym_id": row.str(1),
            "tpl_auth_asym_id": row.str(2),
        }
        if row.has(3) and row.str(3):
            tpl_dict[tid]["tpl_label_asym_id"] = row.str(3)
    # add ref DBs
    cols = ["template_id", "db_accession_code", "db_name",
            "?db_name_other_details"]
    for row in block.find("_ma_template_ref_db_details.", cols):
        tid = row.str(0)
        if tid in tpl_dict:
            tpl_dict[tid]["db_acc"] = row.str(1)
            if row.str(2) == "Other" and row.has(3) and row.str(3):
                tpl_dict[tid]["db_name"] = row.str(3)
            else:
                tpl_dict[tid]["db_name"] = row.str(2)
    # add info for small molecules
    cols = ["template_id", "?comp_id", "?details"]
    for row in block.find("_ma_template_non_poly.", cols):
        tid = row.str(0)
        if tid in tpl_dict:
            if row.has(1) and row.str(1):
                tpl_dict[tid]["non_poly_comp_id"] = row.str(1)
            if row.has(2) and row.str(2):
                tpl_dict[tid]["non_poly_details"] = row.str(2)
    # aggregate per template for displaying
    tpl_td = dict()
    for tpl in tpl_dict.values():
        did = f"{tpl['db_name']}-{tpl['db_acc']}"
        if did not in tpl_td:
            if tpl["db_name"] == "PDB":
                link_url = f"http://dx.doi.org/10.2210/pdb{tpl['db_acc']}/pdb"
            elif tpl["db_name"] == "PubChem":
                link_url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{tpl['db_acc']}"
            elif tpl["db_name"] == "AlphaFoldDB":
                link_url = f"https://alphafold.ebi.ac.uk/entry/{tpl['db_acc']}"
            else:
                link_url = None
                print(f"URLs for {tpl['db_name']} NOT SUPPORTED YET")
            if link_url:
                tpl_text = f"{tpl['db_name']}: {_make_url(tpl['db_acc'], link_url)}"
            else:
                tpl_text = f"{tpl['db_name']}: {tpl['db_acc']}"
            tpl_td[did] = {
                "tpl_text": tpl_text,
                "tpl_chains_label": [],
                "tpl_chains_auth": [],
                "tpl_chains_all_label": True,
                "tpl_non_poly_ids": [],
            }
        # collect chain names
        if "tpl_label_asym_id" in tpl:
            # if here, it is guaranteed to be non-empty
            tpl_td[did]["tpl_chains_label"].append(tpl["tpl_label_asym_id"])
        else:
            # if any is missing, we set all to False and fall back to auth
            tpl_td[did]["tpl_chains_all_label"] = False
        if tpl["tpl_auth_asym_id"]:
            # only add non-empty ones
            tpl_td[did]["tpl_chains_auth"].append(tpl["tpl_auth_asym_id"])
        # collect info on non-poly if available (prefer short comp. ID)
        if "non_poly_comp_id" in tpl:
            tpl_td[did]["tpl_non_poly_ids"].append(tpl["non_poly_comp_id"])
        elif "non_poly_details" in tpl:
            tpl_td[did]["tpl_non_poly_ids"].append(tpl["non_poly_details"])
    # turn into text
    item_strings = []
    for tpl in tpl_td.values():
        item = tpl["tpl_text"]
        if tpl["tpl_chains_all_label"] and tpl["tpl_chains_label"]:
            chain_ids = sorted(set(tpl["tpl_chains_label"]))
            item += f"; chains (label_asym_id): {', '.join(chain_ids)}"
        elif tpl["tpl_chains_auth"]:
            chain_ids = sorted(set(tpl["tpl_chains_auth"]))
            item += f"; chains (auth_asym_id): {', '.join(chain_ids)}"
        if tpl["tpl_non_poly_ids"]:
            np_ids = sorted(set(tpl["tpl_non_poly_ids"]))
            item += f"; non-polymers: {', '.join(np_ids)}"
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        list_top = "The following templates were used:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
def _get_protocol_steps_text(block):
    # do it
    item_strings = []
    table = block.find(
        "_ma_protocol_step.", ["step_id", "method_type", "?details"]
    )
    for row in table:
        item = f"Step {row.str(0)} - {row.str(1)}"
        if row.has(2) and row.str(2):
            item += f": {row.str(2)}"
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        return _make_paragraph(_make_multiline_text(item_strings))
    else:
        return ""
def _get_qa_acc_text(block):
    # get QA part (can reuse if already used elsewhere)
    qa_global, qa_local, qa_local_pairwise = _fetch_qa_data(block)
    # parse accompanying data
    file_contents = block.find_values("_ma_entry_associated_files.file_content")
    has_single_zip_file = (
        len(file_contents) == 1
        and file_contents.str(0) == "archive with multiple files"
    )
    if has_single_zip_file:
        # override with data from the per-file category
        file_contents = block.find_values(
            "_ma_associated_archive_file_details.file_content"
        )
    has_loc_pw_in_acc = any(
        gemmi.cif.as_string(v) == "local pairwise QA scores"
        for v in file_contents
    )
    # put together text
    text = ""
    # text for QA
    item = ""
    if len(qa_global) > 1:
        score_strings = [f"{v['name']} of {v['value']}" for v in qa_global]
        item = (
            f"The model has the following global model confidence scores:"
            f" {', '.join(score_strings)}."
        )
    elif len(qa_global) == 1:
        item = (
            f"The model has a global model confidence score "
            f"({qa_global[0]['name']}) of {qa_global[0]['value']}."
        )
    if item:
        text += _make_paragraph(item)
    # lots of options for the local QA string
    item = ""
    qa_local_names = ", ".join([v["name"] for v in qa_local])
    qa_loc_pw_names = ", ".join([v["name"] for v in qa_local_pairwise])
    if qa_local_names and qa_loc_pw_names and has_loc_pw_in_acc:
        item = (
            f"Local per-residue model confidence scores ({qa_local_names}) "
            f"are available in the model mmCIF file "
            f"and local per-residue-pair scores ({qa_loc_pw_names}) "
            f"in the accompanying data download."
        )
    elif qa_local_names and qa_loc_pw_names and not has_loc_pw_in_acc:
        item = (
            f"Local per-residue model confidence scores ({qa_local_names}) "
            f"and local per-residue-pair scores ({qa_loc_pw_names}) "
            f"are available in the model mmCIF file."
        )
    elif qa_local_names and not qa_loc_pw_names:
        item = (
            f"Local per-residue model confidence scores ({qa_local_names}) "
            f"are available in the model mmCIF file."
        )
    elif not qa_local_names and qa_loc_pw_names and has_loc_pw_in_acc:
        item = (
            f"Local per-residue-pair model confidence scores "
            f"({qa_loc_pw_names}) are available in the accompanying data "
            f"download."
        )
    elif not qa_local_names and qa_loc_pw_names and not has_loc_pw_in_acc:
        item = (
            f"Local per-residue-pair model confidence scores "
            f"({qa_loc_pw_names}) are available in the model mmCIF file."
        )
    if item:
        text += _make_paragraph(item)
    # list files in accompanying data (if any)
    if has_single_zip_file:
        table = block.find(
            "_ma_associated_archive_file_details.",
            ["file_path", "?file_content", "?description"],
        )
    else:
        # NOTE: aimed to work legacy-style for Baker-models but should be
        # obsoleted -> can replace below with "table = None" in the future
        table = block.find(
            "_ma_entry_associated_files.",
            ["file_url", "?file_content", "?details"],
        )
    if table:
        list_top = "Files in accompanying data:"
        list_items = []
        for row in table:
            item = f"{row.str(0)}"
            if row.has(1) and row.str(1):
                item += f" ({row.str(1)})"
            if row.has(2) and row.str(2):
                item += f": {row.str(2)}"
            list_items.append(item)
        text += _make_paragraph(_make_list(list_top, list_items))
    # conclude with a standard pointer to the ModelCIF dictionary
    model_cif_link = _make_url(
        "ModelCIF format",
        "https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/"
    )
    text += _make_paragraph(
        f"Full details are available in {model_cif_link} "
        f"in the model mmCIF file."
    )
    return text
################################################################################
# full parsing of file
def _process_file(file_path):
    print(f"ModelCIF->MA for {file_path}")
    doc = gemmi.cif.read(file_path)
    block = doc.sole_block()
    print("-" * 80)
    print(f"TITLE: {_get_title_text(block)}")
    print("-" * 80)
    print(f"OVERVIEW:\n{'-'*80}")
    print(_get_overview_text(block))
    print("-" * 80)
    print(f"MATERIAL:\n{'-'*80}")
    material_text = (
        _get_entity_text(block)
        + _get_sw_text(block)
        + _get_ref_db_text(block)
        + _get_tpl_text(block)
    )
    print(material_text)
    print("-" * 80)
    print(f"PROCEDURE:\n{'-'*80}")
    procedure_text = _get_protocol_steps_text(block) + _get_qa_acc_text(block)
    print(procedure_text)
    print("-" * 80)
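
# For orientation, a run prints four sections separated by "-" rulers, roughly:
#   TITLE:     <_struct.title>
#   OVERVIEW:  abstract + dataset pointer
#   MATERIAL:  entities, software, reference databases, templates
#   PROCEDURE: protocol steps + QA / accompanying-data summary
# (section contents depend on what the given ModelCIF file provides)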
def _main():
    # get file to process
    if len(sys.argv) == 2:
        file_path = sys.argv[1]
        _process_file(file_path)
    else:
        print("USAGE: python test_modelCIF_MA.py [CIF-FILE]")


if __name__ == "__main__":
    _main()