Commit 084f076f authored by Gerardo Tauriello

Added USDA-ASFVG project
# Modelling of the African Swine Fever proteome from USDA

Main links:
- [Project in ModelArchive (MA)](https://modelarchive.org/doi/10.5452/ma-asfv-asfvg) (incl. background on the project itself)
- [Jira story](https://jira.biozentrum.unibas.ch/browse/SCHWED-5651)

Setup:
- AlphaFold used for monomer predictions with the default CASP14 setup (no PAE, no pTM, templates used, relaxation enabled)
- 196 models were done with the default setup; 1 model (QP509L) was done with the AlphaFold Colab notebook and a separate GROMACS relaxation step
- Input received from the depositors:
  - PDB files for the top-ranked relaxed models
  - CSV file with crosslinks (UniProt and NCBI), title, description and original filename

Special features here:
- Somewhat generic code for the AlphaFold modelling step and the sequence DBs used (can distinguish full_dbs and reduced_dbs, and template search)
- pLDDT extracted from B-factors (simplest setup since there are no other QA scores anyway); see the sketch after this list
- Model file names did not contain information on the AlphaFold model number (hence the info in the CSV file)
- Crosslinks to UniProt and NCBI (with sanity checks on both)
- Dealing with entries which cover a subset of the reference sequence (CP2475L.. for UniProt A0A2X0THU5)
- Special case (QP509L) with a GROMACS model relaxation step (pLDDT fetched from a separate file)
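
A minimal sketch of the B-factor-based pLDDT extraction, assuming the AlphaFold convention of storing per-residue pLDDT in the B-factor column; the file name and exact gemmi calls are illustrative, not the project's actual code:

```python
import gemmi

# "model.pdb" is a placeholder; assumes one pLDDT value per residue,
# stored in the B-factor of the CA atom (AlphaFold convention)
st = gemmi.read_structure("model.pdb")
plddt = {}
for chain in st[0]:
    for res in chain:
        for atom in res:
            if atom.name == "CA":
                plddt[(chain.name, res.seqid.num)] = atom.b_iso
if plddt:
    print(f"mean pLDDT: {sum(plddt.values()) / len(plddt):.2f}")
```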

Content:
- translate2modelcif.py: script to do the conversion, based on the CoFFE-sponge-proteins project (identical Docker setup used)
- tests folder with:
  - test_modelCIF_MA.py to convert ModelCIF to the content displayed in ModelArchive (needs the gemmi library)
  - test.ipynb and .html for tests performed during development
tests/test_modelCIF_MA.py:
# USAGE: python test_modelCIF_MA.py [CIF-FILE]
# OUTPUT: preview of MA entry
import sys

import gemmi

################################################################################
# define converters for printing or for HTML (here markdown is used for display)
def _make_url(link_text, link_url):
    return f"[{link_text}]({link_url})"


def _make_list(list_top, list_items):
    list_texts = [f"- {item}" for item in list_items]
    list_text = "\n".join(list_texts)
    if list_texts:
        return f"{list_top}\n{list_text}"


def _make_multiline_text(lines):
    return "\n".join(lines)


def _make_paragraph(text):
    return f"{text}\n\n"
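
# Quick reference for the converters above (outputs shown as Python literals):
#   _make_url("text", "https://x.y")  -> "[text](https://x.y)"
#   _make_list("Top:", ["a", "b"])    -> "Top:\n- a\n- b"
#   _make_multiline_text(["a", "b"])  -> "a\nb"
#   _make_paragraph("text")           -> "text\n\n"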
################################################################################
################################################################################
# HELPERS
################################################################################
def _get_chain_entity(block):
    # NOTE: adapted from ParseCIF (asym_ids as list and handle optional fields)
    chain_entity = {}
    ma_target_entity = block.find(
        "_ma_target_entity_instance.", ["asym_id", "entity_id"]
    )
    if ma_target_entity:
        for row in ma_target_entity:
            if row["entity_id"] in chain_entity:
                chain_entity[row["entity_id"]]["asym_ids"].append(
                    gemmi.cif.as_string(row["asym_id"])
                )
            else:
                chain_entity[row["entity_id"]] = {
                    "asym_ids": [gemmi.cif.as_string(row["asym_id"])],
                    "ma_target_ref_db_details": [],
                }
    for row in block.find("_entity.", ["id", "pdbx_description"]):
        chain_entity[row["id"]].update(
            {"pdbx_description": gemmi.cif.as_string(row["pdbx_description"])}
        )
    cols = [
        "target_entity_id",
        "db_accession",
        "db_name",
        "?db_name_other_details",
        "?organism_scientific",
        "?seq_db_align_begin",
        "?seq_db_align_end",
    ]
    # NOTE: row[col] doesn't work with '?' in find! Bad crashes happen if you try...
    for row in block.find("_ma_target_ref_db_details.", cols):
        json_obj = {}
        target_entity_id = row[0]  # make sure this stays at idx 0 in cols!
        for idx, cq in enumerate(cols):
            if cq.startswith("?"):
                col = cq[1:]
                if not row.has(idx) or gemmi.cif.is_null(row[idx]):
                    json_obj[col] = None
                    continue
            else:
                col = cq
            if col != "target_entity_id":
                if col.find("seq_db_align") > -1:
                    json_obj[col] = gemmi.cif.as_int(row[idx])
                else:
                    json_obj[col] = gemmi.cif.as_string(row[idx])
        chain_entity[target_entity_id][
            "ma_target_ref_db_details"
        ].append(json_obj)
    for row in block.find(
        "_entity_poly.",
        ["entity_id", "pdbx_strand_id", "pdbx_seq_one_letter_code"],
    ):
        chain_entity[row["entity_id"]]["olc"] = gemmi.cif.as_string(
            row["pdbx_seq_one_letter_code"]
        ).replace("\n", "")
    return chain_entity
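
# For reference, _get_chain_entity returns a dict keyed on entity id; a sketch
# of the expected shape (values are illustrative, not from a real entry):
#   {"1": {"asym_ids": ["A"],
#          "pdbx_description": "some viral protein",
#          "olc": "MTEY...",
#          "ma_target_ref_db_details": [
#              {"db_name": "UNP", "db_accession": "A0A2X0THU5",
#               "db_name_other_details": None, "organism_scientific": None,
#               "seq_db_align_begin": 1, "seq_db_align_end": 100}]}}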
def _fetch_qa_data(block):
    """Get 3 lists of QA scores and extract global scores."""
    # fetch main info
    table = block.find("_ma_qa_metric.", ["id", "name", "mode", "type"])
    qa_dict = dict()  # for easy access: keyed on "id" and rest as dict
    for row in table:
        d = {
            key: row.str(idx + 1)
            for idx, key in enumerate(["name", "mode", "type"])
        }
        qa_dict[row.str(0)] = d
    # fetch global scores
    qa_global = []
    table = block.find("_ma_qa_metric_global.", ["metric_id", "metric_value"])
    for row in table:
        metric_id = row.str(0)
        metric_value = gemmi.cif.as_number(row.get(1))
        if metric_id in qa_dict:
            assert qa_dict[metric_id]["mode"] == "global"
            qa_dict[metric_id]["value"] = metric_value
            qa_global.append(qa_dict[metric_id])
    # fetch local scores
    qa_local = [d for d in qa_dict.values() if d["mode"] == "local"]
    qa_local_pairwise = [
        d for d in qa_dict.values() if d["mode"] == "local-pairwise"
    ]
    return qa_global, qa_local, qa_local_pairwise
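
# _fetch_qa_data returns (qa_global, qa_local, qa_local_pairwise); e.g. for a
# typical AlphaFold entry (illustrative values, not taken from a real file):
#   qa_global         -> [{"name": "pLDDT", "mode": "global",
#                          "type": "pLDDT", "value": 85.3}]
#   qa_local          -> [{"name": "pLDDT", "mode": "local", "type": "pLDDT"}]
#   qa_local_pairwise -> []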
################################################################################
################################################################################
# TEXT GETTERS
# -> each function turns a gemmi-block into a text
# -> bits that could be reused elsewhere are given by helpers above
################################################################################
def _get_title_text(block):
    title = block.find_value("_struct.title")
    if title:
        title = gemmi.cif.as_string(title)
    if title:
        return title
    else:
        return "NO TITLE"


def _get_overview_text(block):
    abstract = block.find_value("_struct.pdbx_model_details")
    if abstract:
        abstract = gemmi.cif.as_string(abstract)
    if abstract:
        text = _make_paragraph(abstract)
    else:
        text = _make_paragraph("NO ABSTRACT")
    # TODO: "TOFILL" below to be filled by MA using info from model set
    text += _make_paragraph('This model is part of the dataset "TOFILL"')
    return text


def _get_entity_text(block):
    # NOTE: reuse (updated!) code from ParseCIF
    chain_entity = _get_chain_entity(block)
    # do it
    item_strings = []
    for ent in chain_entity:
        item_title = (
            f"{chain_entity[ent]['pdbx_description']} "
            f"(chains: {', '.join(chain_entity[ent]['asym_ids'])})"
        )
        db_links = [item_title]
        for ref in chain_entity[ent]["ma_target_ref_db_details"]:
            if ref["db_name"] == "UNP":
                link_text = ref["db_accession"]
                link_url = f"https://www.uniprot.org/uniprot/{ref['db_accession']}"
                db_link = f"UniProt: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "OrthoDB":
                link_text = ref["db_accession"]
                link_url = f"https://www.orthodb.org/?query={ref['db_accession']}"
                db_link = f"OrthoDB: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "NCBI":
                link_text = ref["db_accession"]
                link_url = f"https://www.ncbi.nlm.nih.gov/protein/{ref['db_accession']}"
                db_link = f"NCBI: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "Other" and ref["db_name_other_details"]:
                db_link = f"{ref['db_name_other_details']}: {ref['db_accession']}"
            else:
                db_link = f"{ref['db_name']}: {ref['db_accession']}"
            if ref["seq_db_align_begin"] and ref["seq_db_align_end"]:
                db_link += f" {ref['seq_db_align_begin']}-{ref['seq_db_align_end']}"
            if ref["organism_scientific"]:
                db_link += f"; {ref['organism_scientific']}"
            db_links.append(db_link)
        item_strings.append(_make_multiline_text(db_links))
    # and into a paragraph...
    if item_strings:
        list_top = "The following molecular entities are in the model:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
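
# Example of the rendered entity paragraph (accession, range and description
# are illustrative only; continuation lines of a list item are not indented):
#   The following molecular entities are in the model:
#   - some viral protein (chains: A)
#   UniProt: [A0A2X0THU5](https://www.uniprot.org/uniprot/A0A2X0THU5) 1-100; African swine fever virus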
def _get_sw_text(block):
    # get author names for each citation
    tmp = dict()
    for row in block.find("_citation_author.", ["citation_id", "name"]):
        cid = row.str(0)
        name = row.str(1)
        if cid not in tmp:
            tmp[cid] = {"name": name.split()[0].split(",")[0], "etal": ""}
        else:
            tmp[cid]["etal"] = " et al."
    cit_names = {cid: (d["name"] + d["etal"]) for cid, d in tmp.items()}
    # add year if available
    table = block.find("_citation.", ["id", "?year"])
    if table.has_column(1):
        for row in table:
            cid = row.str(0)
            year = row.str(1)
            if cid in cit_names and year:
                cit_names[cid] += " " + year
    # add URL if available
    cit_urls = {}
    table = block.find(
        "_citation.",
        ["id", "?pdbx_database_id_DOI", "?pdbx_database_id_PubMed"],
    )
    formatters = [
        "https://doi.org/%s",
        "https://www.ncbi.nlm.nih.gov/pubmed/%s",
    ]
    for row in table:
        cid = row.str(0)
        # add whichever URL we find first
        for i in range(1, table.width()):
            if row.has(i) and row.str(i):
                cit_urls[cid] = formatters[i - 1] % row.str(i)
                break
    # now map this to software
    item_strings = []
    table = block.find(
        "_software.", ["name", "?location", "?version", "?citation_id"]
    )
    for row in table:
        sw_name = row.str(0)
        if row.has(1) and row.str(1):
            item = _make_url(sw_name, row.str(1))
        else:
            item = sw_name
        if row.has(2) and row.str(2):
            item += f" ({row.str(2)})"
        if row.has(3) and row.str(3) in cit_names:
            cid = row.str(3)
            if cid in cit_urls:
                item += f" ({_make_url(cit_names[cid], cit_urls[cid])})"
            else:
                item += f" ({cit_names[cid]})"
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        list_top = "The following software was used:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
def _get_ref_db_text(block):
    # look for DBs with version or release date
    item_strings = []
    table = block.find(
        "_ma_data_ref_db.", ["name", "?version", "?release_date"]
    )
    for row in table:
        item = f"{row.str(0)}"
        # add whichever of version / release date we find first
        for i in range(1, 3):
            if row.has(i) and row.str(i):
                item += f" ({row.str(i)})"
                break
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        list_top = "The following reference databases were used:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
def _get_tpl_text(block):
    # collect info per tpl-id
    tpl_dict = {}  # keyed on template_id
    # fetch main info
    cols = ["template_id", "target_asym_id", "template_auth_asym_id",
            "?template_label_asym_id"]
    for row in block.find("_ma_template_details.", cols):
        tid = row.str(0)
        tpl_dict[tid] = {
            "trg_asym_id": row.str(1),
            "tpl_auth_asym_id": row.str(2),
        }
        if row.has(3) and row.str(3):
            tpl_dict[tid]["tpl_label_asym_id"] = row.str(3)
    # add ref DBs
    cols = ["template_id", "db_accession_code", "db_name",
            "?db_name_other_details"]
    for row in block.find("_ma_template_ref_db_details.", cols):
        tid = row.str(0)
        if tid in tpl_dict:
            tpl_dict[tid]["db_acc"] = row.str(1)
            if row.str(2) == "Other" and row.has(3) and row.str(3):
                tpl_dict[tid]["db_name"] = row.str(3)
            else:
                tpl_dict[tid]["db_name"] = row.str(2)
    # add info for small molecules
    cols = ["template_id", "?comp_id", "?details"]
    for row in block.find("_ma_template_non_poly.", cols):
        tid = row.str(0)
        if tid in tpl_dict:
            if row.has(1) and row.str(1):
                tpl_dict[tid]["non_poly_comp_id"] = row.str(1)
            if row.has(2) and row.str(2):
                tpl_dict[tid]["non_poly_details"] = row.str(2)
    # aggregate per template for displaying
    tpl_td = dict()
    for tpl in tpl_dict.values():
        did = f"{tpl['db_name']}-{tpl['db_acc']}"
        if did not in tpl_td:
            if tpl["db_name"] == "PDB":
                link_url = f"http://dx.doi.org/10.2210/pdb{tpl['db_acc']}/pdb"
            elif tpl["db_name"] == "PubChem":
                link_url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{tpl['db_acc']}"
            elif tpl["db_name"] == "AlphaFoldDB":
                link_url = f"https://alphafold.ebi.ac.uk/entry/{tpl['db_acc']}"
            else:
                link_url = None
                print(f"URLs for {tpl['db_name']} NOT SUPPORTED YET")
            if link_url:
                tpl_text = f"{tpl['db_name']}: {_make_url(tpl['db_acc'], link_url)}"
            else:
                tpl_text = f"{tpl['db_name']}: {tpl['db_acc']}"
            tpl_td[did] = {
                "tpl_text": tpl_text,
                "tpl_chains_label": [],
                "tpl_chains_auth": [],
                "tpl_chains_all_label": True,
                "tpl_non_poly_ids": [],
            }
        # collect chain names
        if "tpl_label_asym_id" in tpl:
            # if here, it is guaranteed to be non-empty
            tpl_td[did]["tpl_chains_label"].append(tpl["tpl_label_asym_id"])
        else:
            # if any is missing, we set all to False and fall back to auth
            tpl_td[did]["tpl_chains_all_label"] = False
        if tpl["tpl_auth_asym_id"]:
            # only add non-empty ones
            tpl_td[did]["tpl_chains_auth"].append(tpl["tpl_auth_asym_id"])
        # collect info on non-poly if available (prefer short comp. ID)
        if "non_poly_comp_id" in tpl:
            tpl_td[did]["tpl_non_poly_ids"].append(tpl["non_poly_comp_id"])
        elif "non_poly_details" in tpl:
            tpl_td[did]["tpl_non_poly_ids"].append(tpl["non_poly_details"])
    # turn into text
    item_strings = []
    for tpl in tpl_td.values():
        item = tpl["tpl_text"]
        if tpl["tpl_chains_all_label"] and tpl["tpl_chains_label"]:
            chain_ids = sorted(set(tpl["tpl_chains_label"]))
            item += f"; chains (label_asym_id): {', '.join(chain_ids)}"
        elif tpl["tpl_chains_auth"]:
            chain_ids = sorted(set(tpl["tpl_chains_auth"]))
            item += f"; chains (auth_asym_id): {', '.join(chain_ids)}"
        if tpl["tpl_non_poly_ids"]:
            np_ids = sorted(set(tpl["tpl_non_poly_ids"]))
            item += f"; non-polymers: {', '.join(np_ids)}"
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        list_top = "The following templates were used:"
        return _make_paragraph(_make_list(list_top, item_strings))
    else:
        return ""
def _get_protocol_steps_text(block):
    # do it
    item_strings = []
    table = block.find(
        "_ma_protocol_step.", ["step_id", "method_type", "?details"]
    )
    for row in table:
        item = f"Step {row.str(0)} - {row.str(1)}"
        if row.has(2) and row.str(2):
            item += f": {row.str(2)}"
        item_strings.append(item)
    # and into a paragraph...
    if item_strings:
        return _make_paragraph(_make_multiline_text(item_strings))
    else:
        return ""
def _get_qa_acc_text(block):
    # get QA part (can reuse if already used elsewhere)
    qa_global, qa_local, qa_local_pairwise = _fetch_qa_data(block)
    # parse accompanying data
    file_contents = block.find_values("_ma_entry_associated_files.file_content")
    has_single_zip_file = (
        len(file_contents) == 1
        and file_contents.str(0) == "archive with multiple files"
    )
    if has_single_zip_file:
        # override with data from the per-file category
        file_contents = block.find_values(
            "_ma_associated_archive_file_details.file_content"
        )
    has_loc_pw_in_acc = any(
        gemmi.cif.as_string(v) == "local pairwise QA scores"
        for v in file_contents
    )
    # put together text
    text = ""
    # text for QA
    item = ""
    if len(qa_global) > 1:
        score_strings = [f"{v['name']} of {v['value']}" for v in qa_global]
        item = (
            f"The model has the following global model confidence scores:"
            f" {', '.join(score_strings)}."
        )
    elif len(qa_global) == 1:
        item = (
            f"The model has a global model confidence score "
            f"({qa_global[0]['name']}) of {qa_global[0]['value']}."
        )
    if item:
        text += _make_paragraph(item)
    # lots of options for the local QA string
    item = ""
    qa_local_names = ", ".join([v["name"] for v in qa_local])
    qa_loc_pw_names = ", ".join([v["name"] for v in qa_local_pairwise])
    if qa_local_names and qa_loc_pw_names and has_loc_pw_in_acc:
        item = (
            f"Local per-residue model confidence scores ({qa_local_names}) "
            f"are available in the model mmCIF file "
            f"and local per-residue-pair scores ({qa_loc_pw_names}) "
            f"in the accompanying data download."
        )
    elif qa_local_names and qa_loc_pw_names and not has_loc_pw_in_acc:
        item = (
            f"Local per-residue model confidence scores ({qa_local_names}) "
            f"and local per-residue-pair scores ({qa_loc_pw_names}) "
            f"are available in the model mmCIF file."
        )
    elif qa_local_names and not qa_loc_pw_names:
        item = (
            f"Local per-residue model confidence scores ({qa_local_names}) "
            f"are available in the model mmCIF file."
        )
    elif not qa_local_names and qa_loc_pw_names and has_loc_pw_in_acc:
        item = (
            f"Local per-residue-pair model confidence scores "
            f"({qa_loc_pw_names}) are available in the accompanying data "
            f"download."
        )
    elif not qa_local_names and qa_loc_pw_names and not has_loc_pw_in_acc:
        item = (
            f"Local per-residue-pair model confidence scores "
            f"({qa_loc_pw_names}) are available in the model mmCIF file."
        )
    if item:
        text += _make_paragraph(item)
    # list files in accompanying data (if any)
    if has_single_zip_file:
        table = block.find(
            "_ma_associated_archive_file_details.",
            ["file_path", "?file_content", "?description"],
        )
    else:
        # NOTE: aimed to work legacy-style for Baker-models but should be
        # obsoleted -> can replace below with "table = None" in the future
        table = block.find(
            "_ma_entry_associated_files.",
            ["file_url", "?file_content", "?details"],
        )
    if table:
        list_top = "Files in accompanying data:"
        list_items = []
        for row in table:
            item = f"{row.str(0)}"
            if row.has(1) and row.str(1):
                item += f" ({row.str(1)})"
            if row.has(2) and row.str(2):
                item += f": {row.str(2)}"
            list_items.append(item)
        text += _make_paragraph(_make_list(list_top, list_items))
    # conclude with a standard pointer to the ModelCIF dictionary
    model_cif_link = _make_url(
        "ModelCIF format",
        "https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/"
    )
    text += _make_paragraph(
        f"Full details are available in {model_cif_link} "
        f"in the model mmCIF file."
    )
    return text
################################################################################
# full parsing of file
def _process_file(file_path):
    print(f"ModelCIF->MA for {file_path}")
    doc = gemmi.cif.read(file_path)
    block = doc.sole_block()
    print("-" * 80)
    print(f"TITLE: {_get_title_text(block)}")
    print("-" * 80)
    print(f"OVERVIEW:\n{'-'*80}")
    print(_get_overview_text(block))
    print("-" * 80)
    print(f"MATERIAL:\n{'-'*80}")
    material_text = (
        _get_entity_text(block)
        + _get_sw_text(block)
        + _get_ref_db_text(block)
        + _get_tpl_text(block)
    )
    print(material_text)
    print("-" * 80)
    print(f"PROCEDURE:\n{'-'*80}")
    procedure_text = _get_protocol_steps_text(block) + _get_qa_acc_text(block)
    print(procedure_text)
    print("-" * 80)
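
# For orientation, a run prints four sections separated by "-" rulers, roughly:
#   TITLE:     <_struct.title>
#   OVERVIEW:  abstract + dataset pointer
#   MATERIAL:  entities, software, reference databases, templates
#   PROCEDURE: protocol steps + QA / accompanying-data summary
# (section contents depend on what the given ModelCIF file provides)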
def _main():
    # get file to process
    if len(sys.argv) == 2:
        file_path = sys.argv[1]
        _process_file(file_path)
    else:
        print("USAGE: python test_modelCIF_MA.py [CIF-FILE]")


if __name__ == "__main__":
    _main()