From 27db3776407cf1ca84acc3635a6f5e428610e916 Mon Sep 17 00:00:00 2001
From: Stefan Bienert <stefan.bienert@unibas.ch>
Date: Thu, 12 Oct 2023 10:12:38 +0200
Subject: [PATCH] black & PEP8

---
 .../translate2modelcif.py                     | 291 ++++++++++--------
 1 file changed, 161 insertions(+), 130 deletions(-)
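
Note: the reformatting below is consistent with black at a 79-column line
length, plus manual PEP 8 cleanups (import grouping, two blank lines
between top-level definitions, inline-comment spacing). A sketch of the
presumed invocation; the exact flags are an assumption and are not
recorded in this patch:

    black --line-length 79 projects/dark-matter-metagenomics/translate2modelcif.py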

diff --git a/projects/dark-matter-metagenomics/translate2modelcif.py b/projects/dark-matter-metagenomics/translate2modelcif.py
index cd3a91d..fd44a6d 100644
--- a/projects/dark-matter-metagenomics/translate2modelcif.py
+++ b/projects/dark-matter-metagenomics/translate2modelcif.py
@@ -1,27 +1,18 @@
 #! /usr/local/bin/ost
 # -*- coding: utf-8 -*-
-
 """Translate models for Sergey from PDB + extra data into ModelCIF."""
 
-# EXAMPLES for running:
-"""
-ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \
-    ./web_dloads/pivot ./modelcif --prefix=F000347 \
-    --pdb-web-path=./web_dloads/pdb \
-    --refseq-path=./web_dloads/consensus_all.fasta
-"""
-# NOTE: add "--compress" for final runs
-
 from timeit import default_timer as timer
 import argparse
 import filecmp
 import gzip
 import os
-import pandas as pd
 import shutil
 import sys
 import zipfile
 
+import pandas as pd
+
 import ihm
 import ihm.citations
 import modelcif
@@ -34,6 +25,16 @@ import modelcif.reference
 from ost import io
 
 
+# EXAMPLES for running:
+"""
+ost translate2modelcif.py ./raw_data ./raw_data/ptm_plddt.all.txt \
+    ./web_dloads/pivot ./modelcif --prefix=F000347 \
+    --pdb-web-path=./web_dloads/pdb \
+    --refseq-path=./web_dloads/consensus_all.fasta
+"""
+# NOTE: add "--compress" for final runs
+
+
 def _abort_msg(msg, exit_code=1):
     """Write error message and exit with exit_code."""
     print(f"{msg}\nAborting.", file=sys.stderr)
@@ -105,7 +106,7 @@ def _parse_args():
         type=str,
         metavar="<PREFIX>",
         default="",
-        help="Only process families starting with given prefix. By default " \
+        help="Only process families starting with given prefix. By default "
         + "all families are processed.",
     )
     parser.add_argument(
@@ -120,7 +121,7 @@ def _parse_args():
         type=str,
         metavar="<PDB WEB PATH>",
         default=None,
-        help="Optional path to directory with F*.pdb files as available on " \
+        help="Optional path to directory with F*.pdb files as available on "
         + "NMPFamsDB web site. Used to check if top ranked model the same.",
     )
     parser.add_argument(
@@ -128,7 +129,7 @@ def _parse_args():
         type=str,
         metavar="<PDB REFSEQ PATH>",
         default=None,
-        help="Optional path to fasta file with all ref. seq. as available on " \
+        help="Optional path to fasta file with all ref. seq. as available on "
         + "NMPFamsDB web site. Used to check if it matches with MSA.",
     )
     opts = parser.parse_args()
@@ -155,26 +156,32 @@ def _parse_args():
 # pylint: disable=too-few-public-methods
 class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
     """Predicted accuracy according to the TM-score score in [0,1]"""
+
     name = "pTM"
     software = None
 
 
 class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
     """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
     name = "pLDDT"
     software = None
 
 
 class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
     """Predicted accuracy according to the CA-only lDDT in [0,100]"""
+
     name = "pLDDT"
     software = None
 
 
 class _NmpfamsdbTrgRef(modelcif.reference.TargetReference):
     """PDB as target reference."""
+
     name = "Other"
     other_details = "NMPFamsDB"
+
+
 # pylint: enable=too-few-public-methods
 
 
@@ -213,12 +220,11 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
         self.plddts = []
         for res in self.ost_entity.residues:
             b_factors = [a.b_factor for a in res.atoms]
-            assert len(set(b_factors)) == 1 # must all be equal!
+            assert len(set(b_factors)) == 1  # must all be equal!
             self.plddts.append(b_factors[0])
 
         super().__init__(*args, **kwargs)
 
-
     def get_atoms(self):
         # ToDo [internal]: Take B-factor out since it's not a B-factor?
         # NOTE: this assumes that _get_res_num maps residue to pos. in seqres
@@ -247,7 +253,6 @@ class _OST2ModelCIF(modelcif.model.AbInitioModel):
             )
         )
 
-
         # local scores
         i = 0
         for chn_i in self.ost_entity.chains:
@@ -296,8 +301,9 @@ def _get_audit_authors():
 
 def _get_metadata(metadata_file):
     """Read csv file with metedata and prepare for next steps."""
-    metadata = pd.read_csv(metadata_file, sep=' ',
-                           names=["ID", "mdl", "pTM", "pLDDT"])
+    metadata = pd.read_csv(
+        metadata_file, sep=" ", names=["ID", "mdl", "pTM", "pLDDT"]
+    )
     return metadata
 
 
@@ -305,18 +311,20 @@ def _get_pdb_files(model_base_dir):
     """Collect PDB files from pub_data_* folders.
     Returns dict with key = family name and value = list of paths to PDB files.
     """
-    pdb_files_split = dict() # to return
-    pdb_files_raw = set() # to check for duplicates
-    pub_paths = [f for f in os.listdir(model_base_dir) \
-                 if f.startswith("pub_data_")]
+    pdb_files_split = dict()  # to return
+    pdb_files_raw = set()  # to check for duplicates
+    pub_paths = [
+        f for f in os.listdir(model_base_dir) if f.startswith("pub_data_")
+    ]
     # NOTE: we sort pub_paths to ensure that pub_data_02 is before _03
     for pub_path in sorted(pub_paths):
         sub_path = os.path.join(model_base_dir, pub_path)
-        pdb_files_new = [f for f in os.listdir(sub_path) \
-                         if not f.startswith(".")]
+        pdb_files_new = [
+            f for f in os.listdir(sub_path) if not f.startswith(".")
+        ]
         for f in pdb_files_new:
             f_path = os.path.join(sub_path, f)
-            f_name = f.split('_')[0]
+            f_name = f.split("_")[0]
             if f_name in pdb_files_split:
                 pdb_files_split[f_name].append(f_path)
             else:
@@ -325,21 +333,27 @@ def _get_pdb_files(model_base_dir):
         pdb_files_new_set = set(pdb_files_new)
         new_duplicates = pdb_files_raw.intersection(pdb_files_new_set)
         if new_duplicates:
-            _warn_msg(f"{len(new_duplicates)} duplicated files found in " \
-                      f"{sub_path}.")
+            _warn_msg(
+                f"{len(new_duplicates)} duplicated files found in "
+                f"{sub_path}."
+            )
         pdb_files_raw = pdb_files_raw.union(pdb_files_new_set)
     return pdb_files_split
 
 
 def _get_config():
     """Define AF setup."""
-    msa_description = "MSA created by calculating the central or \"pivot\" " \
-                      "sequence of each seed MSA, and refining each " \
-                      "alignment using that sequence as the guide."
-    mdl_description = "Model generated using AlphaFold (v2.0.0 with models " \
-                      "finetuned to return ptm weights) producing 5 models, " \
-                      "without model relaxation, without templates, ranked " \
-                      "by pLDDT, starting from a custom MSA."
+    msa_description = (
+        'MSA created by calculating the central or "pivot" '
+        "sequence of each seed MSA, and refining each "
+        "alignment using that sequence as the guide."
+    )
+    mdl_description = (
+        "Model generated using AlphaFold (v2.0.0 with models "
+        "finetuned to return ptm weights) producing 5 models, "
+        "without model relaxation, without templates, ranked "
+        "by pLDDT, starting from a custom MSA."
+    )
     af_config = {}
     return {
         "af_config": af_config,
@@ -382,58 +396,62 @@ def _get_protocol_steps_and_software(config_data):
     step["output"] = "model"
     # get software
     if config_data["use_multimer"]:
-        step["software"] = [{
-            "name": "AlphaFold-Multimer",
-            "classification": "model building",
-            "description": "Structure prediction",
-            "citation": ihm.Citation(
-                pmid=None,
-                title="Protein complex prediction with "
-                + "AlphaFold-Multimer.",
-                journal="bioRxiv",
-                volume=None,
-                page_range=None,
-                year=2021,
-                authors=[
-                    "Evans, R.",
-                    "O'Neill, M.",
-                    "Pritzel, A.",
-                    "Antropova, N.",
-                    "Senior, A.",
-                    "Green, T.",
-                    "Zidek, A.",
-                    "Bates, R.",
-                    "Blackwell, S.",
-                    "Yim, J.",
-                    "Ronneberger, O.",
-                    "Bodenstein, S.",
-                    "Zielinski, M.",
-                    "Bridgland, A.",
-                    "Potapenko, A.",
-                    "Cowie, A.",
-                    "Tunyasuvunakool, K.",
-                    "Jain, R.",
-                    "Clancy, E.",
-                    "Kohli, P.",
-                    "Jumper, J.",
-                    "Hassabis, D.",
-                ],
-                doi="10.1101/2021.10.04.463034",
-            ),
-            "location": "https://github.com/deepmind/alphafold",
-            "type": "package",
-            "version": config_data["af_version"],
-        }]
+        step["software"] = [
+            {
+                "name": "AlphaFold-Multimer",
+                "classification": "model building",
+                "description": "Structure prediction",
+                "citation": ihm.Citation(
+                    pmid=None,
+                    title="Protein complex prediction with "
+                    + "AlphaFold-Multimer.",
+                    journal="bioRxiv",
+                    volume=None,
+                    page_range=None,
+                    year=2021,
+                    authors=[
+                        "Evans, R.",
+                        "O'Neill, M.",
+                        "Pritzel, A.",
+                        "Antropova, N.",
+                        "Senior, A.",
+                        "Green, T.",
+                        "Zidek, A.",
+                        "Bates, R.",
+                        "Blackwell, S.",
+                        "Yim, J.",
+                        "Ronneberger, O.",
+                        "Bodenstein, S.",
+                        "Zielinski, M.",
+                        "Bridgland, A.",
+                        "Potapenko, A.",
+                        "Cowie, A.",
+                        "Tunyasuvunakool, K.",
+                        "Jain, R.",
+                        "Clancy, E.",
+                        "Kohli, P.",
+                        "Jumper, J.",
+                        "Hassabis, D.",
+                    ],
+                    doi="10.1101/2021.10.04.463034",
+                ),
+                "location": "https://github.com/deepmind/alphafold",
+                "type": "package",
+                "version": config_data["af_version"],
+            }
+        ]
     else:
-        step["software"] = [{
-            "name": "AlphaFold",
-            "classification": "model building",
-            "description": "Structure prediction",
-            "citation": ihm.citations.alphafold2,
-            "location": "https://github.com/deepmind/alphafold",
-            "type": "package",
-            "version": config_data["af_version"],
-        }]
+        step["software"] = [
+            {
+                "name": "AlphaFold",
+                "classification": "model building",
+                "description": "Structure prediction",
+                "citation": ihm.citations.alphafold2,
+                "location": "https://github.com/deepmind/alphafold",
+                "type": "package",
+                "version": config_data["af_version"],
+            }
+        ]
     step["software_parameters"] = config_data["af_config"]
     protocol.append(step)
 
@@ -449,11 +467,13 @@ def _get_model_details(fam_name):
     """Get the model description."""
     db_url = f"https://bib.fleming.gr/NMPFamsDB/family?id={fam_name}"
     # TODO: check if ok to use HTML for the URL
-    db_url = f"<a href=\"{db_url}\" target=\"_blank\">{db_url}</a>"
-    return f"Model generated using AlphaFold (v2.0.0) for the " \
-           f"\"Representative Sequence\" of NMPFamsDB Metagenome / " \
-           f"Metatranscriptome Family {fam_name}.\n\nSee {db_url} for " \
-           f"additional details."
+    db_url = f'<a href="{db_url}" target="_blank">{db_url}</a>'
+    return (
+        f"Model generated using AlphaFold (v2.0.0) for the "
+        f'"Representative Sequence" of NMPFamsDB Metagenome / '
+        f"Metatranscriptome Family {fam_name}.\n\nSee {db_url} for "
+        f"additional details."
+    )
 
 
 def _get_model_group_name():
@@ -466,8 +486,7 @@ def _get_sequence(chn, use_auth=False):
     # initialise (add gaps if first is not at num. 1)
     lst_rn = _get_res_num(chn.residues[0], use_auth)
     idx = 1
-    sqe = "-" * (lst_rn - 1) \
-        + chn.residues[0].one_letter_code
+    sqe = "-" * (lst_rn - 1) + chn.residues[0].one_letter_code
 
     for res in chn.residues[idx:]:
         lst_rn += 1
@@ -484,28 +503,26 @@ def _get_entities(pdb_file, ref_seq, fam_name):
 
     ost_ent = io.LoadPDB(pdb_file)
     if ost_ent.chain_count != 1:
-        raise RuntimeError(
-            f"Unexpected oligomer in {pdb_file}"
-        )
+        raise RuntimeError(f"Unexpected oligomer in {pdb_file}")
     chn = ost_ent.chains[0]
     sqe_gaps = _get_sequence(chn)
 
     # NOTE: can have gaps to accommodate "X" in ref_seq
-    exp_seq = sqe_gaps.replace('-', 'X')
+    exp_seq = sqe_gaps.replace("-", "X")
     len_diff = len(ref_seq.string) - len(exp_seq)
     if len_diff > 0:
-        exp_seq += 'X' * len_diff
+        exp_seq += "X" * len_diff
     if exp_seq != ref_seq.string:
         raise RuntimeError(f"Sequence in {pdb_file} does not match ref_seq")
 
     # TODO: waiting for input on whether they change handling of "X" in seq
     # -> HERE: assuming that "X" were in modelled sequence and PDB can have gaps
     cif_ent = {
-        "seqres": ref_seq.string, # HACK for testing: .replace('X', 'A')
+        "seqres": ref_seq.string,  # HACK for testing: .replace('X', 'A')
         "pdb_sequence": sqe_gaps,
         "pdb_chain_id": [_get_ch_name(chn, False)],
         "fam_name": fam_name,
-        "description": f"Representative Sequence of NMPFamsDB Family {fam_name}"
+        "description": f"Representative Sequence of NMPFamsDB Family {fam_name}",
     }
 
     return [cif_ent], ost_ent
@@ -524,14 +541,15 @@ def _get_modelcif_entities(target_ents, asym_units, system):
                     cif_ent["fam_name"],
                     cif_ent["fam_name"],
                     align_begin=1,
-                    align_end=len(cif_ent["seqres"])
+                    align_end=len(cif_ent["seqres"]),
                 )
             ],
         )
         # NOTE: this assigns (potentially new) alphabetic chain names
         for pdb_chain_id in cif_ent["pdb_chain_id"]:
             asym_units[pdb_chain_id] = modelcif.AsymUnit(
-                mdlcif_ent, strand_id=pdb_chain_id,
+                mdlcif_ent,
+                strand_id=pdb_chain_id,
             )
         system.target_entities.append(mdlcif_ent)
 
@@ -543,7 +561,7 @@ def _get_assoc_aln_file(fle_path):
     cfile = modelcif.associated.File(
         fle_path,
         details="Custom MSA for modelling",
-        data=modelcif.data.Data("Custom MSA for modelling")
+        data=modelcif.data.Data("Custom MSA for modelling"),
     )
     cfile.file_format = "fasta"
     cfile.file_content = "multiple sequence alignments"
@@ -640,8 +658,8 @@ def _get_modelcif_protocol(protocol_steps, target_entities, aln_data, model):
 
 def _compress_cif_file(cif_file):
     """Compress cif file and delete original."""
-    with open(cif_file, 'rb') as f_in:
-        with gzip.open(cif_file + '.gz', 'wb') as f_out:
+    with open(cif_file, "rb") as f_in:
+        with gzip.open(cif_file + ".gz", "wb") as f_out:
             shutil.copyfileobj(f_in, f_out)
     os.remove(cif_file)
 
@@ -669,9 +687,7 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress):
 
     # create an asymmetric unit and an entity per target sequence
     asym_units = {}
-    _get_modelcif_entities(
-        data_json["target_entities"], asym_units, system
-    )
+    _get_modelcif_entities(data_json["target_entities"], asym_units, system)
 
     # audit_authors
     system.authors.extend(data_json["audit_authors"])
@@ -697,13 +713,14 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress):
 
     # handle additional files
     aln_file = _get_assoc_aln_file(data_json["aln_file_name"])
-    system.repositories.append(
-        _get_associated_files(mdl_name, [aln_file])
-    )
+    system.repositories.append(_get_associated_files(mdl_name, [aln_file]))
 
     # get data and steps
     protocol = _get_modelcif_protocol(
-        data_json["protocol"], system.target_entities, aln_file.data, model,
+        data_json["protocol"],
+        system.target_entities,
+        aln_file.data,
+        model,
     )
     system.protocols.append(protocol)
 
@@ -713,7 +730,7 @@ def _store_as_modelcif(data_json, ost_ent, out_dir, mdl_name, compress):
     # copy aln file to compress them
     shutil.copyfile(
         data_json["aln_file_path"],
-        os.path.join(out_dir, data_json["aln_file_name"])
+        os.path.join(out_dir, data_json["aln_file_name"]),
     )
     # NOTE: we change path and back while being exception-safe to handle zipfile
     oldpwd = os.getcwd()
@@ -736,8 +753,10 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
 
     # TODO: unclear what to do with such cases; skipped for now
     if len(metadata_fam) != 5:
-        _warn_msg(f"Unexpected number of {len(metadata_fam)} models in " \
-                  f"metadata for family {f_name}. Skipping...")
+        _warn_msg(
+            f"Unexpected number of {len(metadata_fam)} models in "
+            f"metadata for family {f_name}. Skipping..."
+        )
         return
     #
 
@@ -762,18 +781,23 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
     # TODO: if we need to handle files without ALN, this needs fixing
     # -> e.g. 11 extra models in pub_data_* cannot be handled right now
     if not os.path.exists(aln_path):
-        _warn_msg(f"Cannot deal with missing MSA for {f_name} (yet). " \
-                  f"Skipping...")
+        _warn_msg(
+            f"Cannot deal with missing MSA for {f_name} (yet). " f"Skipping..."
+        )
         return
     #
-    aln = io.LoadAlignment(aln_path) # note: this checks that it's an actual MSA
+    aln = io.LoadAlignment(
+        aln_path
+    )  # note: this checks that it's an actual MSA
     ref_seq = aln.sequences[0]
     if ref_seq_check is not None and ref_seq_check.string != ref_seq.string:
         raise RuntimeError(f"Sequence mismatch for {f_name}")
     # TODO: allow "X" (or whatever can be used to label unknown AA) if needed
     if "X" in ref_seq.string:
-        _warn_msg(f"Cannot deal with 'X' in ref_seq for {f_name} (yet). " \
-                  f"Skipping...")
+        _warn_msg(
+            f"Cannot deal with 'X' in ref_seq for {f_name} (yet). "
+            f"Skipping..."
+        )
         return
     #
 
@@ -814,8 +838,9 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
     if opts.pdb_web_path is not None:
         pdb_file_web = os.path.join(opts.pdb_web_path, f"{f_name}.pdb")
         if not filecmp.cmp(pdb_file, pdb_file_web):
-            raise RuntimeError(f"PDB file mismatch web vs top-ranked for " \
-                               f"{f_name}")
+            raise RuntimeError(
+                f"PDB file mismatch web vs top-ranked for " f"{f_name}"
+            )
 
     # fill annotations
     mdlcf_json["title"] = _get_title(f_name)
@@ -824,22 +849,27 @@ def _translate2modelcif(f_name, opts, metadata_fam, pdb_files, ref_seq_check):
     print(f" ({timer()-pstart:.2f}s)")
 
     # save ModelCIF
-    _store_as_modelcif(mdlcf_json, ost_ent, opts.out_dir, mdl_id, opts.compress)
+    _store_as_modelcif(
+        mdlcf_json, ost_ent, opts.out_dir, mdl_id, opts.compress
+    )
 
     # check if result can be read and has expected seq.
     ent, ss = io.LoadMMCIF(mdl_path, seqres=True)
-    exp_seqs = [trg_ent["pdb_sequence"] \
-                for trg_ent in mdlcf_json["target_entities"]]
+    exp_seqs = [
+        trg_ent["pdb_sequence"] for trg_ent in mdlcf_json["target_entities"]
+    ]
     assert ent.chain_count == len(exp_seqs), f"Bad chain count {mdl_id}"
     # here we expect auth = label IDs
     ent_seq = "".join([_get_sequence(chn, False) for chn in ent.chains])
     ent_seq_a = "".join([_get_sequence(chn, True) for chn in ent.chains])
     assert ent_seq == ent_seq_a
     assert ent_seq == "".join(exp_seqs), f"Bad seq. {mdl_id}"
-    ent_seqres = "".join([ss.FindSequence(chn.name).string \
-                          for chn in ent.chains])
-    exp_seqres = "".join([trg_ent["seqres"] \
-                         for trg_ent in mdlcf_json["target_entities"]])
+    ent_seqres = "".join(
+        [ss.FindSequence(chn.name).string for chn in ent.chains]
+    )
+    exp_seqres = "".join(
+        [trg_ent["seqres"] for trg_ent in mdlcf_json["target_entities"]]
+    )
     assert ent_seqres == exp_seqres, f"Bad seqres {mdl_id}"
 
     print(f"  ... done with {mdl_id} ({timer()-pdb_start:.2f}s).")
@@ -864,7 +894,8 @@ def _main():
     for f_name in sorted(pdb_files_split):
         if f_name.startswith(opts.prefix):
             _translate2modelcif(
-                f_name, opts,
+                f_name,
+                opts,
                 metadata_full[metadata_full.ID == f_name],
                 pdb_files_split[f_name],
                 refseqs.FindSequence(f_name) if refseqs is not None else None,
-- 
GitLab