Skip to content
Snippets Groups Projects
Commit 6e39a68e authored by Studer Gabriel's avatar Studer Gabriel
Browse files

Modelling with AFDB as template library

parent 570f0e12
No related branches found
No related tags found
No related merge requests found
Data generation for AFDB Modelling capabilities in ProMod3
Requires you to download the full proteomes database as described in
https://github.com/deepmind/alphafold/blob/main/afdb/README.md.
This gives one tar file per proteome which serves as starting point.
On sciCORE, they are located here:
`/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes`
The `afdb_proteom_to_data_chunks.py` script generates one such chunk.
It reads a list of filenames which needs to be generated manually.
Something like:
```python
import os
files = os.listdir(<AFDB_PROTEOM_DIR>)
with open("afdb_proteom_files.txt", 'w') as fh:
fh.write('\n'.join(files))
```
`create_commands.py` generates a command file which can be submitted
as batch job. Carefully check the variables defined on top and adapt to your
needs. Run one of these commands interactively to check whether the
respective chunk file is created correctly before submitting the batch job.
Once all chunks are there, an indexed database can be created with:
```python
from promod3.modelling import FSStructureServer
fs_server = FSStructureServer.FromDataChunks("afdb_data_chunks", "afdb_fs")
```
## Data preparation for PentaMatch
The same data chunks are used to extract the sequences that are searched by
PentaMatch. `create_pentamatch_sequences.py` generates a Fasta file with all
sequences of the previously generated FSStructureServer. Execute with:
```bash
ost create_pentamatch_sequences.py --data_chunks <DATA_CHUNK_DIR> --fs_server <FS_SERVER> --out <PENTAMATCH>.fasta
```
The searchable PentaMatch object is generated interactively with:
```python
from promod3.modelling import PentaMatch
PentaMatch.FromSeqList("<PENTAMATCH>.fasta", "<PENTAMATCH_DIR>",
entries_from_seqnames=True)
```
Be aware that the command above requires a substantial amount of memory.
For 200e6 entries, 500GB was sufficient.
import argparse
import gzip
import os
import pickle
import tarfile

from ost import io
from ost import mol
def parse_args(argv=None):
    """ Parse command line arguments.

    :param argv: Argument list to parse; defaults to ``sys.argv[1:]`` when
                 *None*, preserving the original call signature.
    :type argv: :class:`list` of :class:`str`
    :returns: Parsed arguments as :class:`argparse.Namespace`
    """
    desc = ("Parses proteom tarballs from AFDB and dumps pickle files ready "
            "for database creation")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--afdb_dir", help="Directory containing the "
                        "tarballs")
    parser.add_argument("--tarballs", help="File containing tarball filenames")
    parser.add_argument("--start", help="idx in that file to start", type=int)
    parser.add_argument("--end", help="idx in that file to end", type=int)
    parser.add_argument("--out_file", help="Output file where stuff gets "
                        "pickled")
    parser.add_argument("--ca_only", help="Whether to use CA-only "
                        "representation", action="store_true")
    return parser.parse_args(argv)
def iterate_proteom(tarball):
    """ Helper: iterate over proteome tarball provided by EBI-AFDB

    Yields mmCIF-derived data, in detail: tuple with types :class:`str`,
    :class:`ost.mol.EntityHandle`, :class:`ost.io.MMCifInfo` and
    :class:`ost.seq.SequenceList`

    :param tarball: Path to tarball to iterate
    :type tarball: :class:`str`
    :returns: Generator of specified tuple
    """
    prof = io.profiles.Get("STRICT")
    # hack to correct for corrupt cif files in AFDB
    corrupt_cit = "_citation.id primary"
    correct_cit = "_citation.id 1"
    # context manager guarantees the tar handle is closed even if parsing
    # raises or the consumer abandons the generator early
    with tarfile.open(tarball) as tf:
        while True:
            tar_info = tf.next()
            if tar_info is None:
                break  # we're done
            if tar_info.name.endswith("cif.gz"):
                fh = tf.extractfile(tar_info)
                mmcif_str = gzip.decompress(fh.read()).decode("utf-8")
                if corrupt_cit in mmcif_str:
                    mmcif_str = mmcif_str.replace(corrupt_cit, correct_cit)
                ent, mmcif_info, seqres = io.MMCifStrToEntity(mmcif_str,
                                                              profile=prof,
                                                              process=True)
                yield (tar_info.name, ent, mmcif_info, seqres)
def process_proteom(tarball, ca_only=False):
    """ Helper: Creates list of entries for one proteom

    :param tarball: Path to proteom tarball
    :type tarball: :class:`str`
    :param ca_only: Whether to select for only CA positions
    :type ca_only: :class:`bool`
    :returns: :class:`list` of :class:`tuple` in format required by
              :func:`FromDataChunks`
    """
    omf_opts = io.OMFOption.DEFAULT_PEPLIB | io.OMFOption.LOSSY | \
               io.OMFOption.AVG_BFACTORS | io.OMFOption.ROUND_BFACTORS | \
               io.OMFOption.SKIP_SS | io.OMFOption.INFER_PEP_BONDS
    entries = list()
    for entry in iterate_proteom(tarball):
        # AFDB filenames look like AF-<AC>-F<fragment>-model_v<version>.cif.gz;
        # split once instead of re-splitting per field
        tokens = entry[0].split('-')
        uniprot_ac = tokens[1]
        fragment = tokens[2]
        version = tokens[3].split('_')[1].split('.')[0]
        ent = entry[1]
        if ca_only:
            ent = ent.Select("aname=CA")
            # requires the ost mol module (fixed missing import at file top)
            ent = mol.CreateEntityFromView(ent, False)
        ent.SetName(f"{uniprot_ac} {fragment} {version}")
        omf = io.OMF.FromMMCIF(ent, entry[2], omf_opts)
        omf = gzip.compress(omf.ToBytes())
        entries.append([uniprot_ac, fragment, version, omf])
    return entries
def create_data_chunk(tarballs, afdb_dir, start, end, chunk_out,
                      ca_only=False):
    """ Create full data chunk

    Requires preprocessing step of creating a text file that lists all
    proteom tarballs.

    :param tarballs: Path to file where each line corresponds to one proteom
                     tarball, i.e. its filename
    :type tarballs: :class:`str`
    :param afdb_dir: Directory containing the tarball files specified in
                     *tarballs*
    :type afdb_dir: :class:`str`
    :param start: Start of tarball slice to be added to chunk
    :type start: :class:`int`
    :param end: End of tarball slice to be added to chunk (*end* won't be
                added...)
    :type end: :class:`int`
    :param chunk_out: Chunk will be pickled to that file
    :type chunk_out: :class:`str`
    :param ca_only: Whether to select for only CA positions
    :type ca_only: :class:`bool`
    """
    with open(tarballs, 'r') as fh:
        fnames = [line.strip() for line in fh.readlines()]
    entries = list()
    for fname in fnames[start:end]:
        full_path = os.path.join(afdb_dir, fname)
        entries += process_proteom(full_path, ca_only=ca_only)
    with open(chunk_out, 'wb') as fh:
        pickle.dump(entries, fh)
def main():
    """ Entry point: build one data chunk as specified on the command line. """
    args = parse_args()
    create_data_chunk(args.tarballs, args.afdb_dir, args.start, args.end,
                      args.out_file, ca_only=args.ca_only)


if __name__ == '__main__':
    main()
import os

# path to Singularity container with recent OpenStructure installation
omf_sif = "<PATH_TO_SIF>"
# path to directory that contains AFDB - one tarball per proteom
afdb_dir = "/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes"
# file with one proteom filename per line - in principle the content of os.listdir(afdb_dir)
proteom_file = "<WHEREVER>/afdb_proteom_files.txt"
# number of proteoms that end up in one data chunk
chunk_size = 100
# path to directory where chunks will be dumped
chunk_out_dir = "<WHEREVER>/afdb_data_chunks"
# path to script that does the job
fancy_script = "<WHEREVER>/afdb_proteom_to_data_chunks.py"
# whether structures should be reduced to CA only representation
ca_only = False
# path to file into which the commands are written
cmd_file = "allatom_commands.cmd"

with open(proteom_file, 'r') as fh:
    n_proteoms = len(fh.readlines())

# one command per chunk of chunk_size proteoms; the slice [start, end) is
# resolved inside the worker script against the same proteom_file
commands = list()
for chunk_idx, start in enumerate(range(0, n_proteoms, chunk_size)):
    end = min(n_proteoms, start + chunk_size)
    out_file = os.path.join(chunk_out_dir, f"{chunk_idx}.pkl")
    cmd = ["singularity", "exec", "--bind", f"{afdb_dir}:{afdb_dir}",
           omf_sif, "ost", fancy_script,
           "--afdb_dir", afdb_dir,
           "--tarballs", proteom_file,
           "--start", str(start),
           "--end", str(end),
           "--out_file", out_file]
    if ca_only:
        cmd.append("--ca_only")
    commands.append(' '.join(cmd))

with open(cmd_file, 'w') as fh:
    fh.write('\n'.join(commands))
import os
import tarfile
import gzip
import argparse
import pickle
from ost import io
from promod3.modelling import FSStructureServer
def parse_args(argv=None):
    """ Parse command line arguments.

    :param argv: Argument list to parse; defaults to ``sys.argv[1:]`` when
                 *None*, preserving the original call signature.
    :type argv: :class:`list` of :class:`str`
    :returns: Parsed arguments as :class:`argparse.Namespace`
    """
    desc = ("Extracts sequences from pickled files generated with "
            "create_fsserver_data_chunks.py and relates them to entries in an "
            "FSStructureServer object. I.e. Dumps a sequence list in Fasta "
            "format with entry indices as sequence names. This file serves as "
            "starting point for an actual PentaMatch object. Requires faf.py "
            "to be importable.")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--data_chunks", help="Directory with all data chunks "
                        "generated with create_fsserver_data_chunks.py")
    parser.add_argument("--fs_server", help="Directory containing the files "
                        "for an FSStructureServer object derived from "
                        "data_chunks directory")
    parser.add_argument("--out", help="Outfile in Fasta format that will "
                        "contain all sequences with their respective index in "
                        "the underlying FSStructureServer object as names")
    return parser.parse_args(argv)
def main():
    """ Dump all sequences of an FSStructureServer as Fasta.

    Sequence names are the entry indices in the FSStructureServer so the
    resulting file can directly seed PentaMatch creation.
    """
    args = parse_args()
    fs_server = FSStructureServer(args.fs_server)
    chunk_files = os.listdir(args.data_chunks)
    # with-statement guarantees the Fasta file is closed/flushed even when
    # an exception is raised mid-loop (original leaked the handle then)
    with open(args.out, 'w') as outfile:
        for cf_idx, cf in enumerate(chunk_files):
            print(f"processing chunk {cf_idx}, {cf}")
            with open(os.path.join(args.data_chunks, cf), 'rb') as fh:
                data = pickle.load(fh)
            for entry in data:
                uniprot_ac, fragment, version, omf_data = entry
                idx = fs_server.GetIdx(uniprot_ac, fragment=fragment,
                                       version=version)
                omf = io.OMF.FromBytes(gzip.decompress(omf_data))
                # AFDB models are expected to be single chain "A";
                # anything else indicates corrupt chunk data
                assert omf.GetChainNames() == ["A"]
                sequence = omf.GetSequence("A")
                outfile.write(f">{idx}\n")
                # wrap sequence at 80 characters per line
                seq_chunks = [sequence[i:i+80]
                              for i in range(0, len(sequence), 80)]
                outfile.write('\n'.join(seq_chunks))
                outfile.write('\n')


if __name__ == '__main__':
    main()
......@@ -326,3 +326,34 @@ iteration.
:returns: All found matches
:rtype: :class:`list` of :class:`MotifMatch`
AFDB Modelling
--------------------------------------------------------------------------------
Template based modelling using AFDB as template library comes with two
challenges for which |project| provides solutions:
* efficient structure storage for the whole AFDB: :class:`FSStructureServer`
* fast sequence searches with limited sensitivity: :class:`PentaMatch`
The creation of these two objects requires extensive preprocessing. The required
scripts and documentation are available in
`<GIT_ROOT>/extras/data_generation/afdb_modelling`.
Basic modelling functionality is available in the following two functions:
* :func:`AFDBTPLSearch`
* :func:`AFDBModel`
.. autofunction:: AFDBTPLSearch
.. autofunction:: AFDBModel
.. autoclass:: FSStructureServer
:members:
:member-order: bysource
.. autoclass:: PentaMatch
:members:
:member-order: bysource
......@@ -29,6 +29,7 @@ set(MODELLING_PYMOD
_alignment_fiddling.py
_raw_model.py
_planar_rings.py
_afdb_modelling.py
)
pymod(NAME modelling
......
......@@ -28,3 +28,4 @@ from ._monte_carlo import *
from ._mhandle_helper import *
from ._raw_model import *
from ._planar_rings import *
from ._afdb_modelling import *
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment