Commit 6e39a68e authored by Studer Gabriel

Modelling with AFDB as template library

parent 570f0e12
# Data generation for AFDB Modelling capabilities in ProMod3
Requires you to download the full proteomes database as described in
https://github.com/deepmind/alphafold/blob/main/afdb/README.md.
This gives one tar file per proteome, which serves as the starting point.
On sciCORE they are located at:
`/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes`
The `afdb_proteom_to_data_chunks.py` script converts a subset of these
tarballs into one data chunk. It reads a list of tarball filenames which
needs to be generated manually, e.g. with something like:
```python
import os
files = os.listdir(<AFDB_PROTEOM_DIR>)
with open("afdb_proteom_files.txt", 'w') as fh:
fh.write('\n'.join(files))
```
`create_commands.py` generates a command file which can be submitted as a
batch job. Carefully check the variables defined at the top and adapt them to
your needs. Before submitting, run one of these commands interactively and
check whether the respective chunk file is created correctly, e.g. with the
sketch below.
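A minimal sketch for such a check, assuming a chunk file `0.pkl` was written
to the chunk output directory (the path is illustrative):
```python
import pickle
import gzip
from ost import io

# each chunk is a pickled list of entries:
# [uniprot_ac, fragment, version, gzip-compressed OMF bytes]
with open("afdb_data_chunks/0.pkl", 'rb') as fh:
    entries = pickle.load(fh)

print(f"chunk contains {len(entries)} entries")
uniprot_ac, fragment, version, omf_bytes = entries[0]
omf = io.OMF.FromBytes(gzip.decompress(omf_bytes))
print(uniprot_ac, fragment, version, omf.GetChainNames())
```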
Once all chunks are there, an indexed database can be created with:
```python
from promod3.modelling import FSStructureServer
fs_server = FSStructureServer.FromDataChunks("afdb_data_chunks", "afdb_fs")
```
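`FromDataChunks` expects the output directory to already exist and optionally
accepts a `chunk_bytes` argument that splits the binary data into several
`fs_data_[x].dat` files instead of a single one. A sketch, with an
illustrative chunk size:
```python
import os
from promod3.modelling import FSStructureServer

os.makedirs("afdb_fs", exist_ok=True)
# split the binary data into files of roughly 100 GB each
fs_server = FSStructureServer.FromDataChunks("afdb_data_chunks", "afdb_fs",
                                             chunk_bytes=100 * 1024**3)
```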
## Data preparation for PentaMatch
The same data chunks are used to extract the sequences that are searched by
PentaMatch. `create_pentamatch_sequences.py` generates a Fasta file with all
sequences of the previously generated FSStructureServer. Execute with:
```bash
ost create_pentamatch_sequences.py --data_chunks <DATA_CHUNK_DIR> --fs_server <FS_SERVER> --out <PENTAMATCH>.fasta
```
The searchable PentaMatch object is generated interactively with:
```python
from promod3.modelling import PentaMatch
PentaMatch.FromSeqList("<PENTAMATCH>.fasta", "<PENTAMATCH_DIR>",
                       entries_from_seqnames=True)
```
Be aware that the command above requires a substantial amount of memory.
For 200e6 entries, 500GB was sufficient.
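Once both objects exist, they can be loaded and queried directly. A minimal
sketch, assuming the directories created above (`afdb_fs` and
`<PENTAMATCH_DIR>`) and a purely illustrative query sequence:
```python
from promod3.modelling import FSStructureServer, PentaMatch

fs_server = FSStructureServer("afdb_fs")
pentamatch = PentaMatch("<PENTAMATCH_DIR>")

# indices returned by TopN refer to entries in fs_server
query = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQ"
for idx in pentamatch.TopN(10, query):
    omf = fs_server.GetOMFByIdx(idx)
    print(idx, omf.GetChainNames())
```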
import os
import tarfile
import gzip
import argparse
import pickle
from ost import io, mol
def parse_args():
desc = ("Parses proteom tarballs from AFDB and dumps pickle files ready "
"for database creation")
parser = argparse.ArgumentParser(description = desc)
parser.add_argument("--afdb_dir", help="Directory containing the "
"tarballs")
parser.add_argument("--tarballs", help=" File containing tarball filenames")
parser.add_argument("--start", help="idx in that file to start", type=int)
parser.add_argument("--end", help="idx in that file to end", type=int)
parser.add_argument("--out_file", help="Output file where stuff gets "
"pickled")
parser.add_argument("--ca_only", help="Whether to use CA-only "
"representation", action="store_true")
return parser.parse_args()
def iterate_proteom(tarball):
""" Helper: iterate over proteome tarball provided by EBI-AFDB
Yields mmcif data
in detail: tuple with types :class:`str`, :class:`ost.mol.EntityHandle`,
:class:`ost.io.MMCifInfo` and :class:`ost.seq.SequenceList`
:param tarball: Path to tarball to iterate
:type tarball: :class:`str`
:returns: Generator of specified tuple
"""
tf = tarfile.open(tarball)
prof = io.profiles.Get("STRICT")
# hack to correct for corrupt cif files in AFDB
corrupt_cit = "_citation.id primary"
correct_cit = "_citation.id 1"
while True:
tar_info = tf.next()
if tar_info is None:
break # we're done
if tar_info.name.endswith("cif.gz"):
fh = tf.extractfile(tar_info)
mmcif_str = gzip.decompress(fh.read()).decode("utf-8")
if corrupt_cit in mmcif_str:
mmcif_str = mmcif_str.replace(corrupt_cit, correct_cit)
ent, mmcif_info, seqres = io.MMCifStrToEntity(mmcif_str,
profile=prof,
process=True)
yield (tar_info.name, ent, mmcif_info, seqres)
def process_proteom(tarball, ca_only=False):
""" Helper: Creates list of entries for one proteom
:param tarball: Path to proteom tarball
:type tarball: :class:`str`
:param ca_only: Whether to select only CA positions
:type ca_only: :class:`bool`
:returns: :class:`list` of :class:`tuple` in the format required by
:func:`FromDataChunks`
"""
omf_opts = io.OMFOption.DEFAULT_PEPLIB | io.OMFOption.LOSSY | \
io.OMFOption.AVG_BFACTORS | io.OMFOption.ROUND_BFACTORS | \
io.OMFOption.SKIP_SS | io.OMFOption.INFER_PEP_BONDS
entries = list()
for entry in iterate_proteom(tarball):
uniprot_ac = entry[0].split('-')[1]
fragment = entry[0].split('-')[2]
version = entry[0].split('-')[3]
version = version.split('_')[1]
version = version.split('.')[0]
ent = entry[1]
if ca_only:
ent = ent.Select("aname=CA")
ent = mol.CreateEntityFromView(ent, False)
ent_name = f"{uniprot_ac} {fragment} {version}"
ent.SetName(ent_name)
omf = io.OMF.FromMMCIF(ent, entry[2], omf_opts)
omf = gzip.compress(omf.ToBytes())
entries.append([uniprot_ac, fragment, version, omf])
return entries
def create_data_chunk(tarballs, afdb_dir, start, end, chunk_out,
ca_only=False):
""" Create full data chunk
Requires preprocessing step of creating a text file that lists all
proteom tarballs.
:param tarballs: Path to file where each line corresponds to one proteom
tarball, i.e. its filename
:type tarballs: :class:`str`
:param afdb_dir: Directory containing the tarball files specified in
*tarballs*
:type afdb_dir: :class:`str`
:param start: Start of tarball slice to be added to chunk
:type start: :class:`int`
:param end: End of tarball slice to be added to chunk (*end* itself is not
added)
:type end: :class:`int`
:param chunk_out: Chunk will be pickled to that file
:type chunk_out: :class:`str`
:param ca_only: Whether to select only CA positions
:type ca_only: :class:`bool`
"""
with open(tarballs, 'r') as fh:
tarballs = fh.readlines()
entries = list()
for tarball in tarballs[start:end]:
full_path = os.path.join(afdb_dir, tarball.strip())
entries += process_proteom(full_path, ca_only=ca_only)
with open(chunk_out, 'wb') as fh:
pickle.dump(entries, fh)
def main():
args = parse_args()
create_data_chunk(args.tarballs, args.afdb_dir, args.start, args.end,
args.out_file, ca_only = args.ca_only)
if __name__ == '__main__':
main()
import os
# path to Singularity container with recent OpenStructure installation
omf_sif = "<PATH_TO_SIF>"
# path to directory that contains AFDB - one tarball per proteom
afdb_dir = "/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes"
# file with one proteom filename per line - in principle the content of os.listdir(afdb_dir)
proteom_file = "<WHEREVER>/afdb_proteom_files.txt"
# number of proteoms that end up in one data chunk
chunk_size = 100
# path to directory where chunks will be dumped
chunk_out_dir = "<WHEREVER>/afdb_data_chunks"
# path to script that does the job
fancy_script = "<WHEREVER>/afdb_proteom_to_data_chunks.py"
# whether structures should be reduced to CA only representation
ca_only = False
# path to file into which the commands are written
cmd_file = "allatom_commands.cmd"
with open(proteom_file, 'r') as fh:
proteom_data = fh.readlines()
N = len(proteom_data)
commands = list()
chunk_idx = 0
for i in range(0, N, chunk_size):
start = i
end = min(N, i+chunk_size)
out_file = os.path.join(chunk_out_dir, str(chunk_idx) + ".pkl")
cmd = ["singularity", "exec", "--bind", f"{afdb_dir}:{afdb_dir}",
omf_sif, "ost", fancy_script,
"--afdb_dir", afdb_dir,
"--tarballs", proteom_file,
"--start", str(start),
"--end", str(end),
"--out_file", out_file]
if ca_only:
cmd.append("--ca_only")
commands.append(' '.join(cmd))
chunk_idx += 1
with open(cmd_file, 'w') as fh:
fh.write('\n'.join(commands))
import os
import tarfile
import gzip
import argparse
import pickle
from ost import io
from promod3.modelling import FSStructureServer
def parse_args():
desc = ("Extracts sequences from pickled files generated with "
"create_fsserver_data_chunks.py and relates them to entries in an "
"FSStructureServer object. I.e. Dumps a sequence list in Fasta "
"format with entry indices as sequence names. This file serves as "
"starting point for an actual PentaMatch object. Requires faf.py "
"to be importable.")
parser = argparse.ArgumentParser(description = desc)
parser.add_argument("--data_chunks", help="Directory with all data chunks "
"generated with create_fsserver_data_chunks.py")
parser.add_argument("--fs_server", help="Directory containing the files "
"for an FSStructureServer object derived from "
"data_chunks directory")
parser.add_argument("--out", help="Outfile in Fasta format that will "
"contain all sequences with their respective index in "
"the underlying FSStructureServer object as names")
return parser.parse_args()
def main():
args = parse_args()
fs_server = FSStructureServer(args.fs_server)
chunk_files = os.listdir(args.data_chunks)
outfile = open(args.out, 'w')
for cf_idx, cf in enumerate(chunk_files):
print(f"processing chunk {cf_idx}, {cf}")
with open(os.path.join(args.data_chunks, cf), 'rb') as fh:
data = pickle.load(fh)
for entry in data:
uniprot_ac = entry[0]
fragment = entry[1]
version = entry[2]
omf_data = entry[3]
idx = fs_server.GetIdx(uniprot_ac, fragment = fragment,
version = version)
omf = io.OMF.FromBytes(gzip.decompress(omf_data))
assert(omf.GetChainNames() == ["A"])
sequence = omf.GetSequence("A")
outfile.write(f">{idx}\n")
seq_chunks = [sequence[i:i+80] for i in range(0, len(sequence), 80)]
outfile.write('\n'.join(seq_chunks))
outfile.write('\n')
outfile.close()
if __name__ == '__main__':
main()
@@ -326,3 +326,34 @@ iteration.
:returns: All found matches
:rtype: :class:`list` of :class:`MotifMatch`
AFDB Modelling
--------------------------------------------------------------------------------
Template-based modelling using AFDB as the template library comes with two
challenges for which |project| provides solutions:
* efficient structure storage for the whole AFDB: :class:`FSStructureServer`
* fast sequence searches with limited sensitivity: :class:`PentaMatch`
The creation of these two objects requires extensive preprocessing. The required
scripts and documentation are available in
`<GIT_ROOT>/extras/data_generation/afdb_modelling`.
Basic modelling functionality is available in the following two functions:
* :func:`AFDBTPLSearch`
* :func:`AFDBModel`
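A minimal usage sketch (the paths are placeholders for the directories created
by the preprocessing scripts):

.. code-block:: python

  from promod3.modelling import FSStructureServer, PentaMatch, AFDBModel

  fs_server = FSStructureServer("<FS_SERVER_DIR>")
  pentamatch = PentaMatch("<PENTAMATCH_DIR>")
  result = AFDBModel(fs_server, pentamatch, "<TARGET_SEQUENCE>")
  # result holds the model, a plDDT based score, the target-template
  # alignment and the template name - all None if no template is found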
.. autofunction:: AFDBTPLSearch
.. autofunction:: AFDBModel
.. autoclass:: FSStructureServer
:members:
:member-order: bysource
.. autoclass:: PentaMatch
:members:
:member-order: bysource
@@ -29,6 +29,7 @@ set(MODELLING_PYMOD
_alignment_fiddling.py
_raw_model.py
_planar_rings.py
_afdb_modelling.py
)
pymod(NAME modelling
...
@@ -28,3 +28,4 @@ from ._monte_carlo import *
from ._mhandle_helper import *
from ._raw_model import *
from ._planar_rings import *
from ._afdb_modelling import *
# Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import gzip
import mmap
import pickle
import numpy as np
import time
import tarfile
import glob
from ost import io
from ost import seq
from ._raw_model import BuildRawModel
from ._pipeline import BuildFromRawModel
from ._modelling import *
class FSStructureServer:
""" FSStructureServer - A filesystem based AFDB structure server
Stores OMF data entries in huge binary files and extracts the respective
binary data blobs with an indexing mechanism. Efficient reading of these
huge files is delegated to the OS using the Python mmap module.
Data creation happens with the static creator function :func:`FromDataChunks`.
The required preprocessing must be done with external scripts.
:param db_dir: Directory containing required files. 1) pos.dat (see
:attr:`pos`) 2) length.dat (see :attr:`length`)
3) chunk.dat (see :attr:`chunk`) 4) indexer.dat (see
:attr:`indexer`) 5) search_keys.dat (see :attr:`search_keys`)
6) one or several files with pattern fs_data_[x].dat (see :attr:`data`)
:type db_dir: :class:`str`
"""
def __init__(self, db_dir):
super().__init__()
self._pos = None
self._length = None
self._chunk = None
self._indexer = None
self._search_keys = None
self._data = None
self._data_fh = None
self._n_entries = None
# only check if required files are there, they're lazy loaded
self.pos_file = os.path.join(db_dir, "pos.dat")
self.length_file = os.path.join(db_dir, "length.dat")
self.chunk_file = os.path.join(db_dir, "chunk.dat")
self.search_key_file = os.path.join(db_dir, "search_keys.dat")
self.indexer_file = os.path.join(db_dir, "indexer.dat")
self.data_files = glob.glob(os.path.join(db_dir, "fs_data_*.dat"))
if not os.path.exists(self.pos_file):
raise RuntimeError(f"Exp \"pos.dat\" file in db_dir ({db_dir})")
if not os.path.exists(self.length_file):
raise RuntimeError(f"Exp \"length.dat\" file in db_dir ({db_dir})")
if not os.path.exists(self.chunk_file):
raise RuntimeError(f"Exp \"chunk.dat\" file in db_dir ({db_dir})")
if not os.path.exists(self.indexer_file):
raise RuntimeError(f"Exp \"indexer.dat\" file in db_dir ({db_dir})")
# there can be one or more data files, just check whether there is at
# least one
if len(self.data_files) == 0:
raise RuntimeError(f"Exp at least one data file in form "
f"\"fs_data_[x].dat\" in db_dir ({db_dir})")
def __del__(self):
if self._data:
for fh in self._data:
fh.close()
for fh in self._data_fh:
fh.close()
@property
def pos(self):
""" Start positions of data in internal linear memory layout
:type: :class:`np.ndarray` with dtype np.int64
"""
if self._pos is None:
with open(self.pos_file, 'rb') as fh:
self._pos = np.load(fh)
return self._pos
@property
def length(self):
""" Lengths of data in internal linear memory layout
:type: :class:`np.ndarray` with dtype np.int32
"""
if self._length is None:
with open(self.length_file, 'rb') as fh:
self._length = np.load(fh)
return self._length
@property
def chunk(self):
""" Chunk, in which entry is stored
:type: :class:`np.ndarray` with dtype np.int16
"""
if self._chunk is None:
with open(self.chunk_file, 'rb') as fh:
self._chunk = np.load(fh)
return self._chunk
@property
def indexer(self):
""" Internal data structure - Relates entries with data indices
:type: :class:`np.ndarray` with dtype np.int32
"""
if self._indexer is None:
with open(self.indexer_file, 'rb') as fh:
self._indexer = np.load(fh)
return self._indexer
@property
def search_keys(self):
""" Internal data structure - Relates entries with data indices
"""
if self._search_keys is None:
with open(self.search_key_file, 'rb') as fh:
self._search_keys = np.load(fh)
return self._search_keys
@property
def data(self):
""" Internal binary data in memory mapped files
:type: :class:`list` of :class:`mmap.mmap`
"""
if self._data is None:
self._data = list()
self._data_fh = list()
tmp = list()
for f in self.data_files:
idx = int(f.split('_')[-1].replace(".dat", ""))
tmp.append((idx, f))
tmp.sort()
sorted_data_files = [x[1] for x in tmp]
for f in sorted_data_files:
self._data_fh.append(open(f, 'r+b'))
self._data.append(mmap.mmap(self._data_fh[-1].fileno(), 0))
return self._data
@property
def n_entries(self):
""" Number of entries
:type: :class:`int`
"""
if self._n_entries is None:
self._n_entries = self.pos.shape[0]
return self._n_entries
def GetIdx(self, uniprot_ac, fragment="F1", version="v4"):
""" Get internal idx of stored data
Can be used for data retrieval with :func:`GetOMFByIdx`
:param uniprot_ac: Uniprot AC
:type uniprot_ac: :class:`str`
:param fragment: AFDB entries are potentially split to fragments
99.999% of all entries only have one fragment: F1
:type fragment: :class:`str`
:param version: Version of entry
:type version: :class:`str`
:returns: Internal idx of that entry
:raises: :class:`RuntimeError` if no such entry exists
"""
if not fragment.startswith('F'):
raise RuntimeError("expect fragment to start with F")
if not version.startswith('v'):
raise RuntimeError("expect version to start with v")
search_key = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
int(version[1:]))
idx = np.searchsorted(self.search_keys, np.uint64(search_key))
if idx != len(self.indexer) and self.search_keys[idx] == search_key:
return self.indexer[idx]
raise RuntimeError(f"No entry for {uniprot_ac} {fragment} "
f"{version}")
def GetOMFByIdx(self, idx):
""" Get stored OMF data structure
:param idx: Internal index which can be derived from :func:`GetIdx`
:type idx: :class:`int`
:returns: OMF data structure of type :class:`ost.io.OMF`
:raises: :class:`RuntimeError` if *idx* is out of range
"""
if idx < 0 or idx >= self.n_entries:
raise RuntimeError(f"Invalid idx, must be in [0, {self.n_entries-1}]")
pos = self.pos[idx]
length = self.length[idx]
chunk = self.chunk[idx]
omf_data = self.data[chunk][pos:pos+length]
return io.OMF.FromBytes(gzip.decompress(omf_data))
def GetOMF(self, uniprot_ac, fragment="F1", version="v4"):
""" Get stored OMF data structure
:param uniprot_ac: Uniprot AC
:type uniprot_ac: :class:`str`
:param fragment: AFDB entries are potentially split to fragments
99.999% of all entries only have one fragment: F1
:type fragment: :class:`str`
:param version: Version of entry
:type version: :class:`str`
:returns: OMF data structure of type :class:`ost.io.OMF`
:raises: :class:`RuntimeError` if no such entry exists
"""
idx = self.GetIdx(uniprot_ac, fragment = fragment, version = version)
return self.GetOMFByIdx(idx)
@staticmethod
def FromDataChunks(chunk_dir, db_dir, chunk_bytes=None):
""" Static method to create new database from preprocessed data
Data preprocessing consists of creating several data chunks that are
pickled to disk.
In detail: each chunk is a pickled file containing a list, where each
entry is a tuple with 4 elements: 1) uniprot_ac (:class:`str`)
2) fragment (:class:`str`) 3) version (:class:`str`) 4) structure data
(:class:`ost.io.OMF` object which has been written to a bytes object and
compressed with gzip)
The data itself is again stored in chunks of binary data which are
indexed.
:param chunk_dir: Path to directory containing described data chunks
:type chunk_dir: :class:`str`
:param db_dir: Output directory - Creates all files that are needed for
:class:`FSStructureServer`
:type db_dir: :class:`str`
:param chunk_bytes: Size in bytes of binary data chunks in final
database - default None: Everything ends up in one
single chunk
:type chunk_bytes: :class:`int`
:returns: :class:`FSStructureServer` with all data from *chunk_dir*
"""
if not os.path.exists(db_dir):
raise RuntimeError(f"{db_dir} does not exist")
positions = list()
lengths = list()
chunks = list()
search_keys = list()
indexer = list()
chunk_files = os.listdir(chunk_dir)
chunk_files = [f for f in chunk_files if f.endswith(".pkl")]
current_pos = 0
current_chunk = 0
current_entry = 0
data_file = open(os.path.join(db_dir, "fs_data_0.dat"), 'wb')
t0 = time.time()
for cf_idx, cf in enumerate(chunk_files):
print(f"processing chunk {cf_idx}, {cf}")
with open(os.path.join(chunk_dir, cf), 'rb') as fh:
data = pickle.load(fh)
for entry in data:
uniprot_ac = entry[0]
fragment = entry[1]
version = entry[2]
data_bytes = entry[3]
length = len(data_bytes)
data_file.write(data_bytes)
positions.append(current_pos)
lengths.append(length)
chunks.append(current_chunk)
k = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
int(version[1:]))
search_keys.append(k)
indexer.append(current_entry)
current_pos += length
current_entry += 1
if chunk_bytes and current_pos > chunk_bytes:
data_file.close()
current_chunk += 1
f = os.path.join(db_dir, f"fs_data_{current_chunk}.dat")
data_file = open(f, 'wb')
current_pos = 0
data_file.close()
print(f"done processing chunks ({round(time.time() - t0, 3)}s)")
print("sort indexer matrix")
t0 = time.time()
# make search keys searchable by bisect search => sort
search_keys = np.asarray(search_keys, dtype=np.uint64)
indexer = np.asarray(indexer, dtype=np.int32)
sort_indices = np.argsort(search_keys)
search_keys = search_keys[sort_indices]
indexer = indexer[sort_indices]
with open(os.path.join(db_dir, "search_keys.dat"), 'wb') as fh:
np.save(fh, search_keys)
with open(os.path.join(db_dir, "indexer.dat"), 'wb') as fh:
np.save(fh, indexer)
with open(os.path.join(db_dir, "pos.dat"), 'wb') as fh:
np.save(fh, np.array(positions, dtype=np.int64))
with open(os.path.join(db_dir, "length.dat"), 'wb') as fh:
np.save(fh, np.array(lengths, dtype=np.int32))
with open(os.path.join(db_dir, "chunk.dat"), 'wb') as fh:
np.save(fh, np.array(chunks, dtype=np.int16))
fs_server = FSStructureServer(db_dir)
return fs_server
class PentaMatch:
""" Pentamer matching for fast sequence searches
:class:`PentaMatch` targets fast sequence searches with low sensitivity,
specifically searching the full AFDB. It stores all unique pentamers
for each search sequence. Given a query sequence, it computes the number of
matching pentamers with respect to each search sequence and returns the top
hits.
:param db_dir: Directory containing all required files (indexer.dat,
pos.dat, length.dat, meta.dat). New :class:`PentaMatch`
objects can be derived using the static :func:`FromSeqList`
creator.
:type db_dir: :class:`str`
:raises: :class:`RuntimeError` if any required file is missing in *db_dir*
"""
def __init__(self, db_dir):
self._indexer = None
self._pos = None
self._length = None
self._N = None
self.db_dir = db_dir
# only check if required files are there, they're lazy loaded
self.indexer_file = os.path.join(db_dir, "indexer.dat")
self.pos_file = os.path.join(db_dir, "pos.dat")
self.length_file = os.path.join(db_dir, "length.dat")
self.meta_file = os.path.join(db_dir, "meta.dat")
if not os.path.exists(self.indexer_file):
raise RuntimeError(f"Exp \"indexer.dat\" file in db_dir ({db_dir})")
if not os.path.exists(self.pos_file):
raise RuntimeError(f"Exp \"pos.dat\" file in db_dir ({db_dir})")
if not os.path.exists(self.length_file):
raise RuntimeError(f"Exp \"length.dat\" file in db_dir ({db_dir})")
if not os.path.exists(self.meta_file):
raise RuntimeError(f"Exp \"meta.dat\" file in db_dir ({db_dir})")
@property
def indexer(self):
""" Entry indices for pentamers
Entry data for one pentamer can be extracted with the respective values in
:attr:`pos` and :attr:`length`
:type: memory mapped :class:`np.ndarray` of dtype np.int32
"""
if self._indexer is None:
self._indexer = np.memmap(self.indexer_file, dtype=np.int32,
mode='r')
return self._indexer
@property
def pos(self):
""" Start position for each pentamer in :attr:`indexer`
:type: :class:`np.ndarray` of dtype np.int64 with 3.2E6 entries
"""
if self._pos is None:
self._pos = np.fromfile(self.pos_file, dtype=np.int64)
return self._pos
@property
def length(self):
""" Length for each pentamer in :attr:`indexer`
:type: :class:`np.ndarray` of dtype np.int32 with 3.2E6 entries
"""
if self._length is None:
self._length = np.fromfile(self.length_file, dtype=np.int32)
return self._length
@property
def N(self):
""" Number of entries in underlying :class:`FSStructureServer`
:type: :class:`int`
"""
if self._N is None:
with open(self.meta_file, 'r') as fh:
self._N = int(fh.read()) + 1 # the file stores the max idx => +1
return self._N
def TopN(self, N, sequence, return_counts=False, unique_pentamers=True):
""" Find top-N matches given *sequence*
Identifies unique pentamers in *sequence* and counts the number of
occurrences in each entry. Returns the top-N entries with respect
to counts.
:param N: Number of results to return
:type N: :class:`int`
:param sequence: Sequence to search
:type sequence: :class:`str`
:param return_counts: Additionally return underlying pentamer counts
:type return_counts: :class:`bool`
:param unique_pentamers: Set to True if :attr:`indexer` contains only
unique pentamers for each entry. This way we
can use faster methods to accumulate counts.
In detail: accumulator[indices] += 1 is much
faster than np.add.at(accumulator, indices, 1).
But the latter is required if there are
duplicates.
:type unique_pentamers: :class:`bool`
:returns: :class:`list` of :class:`int` with length *N*. If
*return_counts* is true, the :class:`list` contains
:class:`tuple` with two elements: 1) count 2) index.
:raises: :class:`RuntimeError` if N is invalid or sequence is shorter
than 5 characters
"""
if N <=0:
raise RuntimeError(f"N ({N}) must be larger than 0")
if N > self.N:
raise RuntimeError(f"N ({N}) larger than actual entries ({self.N})")
if len(sequence) < 5:
raise RuntimeError(f"sequence must have length >=5, got: {sequence}")
pentamers = list()
SeqToPentamerIndices(sequence, True, pentamers)
# int32 accumulator for pentamer match counts
accumulator = np.zeros(self.N, dtype=np.int32)
if unique_pentamers:
for p in pentamers:
pos = self.pos[p]
length = self.length[p]
accumulator[self.indexer[pos:pos+length]] += 1
else:
for p in pentamers:
pos = self.pos[p]
length = self.length[p]
np.add.at(accumulator, self.indexer[pos:pos+length], 1)
top_n_indices = np.argpartition(accumulator, -N)[-N:]
top_n_counts = accumulator[top_n_indices]
# top_n_indices is not sorted by counts => sort by counts and return
tmp = [(a,b) for a,b in zip(top_n_counts, top_n_indices)]
tmp.sort(reverse=True)
if return_counts:
return tmp
else:
return [x[1] for x in tmp]
@staticmethod
def FromSeqList(fasta_file, db_dir, entries_from_seqnames=False):
""" Creates PentaMatch object from Fasta file
:param fasta_file: Path to Fasta file with sequences
:type fasta_file: :class:`str`
:param db_dir: Files required for :class:`PentaMatch` will be dumped
here, will be created if non-existent.
:type db_dir: :class:`str`
:param entries_from_seqnames: If set to False, indices returned by
:func:`TopN` refer to position in
*fasta_file*. If set to True, integer
indices are parsed from sequence name.
:type entries_from_seqnames: :class:`bool`
:returns: :class:`PentaMatch`
:raises: :class:`ost.Error` if *entries_from_seqnames* is True but a
sequence name cannot be cast to int.
"""
slist = io.LoadSequenceList(fasta_file)
if not os.path.exists(db_dir):
os.makedirs(db_dir)
CreatePentaMatch(slist, db_dir, entries_from_seqnames)
return PentaMatch(db_dir)
def AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
seqid_thresh = 70, tpl_n = 5):
""" Searches *tpl_n* templates in *fs_server*/*pentamatch*
Step 1: Identifies *pentamatch_n* sequences in *pentamatch* with largest
number of matching pentamers with respect to *trg_seq*.
Step 2: Generate pairwise alignments with :func:`ost.seq.alg.LocalAlign`
and only retain the ones with seqid >= *seqid_thresh*.
Step 3: Extract respective templates from *fs_server* and score them by
the sum of plDDT of aligned residues divided by *trg_seq* length.
Step 4: Return the top *tpl_n* (or fewer)
:param fs_server: Structure database - The AFDB
:type fs_server: :class:`FSStructureServer`
:param pentamatch: Pentamatch object specific for *fs_server*
:type pentamatch: :class:`PentaMatch`
:param trg_seq: Target sequence
:type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
:param pentamatch_n: Number of sequences that are initially searched in
*pentamatch*
:type pentamatch_n: :class:`int`
:param seqid_thresh: Sequence identity threshold [0-100] for an alignment to
be considered further
:type seqid_thresh: :class:`int`
:param tpl_n: Number of templates that are finally returned based on
described scoring
:type tpl_n: :class:`int`
:returns: :class:`list` of pairs with first element being the tpl score,
the second element being a :class:`ost.seq.AlignmentHandle` with
first sequence being *trg_seq* and second sequence the hit found
in *fs_server* with structure attached.
"""
top_n = pentamatch.TopN(pentamatch_n, str(trg_seq))
if isinstance(trg_seq, str):
trg_seq = seq.CreateSequence("A", trg_seq)
tmp = list()
for idx in top_n:
omf = fs_server.GetOMFByIdx(idx)
assert(omf.GetChainNames() == ["A"])
omf_s = omf.GetSequence("A")
aln = seq.alg.LocalAlign(trg_seq, seq.CreateSequence("A", omf_s),
seq.alg.BLOSUM62)[0]
if seq.alg.SequenceIdentity(aln) >= seqid_thresh:
bfactors = omf.GetBFactors("A")
summed_bfac = 0.0
current_pos = aln.GetSequence(1).offset
for col in aln:
if col[0] != '-' and col[1] != '-':
summed_bfac += bfactors[current_pos]
if col[1] != '-':
current_pos += 1
score = summed_bfac / len(trg_seq)
tmp.append((score, aln, omf))
tmp.sort(reverse=True, key=lambda x: x[0])
return_list = list()
for item in tmp[:tpl_n]:
# the alignments are local, expand the first sequence to
# *trg_seq*, i.e. make sure that the offset is 0
aln = item[1]
if aln.GetSequence(0).offset > 0:
s1 = aln.GetSequence(0)
s2 = aln.GetSequence(1)
s1_prefix = trg_seq[:s1.offset]
s2_prefix = '-' * s1.offset
new_s1 = seq.CreateSequence(s1.name, s1_prefix + str(s1))
new_s2 = seq.CreateSequence(s2.name, s2_prefix + str(s2))
new_s2.SetOffset(s2.offset)
aln = seq.CreateAlignment(new_s1, new_s2)
aln.AttachView(1, item[2].GetAUChain("A").CreateFullView())
return_list.append((item[0], aln))
return return_list
def AFDBModel(fs_server, pentamatch, trg_seq):
""" Build model with AFDB as template library
:param fs_server: Structure database - The AFDB
:type fs_server: :class:`FSStructureServer`
:param pentamatch: Pentamatch object specific for *fs_server*
:type pentamatch: :class:`PentaMatch`
:param trg_seq: Target sequence
:type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
:returns: :class:`tuple` with 4 elements. 1: The model 2: The model score
based on plDDT 3: Pairwise alignment (first seq: *trg_seq*,
second seq: tpl seq) 4: Template name (formatted as
"<uniprot AC> <AFDB_fragment> <AFDB_version> <chain name>").
If no appropriate template can be found, all 4 elements are None.
"""
tpl_list = AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
seqid_thresh=70, tpl_n = 1)
if len(tpl_list):
score = tpl_list[0][0]
aln = tpl_list[0][1]
mhandle = BuildRawModel(aln)
model = BuildFromRawModel(mhandle)
name = aln.GetSequence(1).GetAttachedView().GetName()
return (model, score, aln, name)
return (None, None, None, None)
__all__ = ('FSStructureServer', 'PentaMatch', 'AFDBTPLSearch', 'AFDBModel')