diff --git a/extras/data_generation/afdb_modelling/README.md b/extras/data_generation/afdb_modelling/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..6690891ad1b6dc41868e05201ac22471b92c509d
--- /dev/null
+++ b/extras/data_generation/afdb_modelling/README.md
@@ -0,0 +1,53 @@
+# Data generation for AFDB Modelling capabilities in ProMod3
+
+Requires you to download the full proteomes database as described in
+https://github.com/deepmind/alphafold/blob/main/afdb/README.md.
+
+This gives one tar file per proteome which serves as the starting point.
+
+On sciCORE they are located at:
+`/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes`
+
+These tarballs are then converted into data chunks: pickled files with
+structure data ready for database creation. The
+`afdb_proteom_to_data_chunks.py` script generates one such chunk. It reads a
+list of tarball filenames which needs to be generated manually. Something
+like:
+
+```python
+import os
+files = os.listdir(<AFDB_PROTEOM_DIR>)
+with open("afdb_proteom_files.txt", 'w') as fh:
+    fh.write('\n'.join(files))
+```
+
+`create_commands.py` generates a command file which can be submitted as a
+batch job. Carefully check the variables defined at the top and adapt them to
+your needs. Before submitting, run one of these commands interactively to
+check whether the respective chunk file is created correctly.
+
+Once all chunks are there, an indexed database can be created with:
+
+```python
+from promod3.modelling import FSStructureServer
+fs_server = FSStructureServer.FromDataChunks("afdb_data_chunks", "afdb_fs")
+```
+
+## Data preparation for PentaMatch
+
+The same data chunks are used to extract the sequences that are searched by
+PentaMatch. `create_pentamatch_sequences.py` generates a Fasta file with all
+sequences of the previously generated FSStructureServer. Execute with:
+
+```bash
+ost create_pentamatch_sequences.py --data_chunks <DATA_CHUNK_DIR> --fs_server <FS_SERVER> --out <PENTAMATCH>.fasta
+```
+
+The searchable PentaMatch object is generated interactively with:
+
+```python
+from promod3.modelling import PentaMatch
+PentaMatch.FromSeqList("<PENTAMATCH>.fasta", "<PENTAMATCH_DIR>",
+                       entries_from_seqnames=True)
+```
+
+Be aware that the command above requires a substantial amount of memory.
+For 200e6 entries, 500GB was sufficient.
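+
+Once both objects are created, they can be queried interactively. Below a
+minimal sketch, assuming the directories created above and a Uniprot AC that
+is present in the data:
+
+```python
+from promod3.modelling import FSStructureServer, PentaMatch
+
+fs_server = FSStructureServer("afdb_fs")
+pentamatch = PentaMatch("<PENTAMATCH_DIR>")
+
+# structure data comes back as ost.io.OMF object
+omf = fs_server.GetOMF("P69905")
+
+# indices of the 10 entries sharing the most pentamers with the query;
+# the matching structures can be fetched with fs_server.GetOMFByIdx
+top_hits = pentamatch.TopN(10, "MVLSPADKTNVKAAWGKVGAHAGEYGAEALERMFLSF")
+```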
diff --git a/extras/data_generation/afdb_modelling/afdb_proteom_to_data_chunks.py b/extras/data_generation/afdb_modelling/afdb_proteom_to_data_chunks.py
new file mode 100644
index 0000000000000000000000000000000000000000..895d7e44c8a83c012606e2d465ccb32e6041f88a
--- /dev/null
+++ b/extras/data_generation/afdb_modelling/afdb_proteom_to_data_chunks.py
@@ -0,0 +1,126 @@
+import os
+import tarfile
+import gzip
+import argparse
+import pickle
+
+from ost import io
+from ost import mol
+
+def parse_args():
+    desc = ("Parses proteome tarballs from AFDB and dumps pickle files ready "
+            "for database creation")
+
+    parser = argparse.ArgumentParser(description = desc)
+    parser.add_argument("--afdb_dir", help="Directory containing the "
+                        "tarballs")
+    parser.add_argument("--tarballs", help="File containing tarball filenames")
+    parser.add_argument("--start", help="idx in that file to start", type=int)
+    parser.add_argument("--end", help="idx in that file to end", type=int)
+    parser.add_argument("--out_file", help="Output file where stuff gets "
+                        "pickled")
+    parser.add_argument("--ca_only", help="Whether to use CA-only "
+                        "representation", action="store_true")
+    return parser.parse_args()
+
+def iterate_proteom(tarball):
+    """ Helper: iterate over proteome tarball provided by EBI-AFDB
+
+    Yields mmCIF data, in detail: a tuple with types :class:`str`,
+    :class:`ost.mol.EntityHandle`, :class:`ost.io.MMCifInfo` and
+    :class:`ost.seq.SequenceList`
+
+    :param tarball: Path to tarball to iterate
+    :type tarball: :class:`str`
+    :returns: Generator of specified tuple
+    """
+    tf = tarfile.open(tarball)
+    prof = io.profiles.Get("STRICT")
+
+    # hack to correct for corrupt cif files in AFDB
+    corrupt_cit = "_citation.id primary"
+    correct_cit = "_citation.id 1"
+    while True:
+        tar_info = tf.next()
+        if tar_info is None:
+            break # we're done
+        if tar_info.name.endswith("cif.gz"):
+            fh = tf.extractfile(tar_info)
+            mmcif_str = gzip.decompress(fh.read()).decode("utf-8")
+            if corrupt_cit in mmcif_str:
+                mmcif_str = mmcif_str.replace(corrupt_cit, correct_cit)
+            ent, mmcif_info, seqres = io.MMCifStrToEntity(mmcif_str,
+                                                          profile=prof,
+                                                          process=True)
+            yield (tar_info.name, ent, mmcif_info, seqres)
+
+def process_proteom(tarball, ca_only=False):
+    """ Helper: Creates list of entries for one proteome
+
+    :param tarball: Path to proteome tarball
+    :type tarball: :class:`str`
+    :param ca_only: Whether to select for only CA positions
+    :type ca_only: :class:`bool`
+    :returns: :class:`list` of :class:`tuple` in the format required by
+              :func:`FSStructureServer.FromDataChunks`
+    """
+    omf_opts = io.OMFOption.DEFAULT_PEPLIB | io.OMFOption.LOSSY | \
+               io.OMFOption.AVG_BFACTORS | io.OMFOption.ROUND_BFACTORS | \
+               io.OMFOption.SKIP_SS | io.OMFOption.INFER_PEP_BONDS
+
+    entries = list()
+    for entry in iterate_proteom(tarball):
+        # filenames in the AFDB tarballs look like:
+        # AF-<uniprot AC>-<fragment>-model_<version>.cif.gz
+        uniprot_ac = entry[0].split('-')[1]
+        fragment = entry[0].split('-')[2]
+        version = entry[0].split('-')[3]
+        version = version.split('_')[1]
+        version = version.split('.')[0]
+        ent = entry[1]
+        if ca_only:
+            ent = ent.Select("aname=CA")
+            ent = mol.CreateEntityFromView(ent, False)
+        ent_name = f"{uniprot_ac} {fragment} {version}"
+        ent.SetName(ent_name)
+        omf = io.OMF.FromMMCIF(ent, entry[2], omf_opts)
+        omf = gzip.compress(omf.ToBytes())
+        entries.append([uniprot_ac, fragment, version, omf])
+    return entries
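+
+# Illustrative sketch of what this script produces (not executed here):
+# each chunk file written by create_data_chunk below unpickles to a list
+# of entries as returned by process_proteom, e.g.
+#
+#   with open("0.pkl", 'rb') as fh:
+#       entries = pickle.load(fh)
+#   uniprot_ac, fragment, version, omf_data = entries[0]
+#   # e.g. "P69905", "F1", "v4" and a gzip-compressed ost.io.OMF bytes object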
+
+def create_data_chunk(tarballs, afdb_dir, start, end, chunk_out,
+                      ca_only=False):
+    """ Create full data chunk
+
+    Requires a preprocessing step: creating a text file that lists all
+    proteome tarballs.
+
+    :param tarballs: Path to file where each line corresponds to one proteome
+                     tarball, i.e. its filename
+    :type tarballs: :class:`str`
+    :param afdb_dir: Directory containing the tarball files specified in
+                     *tarballs*
+    :type afdb_dir: :class:`str`
+    :param start: Start of tarball slice to be added to chunk
+    :type start: :class:`int`
+    :param end: End of tarball slice to be added to chunk (exclusive, *end*
+                itself won't be added)
+    :type end: :class:`int`
+    :param chunk_out: Chunk will be pickled to that file
+    :type chunk_out: :class:`str`
+    :param ca_only: Whether to select for only CA positions
+    :type ca_only: :class:`bool`
+    """
+    with open(tarballs, 'r') as fh:
+        tarballs = fh.readlines()
+    entries = list()
+    for tarball in tarballs[start:end]:
+        full_path = os.path.join(afdb_dir, tarball.strip())
+        entries += process_proteom(full_path, ca_only=ca_only)
+    with open(chunk_out, 'wb') as fh:
+        pickle.dump(entries, fh)
+
+def main():
+    args = parse_args()
+    create_data_chunk(args.tarballs, args.afdb_dir, args.start, args.end,
+                      args.out_file, ca_only = args.ca_only)
+
+if __name__ == '__main__':
+    main()
diff --git a/extras/data_generation/afdb_modelling/create_commands.py b/extras/data_generation/afdb_modelling/create_commands.py
new file mode 100644
index 0000000000000000000000000000000000000000..f83e41f9be5516d1987d4864c146eb009f086389
--- /dev/null
+++ b/extras/data_generation/afdb_modelling/create_commands.py
@@ -0,0 +1,52 @@
+import os
+
+# path to Singularity container with recent OpenStructure installation
+omf_sif = "<PATH_TO_SIF>"
+
+# path to directory that contains AFDB - one tarball per proteome
+afdb_dir = "/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes"
+
+# file with one proteome filename per line - in principle the content of
+# os.listdir(afdb_dir)
+proteom_file = "<WHEREVER>/afdb_proteom_files.txt"
+
+# number of proteomes that end up in one data chunk
+chunk_size = 100
+
+# path to directory where chunks will be dumped
+chunk_out_dir = "<WHEREVER>/afdb_data_chunks"
+
+# path to script that does the job
+fancy_script = "<WHEREVER>/afdb_proteom_to_data_chunks.py"
+
+# whether structures should be reduced to CA-only representation
+ca_only = False
+
+# path to file into which the commands are written
+cmd_file = "allatom_commands.cmd"
+
+
+with open(proteom_file, 'r') as fh:
+    proteom_data = fh.readlines()
+N = len(proteom_data)
+
+commands = list()
+chunk_idx = 0
+for i in range(0, N, chunk_size):
+    start = i
+    end = min(N, i+chunk_size)
+    out_file = os.path.join(chunk_out_dir, str(chunk_idx) + ".pkl")
+    cmd = ["singularity", "exec", "--bind", f"{afdb_dir}:{afdb_dir}",
+           omf_sif, "ost", fancy_script,
+           "--afdb_dir", afdb_dir,
+           "--tarballs", proteom_file,
+           "--start", str(start),
+           "--end", str(end),
+           "--out_file", out_file]
+    if ca_only:
+        cmd.append("--ca_only")
+    commands.append(' '.join(cmd))
+    chunk_idx += 1
+
+with open(cmd_file, 'w') as fh:
+    fh.write('\n'.join(commands))
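+
+# With the variables above, each generated line looks roughly like
+# (one line per chunk, wrapped here for readability):
+#
+#   singularity exec --bind <AFDB_DIR>:<AFDB_DIR> <PATH_TO_SIF> ost \
+#       <WHEREVER>/afdb_proteom_to_data_chunks.py --afdb_dir <AFDB_DIR> \
+#       --tarballs <WHEREVER>/afdb_proteom_files.txt \
+#       --start 0 --end 100 --out_file <WHEREVER>/afdb_data_chunks/0.pkl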
"create_fsserver_data_chunks.py and relates them to entries in an " + "FSStructureServer object. I.e. Dumps a sequence list in Fasta " + "format with entry indices as sequence names. This file serves as " + "starting point for an actual PentaMatch object. Requires faf.py " + "to be importable.") + + parser = argparse.ArgumentParser(description = desc) + parser.add_argument("--data_chunks", help="Directory with all data chunks " + "generated with create_fsserver_data_chunks.py") + parser.add_argument("--fs_server", help="Directory containing the files " + "for an FSStructureServer object derived from " + "data_chunks directory") + parser.add_argument("--out", help="Outfile in Fasta format that will " + "contain all sequences with their respective index in " + "the underlying FSStructureServer object as names") + return parser.parse_args() + + +def main(): + args = parse_args() + fs_server = FSStructureServer(args.fs_server) + chunk_files = os.listdir(args.data_chunks) + outfile = open(args.out, 'w') + + for cf_idx, cf in enumerate(chunk_files): + print(f"processing chunk {cf_idx}, {cf}") + with open(os.path.join(args.data_chunks, cf), 'rb') as fh: + data = pickle.load(fh) + for entry in data: + uniprot_ac = entry[0] + fragment = entry[1] + version = entry[2] + omf_data = entry[3] + idx = fs_server.GetIdx(uniprot_ac, fragment = fragment, + version = version) + omf = io.OMF.FromBytes(gzip.decompress(omf_data)) + assert(omf.GetChainNames() == ["A"]) + sequence = omf.GetSequence("A") + outfile.write(f">{idx}\n") + seq_chunks = [sequence[i:i+80] for i in range(0, len(sequence), 80)] + outfile.write('\n'.join(seq_chunks)) + outfile.write('\n') + outfile.close() + +if __name__ == '__main__': + main() + diff --git a/modelling/doc/algorithms.rst b/modelling/doc/algorithms.rst index b36bf5bb77ef1ee30a716469a803e652a4420a5e..c9db60f1f7f57dc8ac74e00d8db94b9cb018c4be 100644 --- a/modelling/doc/algorithms.rst +++ b/modelling/doc/algorithms.rst @@ -326,3 +326,34 @@ iteration. :returns: All found matches :rtype: :class:`list` of :class:`MotifMatch` + + +AFDB Modelling +-------------------------------------------------------------------------------- + +Template based modelling using AFDB as template library comes with two +challenges for which |project| provides solutions: + +* efficient structure storage for the whole AFDB: :class:`FSStructureServer` +* fast sequence searches with limited sensitivity: :class:`PentaMatch` + +The creation of these two object requires extensive preprocessing. The required +scripts and documentation are available in +`<GIT_ROOT>/extras/data_generation/afdb_modelling`. + +Basic modelling functionality is available in the following two functions: + +* :func:`AFDBTPLSearch` +* :func:`AFDBModel` + +.. autofunction:: AFDBTPLSearch + +.. autofunction:: AFDBModel + +.. autoclass:: FSStructureServer + :members: + :member-order: bysource + +.. 
+
+.. autofunction:: AFDBTPLSearch
+
+.. autofunction:: AFDBModel
+
+.. autoclass:: FSStructureServer
+  :members:
+  :member-order: bysource
+
+.. autoclass:: PentaMatch
+  :members:
+  :member-order: bysource
diff --git a/modelling/pymod/CMakeLists.txt b/modelling/pymod/CMakeLists.txt
index 22cc8f3305c84e6e72132282e8b46b38624b42f0..63b25b975f4af1a0ac3918c8b8b48b2219d4409c 100644
--- a/modelling/pymod/CMakeLists.txt
+++ b/modelling/pymod/CMakeLists.txt
@@ -29,6 +29,7 @@ set(MODELLING_PYMOD
   _alignment_fiddling.py
   _raw_model.py
   _planar_rings.py
+  _afdb_modelling.py
 )
 
 pymod(NAME modelling
diff --git a/modelling/pymod/__init__.py b/modelling/pymod/__init__.py
index a0eb62b66c12c597a411d19967d500d3f75918b5..c195865ecc45e91a0d314ac107fe9a02c0ccdbc3 100644
--- a/modelling/pymod/__init__.py
+++ b/modelling/pymod/__init__.py
@@ -28,3 +28,4 @@ from ._monte_carlo import *
 from ._mhandle_helper import *
 from ._raw_model import *
 from ._planar_rings import *
+from ._afdb_modelling import *
diff --git a/modelling/pymod/_afdb_modelling.py b/modelling/pymod/_afdb_modelling.py
new file mode 100644
index 0000000000000000000000000000000000000000..51fb3d958bce0cca271b8fd311756703a9bc7e2a
--- /dev/null
+++ b/modelling/pymod/_afdb_modelling.py
@@ -0,0 +1,585 @@
+# Copyright (c) 2013-2023, SIB - Swiss Institute of Bioinformatics and
+# Biozentrum - University of Basel
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import gzip
+import mmap
+import pickle
+import numpy as np
+import time
+import glob
+
+from ost import io
+from ost import seq
+from ._raw_model import BuildRawModel
+from ._pipeline import BuildFromRawModel
+from ._modelling import *
+
+class FSStructureServer:
+    """ FSStructureServer - A filesystem based AFDB structure server
+
+    Stores OMF data entries in huge binary files and extracts the respective
+    binary data blobs with an indexing mechanism. Efficient reading of these
+    huge files is delegated to the OS using the Python mmap module.
+
+    Data creation happens with the static :func:`FromDataChunks` function.
+    The required preprocessing must be done with external scripts.
+
+    :param db_dir: Directory containing required files:
+                   1) pos.dat (see :attr:`pos`)
+                   2) length.dat (see :attr:`length`)
+                   3) chunk.dat (see :attr:`chunk`)
+                   4) indexer.dat (see :attr:`indexer`)
+                   5) search_keys.dat (see :attr:`search_keys`)
+                   6) one or several files with pattern fs_data_[x].dat
+                   (see :attr:`data`)
+    :type db_dir: :class:`str`
+    """
+    def __init__(self, db_dir):
+        super().__init__()
+
+        self._pos = None
+        self._length = None
+        self._chunk = None
+        self._indexer = None
+        self._search_keys = None
+        self._data = None
+        self._data_fh = None
+        self._n_entries = None
+
+        # only check if required files are there, they're lazy loaded
+        self.pos_file = os.path.join(db_dir, "pos.dat")
+        self.length_file = os.path.join(db_dir, "length.dat")
+        self.chunk_file = os.path.join(db_dir, "chunk.dat")
+        self.search_key_file = os.path.join(db_dir, "search_keys.dat")
+        self.indexer_file = os.path.join(db_dir, "indexer.dat")
+        self.data_files = glob.glob(os.path.join(db_dir, "fs_data_*.dat"))
+        if not os.path.exists(self.pos_file):
+            raise RuntimeError(f"Expected \"pos.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.length_file):
+            raise RuntimeError(f"Expected \"length.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.chunk_file):
+            raise RuntimeError(f"Expected \"chunk.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.search_key_file):
+            raise RuntimeError(f"Expected \"search_keys.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.indexer_file):
+            raise RuntimeError(f"Expected \"indexer.dat\" file in db_dir "
+                               f"({db_dir})")
+        # there can be one or more data files, just check whether there is at
+        # least one
+        if len(self.data_files) == 0:
+            raise RuntimeError(f"Exp at least one data file in form "
+                               f"\"fs_data_[x].dat\" in db_dir ({db_dir})")
+
+    def __del__(self):
+        if self._data:
+            for fh in self._data:
+                fh.close()
+            for fh in self._data_fh:
+                fh.close()
+
+    @property
+    def pos(self):
+        """ Start positions of data in internal linear memory layout
+
+        :type: :class:`np.ndarray` with dtype np.int64
+        """
+        if self._pos is None:
+            with open(self.pos_file, 'rb') as fh:
+                self._pos = np.load(fh)
+        return self._pos
+
+    @property
+    def length(self):
+        """ Lengths of data in internal linear memory layout
+
+        :type: :class:`np.ndarray` with dtype np.int32
+        """
+        if self._length is None:
+            with open(self.length_file, 'rb') as fh:
+                self._length = np.load(fh)
+        return self._length
+
+    @property
+    def chunk(self):
+        """ Chunk, in which entry is stored
+
+        :type: :class:`np.ndarray` with dtype np.int16
+        """
+        if self._chunk is None:
+            with open(self.chunk_file, 'rb') as fh:
+                self._chunk = np.load(fh)
+        return self._chunk
+
+    @property
+    def indexer(self):
+        """ Internal data structure - Relates entries with data indices
+
+        :type: :class:`np.ndarray` with dtype np.int32
+        """
+        if self._indexer is None:
+            with open(self.indexer_file, 'rb') as fh:
+                self._indexer = np.load(fh)
+        return self._indexer
+
+    @property
+    def search_keys(self):
+        """ Internal data structure - Sorted keys for binary entry lookup
+
+        :type: :class:`np.ndarray` with dtype np.uint64
+        """
+        if self._search_keys is None:
+            with open(self.search_key_file, 'rb') as fh:
+                self._search_keys = np.load(fh)
+        return self._search_keys
+
+    @property
+    def data(self):
+        """ Internal binary data in memory mapped files
+
+        :type: :class:`list` of :class:`mmap.mmap`
+        """
+        if self._data is None:
+            self._data = list()
+            self._data_fh = list()
+            # sort data files numerically by their chunk index
+            tmp = list()
+            for f in self.data_files:
+                idx = int(f.split('_')[-1].replace(".dat", ""))
+                tmp.append((idx, f))
+            tmp.sort()
+            sorted_data_files = [x[1] for x in tmp]
+            for f in sorted_data_files:
+                self._data_fh.append(open(f, 'r+b'))
+                self._data.append(mmap.mmap(self._data_fh[-1].fileno(), 0))
+        return self._data
+
+    @property
+    def n_entries(self):
+        """ Number of entries
+
+        :type: :class:`int`
+        """
+        if self._n_entries is None:
+            self._n_entries = self.pos.shape[0]
+        return self._n_entries
+
+    def GetIdx(self, uniprot_ac, fragment="F1", version="v4"):
+        """ Get internal idx of stored data
+
+        Can be used for data retrieval with :func:`GetOMFByIdx`
+
+        :param uniprot_ac: Uniprot AC
+        :type uniprot_ac: :class:`str`
+        :param fragment: AFDB entries are potentially split into fragments -
+                         99.999% of all entries only have one fragment: F1
+        :type fragment: :class:`str`
+        :param version: Version of entry
+        :type version: :class:`str`
+        :returns: Internal idx of that entry
+        :raises: :class:`RuntimeError` if no such entry exists
+        """
+        if not fragment.startswith('F'):
+            raise RuntimeError("expect fragment to start with F")
+        if not version.startswith('v'):
+            raise RuntimeError("expect version to start with v")
+        search_key = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
+                                   int(version[1:]))
+        idx = np.searchsorted(self.search_keys, np.uint64(search_key))
+        if idx != len(self.indexer) and self.search_keys[idx] == search_key:
+            return self.indexer[idx]
+        raise RuntimeError(f"No entry for {uniprot_ac} {fragment} "
+                           f"{version}")
+
+    def GetOMFByIdx(self, idx):
+        """ Get stored OMF data structure
+
+        :param idx: Internal index which can be derived from :func:`GetIdx`
+        :type idx: :class:`int`
+        :returns: OMF data structure of type :class:`ost.io.OMF`
+        :raises: :class:`RuntimeError` if *idx* is out of range
+        """
+        if idx < 0 or idx >= self.n_entries:
+            raise RuntimeError(f"Invalid idx, must be in "
+                               f"[0, {self.n_entries-1}]")
+        pos = self.pos[idx]
+        length = self.length[idx]
+        chunk = self.chunk[idx]
+        omf_data = self.data[chunk][pos:pos+length]
+        return io.OMF.FromBytes(gzip.decompress(omf_data))
+
+    def GetOMF(self, uniprot_ac, fragment="F1", version="v4"):
+        """ Get stored OMF data structure
+
+        :param uniprot_ac: Uniprot AC
+        :type uniprot_ac: :class:`str`
+        :param fragment: AFDB entries are potentially split into fragments -
+                         99.999% of all entries only have one fragment: F1
+        :type fragment: :class:`str`
+        :param version: Version of entry
+        :type version: :class:`str`
+        :returns: OMF data structure of type :class:`ost.io.OMF`
+        :raises: :class:`RuntimeError` if no such entry exists
+        """
+        idx = self.GetIdx(uniprot_ac, fragment = fragment, version = version)
+        return self.GetOMFByIdx(idx)
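+
+    # On-disk layout created by FromDataChunks below (sketch): the gzipped
+    # OMF blobs are concatenated in one or several fs_data_[x].dat files.
+    # pos.dat, length.dat and chunk.dat store for each entry idx where its
+    # blob is located. search_keys.dat holds sorted uint64 keys derived from
+    # (uniprot AC, fragment, version) and indexer.dat maps the rank of a key
+    # in that sorted array back to the entry idx used by GetOMFByIdx.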
+
+    @staticmethod
+    def FromDataChunks(chunk_dir, db_dir, chunk_bytes=None):
+        """ Static method to create a new database from preprocessed data
+
+        Data preprocessing consists of creating several data chunks that are
+        pickled to disk.
+        In detail: each chunk is a pickled file containing a list, where each
+        entry is a tuple with 4 elements: 1) uniprot_ac (:class:`str`)
+        2) fragment (:class:`str`) 3) version (:class:`str`) 4) structure
+        data (:class:`ost.io.OMF` object which has been written to a bytes
+        object and compressed with gzip)
+
+        The data itself is again stored in chunks of binary data which are
+        indexed.
+
+        :param chunk_dir: Path to directory containing described data chunks
+        :type chunk_dir: :class:`str`
+        :param db_dir: Output directory - Creates all files that are needed
+                       for :class:`FSStructureServer`
+        :type db_dir: :class:`str`
+        :param chunk_bytes: Size in bytes of binary data chunks in final
+                            database - default None: Everything ends up in
+                            one single chunk
+        :type chunk_bytes: :class:`int`
+        :returns: :class:`FSStructureServer` with all data from *chunk_dir*
+        """
+        if not os.path.exists(db_dir):
+            raise RuntimeError(f"{db_dir} does not exist")
+        positions = list()
+        lengths = list()
+        chunks = list()
+        search_keys = list()
+        indexer = list()
+        chunk_files = os.listdir(chunk_dir)
+        chunk_files = [f for f in chunk_files if f.endswith(".pkl")]
+        current_pos = 0
+        current_chunk = 0
+        current_entry = 0
+        data_file = open(os.path.join(db_dir, "fs_data_0.dat"), 'wb')
+        t0 = time.time()
+        for cf_idx, cf in enumerate(chunk_files):
+            print(f"processing chunk {cf_idx}, {cf}")
+            with open(os.path.join(chunk_dir, cf), 'rb') as fh:
+                data = pickle.load(fh)
+            for entry in data:
+                uniprot_ac = entry[0]
+                fragment = entry[1]
+                version = entry[2]
+                data_bytes = entry[3]
+                length = len(data_bytes)
+                data_file.write(data_bytes)
+                positions.append(current_pos)
+                lengths.append(length)
+                chunks.append(current_chunk)
+                k = CreateAFDBIdx(uniprot_ac, int(fragment[1:]),
+                                  int(version[1:]))
+                search_keys.append(k)
+                indexer.append(current_entry)
+                current_pos += length
+                current_entry += 1
+                if chunk_bytes and current_pos > chunk_bytes:
+                    data_file.close()
+                    current_chunk += 1
+                    f = os.path.join(db_dir, f"fs_data_{current_chunk}.dat")
+                    data_file = open(f, 'wb')
+                    current_pos = 0
+        data_file.close()
+        print(f"done processing chunks ({round(time.time() - t0, 3)}s)")
+        print("sort indexer matrix")
+        t0 = time.time()
+        # make search keys searchable by bisect search => sort
+        search_keys = np.asarray(search_keys, dtype=np.uint64)
+        indexer = np.asarray(indexer, dtype=np.int32)
+        sort_indices = np.argsort(search_keys)
+        search_keys = search_keys[sort_indices]
+        indexer = indexer[sort_indices]
+
+        with open(os.path.join(db_dir, "search_keys.dat"), 'wb') as fh:
+            np.save(fh, search_keys)
+        with open(os.path.join(db_dir, "indexer.dat"), 'wb') as fh:
+            np.save(fh, indexer)
+        with open(os.path.join(db_dir, "pos.dat"), 'wb') as fh:
+            np.save(fh, np.array(positions, dtype=np.int64))
+        with open(os.path.join(db_dir, "length.dat"), 'wb') as fh:
+            np.save(fh, np.array(lengths, dtype=np.int32))
+        with open(os.path.join(db_dir, "chunk.dat"), 'wb') as fh:
+            np.save(fh, np.array(chunks, dtype=np.int16))
+        fs_server = FSStructureServer(db_dir)
+        return fs_server
+
+
+class PentaMatch:
+    """ Pentamer matching for fast sequence searches
+
+    :class:`PentaMatch` is aimed at fast sequence searches with low
+    sensitivity. The specific use case is searching the full AFDB. It stores
+    all unique pentamers for each search sequence. Given a query sequence, it
+    computes the number of matching pentamers with respect to each search
+    sequence and returns the top hits.
+
+    :param db_dir: Directory containing all required files (indexer.dat,
+                   pos.dat, length.dat, meta.dat). New :class:`PentaMatch`
+                   objects can be derived using the static
+                   :func:`FromSeqList` creator.
+    :type db_dir: :class:`str`
+    :raises: :class:`RuntimeError` if any required file is missing in *db_dir*
+    """
+    def __init__(self, db_dir):
+        self._indexer = None
+        self._pos = None
+        self._length = None
+        self._N = None
+        self.db_dir = db_dir
+
+        # only check if required files are there, they're lazy loaded
+        self.indexer_file = os.path.join(db_dir, "indexer.dat")
+        self.pos_file = os.path.join(db_dir, "pos.dat")
+        self.length_file = os.path.join(db_dir, "length.dat")
+        self.meta_file = os.path.join(db_dir, "meta.dat")
+
+        if not os.path.exists(self.indexer_file):
+            raise RuntimeError(f"Expected \"indexer.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.pos_file):
+            raise RuntimeError(f"Expected \"pos.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.length_file):
+            raise RuntimeError(f"Expected \"length.dat\" file in db_dir "
+                               f"({db_dir})")
+        if not os.path.exists(self.meta_file):
+            raise RuntimeError(f"Expected \"meta.dat\" file in db_dir "
+                               f"({db_dir})")
+
+    @property
+    def indexer(self):
+        """ Entry indices for pentamers
+
+        Entry data for one pentamer can be extracted with the respective
+        values in :attr:`pos` and :attr:`length`
+
+        :type: memory mapped :class:`np.ndarray` of dtype np.int32
+        """
+        if self._indexer is None:
+            self._indexer = np.memmap(self.indexer_file, dtype=np.int32,
+                                      mode='r')
+        return self._indexer
+
+    @property
+    def pos(self):
+        """ Start position for each pentamer in :attr:`indexer`
+
+        :type: :class:`np.ndarray` of dtype np.int64 with 3.2E6 (20^5) entries
+        """
+        if self._pos is None:
+            self._pos = np.fromfile(self.pos_file, dtype=np.int64)
+        return self._pos
+
+    @property
+    def length(self):
+        """ Length for each pentamer in :attr:`indexer`
+
+        :type: :class:`np.ndarray` of dtype np.int32 with 3.2E6 (20^5) entries
+        """
+        if self._length is None:
+            self._length = np.fromfile(self.length_file, dtype=np.int32)
+        return self._length
+
+    @property
+    def N(self):
+        """ Number of entries in underlying :class:`FSStructureServer`
+
+        :type: :class:`int`
+        """
+        if self._N is None:
+            with open(self.meta_file, 'r') as fh:
+                self._N = int(fh.read()) + 1 # the file stores the max idx
+        return self._N
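+
+    # Search strategy implemented by TopN below (sketch): for each unique
+    # pentamer of the query, indexer lists all entries containing it
+    # (located via pos and length). The hits are counted per entry in a
+    # single numpy accumulator and the entries with the highest counts are
+    # returned.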
+
+    def TopN(self, N, sequence, return_counts=False, unique_pentamers=True):
+        """ Find top-N matches given *sequence*
+
+        Identifies unique pentamers in *sequence* and counts the number of
+        occurrences in each entry. Returns the top-N entries with respect
+        to counts.
+
+        :param N: Number of results to return
+        :type N: :class:`int`
+        :param sequence: Sequence to search
+        :type sequence: :class:`str`
+        :param return_counts: Additionally return underlying pentamer counts
+        :type return_counts: :class:`bool`
+        :param unique_pentamers: Set to True if :attr:`indexer` contains only
+                                 unique pentamers for each entry. This way we
+                                 can use faster methods to accumulate counts.
+                                 In detail: accumulator[indices] += 1 is much
+                                 faster than np.add.at(accumulator, indices,
+                                 1). But the latter is required if there are
+                                 duplicates.
+        :type unique_pentamers: :class:`bool`
+        :returns: :class:`list` of :class:`int` with length *N*. If
+                  *return_counts* is true, the :class:`list` contains
+                  :class:`tuple` with two elements: 1) count 2) index.
+        :raises: :class:`RuntimeError` if N is invalid or sequence is shorter
+                 than 5 characters
+        """
+        if N <= 0:
+            raise RuntimeError(f"N ({N}) must be larger than 0")
+        if N > self.N:
+            raise RuntimeError(f"N ({N}) larger than actual entries "
+                               f"({self.N})")
+        if len(sequence) < 5:
+            raise RuntimeError(f"sequence must have length >=5, "
+                               f"got: {sequence}")
+        pentamers = list()
+        SeqToPentamerIndices(sequence, True, pentamers)
+        # int32 accumulator - counts are bound by the number of unique
+        # pentamers in *sequence*
+        accumulator = np.zeros(self.N, dtype=np.int32)
+        if unique_pentamers:
+            for p in pentamers:
+                pos = self.pos[p]
+                length = self.length[p]
+                accumulator[self.indexer[pos:pos+length]] += 1
+        else:
+            for p in pentamers:
+                pos = self.pos[p]
+                length = self.length[p]
+                np.add.at(accumulator, self.indexer[pos:pos+length], 1)
+
+        top_n_indices = np.argpartition(accumulator, -N)[-N:]
+        top_n_counts = accumulator[top_n_indices]
+        # top_n_indices is not sorted by counts => sort by counts and return
+        tmp = [(a,b) for a,b in zip(top_n_counts, top_n_indices)]
+        tmp.sort(reverse=True)
+        if return_counts:
+            return tmp
+        else:
+            return [x[1] for x in tmp]
+
+    @staticmethod
+    def FromSeqList(fasta_file, db_dir, entries_from_seqnames=False):
+        """ Creates PentaMatch object from Fasta file
+
+        :param fasta_file: Path to Fasta file with sequences
+        :type fasta_file: :class:`str`
+        :param db_dir: Files required for :class:`PentaMatch` will be dumped
+                       here, will be created if non-existent.
+        :type db_dir: :class:`str`
+        :param entries_from_seqnames: If set to False, indices returned by
+                                      :func:`TopN` refer to position in
+                                      *fasta_file*. If set to True, integer
+                                      indices are parsed from the sequence
+                                      names.
+        :type entries_from_seqnames: :class:`bool`
+        :returns: :class:`PentaMatch`
+        :raises: :class:`ost.Error` if *entries_from_seqnames* is True but a
+                 sequence name cannot be cast to int.
+        """
+        slist = io.LoadSequenceList(fasta_file)
+        if not os.path.exists(db_dir):
+            os.makedirs(db_dir)
+        CreatePentaMatch(slist, db_dir, entries_from_seqnames)
+        return PentaMatch(db_dir)
+
+def AFDBTPLSearch(fs_server, pentamatch, trg_seq, pentamatch_n = 100,
+                  seqid_thresh = 70, tpl_n = 5):
+    """ Searches *tpl_n* templates in *fs_server*/*pentamatch*
+
+    Step 1: Identifies *pentamatch_n* sequences in *pentamatch* with the
+    largest number of matching pentamers with respect to *trg_seq*.
+    Step 2: Generates pairwise alignments with :func:`ost.seq.alg.LocalAlign`
+    and only retains the ones with seqid >= *seqid_thresh*.
+    Step 3: Extracts the respective templates from *fs_server* and scores
+    them by the sum of plDDT of aligned residues divided by *trg_seq* length.
+    Step 4: Returns the top *tpl_n* (or fewer)
+
+    :param fs_server: Structure database - The AFDB
+    :type fs_server: :class:`FSStructureServer`
+    :param pentamatch: Pentamatch object specific for *fs_server*
+    :type pentamatch: :class:`PentaMatch`
+    :param trg_seq: Target sequence
+    :type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
+    :param pentamatch_n: Number of sequences that are initially searched in
+                         *pentamatch*
+    :type pentamatch_n: :class:`int`
+    :param seqid_thresh: Sequence identity threshold [0-100] for an alignment
+                         to be considered further
+    :type seqid_thresh: :class:`int`
+    :param tpl_n: Number of templates that are finally returned based on the
+                  described scoring
+    :type tpl_n: :class:`int`
+    :returns: :class:`list` of pairs with the first element being the tpl
+              score and the second element being a
+              :class:`ost.seq.AlignmentHandle` with the first sequence being
+              *trg_seq* and the second sequence the hit found in *fs_server*
+              with the structure attached.
+    """
+    top_n = pentamatch.TopN(pentamatch_n, str(trg_seq))
+    if isinstance(trg_seq, str):
+        trg_seq = seq.CreateSequence("A", trg_seq)
+    tmp = list()
+    for idx in top_n:
+        omf = fs_server.GetOMFByIdx(idx)
+        assert omf.GetChainNames() == ["A"]
+        omf_s = omf.GetSequence("A")
+        aln = seq.alg.LocalAlign(trg_seq, seq.CreateSequence("A", omf_s),
+                                 seq.alg.BLOSUM62)[0]
+        if seq.alg.SequenceIdentity(aln) >= seqid_thresh:
+            bfactors = omf.GetBFactors("A")
+            summed_bfac = 0.0
+            current_pos = aln.GetSequence(1).offset
+            for col in aln:
+                if col[0] != '-' and col[1] != '-':
+                    summed_bfac += bfactors[current_pos]
+                if col[1] != '-':
+                    current_pos += 1
+            score = summed_bfac / len(trg_seq)
+            tmp.append((score, aln, omf))
+    tmp.sort(reverse=True, key=lambda x: x[0])
+    return_list = list()
+    for item in tmp[:tpl_n]:
+        # the alignments are local, expand the first sequence to
+        # *trg_seq*, i.e. make sure that the offset is 0
+        aln = item[1]
+        if aln.GetSequence(0).offset > 0:
+            s1 = aln.GetSequence(0)
+            s2 = aln.GetSequence(1)
+            s1_prefix = trg_seq[:s1.offset]
+            s2_prefix = '-' * s1.offset
+            new_s1 = seq.CreateSequence(s1.name, s1_prefix + str(s1))
+            new_s2 = seq.CreateSequence(s2.name, s2_prefix + str(s2))
+            new_s2.SetOffset(s2.offset)
+            aln = seq.CreateAlignment(new_s1, new_s2)
+        aln.AttachView(1, item[2].GetAUChain("A").CreateFullView())
+        return_list.append((item[0], aln))
+    return return_list
+
+
+def AFDBModel(fs_server, pentamatch, trg_seq):
+    """ Build model with AFDB as template library
+
+    :param fs_server: Structure database - The AFDB
+    :type fs_server: :class:`FSStructureServer`
+    :param pentamatch: Pentamatch object specific for *fs_server*
+    :type pentamatch: :class:`PentaMatch`
+    :param trg_seq: Target sequence
+    :type trg_seq: :class:`ost.seq.SequenceHandle`/:class:`str`
+    :returns: :class:`tuple` with 4 elements. 1: The model 2: The model score
+              based on plDDT 3: Pairwise alignment (first seq: *trg_seq*,
+              second seq: tpl seq) 4: Template name (formatted as
+              "<uniprot AC> <AFDB_fragment> <AFDB_version> <chain name>").
+              If no appropriate template can be found, all 4 elements are
+              None.
+    """
+    tpl_list = AFDBTPLSearch(fs_server, pentamatch, trg_seq,
+                             pentamatch_n = 100, seqid_thresh = 70,
+                             tpl_n = 1)
+    if len(tpl_list):
+        score = tpl_list[0][0]
+        aln = tpl_list[0][1]
+        mhandle = BuildRawModel(aln)
+        model = BuildFromRawModel(mhandle)
+        name = aln.GetSequence(1).GetAttachedView().GetName()
+        # return in documented order: model, score, alignment, template name
+        return (model, score, aln, name)
+    return (None, None, None, None)
+
+__all__ = ('FSStructureServer', 'PentaMatch', 'AFDBTPLSearch', 'AFDBModel')