Skip to content
Snippets Groups Projects
Commit 6e39a68e authored by Studer Gabriel's avatar Studer Gabriel
Browse files

Modelling with AFDB as template library

parent 570f0e12
No related branches found
No related tags found
No related merge requests found
Data generation for AFDB Modelling capabilities in ProMod3
Requires you to download the full proteomes database as described in
https://github.com/deepmind/alphafold/blob/main/afdb/README.md.
This gives one tar file per proteome which serves as starting point.
On sciCORE, they are located here:
`/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes`
The `afdb_proteom_to_data_chunks.py` script generates one such chunk.
It reads a list of filenames which needs to be generated manually.
Something like:
```python
import os
files = os.listdir(<AFDB_PROTEOM_DIR>)
with open("afdb_proteom_files.txt", 'w') as fh:
fh.write('\n'.join(files))
```
`create_commands.py` generates a command file which can be submitted
as batch job. Carefully check the variables defined on top and adapt to your
needs. Run one of these commands interactively to check whether the
respective chunk file is created correctly before submitting the batch job.
Once all chunks are there, an indexed database can be created with:
```python
from promod3.modelling import FSStructureServer
fs_server = FSStructureServer.FromDataChunks("afdb_data_chunks", "afdb_fs")
```
## Data preparation for PentaMatch
The same data chunks are used to extract the sequences that are searched by
PentaMatch. `create_pentamatch_sequences.py` generates a Fasta file with all
sequences of the previously generated FSStructureServer. Execute with:
```bash
ost create_pentamatch_sequences.py --data_chunks <DATA_CHUNK_DIR> --fs_server <FS_SERVER> --out <PENTAMATCH>.fasta
```
The searchable PentaMatch object is generated interactively with:
```python
from promod3.modelling import PentaMatch
PentaMatch.FromSeqList("<PENTAMATCH>.fasta", "<PENTAMATCH_DIR>",
entries_from_seqnames=True)
```
Be aware that the command above requires a substantial amount of memory.
For 200e6 entries, 500GB was sufficient.
import argparse
import gzip
import os
import pickle
import tarfile

from ost import io
from ost import mol
def parse_args(argv=None):
    """ Parse command line arguments.

    :param argv: Argument list to parse; defaults to ``sys.argv[1:]`` when
                 *None*, preserving the original call signature.
    :type argv: :class:`list` of :class:`str`
    :returns: Parsed arguments as :class:`argparse.Namespace`
    """
    desc = ("Parses proteom tarballs from AFDB and dumps pickle files ready "
            "for database creation")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--afdb_dir", help="Directory containing the "
                        "tarballs")
    parser.add_argument("--tarballs", help="File containing tarball filenames")
    parser.add_argument("--start", help="idx in that file to start", type=int)
    parser.add_argument("--end", help="idx in that file to end", type=int)
    parser.add_argument("--out_file", help="Output file where stuff gets "
                        "pickled")
    parser.add_argument("--ca_only", help="Whether to use CA-only "
                        "representation", action="store_true")
    return parser.parse_args(argv)
def iterate_proteom(tarball):
    """ Helper: iterate over proteome tarball provided by EBI-AFDB

    Yields mmCIF-derived data, in detail: tuple with types :class:`str`,
    :class:`ost.mol.EntityHandle`, :class:`ost.io.MMCifInfo` and
    :class:`ost.seq.SequenceList`

    :param tarball: Path to tarball to iterate
    :type tarball: :class:`str`
    :returns: Generator of specified tuple
    """
    prof = io.profiles.Get("STRICT")
    # hack to correct for corrupt cif files in AFDB
    corrupt_cit = "_citation.id primary"
    correct_cit = "_citation.id 1"
    # context manager guarantees the tar handle is closed even if parsing
    # raises or the consumer abandons the generator early
    with tarfile.open(tarball) as tf:
        while True:
            tar_info = tf.next()
            if tar_info is None:
                break  # we're done
            if tar_info.name.endswith("cif.gz"):
                fh = tf.extractfile(tar_info)
                mmcif_str = gzip.decompress(fh.read()).decode("utf-8")
                if corrupt_cit in mmcif_str:
                    mmcif_str = mmcif_str.replace(corrupt_cit, correct_cit)
                ent, mmcif_info, seqres = io.MMCifStrToEntity(mmcif_str,
                                                              profile=prof,
                                                              process=True)
                yield (tar_info.name, ent, mmcif_info, seqres)
def process_proteom(tarball, ca_only=False):
    """ Helper: Creates list of entries for one proteom

    :param tarball: Path to proteom tarball
    :type tarball: :class:`str`
    :param ca_only: Whether to select for only CA positions
    :type ca_only: :class:`bool`
    :returns: :class:`list` of :class:`tuple` in format required by
              :func:`FromDataChunks`
    """
    omf_opts = io.OMFOption.DEFAULT_PEPLIB | io.OMFOption.LOSSY | \
               io.OMFOption.AVG_BFACTORS | io.OMFOption.ROUND_BFACTORS | \
               io.OMFOption.SKIP_SS | io.OMFOption.INFER_PEP_BONDS
    entries = list()
    for entry in iterate_proteom(tarball):
        # AFDB filenames look like AF-<AC>-F<fragment>-model_v<version>.cif.gz;
        # split once instead of re-splitting per field
        tokens = entry[0].split('-')
        uniprot_ac = tokens[1]
        fragment = tokens[2]
        version = tokens[3].split('_')[1].split('.')[0]
        ent = entry[1]
        if ca_only:
            ent = ent.Select("aname=CA")
            # requires the ost mol module (fixed missing import at file top)
            ent = mol.CreateEntityFromView(ent, False)
        ent.SetName(f"{uniprot_ac} {fragment} {version}")
        omf = io.OMF.FromMMCIF(ent, entry[2], omf_opts)
        omf = gzip.compress(omf.ToBytes())
        entries.append([uniprot_ac, fragment, version, omf])
    return entries
def create_data_chunk(tarballs, afdb_dir, start, end, chunk_out,
                      ca_only=False):
    """ Create full data chunk

    Requires preprocessing step of creating a text file that lists all
    proteom tarballs.

    :param tarballs: Path to file where each line corresponds to one proteom
                     tarball, i.e. its filename
    :type tarballs: :class:`str`
    :param afdb_dir: Directory containing the tarball files specified in
                     *tarballs*
    :type afdb_dir: :class:`str`
    :param start: Start of tarball slice to be added to chunk
    :type start: :class:`int`
    :param end: End of tarball slice to be added to chunk (*end* won't be
                added...)
    :type end: :class:`int`
    :param chunk_out: Chunk will be pickled to that file
    :type chunk_out: :class:`str`
    :param ca_only: Whether to select for only CA positions
    :type ca_only: :class:`bool`
    """
    with open(tarballs, 'r') as fh:
        fnames = [line.strip() for line in fh.readlines()]
    entries = list()
    for fname in fnames[start:end]:
        full_path = os.path.join(afdb_dir, fname)
        entries += process_proteom(full_path, ca_only=ca_only)
    with open(chunk_out, 'wb') as fh:
        pickle.dump(entries, fh)
def main():
    """ Entry point: build one data chunk as specified on the command line. """
    args = parse_args()
    create_data_chunk(args.tarballs, args.afdb_dir, args.start, args.end,
                      args.out_file, ca_only=args.ca_only)


if __name__ == '__main__':
    main()
import os

# path to Singularity container with recent OpenStructure installation
omf_sif = "<PATH_TO_SIF>"
# path to directory that contains AFDB - one tarball per proteom
afdb_dir = "/scicore/data/managed/AF_UniProt/frozen_221115T101000/proteomes"
# file with one proteom filename per line - in principle the content of os.listdir(afdb_dir)
proteom_file = "<WHEREVER>/afdb_proteom_files.txt"
# number of proteoms that end up in one data chunk
chunk_size = 100
# path to directory where chunks will be dumped
chunk_out_dir = "<WHEREVER>/afdb_data_chunks"
# path to script that does the job
fancy_script = "<WHEREVER>/afdb_proteom_to_data_chunks.py"
# whether structures should be reduced to CA only representation
ca_only = False
# path to file into which the commands are written
cmd_file = "allatom_commands.cmd"

with open(proteom_file, 'r') as fh:
    n_proteoms = len(fh.readlines())

# one command per chunk of chunk_size proteoms; the slice [start, end) is
# resolved inside the worker script against the same proteom_file
commands = list()
for chunk_idx, start in enumerate(range(0, n_proteoms, chunk_size)):
    end = min(n_proteoms, start + chunk_size)
    out_file = os.path.join(chunk_out_dir, f"{chunk_idx}.pkl")
    cmd = ["singularity", "exec", "--bind", f"{afdb_dir}:{afdb_dir}",
           omf_sif, "ost", fancy_script,
           "--afdb_dir", afdb_dir,
           "--tarballs", proteom_file,
           "--start", str(start),
           "--end", str(end),
           "--out_file", out_file]
    if ca_only:
        cmd.append("--ca_only")
    commands.append(' '.join(cmd))

with open(cmd_file, 'w') as fh:
    fh.write('\n'.join(commands))
import os
import tarfile
import gzip
import argparse
import pickle
from ost import io
from promod3.modelling import FSStructureServer
def parse_args(argv=None):
    """ Parse command line arguments.

    :param argv: Argument list to parse; defaults to ``sys.argv[1:]`` when
                 *None*, preserving the original call signature.
    :type argv: :class:`list` of :class:`str`
    :returns: Parsed arguments as :class:`argparse.Namespace`
    """
    desc = ("Extracts sequences from pickled files generated with "
            "create_fsserver_data_chunks.py and relates them to entries in an "
            "FSStructureServer object. I.e. Dumps a sequence list in Fasta "
            "format with entry indices as sequence names. This file serves as "
            "starting point for an actual PentaMatch object. Requires faf.py "
            "to be importable.")
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument("--data_chunks", help="Directory with all data chunks "
                        "generated with create_fsserver_data_chunks.py")
    parser.add_argument("--fs_server", help="Directory containing the files "
                        "for an FSStructureServer object derived from "
                        "data_chunks directory")
    parser.add_argument("--out", help="Outfile in Fasta format that will "
                        "contain all sequences with their respective index in "
                        "the underlying FSStructureServer object as names")
    return parser.parse_args(argv)
def main():
    """ Dump all sequences of an FSStructureServer as Fasta.

    Sequence names are the entry indices in the FSStructureServer so the
    resulting file can directly seed PentaMatch creation.
    """
    args = parse_args()
    fs_server = FSStructureServer(args.fs_server)
    chunk_files = os.listdir(args.data_chunks)
    # with-statement guarantees the Fasta file is closed/flushed even when
    # an exception is raised mid-loop (original leaked the handle then)
    with open(args.out, 'w') as outfile:
        for cf_idx, cf in enumerate(chunk_files):
            print(f"processing chunk {cf_idx}, {cf}")
            with open(os.path.join(args.data_chunks, cf), 'rb') as fh:
                data = pickle.load(fh)
            for entry in data:
                uniprot_ac, fragment, version, omf_data = entry
                idx = fs_server.GetIdx(uniprot_ac, fragment=fragment,
                                       version=version)
                omf = io.OMF.FromBytes(gzip.decompress(omf_data))
                # AFDB models are expected to be single chain "A";
                # anything else indicates corrupt chunk data
                assert omf.GetChainNames() == ["A"]
                sequence = omf.GetSequence("A")
                outfile.write(f">{idx}\n")
                # wrap sequence at 80 characters per line
                seq_chunks = [sequence[i:i+80]
                              for i in range(0, len(sequence), 80)]
                outfile.write('\n'.join(seq_chunks))
                outfile.write('\n')


if __name__ == '__main__':
    main()
......@@ -326,3 +326,34 @@ iteration.
:returns: All found matches
:rtype: :class:`list` of :class:`MotifMatch`
AFDB Modelling
--------------------------------------------------------------------------------
Template based modelling using AFDB as template library comes with two
challenges for which |project| provides solutions:
* efficient structure storage for the whole AFDB: :class:`FSStructureServer`
* fast sequence searches with limited sensitivity: :class:`PentaMatch`
The creation of these two objects requires extensive preprocessing. The required
scripts and documentation are available in
`<GIT_ROOT>/extras/data_generation/afdb_modelling`.
Basic modelling functionality is available in the following two functions:
* :func:`AFDBTPLSearch`
* :func:`AFDBModel`
.. autofunction:: AFDBTPLSearch
.. autofunction:: AFDBModel
.. autoclass:: FSStructureServer
:members:
:member-order: bysource
.. autoclass:: PentaMatch
:members:
:member-order: bysource
......@@ -29,6 +29,7 @@ set(MODELLING_PYMOD
_alignment_fiddling.py
_raw_model.py
_planar_rings.py
_afdb_modelling.py
)
pymod(NAME modelling
......
......@@ -28,3 +28,4 @@ from ._monte_carlo import *
from ._mhandle_helper import *
from ._raw_model import *
from ._planar_rings import *
from ._afdb_modelling import *
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment