diff --git a/extras/data_generation/structure_db/README b/extras/data_generation/structure_db/README
new file mode 100644
index 0000000000000000000000000000000000000000..227313066aa288d29d215b5309f38ceab4689116
--- /dev/null
+++ b/extras/data_generation/structure_db/README
@@ -0,0 +1,30 @@
+We provide several scripts to obtain a StructureDB.
+You'll find more information regarding the different steps in the comments
+of the corresponding scripts.
+
+Steps to obtain a structural database:
+
+1. Get the structure/profile data: preferably you want a non-redundant set of
+   structures to generate a StructureDB. We suggest culling a list of pdb
+   entries from PISCES: http://dunbrack.fccc.edu/PISCES.php
+   The get_data_from_smtl.py script directly reads in such a file and
+   produces the input for the second step of StructureDB generation.
+
+2. Build an initial database: use build_structure_db.py with the input
+   generated in step one.
+
+IF YOU NEVER EVER REQUIRE A STRUCTURE PROFILE, YOU'RE ALREADY DONE!
+BUT BE AWARE: THE CORRESPONDING FREQUENCIES IN THE DATABASE WILL BE SET TO 0.0!
+
+3. Calculate structure profiles: use create_structure_profiles.py.
+   The script calculates the structure profiles for a subset of the
+   sequences in the desired StructureDB and is intended for parallelization.
+
+4. Parallelize step 3: use submit_structure_profile_calculations.py.
+   This script uses SGE submission on BC2; you might have to adapt it
+   for your system.
+
+5. Assign the structure profiles generated in steps 3 and 4 to the
+   initial database generated in step 2: use assign_structure_profiles.py
+   to perform this task.
diff --git a/extras/data_generation/structure_db/assign_structure_profiles.py b/extras/data_generation/structure_db/assign_structure_profiles.py
new file mode 100644
index 0000000000000000000000000000000000000000..64029be3aedf16222f622dd102fd8c664b5295c5
--- /dev/null
+++ b/extras/data_generation/structure_db/assign_structure_profiles.py
@@ -0,0 +1,49 @@
+import os
+import ost
+from promod3 import loop
+
+
+# Path to StructureDB to which you want to assign the profile information.
+# (relative to current working directory!)
+db_path = "structure_db.dat"
+
+# Directory that contains the profile databases (and only profile databases)
+# as generated by the create_structure_profiles.py script.
+# (relative to current working directory!)
+profile_dir = "profile_dbs"
+
+# Path to store the StructureDB with assigned profiles.
+# (relative to current working directory!)
+db_out_path = "structure_db_with_structure_profiles.dat"
+
+
+structure_db = loop.StructureDB.Load(db_path)
+
+# let's gather all profiles in one ProfileDB
+profile_db_files = os.listdir(profile_dir)
+profile_db = ost.seq.ProfileDB()
+
+for f in profile_db_files:
+    profile_db_path = os.path.join(profile_dir, f)
+    loaded_db = ost.seq.ProfileDB.Load(profile_db_path)
+    entry_names = loaded_db.GetNames()
+    for name in entry_names:
+        profile_db.AddProfile(name, loaded_db.GetProfile(name))
+        print name
+
+# Assign the profiles
+for coord_idx in range(structure_db.GetNumCoords()):
+    coord_info = structure_db.GetCoordInfo(coord_idx)
+    entry_id = coord_info.pdb_id
+
+    structure_profile = None
+    try:
+        structure_profile = profile_db.GetProfile(entry_id)
+    except:
+        print "Could not find structure profile for", entry_id, "... skip..."
+        continue
+
+    structure_db.SetStructureProfile(coord_idx, structure_profile)
+
+# DONE!
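+# Optional sanity check before saving (our addition, a minimal sketch; not
+# part of the original workflow): count how many entries actually received
+# a profile, so silently skipped entries do not go unnoticed.
+num_assigned = 0
+for coord_idx in range(structure_db.GetNumCoords()):
+    try:
+        profile_db.GetProfile(structure_db.GetCoordInfo(coord_idx).pdb_id)
+        num_assigned += 1
+    except:
+        pass
+print "assigned profiles for", num_assigned, "of", structure_db.GetNumCoords(), "entries"
+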
+structure_db.Save(db_out_path)
diff --git a/extras/data_generation/structure_db/build_structure_db.py b/extras/data_generation/structure_db/build_structure_db.py
index 750716bfbfeac3fd3c5f17d9510a4fbf96df7755..409f67b34a8f775ad00db032a1f963282936e447 100644
--- a/extras/data_generation/structure_db/build_structure_db.py
+++ b/extras/data_generation/structure_db/build_structure_db.py
@@ -1,11 +1,14 @@
 from promod3 import loop
 import os
-import pickle
 import traceback
 from ost import io
-from ost.bindings import dssp
 from ost.bindings import msms
+from ost import mol
+from ost.mol.alg import AssignSecStruct
+from ost.mol.alg import Accessibility
+###############
+# SCROLL DOWN #
+###############
 
 bb_atoms = ["N","CA","C","O","CB"]
 
@@ -90,62 +93,87 @@ def Clean(prot):
     r = r_view.handle
     ed.RenameResidue(r,parent_residues[r.GetName()])
     r.SetOneLetterCode(one_letter_codes[r.GetName()])
+
+    atoms_to_delete = list()
+
     for a in r.atoms:
       if a.GetName() not in bb_atoms:
-        ed.DeleteAtom(a)
-
-
-
-#This script generates a StructureDB using a list of nonredundant pdb
-#files. All pdb files get cleaned using some basic heuristics.
-#You need the tool dssp and msms somewhere in you path.
-#please note, that the values for the structure profile in the produced
-#db are only allocated and simply set to zero.
-#You therefore need to create an initial db, from which you can create
-#these structure profiles and set them in a second step.
-#This should be pretty well described in the documentation.
-
-#file leading to prot_data:
-#list of nonredundant pdb files pulled from:
-#http://dunbrack.fccc.edu/PISCES.php
-#
-#structure_dir:
-#You must define a directory containing pdb files with the
-#4 letter code as written in the pisces file and .pdb as an ending
-#e.g. 1AKE.pdb
-#
-#hmm_dir
-#You must define a directory containing the hmm for every unique
-#chain in the hhm format. Naming is: pdb_id + chain_name + .hhm
-# e.g. 1AKEA.hhm
+        atoms_to_delete.append(a)
+      else:
+        a.is_hetatom = False
 
-structure_db = loop.StructureDB()
-prot_data = open('cullpdb_pc30_res2.0_R0.25_d150314_chains8016','r').readlines()
-structure_dir = "/home/schdaude/workspace/promod_tests/generate_databases/structures"
-hmm_dir = "/home/schdaude/workspace/promod_tests/generate_databases/hmms"
-for i,line in enumerate(prot_data[1:]):
+    for a in atoms_to_delete:
+      ed.DeleteAtom(a)
+
+
+
+# This script generates a StructureDB using a list of nonredundant pdb
+# files. All pdb files get cleaned using some basic heuristics.
+# Please note that the values for the structure profile in the produced
+# db are only allocated and simply set to zero.
+# You therefore need to create an initial db, from which you can create
+# these structure profiles and set them in a second step.
+# This is described in more detail in the documentation.
+
+# To obtain the required pdb and hhm files, you need to run
+# the get_data_from_smtl.py script. As an alternative, you can fetch your
+# own structures from the pdb, obtain the hhm files using the hh-suite
+# (https://github.com/soedinglab/hh-suite) and hack this script so that
+# it uses your input directly.
+
+# file generated by get_data_from_smtl.py (or by yourself);
+# every line is required to be in the format: pdb_id chain_name
+# e.g. 5UZG A
+data = open("data_extracted_from_smtl.txt",'r').readlines()
+
+# directory containing the structures. Naming: pdb_id.pdb
+# (pdb_id in upper case!) e.g. 5UZG.pdb
+structure_dir = "structures"
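+
+# Basic input validation (our addition, a hedged sketch, not part of the
+# original script): every line must consist of exactly two whitespace
+# separated tokens, pdb_id and chain_name.
+for line_idx, line in enumerate(data):
+  if len(line.split()) != 2:
+    raise ValueError("malformed line %i in input data file" % line_idx)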
+
+# directory containing the hhm files, one per chain.
+# Naming: pdb_id+chain_id.hhm (pdb_id and chain_id in upper case!)
+# e.g. 5UZGA.hhm
+profile_dir = "hmms"
-  prot_id = line[:4]
-  chain_id = line[4]
-  if not os.path.exists(os.path.join(hmm_dir,prot_id+chain_id+".hhm")):
-    continue
+
+path_to_msms_binary = "/scicore/soft/apps/msms/2.6.1-linux-x86_64/msms.x86_64Linux2.2.6.1"
+
+
+# Do it!!
+structure_db = loop.StructureDB()
+
+for i,line in enumerate(data):
+
+  split_line = line.split()
+  prot_id = split_line[0]
+  chain_id = split_line[1]
 
   try:
     print 'processing: ',prot_id,' on line: ',i
-    prot = io.LoadPDB(os.path.join(structure_dir,prot_id+'.pdb')).Select('peptide=true')
+    prot_path = os.path.join(structure_dir, prot_id+'.pdb')
+    hhm_path = os.path.join(profile_dir, prot_id+chain_id+".hhm")
+
+    if not os.path.exists(prot_path):
+      print "Could not find structure file... skip..."
+      continue
+    if not os.path.exists(hhm_path):
+      print "Could not find hhm file... skip..."
+      continue
+
+    prot = io.LoadPDB(prot_path).Select('peptide=true')
+    hmm = io.LoadSequenceProfile(hhm_path)
+
     Clean(prot)
-    dssp.AssignDSSP(prot,extract_burial_status=True)
-    surf = msms.CalculateSurface(prot)[0]
-
-    try:
-      hmm = io.LoadSequenceProfile(os.path.join(hmm_dir,prot_id+chain_id+".hhm"))
-      prot = prot.Select("cname="+chain_id)
-      idx = structure_db.AddCoordinates(prot_id,chain_id,prot.chains[0],surf,hmm)
-    except:
-      traceback.print_exc()
+    AssignSecStruct(prot)
+    Accessibility(prot, algorithm=mol.alg.AccessibilityAlgorithm.DSSP)
+    surf = msms.CalculateSurface(prot, msms_exe=path_to_msms_binary)[0]
+    prot_single_chain = prot.Select("cname="+chain_id)
+    idx = structure_db.AddCoordinates(prot_id, chain_id,
+                                      prot_single_chain.chains[0], surf, hmm,
+                                      solvent_accessibility_string="asaAbs")
+
+  except:
+    traceback.print_exc()
diff --git a/extras/data_generation/structure_db/create_structure_profiles.py b/extras/data_generation/structure_db/create_structure_profiles.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec227549785f97790208abaa3764af16eb06ea6c
--- /dev/null
+++ b/extras/data_generation/structure_db/create_structure_profiles.py
@@ -0,0 +1,60 @@
+import os
+import sys
+from promod3 import loop
+from ost import seq
+
+if len(sys.argv) != 6:
+
+    params = ["structure_db_path", "structure_db_source_path",
+              "start", "end", "profile_db_out_path"]
+    print "USAGE: ost create_structure_profiles.py " + ' '.join(params)
+    sys.exit()
+
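+# Example invocation (hypothetical paths, for illustration only):
+#
+#   ost create_structure_profiles.py structure_db.dat \
+#       structure_db_source.dat 0 100 profile_dbs/0.dat
+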
+# Loads a StructureDB that can be arbitrarily large and estimates the
+# structure profiles for a subset of its entries, using the structural
+# information from another StructureDB. This other DB, the "source" DB,
+# should not be too large, because its size heavily influences computation
+# time.
+
+# structure_db_path: path to database for which you want to create
+#                    structure profiles
+#
+# structure_db_source_path: path to database from which you get the
+#                           structural information to calculate the profiles.
+#                           This database should not be much larger than 5000
+#                           entries in order to keep computation time in a
+#                           feasible range.
+#
+# start: index of entry in StructureDB at structure_db_path to start with
+#
+# end: index of entry in StructureDB at structure_db_path to end with
+#      (not inclusive)
+#
+# profile_db_out_path: all structure profiles will be stored in an
+#                      ost.seq.ProfileDB that will be dumped at this path
+
+structure_db_path = sys.argv[1]
+structure_db_source_path = sys.argv[2]
+start = int(sys.argv[3])
+end = int(sys.argv[4])
+profile_db_out_path = sys.argv[5]
+
+structure_db = loop.StructureDB.Load(structure_db_path)
+structure_db_source = loop.StructureDB.Load(structure_db_source_path)
+
+profile_db = seq.ProfileDB()
+
+for i in range(start,end):
+
+    print "processing chain with idx", i
+
+    # generate fragment info object covering the full chain
+    coord_info = structure_db.GetCoordInfo(i)
+    fragment_info = loop.FragmentInfo(i, 0, coord_info.size)
+
+    # generate the structure profile given the source StructureDB
+    profile = structure_db.GenerateStructureProfile(structure_db_source,
+                                                    fragment_info)
+    profile_db.AddProfile(coord_info.pdb_id, profile)
+
+profile_db.Save(profile_db_out_path)
diff --git a/extras/data_generation/structure_db/get_data_from_smtl.py b/extras/data_generation/structure_db/get_data_from_smtl.py
new file mode 100644
index 0000000000000000000000000000000000000000..a881e5206502bd07fe11234d0162d7e4dc274fa7
--- /dev/null
+++ b/extras/data_generation/structure_db/get_data_from_smtl.py
@@ -0,0 +1,104 @@
+import os
+import traceback
+import ost
+from sm import smtl
+
+# This is how we extract all required data for a StructureDB from the
+# SWISS-MODEL template library, given a non-redundant set of proteins as
+# pulled from PISCES: http://dunbrack.fccc.edu/PISCES.php.
+# If you have no working SWISS-MODEL installation, you have two possibilities:
+#
+# 1. Ping someone from the mighty SCHWEDE-Lab to run this script for you
+#    with your desired PISCES file
+#
+# 2. Fetch the data manually from the PDB and obtain the profiles with
+#    the hh-suite available at: https://github.com/soedinglab/hh-suite
+
+pisces_path = "cullpdb_pc20_res1.8_R0.25_d170820_chains4892"
+smtl_path = "/scicore/data/databases/SMTL/"
+
+profile_out_dir = "hmms"
+structure_out_dir = "structures"
+
+tpl_lib = smtl.SMTL(smtl_path)
+pisces_data = open(pisces_path).readlines()
+
+# We have to do some mapping magic, since the chain names of SMTL and pdb
+# do not match. The PISCES data however specifies the pdb names.
+# In here we store pdb id and chain name relative to the SMTL.
+fetched_structures = list()
+
+
+if not os.path.exists(profile_out_dir):
+    os.makedirs(profile_out_dir)
+if not os.path.exists(structure_out_dir):
+    os.makedirs(structure_out_dir)
+
+
+for i,line in enumerate(pisces_data[1:]):
+
+    prot_id = line[:4]
+    chain_id = line[4]
+    print "processing ", prot_id, chain_id
+
+    try:
+        bus = tpl_lib.GetAll(prot_id.upper())
+    except:
+        print "could not find entry for ", prot_id, chain_id
+        continue
+
+    bu = None
+    if len(bus) == 0:
+        print "did not find any valid biounit!"
+        continue
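+    # Note (descriptive comment added for clarity): a pdb entry can come
+    # with several biounits. The selection below prefers one whose details
+    # mention both "author" and "software", i.e. an assembly on which
+    # depositor annotation and annotation software agree.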
+    if len(bus) == 1:
+        bu = bus[0]
+    else:
+        # If there are several biounits, we search for the one that is
+        # likely to have the relevant oligomeric state.
+        bu = bus[0]
+        for b in bus:
+            if "author" in b.details and "software" in b.details:
+                bu = b
+                break
+
+    for polypeptide in bu.polypeptides:
+
+        if polypeptide.is_polypeptide:
+            found_chain = False
+
+            for ch in polypeptide.chains:
+
+                if ch.orig_pdb_name == chain_id:
+
+                    unique_name = ch.unique_id
+                    structure = ch.structure
+                    profile_path = tpl_lib.ProfileForTemplate(unique_name)
+
+                    profile_out_path = os.path.join(profile_out_dir,
+                                                    prot_id+ch.name+".hhm")
+                    structure_out_path = os.path.join(structure_out_dir,
+                                                      prot_id+".pdb")
+
+                    # copy over the profile
+                    os.system(' '.join(["cp", profile_path, profile_out_path]))
+
+                    # get the structure
+                    full_structure = bu.GetSubstructure(bu.polypeptides)
+                    ost.io.SavePDB(full_structure, structure_out_path)
+                    found_chain = True
+
+                    # add the actually extracted pdb id and chain name
+                    fetched_structures.append((prot_id, ch.name))
+
+                    print "found the according data..."
+                    break
+
+            if found_chain:
+                break
+
+
+outfile = open("data_extracted_from_smtl.txt",'w')
+outfile.write('\n'.join([(x[0] + ' ' + x[1]) for x in fetched_structures]))
+outfile.close()
diff --git a/extras/data_generation/structure_db/submit_structure_profile_calculations.py b/extras/data_generation/structure_db/submit_structure_profile_calculations.py
new file mode 100644
index 0000000000000000000000000000000000000000..87f26a5f99cd2df34a38958f6a5f8acaf54a7fd2
--- /dev/null
+++ b/extras/data_generation/structure_db/submit_structure_profile_calculations.py
@@ -0,0 +1,115 @@
+import os, math
+from promod3 import loop
+
+# path to StructureDB for which you want to calculate the profiles
+# (relative to current working directory!)
+structure_db_path = "structure_db.dat"
+
+# path to StructureDB from which you extract the structural information.
+# If the StructureDB above is very large, you might want to provide a
+# smaller database here to limit CPU time. A database with around 5000
+# chains should be sufficient.
+# (relative to current working directory!)
+structure_db_source_path = "structure_db.dat"
+
+# the profiles that get generated end up here
+# (relative to current working directory!)
+out_dir = "profile_dbs"
+
+# number of jobs
+num_jobs = 100
+
+# all stdout, stderr and submission script files will go here
+# (relative to current working directory!)
+submission_workspace = "submission_workspace"
+
+
+# path to the pm bash script.
+# Once we have the latest and greatest ProMod3 module, this should not
+# be necessary anymore and we can simply call pm.
+pm_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/bin/pm"
+
+
+# This is stuff relevant for the submission script
+max_runtime = "13:00:00"
+membycore = "3G"
+my_email = "gabriel.studer@unibas.ch"
+
+# modules to load; once ost and promod3 are released as modules, loading
+# those two will be all that's needed
+required_modules = ["Boost/1.53.0-goolf-1.4.10-Python-2.7.5",
+                    "Python/2.7.5-goolf-1.4.10",
+                    "OpenMM/7.1.1-goolf-1.4.10-Python-2.7.5"]
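+
+# For reference, a submission script generated by the loop below will look
+# roughly like this (an illustrative sketch; exact paths and indices depend
+# on the settings above):
+#
+#   #!/bin/bash
+#   #$ -o <cwd>/submission_workspace/0.stdout
+#   #$ -e <cwd>/submission_workspace/0.stderr
+#   #$ -l runtime=13:00:00
+#   #$ -l membycore=3G
+#   #$ -m as -M <my_email>
+#   ml Boost/1.53.0-goolf-1.4.10-Python-2.7.5
+#   ml Python/2.7.5-goolf-1.4.10
+#   ml OpenMM/7.1.1-goolf-1.4.10-Python-2.7.5
+#   export PYTHONPATH=<ost_python_path>:$PYTHONPATH
+#   export PYTHONPATH=<promod_python_path>:$PYTHONPATH
+#   <pm_path> <cwd>/create_structure_profiles.py <cwd>/structure_db.dat \
+#       <cwd>/structure_db.dat 0 42 <cwd>/profile_dbs/0.dat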
+
+# these explicit PYTHONPATH entries won't be necessary anymore as soon as
+# there are ost and promod3 modules
+ost_python_path = "/scicore/home/schwede/studga00/prog/ost_dev/build/stage/lib64/python2.7/site-packages"
+promod_python_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/lib64/python2.7/site-packages"
+
+
+#################################################################
+# NO EDITING BELOW THIS POINT (EXCEPT ON DIFFERENT SYSTEMS ;) ) #
+#################################################################
+
+if not os.path.exists(submission_workspace):
+    os.makedirs(submission_workspace)
+
+if not os.path.exists(out_dir):
+    os.makedirs(out_dir)
+
+structure_db = loop.StructureDB.Load(structure_db_path)
+num_coords = structure_db.GetNumCoords()
+chunk_size = math.ceil(float(num_coords) / num_jobs)
+current_start = 0
+current_end = chunk_size
+
+for job_idx in range(num_jobs):
+
+    start = current_start
+    end = current_end
+    if end > num_coords:
+        end = num_coords
+
+    # we estimate the chunk size with a ceil => we might get away with
+    # fewer jobs than requested...
+    if start >= end:
+        break
+
+    start = int(start)
+    end = int(end)
+
+    current_start += chunk_size
+    current_end += chunk_size
+
+    cmd = [pm_path,
+           os.path.join(os.getcwd(), "create_structure_profiles.py"),
+           os.path.join(os.getcwd(), structure_db_path),
+           os.path.join(os.getcwd(), structure_db_source_path),
+           str(start), str(end),
+           os.path.join(os.getcwd(), out_dir, str(job_idx)+".dat")]
+
+    stdout_path = os.path.join(os.getcwd(), submission_workspace,
+                               str(job_idx) + ".stdout")
+    stderr_path = os.path.join(os.getcwd(), submission_workspace,
+                               str(job_idx) + ".stderr")
+
+    content = list()
+    content.append("#!/bin/bash")
+    content.append("#$ -o " + stdout_path)
+    content.append("#$ -e " + stderr_path)
+    content.append("#$ -l runtime=" + max_runtime)
+    content.append("#$ -l membycore=" + membycore)
+    content.append("#$ -m as -M " + my_email)
+
+    for module in required_modules:
+        content.append("ml " + module)
+
+    content.append("export PYTHONPATH=" + ost_python_path + ":$PYTHONPATH")
+    content.append("export PYTHONPATH=" + promod_python_path + ":$PYTHONPATH")
+    content.append(' '.join(cmd))
+
+    s_script_path = os.path.join(submission_workspace,
+                                 "sub_"+str(job_idx)+".sh")
+    outfile = open(s_script_path,'w')
+    outfile.write('\n'.join(content))
+    outfile.close()
+
+    # FIRE!!!
+    os.system("qsub " + s_script_path)
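+
+# Once all jobs are done, you can quickly verify that every chunk produced
+# its profile database before running assign_structure_profiles.py
+# (a minimal sketch, our addition; run it separately, e.g. in an ost shell):
+#
+#   import os
+#   produced = [f for f in os.listdir("profile_dbs") if f.endswith(".dat")]
+#   print len(produced), "profile databases found"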