Skip to content
Snippets Groups Projects
Commit ea595ae6 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

scripts to help you generate a StructureDB including the structure profiles

parent 2acd83f0
No related branches found
No related tags found
No related merge requests found
We provide several scripts to obtain a StructureDB.
More information regarding the different steps you'll find in the comments
in the corresponding scripts.
Steps to obtain a structural database:
1. Get the structure/profile data: preferably you want a non redundant set of
structures to generate a StructureDB. We suggest to cull a list of pdb entries
from PISCES: http://dunbrack.fccc.edu/PISCES.php
The get_data_from_smtl.py script directly reads in such a file and produces
the input for the second step in StructureDB generation.
2. Build an initial database: use build_structure_db.py with input generated
in step one.
IF YOU NEVER EVER REQUIRE A STRUCTURE PROFILE, YOU'RE ALREADY DONE!
BUT BE AWARE, THE ACCORDING FREQUENCIES IN THE DATABASE WILL BE SET TO 0.0!
3. Calculate structure profiles: use create_structure_profiles.py.
The script calculates the structure profiles for a subset of the
sequences in the desired StructureDB and is intended for parallelization
4. Parallelize step 3: use submit_structure_profile_calculations.py
This script uses the SGE submission on BC2 and you might have to
adapt for your system
5. Assign the structure profiles generated in step 3 and 4 to the
initial database generated in step 2:
use assign_structure_profiles.py to perform this task
import os
from promod3 import loop
# Path to StructureDB to which you want to assign the profile information.
# (relative to current working directory!)
db_path = "structure_db.dat"
# Directory that contains the profile databases (and only profile databases)
# as generated by the create_structure_profiles.py script.
# (relative to current working directory)
profile_dir = "profile_dbs"
# path to store the StructureDB with assigned_profiles
# (relative to current working directory!)
db_out_path = "structure_db_with_structure_profiles.dat"
structure_db = loop.StructureDB.Load(db_path)
# lets gather all profiles in one ProfileDB
profile_db_files = os.listdir(profile_dir)
profile_db = ost.seq.ProfileDB()
for f in profile_db_files:
db_path = os.path.join(profile_dir, f)
loaded_db = ost.seq.ProfileDB.Load(db_path)
entry_names = loaded_db.GetNames()
for name in entry_names:
profile_db.AddProfile(name, loaded_db.GetProfile(name))
print name
# Assign the profiles
for coord_idx in range(structure_db.GetNumCoords()):
coord_info = structure_db.GetCoordInfo(coord_idx)
entry_id = coord_info.pdb_id
structure_profile = None
try:
structure_profile = profile_db.GetProfile(entry_id)
except:
print "Could not find structure profile for ", entry_id, " skip..."
continue
structure_db.SetStructureProfile(coord_idx, structure_profile)
# DONE!
structure_db.Save(db_out_path)
from promod3 import loop
import os
import pickle
import traceback
from ost import io
from ost.bindings import dssp
from ost.bindings import msms
from ost.mol.alg import AssignSecStruct
from ost.mol.alg import Accessibility
###############
# SCROLL DOWN #
###############
bb_atoms = ["N","CA","C","O","CB"]
......@@ -90,62 +93,87 @@ def Clean(prot):
r = r_view.handle
ed.RenameResidue(r,parent_residues[r.GetName()])
r.SetOneLetterCode(one_letter_codes[r.GetName()])
atoms_to_delete = list()
for a in r.atoms:
if a.GetName() not in bb_atoms:
ed.DeleteAtom(a)
#This script generates a StructureDB using a list of nonredundant pdb
#files. All pdb files get cleaned using some basic heuristics.
#You need the tool dssp and msms somewhere in you path.
#please note, that the values for the structure profile in the produced
#db are only allocated and simply set to zero.
#You therefore need to create an initial db, from which you can create
#these structure profiles and set them in a second step.
#This should be pretty well described in the documentation.
#file leading to prot_data:
#list of nonredundant pdb files pulled from:
#http://dunbrack.fccc.edu/PISCES.php
#
#structure_dir:
#You must define a directory containing pdb files with the
#4 letter code as written in the pisces file and .pdb as an ending
#e.g. 1AKE.pdb
#
#hmm_dir
#You must define a directory containing the hmm for every unique
#chain in the hhm format. Naming is: pdb_id + chain_name + .hhm
# e.g. 1AKEA.hhm
atoms_to_delete.append(a)
else:
a.is_hetatom=False
structure_db = loop.StructureDB()
prot_data = open('cullpdb_pc30_res2.0_R0.25_d150314_chains8016','r').readlines()
structure_dir = "/home/schdaude/workspace/promod_tests/generate_databases/structures"
hmm_dir = "/home/schdaude/workspace/promod_tests/generate_databases/hmms"
for i,line in enumerate(prot_data[1:]):
for a in atoms_to_delete:
ed.DeleteAtom(a)
# This script generates a StructureDB using a list of nonredundant pdb
# files. All pdb files get cleaned using some basic heuristics.
# please note, that the values for the structure profile in the produced
# db are only allocated and simply set to zero.
# You therefore need to create an initial db, from which you can create
# these structure profiles and set them in a second step.
# This should be pretty well described in the documentation.
# To obtain the required pdb and hhm files, you need to run
# the get_data_from_smtl.py script. As an alternative you fetch your own
# structures from the pdb and obtain the hhm file using the hh-suite
# (https://github.com/soedinglab/hh-suite) and hack this script that
# it uses your input directly.
# file generated by get_data_from_smtl.py (or by yourself)
# every line requires to be in the format: pdb_id chain_name
# e.g. 5UZG A
data = open("data_extracted_from_smtl.txt",'r').readlines()
# directory containing the structures. naming: pdb_id.pdb
# (pdb_id in upper case!) e.g. 5UZG.pdb
structure_dir = "structures"
# directory containing the hhm files for chain. naming pdb_id+chain_id.hhm
# (pdb_id and chain_id in upper case!) e.g. 5UZGA.hhm
profile_dir = "hmms"
prot_id = line[:4]
chain_id = line[4]
if not os.path.exists(os.path.join(hmm_dir,prot_id+chain_id+".hhm")):
continue
path_to_msms_binary = "/scicore/soft/apps/msms/2.6.1-linux-x86_64/msms.x86_64Linux2.2.6.1"
# Do it!!
structure_db = loop.StructureDB()
for i,line in enumerate(data):
split_line = line.split()
prot_id = split_line[0]
chain_id = split_line[1]
try:
print 'processing: ',prot_id,' on line: ',i
prot = io.LoadPDB(os.path.join(structure_dir,prot_id+'.pdb')).Select('peptide=true')
prot_path = os.path.join(structure_dir, prot_id+'.pdb')
hhm_path = os.path.join(profile_dir,prot_id+chain_id+".hhm")
if not os.path.exists(prot_path):
print "Could not find structure file... skip..."
continue
if not os.path.exists(prot_path):
print "Could not find hhm file... skip..."
continue
prot = io.LoadPDB(prot_path).Select('peptide=true')
hmm = io.LoadSequenceProfile(hhm_path)
Clean(prot)
dssp.AssignDSSP(prot,extract_burial_status=True)
surf = msms.CalculateSurface(prot)[0]
try:
hmm = io.LoadSequenceProfile(os.path.join(hmm_dir,prot_id+chain_id+".hhm"))
prot = prot.Select("cname="+chain_id)
idx = structure_db.AddCoordinates(prot_id,chain_id,prot.chains[0],surf,hmm)
except:
traceback.print_exc()
AssignSecStruct(prot)
Accessibility(prot, algorithm=mol.alg.AccessibilityAlgorithm.DSSP)
surf = msms.CalculateSurface(prot, msms_exe=path_to_msms_binary)[0]
prot_single_chain = prot.Select("cname="+chain_id)
idx = structure_db.AddCoordinates(prot_id, chain_id,
prot_single_chain.chains[0], surf, hmm,
solvent_accessibility_string="asaAbs")
except:
traceback.print_exc()
......
import os
import sys
from promod3 import loop
from ost import seq
import time
if len(sys.argv) != 6:
params = ["structure_db_path", "structure_db_source_path",\
"start", "end", "profile_db_out_path"]
print "USAGE: ost create_structure_profiles.py " + ' '.join(params)
sys.exit()
# loads a StructureDB that can be arbitrarily large and estimates the
# structure profile for a subset of its entries with the structural information
# from another StructureDB. This other DB, the "source" DB should not be too
# large because its size heavily influences compuation time.
# structure_db_path: path to database for which you want to create
# StructureProfiles
#
# structure_db_source_path: path to database from which you get the
# structural information to calculate the profiles
# this database should not be much larger than 5000
# entries in order to keep computation time in a
# feasible range
#
# start: index of entry in StructureDB at structure_db_path to start with
#
# end: index of entry in StructureDB at structure_db_path to end with
# (not inclusive)
#
# profile_db_out_path: All structure profiles will be stored in a
# ost.seq.HMMDB() that will be dumped at this path
structure_db_path = sys.argv[1]
structure_db_source_path = sys.argv[2]
start = int(sys.argv[3])
end = int(sys.argv[4])
profile_db_out_path = sys.argv[5]
structure_db = loop.StructureDB.Load(structure_db_path)
structure_db_source = loop.StructureDB.Load(structure_db_source_path)
profile_db = seq.ProfileDB()
for i in range(start,end):
print "processing chain with idx", i
# generate fragment info object
coord_info = structure_db.GetCoordInfo(i)
fragment_info = loop.FragmentInfo(i,0,coord_info.size)
# generate the structure profile given the source StructureDB
profile = structure_db.GenerateStructureProfile(structure_db_source, fragment_info)
profile_db.AddProfile(coord_info.pdb_id, profile)
profile_db.Save(profile_db_out_path)
import time
import os
import pickle
import traceback
from sm import smtl
# This is how we extract all required data for a StructureDB from the
# SWISS-MODEL template library given a non redundant set of proteins as
# pulled from Pisces: http://dunbrack.fccc.edu/PISCES.php.
# If you have no working SWISS-MODEL installation you have two possibilities:
#
# 1. Ping someone from the mighty SCHWEDE-Lab to run this script for you
# with your desired PISCES file
#
# 2. Fetch the data manually from the PDB and obtain the profiles with
# the hh-suite available at: https://github.com/soedinglab/hh-suite.
pisces_path = "cullpdb_pc20_res1.8_R0.25_d170820_chains4892"
smtl_path = "/scicore/data/databases/SMTL/"
profile_out_dir = "hmms"
structure_out_dir = "structures"
tpl_lib = smtl.SMTL(smtl_path)
pisces_data = open(pisces_path).readlines()
# We have to do some mapping magic, since the chain names of SMTL and pdb
# do not match. The PISCES data however specifies the pdb names
# in here we store pdb id and chain name relative to the SMTL
fetched_structures = list()
if not os.path.exists(profile_out_dir):
os.makedirs(profile_out_dir)
if not os.path.exists(structure_out_dir):
os.makedirs(structure_out_dir)
for i,line in enumerate(pisces_data[1:]):
prot_id = line[:4]
chain_id = line[4]
print "processing ", prot_id, chain_id
try:
bus = tpl_lib.GetAll(prot_id.upper())
except:
print "could not find entry for ", prot_id, chain_id
continue
bu = None
if len(bus) == 0:
print "didnt find any valid biounit!"
continue
if len(bus) == 1:
bu = bus[0]
else:
# If there are several biounits we search for the one that
# is likely to have the relevant oligomeric state
bu = bus[0]
for b in bus:
if "author" in b.details and "software" in b.details:
bu = b
break
for polypeptide in bu.polypeptides:
if polypeptide.is_polypeptide:
found_chain = False
for ch in polypeptide.chains:
if ch.orig_pdb_name == chain_id:
unique_name = ch.unique_id
structure = ch.structure
profile_path = tpl_lib.ProfileForTemplate(unique_name)
profile_out_path = os.path.join(profile_out_dir, prot_id+ch.name+".hhm")
structure_out_path = os.path.join(structure_out_dir, prot_id+".pdb")
# copy over the profile
os.system(' '.join(["cp", profile_path, profile_out_path]))
# get the structure
full_structure = bu.GetSubstructure(bu.polypeptides)
ost.io.SavePDB(full_structure,structure_out_path)
found_chain=True
# add the actually extracted pdb id and chain name
fetched_structures.append((prot_id, ch.name))
print "found the according data..."
break
if found_chain:
break
outfile = open("data_extracted_from_smtl.txt",'w')
outfile.write('\n'.join([(x[0] + ' ' + x[1]) for x in fetched_structures]))
outfile.close()
import os, math
from promod3 import loop
# path to StructureDB for which you want to calculate the profiles:
# (relative to current working directory!)
structure_db_path = "structure_db.dat"
# path to StructureDB from which you extract the structural information
# if the StructureDB above is very large, you might want to give it a smaller
# database here to limit CPU time. A database with around 5000 chains should
# be sufficient
# (relative to current working directory!)
structure_db_source_path = "structure_db.dat"
# the profiles that get generated end up here:
# (relative to current working directory!)
out_dir = "profile_dbs"
# number of jobs
num_jobs = 100
# all stdout, stderr and submission_script files will go here
# (relative to current working directory!)
submission_workspace = "submission_workspace"
# path to the pm bash script
# Once we have the latest and greatest ProMod3 module, this should not
# be necessary anymore and we can simply call pm
pm_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/bin/pm"
# This is stuff relevant for the submission script
max_runtime = "13:00:00"
membycore = "3G"
my_email = "gabriel.studer@unibas.ch"
# we only need ost and promod3 as soon as they're released...
required_modules = ["Boost/1.53.0-goolf-1.4.10-Python-2.7.5",
"Python/2.7.5-goolf-1.4.10",
"OpenMM/7.1.1-goolf-1.4.10-Python-2.7.5"]
# they won't be necessary anymore as soon as there are ost
# and promod3 modules
ost_python_path = "/scicore/home/schwede/studga00/prog/ost_dev/build/stage/lib64/python2.7/site-packages"
promod_python_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/lib64/python2.7/site-packages"
#################################################################
# NO EDITING BELOW THIS POINT (EXCEPT ON DIFFERENT SYSTEMS ;) ) #
#################################################################
if not os.path.exists(submission_workspace):
os.makedirs(submission_workspace)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
structure_db = loop.StructureDB.Load(structure_db_path)
num_coords = structure_db.GetNumCoords()
chunk_size = math.ceil(float(num_coords ) / num_jobs)
current_start = 0
current_end = chunk_size
for job_idx in range(num_jobs):
start = current_start
end = current_end
if end > num_coords:
end = num_coords
# we estimate the chunk size with a ceil => we might get away with even
# less jobs...
if start >= end:
break
start = int(start)
end = int(end)
current_start += chunk_size
current_end += chunk_size
cmd = [pm_path, \
os.path.join(os.getcwd(),"create_structure_profiles.py"), \
os.path.join(os.getcwd(),structure_db_path), \
os.path.join(os.getcwd(),structure_db_source_path), \
str(start), str(end), \
os.path.join(os.getcwd(), out_dir, str(job_idx)+".dat")]
stdout_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stdout")
stderr_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stderr")
content = list()
content.append("#!/bin/bash")
content.append("#$ -o " + stdout_path)
content.append("#$ -e " + stderr_path)
content.append("#$ -l runtime=" + max_runtime)
content.append("#$ -l membycore=" + membycore)
content.append("#$ -m as -M " + my_email)
for module in required_modules:
content.append("ml " + module)
content.append("export PYTHONPATH=" + ost_python_path + ":PYTHONPATH")
content.append("export PYTHONPATH=" + promod_python_path + ":PYTHONPATH")
content.append(' '.join(cmd))
s_script_path = os.path.join(submission_workspace, "sub_"+str(job_idx)+".sh")
outfile = open(s_script_path,'w')
outfile.write('\n'.join(content))
outfile.close()
# FIRE!!!
os.system("qsub "+ s_script_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment