Skip to content
Snippets Groups Projects
Commit ea595ae6 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

scripts to help you generate a StructureDB including the structure profiles

parent 2acd83f0
Branches
Tags
No related merge requests found
We provide several scripts to obtain a StructureDB.
More information regarding the different steps you'll find in the comments
in the corresponding scripts.
Steps to obtain a structural database:
1. Get the structure/profile data: preferably you want a non redundant set of
structures to generate a StructureDB. We suggest to cull a list of pdb entries
from PISCES: http://dunbrack.fccc.edu/PISCES.php
The get_data_from_smtl.py script directly reads in such a file and produces
the input for the second step in StructureDB generation.
2. Build an initial database: use build_structure_db.py with input generated
in step one.
IF YOU NEVER EVER REQUIRE A STRUCTURE PROFILE, YOU'RE ALREADY DONE!
BUT BE AWARE, THE ACCORDING FREQUENCIES IN THE DATABASE WILL BE SET TO 0.0!
3. Calculate structure profiles: use create_structure_profiles.py.
The script calculates the structure profiles for a subset of the
sequences in the desired StructureDB and is intended for parallelization
4. Parallelize step 3: use submit_structure_profile_calculations.py
This script uses the SGE submission on BC2 and you might have to
adapt for your system
5. Assign the structure profiles generated in step 3 and 4 to the
initial database generated in step 2:
use assign_structure_profiles.py to perform this task
import os
from promod3 import loop
# Path to StructureDB to which you want to assign the profile information.
# (relative to current working directory!)
db_path = "structure_db.dat"
# Directory that contains the profile databases (and only profile databases)
# as generated by the create_structure_profiles.py script.
# (relative to current working directory)
profile_dir = "profile_dbs"
# path to store the StructureDB with assigned_profiles
# (relative to current working directory!)
db_out_path = "structure_db_with_structure_profiles.dat"
structure_db = loop.StructureDB.Load(db_path)
# lets gather all profiles in one ProfileDB
profile_db_files = os.listdir(profile_dir)
profile_db = ost.seq.ProfileDB()
for f in profile_db_files:
db_path = os.path.join(profile_dir, f)
loaded_db = ost.seq.ProfileDB.Load(db_path)
entry_names = loaded_db.GetNames()
for name in entry_names:
profile_db.AddProfile(name, loaded_db.GetProfile(name))
print name
# Assign the profiles
for coord_idx in range(structure_db.GetNumCoords()):
coord_info = structure_db.GetCoordInfo(coord_idx)
entry_id = coord_info.pdb_id
structure_profile = None
try:
structure_profile = profile_db.GetProfile(entry_id)
except:
print "Could not find structure profile for ", entry_id, " skip..."
continue
structure_db.SetStructureProfile(coord_idx, structure_profile)
# DONE!
structure_db.Save(db_out_path)
from promod3 import loop from promod3 import loop
import os import os
import pickle
import traceback import traceback
from ost import io from ost import io
from ost.bindings import dssp
from ost.bindings import msms from ost.bindings import msms
from ost.mol.alg import AssignSecStruct
from ost.mol.alg import Accessibility
###############
# SCROLL DOWN #
###############
bb_atoms = ["N","CA","C","O","CB"] bb_atoms = ["N","CA","C","O","CB"]
...@@ -90,62 +93,87 @@ def Clean(prot): ...@@ -90,62 +93,87 @@ def Clean(prot):
r = r_view.handle r = r_view.handle
ed.RenameResidue(r,parent_residues[r.GetName()]) ed.RenameResidue(r,parent_residues[r.GetName()])
r.SetOneLetterCode(one_letter_codes[r.GetName()]) r.SetOneLetterCode(one_letter_codes[r.GetName()])
atoms_to_delete = list()
for a in r.atoms: for a in r.atoms:
if a.GetName() not in bb_atoms: if a.GetName() not in bb_atoms:
ed.DeleteAtom(a) atoms_to_delete.append(a)
else:
a.is_hetatom=False
#This script generates a StructureDB using a list of nonredundant pdb
#files. All pdb files get cleaned using some basic heuristics.
#You need the tool dssp and msms somewhere in you path.
#please note, that the values for the structure profile in the produced
#db are only allocated and simply set to zero.
#You therefore need to create an initial db, from which you can create
#these structure profiles and set them in a second step.
#This should be pretty well described in the documentation.
#file leading to prot_data:
#list of nonredundant pdb files pulled from:
#http://dunbrack.fccc.edu/PISCES.php
#
#structure_dir:
#You must define a directory containing pdb files with the
#4 letter code as written in the pisces file and .pdb as an ending
#e.g. 1AKE.pdb
#
#hmm_dir
#You must define a directory containing the hmm for every unique
#chain in the hhm format. Naming is: pdb_id + chain_name + .hhm
# e.g. 1AKEA.hhm
structure_db = loop.StructureDB()
prot_data = open('cullpdb_pc30_res2.0_R0.25_d150314_chains8016','r').readlines()
structure_dir = "/home/schdaude/workspace/promod_tests/generate_databases/structures"
hmm_dir = "/home/schdaude/workspace/promod_tests/generate_databases/hmms"
for i,line in enumerate(prot_data[1:]): for a in atoms_to_delete:
ed.DeleteAtom(a)
# This script generates a StructureDB using a list of nonredundant pdb
# files. All pdb files get cleaned using some basic heuristics.
# please note, that the values for the structure profile in the produced
# db are only allocated and simply set to zero.
# You therefore need to create an initial db, from which you can create
# these structure profiles and set them in a second step.
# This should be pretty well described in the documentation.
# To obtain the required pdb and hhm files, you need to run
# the get_data_from_smtl.py script. As an alternative you fetch your own
# structures from the pdb and obtain the hhm file using the hh-suite
# (https://github.com/soedinglab/hh-suite) and hack this script that
# it uses your input directly.
# file generated by get_data_from_smtl.py (or by yourself)
# every line requires to be in the format: pdb_id chain_name
# e.g. 5UZG A
data = open("data_extracted_from_smtl.txt",'r').readlines()
# directory containing the structures. naming: pdb_id.pdb
# (pdb_id in upper case!) e.g. 5UZG.pdb
structure_dir = "structures"
# directory containing the hhm files for chain. naming pdb_id+chain_id.hhm
# (pdb_id and chain_id in upper case!) e.g. 5UZGA.hhm
profile_dir = "hmms"
prot_id = line[:4]
chain_id = line[4]
if not os.path.exists(os.path.join(hmm_dir,prot_id+chain_id+".hhm")): path_to_msms_binary = "/scicore/soft/apps/msms/2.6.1-linux-x86_64/msms.x86_64Linux2.2.6.1"
continue
# Do it!!
structure_db = loop.StructureDB()
for i,line in enumerate(data):
split_line = line.split()
prot_id = split_line[0]
chain_id = split_line[1]
try: try:
print 'processing: ',prot_id,' on line: ',i print 'processing: ',prot_id,' on line: ',i
prot = io.LoadPDB(os.path.join(structure_dir,prot_id+'.pdb')).Select('peptide=true') prot_path = os.path.join(structure_dir, prot_id+'.pdb')
hhm_path = os.path.join(profile_dir,prot_id+chain_id+".hhm")
if not os.path.exists(prot_path):
print "Could not find structure file... skip..."
continue
if not os.path.exists(prot_path):
print "Could not find hhm file... skip..."
continue
prot = io.LoadPDB(prot_path).Select('peptide=true')
hmm = io.LoadSequenceProfile(hhm_path)
Clean(prot) Clean(prot)
dssp.AssignDSSP(prot,extract_burial_status=True) AssignSecStruct(prot)
surf = msms.CalculateSurface(prot)[0] Accessibility(prot, algorithm=mol.alg.AccessibilityAlgorithm.DSSP)
surf = msms.CalculateSurface(prot, msms_exe=path_to_msms_binary)[0]
try: prot_single_chain = prot.Select("cname="+chain_id)
hmm = io.LoadSequenceProfile(os.path.join(hmm_dir,prot_id+chain_id+".hhm")) idx = structure_db.AddCoordinates(prot_id, chain_id,
prot = prot.Select("cname="+chain_id) prot_single_chain.chains[0], surf, hmm,
idx = structure_db.AddCoordinates(prot_id,chain_id,prot.chains[0],surf,hmm) solvent_accessibility_string="asaAbs")
except:
traceback.print_exc()
except: except:
traceback.print_exc() traceback.print_exc()
......
import os
import sys
from promod3 import loop
from ost import seq
import time
if len(sys.argv) != 6:
params = ["structure_db_path", "structure_db_source_path",\
"start", "end", "profile_db_out_path"]
print "USAGE: ost create_structure_profiles.py " + ' '.join(params)
sys.exit()
# loads a StructureDB that can be arbitrarily large and estimates the
# structure profile for a subset of its entries with the structural information
# from another StructureDB. This other DB, the "source" DB should not be too
# large because its size heavily influences compuation time.
# structure_db_path: path to database for which you want to create
# StructureProfiles
#
# structure_db_source_path: path to database from which you get the
# structural information to calculate the profiles
# this database should not be much larger than 5000
# entries in order to keep computation time in a
# feasible range
#
# start: index of entry in StructureDB at structure_db_path to start with
#
# end: index of entry in StructureDB at structure_db_path to end with
# (not inclusive)
#
# profile_db_out_path: All structure profiles will be stored in a
# ost.seq.HMMDB() that will be dumped at this path
structure_db_path = sys.argv[1]
structure_db_source_path = sys.argv[2]
start = int(sys.argv[3])
end = int(sys.argv[4])
profile_db_out_path = sys.argv[5]
structure_db = loop.StructureDB.Load(structure_db_path)
structure_db_source = loop.StructureDB.Load(structure_db_source_path)
profile_db = seq.ProfileDB()
for i in range(start,end):
print "processing chain with idx", i
# generate fragment info object
coord_info = structure_db.GetCoordInfo(i)
fragment_info = loop.FragmentInfo(i,0,coord_info.size)
# generate the structure profile given the source StructureDB
profile = structure_db.GenerateStructureProfile(structure_db_source, fragment_info)
profile_db.AddProfile(coord_info.pdb_id, profile)
profile_db.Save(profile_db_out_path)
import time
import os
import pickle
import traceback
from sm import smtl
# This is how we extract all required data for a StructureDB from the
# SWISS-MODEL template library given a non redundant set of proteins as
# pulled from Pisces: http://dunbrack.fccc.edu/PISCES.php.
# If you have no working SWISS-MODEL installation you have two possibilities:
#
# 1. Ping someone from the mighty SCHWEDE-Lab to run this script for you
# with your desired PISCES file
#
# 2. Fetch the data manually from the PDB and obtain the profiles with
# the hh-suite available at: https://github.com/soedinglab/hh-suite.
pisces_path = "cullpdb_pc20_res1.8_R0.25_d170820_chains4892"
smtl_path = "/scicore/data/databases/SMTL/"
profile_out_dir = "hmms"
structure_out_dir = "structures"
tpl_lib = smtl.SMTL(smtl_path)
pisces_data = open(pisces_path).readlines()
# We have to do some mapping magic, since the chain names of SMTL and pdb
# do not match. The PISCES data however specifies the pdb names
# in here we store pdb id and chain name relative to the SMTL
fetched_structures = list()
if not os.path.exists(profile_out_dir):
os.makedirs(profile_out_dir)
if not os.path.exists(structure_out_dir):
os.makedirs(structure_out_dir)
for i,line in enumerate(pisces_data[1:]):
prot_id = line[:4]
chain_id = line[4]
print "processing ", prot_id, chain_id
try:
bus = tpl_lib.GetAll(prot_id.upper())
except:
print "could not find entry for ", prot_id, chain_id
continue
bu = None
if len(bus) == 0:
print "didnt find any valid biounit!"
continue
if len(bus) == 1:
bu = bus[0]
else:
# If there are several biounits we search for the one that
# is likely to have the relevant oligomeric state
bu = bus[0]
for b in bus:
if "author" in b.details and "software" in b.details:
bu = b
break
for polypeptide in bu.polypeptides:
if polypeptide.is_polypeptide:
found_chain = False
for ch in polypeptide.chains:
if ch.orig_pdb_name == chain_id:
unique_name = ch.unique_id
structure = ch.structure
profile_path = tpl_lib.ProfileForTemplate(unique_name)
profile_out_path = os.path.join(profile_out_dir, prot_id+ch.name+".hhm")
structure_out_path = os.path.join(structure_out_dir, prot_id+".pdb")
# copy over the profile
os.system(' '.join(["cp", profile_path, profile_out_path]))
# get the structure
full_structure = bu.GetSubstructure(bu.polypeptides)
ost.io.SavePDB(full_structure,structure_out_path)
found_chain=True
# add the actually extracted pdb id and chain name
fetched_structures.append((prot_id, ch.name))
print "found the according data..."
break
if found_chain:
break
outfile = open("data_extracted_from_smtl.txt",'w')
outfile.write('\n'.join([(x[0] + ' ' + x[1]) for x in fetched_structures]))
outfile.close()
import os, math
from promod3 import loop
# path to StructureDB for which you want to calculate the profiles:
# (relative to current working directory!)
structure_db_path = "structure_db.dat"
# path to StructureDB from which you extract the structural information
# if the StructureDB above is very large, you might want to give it a smaller
# database here to limit CPU time. A database with around 5000 chains should
# be sufficient
# (relative to current working directory!)
structure_db_source_path = "structure_db.dat"
# the profiles that get generated end up here:
# (relative to current working directory!)
out_dir = "profile_dbs"
# number of jobs
num_jobs = 100
# all stdout, stderr and submission_script files will go here
# (relative to current working directory!)
submission_workspace = "submission_workspace"
# path to the pm bash script
# Once we have the latest and greatest ProMod3 module, this should not
# be necessary anymore and we can simply call pm
pm_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/bin/pm"
# This is stuff relevant for the submission script
max_runtime = "13:00:00"
membycore = "3G"
my_email = "gabriel.studer@unibas.ch"
# we only need ost and promod3 as soon as they're released...
required_modules = ["Boost/1.53.0-goolf-1.4.10-Python-2.7.5",
"Python/2.7.5-goolf-1.4.10",
"OpenMM/7.1.1-goolf-1.4.10-Python-2.7.5"]
# they won't be necessary anymore as soon as there are ost
# and promod3 modules
ost_python_path = "/scicore/home/schwede/studga00/prog/ost_dev/build/stage/lib64/python2.7/site-packages"
promod_python_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/lib64/python2.7/site-packages"
#################################################################
# NO EDITING BELOW THIS POINT (EXCEPT ON DIFFERENT SYSTEMS ;) ) #
#################################################################
if not os.path.exists(submission_workspace):
os.makedirs(submission_workspace)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
structure_db = loop.StructureDB.Load(structure_db_path)
num_coords = structure_db.GetNumCoords()
chunk_size = math.ceil(float(num_coords ) / num_jobs)
current_start = 0
current_end = chunk_size
for job_idx in range(num_jobs):
start = current_start
end = current_end
if end > num_coords:
end = num_coords
# we estimate the chunk size with a ceil => we might get away with even
# less jobs...
if start >= end:
break
start = int(start)
end = int(end)
current_start += chunk_size
current_end += chunk_size
cmd = [pm_path, \
os.path.join(os.getcwd(),"create_structure_profiles.py"), \
os.path.join(os.getcwd(),structure_db_path), \
os.path.join(os.getcwd(),structure_db_source_path), \
str(start), str(end), \
os.path.join(os.getcwd(), out_dir, str(job_idx)+".dat")]
stdout_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stdout")
stderr_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stderr")
content = list()
content.append("#!/bin/bash")
content.append("#$ -o " + stdout_path)
content.append("#$ -e " + stderr_path)
content.append("#$ -l runtime=" + max_runtime)
content.append("#$ -l membycore=" + membycore)
content.append("#$ -m as -M " + my_email)
for module in required_modules:
content.append("ml " + module)
content.append("export PYTHONPATH=" + ost_python_path + ":PYTHONPATH")
content.append("export PYTHONPATH=" + promod_python_path + ":PYTHONPATH")
content.append(' '.join(cmd))
s_script_path = os.path.join(submission_workspace, "sub_"+str(job_idx)+".sh")
outfile = open(s_script_path,'w')
outfile.write('\n'.join(content))
outfile.close()
# FIRE!!!
os.system("qsub "+ s_script_path)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment