scripts to help you generate a StructureDB including the structure profiles

ea595ae6 · Studer Gabriel · 2acd83f0 · ea595ae6 · ea595ae6 · ea595ae6
Commit ea595ae6 authored 7 years ago by Studer Gabriel
--- a/extras/data_generation/structure_db/README
+++ b/extras/data_generation/structure_db/README
+We provide several scripts to obtain a StructureDB. 
+More information regarding the different steps you'll find in the comments
+in the corresponding scripts.
+
+Steps to obtain a structural database:
+
+1. Get the structure/profile data: preferably you want a non redundant set of
+   structures to generate a StructureDB. We suggest to cull a list of pdb entries
+   from PISCES: http://dunbrack.fccc.edu/PISCES.php
+   The get_data_from_smtl.py script directly reads in such a file and produces
+   the input for the second step in StructureDB generation. 
+
+2. Build an initial database: use build_structure_db.py with input generated
+   in step one.
+
+IF YOU NEVER EVER REQUIRE A STRUCTURE PROFILE, YOU'RE ALREADY DONE!
+BUT BE AWARE, THE ACCORDING FREQUENCIES IN THE DATABASE WILL BE SET TO 0.0!
+
+3. Calculate structure profiles: use create_structure_profiles.py.
+   The script calculates the structure profiles for a subset of the
+   sequences in the desired StructureDB and is intended for parallelization
+   
+4. Parallelize step 3: use submit_structure_profile_calculations.py
+   This script uses the SGE submission on BC2 and you might have to 
+   adapt for your system
+
+5. Assign the structure profiles generated in step 3 and 4 to the 
+   initial database generated in step 2: 
+   use assign_structure_profiles.py to perform this task
+
--- a/extras/data_generation/structure_db/assign_structure_profiles.py
+++ b/extras/data_generation/structure_db/assign_structure_profiles.py
+import os
+from promod3 import loop
+
+
+# Path to StructureDB to which you want to assign the profile information.
+# (relative to current working directory!)
+db_path = "structure_db.dat"
+
+# Directory that contains the profile databases (and only profile databases)
+# as generated by the create_structure_profiles.py script.
+# (relative to current working directory)
+profile_dir = "profile_dbs"
+
+# path to store the StructureDB with assigned_profiles
+# (relative to current working directory!)
+db_out_path = "structure_db_with_structure_profiles.dat"
+
+
+structure_db = loop.StructureDB.Load(db_path)
+
+# lets gather all profiles in one ProfileDB
+profile_db_files = os.listdir(profile_dir)
+profile_db = ost.seq.ProfileDB()
+
+for f in profile_db_files:
+  db_path = os.path.join(profile_dir, f)
+  loaded_db = ost.seq.ProfileDB.Load(db_path)
+  entry_names = loaded_db.GetNames()
+  for name in entry_names:
+    profile_db.AddProfile(name, loaded_db.GetProfile(name))
+    print name
+
+# Assign the profiles
+for coord_idx in range(structure_db.GetNumCoords()):
+  coord_info = structure_db.GetCoordInfo(coord_idx)
+  entry_id = coord_info.pdb_id
+
+  structure_profile = None
+  try:
+    structure_profile = profile_db.GetProfile(entry_id)
+  except:
+    print "Could not find structure profile for ", entry_id, " skip..."
+    continue
+    
+  structure_db.SetStructureProfile(coord_idx, structure_profile)
+
+# DONE! 
+structure_db.Save(db_out_path)
+
--- a/extras/data_generation/structure_db/build_structure_db.py
+++ b/extras/data_generation/structure_db/build_structure_db.py
 from promod3 import loop
 import os
-import pickle
 import traceback
 from ost import io
-from ost.bindings import dssp
 from ost.bindings import msms
+from ost.mol.alg import AssignSecStruct
+from ost.mol.alg import Accessibility

+###############
+# SCROLL DOWN #
+###############

 bb_atoms = ["N","CA","C","O","CB"]

@@ -90,62 +93,87 @@ def Clean(prot):
      r = r_view.handle
      ed.RenameResidue(r,parent_residues[r.GetName()])
      r.SetOneLetterCode(one_letter_codes[r.GetName()])
+ 
+      atoms_to_delete = list()
+
      for a in r.atoms:
        if a.GetName() not in bb_atoms:
-          ed.DeleteAtom(a)       
-
-
-
-#This script generates a StructureDB using a list of nonredundant pdb
-#files. All pdb files get cleaned using some basic heuristics.
-#You need the tool dssp and msms somewhere in you path.
-#please note, that the values for the structure profile in the produced
-#db are only allocated and simply set to zero.
-#You therefore need to create an initial db, from which you can create
-#these structure profiles and set them in a second step.
-#This should be pretty well described in the documentation.
-
-#file leading to prot_data:
-#list of nonredundant pdb files pulled from:
-#http://dunbrack.fccc.edu/PISCES.php
-#
-#structure_dir:
-#You must define a directory containing pdb files with the
-#4 letter code as written in the pisces file and .pdb as an ending
-#e.g. 1AKE.pdb
-#
-#hmm_dir
-#You must define a directory containing the hmm for every unique
-#chain in the hhm format. Naming is: pdb_id + chain_name + .hhm
-# e.g. 1AKEA.hhm
+          atoms_to_delete.append(a)
+        else:
+          a.is_hetatom=False

-structure_db = loop.StructureDB()
-prot_data = open('cullpdb_pc30_res2.0_R0.25_d150314_chains8016','r').readlines()
-structure_dir = "/home/schdaude/workspace/promod_tests/generate_databases/structures"
-hmm_dir = "/home/schdaude/workspace/promod_tests/generate_databases/hmms"

-for i,line in enumerate(prot_data[1:]):
+      for a in atoms_to_delete:
+        ed.DeleteAtom(a)
+
+
+
+# This script generates a StructureDB using a list of nonredundant pdb
+# files. All pdb files get cleaned using some basic heuristics.
+# please note, that the values for the structure profile in the produced
+# db are only allocated and simply set to zero.
+# You therefore need to create an initial db, from which you can create
+# these structure profiles and set them in a second step.
+# This should be pretty well described in the documentation.
+
+# To obtain the required pdb and hhm files, you need to run 
+# the get_data_from_smtl.py script. As an alternative you fetch your own
+# structures from the pdb and obtain the hhm file using the hh-suite
+# (https://github.com/soedinglab/hh-suite) and hack this script that
+# it uses your input directly.
+
+# file generated by get_data_from_smtl.py (or by yourself)
+# every line requires to be in the format: pdb_id chain_name
+# e.g. 5UZG A
+data = open("data_extracted_from_smtl.txt",'r').readlines()
+
+# directory containing the structures. naming: pdb_id.pdb
+# (pdb_id in upper case!) e.g. 5UZG.pdb
+structure_dir = "structures"
+
+# directory containing the hhm files for chain. naming pdb_id+chain_id.hhm
+# (pdb_id and chain_id in upper case!) e.g. 5UZGA.hhm
+profile_dir = "hmms" 

-  prot_id = line[:4]
-  chain_id = line[4]

-  if not os.path.exists(os.path.join(hmm_dir,prot_id+chain_id+".hhm")):
-    continue
+path_to_msms_binary = "/scicore/soft/apps/msms/2.6.1-linux-x86_64/msms.x86_64Linux2.2.6.1"
+
+
+# Do it!!
+structure_db = loop.StructureDB()
+
+for i,line in enumerate(data):
+
+  split_line = line.split()
+  prot_id = split_line[0]
+  chain_id = split_line[1]

  try:
    print 'processing: ',prot_id,' on line: ',i

-    prot = io.LoadPDB(os.path.join(structure_dir,prot_id+'.pdb')).Select('peptide=true')
+    prot_path = os.path.join(structure_dir, prot_id+'.pdb')
+    hhm_path = os.path.join(profile_dir,prot_id+chain_id+".hhm")
+
+    if not os.path.exists(prot_path):
+      print "Could not find structure file... skip..."
+      continue
+    if not os.path.exists(prot_path):
+      print "Could not find hhm file... skip..."
+      continue
+
+    prot = io.LoadPDB(prot_path).Select('peptide=true')
+    hmm = io.LoadSequenceProfile(hhm_path)
+
    Clean(prot)
-    dssp.AssignDSSP(prot,extract_burial_status=True)
-    surf = msms.CalculateSurface(prot)[0]
-
-    try:
-      hmm = io.LoadSequenceProfile(os.path.join(hmm_dir,prot_id+chain_id+".hhm"))
-      prot = prot.Select("cname="+chain_id)
-      idx = structure_db.AddCoordinates(prot_id,chain_id,prot.chains[0],surf,hmm) 
-    except:
-      traceback.print_exc()
+    AssignSecStruct(prot)
+    Accessibility(prot, algorithm=mol.alg.AccessibilityAlgorithm.DSSP)
+    surf = msms.CalculateSurface(prot, msms_exe=path_to_msms_binary)[0]
+    prot_single_chain = prot.Select("cname="+chain_id)
+    idx = structure_db.AddCoordinates(prot_id, chain_id, 
+                                      prot_single_chain.chains[0], surf, hmm,
+                                      solvent_accessibility_string="asaAbs") 
+   
+      
  except:
    traceback.print_exc()


--- a/extras/data_generation/structure_db/create_structure_profiles.py
+++ b/extras/data_generation/structure_db/create_structure_profiles.py
+import os
+import sys
+from promod3 import loop
+from ost import seq
+import time
+
+if len(sys.argv) != 6:
+
+  params = ["structure_db_path", "structure_db_source_path",\
+            "start", "end", "profile_db_out_path"]
+  print "USAGE: ost create_structure_profiles.py " + ' '.join(params)
+  sys.exit()
+
+# loads a StructureDB that can be arbitrarily large and estimates the
+# structure profile for a subset of its entries with the structural information
+# from another StructureDB. This other DB, the "source" DB should not be too
+# large because its size heavily influences compuation time.
+
+# structure_db_path: path to database for which you want to create
+#                    StructureProfiles
+#
+# structure_db_source_path: path to database from which you get the 
+#                           structural information to calculate the profiles
+#                           this database should not be much larger than 5000
+#                           entries in order to keep computation time in a 
+#                           feasible range
+#
+# start: index of entry in StructureDB at structure_db_path to start with
+#
+# end: index of entry in StructureDB at structure_db_path to end with 
+#      (not inclusive)
+#
+# profile_db_out_path: All structure profiles will be stored in a 
+#                      ost.seq.HMMDB() that will be dumped at this path
+
+structure_db_path = sys.argv[1]
+structure_db_source_path = sys.argv[2]
+start = int(sys.argv[3])
+end = int(sys.argv[4])
+profile_db_out_path = sys.argv[5]
+
+structure_db = loop.StructureDB.Load(structure_db_path)
+structure_db_source = loop.StructureDB.Load(structure_db_source_path)
+
+profile_db = seq.ProfileDB()
+
+for i in range(start,end):
+  print "processing chain with idx", i
+
+  # generate fragment info object
+  coord_info = structure_db.GetCoordInfo(i)
+  fragment_info = loop.FragmentInfo(i,0,coord_info.size)
+  
+  # generate the structure profile given the source StructureDB
+  profile = structure_db.GenerateStructureProfile(structure_db_source, fragment_info)
+  profile_db.AddProfile(coord_info.pdb_id, profile)
+
+
+profile_db.Save(profile_db_out_path)
+
--- a/extras/data_generation/structure_db/get_data_from_smtl.py
+++ b/extras/data_generation/structure_db/get_data_from_smtl.py
+import time
+import os
+import pickle
+import traceback
+from sm import smtl
+
+# This is how we extract all required data for a StructureDB from the 
+# SWISS-MODEL template library given a non redundant set of proteins as
+# pulled from Pisces: http://dunbrack.fccc.edu/PISCES.php. 
+# If you have no working SWISS-MODEL installation you have two possibilities:
+#
+# 1. Ping someone from the mighty SCHWEDE-Lab to run this script for you 
+#    with your desired PISCES file
+#
+# 2. Fetch the data manually from the PDB and obtain the profiles with 
+#    the hh-suite available at: https://github.com/soedinglab/hh-suite.
+
+pisces_path = "cullpdb_pc20_res1.8_R0.25_d170820_chains4892"
+smtl_path = "/scicore/data/databases/SMTL/"
+
+profile_out_dir = "hmms"
+structure_out_dir = "structures"
+
+tpl_lib = smtl.SMTL(smtl_path)
+pisces_data = open(pisces_path).readlines()
+
+# We have to do some mapping magic, since the chain names of SMTL and pdb
+# do not match. The PISCES data however specifies the pdb names
+# in here we store pdb id and chain name relative to the SMTL
+fetched_structures = list()
+
+
+if not os.path.exists(profile_out_dir):
+  os.makedirs(profile_out_dir)
+if not os.path.exists(structure_out_dir):
+  os.makedirs(structure_out_dir)
+
+
+for i,line in enumerate(pisces_data[1:]):
+
+  prot_id = line[:4]
+  chain_id = line[4]
+  print "processing ", prot_id, chain_id
+
+  try:
+    bus = tpl_lib.GetAll(prot_id.upper())
+  except:
+    print "could not find entry for ", prot_id, chain_id
+    continue
+
+  bu = None
+  if len(bus) == 0:
+    print "didnt find any valid biounit!"
+    continue
+  if len(bus) == 1:
+    bu = bus[0]
+  else:
+    # If there are several biounits we search for the one that
+    # is likely to have the relevant oligomeric state
+    bu = bus[0]
+    for b in bus:
+      if "author" in b.details and "software" in b.details:
+        bu = b
+        break
+ 
+  for polypeptide in bu.polypeptides:
+
+    if polypeptide.is_polypeptide:
+      found_chain = False
+
+      for ch in polypeptide.chains:
+
+        if ch.orig_pdb_name == chain_id:
+              
+          unique_name = ch.unique_id
+          structure = ch.structure
+          profile_path = tpl_lib.ProfileForTemplate(unique_name)
+
+          profile_out_path = os.path.join(profile_out_dir, prot_id+ch.name+".hhm")
+          structure_out_path = os.path.join(structure_out_dir, prot_id+".pdb")
+
+          # copy over the profile
+          os.system(' '.join(["cp", profile_path, profile_out_path]))
+
+          # get the structure
+          full_structure = bu.GetSubstructure(bu.polypeptides)
+          ost.io.SavePDB(full_structure,structure_out_path)
+          found_chain=True
+
+          # add the actually extracted pdb id and chain name
+          fetched_structures.append((prot_id, ch.name))
+
+          print "found the according data..."
+          break
+
+      if found_chain:
+        break
+  
+
+
+outfile = open("data_extracted_from_smtl.txt",'w')
+outfile.write('\n'.join([(x[0] + ' ' + x[1]) for x in fetched_structures]))
+outfile.close()
+
--- a/extras/data_generation/structure_db/submit_structure_profile_calculations.py
+++ b/extras/data_generation/structure_db/submit_structure_profile_calculations.py
+import os, math
+from promod3 import loop
+
+# path to StructureDB for which you want to calculate the profiles:
+# (relative to current working directory!)
+structure_db_path = "structure_db.dat"
+
+# path to StructureDB from which you extract the structural information
+# if the StructureDB above is very large, you might want to give it a smaller
+# database here to limit CPU time. A database with around 5000 chains should 
+# be sufficient
+# (relative to current working directory!)
+structure_db_source_path = "structure_db.dat"
+
+# the profiles that get generated end up here:
+# (relative to current working directory!)
+out_dir = "profile_dbs"
+
+# number of jobs
+num_jobs = 100
+
+# all stdout, stderr and submission_script files will go here
+# (relative to current working directory!)
+submission_workspace = "submission_workspace"
+
+
+# path to the pm bash script
+# Once we have the latest and greatest ProMod3 module, this should not 
+# be necessary anymore and we can simply call pm
+pm_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/bin/pm"
+
+
+# This is stuff relevant for the submission script
+max_runtime = "13:00:00"
+membycore = "3G"
+my_email = "gabriel.studer@unibas.ch"
+
+# we only need ost and promod3 as soon as they're released...
+required_modules = ["Boost/1.53.0-goolf-1.4.10-Python-2.7.5",
+                    "Python/2.7.5-goolf-1.4.10",
+                    "OpenMM/7.1.1-goolf-1.4.10-Python-2.7.5"]
+
+# they won't be necessary anymore as soon as there are ost
+# and promod3 modules
+ost_python_path = "/scicore/home/schwede/studga00/prog/ost_dev/build/stage/lib64/python2.7/site-packages"
+promod_python_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/lib64/python2.7/site-packages"
+
+
+#################################################################
+# NO EDITING BELOW THIS POINT (EXCEPT ON DIFFERENT SYSTEMS ;) ) #
+#################################################################
+
+if not os.path.exists(submission_workspace):
+  os.makedirs(submission_workspace)
+
+if not os.path.exists(out_dir):
+  os.makedirs(out_dir)
+
+structure_db = loop.StructureDB.Load(structure_db_path)
+num_coords = structure_db.GetNumCoords()
+chunk_size = math.ceil(float(num_coords ) / num_jobs)
+current_start = 0
+current_end = chunk_size
+
+for job_idx in range(num_jobs):
+
+  start = current_start
+  end = current_end
+  if end > num_coords:
+    end = num_coords
+
+  # we estimate the chunk size with a ceil => we might get away with even
+  # less jobs...
+  if start >= end:
+    break
+
+  start = int(start)
+  end = int(end)
+
+  current_start += chunk_size
+  current_end += chunk_size
+
+  cmd = [pm_path, \
+         os.path.join(os.getcwd(),"create_structure_profiles.py"), \
+         os.path.join(os.getcwd(),structure_db_path), \
+         os.path.join(os.getcwd(),structure_db_source_path), \
+         str(start), str(end), \
+         os.path.join(os.getcwd(), out_dir, str(job_idx)+".dat")]
+
+  stdout_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stdout")
+  stderr_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stderr")
+
+  content = list()
+  content.append("#!/bin/bash")
+  content.append("#$ -o " + stdout_path)
+  content.append("#$ -e " + stderr_path)
+  content.append("#$ -l runtime=" + max_runtime)
+  content.append("#$ -l membycore=" + membycore)
+  content.append("#$ -m as -M " + my_email)   
+
+  for module in required_modules:
+    content.append("ml " + module)
+
+  content.append("export PYTHONPATH=" + ost_python_path + ":PYTHONPATH")
+  content.append("export PYTHONPATH=" + promod_python_path + ":PYTHONPATH")
+  content.append(' '.join(cmd))
+
+  s_script_path = os.path.join(submission_workspace, "sub_"+str(job_idx)+".sh")
+  outfile = open(s_script_path,'w')
+  outfile.write('\n'.join(content))
+  outfile.close()
+
+  # FIRE!!!
+  os.system("qsub "+ s_script_path)
+