diff --git a/doc/tests/data/port_str_db.dat b/doc/tests/data/port_str_db.dat
index 5c183190ce902c88a35cc99fb5e4fd69a71ef62f..a861977c6a974dd2a15929b9f5c71415d57901d3 100644
Binary files a/doc/tests/data/port_str_db.dat and b/doc/tests/data/port_str_db.dat differ
diff --git a/doc/tests/scripts/loop_frag_db.py b/doc/tests/scripts/loop_frag_db.py
index c1489985dbb9d51bbea02cf9f41ea9075a1f0556..0efd8226859dbfc115f5973d3d456b6e1b88bfed 100644
--- a/doc/tests/scripts/loop_frag_db.py
+++ b/doc/tests/scripts/loop_frag_db.py
@@ -31,4 +31,4 @@ fragment_infos = frag_db.SearchDB(n_stem, c_stem, 5)
 
 for i,f_i in enumerate(fragment_infos):
     bb_list = structure_db.GetBackboneList(n_stem, c_stem, f_i, frag_seq)
-    io.SavePDB(bb_list.ToEntity(), str(i) + ".pdb")
\ No newline at end of file
+    io.SavePDB(bb_list.ToEntity(), str(i) + ".pdb")
diff --git a/doc/tests/scripts/loop_structure_db.py b/doc/tests/scripts/loop_structure_db.py
index 1696e8dd71480993f6556723300707315356796e..31ff521c1ab2fe4e01db0f4df3466e24e65a0108 100644
--- a/doc/tests/scripts/loop_structure_db.py
+++ b/doc/tests/scripts/loop_structure_db.py
@@ -14,8 +14,8 @@ structure_db_two = loop.StructureDB(
 
 # Lets fill in some structures. It gets assumed, that all required
 # data lies in the following directories.
-structure_dir = "../data"
-prof_dir = "../data"
+structure_dir = "data"
+prof_dir = "data"
 
 # The first 4 letters are suposed to be the pdb id, and the last one
 # the corresponding chain name. The naming of the files in the
@@ -84,7 +84,9 @@ for i in range(structure_db_one.GetNumCoords()):
     structure_db_one.SetStructureProfile(i, prof)
 
 # That's it! Let's save both databases down.
-# structure_db_two will use much less memory,
-# as it contains less data
-structure_db_one.Save("my_db_one.dat")
-structure_db_two.Save("my_db_two.dat")
+# structure_db_two will use much less memory, as it contains less data.
+# Sidenote: We're saving the portable version. If you intend to use
+# your database only on one system, the Save / Load functions should
+# be preferred, as the loading will be much faster.
+structure_db_one.SavePortable("my_db_one.dat")
+structure_db_two.SavePortable("my_db_two.dat")
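NOTE: the switch from Save/Load to SavePortable/LoadPortable above is the
central theme of this patch. SavePortable is meant to write a format that
can be moved between systems, while the raw Save/Load pair loads faster but
is only safe on the machine that wrote the file. A minimal sketch of the two
round trips, assuming a database built as in loop_structure_db.py (file
names are placeholders):

    from promod3 import loop

    # portable format: usable across systems, slower to load
    db = loop.StructureDB.LoadPortable("my_db_one.dat")
    db.SavePortable("my_db_portable.dat")

    # raw format: fastest to load, tied to the writing system
    db.Save("my_db_raw.dat")
    db = loop.StructureDB.Load("my_db_raw.dat")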
diff --git a/doc/tests/test_doctests.py b/doc/tests/test_doctests.py
index 533148559befc5dc1d2fd23e01fcea2c4a2d962d..e3e6e58b9a1a105871233f3316218b9284b503c4 100644
--- a/doc/tests/test_doctests.py
+++ b/doc/tests/test_doctests.py
@@ -226,9 +226,11 @@ class DocTests(unittest.TestCase):
         # run it
         self.checkPMRun('loop_structure_db.py', [], 0)
         # check that result exists and is readable
-        loop.StructureDB.Load('my_db.dat')
+        loop.StructureDB.LoadPortable('my_db_one.dat')
+        loop.StructureDB.LoadPortable('my_db_two.dat')
         # clean up
-        os.remove('my_db.dat')
+        os.remove('my_db_one.dat')
+        os.remove('my_db_two.dat')
 
     def testLoopFragDB(self):
         # run it
@@ -248,7 +250,7 @@ class DocTests(unittest.TestCase):
         self.assertEqual(len(out_lines), 101)
         # NOTE: this last output depends on the structure-db!
         self.assertEqual(out_lines[-1].strip(),
-                         'Fraction of fragments below 3A: 0.48')
+                         'Fraction of fragments below 3A: 0.47')
         # check that result exists and is readable
         loop.FraggerMap.LoadBB('frag_map.dat')
         # clean up
diff --git a/extras/data_generation/frag_db/build_frag_db.py b/extras/data_generation/frag_db/build_frag_db.py
index 19602b2f274ac4bd04699dac11c4c89053ceeacf..5faa5ad22d6b209846544ddbc1900aa43e5c7b36 100644
--- a/extras/data_generation/frag_db/build_frag_db.py
+++ b/extras/data_generation/frag_db/build_frag_db.py
@@ -16,5 +16,5 @@ for i in range(3,15):
     fragdb.AddFragments(i,max_pairwise_rmsd,structure_db)
 
 fragdb.PrintStatistics()
-fragdb.Save('frag_db.dat')
+fragdb.SavePortable('frag_db.dat')
diff --git a/extras/data_generation/structure_db/README b/extras/data_generation/structure_db/README
index ed5115ddf021c6095061af6b05ae971248816da0..f86fa19e8c37a82083e61ecc780679ff606a4dbb 100644
--- a/extras/data_generation/structure_db/README
+++ b/extras/data_generation/structure_db/README
@@ -9,6 +9,8 @@ Steps to obtain a structural database:
    from PISCES: http://dunbrack.fccc.edu/PISCES.php
    The get_data_from_smtl.py script directly reads in such a file and
    produces the input for the second step in StructureDB generation.
+   BUT: THE CHAIN NAMING WILL BE THE ONE USED IN THE SMTL; IT NO LONGER
+   RELATES TO THE PDB NAMING!
 
 2. Build an initial database: use build_structure_db.py with input
    generated in step one.
@@ -20,9 +22,12 @@ BUT BE AWARE, THE ACCORDING FREQUENCIES IN THE DATABASE WILL BE SET TO 0.0!
    The script calculates the structure profiles for a subset of the
    sequences in the desired StructureDB and is intended for
    parallelization
-4. Parallelize step 3: use submit_structure_profile_calculations.py
-   This script uses the SGE submission on BC2 and you might have to
-   adapt for your system
+4. Parallelize step 3: use prepare_slurm_submission_structure_profiles.py
+   This script generates a submission script and a batch command file
+   that you can submit with sbatch structure_profile_generation_array.sh.
+   This is intended to be used with the SLURM submission system.
+   IMPORTANT: there are many paths to adapt at the top of
+   prepare_slurm_submission_structure_profiles.py
 
 5. Assign the structure profiles generated in step 3 and 4 to the
    initial database generated in step 2:
@@ -31,10 +36,9 @@ BUT BE AWARE, THE ACCORDING FREQUENCIES IN THE DATABASE WILL BE SET TO 0.0!
 
 To qualitatively reproduce the default StructuralDB in ProMod3, you first
 perform step 1 and 2 with a non redundant set of protein structures as
-defined by PISCES with around 25000 chains (e.g. seq id threshold: 90,
-resolution threshold: 2.2).
+defined by PISCES with around 20000 chains (e.g. seq id threshold: 60,
+resolution threshold: 2.5).
 Repeat step 1 and 2 with a smaller PISCES list (5000-6000 entries,
-e.g. seq id threshold: 25 , resolution threshold: 1.8).
+e.g. seq id threshold: 20, resolution threshold: 2.0).
 The first database serves as default StructureDB and the second db as the
 source db for the structural profiles generated in steps 3-5.
-
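NOTE: steps 3 and 5 of the README, condensed into a single sketch using the
calls from create_structure_profiles.py and assign_structure_profiles.py
further down in this patch (paths and the coordinate index 0 are
placeholders):

    from promod3 import loop

    # step 3: derive a structure profile for one chain of the target DB;
    # the smaller DB acts as source for the structural searches
    structure_db = loop.StructureDB.Load("structure_db_60.dat")
    source_db = loop.StructureDB.Load("structure_db_20.dat")

    coord_info = structure_db.GetCoordInfo(0)
    fragment_info = loop.FragmentInfo(0, 0, coord_info.size)
    sequence = structure_db.GetSequence(fragment_info)
    bb_list = structure_db.GetBackboneList(fragment_info, sequence)
    residue_depths = structure_db.GetResidueDepths(fragment_info)
    profile = source_db.GenerateStructureProfile(bb_list, residue_depths)

    # step 5: attach the profile to the target DB
    structure_db.SetStructureProfile(0, profile)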
diff --git a/extras/data_generation/structure_db/assign_structure_profiles.py b/extras/data_generation/structure_db/assign_structure_profiles.py
index 64029be3aedf16222f622dd102fd8c664b5295c5..2c14c44087fbd2a84be740c1f626565cfae81dd4 100644
--- a/extras/data_generation/structure_db/assign_structure_profiles.py
+++ b/extras/data_generation/structure_db/assign_structure_profiles.py
@@ -1,49 +1,50 @@
 import os
 from promod3 import loop
-
+from ost import seq
 
 # Path to StructureDB to which you want to assign the profile information.
 # (relative to current working directory!)
-db_path = "structure_db.dat"
+db_path = "portable_structure_db_60.dat"
 
 # Directory that contains the profile databases (and only profile databases)
 # as generated by the create_structure_profiles.py script.
 # (relative to current working directory)
-profile_dir = "profile_dbs"
+profile_dir = "structure_profile_dbs"
 
 # path to store the StructureDB with assigned_profiles
 # (relative to current working directory!)
-db_out_path = "structure_db_with_structure_profiles.dat"
-
+db_out_path = "portable_structure_db_60_with_structure_profiles.dat"
 
-structure_db = loop.StructureDB.Load(db_path)
+structure_db = loop.StructureDB.LoadPortable(db_path)
 
 # lets gather all profiles in one ProfileDB
profile_db_files = os.listdir(profile_dir)
-profile_db = ost.seq.ProfileDB()
+profile_db = seq.ProfileDB()
 
 for f in profile_db_files:
     db_path = os.path.join(profile_dir, f)
-    loaded_db = ost.seq.ProfileDB.Load(db_path)
+    loaded_db = seq.ProfileDB.Load(db_path)
     entry_names = loaded_db.GetNames()
     for name in entry_names:
         profile_db.AddProfile(name, loaded_db.GetProfile(name))
-        print name
 
 # Assign the profiles
 for coord_idx in range(structure_db.GetNumCoords()):
 
     coord_info = structure_db.GetCoordInfo(coord_idx)
-    entry_id = coord_info.pdb_id
+
+    entry_id = coord_info.id
+    entry_chain_name = coord_info.chain_name
+    full_id = '_'.join([entry_id, entry_chain_name])
 
     structure_profile = None
     try:
-        structure_profile = profile_db.GetProfile(entry_id)
+        structure_profile = profile_db.GetProfile(full_id)
     except:
-        print "Could not find structure profile for ", entry_id, " skip..."
+        print "Could not find structure profile for ", full_id, " skip..."
         continue
 
     structure_db.SetStructureProfile(coord_idx, structure_profile)
 
 # DONE!
-structure_db.Save(db_out_path)
+structure_db.SavePortable(db_out_path)
diff --git a/extras/data_generation/structure_db/build_structure_db.py b/extras/data_generation/structure_db/build_structure_db.py
index 409f67b34a8f775ad00db032a1f963282936e447..ac23ae07a5ef94e49cf963ba0dee97be917767af 100644
--- a/extras/data_generation/structure_db/build_structure_db.py
+++ b/extras/data_generation/structure_db/build_structure_db.py
@@ -2,115 +2,14 @@ from promod3 import loop
 import os
 import traceback
 from ost import io
-from ost.bindings import msms
-from ost.mol.alg import AssignSecStruct
-from ost.mol.alg import Accessibility
-
-###############
-# SCROLL DOWN #
-###############
-
-bb_atoms = ["N","CA","C","O","CB"]
-
-parent_residues = dict()
-parent_residues["MSE"] = "MET"
-parent_residues["CSX"] = "CYS"
-parent_residues["FP9"] = "PRO"
-parent_residues["HYP"] = "PRO"
-parent_residues["SEP"] = "SER"
-parent_residues["SAH"] = "CYS"
-parent_residues["MLY"] = "LYS"
-parent_residues["NLE"] = "LEU"
-parent_residues["ABA"] = "ALA"
-parent_residues["SLZ"] = "LYS"
-parent_residues["KCX"] = "LYS"
-parent_residues["OCS"] = "CYS"
-parent_residues["HIC"] = "HIS"
-parent_residues["TPO"] = "THR"
-parent_residues["CME"] = "CYS"
-parent_residues["ALY"] = "LYS"
-parent_residues["SAM"] = "MET"
-parent_residues["MEN"] = "ASN"
-parent_residues["LLP"] = "LYS"
-parent_residues["CSD"] = "ALA"
-parent_residues["PCA"] = "PRO"
-parent_residues["SCY"] = "CYS"
-parent_residues["CSO"] = "CYS"
-parent_residues["PHD"] = "ASP"
-parent_residues["SLZ"] = "LYS"
-parent_residues["TYS"] = "TYR"
-parent_residues["CAS"] = "CYS"
-parent_residues["TRQ"] = "TRP"
-parent_residues["DPR"] = "PRO"
-parent_residues["CSW"] = "CYS"
-parent_residues["CAF"] = "CYS"
-parent_residues["NIY"] = "TYR"
-parent_residues["LYZ"] = "LYS"
-parent_residues["SME"] = "MET"
-parent_residues["CSS"] = "CYS"
-parent_residues["MLZ"] = "LYS"
-parent_residues["DAH"] = "PHE"
-parent_residues["MSO"] = "MET"
-parent_residues["PHI"] = "PHE"
-parent_residues["TYI"] = "TYR"
-parent_residues["GLX"] = "GLU" #could also be GLN
-parent_residues["OMT"] = "MET"
-parent_residues["CSU"] = "CYS"
-parent_residues["HZP"] = "PRO"
-parent_residues["SEC"] = "CYS"
-parent_residues["DPN"] = "PHE"
-parent_residues["NEP"] = "HIS"
-parent_residues["HIP"] = "HIS"
-parent_residues["DGL"] = "GLU"
-
-one_letter_codes = dict()
-one_letter_codes["ALA"] = 'A'
-one_letter_codes["ARG"] = 'R'
-one_letter_codes["HIS"] = 'H'
-one_letter_codes["LYS"] = 'K'
-one_letter_codes["ASP"] = 'D'
-one_letter_codes["GLU"] = 'E'
-one_letter_codes["SER"] = 'S'
-one_letter_codes["THR"] = 'T'
-one_letter_codes["ASN"] = 'N'
-one_letter_codes["GLN"] = 'Q'
-one_letter_codes["CYS"] = 'C'
-one_letter_codes["GLY"] = 'G'
-one_letter_codes["PRO"] = 'P'
-one_letter_codes["VAL"] = 'V'
-one_letter_codes["ILE"] = 'I'
-one_letter_codes["LEU"] = 'L'
-one_letter_codes["MET"] = 'M'
-one_letter_codes["PHE"] = 'F'
-one_letter_codes["TYR"] = 'Y'
-one_letter_codes["TRP"] = 'W'
-
-def Clean(prot):
-    ed = prot.handle.EditXCS()
-
-    for r_view in prot.residues:
-        if r_view.GetName() in parent_residues:
-            r = r_view.handle
-            ed.RenameResidue(r,parent_residues[r.GetName()])
-            r.SetOneLetterCode(one_letter_codes[r.GetName()])
-
-            atoms_to_delete = list()
-
-            for a in r.atoms:
-                if a.GetName() not in bb_atoms:
-                    atoms_to_delete.append(a)
-                else:
-                    a.is_hetatom=False
-
-
-            for a in atoms_to_delete:
-                ed.DeleteAtom(a)
-
-
+from ost import seq
+from ost import mol
+from ost import conop
 
 # This script generates a StructureDB using a list of nonredundant pdb
-# files. All pdb files get cleaned using some basic heuristics.
-# please note, that the values for the structure profile in the produced
+# files. All pdb files get cleaned using Molck from openstructure
+# with settings specified below.
+# Please note that the values for the structure profile in the produced
 # db are only allocated and simply set to zero.
 # You therefore need to create an initial db, from which you can create
 # these structure profiles and set them in a second step.
@@ -125,7 +24,7 @@ def Clean(prot):
 # file generated by get_data_from_smtl.py (or by yourself)
 # every line requires to be in the format: pdb_id chain_name
 # e.g. 5UZG A
-data = open("data_extracted_from_smtl.txt",'r').readlines()
+data = open("cullpdb_pc60_res2.5_R1.0_d180524_chains25634_data_extracted_from_smtl.txt", 'r').readlines()
 
 # directory containing the structures. naming: pdb_id.pdb
 # (pdb_id in upper case!) e.g. 5UZG.pdb
@@ -135,12 +34,22 @@ structure_dir = "structures"
 # (pdb_id and chain_id in upper case!) e.g. 5UZGA.hhm
 profile_dir = "hmms"
 
+# molck settings to process the input structures
+molck_settings = mol.alg.MolckSettings()
+molck_settings.rm_unk_atoms = True
+molck_settings.rm_non_std = True
+molck_settings.rm_hyd_atoms = True
+molck_settings.rm_oxt_atoms = True
+molck_settings.rm_zero_occ_atoms = True
+molck_settings.map_nonstd_res = True
+molck_settings.assign_elem = True
 
-path_to_msms_binary = "/scicore/soft/apps/msms/2.6.1-linux-x86_64/msms.x86_64Linux2.2.6.1"
-
+# Assumes that a proper compound lib has been specified when
+# compiling openstructure
+compound_lib = conop.GetDefaultLib()
 
 # Do it!!
-structure_db = loop.StructureDB()
+structure_db = loop.StructureDB(loop.StructureDBDataType.All)
 
 for i,line in enumerate(data):
 
@@ -149,6 +58,7 @@ for i,line in enumerate(data):
     chain_id = split_line[1]
 
     try:
+        print 'processing: ',prot_id,' on line: ',i
 
         prot_path = os.path.join(structure_dir, prot_id+'.pdb')
 
@@ -161,25 +71,31 @@ for i,line in enumerate(data):
             print "Could not find hhm file... skip..."
             continue
 
-        prot = io.LoadPDB(prot_path).Select('peptide=true')
+        # load and clean full structure
+        prot = io.LoadPDB(prot_path)
+        mol.alg.Molck(prot, compound_lib, molck_settings)
+
+        # hack that will go away when latest QS-lDDT branch is merged in
+        processor = conop.RuleBasedProcessor(compound_lib)
+        processor.Process(prot)
+
+        # we're only interested in the peptide chains...
+        prot = prot.Select("peptide=true")
+
+        # get profile and seqres
         hmm = io.LoadSequenceProfile(hhm_path)
+        raw_seqres = hmm.sequence
+        seqres = seq.CreateSequence("seqres", raw_seqres)
 
-        Clean(prot)
-        AssignSecStruct(prot)
-        Accessibility(prot, algorithm=mol.alg.AccessibilityAlgorithm.DSSP)
-        surf = msms.CalculateSurface(prot, msms_exe=path_to_msms_binary)[0]
-        prot_single_chain = prot.Select("cname="+chain_id)
-        idx = structure_db.AddCoordinates(prot_id, chain_id,
-                                          prot_single_chain.chains[0], surf, hmm,
-                                          solvent_accessibility_string="asaAbs")
-
-
+        # add it
+        structure_db.AddCoordinates(prot_id, chain_id,
+                                    prot, seqres, hmm)
+
     except:
         traceback.print_exc()
 
 structure_db.PrintStatistics()
-structure_db.Save('structure_db.dat')
-
+structure_db.SavePortable('portable_structure_db_60.dat')
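NOTE: the cleanup stage above in isolation, as a sketch for a single file
("input.pdb" is a placeholder, and only two of the Molck flags are shown;
the full set is in the script):

    from ost import io, mol, conop

    molck_settings = mol.alg.MolckSettings()
    molck_settings.rm_unk_atoms = True
    molck_settings.map_nonstd_res = True  # remaining flags as in the script

    prot = io.LoadPDB("input.pdb")
    mol.alg.Molck(prot, conop.GetDefaultLib(), molck_settings)

    # re-process after Molck (labelled a temporary hack in the script)
    conop.RuleBasedProcessor(conop.GetDefaultLib()).Process(prot)
    prot = prot.Select("peptide=true")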
diff --git a/extras/data_generation/structure_db/create_structure_profiles.py b/extras/data_generation/structure_db/create_structure_profiles.py
index ec227549785f97790208abaa3764af16eb06ea6c..a0e335925a4ed4fa5ced2e07a44bd0b379da4bb8 100644
--- a/extras/data_generation/structure_db/create_structure_profiles.py
+++ b/extras/data_generation/structure_db/create_structure_profiles.py
@@ -3,6 +3,7 @@ import sys
 from promod3 import loop
 from ost import seq
 import time
+import traceback
 
 
 if len(sys.argv) != 6:
@@ -47,14 +48,30 @@ profile_db = seq.ProfileDB()
 for i in range(start,end):
 
     print "processing chain with idx", i
 
-    # generate fragment info object
-    coord_info = structure_db.GetCoordInfo(i)
-    fragment_info = loop.FragmentInfo(i,0,coord_info.size)
-
-    # generate the structure profile given the source StructureDB
-    profile = structure_db.GenerateStructureProfile(structure_db_source, fragment_info)
-    profile_db.AddProfile(coord_info.pdb_id, profile)
+    try:
+        # generate fragment info object
+        coord_info = structure_db.GetCoordInfo(i)
+        fragment_info = loop.FragmentInfo(i,0,coord_info.size)
+        sequence = structure_db.GetSequence(fragment_info)
+        bb_list = structure_db.GetBackboneList(fragment_info, sequence)
+        residue_depths = structure_db.GetResidueDepths(fragment_info)
+        print "id: ", coord_info.id
+        print "chain_name: ", coord_info.chain_name
+        print "sequence length: ", len(sequence)
+
+        # generate the structure profile given the source StructureDB
+        # (t_start/t_end so the range bounds from sys.argv stay intact)
+        t_start = time.time()
+        profile = structure_db_source.GenerateStructureProfile(bb_list, residue_depths)
+        t_end = time.time()
+
+        print "it took: ", t_end-t_start
+
+        profile_db.AddProfile('_'.join([coord_info.id, coord_info.chain_name]), profile)
+
+    except:
+        traceback.print_exc()
+        print "failed to create structure profile... skip..."
 
 profile_db.Save(profile_db_out_path)
-pisces_path = "cullpdb_pc20_res1.8_R0.25_d170820_chains4892" +pisces_path = "cullpdb_pc20_res2.0_R0.25_d180524_chains6670" smtl_path = "/scicore/data/databases/SMTL/" profile_out_dir = "hmms" @@ -29,13 +29,11 @@ pisces_data = open(pisces_path).readlines() # in here we store pdb id and chain name relative to the SMTL fetched_structures = list() - if not os.path.exists(profile_out_dir): os.makedirs(profile_out_dir) if not os.path.exists(structure_out_dir): os.makedirs(structure_out_dir) - for i,line in enumerate(pisces_data[1:]): prot_id = line[:4] @@ -76,8 +74,8 @@ for i,line in enumerate(pisces_data[1:]): structure = ch.structure profile_path = tpl_lib.ProfileForTemplate(unique_name) - profile_out_path = os.path.join(profile_out_dir, prot_id+ch.name+".hhm") - structure_out_path = os.path.join(structure_out_dir, prot_id+".pdb") + profile_out_path = os.path.join(profile_out_dir, prot_id + ch.name + ".hhm") + structure_out_path = os.path.join(structure_out_dir, prot_id + ".pdb") # copy over the profile os.system(' '.join(["cp", profile_path, profile_out_path])) @@ -98,7 +96,7 @@ for i,line in enumerate(pisces_data[1:]): -outfile = open("data_extracted_from_smtl.txt",'w') +outfile = open(pisces_path + "_data_extracted_from_smtl.txt",'w') outfile.write('\n'.join([(x[0] + ' ' + x[1]) for x in fetched_structures])) outfile.close() diff --git a/extras/data_generation/structure_db/prepare_slurm_submission_structure_profiles.py b/extras/data_generation/structure_db/prepare_slurm_submission_structure_profiles.py new file mode 100644 index 0000000000000000000000000000000000000000..d982a468161a2ccfbcceb994961f4f7e706f543d --- /dev/null +++ b/extras/data_generation/structure_db/prepare_slurm_submission_structure_profiles.py @@ -0,0 +1,57 @@ +from promod3 import loop +import os + +profiles_per_job = 60 +structure_db_path = "/scicore/home/schwede/studga00/data/promod3_databases/final_shit/structure_db_60.dat" +source_structure_db_path = "/scicore/home/schwede/studga00/data/promod3_databases/final_shit/structure_db_20.dat" +profile_db_out_dir = "/scicore/home/schwede/studga00/data/promod3_databases/final_shit/structure_profile_dbs" +awesome_script = "/scicore/home/schwede/studga00/data/promod3_databases/final_shit/create_structure_profiles.py" + + +structure_db = loop.StructureDB.Load(structure_db_path) +n_entries = structure_db.GetNumCoords() + +from_idx = 0 +to_idx = profiles_per_job + +n_jobs = int(n_entries / profiles_per_job) + 1 + +commands = list() + +for job_idx in range(n_jobs): + + current_cmd = list() + current_cmd.append("pm") + current_cmd.append(awesome_script) + current_cmd.append(structure_db_path) + current_cmd.append(source_structure_db_path) + current_cmd.append(str(from_idx)) + current_cmd.append(str(to_idx)) + current_cmd.append(os.path.join(profile_db_out_dir, str(job_idx)+".dat")) + + commands.append(' '.join(current_cmd)) + + from_idx = min(n_entries - 1, from_idx + profiles_per_job) + to_idx = min(n_entries, to_idx + profiles_per_job) + +outfile = open("structure_profile_generation.cmd", 'w') +outfile.write('\n'.join(commands)) +outfile.close() + +stuff_to_write = list() +stuff_to_write.append("#!/bin/bash") +stuff_to_write.append("#SBATCH --job-name=structure_scoring_array") +stuff_to_write.append("#SBATCH --time=06:00:00") +stuff_to_write.append("#SBATCH --output=%s"%(os.path.join("stdout", "%A_%a.out"))) +stuff_to_write.append("#SBATCH --mem=8G") +stuff_to_write.append("#SBATCH --array=1-%i"%(len(commands))) +stuff_to_write.append("source 
/scicore/home/schwede/studga00/data/promod3_databases/final_shit/setup_bc2_centos7") +stuff_to_write.append("SEEDFILE=%s"%(os.path.join(os.getcwd(), "structure_profile_generation.cmd"))) +stuff_to_write.append("SEED=$(sed -n ${SLURM_ARRAY_TASK_ID}p $SEEDFILE)") +stuff_to_write.append("eval $SEED") + +outfile = open("structure_profile_generation_array.sh", 'w') +outfile.write('\n'.join(stuff_to_write)) +outfile.close() + + diff --git a/extras/data_generation/structure_db/submit_structure_profile_calculations.py b/extras/data_generation/structure_db/submit_structure_profile_calculations.py deleted file mode 100644 index 87f26a5f99cd2df34a38958f6a5f8acaf54a7fd2..0000000000000000000000000000000000000000 --- a/extras/data_generation/structure_db/submit_structure_profile_calculations.py +++ /dev/null @@ -1,115 +0,0 @@ -import os, math -from promod3 import loop - -# path to StructureDB for which you want to calculate the profiles: -# (relative to current working directory!) -structure_db_path = "structure_db.dat" - -# path to StructureDB from which you extract the structural information -# if the StructureDB above is very large, you might want to give it a smaller -# database here to limit CPU time. A database with around 5000 chains should -# be sufficient -# (relative to current working directory!) -structure_db_source_path = "structure_db.dat" - -# the profiles that get generated end up here: -# (relative to current working directory!) -out_dir = "profile_dbs" - -# number of jobs -num_jobs = 100 - -# all stdout, stderr and submission_script files will go here -# (relative to current working directory!) -submission_workspace = "submission_workspace" - - -# path to the pm bash script -# Once we have the latest and greatest ProMod3 module, this should not -# be necessary anymore and we can simply call pm -pm_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/bin/pm" - - -# This is stuff relevant for the submission script -max_runtime = "13:00:00" -membycore = "3G" -my_email = "gabriel.studer@unibas.ch" - -# we only need ost and promod3 as soon as they're released... -required_modules = ["Boost/1.53.0-goolf-1.4.10-Python-2.7.5", - "Python/2.7.5-goolf-1.4.10", - "OpenMM/7.1.1-goolf-1.4.10-Python-2.7.5"] - -# they won't be necessary anymore as soon as there are ost -# and promod3 modules -ost_python_path = "/scicore/home/schwede/studga00/prog/ost_dev/build/stage/lib64/python2.7/site-packages" -promod_python_path = "/scicore/home/schwede/studga00/prog/promod3_dev/build/stage/lib64/python2.7/site-packages" - - -################################################################# -# NO EDITING BELOW THIS POINT (EXCEPT ON DIFFERENT SYSTEMS ;) ) # -################################################################# - -if not os.path.exists(submission_workspace): - os.makedirs(submission_workspace) - -if not os.path.exists(out_dir): - os.makedirs(out_dir) - -structure_db = loop.StructureDB.Load(structure_db_path) -num_coords = structure_db.GetNumCoords() -chunk_size = math.ceil(float(num_coords ) / num_jobs) -current_start = 0 -current_end = chunk_size - -for job_idx in range(num_jobs): - - start = current_start - end = current_end - if end > num_coords: - end = num_coords - - # we estimate the chunk size with a ceil => we might get away with even - # less jobs... 
-    if start >= end:
-        break
-
-    start = int(start)
-    end = int(end)
-
-    current_start += chunk_size
-    current_end += chunk_size
-
-    cmd = [pm_path, \
-           os.path.join(os.getcwd(),"create_structure_profiles.py"), \
-           os.path.join(os.getcwd(),structure_db_path), \
-           os.path.join(os.getcwd(),structure_db_source_path), \
-           str(start), str(end), \
-           os.path.join(os.getcwd(), out_dir, str(job_idx)+".dat")]
-
-    stdout_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stdout")
-    stderr_path = os.path.join(os.getcwd(), submission_workspace, str(job_idx) + ".stderr")
-
-    content = list()
-    content.append("#!/bin/bash")
-    content.append("#$ -o " + stdout_path)
-    content.append("#$ -e " + stderr_path)
-    content.append("#$ -l runtime=" + max_runtime)
-    content.append("#$ -l membycore=" + membycore)
-    content.append("#$ -m as -M " + my_email)
-
-    for module in required_modules:
-        content.append("ml " + module)
-
-    content.append("export PYTHONPATH=" + ost_python_path + ":PYTHONPATH")
-    content.append("export PYTHONPATH=" + promod_python_path + ":PYTHONPATH")
-    content.append(' '.join(cmd))
-
-    s_script_path = os.path.join(submission_workspace, "sub_"+str(job_idx)+".sh")
-    outfile = open(s_script_path,'w')
-    outfile.write('\n'.join(content))
-    outfile.close()
-
-    # FIRE!!!
-    os.system("qsub "+ s_script_path)
-
diff --git a/loop/data/CMakeLists.txt b/loop/data/CMakeLists.txt
index 8c79b4397cdcbfefc0f2eb9f46591a71c9f38221..118be4c53f957e9d33a6de1b21bf3761393890b5 100644
--- a/loop/data/CMakeLists.txt
+++ b/loop/data/CMakeLists.txt
@@ -8,11 +8,11 @@ convert_module_data(MODULE loop FILE torsion_sampler_extended.dat
 convert_module_data(MODULE loop FILE torsion_sampler_helical.dat
                     SCRIPT convert_binaries.py ARGS TorsionSampler)
 
-#convert_module_data(MODULE loop FILE frag_db.dat
-#                    SCRIPT convert_binaries.py ARGS FragDB)
+convert_module_data(MODULE loop FILE frag_db.dat
+                    SCRIPT convert_binaries.py ARGS FragDB)
 
-#convert_module_data(MODULE loop FILE structure_db.dat
-#                    SCRIPT convert_binaries.py ARGS StructureDB)
+convert_module_data(MODULE loop FILE structure_db.dat
+                    SCRIPT convert_binaries.py ARGS StructureDB)
 
 convert_module_data(MODULE loop FILE ff_lookup_charmm.dat
                     SCRIPT convert_binaries.py ARGS ForcefieldLookup)
diff --git a/loop/data/portable_frag_db.dat b/loop/data/portable_frag_db.dat
index 802c3651567e94c49ffa483cab3827049869c141..f612b3cd9f7ae1d878363545027ea4158565b71e 100644
Binary files a/loop/data/portable_frag_db.dat and b/loop/data/portable_frag_db.dat differ
diff --git a/loop/data/portable_structure_db.dat b/loop/data/portable_structure_db.dat
index 73ab352c929d9c0c693b57f4a8da568c833d8cb5..0498195c094fbc0dae623c42903b5c91c5990a2c 100644
Binary files a/loop/data/portable_structure_db.dat and b/loop/data/portable_structure_db.dat differ
diff --git a/loop/src/structure_db.hh b/loop/src/structure_db.hh
index d34ea6edb053915ba0eab01d77674bc5a29404fc..e058724cb48dca0df5f3040570a892d117b72079 100644
--- a/loop/src/structure_db.hh
+++ b/loop/src/structure_db.hh
@@ -129,7 +129,7 @@ public:
   void SavePortable(const String& filename);
 
   bool HasData(DBDataType data_type) const {
-    return (data_type & stored_data_) > 0;
+    return (data_type & stored_data_) == data_type;
   }
 
   bool operator==(const StructureDB& rhs) const;
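NOTE: the HasData fix above changes semantics once data_type is a
combination of flags: the old any-bit test reported true as soon as a
single requested bit was stored, while the new test requires all of them.
Illustrated with made-up flag values in a small Python sketch:

    DIHEDRALS = 1  # made-up flag values, for illustration only
    SOLV_ACC = 2

    stored = DIHEDRALS
    requested = DIHEDRALS | SOLV_ACC

    print (requested & stored) > 0           # old check: True, despite missing SOLV_ACC
    print (requested & stored) == requested  # new check: False, as intended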
diff --git a/modelling/tests/data/port_frag_db.dat b/modelling/tests/data/port_frag_db.dat
index 1cc848360e16ec2840bc6c540fa6c7eda9e4fb95..167582ab4c931037fc88519a53f8a6d841a67e1c 100644
Binary files a/modelling/tests/data/port_frag_db.dat and b/modelling/tests/data/port_frag_db.dat differ
diff --git a/modelling/tests/data/port_str_db.dat b/modelling/tests/data/port_str_db.dat
index 5c183190ce902c88a35cc99fb5e4fd69a71ef62f..a861977c6a974dd2a15929b9f5c71415d57901d3 100644
Binary files a/modelling/tests/data/port_str_db.dat and b/modelling/tests/data/port_str_db.dat differ
diff --git a/modelling/tests/test_loop_candidates.cc b/modelling/tests/test_loop_candidates.cc
index fe902a579d7de55cad3a2a06026aa325b765847b..70c92d55fb35c002c0d3077aaaa1c334b9df29b5 100644
--- a/modelling/tests/test_loop_candidates.cc
+++ b/modelling/tests/test_loop_candidates.cc
@@ -303,7 +303,7 @@ BOOST_AUTO_TEST_CASE(test_loop_candidates_from_db) {
   ost::mol::EntityHandle ent = LoadTestStructure("data/1CRN.pdb");
   ost::seq::ProfileHandlePtr prof;
   prof = ost::io::LoadSequenceProfile("data/1CRNA.hhm");
-  const uint loop_length = 5;
+  const uint loop_length = 7;
   const uint start_idx = 10;
 
   // get dbs (small DBs created as in example codes using crambin)
@@ -311,15 +311,15 @@ BOOST_AUTO_TEST_CASE(test_loop_candidates_from_db) {
   structure_db = loop::StructureDB::LoadPortable("data/port_str_db.dat");
   // get known fragments to put in here
   std::vector<loop::FragmentInfo> fragments;
-  fragments.push_back(loop::FragmentInfo(1, 108, 5));
-  fragments.push_back(loop::FragmentInfo(1, 24, 5));
-  fragments.push_back(loop::FragmentInfo(1, 93, 5));
-  fragments.push_back(loop::FragmentInfo(0, 10, 5));
-  fragments.push_back(loop::FragmentInfo(0, 9, 5));
-  fragments.push_back(loop::FragmentInfo(1, 90, 5));
-  fragments.push_back(loop::FragmentInfo(0, 7, 5));
-  fragments.push_back(loop::FragmentInfo(1, 28, 5));
-  fragments.push_back(loop::FragmentInfo(1, 4, 5));
+  fragments.push_back(loop::FragmentInfo(1, 108, loop_length));
+  fragments.push_back(loop::FragmentInfo(1, 24, loop_length));
+  fragments.push_back(loop::FragmentInfo(1, 93, loop_length));
+  fragments.push_back(loop::FragmentInfo(0, 10, loop_length));
+  fragments.push_back(loop::FragmentInfo(0, 9, loop_length));
+  fragments.push_back(loop::FragmentInfo(1, 90, loop_length));
+  fragments.push_back(loop::FragmentInfo(0, 7, loop_length));
+  fragments.push_back(loop::FragmentInfo(1, 28, loop_length));
+  fragments.push_back(loop::FragmentInfo(1, 4, loop_length));
 
   // manually fill loop candidates
   String loop_seq = "";
@@ -343,9 +343,9 @@ BOOST_AUTO_TEST_CASE(test_loop_candidates_from_db) {
                                                  structure_db, *prof, start_idx);
   stem_rmsds = loop_candidates->CalculateStemRMSDs(n_stem, c_stem);
   // check scores
-  CheckScore(seq_prof_scores, N_cand, -2.12749, 2.07746, -0.484273);
-  CheckScore(str_prof_scores, N_cand, 0.149246, 0.426513, 0.282906);
-  CheckScore(stem_rmsds, N_cand, 0.00893292, 1.66355, 0.691202);
+  CheckScore(seq_prof_scores, N_cand, -2.73226428, 2.37290597, -1.04137301);
+  CheckScore(str_prof_scores, N_cand, 0.0652512088, 0.749628067, 0.391301066);
+  CheckScore(stem_rmsds, N_cand, 0.0314558744, 2.9163394, 1.22806919);
 
   // check ApplyXXX
   // -> expected ApplyCCD: 0, 2, .., 8 converging
@@ -354,41 +354,51 @@ BOOST_AUTO_TEST_CASE(test_loop_candidates_from_db) {
   orig_indices = lc.ApplyCCD(n_stem, c_stem);
   CheckOrigIndices(lc, orig_indices, fragments);
   // ShowIndices(orig_indices);
-  BOOST_CHECK_EQUAL(orig_indices.size(), uint(8));
+  BOOST_CHECK_EQUAL(orig_indices.size(), uint(9));
   BOOST_CHECK_EQUAL(orig_indices[0], uint(0));
-  BOOST_CHECK_EQUAL(orig_indices[1], uint(2));
-  BOOST_CHECK_EQUAL(orig_indices[7], uint(8));
+  BOOST_CHECK_EQUAL(orig_indices[1], uint(1));
+  BOOST_CHECK_EQUAL(orig_indices[8], uint(8));
   CheckProfScores(lc, orig_indices, structure_db, prof, start_idx,
                   seq_prof_scores, str_prof_scores);
 
   // -> expected ApplyKIC: 0 converging with 2 LC
   lc = *loop_candidates; // COPY
-  orig_indices = lc.ApplyKIC(n_stem, c_stem, 1, 2, 3);
+  orig_indices = lc.ApplyKIC(n_stem, c_stem, 1, 3, 5);
   CheckOrigIndices(lc, orig_indices, fragments);
   // ShowIndices(orig_indices);
-  BOOST_CHECK_EQUAL(orig_indices.size(), uint(2));
-  BOOST_CHECK_EQUAL(orig_indices[0], uint(3));
-  BOOST_CHECK_EQUAL(orig_indices[1], uint(3));
+  BOOST_CHECK_EQUAL(orig_indices.size(), uint(50));
+
+
+  BOOST_CHECK_EQUAL(orig_indices[0], uint(0));
+  BOOST_CHECK_EQUAL(orig_indices[1], uint(0));
+  BOOST_CHECK_EQUAL(orig_indices[2], uint(0));
+  BOOST_CHECK_EQUAL(orig_indices[3], uint(0));
+  BOOST_CHECK_EQUAL(orig_indices[4], uint(1));
+  BOOST_CHECK_EQUAL(orig_indices[5], uint(1));
+  BOOST_CHECK_EQUAL(orig_indices[6], uint(1));
+  BOOST_CHECK_EQUAL(orig_indices[49], uint(8));
+
   CheckProfScores(lc, orig_indices, structure_db, prof, start_idx,
                   seq_prof_scores, str_prof_scores);
 
   // cluster them
   std::vector< std::vector<uint> > clusters;
-  const Real max_dist = 0.23;
+  const Real max_dist = 0.35;
   loop_candidates->GetClusters(max_dist, clusters);
   // ShowClusters(clusters);
-  BOOST_CHECK_EQUAL(clusters.size(), uint(4));
+  BOOST_CHECK_EQUAL(clusters.size(), uint(5));
   std::vector<uint> cluster_sizes;
-  for(int i = 0; i < 4; ++i){
+  for(uint i = 0; i < clusters.size(); ++i){
     cluster_sizes.push_back(clusters[i].size());
   }
   std::sort(cluster_sizes.begin(), cluster_sizes.end());
   BOOST_CHECK_EQUAL(cluster_sizes[0], uint(1));
   BOOST_CHECK_EQUAL(cluster_sizes[1], uint(1));
-  BOOST_CHECK_EQUAL(cluster_sizes[2], uint(3));
-  BOOST_CHECK_EQUAL(cluster_sizes[3], uint(4));
+  BOOST_CHECK_EQUAL(cluster_sizes[2], uint(1));
+  BOOST_CHECK_EQUAL(cluster_sizes[3], uint(3));
+  BOOST_CHECK_EQUAL(cluster_sizes[4], uint(3));
   CheckClusters(clusters, N_cand);
 
   // get filtered variant to check neglect_size_one below
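NOTE: the updated expectations above simply track the rebuilt databases
shipped with this patch. For context, the kind of workflow those databases
serve is the fragment search from doc/tests/scripts/loop_frag_db.py at the
top of the patch; a sketch of that pattern (input file, stem residues and
gap sequence are placeholders, and the loop.LoadFragDB()/
loop.LoadStructureDB() default loaders are assumed to be available):

    from promod3 import loop
    from ost import io

    prot = io.LoadPDB("model.pdb")  # placeholder input
    n_stem = prot.residues[10]      # placeholder stem residues
    c_stem = prot.residues[14]
    frag_seq = "GGGGG"              # placeholder sequence for a 5-residue gap

    frag_db = loop.LoadFragDB()
    structure_db = loop.LoadStructureDB()

    fragment_infos = frag_db.SearchDB(n_stem, c_stem, 5)
    for i, f_i in enumerate(fragment_infos):
        bb_list = structure_db.GetBackboneList(n_stem, c_stem, f_i, frag_seq)
        io.SavePDB(bb_list.ToEntity(), str(i) + ".pdb")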