From 1dfd44ba32535bb15e1dfe7f04d4ac576b6129b3 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Fri, 31 May 2019 15:13:59 +0200 Subject: [PATCH] updated CAD score binding - format of result has changed, check doc for further details --- CHANGELOG.txt | 2 + modules/bindings/pymod/cadscore.py | 261 +++++++++++++++++++++++------ 2 files changed, 216 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index eb306e7ac..f0f8504a8 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -4,6 +4,8 @@ Changes in Release 1.x.x * Use system provided SQLite3 library * C++ wrapper around C++ implementation of TMalign * Database for efficient in-memory lookup of coordinates and sequences + * Updated CAD score binding with change in result format, check documentation + for further details Changes in Release 1.9.0 -------------------------------------------------------------------------------- diff --git a/modules/bindings/pymod/cadscore.py b/modules/bindings/pymod/cadscore.py index fdaef13a9..8b81c79a9 100644 --- a/modules/bindings/pymod/cadscore.py +++ b/modules/bindings/pymod/cadscore.py @@ -28,8 +28,8 @@ Proteins. 2012 Aug 30. [Epub ahead of print] Authors: Valerio Mariani, Alessandro Barbato """ -import subprocess, os, tempfile, platform -from ost import settings, io +import subprocess, os, tempfile, platform, re +from ost import settings, io, mol def _SetupFiles(model,reference): # create temporary directory @@ -37,7 +37,8 @@ def _SetupFiles(model,reference): dia = 'PDB' for chain in model.chains: if chain.name==" ": - raise RuntimeError("One of the chains in the model has no name. Cannot calculate CAD score") + raise RuntimeError("One of the chains in the model has no name. Cannot " + "calculate CAD score") if len(chain.name) > 1: dia = 'CHARMM' break; @@ -49,7 +50,8 @@ def _SetupFiles(model,reference): dia = 'PDB' for chain in reference.chains: if chain.name==" ": - raise RuntimeError("One of the chains in the reference has no name. Cannot calculate CAD score") + raise RuntimeError("One of the chains in the reference has no name. " + "Cannot calculate CAD score") if len(chain.name) > 1: dia = 'CHARMM' break; @@ -64,7 +66,6 @@ def _CleanupFiles(dir_name): import shutil shutil.rmtree(dir_name) - class CADResult: """ Holds the result of running CAD @@ -77,78 +78,244 @@ class CADResult: Dictionary containing local CAD's atom-atom (AA) scores. - :type: dictionary (key: chain.resnum (e.g.: A.24), value: CAD local AA score (see CAD Documentation online) + :type: dictionary (key: tuple(chain, resnum) (e.g.: + ("A", ost.mol.ResNum(24)), value: CAD local AA score + (see CAD Documentation online) """ def __init__(self, globalAA, localAA): self.globalAA=globalAA self.localAA=localAA def _ParseCADGlobal(lines): - interesting_lines=lines[1] - aa=float(interesting_lines.split()[10]) - return aa + header = lines[0].split() + aa_idx = header.index("AA") + aa_score=float(lines[1].split()[aa_idx]) + return aa_score def _ParseCADLocal(lines): + local_scores_idx = None + for line_idx in range(len(lines)): + if "local_scores" in lines[line_idx]: + local_scores_idx = line_idx + break + if local_scores_idx == None: + raise RuntimeError("Failed to parse local cadscores") + local_aa_dict={} + for line_idx in range(local_scores_idx+2, len(lines)): + items=lines[line_idx].split() + local_aa = float(items[2]) + if local_aa < 0.0: + continue # invalid CAD score + key = (items[0], mol.ResNum(int(items[1]))) + local_aa_dict[key] = local_aa + + return local_aa_dict + +def _ParseVoronotaGlobal(lines): + return float(lines[0].split()[4]) + +def _ParseVoronotaLocal(lines): local_aa_dict={} - for lin in lines[11:]: - items=lin.split() - chain=items[0] - resnum=int(items[1]) - key=chain+'.'+str(resnum) - aa=float(items[2]) - local_aa_dict[key]=aa + chain_name_regex = r'c\<\D+\>' + resnum_regex = r'r\<\d+\>' + insertion_code_regex = r'i\<\D\>' + for line in lines: + local_aa = float(line.split()[-1]) + if local_aa < 0.0: + continue # invalid CAD score + chain_data = re.findall(chain_name_regex, line) + resnum_data = re.findall(resnum_regex, line) + insertion_code_data = re.findall(insertion_code_regex, line) + resnum = None + if len(insertion_code_data) == 0: + resnum = mol.ResNum(int(resnum_data[0][1:].strip('><'))) + else: + resnum = mol.ResNum(int(resnum_data[0][1:].strip('><')), + insertion_code_data[0][1:].strip('><')) + key = (chain_data[0][1:].strip('><'), resnum) + local_aa_dict[key] = local_aa return local_aa_dict -def _RunCAD(max_iter, tmp_dir): +def _RunCAD(tmp_dir, mode, cad_bin_path, old_regime): + model_filename=os.path.join(tmp_dir, 'model.pdb') reference_filename=os.path.join(tmp_dir, 'reference.pdb') + globalAA = None + localAA = None + if platform.system() == "Windows": raise RuntimeError('CAD score not available on Windows') + + if mode == "classic": + cad_calc_path = None + cad_read_g_path = None + cad_read_l_path = None + if cad_bin_path: + cad_calc_path = settings.Locate('CADscore_calc.bash', + search_paths=[cad_bin_path], + search_system_paths=False) + cad_read_g_path = settings.Locate('CADscore_read_global_scores.bash', + search_paths=[cad_bin_path], + search_system_paths=False) + cad_read_l_path=settings.Locate('CADscore_read_local_scores.bash', + search_paths=[cad_bin_path], + search_system_paths=False) + # also try to locate the actual executable that is called from the + # bash scripts + executable_path = settings.Locate('voroprot2', + search_paths=[cad_bin_path], + search_system_paths=False) + else: + cad_calc_path = settings.Locate('CADscore_calc.bash') + cad_read_g_path = settings.Locate('CADscore_read_global_scores.bash') + cad_read_l_path = settings.Locate('CADscore_read_local_scores.bash') + # also try to locate the actual executable that is called from the + # bash scripts + executable_path = settings.Locate('voroprot2') + command1="\"%s\" -m \"%s\" -t \"%s\" -D \"%s\"" %(cad_calc_path, + model_filename, + reference_filename, + os.path.join(tmp_dir, + "cadtemp")) + command2="\"%s\" -D \"%s\"" %(cad_read_g_path, os.path.join(tmp_dir, + "cadtemp")) + command3="\"%s\" -m \"%s\" -t \"%s\" -D \"%s\" -c AA" %(cad_read_l_path, + model_filename, + reference_filename, + os.path.join(tmp_dir, + "cadtemp")) + + ps1=subprocess.Popen(command1, shell=True, stdout=subprocess.PIPE) + ps1.wait() + ps2=subprocess.Popen(command2, shell=True, stdout=subprocess.PIPE) + ps2.wait() + lines=ps2.stdout.readlines() + try: + globalAA=_ParseCADGlobal(lines) + except: + raise RuntimeError("CAD calculation failed") + ps3=subprocess.Popen(command3, shell=True, stdout=subprocess.PIPE) + ps3.wait() + lines=ps3.stdout.readlines() + try: + localAA=_ParseCADLocal(lines) + except: + raise RuntimeError("CAD calculation failed") + + elif mode == "voronota": + local_score_filename = os.path.join(tmp_dir, "local_scores.txt") + voronota_cadscore_path = None + if cad_bin_path: + voronota_cadscore_path = settings.Locate("voronota-cadscore", + search_paths=[cad_bin_path], + search_system_paths=False) + # also try to locate the actual executable that is called from the + # bash script + executable_path = settings.Locate("voronota", + search_paths=[cad_bin_path], + search_system_paths=False) + else: + voronota_cadscore_path = settings.Locate("voronota-cadscore") + # also try to locate the actual executable that is called from the + # bash script + executable_path = settings.Locate("voronota") + cmd = [voronota_cadscore_path, '-m', model_filename, '-t', + reference_filename, '--contacts-query-by-code AA', + '--output-residue-scores', local_score_filename] + if old_regime: + cmd.append("--old-regime") + cmd = ' '.join(cmd) + ps = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + ps.wait() + try: + globalAA = _ParseVoronotaGlobal(ps.stdout.readlines()) + except: + raise RuntimeError("CAD calculation failed") + try: + localAA = _ParseVoronotaLocal(open(local_score_filename).readlines()) + except: + raise RuntimeError("CAD calculation failed") + else: - cad_calc_path=settings.Locate('CADscore_calc.bash') - cad_read_g_path=settings.Locate('CADscore_read_global_scores.bash') - cad_read_l_path=settings.Locate('CADscore_read_local_scores.bash') - command1="\"%s\" -o %i -m \"%s\" -t \"%s\" -D \"%s\"" %(cad_calc_path, max_iter, model_filename, reference_filename, os.path.join(tmp_dir,"cadtemp")) - command2="\"%s\" -D \"%s\"" %(cad_read_g_path, os.path.join(tmp_dir,"cadtemp")) - command3="\"%s\" -m \"%s\" -t \"%s\" -D \"%s\" -c AA" %(cad_read_l_path, model_filename, reference_filename,os.path.join(tmp_dir,"cadtemp")) - ps1=subprocess.Popen(command1, shell=True, stdout=subprocess.PIPE) - ps1.wait() - ps2=subprocess.Popen(command2, shell=True, stdout=subprocess.PIPE) - ps2.wait() - lines=ps2.stdout.readlines() - try: - globalAA=_ParseCADGlobal(lines) - except: - raise RuntimeError("CAD calculation failed") - ps3=subprocess.Popen(command3, shell=True, stdout=subprocess.PIPE) - ps3.wait() - lines=ps3.stdout.readlines() - try: - localAA=_ParseCADLocal(lines) - except: - raise RuntimeError("CAD calculation failed") + raise RuntimeError("Invalid CAD mode! Allowed are: " + "[\"classic\", \"voronota\"]") + return CADResult(globalAA,localAA) +def _HasInsertionCodes(model, reference): + for r in model.residues: + if r.GetNumber().GetInsCode() != "\0": + print r + return True + for r in reference.residues: + if r.GetNumber().GetInsCode() != "\0": + print r + return True + return False -def CADScore(model, reference, max_iter=300): +def _MapLabels(model, cad_results, label): + for k,v in cad_results.localAA.iteritems(): + r = model.FindResidue(k[0], k[1]) + if not r.IsValid(): + raise RuntimeError("Failed to map cadscore on residues: " + + "CAD score estimated for residue in chain \"" + + k[0] + "\" with ResNum " + str(k[1]) + ". Residue " + + "could not be found in model.") + r.SetFloatProp(label, v) + +def CADScore(model, reference, mode = "classic", label = "localcad", + old_regime = False, cad_bin_path = None): """ - Calculates global and local atom-atom (AA) CAD Scores - + Calculates global and local atom-atom (AA) CAD Scores. + + You can either access the original implementation available from + https://bitbucket.org/kliment/cadscore/downloads/ + or the new implementation which is part of the Voronota package + available from https://bitbucket.org/kliment/voronota/downloads/. + + The scores of the two implementations differ but strongly correlate + as the contacts between atoms are estimated differently. When using + the "voronota" *mode* you can minimize those discrepancies by + setting the *old_regime* flag to True. + + Furthermore, the "voronota" *mode* generates per-residue scores that + are inverted when compared to the classical implementation + (0.0: bad, 1.0 good). :param model: The model structure. :type model: :class:`~ost.mol.EntityView` or :class:`~ost.mol.EntityHandle` :param reference: The reference structure - :type model2: :class:`~ost.mol.EntityView` or :class:`~ost.mol.EntityHandle` - :param max_iter: Optional. The maximum number of iteration for the tessellation algorithm before giving up. By default 300 + :type reference: :class:`~ost.mol.EntityView` or + :class:`~ost.mol.EntityHandle` + :param mode: What CAD score implementation to use, must be one in + ["classic", "voronota"] + :param label: Local CAD scores will be mapped on residues of model as + float property with this name + :type label: :class:`str` + :param old_regime: Only has an effect if *mode* is "voronota". If set to true, + the discrepancies between the two modes is minimized but + the behaviour of inverted scores persists. + :type old_regime: :class:`bool` + :param cad_bin_path: Path to search for the required executables + (["CADscore_calc.bash", + "CADscore_read_global_scores.bash", + "CADscore_read_local_scores.bash"] for "classic" *mode* + or ["voronota-cadscore"] for "voronota" *mode*). If not + set, the env path is searched. + :type cad_bin_path: :class:`str` :returns: The result of the CAD score calculation :rtype: :class:`CADResult` - :raises: :class:`~ost.settings.FileNotFound` if any of the CAD score exacutables could not be located. + :raises: :class:`~ost.settings.FileNotFound` if any of the CAD score + executables could not be located. :raises: :class:`RuntimeError` if the calculation failed """ + if mode == "classic" and _HasInsertionCodes(model, reference): + raise RuntimeError("The classic CAD score implementation does not support " + "insertion codes in residues") tmp_dir_name=_SetupFiles(model, reference) - result=_RunCAD(max_iter, tmp_dir_name) + result=_RunCAD(tmp_dir_name, mode, cad_bin_path, old_regime) _CleanupFiles(tmp_dir_name) + _MapLabels(model, result, label) return result - -- GitLab