From 1dfd44ba32535bb15e1dfe7f04d4ac576b6129b3 Mon Sep 17 00:00:00 2001
From: Gabriel Studer <gabriel.studer@unibas.ch>
Date: Fri, 31 May 2019 15:13:59 +0200
Subject: [PATCH] updated CAD score binding - format of result has changed,
 check doc for further details

---
 CHANGELOG.txt                      |   2 +
 modules/bindings/pymod/cadscore.py | 261 +++++++++++++++++++++++------
 2 files changed, 216 insertions(+), 47 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index eb306e7ac..f0f8504a8 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -4,6 +4,8 @@ Changes in Release 1.x.x
  * Use system provided SQLite3 library
  * C++ wrapper around C++ implementation of TMalign
  * Database for efficient in-memory lookup of coordinates and sequences
+ * Updated CAD score binding with change in result format, check documentation
+   for further details 
 
 Changes in Release 1.9.0
 --------------------------------------------------------------------------------
diff --git a/modules/bindings/pymod/cadscore.py b/modules/bindings/pymod/cadscore.py
index fdaef13a9..8b81c79a9 100644
--- a/modules/bindings/pymod/cadscore.py
+++ b/modules/bindings/pymod/cadscore.py
@@ -28,8 +28,8 @@ Proteins. 2012 Aug 30. [Epub ahead of print]
 Authors: Valerio Mariani, Alessandro Barbato
 """
 
-import subprocess, os, tempfile, platform
-from ost import settings, io
+import subprocess, os, tempfile, platform, re
+from ost import settings, io, mol
 
 def _SetupFiles(model,reference):
   # create temporary directory
@@ -37,7 +37,8 @@ def _SetupFiles(model,reference):
   dia = 'PDB'
   for chain in model.chains:
     if chain.name==" ":
-      raise RuntimeError("One of the chains in the model has no name. Cannot calculate CAD score")
+      raise RuntimeError("One of the chains in the model has no name. Cannot "
+                         "calculate CAD score")
     if len(chain.name) > 1:
       dia = 'CHARMM'
       break;
@@ -49,7 +50,8 @@ def _SetupFiles(model,reference):
   dia = 'PDB'
   for chain in reference.chains:
     if chain.name==" ":
-      raise RuntimeError("One of the chains in the reference has no name. Cannot calculate CAD score")
+      raise RuntimeError("One of the chains in the reference has no name. "
+                         "Cannot calculate CAD score")
     if len(chain.name) > 1:
       dia = 'CHARMM'
       break;
@@ -64,7 +66,6 @@ def _CleanupFiles(dir_name):
   import shutil
   shutil.rmtree(dir_name)
 
-
 class CADResult:
   """
   Holds the result of running CAD
@@ -77,78 +78,244 @@ class CADResult:
   
     Dictionary containing local CAD's atom-atom (AA) scores. 
     
-    :type: dictionary (key: chain.resnum (e.g.: A.24), value: CAD local AA score (see CAD Documentation online)
+    :type: dictionary (key: tuple(chain, resnum) (e.g.: 
+    ("A", ost.mol.ResNum(24)), value: CAD local AA score 
+    (see CAD Documentation online)
   """
   def __init__(self, globalAA, localAA):    
     self.globalAA=globalAA
     self.localAA=localAA    
 
 def _ParseCADGlobal(lines):
-  interesting_lines=lines[1]
-  aa=float(interesting_lines.split()[10])
-  return aa
+  header = lines[0].split()
+  aa_idx = header.index("AA")
+  aa_score=float(lines[1].split()[aa_idx])
+  return aa_score
 
 def _ParseCADLocal(lines):
+  local_scores_idx = None
+  for line_idx in range(len(lines)):
+    if "local_scores" in lines[line_idx]:
+      local_scores_idx = line_idx
+      break
+  if local_scores_idx == None:
+    raise RuntimeError("Failed to parse local cadscores")
+  local_aa_dict={}
+  for line_idx in range(local_scores_idx+2, len(lines)):
+    items=lines[line_idx].split()
+    local_aa = float(items[2])
+    if local_aa < 0.0:
+      continue # invalid CAD score
+    key = (items[0], mol.ResNum(int(items[1])))
+    local_aa_dict[key] = local_aa
+    
+  return local_aa_dict
+
+def _ParseVoronotaGlobal(lines):
+  return float(lines[0].split()[4])
+
+def _ParseVoronotaLocal(lines):
   local_aa_dict={}
-  for lin in lines[11:]:
-    items=lin.split()
-    chain=items[0]
-    resnum=int(items[1])
-    key=chain+'.'+str(resnum)
-    aa=float(items[2])
-    local_aa_dict[key]=aa
+  chain_name_regex = r'c\<\D+\>'
+  resnum_regex = r'r\<\d+\>'
+  insertion_code_regex = r'i\<\D\>'
+  for line in lines:
+    local_aa = float(line.split()[-1])
+    if local_aa < 0.0:
+      continue # invalid CAD score
+    chain_data = re.findall(chain_name_regex, line)
+    resnum_data = re.findall(resnum_regex, line)
+    insertion_code_data = re.findall(insertion_code_regex, line)
+    resnum = None
+    if len(insertion_code_data) == 0:
+      resnum = mol.ResNum(int(resnum_data[0][1:].strip('><')))
+    else:
+      resnum = mol.ResNum(int(resnum_data[0][1:].strip('><')), 
+                          insertion_code_data[0][1:].strip('><'))
+    key = (chain_data[0][1:].strip('><'), resnum)
+    local_aa_dict[key] = local_aa
   return local_aa_dict
 
-def _RunCAD(max_iter, tmp_dir):
+def _RunCAD(tmp_dir, mode, cad_bin_path, old_regime):
+
   model_filename=os.path.join(tmp_dir, 'model.pdb')
   reference_filename=os.path.join(tmp_dir, 'reference.pdb')
+  globalAA = None
+  localAA = None
+
   if platform.system() == "Windows":
     raise RuntimeError('CAD score not available on Windows')
+
+  if mode == "classic":
+    cad_calc_path = None
+    cad_read_g_path = None
+    cad_read_l_path = None
+    if cad_bin_path:
+      cad_calc_path = settings.Locate('CADscore_calc.bash', 
+                                      search_paths=[cad_bin_path],
+                                      search_system_paths=False)  
+      cad_read_g_path = settings.Locate('CADscore_read_global_scores.bash', 
+                                        search_paths=[cad_bin_path],
+                                        search_system_paths=False)  
+      cad_read_l_path=settings.Locate('CADscore_read_local_scores.bash', 
+                                      search_paths=[cad_bin_path],
+                                      search_system_paths=False)
+      # also try to locate the actual executable that is called from the 
+      # bash scripts 
+      executable_path = settings.Locate('voroprot2', 
+                                        search_paths=[cad_bin_path],
+                                        search_system_paths=False) 
+    else:
+      cad_calc_path = settings.Locate('CADscore_calc.bash')  
+      cad_read_g_path = settings.Locate('CADscore_read_global_scores.bash')  
+      cad_read_l_path = settings.Locate('CADscore_read_local_scores.bash')
+      # also try to locate the actual executable that is called from the 
+      # bash scripts
+      executable_path = settings.Locate('voroprot2')  
+    command1="\"%s\" -m \"%s\" -t \"%s\" -D \"%s\"" %(cad_calc_path, 
+                                                      model_filename, 
+                                                      reference_filename, 
+                                                      os.path.join(tmp_dir,
+                                                                   "cadtemp"))
+    command2="\"%s\" -D \"%s\"" %(cad_read_g_path, os.path.join(tmp_dir,
+                                                                "cadtemp"))
+    command3="\"%s\" -m \"%s\" -t \"%s\" -D \"%s\" -c AA" %(cad_read_l_path, 
+                                                            model_filename, 
+                                                            reference_filename,
+                                                            os.path.join(tmp_dir,
+                                                            "cadtemp"))
+
+    ps1=subprocess.Popen(command1, shell=True, stdout=subprocess.PIPE)
+    ps1.wait()
+    ps2=subprocess.Popen(command2, shell=True, stdout=subprocess.PIPE)
+    ps2.wait()
+    lines=ps2.stdout.readlines()
+    try:
+      globalAA=_ParseCADGlobal(lines)
+    except:
+      raise RuntimeError("CAD calculation failed")
+    ps3=subprocess.Popen(command3, shell=True, stdout=subprocess.PIPE)
+    ps3.wait()
+    lines=ps3.stdout.readlines()
+    try:
+      localAA=_ParseCADLocal(lines)
+    except:
+      raise RuntimeError("CAD calculation failed")
+
+  elif mode == "voronota":
+    local_score_filename = os.path.join(tmp_dir, "local_scores.txt")
+    voronota_cadscore_path = None
+    if cad_bin_path:
+      voronota_cadscore_path = settings.Locate("voronota-cadscore", 
+                                               search_paths=[cad_bin_path],
+                                               search_system_paths=False)
+      # also try to locate the actual executable that is called from the 
+      # bash script
+      executable_path = settings.Locate("voronota", 
+                                        search_paths=[cad_bin_path],
+                                        search_system_paths=False)      
+    else:
+      voronota_cadscore_path = settings.Locate("voronota-cadscore")
+      # also try to locate the actual executable that is called from the 
+      # bash script
+      executable_path = settings.Locate("voronota")  
+    cmd = [voronota_cadscore_path, '-m', model_filename, '-t', 
+           reference_filename, '--contacts-query-by-code AA', 
+           '--output-residue-scores', local_score_filename]
+    if old_regime:
+      cmd.append("--old-regime")
+    cmd = ' '.join(cmd)
+    ps = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
+    ps.wait()
+    try:
+      globalAA = _ParseVoronotaGlobal(ps.stdout.readlines())
+    except:
+      raise RuntimeError("CAD calculation failed")
+    try:
+      localAA = _ParseVoronotaLocal(open(local_score_filename).readlines())
+    except:
+      raise RuntimeError("CAD calculation failed")
+
   else:
-    cad_calc_path=settings.Locate('CADscore_calc.bash')  
-    cad_read_g_path=settings.Locate('CADscore_read_global_scores.bash')  
-    cad_read_l_path=settings.Locate('CADscore_read_local_scores.bash')  
-    command1="\"%s\" -o %i -m \"%s\" -t \"%s\" -D \"%s\"" %(cad_calc_path, max_iter, model_filename, reference_filename, os.path.join(tmp_dir,"cadtemp"))
-    command2="\"%s\" -D \"%s\"" %(cad_read_g_path, os.path.join(tmp_dir,"cadtemp"))
-    command3="\"%s\" -m \"%s\" -t \"%s\" -D \"%s\" -c AA" %(cad_read_l_path, model_filename, reference_filename,os.path.join(tmp_dir,"cadtemp"))
-  ps1=subprocess.Popen(command1, shell=True, stdout=subprocess.PIPE)
-  ps1.wait()
-  ps2=subprocess.Popen(command2, shell=True, stdout=subprocess.PIPE)
-  ps2.wait()
-  lines=ps2.stdout.readlines()
-  try:
-    globalAA=_ParseCADGlobal(lines)
-  except:
-    raise RuntimeError("CAD calculation failed")
-  ps3=subprocess.Popen(command3, shell=True, stdout=subprocess.PIPE)
-  ps3.wait()
-  lines=ps3.stdout.readlines()
-  try:
-    localAA=_ParseCADLocal(lines)
-  except:
-    raise RuntimeError("CAD calculation failed")
+    raise RuntimeError("Invalid CAD mode! Allowed are: "
+                       "[\"classic\", \"voronota\"]")
+
 
   return CADResult(globalAA,localAA)
 
+def _HasInsertionCodes(model, reference):
+  for r in model.residues:
+    if r.GetNumber().GetInsCode() != "\0":
+      print r
+      return True
+  for r in reference.residues:
+    if r.GetNumber().GetInsCode() != "\0":
+      print r
+      return True
+  return False
 
-def CADScore(model, reference, max_iter=300):
+def _MapLabels(model, cad_results, label):
+  for k,v in cad_results.localAA.iteritems():
+    r = model.FindResidue(k[0], k[1])
+    if not r.IsValid():
+      raise RuntimeError("Failed to map cadscore on residues: " +
+                         "CAD score estimated for residue in chain \"" +
+                         k[0] + "\" with ResNum " + str(k[1]) + ". Residue " +
+                         "could not be found in model.")
+    r.SetFloatProp(label, v)
+
+def CADScore(model, reference, mode = "classic", label = "localcad",
+             old_regime = False, cad_bin_path = None):
   """
-  Calculates global and local atom-atom (AA) CAD Scores
-  
+  Calculates global and local atom-atom (AA) CAD Scores. 
+
+  You can either access the original implementation available from
+  https://bitbucket.org/kliment/cadscore/downloads/
+  or the new implementation which is part of the Voronota package 
+  available from https://bitbucket.org/kliment/voronota/downloads/.
+
+  The scores of the two implementations differ but strongly correlate
+  as the contacts between atoms are estimated differently. When using
+  the "voronota" *mode* you can minimize those discrepancies by
+  setting the *old_regime* flag to True.
+
+  Furthermore, the "voronota" *mode* generates per-residue scores that 
+  are inverted when compared to the classical implementation 
+  (0.0: bad, 1.0 good). 
 
   :param model: The model structure. 
   :type model: :class:`~ost.mol.EntityView` or :class:`~ost.mol.EntityHandle`
   :param reference: The reference structure
-  :type model2: :class:`~ost.mol.EntityView` or :class:`~ost.mol.EntityHandle`
-  :param max_iter: Optional. The maximum number of iteration for the tessellation algorithm before giving up. By default 300
+  :type reference: :class:`~ost.mol.EntityView` or 
+                   :class:`~ost.mol.EntityHandle`
+  :param mode:  What CAD score implementation to use, must be one in 
+                ["classic", "voronota"]
+  :param label: Local CAD scores will be mapped on residues of model as 
+                float property with this name
+  :type label: :class:`str`
+  :param old_regime: Only has an effect if *mode* is "voronota". If set to true,
+                     the discrepancies between the two modes is minimized but
+                     the behaviour of inverted scores persists.
+  :type old_regime: :class:`bool`
+  :param cad_bin_path: Path to search for the required executables 
+                       (["CADscore_calc.bash", 
+                       "CADscore_read_global_scores.bash",
+                       "CADscore_read_local_scores.bash"] for "classic" *mode* 
+                       or ["voronota-cadscore"] for "voronota" *mode*). If not
+                       set, the env path is searched.
+  :type cad_bin_path: :class:`str`
   :returns: The result of the CAD score calculation
   :rtype: :class:`CADResult`
   
-  :raises: :class:`~ost.settings.FileNotFound` if any of the CAD score exacutables could not be located.
+  :raises: :class:`~ost.settings.FileNotFound` if any of the CAD score 
+           executables could not be located.
   :raises: :class:`RuntimeError` if the calculation failed
   """
+  if mode == "classic" and _HasInsertionCodes(model, reference):
+    raise RuntimeError("The classic CAD score implementation does not support "
+                       "insertion codes in residues")
   tmp_dir_name=_SetupFiles(model, reference)
-  result=_RunCAD(max_iter, tmp_dir_name)
+  result=_RunCAD(tmp_dir_name, mode, cad_bin_path, old_regime)
   _CleanupFiles(tmp_dir_name)
+  _MapLabels(model, result, label)
   return result
-
-- 
GitLab