From 68899ae1d7285159c1c1e3025558fb9c8abf5803 Mon Sep 17 00:00:00 2001
From: Xavier Robin <xavalias-github@xavier.robin.name>
Date: Thu, 16 May 2024 15:46:53 +0200
Subject: [PATCH] fix: SCHWED-6274 read SEQRES from entity_poly_seq

This commit additionally drops more side-effects of the presence of a
seqres record by moving all the logic to the GetSeqRes function, rather
than generating the SEQRES on the fly during parsing.
---
 CHANGELOG.txt                                 |   3 +
 modules/io/doc/mmcif.rst                      |  26 +--
 modules/io/pymod/__init__.py                  |   3 +-
 modules/io/pymod/export_mmcif_io.cc           |   6 +-
 modules/io/src/mol/mmcif_info.hh              |   3 +-
 modules/io/src/mol/mmcif_reader.cc            | 187 ++++++++----------
 modules/io/src/mol/mmcif_reader.hh            |  43 +---
 modules/io/src/mol/mmcif_str.cc               |   1 -
 modules/io/tests/test_mmcif_reader.cc         |  64 ------
 .../io/tests/testfiles/mmcif/atom_site.mmcif  |   9 +
 .../alg/tests/testfiles/align_to_seqres.mmcif |  82 +++++++-
 .../validate_segres_aln_breakage.mmcif        |  81 +++++++-
 .../validate_seqres_aln_connected.mmcif       |  83 +++++++-
 13 files changed, 364 insertions(+), 227 deletions(-)

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index f3f863c14..0864bb6e6 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -8,6 +8,9 @@ Changes in Release x.x.x
  * Added 'full_bs_search' argument in ligand scoring to optionally speed up
    computations in large complexes.
    the number of model binding sites
+ * SEQRES information is now read from the entity_poly_seq category in mmCIF.
+   The canonical as well as semi-canonical (with three-letter codes in
+   brackets) SEQRES are now available from the MMCifInfo object.
  * Several bug fixes and improvements.
 
 Changes in Release 2.7.0
diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst
index 91d6cda1f..43618968f 100644
--- a/modules/io/doc/mmcif.rst
+++ b/modules/io/doc/mmcif.rst
@@ -65,8 +65,7 @@ Notes:
   It is added as string property named "pdb_auth_chain_name" to the
   :class:`~ost.mol.ChainHandle`. The mapping is also stored in
   :class:`MMCifInfo` as :meth:`~MMCifInfo.GetMMCifPDBChainTr` and
-  :meth:`~MMCifInfo.GetPDBMMCifChainTr` if a non-empty SEQRES record exists for
-  that chain (this should exclude ligands and water).
+  :meth:`~MMCifInfo.GetPDBMMCifChainTr` (the latter only for polymer chains).
 * Molecular entities in mmCIF are identified by an ``entity.id``, which is
   extracted from ``atom_site.label_entity_id`` for the first atom of the chain.
   It is added as string property named "entity_id" to the
@@ -1417,16 +1416,21 @@ of the annotation available.
 
     :class:`str`
 
-  .. attribute:: seqres
+  .. attribute:: seqres_canonical
 
-    SEQRES with gentle preprocessing - empty string if entity is not of type
-    "polymer". By default, the :class:`ost.io.MMCifReader` reads the value of the
-    ``_entity_poly.pdbx_seq_one_letter_code`` token. Copies all letters but
-    searches a :class:`ost.conop.CompoundLib` for compound names in brackets.
-    *seqres* gets an 'X' if no compound is found or the respective compound has
-    one letter code '?'. Uses the one letter code of the found compound
-    otherwise. So it's basically a canonical SEQRES with exactly one character
-    per residue.
+    Canonical SEQRES - empty string if entity is not of type "polymer".
+    This contains the canonical sequence extracted from the
+    ``_entity_poly.pdbx_seq_one_letter_code_can`` data item.
+
+    :type: :class:`str`
+
+  .. attribute:: seqres_pdbx
+
+    PDBx SEQRES - empty string if entity is not of type "polymer".
+    This contains the sequence extracted from the
+    ``_entity_poly.pdbx_seq_one_letter_code`` data item.
+    Modifications and non-standard amino acids are represented by
+    their three-letter code in brackets, e.g. "(MSE)".
 
     :type: :class:`str`
 
diff --git a/modules/io/pymod/__init__.py b/modules/io/pymod/__init__.py
index 89c1c3c8e..66a518def 100644
--- a/modules/io/pymod/__init__.py
+++ b/modules/io/pymod/__init__.py
@@ -430,8 +430,7 @@ def LoadMMCIF(filename, fault_tolerant=None, calpha_only=None,
   try:
     ent = mol.CreateEntity()
     reader = MMCifReader(filename, ent, prof)
-    reader.read_seqres = seqres
-    
+
     # NOTE: to speed up things, we could introduce a restrict_chains parameter
     #       similar to the one in LoadPDB. Here, it would have to be a list/set
     #       of chain-name-strings.
diff --git a/modules/io/pymod/export_mmcif_io.cc b/modules/io/pymod/export_mmcif_io.cc
index fa09b5bfa..aa6595bc2 100644
--- a/modules/io/pymod/export_mmcif_io.cc
+++ b/modules/io/pymod/export_mmcif_io.cc
@@ -107,7 +107,6 @@ void export_mmcif_io()
   class_<MMCifReader, boost::noncopyable>("MMCifReader", init<const String&, EntityHandle&, const IOProfile&>())
     .def("Parse", &MMCifReader::Parse)
     .def("SetRestrictChains", &MMCifReader::SetRestrictChains)
-    .def("SetReadCanonicalSeqRes", &MMCifReader::SetReadCanonicalSeqRes)
     .def("GetSeqRes", &MMCifReader::GetSeqRes)
     .def("GetInfo", make_function(&MMCifReader::GetInfo,
                                   return_value_policy<copy_const_reference>()))
@@ -116,8 +115,6 @@ void export_mmcif_io()
                                 return_value_policy<copy_const_reference>()),
                   &MMCifReader::SetRestrictChains)
     .add_property("seqres", &MMCifReader::GetSeqRes)
-    .add_property("read_seqres", &MMCifReader::GetReadSeqRes, 
-                  &MMCifReader::SetReadSeqRes)
     .add_property("info", make_function(&MMCifReader::GetInfo,
                                    return_value_policy<copy_const_reference>()))
     ;
@@ -493,7 +490,8 @@ void export_mmcif_io()
    .add_property("entity_poly_type", &MMCifEntityDesc::entity_poly_type)
    .add_property("branched_type", &MMCifEntityDesc::branched_type)
    .add_property("details", &MMCifEntityDesc::details)
-   .add_property("seqres", &MMCifEntityDesc::seqres)
+   .add_property("seqres_canonical", &MMCifEntityDesc::seqres_canonical)
+   .add_property("seqres_pdbx", &MMCifEntityDesc::seqres_pdbx)
    .add_property("mon_ids", &MMCifEntityDesc::mon_ids)
    .add_property("hetero_num", &MMCifEntityDesc::hetero_num)
    .add_property("hetero_ids", &MMCifEntityDesc::hetero_ids)
diff --git a/modules/io/src/mol/mmcif_info.hh b/modules/io/src/mol/mmcif_info.hh
index c658af2b5..2892ae7c6 100644
--- a/modules/io/src/mol/mmcif_info.hh
+++ b/modules/io/src/mol/mmcif_info.hh
@@ -957,7 +957,8 @@ typedef struct {
   String entity_poly_type;       ///< value of _entity_poly.type
   String branched_type;          ///< value of _pdbx_entity_branch.type
   String details;                ///< description of this entity
-  String seqres;                 ///< chain of monomers
+  String seqres_canonical;       ///< _entity_poly.pdbx_seq_one_letter_code_can
+  String seqres_pdbx;            ///< _entity_poly.pdbx_seq_one_letter_code
   std::vector<String> mon_ids;   ///< list of monomer names from _entity_poly_seq
   std::vector<int> hetero_num;   ///< res num of heterogeneous compounds
   std::vector<String> hetero_ids;///< names of heterogeneous compounds
diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc
index 6ad799100..2a3fed856 100644
--- a/modules/io/src/mol/mmcif_reader.cc
+++ b/modules/io/src/mol/mmcif_reader.cc
@@ -59,15 +59,11 @@ void MMCifReader::Init()
   atom_count_           = 0;
   residue_count_        = 0;
   auth_chain_id_        = false;
-  seqres_can_           = false;
   has_model_            = false;
   restrict_chains_      = "";
   subst_res_id_         = "";
   curr_chain_           = mol::ChainHandle();
   curr_residue_         = mol::ResidueHandle();
-  seqres_               = seq::CreateSequenceList();
-  read_seqres_          = true;
-  warned_rule_based_    = false;
   info_                 = MMCifInfo();
 }
 
@@ -80,7 +76,6 @@ void MMCifReader::ClearState()
   atom_count_           = 0;
   category_             = DONT_KNOW;
   warned_name_mismatch_ = false;
-  seqres_               = seq::CreateSequenceList();
   info_                 = MMCifInfo();
   entity_desc_map_.clear();
   authors_map_.clear();
@@ -737,7 +732,8 @@ MMCifEntityDescMap::iterator MMCifReader::GetEntityDescMapIterator(
                             .entity_poly_type = "",
                             .branched_type = "",
                             .details="",
-                            .seqres="",
+                            .seqres_canonical="",
+                            .seqres_pdbx="",
                             .mon_ids=std::vector<String>(),
                             .hetero_num=std::vector<int>(),
                             .hetero_ids=std::vector<String>()};
@@ -785,93 +781,17 @@ void MMCifReader::ParseEntityPoly(const std::vector<StringRef>& columns)
     edm_it->second.entity_poly_type = columns[indices_[EP_TYPE]].str();
   }
 
-  // store seqres
-  if (edm_it->second.seqres.length() > 0) {
-    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-     "entity_poly.pdbx_seq_one_letter_code[_can] clash: sequence for entry '" +
-                                            columns[indices_[ENTITY_ID]].str() +
-                                             "' is already set to '" +
-                                             edm_it->second.seqres + "'.",
-                                             this->GetCurrentLinenum()));
+  // store canonical seqres
+  if (indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] != -1) {
+    StringRef seqres_can=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN]];
+    edm_it->second.seqres_canonical = seqres_can.str_no_whitespace();
   }
-  if (read_seqres_) {
-    StringRef seqres;
-    if (seqres_can_) {
-      if (indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] != -1) {
-        seqres=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN]];
-        edm_it->second.seqres = seqres.str_no_whitespace();        
-      } else {
-        throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-                   "'entity_poly.pdbx_seq_one_letter_code_can' not available.'",
-                                                 this->GetCurrentLinenum()));
-      }
-    } else if (indices_[PDBX_SEQ_ONE_LETTER_CODE] != -1) {
-      seqres=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE]];
-
-      conop::CompoundLibBasePtr comp_lib=conop::Conopology::Instance()
-                                                .GetDefaultLib();
-      if (!comp_lib) {
-        if (!warned_rule_based_) {
-          LOG_WARNING("SEQRES import requires a valid compound library to "
-                       "handle non standard compounds. Their One letter "
-                       "codes will be set to X.");      
-        }
-        warned_rule_based_=true;
-        comp_lib = conop::CompoundLibBasePtr(new ost::conop::MinimalCompoundLib);
-      }
-      edm_it->second.seqres = this->ConvertSEQRES(seqres.str_no_whitespace(),
-                                                  comp_lib);
-    } else {
-      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-                       "'entity_poly.pdbx_seq_one_letter_code' not available.'",
-                                               this->GetCurrentLinenum()));
-    }
-  }
-}
-
-String MMCifReader::ConvertSEQRES(const String& seqres, 
-                                  conop::CompoundLibBasePtr comp_lib)
-{
-  String can_seqres;
-  for (String::const_iterator i=seqres.begin(), e=seqres.end(); i!=e; ++i) {
-    if (*i=='(') {
-      bool found_end_paren=false;
-      String tlc;
-      tlc.reserve(3);
-      while ((++i)!=seqres.end()) {
-        if (*i==')') {
-          found_end_paren=true;
-          break;
-        }
-        tlc.push_back(*i);
-      }
-      if (!found_end_paren) {
-        throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
-                          "'entity_poly.pdbx_seq_one_letter_code' contains "
-                          "unmatched '('", this->GetCurrentLinenum()));
-      }
-      conop::CompoundPtr compound=comp_lib->FindCompound(tlc, 
-                                                         conop::Compound::PDB);
-      if (!compound) {
-        if (tlc!="UNK") {
 
-          LOG_WARNING("unknown residue '" << tlc << "' in SEQRES record. "
-                      "Setting one-letter-code to 'X'");
-        }
-        can_seqres.push_back('X');
-        continue;
-      }
-      if (compound->GetOneLetterCode()=='?') {
-        can_seqres.push_back('X');
-      } else {
-        can_seqres.push_back(compound->GetOneLetterCode());
-      }
-
-    } else {
-      can_seqres.push_back(*i);
+  // store non canonical seqres
+  if (indices_[PDBX_SEQ_ONE_LETTER_CODE] != -1) {
+    StringRef seqres_pdbx=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE]];
+    edm_it->second.seqres_pdbx = seqres_pdbx.str_no_whitespace();
     }
-  }
-  return can_seqres;
 }
 
 void MMCifReader::ParseCitation(const std::vector<StringRef>& columns)
@@ -1943,11 +1863,13 @@ void MMCifReader::OnEndData()
     if (edm_it != entity_desc_map_.end()) {
       editor.SetChainType(css->first, edm_it->second.type);
       editor.SetChainDescription(css->first, edm_it->second.details);
-      if (edm_it->second.seqres.length() > 0) {
-        seqres_.AddSequence(seq::CreateSequence(css->first.GetName(),
-                                                edm_it->second.seqres));
-        pdb_auth_chain_name = css->first.GetStringProp("pdb_auth_chain_name");
-        info_.AddMMCifPDBChainTr(css->first.GetName(), pdb_auth_chain_name);
+      // Add chain mapping for all chains
+      pdb_auth_chain_name = css->first.GetStringProp("pdb_auth_chain_name");
+      info_.AddMMCifPDBChainTr(css->first.GetName(), pdb_auth_chain_name);
+
+      if (edm_it->second.entity_type=="polymer") {
+        // PDB -> mmCIF chain mapping only for polymers
+        // This is not a 1:1 mapping because of ligands
         info_.AddPDBMMCifChainTr(pdb_auth_chain_name, css->first.GetName());
       } else if (edm_it->second.entity_type=="non-polymer") {
         mol::ChainHandle chain=css->first;
@@ -1959,7 +1881,8 @@ void MMCifReader::OnEndData()
       }
     } else {
       LOG_WARNING("No entity description found for atom_site.label_entity_id '"
-                  << css->second << "'");
+                  << css->second << "'. SEQRES, chain mapping and ligand "
+                  << " annotation will be missing.");
     }
     // find
     blm_it = entity_branch_link_map_.find(css->second);
@@ -2075,19 +1998,19 @@ void MMCifReader::OnEndData()
   }
 
   // conclude EntityDesc (add entity_poly_seq if present) and add to MMCifInfo
-  for(auto entity_it: entity_desc_map_) {
+  for(auto &entity_it: entity_desc_map_) {
     if(entity_poly_seq_map_.find(entity_it.first) != entity_poly_seq_map_.end()) {
       int max_num = 1;
-      for(auto seqres_it: entity_poly_seq_map_[entity_it.first]) {
+      for(auto &seqres_it: entity_poly_seq_map_[entity_it.first]) {
         max_num = std::max(max_num, seqres_it.first);
       }
       entity_it.second.mon_ids.assign(max_num, "?");
-      for(auto seqres_it: entity_poly_seq_map_[entity_it.first]) {
+      for(auto &seqres_it: entity_poly_seq_map_[entity_it.first]) {
         entity_it.second.mon_ids[seqres_it.first-1] = seqres_it.second;
       }
     }
     if(entity_poly_seq_h_map_.find(entity_it.first) != entity_poly_seq_h_map_.end()) {
-      for(auto hetero_it: entity_poly_seq_h_map_[entity_it.first]) {
+      for(auto &hetero_it: entity_poly_seq_h_map_[entity_it.first]) {
         entity_it.second.hetero_num.push_back(hetero_it.first);
         entity_it.second.hetero_ids.push_back(hetero_it.second);
       }
@@ -2163,4 +2086,68 @@ String OSTBondOrderToMMCifValueOrder(const unsigned char bond_order)
   return String("");
 }
 
+// Build one SEQRES sequence per polymer chain from the entity_poly_seq
+// monomer list (mon_ids) of the chain's entity. Residues not found in the
+// compound library, or whose one letter code is '?', are rendered as 'X'.
+seq::SequenceList MMCifReader::GetSeqRes() const {
+  std::map<String, String> entity_seqres_map; // Map entity_id -> seqres
+  seq::SequenceList seqres_list = seq::CreateSequenceList();
+
+  // We need a compound lib for the conversion
+  conop::CompoundLibBasePtr comp_lib=conop::Conopology::Instance()
+                                            .GetDefaultLib();
+  if (!comp_lib) {
+    LOG_WARNING("SEQRES requires a valid compound library to "
+                 "handle non standard compounds. Their One letter "
+                 "codes will be set to X.");
+    comp_lib = conop::CompoundLibBasePtr(new ost::conop::MinimalCompoundLib);
+  }
+
+  // Generate the SEQRES for every polymer entity
+  for(auto const &entity_it: entity_desc_map_) {
+    if (entity_it.second.entity_type != "polymer") {
+      continue;
+    }
+    // Bind by reference: no copy of the monomer list, single map lookup.
+    // mon_ids is empty when entity_poly_seq was missing; the entry then
+    // stays empty and a warning is issued per chain further down.
+    auto const &mon_ids = entity_it.second.mon_ids;
+    String &seqres = entity_seqres_map[entity_it.first];
+    seqres.reserve(mon_ids.size());
+    for (auto const &mon_id: mon_ids) {
+      conop::CompoundPtr compound=comp_lib->FindCompound(mon_id, conop::Compound::PDB);
+      if (!compound) {
+        if (mon_id != "UNK") {
+          LOG_WARNING("unknown residue '" << mon_id << "' in SEQRES record. "
+                      "Setting one-letter-code to 'X'");
+        }
+        seqres.push_back('X');
+        continue;
+      }
+      auto const olc = compound->GetOneLetterCode();
+      seqres.push_back(olc == '?' ? 'X' : olc);
+    }
+  }
+
+  // Assign the entity SEQRES to every chain of that entity
+  for (auto const &css: chain_id_pairs_) {
+    auto entity_seqres_map_it = entity_seqres_map.find(css.second);
+    if (entity_seqres_map_it == entity_seqres_map.end()) {
+      // Either non polymer chain, or no entity_poly was available (this
+      // triggered a warning before)
+      continue;
+    }
+    if (entity_seqres_map_it->second.empty()) {
+      // We hit this if there was an _entity_poly category but no _entity_poly_seq
+      LOG_WARNING("No SEQRES found for chain '"
+                  << css.first << "'. Most likely the entity_poly_seq "
+                  << "category was missing from the input file.");
+    } else {
+      seqres_list.AddSequence(seq::CreateSequence(css.first.GetName(),
+                                                  entity_seqres_map_it->second));
+    }
+  }
+  return seqres_list;
+}
+
 }}
diff --git a/modules/io/src/mol/mmcif_reader.hh b/modules/io/src/mol/mmcif_reader.hh
index 28798a49f..cc61e9ca3 100644
--- a/modules/io/src/mol/mmcif_reader.hh
+++ b/modules/io/src/mol/mmcif_reader.hh
@@ -89,16 +89,6 @@ public:
   /// \param restrict_chains chain name
   void SetRestrictChains(const String& restrict_chains);
 
-  /// \brief Toggle reading of canonical sequence residues
-  ///        (entity_poly.pdbx_seq_one_letter_code_can instead of
-  ///        entity_poly.pdbx_seq_one_letter_code). This flag is exclusive.
-  ///
-  /// \param flag True for reading canonical sequences.
-  void SetReadCanonicalSeqRes(bool flag)
-  {
-    seqres_can_ = flag;
-  }
-
   const String& GetRestrictChains() const
   {
     return restrict_chains_;
@@ -141,25 +131,7 @@ public:
   /// \brief Return sequences
   ///
   /// \return List of sequences
-  seq::SequenceList GetSeqRes() const {
-    return seqres_;
-  }
-
-  /// \brief Toggle reading of SEQRES
-  ///
-  /// \param flag True enables, False disables reading SEQRES
-  void SetReadSeqRes(bool flag)
-  {
-    read_seqres_ = flag;
-  }
-
-  /// \brief Check if reading of SEQRES is enabled
-  ///
-  /// \return True if reading of SEQRES is enabled
-  bool GetReadSeqRes() const
-  {
-    return read_seqres_;
-  }
+  seq::SequenceList GetSeqRes() const;
 
   /// \brief Get additional information of the mmCIF file.
   ///
@@ -232,15 +204,7 @@ protected:
   void ParseCitation(const std::vector<StringRef>& columns);
 
 	const MMCifInfoStructRefs& GetStructRefs() const { return struct_refs_; }
-  /// \brief convert the seqres data item to canonical form. 
-  /// 
-  /// The seqres sequence lists non-standard residues in parenthesis. For 
-  /// proper handling of our sequence classes, these need to be converted to 
-  /// one-letter-codes. Ideally, we would use the canonical SEQRES. This is 
-  /// not possible, however, since the PDB assigns multiple one letter codes 
-  /// to some of the residues. To be consistent, we have to do the conversion
-  /// on our own.
-  String ConvertSEQRES(const String& seqres, conop::CompoundLibBasePtr compound_lib);
+
   /// \brief Fetch mmCIF citation_author information
   ///
   /// \param columns data row
@@ -717,7 +681,6 @@ private:
   mol::EntityHandle& ent_handle_;
   String restrict_chains_;
   bool auth_chain_id_;       ///< use chain IDs given by authors rather than pdb
-  bool seqres_can_;          ///< read canonical 1-letter residues?
   mol::ChainHandle curr_chain_;
   mol::ResidueHandle curr_residue_;
   int chain_count_;
@@ -731,8 +694,6 @@ private:
   std::vector<std::pair<mol::ChainHandle, String> > chain_id_pairs_;
   ///< chain and label_entity_id
   MMCifEntityDescMap entity_desc_map_; ///< stores entity items
-  seq::SequenceList seqres_;
-  bool read_seqres_;
   MMCifInfo info_;      ///< info container
   MMCifCitationAuthorMap authors_map_;
   MMCifBioUAssemblyVector bu_assemblies_;
diff --git a/modules/io/src/mol/mmcif_str.cc b/modules/io/src/mol/mmcif_str.cc
index 6fde265e4..604af0d2d 100644
--- a/modules/io/src/mol/mmcif_str.cc
+++ b/modules/io/src/mol/mmcif_str.cc
@@ -50,7 +50,6 @@ MMCifStringToEntity(const String& mmcif, const IOProfile& profile, bool process)
   std::stringstream stream(mmcif);
   mol::EntityHandle ent = mol::CreateEntity();
   MMCifReader reader(stream, ent, profile);
-  reader.SetReadSeqRes(true);
   reader.Parse();
   if(profile.processor && process) {
     profile.processor->Process(ent);
diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc
index 6d81b4603..4670b815a 100644
--- a/modules/io/tests/test_mmcif_reader.cc
+++ b/modules/io/tests/test_mmcif_reader.cc
@@ -65,10 +65,7 @@ public:
   using MMCifReader::ParsePdbxEntityBranchLink;
   using MMCifReader::TryStoreIdx;
   using MMCifReader::SetRestrictChains;
-  using MMCifReader::SetReadSeqRes;
-  using MMCifReader::SetReadCanonicalSeqRes;
   using MMCifReader::ClearState;
-  using MMCifReader::ConvertSEQRES;
   using MMCifReader::GetInfo;
   using MMCifReader::DetermineSecStructType;
   using MMCifReader::MMCifSecStructElement;
@@ -128,23 +125,6 @@ BOOST_AUTO_TEST_CASE(mmcif_trystoreidx)
   BOOST_CHECK_NO_THROW(tmmcif_p.TryStoreIdx(0, "bar", mmcif_h));
 }
 
-BOOST_AUTO_TEST_CASE(mmcif_convert_seqres)
-{
-  conop::CompoundLibPtr compound_lib = SetDefaultCompoundLib();
-  if (!compound_lib) {
-    std::cout << "WARNING: skipping mmcif_convert_seqres unit test. " 
-              << "Compound library is required" << std::endl;
-    return;
-  }
-
-  mol::EntityHandle eh=mol::CreateEntity();
-  
-  TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
-  BOOST_CHECK_EQUAL(tmmcif_p.ConvertSEQRES("A(MSE)Y", compound_lib), "AMY");
-  BOOST_CHECK_THROW(tmmcif_p.ConvertSEQRES("A(MSEY", compound_lib), 
-                    IOException);
-}
-
 BOOST_AUTO_TEST_CASE(mmcif_onbeginloop)
 {
   mol::EntityHandle eh=mol::CreateEntity();
@@ -404,7 +384,6 @@ BOOST_AUTO_TEST_CASE(mmcif_entity_poly_tests)
   mol::EntityHandle eh = mol::CreateEntity();
   MMCifReader mmcif_p("testfiles/mmcif/atom_site.mmcif", eh, profile);
 
-  mmcif_p.SetReadSeqRes(true);
   mmcif_p.Parse();
 
   seq::SequenceList seqres = mmcif_p.GetSeqRes();
@@ -414,7 +393,6 @@ BOOST_AUTO_TEST_CASE(mmcif_entity_poly_tests)
   BOOST_TEST_MESSAGE("          testing type recognition...");
   {
     TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
-    tmmcif_p.SetReadSeqRes(false);
     std::vector<StringRef> columns;
 
     // create corresponding entity entry
@@ -493,46 +471,7 @@ columns.push_back(StringRef("polydeoxyribonucleotide/polyribonucleotide hybrid",
     columns.push_back(StringRef("1", 1));
     columns.push_back(StringRef("other", 5));
     columns.push_back(StringRef("ABRND", 5));
-    tmmcif_p.SetReadSeqRes(true);
-    tmmcif_p.SetReadCanonicalSeqRes(true);
-    BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException);
-    tmmcif_p.SetReadCanonicalSeqRes(false);
     BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntityPoly(columns));
-    BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException);
-  }
-  BOOST_TEST_MESSAGE("          done.");
-  BOOST_TEST_MESSAGE("          testing pdbx_seq_one_letter_code_can "
-                     "reading...");
-  {
-    TestMMCifReaderProtected tmmcif_p("testfiles/mmcif/atom_site.mmcif", eh);
-    std::vector<StringRef> columns;
-
-    tmmcif_h.Clear();
-    tmmcif_h.SetCategory(StringRef("entity", 6));
-    tmmcif_h.Add(StringRef("id", 2));
-    tmmcif_h.Add(StringRef("type", 4));
-    tmmcif_p.OnBeginLoop(tmmcif_h);
-    columns.push_back(StringRef("1", 1));
-    columns.push_back(StringRef("polymer", 7));
-    tmmcif_p.ParseEntity(columns);
-    columns.pop_back();
-    columns.pop_back();
-
-    tmmcif_h.Clear();
-    tmmcif_h.SetCategory(StringRef("entity_poly", 11));
-    tmmcif_h.Add(StringRef("entity_id", 9));
-    tmmcif_h.Add(StringRef("type", 4));
-    tmmcif_h.Add(StringRef("pdbx_seq_one_letter_code_can", 28));
-    tmmcif_p.OnBeginLoop(tmmcif_h);
-    tmmcif_p.SetReadCanonicalSeqRes(false);
-    columns.push_back(StringRef("1", 1));
-    columns.push_back(StringRef("other", 5));
-    columns.push_back(StringRef("ABRND", 5));
-    tmmcif_p.SetReadSeqRes(true);
-    BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException);
-    tmmcif_p.SetReadCanonicalSeqRes(true);
-    BOOST_CHECK_NO_THROW(tmmcif_p.ParseEntityPoly(columns));
-    BOOST_CHECK_THROW(tmmcif_p.ParseEntityPoly(columns), IOException);
   }
   BOOST_TEST_MESSAGE("          done.");
 
@@ -1323,9 +1262,6 @@ BOOST_AUTO_TEST_CASE(mmcif_test_chain_mappings)
   std::ifstream s("testfiles/mmcif/atom_site.mmcif");
   IOProfile profile;
   MMCifReader mmcif_p(s, eh, profile);
-  if (compound_lib_available) {
-    mmcif_p.SetReadSeqRes(true);
-  }
   BOOST_REQUIRE_NO_THROW(mmcif_p.Parse());
   const MMCifInfo& info = mmcif_p.GetInfo();
   
diff --git a/modules/io/tests/testfiles/mmcif/atom_site.mmcif b/modules/io/tests/testfiles/mmcif/atom_site.mmcif
index 9bd8159d6..e01cf812a 100644
--- a/modules/io/tests/testfiles/mmcif/atom_site.mmcif
+++ b/modules/io/tests/testfiles/mmcif/atom_site.mmcif
@@ -26,6 +26,15 @@ _entity_poly.nstd_linkage                   no
 _entity_poly.nstd_monomer                   no
 _entity_poly.pdbx_seq_one_letter_code       'VTI'
 
+loop_
+_entity_poly_seq.entity_id
+_entity_poly_seq.num
+_entity_poly_seq.mon_id
+_entity_poly_seq.hetero
+1 1  VAL n
+1 2  THR n
+1 3  ILE n
+
 loop_
 _citation.id
 _citation.abstract_id_CAS
diff --git a/modules/seq/alg/tests/testfiles/align_to_seqres.mmcif b/modules/seq/alg/tests/testfiles/align_to_seqres.mmcif
index 177b53a64..9fce63c4b 100644
--- a/modules/seq/alg/tests/testfiles/align_to_seqres.mmcif
+++ b/modules/seq/alg/tests/testfiles/align_to_seqres.mmcif
@@ -20,7 +20,87 @@ _entity_poly.nstd_monomer                   no
 _entity_poly.pdbx_seq_one_letter_code       MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQFTEHTSAIKVRGKAYIQTRHGVIESEGKK 
 _entity_poly.pdbx_seq_one_letter_code_can   MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQFTEHTSAIKVRGKAYIQTRHGVIESEGKK 
 _entity_poly.pdbx_strand_id                 A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V 
-# 
+#
+loop_
+_entity_poly_seq.entity_id
+_entity_poly_seq.num
+_entity_poly_seq.mon_id
+_entity_poly_seq.hetero
+1 1  MET n
+1 2  TYR n
+1 3  THR n
+1 4  ASN n
+1 5  SER n
+1 6  ASP n
+1 7  PHE n
+1 8  VAL n
+1 9  VAL n
+1 10 ILE n
+1 11 LYS n
+1 12 ALA n
+1 13 LEU n
+1 14 GLU n
+1 15 ASP n
+1 16 GLY n
+1 17 VAL n
+1 18 ASN n
+1 19 VAL n
+1 20 ILE n
+1 21 GLY n
+1 22 LEU n
+1 23 THR n
+1 24 ARG n
+1 25 GLY n
+1 26 ALA n
+1 27 ASP n
+1 28 THR n
+1 29 ARG n
+1 30 PHE n
+1 31 HIS n
+1 32 HIS n
+1 33 SER n
+1 34 GLU n
+1 35 LYS n
+1 36 LEU n
+1 37 ASP n
+1 38 LYS n
+1 39 GLY n
+1 40 GLU n
+1 41 VAL n
+1 42 LEU n
+1 43 ILE n
+1 44 ALA n
+1 45 GLN n
+1 46 PHE n
+1 47 THR n
+1 48 GLU n
+1 49 HIS n
+1 50 THR n
+1 51 SER n
+1 52 ALA n
+1 53 ILE n
+1 54 LYS n
+1 55 VAL n
+1 56 ARG n
+1 57 GLY n
+1 58 LYS n
+1 59 ALA n
+1 60 TYR n
+1 61 ILE n
+1 62 GLN n
+1 63 THR n
+1 64 ARG n
+1 65 HIS n
+1 66 GLY n
+1 67 VAL n
+1 68 ILE n
+1 69 GLU n
+1 70 SER n
+1 71 GLU n
+1 72 GLY n
+1 73 LYS n
+1 74 LYS n
+#
 loop_
 _atom_site.group_PDB 
 _atom_site.id 
diff --git a/modules/seq/alg/tests/testfiles/validate_segres_aln_breakage.mmcif b/modules/seq/alg/tests/testfiles/validate_segres_aln_breakage.mmcif
index 0d66b8fa2..57d45b9df 100644
--- a/modules/seq/alg/tests/testfiles/validate_segres_aln_breakage.mmcif
+++ b/modules/seq/alg/tests/testfiles/validate_segres_aln_breakage.mmcif
@@ -19,7 +19,86 @@ _entity_poly.nstd_linkage                   no
 _entity_poly.nstd_monomer                   no 
 _entity_poly.pdbx_seq_one_letter_code       MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQFTEHTSAIKVRGKAYIQTRHGVIESEKK 
 _entity_poly.pdbx_seq_one_letter_code_can   MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQFTEHTSAIKVRGKAYIQTRHGVIESEKK 
-_entity_poly.pdbx_strand_id                 A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V 
+_entity_poly.pdbx_strand_id                 A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V
+#
+loop_
+_entity_poly_seq.entity_id
+_entity_poly_seq.num
+_entity_poly_seq.mon_id
+_entity_poly_seq.hetero
+1 1  MET n
+1 2  TYR n
+1 3  THR n
+1 4  ASN n
+1 5  SER n
+1 6  ASP n
+1 7  PHE n
+1 8  VAL n
+1 9  VAL n
+1 10 ILE n
+1 11 LYS n
+1 12 ALA n
+1 13 LEU n
+1 14 GLU n
+1 15 ASP n
+1 16 GLY n
+1 17 VAL n
+1 18 ASN n
+1 19 VAL n
+1 20 ILE n
+1 21 GLY n
+1 22 LEU n
+1 23 THR n
+1 24 ARG n
+1 25 GLY n
+1 26 ALA n
+1 27 ASP n
+1 28 THR n
+1 29 ARG n
+1 30 PHE n
+1 31 HIS n
+1 32 HIS n
+1 33 SER n
+1 34 GLU n
+1 35 LYS n
+1 36 LEU n
+1 37 ASP n
+1 38 LYS n
+1 39 GLY n
+1 40 GLU n
+1 41 VAL n
+1 42 LEU n
+1 43 ILE n
+1 44 ALA n
+1 45 GLN n
+1 46 PHE n
+1 47 THR n
+1 48 GLU n
+1 49 HIS n
+1 50 THR n
+1 51 SER n
+1 52 ALA n
+1 53 ILE n
+1 54 LYS n
+1 55 VAL n
+1 56 ARG n
+1 57 GLY n
+1 58 LYS n
+1 59 ALA n
+1 60 TYR n
+1 61 ILE n
+1 62 GLN n
+1 63 THR n
+1 64 ARG n
+1 65 HIS n
+1 66 GLY n
+1 67 VAL n
+1 68 ILE n
+1 69 GLU n
+1 70 SER n
+1 71 GLU n
+1 72 LYS n
+1 73 LYS n
 # 
 loop_
 _atom_site.group_PDB 
diff --git a/modules/seq/alg/tests/testfiles/validate_seqres_aln_connected.mmcif b/modules/seq/alg/tests/testfiles/validate_seqres_aln_connected.mmcif
index 2e76cf323..fb3a0807f 100644
--- a/modules/seq/alg/tests/testfiles/validate_seqres_aln_connected.mmcif
+++ b/modules/seq/alg/tests/testfiles/validate_seqres_aln_connected.mmcif
@@ -19,7 +19,88 @@ _entity_poly.nstd_linkage                   no
 _entity_poly.nstd_monomer                   no 
 _entity_poly.pdbx_seq_one_letter_code       MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQFTEHTSAIKVRGKAYIQTRHGVIESEGGKK 
 _entity_poly.pdbx_seq_one_letter_code_can   MYTNSDFVVIKALEDGVNVIGLTRGADTRFHHSEKLDKGEVLIAQFTEHTSAIKVRGKAYIQTRHGVIESEGGKK 
-_entity_poly.pdbx_strand_id                 A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V 
+_entity_poly.pdbx_strand_id                 A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V
+#
+loop_
+_entity_poly_seq.entity_id
+_entity_poly_seq.num
+_entity_poly_seq.mon_id
+_entity_poly_seq.hetero
+1 1  MET n
+1 2  TYR n
+1 3  THR n
+1 4  ASN n
+1 5  SER n
+1 6  ASP n
+1 7  PHE n
+1 8  VAL n
+1 9  VAL n
+1 10 ILE n
+1 11 LYS n
+1 12 ALA n
+1 13 LEU n
+1 14 GLU n
+1 15 ASP n
+1 16 GLY n
+1 17 VAL n
+1 18 ASN n
+1 19 VAL n
+1 20 ILE n
+1 21 GLY n
+1 22 LEU n
+1 23 THR n
+1 24 ARG n
+1 25 GLY n
+1 26 ALA n
+1 27 ASP n
+1 28 THR n
+1 29 ARG n
+1 30 PHE n
+1 31 HIS n
+1 32 HIS n
+1 33 SER n
+1 34 GLU n
+1 35 LYS n
+1 36 LEU n
+1 37 ASP n
+1 38 LYS n
+1 39 GLY n
+1 40 GLU n
+1 41 VAL n
+1 42 LEU n
+1 43 ILE n
+1 44 ALA n
+1 45 GLN n
+1 46 PHE n
+1 47 THR n
+1 48 GLU n
+1 49 HIS n
+1 50 THR n
+1 51 SER n
+1 52 ALA n
+1 53 ILE n
+1 54 LYS n
+1 55 VAL n
+1 56 ARG n
+1 57 GLY n
+1 58 LYS n
+1 59 ALA n
+1 60 TYR n
+1 61 ILE n
+1 62 GLN n
+1 63 THR n
+1 64 ARG n
+1 65 HIS n
+1 66 GLY n
+1 67 VAL n
+1 68 ILE n
+1 69 GLU n
+1 70 SER n
+1 71 GLU n
+1 72 GLY n
+1 73 GLY n
+1 74 LYS n
+1 75 LYS n
 # 
 loop_
 _atom_site.group_PDB 
-- 
GitLab