From b6b442044c9f17246fdbb0ae5fd4037b8ecf43fc Mon Sep 17 00:00:00 2001 From: Xavier Robin <xavier.robin@unibas.ch> Date: Fri, 21 Apr 2023 15:22:28 +0200 Subject: [PATCH] feat: SCHWED-3293 read mapping data into string properties --- modules/io/doc/mmcif.rst | 25 +++++++++++++++++++++++-- modules/io/pymod/__init__.py | 8 ++++---- modules/io/src/mol/mmcif_reader.cc | 9 ++++++++- modules/io/tests/test_mmcif_reader.cc | 11 +++++++++++ 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/modules/io/doc/mmcif.rst b/modules/io/doc/mmcif.rst index c4c361bda..79281e318 100644 --- a/modules/io/doc/mmcif.rst +++ b/modules/io/doc/mmcif.rst @@ -67,8 +67,29 @@ Notes: :meth:`~MMCifInfo.GetPDBMMCifChainTr` if SEQRES records are read in :func:`~ost.io.LoadMMCIF` and a non-empty SEQRES record exists for that chain (this should exclude ligands and water). -* Molecular entities in mmCIF are identified by an ``entity.id``. Each chain is - mapped to an ID in :class:`MMCifInfo` as :meth:`~MMCifInfo.GetMMCifEntityIdTr`. +* Molecular entities in mmCIF are identified by an ``entity.id``, which is + extracted from ``atom_site.label_entity_id`` for the first atom of the chain. + It is added as a string property named "entity_id" to the + :class:`~ost.mol.ChainHandle`. Each chain is mapped to an ID in + :class:`MMCifInfo` as :meth:`~MMCifInfo.GetMMCifEntityIdTr`. +* For more complex mappings, such as ligands which may be in the same "old" chain + as the protein chain but are represented in a separate "new" chain in mmCIF, + we also store string properties on a per-residue level. + For mmCIF files from the PDB, there is a unique mapping between + ("label_asym_id", "label_seq_id") and ("auth_asym_id", "auth_seq_id", + "pdbx_PDB_ins_code"). + The following data items are available: + * ``atom_site.label_asym_id``: ``residue.chain.name`` + * ``atom_site.label_seq_id``: ``residue.GetStringProp("resnum")`` + (this is the same as ``residue.number`` for residues in polymer chains. 
+ However, for ligands ``residue.number`` is unset in mmCIF, but it + is set to 1 by OpenStructure.) + * ``atom_site.label_entity_id``: ``residue.GetStringProp("entity_id")`` + * ``atom_site.auth_asym_id``: ``residue.GetStringProp("pdb_auth_chain_name")`` + * ``atom_site.auth_seq_id``: ``residue.GetStringProp("pdb_auth_resnum")`` + * ``atom_site.pdbx_PDB_ins_code``: ``residue.GetStringProp("pdb_auth_ins_code")`` +* Missing values in the aforementioned data items will be denoted as ``.`` or + ``?``. Info Classes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/modules/io/pymod/__init__.py b/modules/io/pymod/__init__.py index 1e9eb0b07..4be3ea2ca 100644 --- a/modules/io/pymod/__init__.py +++ b/modules/io/pymod/__init__.py @@ -356,10 +356,10 @@ def LoadMMCIF(filename, fault_tolerant=None, calpha_only=None, customize the exact behaviour of the mmCIF import. For more information on these options, see :doc:`profile`. - Residues are flagged as ligand if they are not covered by an ``entity_poly`` - record (ie. they are non-polymer entities in ``pdbx_entity_nonpoly``). Note - that all residues will be flagged as ligands if ``seqres=False`` (the - default). + Residues are flagged as ligand if they are neither waters nor covered by an + ``entity_poly`` record (i.e. they are non-polymer entities in + ``pdbx_entity_nonpoly``). Note that all residues except waters will be + flagged as ligands if ``seqres=False`` (the default). 
:param filename: File to be loaded :type filename: :class:`str` diff --git a/modules/io/src/mol/mmcif_reader.cc b/modules/io/src/mol/mmcif_reader.cc index 527f68fbe..07069ddc3 100644 --- a/modules/io/src/mol/mmcif_reader.cc +++ b/modules/io/src/mol/mmcif_reader.cc @@ -559,6 +559,7 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns) ++chain_count_; // store entity id String ent_id = columns[indices_[LABEL_ENTITY_ID]].str(); + curr_chain_.SetStringProp("entity_id", ent_id); chain_id_pairs_.push_back(std::pair<mol::ChainHandle,String>(curr_chain_, ent_id)); info_.AddMMCifEntityIdTr(cif_chain_name, ent_id); @@ -580,14 +581,20 @@ void MMCifReader::ParseAndAddAtom(const std::vector<StringRef>& columns) curr_residue_=curr_chain_.FindResidue(res_num); } if (!curr_residue_.IsValid()) { // unit test - LOG_DEBUG("new residue " << res_name << " " << res_num); + LOG_TRACE("new residue " << res_name << " " << res_num); if (valid_res_num) { curr_residue_ = editor.AppendResidue(curr_chain_, res_name.str(), res_num); + } else { curr_residue_ = editor.AppendResidue(curr_chain_, res_name.str()); } + curr_residue_.SetStringProp("pdb_auth_chain_name", auth_chain_name); + curr_residue_.SetStringProp("pdb_auth_resnum", columns[indices_[AUTH_SEQ_ID]].str()); + curr_residue_.SetStringProp("pdb_auth_ins_code", columns[indices_[PDBX_PDB_INS_CODE]].str()); + curr_residue_.SetStringProp("entity_id", columns[indices_[LABEL_ENTITY_ID]].str()); + curr_residue_.SetStringProp("resnum", columns[indices_[LABEL_SEQ_ID]].str()); warned_name_mismatch_=false; ++residue_count_; } diff --git a/modules/io/tests/test_mmcif_reader.cc b/modules/io/tests/test_mmcif_reader.cc index 57fb3d3e0..bee89e0e2 100644 --- a/modules/io/tests/test_mmcif_reader.cc +++ b/modules/io/tests/test_mmcif_reader.cc @@ -1225,6 +1225,17 @@ BOOST_AUTO_TEST_CASE(mmcif_testreader) BOOST_CHECK(ch.IsValid()); BOOST_TEST_MESSAGE(" done."); + BOOST_TEST_MESSAGE(" testing chain/residue mapping properties..."); + 
BOOST_CHECK_EQUAL(ch.GetStringProp("pdb_auth_chain_name"), "A"); + BOOST_CHECK_EQUAL(ch.GetStringProp("entity_id"), "1"); + mol::ResidueHandle res = ch.FindResidue(12); + BOOST_CHECK(res.IsValid()); + BOOST_CHECK_EQUAL(res.GetStringProp("pdb_auth_chain_name"), "A"); + BOOST_CHECK_EQUAL(res.GetStringProp("pdb_auth_resnum"), "12"); + BOOST_CHECK_EQUAL(res.GetStringProp("pdb_auth_ins_code"), "?"); + BOOST_CHECK_EQUAL(res.GetStringProp("entity_id"), "1"); + BOOST_TEST_MESSAGE(" done."); + BOOST_TEST_MESSAGE(" testing numbering water..."); ch = eh.FindChain("O"); BOOST_CHECK(ch.IsValid()); -- GitLab