Skip to content
Snippets Groups Projects
mmcif_reader.cc 28.51 KiB
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2011 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
//------------------------------------------------------------------------------

#include <ctype.h>

#include <ost/profile.hh>
#include <ost/log.hh>
#include <ost/dyn_cast.hh>
#include <ost/mol/xcs_editor.hh>
#include <ost/conop/conop.hh>

#include <ost/conop/rule_based_builder.hh>
#include <ost/io/mol/mmcif_reader.hh>

namespace ost { namespace io {

MMCifParser::MMCifParser(std::istream& stream, mol::EntityHandle& ent_handle,
                         const IOProfile& profile):
  StarParser(stream, true), profile_(profile), ent_handle_(ent_handle)
{
  this->Init();
}

MMCifParser::MMCifParser(const String& filename, mol::EntityHandle& ent_handle,
                         const IOProfile& profile):
  StarParser(filename, true), profile_(profile), ent_handle_(ent_handle)
{
  this->Init();
}

void MMCifParser::Init()
{
  warned_name_mismatch_ = false;
  category_             = DONT_KNOW;
  memset(category_counts_, 0, DONT_KNOW * sizeof(int));
  chain_count_          = 0;
  atom_count_           = 0;
  residue_count_        = 0;
  auth_chain_id_        = false;
  seqres_can_           = false;
  has_model_            = false;
  restrict_chains_      = "";
  subst_res_id_         = "";
  curr_chain_           = mol::ChainHandle();
  curr_residue_         = mol::ResidueHandle();
  seqres_               = seq::CreateSequenceList();
  read_seqres_          = false;
  warned_rule_based_    = false;
  info_                 = MMCifInfo();
}

void MMCifParser::ClearState()
{
  curr_chain_           = mol::ChainHandle();
  curr_residue_         = mol::ResidueHandle();
  chain_count_          = 0;
  residue_count_        = 0;
  atom_count_           = 0;
  category_             = DONT_KNOW;
  warned_name_mismatch_ = false;
  seqres_               = seq::CreateSequenceList();
  info_                 = MMCifInfo();
  entity_desc_map_.clear();
  authors_map_.clear();
}

void MMCifParser::SetRestrictChains(const String& restrict_chains)
{
  restrict_chains_ = restrict_chains;
}

bool MMCifParser::IsValidPDBIdent(const StringRef& pdbid)
{
  if (pdbid.length() == PDBID_LEN && isdigit(pdbid[0])) {
    return true;
  }
  return false;
}

bool MMCifParser::OnBeginData(const StringRef& data_name) 
{
  LOG_DEBUG("MCIFFReader: " << profile_);
  Profile profile_import("MMCifReader::OnBeginData");

  // check for PDB id
  if (!this->IsValidPDBIdent(data_name)) {
    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                         "No valid PDB id found for data block, read instead \'"
                                             + data_name.str() + "\'",
                                             this->GetCurrentLinenum()));
  }

  this->ClearState();

  return true;
}

bool MMCifParser::OnBeginLoop(const StarLoopDesc& header)
{
  bool cat_available = false;
  category_ = DONT_KNOW;
  // set whole array to -1
  memset(indices_, -1, MAX_ITEMS_IN_ROW * sizeof(int));
  // walk through possible categories
  if (header.GetCategory() == "atom_site") {
    category_ = ATOM_SITE;
    // mandatory items
    this->TryStoreIdx(AUTH_ASYM_ID,    "auth_asym_id",    header);
    this->TryStoreIdx(AS_ID,           "id",              header);
    this->TryStoreIdx(LABEL_ALT_ID,    "label_alt_id",    header);
    this->TryStoreIdx(LABEL_ASYM_ID,   "label_asym_id",   header);
    this->TryStoreIdx(LABEL_ATOM_ID,   "label_atom_id",   header);
    this->TryStoreIdx(LABEL_COMP_ID,   "label_comp_id",   header);
    this->TryStoreIdx(LABEL_ENTITY_ID, "label_entity_id", header);
    this->TryStoreIdx(LABEL_SEQ_ID,    "label_seq_id",    header);
    this->TryStoreIdx(TYPE_SYMBOL,     "type_symbol",     header);
    // assumed mandatory
    this->TryStoreIdx(CARTN_X, "Cartn_x", header);
    this->TryStoreIdx(CARTN_Y, "Cartn_y", header);
    this->TryStoreIdx(CARTN_Z, "Cartn_z", header);
    // optional
    indices_[OCCUPANCY]          = header.GetIndex("occupancy");
    indices_[B_ISO_OR_EQUIV]     = header.GetIndex("B_iso_or_equiv");
    indices_[GROUP_PDB]          = header.GetIndex("group_PDB");
    indices_[AUTH_SEQ_ID]        = header.GetIndex("auth_seq_id");
    indices_[PDBX_PDB_INS_CODE]  = header.GetIndex("pdbx_PDB_ins_code");
    indices_[PDBX_PDB_MODEL_NUM] = header.GetIndex("pdbx_PDB_model_num");

    // post processing
    if (category_counts_[category_] > 0) {
      if ((has_model_ && (indices_[PDBX_PDB_MODEL_NUM] == -1))||
          (!has_model_ && (indices_[PDBX_PDB_MODEL_NUM] != -1))) { // unit test
        throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                              "Not all atom_site entries carry a model number.",
                                                 this->GetCurrentLinenum()));
      }
    }
    cat_available = true;
  } else if (header.GetCategory() == "entity") {
    category_ = ENTITY;
    // mandatory items
    this->TryStoreIdx(E_ID, "id", header);
    // optional
    indices_[E_TYPE]  = header.GetIndex("type");
    indices_[PDBX_DESCRIPTION] = header.GetIndex("pdbx_description");
    cat_available = true;
  } else if (header.GetCategory() == "entity_poly") {
    category_ = ENTITY_POLY;
    // mandatory
    this->TryStoreIdx(ENTITY_ID, "entity_id", header);
    // optional
    indices_[EP_TYPE]  = header.GetIndex("type");
    indices_[PDBX_SEQ_ONE_LETTER_CODE] =
      header.GetIndex("pdbx_seq_one_letter_code");
    indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] =
      header.GetIndex("pdbx_seq_one_letter_code_can");
    cat_available = true;
  } else if (header.GetCategory() == "citation") {
    category_ = CITATION;
    // mandatory items
    this->TryStoreIdx(CITATION_ID, "id", header);
    // optional
    indices_[ABSTRACT_ID_CAS]         = header.GetIndex("abstract_id_CAS");
    indices_[BOOK_ID_ISBN]            = header.GetIndex("book_id_ISBN");
    indices_[BOOK_TITLE]              = header.GetIndex("book_title");
    indices_[JOURNAL_FULL]            = header.GetIndex("journal_full");
    indices_[YEAR]                    = header.GetIndex("year");
    indices_[TITLE]                   = header.GetIndex("title");
    indices_[JOURNAL_VOLUME]          = header.GetIndex("journal_volume");
    indices_[PAGE_FIRST]              = header.GetIndex("page_first");
    indices_[PAGE_LAST]               = header.GetIndex("page_last");
    indices_[PDBX_DATABASE_ID_DOI]    = header.GetIndex("pdbx_database_id_DOI");
    indices_[PDBX_DATABASE_ID_PUBMED] =
      header.GetIndex("pdbx_database_id_PubMed");
    cat_available = true;
  } else if (header.GetCategory()=="citation_author") {
    category_ = CITATION_AUTHOR;
    // mandatory items
    this->TryStoreIdx(AUTHOR_CITATION_ID, "citation_id", header);
    this->TryStoreIdx(AUTHOR_NAME,        "name", header);
    this->TryStoreIdx(ORDINAL,            "ordinal", header);
    cat_available = true;
  } else if (header.GetCategory() == "exptl") {
    category_ = EXPTL;
    // mandatory items
    this->TryStoreIdx(CITATION_ID, "entry_id", header);
    this->TryStoreIdx(METHOD,      "method", header);
    cat_available = true;
  } else if (header.GetCategory() == "refine") {
    category_ = REFINE;
    // mandatory items
    this->TryStoreIdx(REFINE_ENTRY_ID, "entry_id", header);
    this->TryStoreIdx(LS_D_RES_HIGH,   "ls_d_res_high", header);
    this->TryStoreIdx(LS_D_RES_LOW,    "ls_d_res_low", header);
    cat_available = true;
  }
  category_counts_[category_]++;
  return cat_available;
}

mol::ResNum to_res_num(int num, char ins_code)
{
  return mol::ResNum(num, ins_code==' ' ? '\0' : ins_code);
}

bool MMCifParser::ParseAtomIdent(const std::vector<StringRef>& columns,
                                 String& chain_name,
                                 StringRef& res_name,
                                 mol::ResNum& resnum,
                                 bool& valid_res_num,
                                 StringRef& atom_name,
                                 char& alt_loc)
{
  // ATOM name
  atom_name = columns[indices_[LABEL_ATOM_ID]];
  if (profile_.calpha_only) { // unit test profile
    if (atom_name != StringRef("CA", 2)) {
      return false;
    }
  }
  // CHAIN ID
  if (auth_chain_id_) { // unit test different IDs
    chain_name = columns[indices_[AUTH_ASYM_ID]].str();
  } else {
    chain_name = columns[indices_[LABEL_ASYM_ID]].str();
  }
  if (restrict_chains_.size() > 0 && // unit test
      restrict_chains_.find(chain_name) == String::npos) { // unit test
    return false;
  } 

  std::pair<bool, int> a_num = this->TryGetInt(columns[indices_[AS_ID]],
                                               "atom_site.id",
                                          profile_.fault_tolerant); // unit test

  alt_loc = columns[indices_[LABEL_ALT_ID]][0];
  res_name = columns[indices_[LABEL_COMP_ID]];
  std::pair<bool, int> res_num;
  if (columns[indices_[LABEL_SEQ_ID]][0] != '.') {
    res_num =this->TryGetInt(columns[indices_[LABEL_SEQ_ID]],
                             "atom_site.label_seq_id",
                             profile_.fault_tolerant); // unit test
    if (!res_num.first) { // unit test
      if (profile_.fault_tolerant) {
        return false;
      }
    }
    valid_res_num = true;
  } else {
    valid_res_num = false;
    return true;
  }

  resnum = to_res_num(res_num.second, ' ');

  return true;
}

void MMCifParser::ParseAndAddAtom(const std::vector<StringRef>& columns)
{
  mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT); // potbl
  char alt_loc=0;
  String chain_name;
  StringRef res_name, atom_name;
  mol::ResNum res_num(0);
  bool valid_res_num = false;
  if (indices_[PDBX_PDB_MODEL_NUM] != -1) {
    if (has_model_) {
      if (curr_model_ != TryGetInt(columns[indices_[PDBX_PDB_MODEL_NUM]],
                                   "atom_site.pdbx_PDB_model_num")) {
        return;
      }
    } else {
      has_model_ = true;
      curr_model_ = TryGetInt(columns[indices_[PDBX_PDB_MODEL_NUM]],
      "atom_site.pdbx_PDB_model_num");
    }
  }

  if (!this->ParseAtomIdent(columns,
                            chain_name,
                            res_name,
                            res_num,
                            valid_res_num,
                            atom_name,
                            alt_loc)) {// unit test
    return;                            
  }
  Real occ = 1.00f, temp = 0;
  geom::Vec3 apos;
  
  for (int i = CARTN_X; i <= CARTN_Z; ++i) {
    std::pair<bool, float> result = this->TryGetFloat(columns[indices_[i]],
                                                      "atom_site.cartn_[xyz]",
                                                      profile_.fault_tolerant);
    if (!result.first) { // unit test
      if (profile_.fault_tolerant) { // unit test
        return;
      }
    }
    apos[i - CARTN_X] = result.second;
  }

  if (indices_[OCCUPANCY] != -1) { // unit test
    occ = this->TryGetReal(columns[indices_[OCCUPANCY]], "atom_site.occupancy");
  }
  if (indices_[B_ISO_OR_EQUIV] != -1) { // unit test
    temp = this->TryGetReal(columns[indices_[B_ISO_OR_EQUIV]],
                            "atom_site.B_iso_or_equiv");
  }

  // determine element
  String s_ele(columns[indices_[TYPE_SYMBOL]].str());

  String aname(atom_name.str());  
  // some postprocessing
  LOG_TRACE( "s_chain: [" << chain_name << "]" );

  // determine chain and residue update
  bool update_chain = false;
  bool update_residue = false;
  if(!curr_chain_) { // unit test
      update_chain=true;
      update_residue=true;
  } else if(curr_chain_.GetName() != chain_name) { // unit test
    update_chain=true;
    update_residue=true;
  }

  if(!curr_residue_) { // unit test
    update_residue=true;
  } else if (!valid_res_num) {
    if (indices_[AUTH_SEQ_ID] != -1 &&
        indices_[PDBX_PDB_INS_CODE] != -1) {
      if (subst_res_id_ !=
          chain_name +
          columns[indices_[AUTH_SEQ_ID]].str() +
          columns[indices_[PDBX_PDB_INS_CODE]].str()) {
        update_residue=true;

        subst_res_id_ = chain_name +
                        columns[indices_[AUTH_SEQ_ID]].str() +
                        columns[indices_[PDBX_PDB_INS_CODE]].str();
      }
    } else {
      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                                           "Missing residue number information",
                                               this->GetCurrentLinenum()));
    }
  } else if(curr_residue_.GetNumber() != res_num) { // unit test
    update_residue=true;
  }

  if(update_chain) { // unit test
    curr_chain_ = ent_handle_.FindChain(chain_name);
    if(!curr_chain_.IsValid()) { // unit test
      LOG_DEBUG("new chain " << chain_name);
      curr_chain_=editor.InsertChain(chain_name);
      ++chain_count_;
      // store entity id
      chain_id_pairs_.push_back(std::pair<mol::ChainHandle,String>(curr_chain_,
                                     columns[indices_[LABEL_ENTITY_ID]].str()));
    }
    assert(curr_chain_.IsValid());
  } else if (chain_id_pairs_.back().second != // unit test
             columns[indices_[LABEL_ENTITY_ID]].str()) {
    // check that label_entity_id stays the same
    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
        "Change of 'atom_site.label_entity_id' item for chain " +
        curr_chain_.GetName() + "! Expected: " + chain_id_pairs_.back().second +
        ", found: " + columns[indices_[LABEL_ENTITY_ID]].str() + ".",
                                             this->GetCurrentLinenum()));
  }

  if(update_residue) { // unit test
    curr_residue_=mol::ResidueHandle();
    if (valid_res_num && profile_.join_spread_atom_records) { // unit test
      curr_residue_=curr_chain_.FindResidue(res_num);
    }
    if (!curr_residue_.IsValid()) { // unit test
      LOG_DEBUG("new residue " << res_name << " " << res_num);
      if (valid_res_num) {
        curr_residue_ = editor.AppendResidue(curr_chain_,
                                             res_name.str(),
                                             res_num);
      } else {
        curr_residue_ = editor.AppendResidue(curr_chain_, res_name.str());
      }
      warned_name_mismatch_=false;
      ++residue_count_; 
    }
    assert(curr_residue_.IsValid());
  }

  // finally add atom
  LOG_DEBUG("adding atom " << aname << " (" << s_ele << ") @" << apos);
  mol::AtomHandle ah;
  if (curr_residue_.GetName()!=res_name.str()) { // unit test
    if (!profile_.fault_tolerant && alt_loc=='.') { // unit test
      std::stringstream ss;
      ss << "Residue with number " << res_num << " has more than one name.";
      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                                               ss.str(),
                                               this->GetCurrentLinenum()));
    }
    if (!warned_name_mismatch_) { // unit test
      if (alt_loc=='.') { // unit test
        LOG_WARNING("Residue with number " << res_num << " has more than one "
                    "name. Ignoring atoms for everything but the first");
      } else {
        LOG_WARNING("Residue with number " << res_num 
                    << " contains a microheterogeneity. Everything but atoms "
                    "for the residue '" << curr_residue_.GetName() 
                    << "' will be ignored");
      }
    }
    warned_name_mismatch_=true;
    return;
  }
  if (alt_loc!='.') { // unit test
    // Check if there is already a atom with the same name.
    mol::AtomHandle me=curr_residue_.FindAtom(aname);
    if (me.IsValid()) { // unit test
      try {
        editor.AddAltAtomPos(String(1, alt_loc), me, apos);
      } catch (Error) {
        LOG_INFO("Ignoring atom alt location since there is already an atom "
                 "with name " << aname << ", but without an alt loc");
        return;
      }
      return;
    } else {
      ah = editor.InsertAltAtom(curr_residue_, aname,
                                String(1, alt_loc), apos, s_ele);
      ++atom_count_;
      }
  } else {
    mol::AtomHandle atom=curr_residue_.FindAtom(aname);
    if (atom.IsValid() && !profile_.quack_mode) { // unit test
      if (profile_.fault_tolerant) { // unit test
        LOG_WARNING("duplicate atom '" << aname << "' in residue " 
                    << curr_residue_);
        return;
      }
      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                                               "Duplicate atom '"+aname+
                                               "' in residue "+
                                               curr_residue_.GetQualifiedName(),
                                               this->GetCurrentLinenum()));
    }
    ah = editor.InsertAtom(curr_residue_, aname, apos, s_ele);
    ++atom_count_;
  }
  ah.SetBFactor(temp);

  ah.SetOccupancy(occ);

  // record type
  ah.SetHetAtom(indices_[GROUP_PDB] == -1 ? false :  
                columns[indices_[GROUP_PDB]][0]=='H');

}

void MMCifParser::ParseEntity(const std::vector<StringRef>& columns)
{
  bool store = false; // is it worth storing this record?
  MMCifEntityDesc desc;

  // type
  if (indices_[E_TYPE] != -1) {
    desc.type = mol::ChainTypeFromString(columns[indices_[E_TYPE]]);
    store = true;
  }
  // description
  if (indices_[PDBX_DESCRIPTION] != -1) {
    desc.details = columns[indices_[PDBX_DESCRIPTION]].str();
  } else {
    desc.details = "";
  }

  if (store) {
    desc.seqres = "";
    entity_desc_map_.insert(
                   MMCifEntityDescMap::value_type(columns[indices_[E_ID]].str(),
                                                  desc)
                            );
  }
}

void MMCifParser::ParseEntityPoly(const std::vector<StringRef>& columns)
{
  // we assume that the entity cat. ALWAYS comes before the entity_poly cat.
  // search entity
  MMCifEntityDescMap::iterator edm_it =
    entity_desc_map_.find(columns[indices_[ENTITY_ID]].str());

  if (edm_it == entity_desc_map_.end()) {
    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                     "'entity_poly' category defined before 'entity' for id '" +
                                            columns[indices_[ENTITY_ID]].str() +
                                             "' or missing.",
                                             this->GetCurrentLinenum()));
  }

  // store type
  if (indices_[EP_TYPE] != -1) {
    edm_it->second.type = mol::ChainTypeFromString(columns[indices_[EP_TYPE]]);
  }

  // store seqres
  if (edm_it->second.seqres.length() > 0) {
    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
     "entity_poly.pdbx_seq_one_letter_code[_can] clash: sequence for entry '" +
                                            columns[indices_[ENTITY_ID]].str() +
                                             "' is already set to '" +
                                             edm_it->second.seqres + "'.",
                                             this->GetCurrentLinenum()));
  }
  if (read_seqres_) {
    StringRef seqres;
    if (seqres_can_) {
      if (indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN] != -1) {
        seqres=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE_CAN]];
        edm_it->second.seqres = seqres.str_no_whitespace();        
      } else {
        throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                   "'entity_poly.pdbx_seq_one_letter_code_can' not available.'",
                                                 this->GetCurrentLinenum()));
      }
    } else if (indices_[PDBX_SEQ_ONE_LETTER_CODE] != -1) {
      seqres=columns[indices_[PDBX_SEQ_ONE_LETTER_CODE]];
      conop::BuilderP builder=conop::Conopology::Instance().GetBuilder("DEFAULT");
      conop::RuleBasedBuilderPtr rbb=dyn_cast<conop::RuleBasedBuilder>(builder);
      if (!rbb) {
        if (!warned_rule_based_) {
          LOG_WARNING("SEQRES import requires the rule-based builder. Ignoring "
                      "SEQRES records");      
        }
        warned_rule_based_=true;
        return;
      }
      conop::CompoundLibPtr comp_lib=rbb->GetCompoundLib();
      edm_it->second.seqres = this->ConvertSEQRES(seqres.str_no_whitespace(),
                                                  comp_lib);
    } else {
      throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                       "'entity_poly.pdbx_seq_one_letter_code' not available.'",
                                               this->GetCurrentLinenum()));
    }
  }
}

String MMCifParser::ConvertSEQRES(const String& seqres, 
                                  conop::CompoundLibPtr comp_lib)
{
  String can_seqres;
  for (String::const_iterator i=seqres.begin(), e=seqres.end(); i!=e; ++i) {
    if (*i=='(') {
      bool found_end_paren=false;
      String tlc;
      tlc.reserve(3);
      while ((++i)!=seqres.end()) {
        if (*i==')') {
          found_end_paren=true;
          break;
        }
        tlc.push_back(*i);
      }
      if (!found_end_paren) {
        throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                          "'entity_poly.pdbx_seq_one_letter_code' contains "
                          "unmatched '('", this->GetCurrentLinenum()));
      }
      conop::CompoundPtr compound=comp_lib->FindCompound(tlc, 
                                                         conop::Compound::PDB);
      if (!compound) {
        if (tlc!="UNK") {

          LOG_WARNING("unknown residue '" << tlc << "' in SEQRES record. "
                      "Setting one-letter-code to '?'");
        }
        can_seqres.push_back('?');
        continue;
      }
      can_seqres.push_back(compound->GetOneLetterCode());
    } else {
      can_seqres.push_back(*i);
    }
  }
  return can_seqres;
}

void MMCifParser::ParseCitation(const std::vector<StringRef>& columns)
{
  // create citation object
  MMCifInfoCitation cit = MMCifInfoCitation();
  // just add info
  cit.SetID(columns[indices_[CITATION_ID]].str());
  if (indices_[ABSTRACT_ID_CAS] != -1) {
    cit.SetCAS(columns[indices_[ABSTRACT_ID_CAS]].str());
  }
  if (indices_[BOOK_ID_ISBN] != -1) {
    cit.SetISBN(columns[indices_[BOOK_ID_ISBN]].str());
  }
  if (indices_[BOOK_TITLE] != -1) {
    if (columns[indices_[BOOK_TITLE]] != StringRef(".", 1)) {
      cit.SetPublishedIn(columns[indices_[BOOK_TITLE]].str());
    }
  }
  if (indices_[JOURNAL_FULL] != -1) {
    if (columns[indices_[JOURNAL_FULL]] != StringRef(".", 1)) {
      if (cit.GetPublishedIn().length() > 0) {
        throw IOException(this->FormatDiagnostic(STAR_DIAG_WARNING,
                                                 "citation.book_title already occupies the 'published_in' field of this citation, cannot add " +
                                                 columns[indices_[JOURNAL_FULL]].str() +
                                                 ".",
                                                 this->GetCurrentLinenum()));
      } else {
        cit.SetPublishedIn(columns[indices_[JOURNAL_FULL]].str());
      }
    }
  }
  if (indices_[JOURNAL_VOLUME] != -1) {
    cit.SetVolume(columns[indices_[JOURNAL_VOLUME]].str());
  }
  if (indices_[PAGE_FIRST] != -1) {
    cit.SetPageFirst(columns[indices_[PAGE_FIRST]].str());
  }
  if (indices_[PAGE_LAST] != -1) {
    cit.SetPageLast(columns[indices_[PAGE_LAST]].str());
  }
  if (indices_[PDBX_DATABASE_ID_DOI] != -1) {
    cit.SetDOI(columns[indices_[PDBX_DATABASE_ID_DOI]].str());
  }
  if (indices_[PDBX_DATABASE_ID_PUBMED] != -1) {
    cit.SetPubMed(this->TryGetInt(columns[indices_[PDBX_DATABASE_ID_PUBMED]],
                                  "citation.pdbx_database_id_PubMed"));
  }
  if (indices_[YEAR] != -1) {
    cit.SetPubMed(this->TryGetInt(columns[indices_[YEAR]], "citation.year"));
  }
  if (indices_[TITLE] != -1) {
    cit.SetTitle(columns[indices_[TITLE]].str());
  }

  // store citation (wo author, yet)
  info_.AddCitation(cit);
}

void MMCifParser::ParseCitationAuthor(const std::vector<StringRef>& columns)
{
  // get/ pack values
  MMCifCitationAuthorMap::iterator atm_it;
  std::vector<String> at_vec;
  std::vector<int> pos_vec;
  atm_it = authors_map_.find(columns[indices_[AUTHOR_CITATION_ID]].str());
  if (atm_it != authors_map_.end()) {
    at_vec = atm_it->second.second;
    pos_vec = atm_it->second.first;
  }
  at_vec.push_back(columns[indices_[AUTHOR_NAME]].str());
  pos_vec.push_back(this->TryGetInt(columns[indices_[ORDINAL]],
                            "citation_author.ordinal"));

  // sort new author into right position
  std::vector<int>::iterator pos_it;
  std::vector<String>::iterator atv_it;
  int ti;
  String ts; 
  pos_it = pos_vec.end();
  atv_it = at_vec.end();
  --pos_it;
  --atv_it;
  for (; pos_it != pos_vec.begin(); --pos_it, --atv_it) {
    if (*pos_it < *(pos_it-1)) {
      ti = *pos_it;
      *pos_it = *(pos_it-1);
      *(pos_it-1) = ti;
      ts = *atv_it;
      *atv_it = *(atv_it-1);
      *(atv_it-1) = ts;
    }
    else {
      break;
    }
  }

  // store new values in map
  if (atm_it != authors_map_.end()) {
    atm_it->second.second = at_vec;
    atm_it->second.first  = pos_vec;
  } else {
    authors_map_.insert(MMCifCitationAuthorMap::value_type(
                               columns[indices_[AUTHOR_CITATION_ID]].str(),
              std::pair<std::vector<int>, std::vector<String> >(pos_vec, at_vec)
                               ));
  }
}

void MMCifParser::ParseExptl(const std::vector<StringRef>& columns)
{
  info_.SetMethod(columns[indices_[METHOD]].str());
}

void MMCifParser::ParseRefine(const std::vector<StringRef>& columns)
{
  info_.SetResolution(this->TryGetReal(columns[indices_[LS_D_RES_HIGH]],
                                       "refine.ls_d_res_high"));
}

void MMCifParser::OnDataRow(const StarLoopDesc& header, 
                            const std::vector<StringRef>& columns)
{
  switch(category_) {
  case ATOM_SITE:
    LOG_TRACE("processing atom_site entry");
    this->ParseAndAddAtom(columns);
    break;
  case ENTITY:
    LOG_TRACE("processing entity entry");
    this->ParseEntity(columns);
    break;
  case ENTITY_POLY:
    LOG_TRACE("processing entity_poly entry");
    this->ParseEntityPoly(columns);
    break;
  case CITATION:
    LOG_TRACE("processing citation entry");
    this->ParseCitation(columns);
    break;
  case CITATION_AUTHOR:
    LOG_TRACE("processing citation_author entry")
    this->ParseCitationAuthor(columns);
    break;
  case EXPTL:
    LOG_TRACE("processing exptl entry")
    this->ParseExptl(columns);
    break;
  case REFINE:
    LOG_TRACE("processing refine entry")
    this->ParseRefine(columns);
    break;
  default:
    throw IOException(this->FormatDiagnostic(STAR_DIAG_ERROR,
                       "Uncatched category '"+ header.GetCategory() +"' found.",
                                             this->GetCurrentLinenum()));
    return;
  }
}

void MMCifParser::OnEndData()
{
  mol::XCSEditor editor=ent_handle_.EditXCS(mol::BUFFERED_EDIT);

  // process chain types
  std::vector<std::pair<mol::ChainHandle, String> >::const_iterator css;
  MMCifEntityDescMap::const_iterator edm_it;
  for (css = chain_id_pairs_.begin(); css != chain_id_pairs_.end(); ++css) {
    edm_it = entity_desc_map_.find(css->second);

    if (edm_it != entity_desc_map_.end()) {
      editor.SetChainType(css->first, edm_it->second.type);
      editor.SetChainDescription(css->first, edm_it->second.details);
      if (edm_it->second.seqres.length() > 0) {
        seqres_.AddSequence(seq::CreateSequence(css->first.GetName(),
                                                edm_it->second.seqres));
      }
    } else {
      LOG_WARNING("No entity description found for atom_site.label_entity_id '"
                  << css->second << "'");
    }
  }

  // process citations (couple with authors
  // iterate citations
  MMCifCitationAuthorMap::const_iterator atm_it;
  std::vector<String>::const_iterator atv_it;
  std::vector<int>::const_iterator pos_it;
  for (atm_it = authors_map_.begin(); atm_it != authors_map_.end(); ++atm_it) {
    info_.AddAuthorsToCitation(StringRef(atm_it->first.c_str(),
                                         atm_it->first.length()),
                               atm_it->second.second);
  }

  LOG_INFO("imported "
           << chain_count_ << " chains, "
           << residue_count_ << " residues, "
           << atom_count_ << " atoms;");
}

}}