diff --git a/modelling/pymod/export_afdb.cc b/modelling/pymod/export_afdb.cc index b4d4eb08172622358e26d2782e2547e70b8e2086..7a9741d7552909274fd0eaa84c667376996a76be 100644 --- a/modelling/pymod/export_afdb.cc +++ b/modelling/pymod/export_afdb.cc @@ -39,4 +39,8 @@ void export_afdb() arg("result_list"))); def("CreateAFDBIdx", &CreateAFDBIdx, (arg("uniprot_ac"), arg("fragment"), arg("version"))); + + def("CreatePentaMatch", &CreatePentaMatch, (arg("seq_list"), + arg("db_path"), + arg("entries_from_seqnames"))); } diff --git a/modelling/src/afdb.cc b/modelling/src/afdb.cc index d41564c84f2ded3caa41117aa8c69f5bb25756bb..210146ac2b5fbcec97e2dc029e5fb3ef5a81245f 100644 --- a/modelling/src/afdb.cc +++ b/modelling/src/afdb.cc @@ -14,8 +14,10 @@ // limitations under the License. #include <algorithm> +#include <fstream> #include <ost/log.hh> +#include <ost/string_ref.hh> #include <promod3/modelling/afdb.hh> @@ -48,6 +50,15 @@ namespace{ throw ost::Error(ss.str()); } + inline int PentamerToIdx(const char* ptr) { + return CharToIdx(ptr[0])*160000 + CharToIdx(ptr[1])*8000 + + CharToIdx(ptr[2])*400 + CharToIdx(ptr[3])*20 + CharToIdx(ptr[4]); + } + + inline int PentamerToIdx(const int* ptr) { + return ptr[0]*160000 + ptr[1]*8000 + ptr[2]*400 + ptr[3]*20 + ptr[4]; + } + inline uint64_t AlphaToIdx(char ch) { if(ch == ' ') { return 0; @@ -90,17 +101,22 @@ namespace{ namespace promod3 { namespace modelling { -int PentamerToIdx(const char* ptr) { - return CharToIdx(ptr[0])*160000 + CharToIdx(ptr[1])*8000 + - CharToIdx(ptr[2])*400 + CharToIdx(ptr[3])*20 + CharToIdx(ptr[4]); -} void SeqToPentamerIndices(const String& seq, bool unique, std::vector<int>& indices) { - int N = seq.size() - 4; + int N = seq.size(); + if(N < 5) { + std::stringstream ss; + ss << "Sequence must have at least length 5, got: " << seq; + throw ost::Error(ss.str()); + } indices.resize(N); for(int i = 0; i < N; ++i) { - indices[i] = PentamerToIdx(&seq[i]); + indices[i] = CharToIdx(seq[i]); + } + for(int i = 0; i < N-4; ++i) { + indices[i] = PentamerToIdx(&indices[i]); } + indices.resize(N-4); if(unique) { auto last = std::unique(indices.begin(), indices.end()); indices.erase(last, indices.end()); @@ -123,40 +139,40 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) { } if(!CheckNumeric(uniprot_ac[1], false)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 1 of uniprot AC"); } if(!CheckAlphaNumeric(uniprot_ac[2], false)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 2 of uniprot AC"); } if(!CheckAlphaNumeric(uniprot_ac[3], false)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 3 of uniprot AC"); } if(!CheckAlphaNumeric(uniprot_ac[4], false)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 4 of uniprot AC"); } if(!CheckNumeric(uniprot_ac[5], false)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 5 of uniprot AC"); } if(ac_size > 6) { if(!CheckAlpha(uniprot_ac[6], true)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 6 of uniprot AC"); } if(!CheckAlphaNumeric(uniprot_ac[7], true)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 7 of uniprot AC"); } if(!CheckAlphaNumeric(uniprot_ac[8], true)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 8 of uniprot AC"); } if(!CheckNumeric(uniprot_ac[9], true)) { - throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); + throw ost::Error("Exp capital alphabetic character at idx 9 of uniprot AC"); } } @@ -187,6 +203,92 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) { return idx; } +void CreatePentaMatch(const ost::seq::SequenceList& seq_list, + const String& db_path, + bool entries_from_seqnames) { + + String indexer_path = db_path + "/" + "indexer.dat"; + String pos_path = db_path + "/" + "pos.dat"; + String length_path = db_path + "/" + "length.dat"; + String meta_path = db_path + "/" + "meta.dat"; + + std::vector<int> entry_indices; + entry_indices.reserve(seq_list.GetCount()); + if(entries_from_seqnames) { + for(int i = 0; i < seq_list.GetCount(); ++i) { + const String& sname = seq_list[i].GetName(); + ost::StringRef s(&sname[0], sname.size()); + std::pair<bool, int> idx = s.trim().to_int(); + if(idx.first == false) { + std::stringstream ss; + ss << "Cannot cast seq name to integer: " << sname; + throw ost::Error(ss.str()); + } + entry_indices.push_back(idx.second); + } + } else { + for(int i = 0; i < seq_list.GetCount(); ++i) { + entry_indices.push_back(i); + } + } + + std::vector<std::vector<int32_t> > indexer(3200000); + for(int i = 0; i < seq_list.GetCount(); ++i) { + int entry_idx = entry_indices[i]; + std::vector<int> penta_indices; + SeqToPentamerIndices(seq_list[i].GetString(), true, penta_indices); + for(auto j = penta_indices.begin(); j != penta_indices.end(); ++j) { + indexer[*j].push_back(entry_idx); + } + } + + for(auto i = indexer.begin(); i != indexer.end(); ++i) { + std::sort(i->begin(), i->end()); + } + + std::ofstream indexer_stream(indexer_path, std::ios::binary); + if(!indexer_stream) { + throw ost::Error("Could not open indexer file: " + indexer_path); + } + std::vector<int64_t> positions; + std::vector<int32_t> lengths; + int64_t current_pos = 0; + for(auto i = indexer.begin(); i != indexer.end(); ++i) { + int size = i->size(); + if(size > 0) { + indexer_stream.write(reinterpret_cast<char*>(&(*i)[0]), + size*sizeof(int32_t)); + } + positions.push_back(current_pos); + lengths.push_back(size); + current_pos += size; + } + indexer_stream.close(); + + std::ofstream pos_stream(pos_path, std::ios::binary); + std::ofstream length_stream(length_path, std::ios::binary); + std::ofstream meta_stream(meta_path, std::ofstream::out); + if(!pos_stream) { + throw ost::Error("Could not open pos file: " + pos_path); + } + if(!length_stream) { + throw ost::Error("Could not open length file: " + length_path); + } + if(!meta_stream) { + throw ost::Error("Could not open meta file: " + meta_path); + } + pos_stream.write(reinterpret_cast<char*>(&positions[0]), + positions.size()*sizeof(int64_t)); + length_stream.write(reinterpret_cast<char*>(&lengths[0]), + lengths.size()*sizeof(int32_t)); + int32_t max_entry_idx = *std::max_element(entry_indices.begin(), + entry_indices.end()); + meta_stream << max_entry_idx; + pos_stream.close(); + length_stream.close(); + meta_stream.close(); +} + }} //ns diff --git a/modelling/src/afdb.hh b/modelling/src/afdb.hh index e48f792b0fbcb75323a47953ca06ccc6dae19e20..551e51d97285707825851f19fdb721fbb0b1687f 100644 --- a/modelling/src/afdb.hh +++ b/modelling/src/afdb.hh @@ -17,17 +17,20 @@ #define PM3_MODELLING_AFDB_HH #include <ost/mol/mol.hh> +#include <ost/seq/sequence_list.hh> #include <promod3/core/message.hh> -namespace promod3 { namespace modelling { -int PentamerToIdx(const char* ptr); +namespace promod3 { namespace modelling { void SeqToPentamerIndices(const String& seq, bool unique, std::vector<int>& indices); -uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version); +void CreatePentaMatch(const ost::seq::SequenceList& seq_list, + const String& db_path, + bool entries_from_seqnames); +uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version); }} //ns #endif