diff --git a/kmatch.cpp b/kmatch.cpp index f2a93df43a437152ddfbdeb992dab501c1e27233..9de0877a1537aa624fa08e9ee31ab07346b5c038 100644 --- a/kmatch.cpp +++ b/kmatch.cpp @@ -306,8 +306,7 @@ public: meta_stream.close(); } - std::vector<int32_t> TopN(const std::string& sequence, int32_t top_n, - bool unique) const { + void Accumulate(const std::string& sequence, bool unique) const { /////////// // SETUP // @@ -358,11 +357,13 @@ public: ++accumulator_[read_buffer[i]]; } } - } + } + } + + std::vector<int32_t> TopN(const std::string& sequence, int32_t top_n, + bool unique) const { - /////////////// - // GET TOP N // - /////////////// + this->Accumulate(sequence, unique); // pair of numbers per element in top_n (count and index) // which are sorted by counts (descending, i.e. top count in front) @@ -398,6 +399,32 @@ public: return best_v; } + std::vector<int32_t> GetHits(const std::string& sequence, int32_t min_hits, + bool unique) const { + + this->Accumulate(sequence, unique); + + std::vector<std::pair<int32_t, int32_t> > hits; + + for(int32_t i = 0; i < N_; ++i) { + if(accumulator_[i] >= min_hits) { + hits.push_back(std::make_pair(accumulator_[i], i)); + } + } + + std::sort(hits.begin(), hits.end(), std::greater<std::pair<int32_t, int32_t> >()); + + int32_t n_hits = hits.size(); + std::vector<int32_t> result_vec(n_hits * 2); + + for(int32_t i = 0; i < n_hits; ++i) { + result_vec[2*i] = hits[i].first; + result_vec[2*i+1] = hits[i].second; + } + + return result_vec; + } + private: bool in_mem_indexer_; std::vector<int64_t> pos_; @@ -417,7 +444,8 @@ PYBIND11_MODULE(kmatch, m) { pybind11::class_<KMatch>(m, "KMatch") .def(pybind11::init<const std::string&, bool>()) .def_static("FromFasta", &KMatch::FromFasta) - .def("TopN", &KMatch::TopN); + .def("TopN", &KMatch::TopN) + .def("GetHits", &KMatch::GetHits); } } // ns