Skip to content
Snippets Groups Projects
Commit 7b0c0412 authored by Studer Gabriel's avatar Studer Gabriel
Browse files

introduce simple k-mer (k=5) counter for fast search of close sequences

parent 18f2f1aa
No related branches found
No related tags found
No related merge requests found
...@@ -39,4 +39,8 @@ void export_afdb() ...@@ -39,4 +39,8 @@ void export_afdb()
arg("result_list"))); arg("result_list")));
def("CreateAFDBIdx", &CreateAFDBIdx, (arg("uniprot_ac"), arg("fragment"), def("CreateAFDBIdx", &CreateAFDBIdx, (arg("uniprot_ac"), arg("fragment"),
arg("version"))); arg("version")));
def("CreatePentaMatch", &CreatePentaMatch, (arg("seq_list"),
arg("db_path"),
arg("entries_from_seqnames")));
} }
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
// limitations under the License. // limitations under the License.
#include <algorithm> #include <algorithm>
#include <fstream>
#include <ost/log.hh> #include <ost/log.hh>
#include <ost/string_ref.hh>
#include <promod3/modelling/afdb.hh> #include <promod3/modelling/afdb.hh>
...@@ -48,6 +50,15 @@ namespace{ ...@@ -48,6 +50,15 @@ namespace{
throw ost::Error(ss.str()); throw ost::Error(ss.str());
} }
inline int PentamerToIdx(const char* ptr) {
return CharToIdx(ptr[0])*160000 + CharToIdx(ptr[1])*8000 +
CharToIdx(ptr[2])*400 + CharToIdx(ptr[3])*20 + CharToIdx(ptr[4]);
}
inline int PentamerToIdx(const int* ptr) {
return ptr[0]*160000 + ptr[1]*8000 + ptr[2]*400 + ptr[3]*20 + ptr[4];
}
inline uint64_t AlphaToIdx(char ch) { inline uint64_t AlphaToIdx(char ch) {
if(ch == ' ') { if(ch == ' ') {
return 0; return 0;
...@@ -90,17 +101,22 @@ namespace{ ...@@ -90,17 +101,22 @@ namespace{
namespace promod3 { namespace modelling { namespace promod3 { namespace modelling {
int PentamerToIdx(const char* ptr) {
return CharToIdx(ptr[0])*160000 + CharToIdx(ptr[1])*8000 +
CharToIdx(ptr[2])*400 + CharToIdx(ptr[3])*20 + CharToIdx(ptr[4]);
}
void SeqToPentamerIndices(const String& seq, bool unique, std::vector<int>& indices) { void SeqToPentamerIndices(const String& seq, bool unique, std::vector<int>& indices) {
int N = seq.size() - 4; int N = seq.size();
if(N < 5) {
std::stringstream ss;
ss << "Sequence must have at least length 5, got: " << seq;
throw ost::Error(ss.str());
}
indices.resize(N); indices.resize(N);
for(int i = 0; i < N; ++i) { for(int i = 0; i < N; ++i) {
indices[i] = PentamerToIdx(&seq[i]); indices[i] = CharToIdx(seq[i]);
}
for(int i = 0; i < N-4; ++i) {
indices[i] = PentamerToIdx(&indices[i]);
} }
indices.resize(N-4);
if(unique) { if(unique) {
auto last = std::unique(indices.begin(), indices.end()); auto last = std::unique(indices.begin(), indices.end());
indices.erase(last, indices.end()); indices.erase(last, indices.end());
...@@ -123,40 +139,40 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) { ...@@ -123,40 +139,40 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
} }
if(!CheckNumeric(uniprot_ac[1], false)) { if(!CheckNumeric(uniprot_ac[1], false)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 1 of uniprot AC");
} }
if(!CheckAlphaNumeric(uniprot_ac[2], false)) { if(!CheckAlphaNumeric(uniprot_ac[2], false)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 2 of uniprot AC");
} }
if(!CheckAlphaNumeric(uniprot_ac[3], false)) { if(!CheckAlphaNumeric(uniprot_ac[3], false)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 3 of uniprot AC");
} }
if(!CheckAlphaNumeric(uniprot_ac[4], false)) { if(!CheckAlphaNumeric(uniprot_ac[4], false)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 4 of uniprot AC");
} }
if(!CheckNumeric(uniprot_ac[5], false)) { if(!CheckNumeric(uniprot_ac[5], false)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 5 of uniprot AC");
} }
if(ac_size > 6) { if(ac_size > 6) {
if(!CheckAlpha(uniprot_ac[6], true)) { if(!CheckAlpha(uniprot_ac[6], true)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 6 of uniprot AC");
} }
if(!CheckAlphaNumeric(uniprot_ac[7], true)) { if(!CheckAlphaNumeric(uniprot_ac[7], true)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 7 of uniprot AC");
} }
if(!CheckAlphaNumeric(uniprot_ac[8], true)) { if(!CheckAlphaNumeric(uniprot_ac[8], true)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 8 of uniprot AC");
} }
if(!CheckNumeric(uniprot_ac[9], true)) { if(!CheckNumeric(uniprot_ac[9], true)) {
throw ost::Error("Exp capital alphabetic character at idx 0 of uniprot AC"); throw ost::Error("Exp capital alphabetic character at idx 9 of uniprot AC");
} }
} }
...@@ -187,6 +203,92 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) { ...@@ -187,6 +203,92 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
return idx; return idx;
} }
void CreatePentaMatch(const ost::seq::SequenceList& seq_list,
const String& db_path,
bool entries_from_seqnames) {
String indexer_path = db_path + "/" + "indexer.dat";
String pos_path = db_path + "/" + "pos.dat";
String length_path = db_path + "/" + "length.dat";
String meta_path = db_path + "/" + "meta.dat";
std::vector<int> entry_indices;
entry_indices.reserve(seq_list.GetCount());
if(entries_from_seqnames) {
for(int i = 0; i < seq_list.GetCount(); ++i) {
const String& sname = seq_list[i].GetName();
ost::StringRef s(&sname[0], sname.size());
std::pair<bool, int> idx = s.trim().to_int();
if(idx.first == false) {
std::stringstream ss;
ss << "Cannot cast seq name to integer: " << sname;
throw ost::Error(ss.str());
}
entry_indices.push_back(idx.second);
}
} else {
for(int i = 0; i < seq_list.GetCount(); ++i) {
entry_indices.push_back(i);
}
}
std::vector<std::vector<int32_t> > indexer(3200000);
for(int i = 0; i < seq_list.GetCount(); ++i) {
int entry_idx = entry_indices[i];
std::vector<int> penta_indices;
SeqToPentamerIndices(seq_list[i].GetString(), true, penta_indices);
for(auto j = penta_indices.begin(); j != penta_indices.end(); ++j) {
indexer[*j].push_back(entry_idx);
}
}
for(auto i = indexer.begin(); i != indexer.end(); ++i) {
std::sort(i->begin(), i->end());
}
std::ofstream indexer_stream(indexer_path, std::ios::binary);
if(!indexer_stream) {
throw ost::Error("Could not open indexer file: " + indexer_path);
}
std::vector<int64_t> positions;
std::vector<int32_t> lengths;
int64_t current_pos = 0;
for(auto i = indexer.begin(); i != indexer.end(); ++i) {
int size = i->size();
if(size > 0) {
indexer_stream.write(reinterpret_cast<char*>(&(*i)[0]),
size*sizeof(int32_t));
}
positions.push_back(current_pos);
lengths.push_back(size);
current_pos += size;
}
indexer_stream.close();
std::ofstream pos_stream(pos_path, std::ios::binary);
std::ofstream length_stream(length_path, std::ios::binary);
std::ofstream meta_stream(meta_path, std::ofstream::out);
if(!pos_stream) {
throw ost::Error("Could not open pos file: " + pos_path);
}
if(!length_stream) {
throw ost::Error("Could not open length file: " + length_path);
}
if(!meta_stream) {
throw ost::Error("Could not open meta file: " + meta_path);
}
pos_stream.write(reinterpret_cast<char*>(&positions[0]),
positions.size()*sizeof(int64_t));
length_stream.write(reinterpret_cast<char*>(&lengths[0]),
lengths.size()*sizeof(int32_t));
int32_t max_entry_idx = *std::max_element(entry_indices.begin(),
entry_indices.end());
meta_stream << max_entry_idx;
pos_stream.close();
length_stream.close();
meta_stream.close();
}
}} //ns }} //ns
...@@ -17,17 +17,20 @@ ...@@ -17,17 +17,20 @@
#define PM3_MODELLING_AFDB_HH #define PM3_MODELLING_AFDB_HH
#include <ost/mol/mol.hh> #include <ost/mol/mol.hh>
#include <ost/seq/sequence_list.hh>
#include <promod3/core/message.hh> #include <promod3/core/message.hh>
namespace promod3 { namespace modelling {
int PentamerToIdx(const char* ptr); namespace promod3 { namespace modelling {
void SeqToPentamerIndices(const String& seq, bool unique, void SeqToPentamerIndices(const String& seq, bool unique,
std::vector<int>& indices); std::vector<int>& indices);
uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version); void CreatePentaMatch(const ost::seq::SequenceList& seq_list,
const String& db_path,
bool entries_from_seqnames);
uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version);
}} //ns }} //ns
#endif #endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment