Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
P
ProMod3
Manage
Activity
Members
Plan
Jira
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container registry
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
schwede
ProMod3
Commits
7b0c0412
Commit
7b0c0412
authored
2 years ago
by
Studer Gabriel
Browse files
Options
Downloads
Patches
Plain Diff
introduce simple k-mer (k=5) counter for fast search of close sequences
parent
18f2f1aa
No related branches found
No related tags found
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
modelling/pymod/export_afdb.cc
+4
-0
4 additions, 0 deletions
modelling/pymod/export_afdb.cc
modelling/src/afdb.cc
+117
-15
117 additions, 15 deletions
modelling/src/afdb.cc
modelling/src/afdb.hh
+6
-3
6 additions, 3 deletions
modelling/src/afdb.hh
with
127 additions
and
18 deletions
modelling/pymod/export_afdb.cc
+
4
−
0
View file @
7b0c0412
...
@@ -39,4 +39,8 @@ void export_afdb()
...
@@ -39,4 +39,8 @@ void export_afdb()
arg
(
"result_list"
)));
arg
(
"result_list"
)));
def
(
"CreateAFDBIdx"
,
&
CreateAFDBIdx
,
(
arg
(
"uniprot_ac"
),
arg
(
"fragment"
),
def
(
"CreateAFDBIdx"
,
&
CreateAFDBIdx
,
(
arg
(
"uniprot_ac"
),
arg
(
"fragment"
),
arg
(
"version"
)));
arg
(
"version"
)));
def
(
"CreatePentaMatch"
,
&
CreatePentaMatch
,
(
arg
(
"seq_list"
),
arg
(
"db_path"
),
arg
(
"entries_from_seqnames"
)));
}
}
This diff is collapsed.
Click to expand it.
modelling/src/afdb.cc
+
117
−
15
View file @
7b0c0412
...
@@ -14,8 +14,10 @@
...
@@ -14,8 +14,10 @@
// limitations under the License.
// limitations under the License.
#include
<algorithm>
#include
<algorithm>
#include
<fstream>
#include
<ost/log.hh>
#include
<ost/log.hh>
#include
<ost/string_ref.hh>
#include
<promod3/modelling/afdb.hh>
#include
<promod3/modelling/afdb.hh>
...
@@ -48,6 +50,15 @@ namespace{
...
@@ -48,6 +50,15 @@ namespace{
throw
ost
::
Error
(
ss
.
str
());
throw
ost
::
Error
(
ss
.
str
());
}
}
inline
int
PentamerToIdx
(
const
char
*
ptr
)
{
return
CharToIdx
(
ptr
[
0
])
*
160000
+
CharToIdx
(
ptr
[
1
])
*
8000
+
CharToIdx
(
ptr
[
2
])
*
400
+
CharToIdx
(
ptr
[
3
])
*
20
+
CharToIdx
(
ptr
[
4
]);
}
inline
int
PentamerToIdx
(
const
int
*
ptr
)
{
return
ptr
[
0
]
*
160000
+
ptr
[
1
]
*
8000
+
ptr
[
2
]
*
400
+
ptr
[
3
]
*
20
+
ptr
[
4
];
}
inline
uint64_t
AlphaToIdx
(
char
ch
)
{
inline
uint64_t
AlphaToIdx
(
char
ch
)
{
if
(
ch
==
' '
)
{
if
(
ch
==
' '
)
{
return
0
;
return
0
;
...
@@ -90,17 +101,22 @@ namespace{
...
@@ -90,17 +101,22 @@ namespace{
namespace
promod3
{
namespace
modelling
{
namespace
promod3
{
namespace
modelling
{
int
PentamerToIdx
(
const
char
*
ptr
)
{
return
CharToIdx
(
ptr
[
0
])
*
160000
+
CharToIdx
(
ptr
[
1
])
*
8000
+
CharToIdx
(
ptr
[
2
])
*
400
+
CharToIdx
(
ptr
[
3
])
*
20
+
CharToIdx
(
ptr
[
4
]);
}
void
SeqToPentamerIndices
(
const
String
&
seq
,
bool
unique
,
std
::
vector
<
int
>&
indices
)
{
void
SeqToPentamerIndices
(
const
String
&
seq
,
bool
unique
,
std
::
vector
<
int
>&
indices
)
{
int
N
=
seq
.
size
()
-
4
;
int
N
=
seq
.
size
();
if
(
N
<
5
)
{
std
::
stringstream
ss
;
ss
<<
"Sequence must have at least length 5, got: "
<<
seq
;
throw
ost
::
Error
(
ss
.
str
());
}
indices
.
resize
(
N
);
indices
.
resize
(
N
);
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
for
(
int
i
=
0
;
i
<
N
;
++
i
)
{
indices
[
i
]
=
PentamerToIdx
(
&
seq
[
i
]);
indices
[
i
]
=
CharToIdx
(
seq
[
i
]);
}
for
(
int
i
=
0
;
i
<
N
-
4
;
++
i
)
{
indices
[
i
]
=
PentamerToIdx
(
&
indices
[
i
]);
}
}
indices
.
resize
(
N
-
4
);
if
(
unique
)
{
if
(
unique
)
{
auto
last
=
std
::
unique
(
indices
.
begin
(),
indices
.
end
());
auto
last
=
std
::
unique
(
indices
.
begin
(),
indices
.
end
());
indices
.
erase
(
last
,
indices
.
end
());
indices
.
erase
(
last
,
indices
.
end
());
...
@@ -123,40 +139,40 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
...
@@ -123,40 +139,40 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
}
}
if
(
!
CheckNumeric
(
uniprot_ac
[
1
],
false
))
{
if
(
!
CheckNumeric
(
uniprot_ac
[
1
],
false
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
1
of uniprot AC"
);
}
}
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
2
],
false
))
{
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
2
],
false
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
2
of uniprot AC"
);
}
}
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
3
],
false
))
{
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
3
],
false
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
3
of uniprot AC"
);
}
}
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
4
],
false
))
{
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
4
],
false
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
4
of uniprot AC"
);
}
}
if
(
!
CheckNumeric
(
uniprot_ac
[
5
],
false
))
{
if
(
!
CheckNumeric
(
uniprot_ac
[
5
],
false
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
5
of uniprot AC"
);
}
}
if
(
ac_size
>
6
)
{
if
(
ac_size
>
6
)
{
if
(
!
CheckAlpha
(
uniprot_ac
[
6
],
true
))
{
if
(
!
CheckAlpha
(
uniprot_ac
[
6
],
true
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
6
of uniprot AC"
);
}
}
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
7
],
true
))
{
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
7
],
true
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
7
of uniprot AC"
);
}
}
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
8
],
true
))
{
if
(
!
CheckAlphaNumeric
(
uniprot_ac
[
8
],
true
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
8
of uniprot AC"
);
}
}
if
(
!
CheckNumeric
(
uniprot_ac
[
9
],
true
))
{
if
(
!
CheckNumeric
(
uniprot_ac
[
9
],
true
))
{
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
0
of uniprot AC"
);
throw
ost
::
Error
(
"Exp capital alphabetic character at idx
9
of uniprot AC"
);
}
}
}
}
...
@@ -187,6 +203,92 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
...
@@ -187,6 +203,92 @@ uint64_t CreateAFDBIdx(const String& uniprot_ac, int fragment, int version) {
return
idx
;
return
idx
;
}
}
void
CreatePentaMatch
(
const
ost
::
seq
::
SequenceList
&
seq_list
,
const
String
&
db_path
,
bool
entries_from_seqnames
)
{
String
indexer_path
=
db_path
+
"/"
+
"indexer.dat"
;
String
pos_path
=
db_path
+
"/"
+
"pos.dat"
;
String
length_path
=
db_path
+
"/"
+
"length.dat"
;
String
meta_path
=
db_path
+
"/"
+
"meta.dat"
;
std
::
vector
<
int
>
entry_indices
;
entry_indices
.
reserve
(
seq_list
.
GetCount
());
if
(
entries_from_seqnames
)
{
for
(
int
i
=
0
;
i
<
seq_list
.
GetCount
();
++
i
)
{
const
String
&
sname
=
seq_list
[
i
].
GetName
();
ost
::
StringRef
s
(
&
sname
[
0
],
sname
.
size
());
std
::
pair
<
bool
,
int
>
idx
=
s
.
trim
().
to_int
();
if
(
idx
.
first
==
false
)
{
std
::
stringstream
ss
;
ss
<<
"Cannot cast seq name to integer: "
<<
sname
;
throw
ost
::
Error
(
ss
.
str
());
}
entry_indices
.
push_back
(
idx
.
second
);
}
}
else
{
for
(
int
i
=
0
;
i
<
seq_list
.
GetCount
();
++
i
)
{
entry_indices
.
push_back
(
i
);
}
}
std
::
vector
<
std
::
vector
<
int32_t
>
>
indexer
(
3200000
);
for
(
int
i
=
0
;
i
<
seq_list
.
GetCount
();
++
i
)
{
int
entry_idx
=
entry_indices
[
i
];
std
::
vector
<
int
>
penta_indices
;
SeqToPentamerIndices
(
seq_list
[
i
].
GetString
(),
true
,
penta_indices
);
for
(
auto
j
=
penta_indices
.
begin
();
j
!=
penta_indices
.
end
();
++
j
)
{
indexer
[
*
j
].
push_back
(
entry_idx
);
}
}
for
(
auto
i
=
indexer
.
begin
();
i
!=
indexer
.
end
();
++
i
)
{
std
::
sort
(
i
->
begin
(),
i
->
end
());
}
std
::
ofstream
indexer_stream
(
indexer_path
,
std
::
ios
::
binary
);
if
(
!
indexer_stream
)
{
throw
ost
::
Error
(
"Could not open indexer file: "
+
indexer_path
);
}
std
::
vector
<
int64_t
>
positions
;
std
::
vector
<
int32_t
>
lengths
;
int64_t
current_pos
=
0
;
for
(
auto
i
=
indexer
.
begin
();
i
!=
indexer
.
end
();
++
i
)
{
int
size
=
i
->
size
();
if
(
size
>
0
)
{
indexer_stream
.
write
(
reinterpret_cast
<
char
*>
(
&
(
*
i
)[
0
]),
size
*
sizeof
(
int32_t
));
}
positions
.
push_back
(
current_pos
);
lengths
.
push_back
(
size
);
current_pos
+=
size
;
}
indexer_stream
.
close
();
std
::
ofstream
pos_stream
(
pos_path
,
std
::
ios
::
binary
);
std
::
ofstream
length_stream
(
length_path
,
std
::
ios
::
binary
);
std
::
ofstream
meta_stream
(
meta_path
,
std
::
ofstream
::
out
);
if
(
!
pos_stream
)
{
throw
ost
::
Error
(
"Could not open pos file: "
+
pos_path
);
}
if
(
!
length_stream
)
{
throw
ost
::
Error
(
"Could not open length file: "
+
length_path
);
}
if
(
!
meta_stream
)
{
throw
ost
::
Error
(
"Could not open meta file: "
+
meta_path
);
}
pos_stream
.
write
(
reinterpret_cast
<
char
*>
(
&
positions
[
0
]),
positions
.
size
()
*
sizeof
(
int64_t
));
length_stream
.
write
(
reinterpret_cast
<
char
*>
(
&
lengths
[
0
]),
lengths
.
size
()
*
sizeof
(
int32_t
));
int32_t
max_entry_idx
=
*
std
::
max_element
(
entry_indices
.
begin
(),
entry_indices
.
end
());
meta_stream
<<
max_entry_idx
;
pos_stream
.
close
();
length_stream
.
close
();
meta_stream
.
close
();
}
}}
//ns
}}
//ns
This diff is collapsed.
Click to expand it.
modelling/src/afdb.hh
+
6
−
3
View file @
7b0c0412
...
@@ -17,17 +17,20 @@
...
@@ -17,17 +17,20 @@
#define PM3_MODELLING_AFDB_HH
#define PM3_MODELLING_AFDB_HH
#include
<ost/mol/mol.hh>
#include
<ost/mol/mol.hh>
#include
<ost/seq/sequence_list.hh>
#include
<promod3/core/message.hh>
#include
<promod3/core/message.hh>
namespace
promod3
{
namespace
modelling
{
int
PentamerToIdx
(
const
char
*
ptr
);
namespace
promod3
{
namespace
modelling
{
void
SeqToPentamerIndices
(
const
String
&
seq
,
bool
unique
,
void
SeqToPentamerIndices
(
const
String
&
seq
,
bool
unique
,
std
::
vector
<
int
>&
indices
);
std
::
vector
<
int
>&
indices
);
uint64_t
CreateAFDBIdx
(
const
String
&
uniprot_ac
,
int
fragment
,
int
version
);
void
CreatePentaMatch
(
const
ost
::
seq
::
SequenceList
&
seq_list
,
const
String
&
db_path
,
bool
entries_from_seqnames
);
uint64_t
CreateAFDBIdx
(
const
String
&
uniprot_ac
,
int
fragment
,
int
version
);
}}
//ns
}}
//ns
#endif
#endif
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment