Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
O
openstructure
Manage
Activity
Members
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Deploy
Releases
Container registry
Model registry
Analyze
Contributor analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
schwede
openstructure
Commits
9d4c35ba
Commit
9d4c35ba
authored
12 years ago
by
Gabriel Studer
Browse files
Options
Downloads
Patches
Plain Diff
binging for protein sequence clustering program kclust. No unit tests
yet.
parent
b7c9d184
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
modules/bindings/pymod/CMakeLists.txt
+1
-0
1 addition, 0 deletions
modules/bindings/pymod/CMakeLists.txt
modules/bindings/pymod/kclust.py
+148
-0
148 additions, 0 deletions
modules/bindings/pymod/kclust.py
with
149 additions
and
0 deletions
modules/bindings/pymod/CMakeLists.txt
+
1
−
0
View file @
9d4c35ba
...
...
@@ -10,5 +10,6 @@ utils.py
naccess.py
blast.py
cadscore.py
kclust.py
)
pymod
(
NAME bindings PY
${
OST_BINDINGS
}
)
This diff is collapsed.
Click to expand it.
modules/bindings/pymod/kclust.py
0 → 100644
+
148
−
0
View file @
9d4c35ba
import
subprocess
,
os
,
tempfile
from
ost
import
io
,
seq
,
settings
"""
Wrapper for kClust a protein sequence clustering algorithm
unpublished, but mentioned in:
Michael Remmert, Andreas Biegert, Andreas Hauser & Johannes Soeding
HHblits: lighting-fast iterative protein sequence searching by
HMM-HMM alignment
Nature Methods 9, 173-175 (2012)
"""
class
cluster
:
"""
Holds the information of one cluster
.. attribute:: sequences
SequenceList containing all sequences of the cluster
.. attribute:: representative_id
a string, that contains the name of the representative sequence of the cluster
as estimated by kClust.
.. attribute:: alignment
alignment handle containing a multiple sequence alignment of all sequences of
the cluster. Gets only calculated when binding is called with appropriate flag.
"""
def
__init__
(
self
,
sequences
,
representative_id
):
self
.
sequences
=
sequences
self
.
representative_id
=
representative_id
self
.
alignment
=
None
def
_SetupFiles
(
sequences
):
# create temporary directory
tmp_dir_name
=
tempfile
.
mkdtemp
()
io
.
SaveSequenceList
(
sequences
,
os
.
path
.
join
(
tmp_dir_name
,
'
fastadb.fasta
'
))
return
tmp_dir_name
def
_CleanupFiles
(
tmp_dir_name
):
import
shutil
shutil
.
rmtree
(
tmp_dir_name
)
def
_ParseOutput
(
tmp_dir_name
):
header_data
=
open
(
os
.
path
.
join
(
tmp_dir_name
,
'
headers.dmp
'
),
'
r
'
).
readlines
()
cluster_data
=
open
(
os
.
path
.
join
(
tmp_dir_name
,
'
clusters.dmp
'
),
'
r
'
).
readlines
()
sequences
=
io
.
LoadSequenceList
(
os
.
path
.
join
(
tmp_dir_name
,
'
fastadb.fasta
'
))
clusters
=
dict
()
header_mapper
=
dict
()
for
line
in
header_data
:
header_mapper
[
int
(
line
.
split
()[
0
])]
=
line
.
split
()[
1
].
strip
().
strip
(
'
>
'
)
#find numeric ids of the representatives of the clusters
unique_representatives
=
list
()
for
line
in
cluster_data
[
1
:]:
actual_cluster
=
int
(
line
.
split
()[
1
])
try
:
unique_representatives
.
index
(
actual_cluster
)
except
:
unique_representatives
.
append
(
actual_cluster
)
#assign every header to its corresponding cluster, where the
#cluster id is given by the id of the representative of the cluster
for
idx
in
unique_representatives
:
clusters
[
idx
]
=
seq
.
CreateSequenceList
()
for
line
in
cluster_data
[
1
:]:
clusters
[
int
(
line
.
split
()[
1
])].
AddSequence
(
sequences
.
FindSequence
(
header_mapper
[
int
(
line
.
split
()[
0
])]))
#translate into final output
res
=
list
()
for
k
,
v
in
clusters
.
iteritems
():
res
.
append
(
cluster
(
v
,
header_mapper
[
k
]))
return
res
def
_RunkClust
(
tmp_dir_name
,
clustering_thresh
,
create_alignments
):
bitscore
=
clustering_thresh
*
0.060269
-
0.68498
executable
=
settings
.
Locate
(
'
kClust
'
)
cmd
=
[]
cmd
.
append
(
'
kClust
'
)
cmd
.
append
(
'
-i
'
)
cmd
.
append
(
os
.
path
.
join
(
tmp_dir_name
,
'
fastadb.fasta
'
))
cmd
.
append
(
'
-d
'
)
cmd
.
append
(
tmp_dir_name
)
cmd
.
append
(
'
-s
'
)
cmd
.
append
(
str
(
bitscore
))
cmd
=
'
'
.
join
(
cmd
)
ps
=
subprocess
.
Popen
(
cmd
,
shell
=
True
,
stdout
=
subprocess
.
PIPE
)
ps
.
stdout
.
readlines
()
result
=
_ParseOutput
(
tmp_dir_name
)
if
(
create_alignments
):
from
ost.bindings
import
clustalw
for
c
in
result
:
if
len
(
c
.
sequences
)
>
1
:
c
.
alignment
=
clustalw
.
ClustalW
(
c
.
sequences
)
else
:
aln
=
seq
.
CreateAlignment
()
aln
.
AddSequence
(
c
.
sequences
[
0
])
c
.
alignment
=
aln
return
result
def
kClust
(
sequences
,
clustering_thresh
=
30
,
create_alignments
=
False
):
"""
Uses kClust to generate clusters of amino acid sequences.
:param sequences: All sequences you want to cluster.
:type sequences: :class:`ost.seq.SequenceList`
:param clustering_thres: Sequence identity threshold to build clusters. Note,
that clustering_thresh is more a rule of thumb, since
compositional bias in the sequence can also play a role.
The value gets transformed in a bitscore, that is used
as an input parameter of kClust
:param create_alignments: Flag, wether the alignments of the clusters get calculated.
Requires clustalw in the path.
:returns: A list of cluster instances
:raises: :class:`~ost.settings.FileNotFound` if kClust could not be located.
"""
tmp_dir_name
=
_SetupFiles
(
sequences
)
result
=
_RunkClust
(
tmp_dir_name
,
clustering_thresh
,
create_alignments
)
_CleanupFiles
(
tmp_dir_name
)
return
result
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment