# seqalg.rst

from ost import seq
# fetch the predefined BLOSUM62 matrix and report the weight it assigns
# to the pair ('A', 'A')
mat = seq.alg.BLOSUM62
self_weight = mat.GetWeight('A', 'A')
print(self_weight)
# PRECONDITION: aln is a multiple sequence alignment whose first sequence
#               is the reference sequence; every other sequence has a
#               structure attached

# keep only the part of the alignment that is covered by at least
# min_seqs sequences (the reference counts as one of them)
# -> aln itself gets cut; clip_start is the 1st column of aln that was kept
min_seqs = 3
clip_start = seq.alg.ClipAlignment(aln, min_seqs)

# the distance map is the common input for both per-residue-pair
# measures: the variance and the distance to the mean
d_map = seq.alg.CreateDistanceMap(aln)
var_map, dist_to_mean = (
    seq.alg.CreateVarianceMap(d_map),
    seq.alg.CreateDist2Mean(d_map),
)

# print the smallest and largest variance
var_min = var_map.Min()
var_max = var_map.Max()
print("MIN-MAX:", var_min, "-", var_max)

# export both maps as data and as json-strings for further processing
var_map_data, var_map_json = var_map.GetData(), var_map.GetJsonString()
dist_to_mean_data, dist_to_mean_json = (
    dist_to_mean.GetData(),
    dist_to_mean.GetJsonString(),
)
from ost import io, seq

# load the query and template profiles together with their alignment
prof_query = io.LoadSequenceProfile("query.hhm")
prof_tpl = io.LoadSequenceProfile("tpl.hhm")
aln = io.LoadAlignment("aln.fasta")
profiles = (prof_query, prof_tpl)

# step 1: pseudo counts for the transition probabilities of both profiles
for prof in profiles:
    seq.alg.AddTransitionPseudoCounts(prof)

# step 2: pseudo counts for the emission probabilities.
# hhblits/hhalign 3 treat query and template differently for reasons of
# computational efficiency: the query receives the more expensive
# Angermueller et al. pseudo counts, which need a context profile library.
path_to_crf = "/path/to/hh-suite/data/context_data.crf"
lib = seq.alg.ContextProfileDB.FromCRF(path_to_crf)
seq.alg.AddAAPseudoCounts(prof_query, lib)

# the template receives the computationally cheaper pseudo counts
# derived from a Gonnet substitution matrix (no library passed)
seq.alg.AddAAPseudoCounts(prof_tpl)

# step 3: null model pseudo counts.
# These must come AFTER the emission pseudo counts of step 2, because
# the order affects the result.
for prof in profiles:
    seq.alg.AddNullPseudoCounts(prof)

# score the two profiles along aln; the trailing 0 and 1 are presumably
# the indices of the query and template sequences in aln -- confirm
# against the HMMScore documentation
hmm_score = seq.alg.HMMScore(prof_query, prof_tpl, aln, 0, 1)
print("score:", hmm_score)