Skip to content
Snippets Groups Projects
Commit 910df3ae authored by Studer Gabriel's avatar Studer Gabriel
Browse files

Algorithms to add pseudo counts to ProfileHandle objects which contain HMM information

parent 71f1450d
No related branches found
No related tags found
No related merge requests found
#include <ost/seq/alg/hmm_pseudo_counts.hh>
namespace {
// to mimic the HHblits behaviour, R is based on the Gonnet substitution matrix
// every entry R[a][b] corresponds to P(a|b)
const Real R[20][20] = {
{0.133435, 0.0861429, 0.0716694, 0.0767892, 0.0451931, 0.0861612, 0.0638644, 0.0638589, 0.0700223, 0.0582424, 0.065352, 0.0716654, 0.0822722, 0.0733284, 0.0668668, 0.0989164, 0.0881558, 0.0785747, 0.0335151, 0.0462615},
{0.0212019, 0.266857, 0.00903972, 0.009469, 0.0157131, 0.01193, 0.0140111, 0.0146727, 0.00991927, 0.013372, 0.0153475, 0.0124885, 0.00924679, 0.0108782, 0.0113849, 0.0193327, 0.0168459, 0.018897, 0.0149936, 0.0168408},
{0.0504801, 0.0258694, 0.15964, 0.100729, 0.0191763, 0.0553552, 0.0593027, 0.0225536, 0.0606905, 0.0215294, 0.0270972, 0.0897624, 0.0460354, 0.0665564, 0.0504718, 0.0606802, 0.0540841, 0.027733, 0.0163566, 0.0283808},
{0.0598873, 0.0300042, 0.111533, 0.1372, 0.0243854, 0.0498144, 0.0656566, 0.0321571, 0.0789513, 0.0314283, 0.0377994, 0.0736845, 0.0533622, 0.0885886, 0.0656581, 0.0627186, 0.0585313, 0.0386611, 0.0222899, 0.0321599},
{0.0206017, 0.0291031, 0.0124111, 0.0142537, 0.175334, 0.0105647, 0.0341859, 0.0440577, 0.0163643, 0.0554499, 0.0505966, 0.0171498, 0.0145874, 0.0192437, 0.0167459, 0.0183671, 0.021082, 0.0358121, 0.0801796, 0.113219},
{0.0848077, 0.0477099, 0.0773563, 0.0628701, 0.0228112, 0.34552, 0.054741, 0.0268108, 0.0586764, 0.0274347, 0.0337462, 0.0828826, 0.0522808, 0.0600294, 0.0600437, 0.0828881, 0.058672, 0.0353579, 0.0300674, 0.0300801},
{0.0306873, 0.0273537, 0.0404564, 0.0404524, 0.0360342, 0.0267232, 0.146872, 0.0222367, 0.0423625, 0.0238214, 0.0273249, 0.0486327, 0.0286452, 0.0486303, 0.042362, 0.0352323, 0.0344236, 0.0232737, 0.0307088, 0.0222177},
{0.0420777, 0.0392812, 0.0210988, 0.027169, 0.0636824, 0.017948, 0.0304931, 0.127064, 0.0311844, 0.0963867, 0.0899444, 0.0265473, 0.0278066, 0.0326653, 0.0291056, 0.0334299, 0.0440638, 0.10328, 0.0334349, 0.0430405},
{0.0544335, 0.0313295, 0.0669828, 0.0786964, 0.0279058, 0.0463415, 0.0685348, 0.0367906, 0.124704, 0.0368031, 0.0432189, 0.0717651, 0.0519718, 0.0842986, 0.111139, 0.0610879, 0.0610786, 0.040354, 0.0266998, 0.0368013},
{0.0759224, 0.0708227, 0.0398451, 0.0525312, 0.158562, 0.0363335, 0.0646247, 0.190686, 0.0617143, 0.251356, 0.190682, 0.0501533, 0.0589235, 0.0692529, 0.0602971, 0.0617101, 0.074195, 0.151465, 0.0852309, 0.100056},
{0.0187229, 0.0178647, 0.0110218, 0.0138856, 0.0317983, 0.00982238, 0.0162919, 0.0391074, 0.0159279, 0.0419077, 0.0592039, 0.0132612, 0.0126454, 0.0174664, 0.0148745, 0.0159425, 0.0191539, 0.0317932, 0.0174792, 0.0210003},
{0.0375111, 0.0265585, 0.066705, 0.0494529, 0.0196915, 0.0440748, 0.052976, 0.0210883, 0.0483208, 0.0201381, 0.0242281, 0.096418, 0.0326617, 0.0472207, 0.0430638, 0.0494368, 0.0450912, 0.0242096, 0.0175593, 0.0291164},
{0.0486405, 0.0222116, 0.0386411, 0.0404524, 0.0189187, 0.0314024, 0.0352449, 0.0249495, 0.039526, 0.026724, 0.0260953, 0.0368921, 0.261161, 0.0433597, 0.0369035, 0.0497801, 0.0464422, 0.0299902, 0.0143521, 0.0222177},
{0.0312223, 0.0188189, 0.0402341, 0.0483655, 0.0179742, 0.0259677, 0.0430922, 0.0211081, 0.0461724, 0.0226204, 0.0259586, 0.0384126, 0.0312272, 0.0608874, 0.0461829, 0.0342238, 0.0326925, 0.0231499, 0.0175593, 0.0220909},
{0.0447524, 0.0309584, 0.0479587, 0.0563456, 0.0245857, 0.0408271, 0.059004, 0.0295632, 0.0956849, 0.0309579, 0.0347482, 0.0550639, 0.0417761, 0.072593, 0.151669, 0.049072, 0.049074, 0.0324263, 0.0355997, 0.0339353},
{0.0601482, 0.0477629, 0.0523859, 0.0489009, 0.0244998, 0.0512063, 0.0445856, 0.0308503, 0.0477837, 0.0287859, 0.0338373, 0.057432, 0.0511994, 0.0488754, 0.0445844, 0.077481, 0.065948, 0.0370921, 0.0218089, 0.0301309},
{0.0817285, 0.0634542, 0.0711878, 0.0695787, 0.0428747, 0.0552625, 0.0664169, 0.0619975, 0.072842, 0.0527675, 0.061982, 0.0798664, 0.0728268, 0.0711834, 0.067978, 0.100547, 0.12659, 0.0711838, 0.0318313, 0.0459572},
{0.0744872, 0.0727841, 0.0373259, 0.0469937, 0.0744727, 0.0340536, 0.0459161, 0.148588, 0.0492103, 0.110149, 0.105201, 0.0438468, 0.0480877, 0.0515413, 0.0459295, 0.0578264, 0.0727877, 0.159228, 0.0400096, 0.0564827},
{0.00545379, 0.00991306, 0.0037789, 0.00465085, 0.0286213, 0.00497084, 0.0103997, 0.0082571, 0.00558903, 0.0106396, 0.00992804, 0.00545903, 0.0039503, 0.00671079, 0.00865564, 0.00583628, 0.00558714, 0.00686789, 0.328817, 0.0321092},
{0.0237983, 0.0351993, 0.0207284, 0.0212132, 0.127766, 0.0157211, 0.0237862, 0.0336026, 0.0243534, 0.0394855, 0.0377084, 0.0286163, 0.0193322, 0.02669, 0.0260839, 0.0254908, 0.025501, 0.0306509, 0.101507, 0.237902}
};
}
namespace ost{ namespace seq{ namespace alg{
void AddTransitionPseudoCounts(ost::seq::ProfileHandle& profile) {
// a priori probabilities estimated with default values of HHblits
Real pM2D = 0.15 * 0.0286;
Real pM2I = pM2D;
Real pM2M = 1 - pM2D - pM2I;
Real pI2I = 0.75;
Real pI2M = 1 - pI2I;
Real pD2D = 0.75;
Real pD2M = 1 - pD2D;
for (size_t col_idx = 0; col_idx < profile.size(); ++col_idx) {
ost::seq::HMMDataPtr data = profile[col_idx].GetHMMData();
// Transitions from M state
Real neff = data->GetNeff();
Real p0 = (neff - 1) * data->GetProb(ost::seq::HMM_M2M) + pM2M;
Real p1 = (neff - 1) * data->GetProb(ost::seq::HMM_M2D) + pM2D;
Real p2 = (neff - 1) * data->GetProb(ost::seq::HMM_M2I) + pM2I;
Real sum = p0 + p1 + p2;
data->SetProb(ost::seq::HMM_M2M, p0/sum);
data->SetProb(ost::seq::HMM_M2D, p1/sum);
data->SetProb(ost::seq::HMM_M2I, p2/sum);
// Transitions from I state
Real neff_i = data->GetNeff_I();
p0 = neff_i * data->GetProb(ost::seq::HMM_I2M) + pI2M;
p1 = neff_i * data->GetProb(ost::seq::HMM_I2I) + pI2I;
sum = p0 + p1;
data->SetProb(ost::seq::HMM_I2M, p0/sum);
data->SetProb(ost::seq::HMM_I2I, p1/sum);
// Transitions from D state
Real neff_d = data->GetNeff_D();
p0 = neff_d * data->GetProb(ost::seq::HMM_D2M) + pD2M;
p1 = neff_d * data->GetProb(ost::seq::HMM_D2D) + pD2D;
sum = p0 + p1;
data->SetProb(ost::seq::HMM_D2M, p0/sum);
data->SetProb(ost::seq::HMM_D2D, p1/sum);
}
}
void AddAAPseudoCounts(ost::seq::ProfileHandle& profile) {
////////////////////
// do frequencies //
////////////////////
Real full_admixture [20];
for(size_t col_idx = 0; col_idx < profile.size(); ++col_idx) {
Real* col_freq = profile[col_idx].freqs_begin();
for(int i = 0; i < 20; ++i) {
full_admixture[i] = 0.0;
for(int j = 0; j < 20; ++j) {
full_admixture[i] += R[i][j] * col_freq[j];
}
}
// tau estimated as in hhblits in diversity dependent mode:
// tau = a/(1+((Neff[i]-1)/b)^c) with default values a=1.0, b=1.5, c=1.0
// this is the equation they write in HHblits when you display the help
// (Neff[i]-1) got rid of the -1. Well, that's how HHblits implements it
Real neff = profile[col_idx].GetHMMData()->GetNeff();
Real tau = std::min(1.0, 1.0 / (1.0 + (neff) / 1.5));
for (int i = 0; i < 20; ++i) {
col_freq[i] = (1. - tau) * col_freq[i] + tau * full_admixture[i];
}
}
/////////////////////////
// do null_frequencies //
/////////////////////////
Real mixing_factor = 100.0 / profile.GetNeff();
const Real* current_null_freq = profile.GetNullModel().freqs_begin();
Real null_freq[20];
for(int i = 0; i < 20; ++i) {
null_freq[i] = current_null_freq[i] * mixing_factor;
}
for(size_t i = 0; i < profile.size(); ++i) {
const Real* freq = profile[i].freqs_begin();
for(int j = 0; j < 20; ++j) {
null_freq[j] += freq[j];
}
}
// normalize
Real summed_p = 0.0;
for(int i = 0; i < 20; ++i) {
summed_p += null_freq[i];
}
Real factor = 1.0/summed_p;
for(int i = 0; i < 20; ++i) {
null_freq[i] *= factor;
}
// create new nullmodel and set it
ost::seq::ProfileColumn new_null_model;
Real* new_null_freq = new_null_model.freqs_begin();
for(int i = 0; i < 20; ++i) {
new_null_freq[i] = null_freq[i];
}
profile.SetNullModel(new_null_model);
}
}}} // ns
//------------------------------------------------------------------------------
// This file is part of the OpenStructure project <www.openstructure.org>
//
// Copyright (C) 2008-2020 by the OpenStructure authors
//
// This library is free software; you can redistribute it and/or modify it under
// the terms of the GNU Lesser General Public License as published by the Free
// Software Foundation; either version 3.0 of the License, or (at your option)
// any later version.
// This library is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
// details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with this library; if not, write to the Free Software Foundation, Inc.,
// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//------------------------------------------------------------------------------
#ifndef OST_SEQ_ALG_HMM_PSEUDO_COUNTS_HH
#define OST_SEQ_ALG_HMM_PSEUDO_COUNTS_HH
#include <ost/seq/profile_handle.hh>
namespace ost{ namespace seq{ namespace alg{
void AddTransitionPseudoCounts(ost::seq::ProfileHandle& profile);
void AddAAPseudoCounts(ost::seq::ProfileHandle& profile);
}}} // ns
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment