diff --git a/modules/seq/alg/src/hmm_pseudo_counts.cc b/modules/seq/alg/src/hmm_pseudo_counts.cc new file mode 100644 index 0000000000000000000000000000000000000000..ab911d62a6fb79548740898cd0c0b2f43ecfc939 --- /dev/null +++ b/modules/seq/alg/src/hmm_pseudo_counts.cc @@ -0,0 +1,140 @@ +#include <ost/seq/alg/hmm_pseudo_counts.hh> + + +namespace { + + + // to mimic the HHblits behaviour, R is based on the Gonnet substitution matrix + // every entry R[a][b] corresponds to P(a|b) + const Real R[20][20] = { + {0.133435, 0.0861429, 0.0716694, 0.0767892, 0.0451931, 0.0861612, 0.0638644, 0.0638589, 0.0700223, 0.0582424, 0.065352, 0.0716654, 0.0822722, 0.0733284, 0.0668668, 0.0989164, 0.0881558, 0.0785747, 0.0335151, 0.0462615}, + {0.0212019, 0.266857, 0.00903972, 0.009469, 0.0157131, 0.01193, 0.0140111, 0.0146727, 0.00991927, 0.013372, 0.0153475, 0.0124885, 0.00924679, 0.0108782, 0.0113849, 0.0193327, 0.0168459, 0.018897, 0.0149936, 0.0168408}, + {0.0504801, 0.0258694, 0.15964, 0.100729, 0.0191763, 0.0553552, 0.0593027, 0.0225536, 0.0606905, 0.0215294, 0.0270972, 0.0897624, 0.0460354, 0.0665564, 0.0504718, 0.0606802, 0.0540841, 0.027733, 0.0163566, 0.0283808}, + {0.0598873, 0.0300042, 0.111533, 0.1372, 0.0243854, 0.0498144, 0.0656566, 0.0321571, 0.0789513, 0.0314283, 0.0377994, 0.0736845, 0.0533622, 0.0885886, 0.0656581, 0.0627186, 0.0585313, 0.0386611, 0.0222899, 0.0321599}, + {0.0206017, 0.0291031, 0.0124111, 0.0142537, 0.175334, 0.0105647, 0.0341859, 0.0440577, 0.0163643, 0.0554499, 0.0505966, 0.0171498, 0.0145874, 0.0192437, 0.0167459, 0.0183671, 0.021082, 0.0358121, 0.0801796, 0.113219}, + {0.0848077, 0.0477099, 0.0773563, 0.0628701, 0.0228112, 0.34552, 0.054741, 0.0268108, 0.0586764, 0.0274347, 0.0337462, 0.0828826, 0.0522808, 0.0600294, 0.0600437, 0.0828881, 0.058672, 0.0353579, 0.0300674, 0.0300801}, + {0.0306873, 0.0273537, 0.0404564, 0.0404524, 0.0360342, 0.0267232, 0.146872, 0.0222367, 0.0423625, 0.0238214, 0.0273249, 0.0486327, 0.0286452, 0.0486303, 0.042362, 0.0352323, 0.0344236, 0.0232737, 0.0307088, 0.0222177}, + {0.0420777, 0.0392812, 0.0210988, 0.027169, 0.0636824, 0.017948, 0.0304931, 0.127064, 0.0311844, 0.0963867, 0.0899444, 0.0265473, 0.0278066, 0.0326653, 0.0291056, 0.0334299, 0.0440638, 0.10328, 0.0334349, 0.0430405}, + {0.0544335, 0.0313295, 0.0669828, 0.0786964, 0.0279058, 0.0463415, 0.0685348, 0.0367906, 0.124704, 0.0368031, 0.0432189, 0.0717651, 0.0519718, 0.0842986, 0.111139, 0.0610879, 0.0610786, 0.040354, 0.0266998, 0.0368013}, + {0.0759224, 0.0708227, 0.0398451, 0.0525312, 0.158562, 0.0363335, 0.0646247, 0.190686, 0.0617143, 0.251356, 0.190682, 0.0501533, 0.0589235, 0.0692529, 0.0602971, 0.0617101, 0.074195, 0.151465, 0.0852309, 0.100056}, + {0.0187229, 0.0178647, 0.0110218, 0.0138856, 0.0317983, 0.00982238, 0.0162919, 0.0391074, 0.0159279, 0.0419077, 0.0592039, 0.0132612, 0.0126454, 0.0174664, 0.0148745, 0.0159425, 0.0191539, 0.0317932, 0.0174792, 0.0210003}, + {0.0375111, 0.0265585, 0.066705, 0.0494529, 0.0196915, 0.0440748, 0.052976, 0.0210883, 0.0483208, 0.0201381, 0.0242281, 0.096418, 0.0326617, 0.0472207, 0.0430638, 0.0494368, 0.0450912, 0.0242096, 0.0175593, 0.0291164}, + {0.0486405, 0.0222116, 0.0386411, 0.0404524, 0.0189187, 0.0314024, 0.0352449, 0.0249495, 0.039526, 0.026724, 0.0260953, 0.0368921, 0.261161, 0.0433597, 0.0369035, 0.0497801, 0.0464422, 0.0299902, 0.0143521, 0.0222177}, + {0.0312223, 0.0188189, 0.0402341, 0.0483655, 0.0179742, 0.0259677, 0.0430922, 0.0211081, 0.0461724, 0.0226204, 0.0259586, 0.0384126, 0.0312272, 0.0608874, 0.0461829, 0.0342238, 0.0326925, 0.0231499, 0.0175593, 0.0220909}, + {0.0447524, 0.0309584, 0.0479587, 0.0563456, 0.0245857, 0.0408271, 0.059004, 0.0295632, 0.0956849, 0.0309579, 0.0347482, 0.0550639, 0.0417761, 0.072593, 0.151669, 0.049072, 0.049074, 0.0324263, 0.0355997, 0.0339353}, + {0.0601482, 0.0477629, 0.0523859, 0.0489009, 0.0244998, 0.0512063, 0.0445856, 0.0308503, 0.0477837, 0.0287859, 0.0338373, 0.057432, 0.0511994, 0.0488754, 0.0445844, 0.077481, 0.065948, 0.0370921, 0.0218089, 0.0301309}, + {0.0817285, 0.0634542, 0.0711878, 0.0695787, 0.0428747, 0.0552625, 0.0664169, 0.0619975, 0.072842, 0.0527675, 0.061982, 0.0798664, 0.0728268, 0.0711834, 0.067978, 0.100547, 0.12659, 0.0711838, 0.0318313, 0.0459572}, + {0.0744872, 0.0727841, 0.0373259, 0.0469937, 0.0744727, 0.0340536, 0.0459161, 0.148588, 0.0492103, 0.110149, 0.105201, 0.0438468, 0.0480877, 0.0515413, 0.0459295, 0.0578264, 0.0727877, 0.159228, 0.0400096, 0.0564827}, + {0.00545379, 0.00991306, 0.0037789, 0.00465085, 0.0286213, 0.00497084, 0.0103997, 0.0082571, 0.00558903, 0.0106396, 0.00992804, 0.00545903, 0.0039503, 0.00671079, 0.00865564, 0.00583628, 0.00558714, 0.00686789, 0.328817, 0.0321092}, + {0.0237983, 0.0351993, 0.0207284, 0.0212132, 0.127766, 0.0157211, 0.0237862, 0.0336026, 0.0243534, 0.0394855, 0.0377084, 0.0286163, 0.0193322, 0.02669, 0.0260839, 0.0254908, 0.025501, 0.0306509, 0.101507, 0.237902} + }; + +} + +namespace ost{ namespace seq{ namespace alg{ + + +void AddTransitionPseudoCounts(ost::seq::ProfileHandle& profile) { + + // a priori probabilities estimated with default values of HHblits + Real pM2D = 0.15 * 0.0286; + Real pM2I = pM2D; + Real pM2M = 1 - pM2D - pM2I; + Real pI2I = 0.75; + Real pI2M = 1 - pI2I; + Real pD2D = 0.75; + Real pD2M = 1 - pD2D; + + for (size_t col_idx = 0; col_idx < profile.size(); ++col_idx) { + + ost::seq::HMMDataPtr data = profile[col_idx].GetHMMData(); + + // Transitions from M state + Real neff = data->GetNeff(); + Real p0 = (neff - 1) * data->GetProb(ost::seq::HMM_M2M) + pM2M; + Real p1 = (neff - 1) * data->GetProb(ost::seq::HMM_M2D) + pM2D; + Real p2 = (neff - 1) * data->GetProb(ost::seq::HMM_M2I) + pM2I; + Real sum = p0 + p1 + p2; + data->SetProb(ost::seq::HMM_M2M, p0/sum); + data->SetProb(ost::seq::HMM_M2D, p1/sum); + data->SetProb(ost::seq::HMM_M2I, p2/sum); + + // Transitions from I state + Real neff_i = data->GetNeff_I(); + p0 = neff_i * data->GetProb(ost::seq::HMM_I2M) + pI2M; + p1 = neff_i * data->GetProb(ost::seq::HMM_I2I) + pI2I; + sum = p0 + p1; + data->SetProb(ost::seq::HMM_I2M, p0/sum); + data->SetProb(ost::seq::HMM_I2I, p1/sum); + + // Transitions from D state + Real neff_d = data->GetNeff_D(); + p0 = neff_d * data->GetProb(ost::seq::HMM_D2M) + pD2M; + p1 = neff_d * data->GetProb(ost::seq::HMM_D2D) + pD2D; + sum = p0 + p1; + data->SetProb(ost::seq::HMM_D2M, p0/sum); + data->SetProb(ost::seq::HMM_D2D, p1/sum); + } +} + + +void AddAAPseudoCounts(ost::seq::ProfileHandle& profile) { + + + //////////////////// + // do frequencies // + //////////////////// + Real full_admixture [20]; + for(size_t col_idx = 0; col_idx < profile.size(); ++col_idx) { + Real* col_freq = profile[col_idx].freqs_begin(); + for(int i = 0; i < 20; ++i) { + full_admixture[i] = 0.0; + for(int j = 0; j < 20; ++j) { + full_admixture[i] += R[i][j] * col_freq[j]; + } + } + // tau estimated as in hhblits in diversity dependent mode: + // tau = a/(1+((Neff[i]-1)/b)^c) with default values a=1.0, b=1.5, c=1.0 + // this is the equation they write in HHblits when you display the help + // (Neff[i]-1) got rid of the -1. Well, that's how HHblits implements it + Real neff = profile[col_idx].GetHMMData()->GetNeff(); + Real tau = std::min(1.0, 1.0 / (1.0 + (neff) / 1.5)); + for (int i = 0; i < 20; ++i) { + col_freq[i] = (1. - tau) * col_freq[i] + tau * full_admixture[i]; + } + } + + ///////////////////////// + // do null_frequencies // + ///////////////////////// + Real mixing_factor = 100.0 / profile.GetNeff(); + const Real* current_null_freq = profile.GetNullModel().freqs_begin(); + Real null_freq[20]; + for(int i = 0; i < 20; ++i) { + null_freq[i] = current_null_freq[i] * mixing_factor; + } + for(size_t i = 0; i < profile.size(); ++i) { + const Real* freq = profile[i].freqs_begin(); + for(int j = 0; j < 20; ++j) { + null_freq[j] += freq[j]; + } + } + // normalize + Real summed_p = 0.0; + for(int i = 0; i < 20; ++i) { + summed_p += null_freq[i]; + } + Real factor = 1.0/summed_p; + for(int i = 0; i < 20; ++i) { + null_freq[i] *= factor; + } + // create new nullmodel and set it + ost::seq::ProfileColumn new_null_model; + Real* new_null_freq = new_null_model.freqs_begin(); + for(int i = 0; i < 20; ++i) { + new_null_freq[i] = null_freq[i]; + } + profile.SetNullModel(new_null_model); +} + +}}} // ns diff --git a/modules/seq/alg/src/hmm_pseudo_counts.hh b/modules/seq/alg/src/hmm_pseudo_counts.hh new file mode 100644 index 0000000000000000000000000000000000000000..48506f25dde5b673aa10c2cd9be32d9845b5038d --- /dev/null +++ b/modules/seq/alg/src/hmm_pseudo_counts.hh @@ -0,0 +1,33 @@ +//------------------------------------------------------------------------------ +// This file is part of the OpenStructure project <www.openstructure.org> +// +// Copyright (C) 2008-2020 by the OpenStructure authors +// +// This library is free software; you can redistribute it and/or modify it under +// the terms of the GNU Lesser General Public License as published by the Free +// Software Foundation; either version 3.0 of the License, or (at your option) +// any later version. +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more +// details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +//------------------------------------------------------------------------------ + +#ifndef OST_SEQ_ALG_HMM_PSEUDO_COUNTS_HH +#define OST_SEQ_ALG_HMM_PSEUDO_COUNTS_HH + +#include <ost/seq/profile_handle.hh> + +namespace ost{ namespace seq{ namespace alg{ + +void AddTransitionPseudoCounts(ost::seq::ProfileHandle& profile); + +void AddAAPseudoCounts(ost::seq::ProfileHandle& profile); + +}}} // ns + +#endif