From c37a34cdc2e50fd95035ddfb52c0a0db83833ec5 Mon Sep 17 00:00:00 2001 From: Gabriel Studer <gabriel.studer@unibas.ch> Date: Fri, 9 Nov 2018 15:56:35 +0100 Subject: [PATCH] Optionally parse sequence profiles with the PM3ArgumentParser The user provided profiles are mapped on the gapless target sequences by exact sequence match. The mapping works with an all or nothing principle. Either all target sequences must be covered by a profile or none. An error is thrown otherwise. To avoid ambiguities, the sequences of all provided profiles must be unique => in case of a homo-oligomer, one profile must be provided. If an additional profile is provided that does not match any target sequence, an error is thrown. --- core/pymod/pm3argparse.py | 101 ++++++++++- core/tests/CMakeLists.txt | 5 + core/tests/data/aln_tpl/1crn.hhm | 180 ++++++++++++++++++++ core/tests/data/aln_tpl/1crn_corrupted.hhm | 185 +++++++++++++++++++++ core/tests/data/aln_tpl/5ua4_B.fasta | 4 + core/tests/data/aln_tpl/5ua4_B.hhm | 144 ++++++++++++++++ core/tests/test_pm3argparse.py | 108 +++++++++++- 7 files changed, 725 insertions(+), 2 deletions(-) create mode 100644 core/tests/data/aln_tpl/1crn.hhm create mode 100644 core/tests/data/aln_tpl/1crn_corrupted.hhm create mode 100644 core/tests/data/aln_tpl/5ua4_B.fasta create mode 100644 core/tests/data/aln_tpl/5ua4_B.hhm diff --git a/core/pymod/pm3argparse.py b/core/pymod/pm3argparse.py index b8b6a640..33aa1fac 100644 --- a/core/pymod/pm3argparse.py +++ b/core/pymod/pm3argparse.py @@ -232,6 +232,17 @@ def _LoadEntity(filename): str(exc), 33) return ent +def _FetchProfileFromFile(filename): + """Load generic profile file from filename and return it.""" + argstr = "'--seqprof " + filename + "'" + helper.FileExists("Profile", 51, filename) + try: + prof = io.LoadSequenceProfile(filename) + except Exception, exc: + helper.MsgErrorAndExit(argstr + ": failure to parse profile file: " + + str(exc), 52) + return prof + def _GetChains(structures, structure_sources): """Get chain id to entity view (single chain) mapping (dict).""" # IDs: (file_base = base file name with no extensions) @@ -398,6 +409,8 @@ class PM3ArgumentParser(argparse.ArgumentParser): self._AssembleAlignment() if 'STRUCTURE' in self.activate: self._AssembleStructure() + if 'PROFILE' in self.activate: + self._AssembleProfile() def AddAlignment(self, allow_multitemplate=False): """Commandline options for alignments. @@ -474,7 +487,7 @@ class PM3ArgumentParser(argparse.ArgumentParser): def AddStructure(self, attach_views=False): """Commandline options for structures. - Activate everything needed to load alignments to the argument parser. + Activate everything needed to load structures to the argument parser. Command line arguments are then added in :meth:`AssembleParser` and the input is post processed and checked in :meth:`Parse`. @@ -532,6 +545,46 @@ class PM3ArgumentParser(argparse.ArgumentParser): if attach_views: self.activate.add('ATTACH_VIEWS') + def AddProfile(self): + """Commandline options for profiles + + Activate everything needed to load profiles to the argument parser. + Command line arguments are then added in :meth:`AssembleParser` and the + input is post processed and checked in :meth:`Parse`. + + Options/arguments added: + + * ``-s/--seqprof <FILE>`` - Sequence profile in any format readable + by the :meth:`ost.io.LoadSequenceProfile` method. Format is chosen by + file ending. Recognized file extensions: .hhm, .hhm.gz, .pssm, + .pssm.gz. + + Notes: + + * the profiles are mapped based on exact matches towards the gapless + target sequences, i.e. one profile is mapped to several chains in + case of homo-oligomers + + * every profile must have a unique sequence to avoid ambiguities + + * all or nothing - you cannot provide profiles for only a subset of + target sequences + + Attributes added to the namespace returned by :meth:`Parse`: + + * :attr:`profiles` - :class:`list` of :class:`ost.seq.ProfileHandle`, + ordered to match the target sequences. + + Exit codes related to profile input: + + * 51 - a given profile file does not exist + * 52 - failure to read a given profile file + * 53 - a profile cannot be mapped to any target sequence + * 54 - profile sequences are not unique + * 55 - only subset of target sequences is covered by profile + """ + self.activate.add('PROFILE') + def _AssembleAlignment(self): """Actually add alignment arguments/options.""" aln_grp = self.add_mutually_exclusive_group(required=True) @@ -571,6 +624,14 @@ class PM3ArgumentParser(argparse.ArgumentParser): ".ent.gz, .pdb.gz, .cif, .cif.gz.", action='append', default=list()) + def _AssembleProfile(self): + self.add_argument('-s', '--seqprof', metavar=('<FILE>'), + help="Sequence profile in any format readable by "+ + "OST's io.LoadSequenceProfile method. Format is "+ + "chosen by file ending. Recognized File Extensions: "+ + ".hhm, .hhm.gz, .pssm, .pssm.gz", action='append', + default=list()) + class PM3OptionsNamespace(object): # class will grow, so for the moment pylint is ignored #pylint: disable=too-few-public-methods @@ -591,6 +652,8 @@ class PM3OptionsNamespace(object): self._PostProcessStructure() if 'ATTACH_VIEWS' in activated: self._AttachViews() + if 'PROFILE' in activated: + self._PostProcessProfile() def _PostProcessAlignment(self): #pylint: disable=no-member @@ -640,6 +703,42 @@ class PM3OptionsNamespace(object): for aln in self.alignments: _AttachViewsToAln(aln, chain_entities) + def _PostProcessProfile(self): + """Get Profiles from command line input.""" + self.profiles = list() + + if len(self.seqprof) == 0: + # no profiles provided, remember the all or nothing principle + # so not having any profile is fine + return + + loaded_profiles = list() + for src in self.seqprof: + loaded_profiles.append(_FetchProfileFromFile(src)) + + prof_sequences = [p.sequence for p in loaded_profiles] + + # check uniqueness of loaded profiles + if len(set(prof_sequences)) != len(prof_sequences): + helper.MsgErrorAndExit("All sequence profiles must have unique " + + "sequence.", 54) + + # map onto alignment target sequences + trg_sequences = [aln.GetSequence(0).GetGaplessString() \ + for aln in self.alignments] + for s in trg_sequences: + try: + self.profiles.append(loaded_profiles[prof_sequences.index(s)]) + except Exception, exc: + helper.MsgErrorAndExit("Could not find profile with sequence " + + "that exactly matches trg seq: " + s, 55) + + # We found a profile for every target sequence. So if the size of unique + # target sequences is not the same as for unique profile sequences, + # we know that we have additional profiles that never got mapped + if len(set(trg_sequences)) != len(set(prof_sequences)): + helper.MsgErrorAndExit("Could not map every profile to a target " + + "sequence", 53) # LocalWords: param attr prog argparse ArgumentParser bool sys os init str # LocalWords: progattr descattr argpinit argv formatter meth args namespace diff --git a/core/tests/CMakeLists.txt b/core/tests/CMakeLists.txt index b7eebec0..829a9961 100644 --- a/core/tests/CMakeLists.txt +++ b/core/tests/CMakeLists.txt @@ -23,6 +23,10 @@ set(CORE_TEST_DATA data/aln_tpl/5d52-1_cut.pdb data/aln_tpl/5d52-1_cut_A.fasta data/aln_tpl/5d52-1_cut_B.fasta + data/aln_tpl/1crn.hhm + data/aln_tpl/1crn_corrupted.hhm + data/aln_tpl/5ua4_B.fasta + data/aln_tpl/5ua4_B.hhm data/fasta/alignment.fas data/fasta/1ake.fas data/fasta/1ake_1.fas @@ -31,6 +35,7 @@ set(CORE_TEST_DATA data/fasta/1ake.fas.gz data/fasta/1ake_nel.fas data/fasta/1ake_sw.fas + ) promod3_unittest(MODULE core diff --git a/core/tests/data/aln_tpl/1crn.hhm b/core/tests/data/aln_tpl/1crn.hhm new file mode 100644 index 00000000..eb35d071 --- /dev/null +++ b/core/tests/data/aln_tpl/1crn.hhm @@ -0,0 +1,180 @@ +HHsearch 1.5 +NAME c6a0deb50c4f69619d193f0fde0517c2 +FAM +FILE seq01 +COM /scicore/soft/apps/HH-suite/2.0.16-goolf-1.4.10/bin/hhmake -i /scratch/14369431.1.short.q/tmpqOaEJI/seq01.a3m -o /scratch/14369431.1.short.q/tmpqOaEJI/seq01.hhm +DATE Mon Mar 7 14:19:45 2016 +LENG 46 match states, 46 columns in multiple alignment +FILT 55 out of 57 sequences passed filter (-id 90 -cov 0 -qid 0 -qsc -20.00 -diff 100) +NEFF 3.8 +SEQ +>ss_pred PSIPRED predicted secondary structure +CCCCCCHHHHHHHHHCCCCCCCHHHHHHHCCCEEECCCCCCCCCCC +>ss_conf PSIPRED confidence values +9877783554421111168998455542048366208989999999 +>Consensus +xsCCpstxaRnxYnxCrxxgxsxxxCaxxsgCkixsgxxCPxxyxx +>c6a0deb50c4f69619d193f0fde0517c2 +TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN +>gi|115605364|gb|ABJ15789.1| putative thionin precursor [Polygonum sibiricum] +-SCCQTTTARNIYNSCRLAGGSRERCASLSGCKHVTGNTCSPGWEK +>gi|4007745|emb|CAA65316.1| purothionin [Secale cereale] +-SCCKSTLGRNCYNLCRTRGAQK-LCANFCRCKLISSTSCPKEFPK +>gi|17381172|gb|AAL36398.1| putative thionin protein [Arabidopsis thaliana]gi|62320644|dbj|BAD95310.1| putative thionin [Arabidopsis thaliana] +-TCCPSQSTRKGFEDCISEGNLQILCSAESGCRDTYVGYCPSGFPY +>gi|802170|gb|AAB33011.1| crambin precursor=thionin variant Thi2Ca12 [Crambe abyssinica, seeds, Peptide Partial, 135 aa] +-SCCPTKSARNTFDVCRLTGTSMGLCAAISECKILSVTKCPSNLPY +>gi|21553588|gb|AAM62681.1| thionin Thi2.2 [Arabidopsis thaliana] +-ICCPTKDDRSVYFVCMLSVSSQFYCLLKSKCKNTSQTICPPGYTN +>gi|15218931|ref|NP_176784.1| thionin [Arabidopsis thaliana]gi|44888531|sp|Q9C8D6.1|THN24_ARATH RecName: Full=Probable thionin-2.4; Contains: RecName: Full=Probable thionin-2.4; Contains: RecName: Full=Acidic protein; Flags: Precursorgi|12322605|gb|AAG51299.1|AC026480_6 thionin, putative [Arabidopsis thaliana]gi|14190505|gb|AAK55733.1|AF380652_1 At1g66100/F15E12_20 [Arabidopsis thaliana]gi|15809774|gb|AAL06815.1| At1g66100/F15E12_20 [Arabidopsis thaliana]gi|332196341|gb|AEE34462.1| thionin [Arabidopsis thaliana] +-ICCPSIQARTFYNACLFAVGSPSSCIRNSSCLDISESTCPRGYTN +>gi|1729954|sp|Q05806.1|THN5_WHEAT RecName: Full=Type-5 thionin; Contains: RecName: Full=Type-5 thionin; AltName: Full=Type V thionin; Contains: RecName: Full=Acidic protein; Flags: Precursorgi|21885|emb|CAA43844.1| wheat type V thionin [Triticum aestivum]gi|21887|emb|CAA43845.1| wheat type V thionin [Triticum aestivum] +-DCGANPFKVACFNSCLLGPSTVFQCADFCACRLPAG--------- +>gi|120564556|gb|ABM30200.1| thionin [Brassica juncea] +-SCCPSTAARWAYYLCTNSWPLTPLCISHTGC-IESETTCPPGYPY +>gi|545031|gb|AAB29760.1| thionin precursor {clone Thi1Va1} [Viscum album=mistletoe, Peptide, 115 aa] +-ICCRAPAGKKCYNLCTA-lLSSE-TCANTCYCKDVSGETCPAD--- +# +NULL 3706 5728 4211 4064 4839 3729 4763 4308 4069 3323 5509 4640 4464 4937 4285 4423 3815 3783 6325 4665 +HMM A C D E F G H I K L M N P Q R S T V W Y + M->M M->I M->D I->M I->I D->M D->D Neff Neff_I Neff_D + 0 * * 0 * 0 * * * * +T 1 * * * * * * * * * * * * * * * * 0 * * * 1 + 0 * * * * * * 1000 0 0 + +T 2 * * 4186 * * 5283 * 2677 * 5264 * * * * * 833 3153 3986 * * 2 + 0 * * * * * * 3941 0 0 + +C 3 * 0 * * * * * * * * * * * * * * * * * * 3 + 0 * * * * * * 3941 0 0 + +C 4 * 122 * * 5259 4186 * * * * * * * * * * * * * * 4 + 0 * * * * * * 3941 0 0 + +P 5 4186 * * * * * * * 3250 * * * 549 5753 2856 * * * * * 5 + 0 * * * * * * 3941 0 0 + +S 6 5271 * 4429 * * * * * * * * 2329 * * 5941 866 2612 * * * 6 + 0 * * * * * * 3941 0 0 + +I 7 * * 5440 4820 * * * 3810 4212 * 5935 4983 3272 3386 4684 5283 1068 5174 * * 7 + 0 * * * * * * 3941 0 0 + +V 8 2614 * 4805 * 4186 * * 3036 * 3320 * * * 4094 * 2750 1838 5660 5873 * 8 + 0 * * * * * * 3941 0 0 + +A 9 646 * 4264 * * 2749 * * 4186 * * * * * * 3803 4903 * * * 9 + 0 * * * * * * 3941 0 0 + +R 10 * * * * * * * 5975 4265 * * * * * 189 * * 4186 * * 10 + 0 * * * * * * 3941 0 0 + +S 11 4186 * * 3803 * * * 5935 3608 * * 836 * * * 2953 5214 * 4935 5259 11 + 0 * * * * * * 3941 0 0 + +N 12 4935 1544 * 4787 5214 3942 * 2698 5264 * 4684 3736 * 5164 5174 * 3934 3625 * * 12 + 0 * * * * * * 3941 0 0 + +F 13 * * * * 2199 * * * * * * * * * * * * * * 354 13 + 0 * * * * * * 3941 0 0 + +N 14 * * 4222 3844 4805 * * * * 5264 * 510 * * * 4983 5283 5464 * 4935 14 + 0 * * * * * * 3941 0 0 + +V 15 2800 * 3844 * * * * 5271 * 2403 5174 * * * * 2663 3074 1902 * * 15 + 0 * * * * * * 3941 0 0 + +C 16 * 0 * * * * * * * * * * * * * * * * * * 16 + 0 * * * * * * 3941 0 0 + +R 17 * * * * * * 6200 2932 * 3611 4805 * * * 558 * 4093 * * * 17 + 0 * * * * * * 3941 0 0 + +L 18 3969 * * * 2171 * * 3956 * 1318 5174 4935 * * 5264 3115 6073 4979 * * 18 + 38 * 5271 * * * * 3941 0 0 + +P 19 3003 * * 4863 * 3441 * * * 5901 * * 2369 4784 2418 3812 2907 3735 * 4814 19 + 41 * 5155 0 * 0 * 3898 1025 1025 + +G 20 * * * * 4949 454 6123 * * 4542 * * 3604 * * * * 3948 4909 * 20 + 72 * 4354 * * * 0 3901 0 1033 + +T 21 2209 5224 * * * 2278 * * * * * 3752 4867 * * 2344 2028 * * * 21 + 39 5224 * 0 * 669 1430 3845 1023 1191 + +P 22 5624 * * * * * * 5870 * 3138 * * 1935 4054 * 1200 3439 * * * 22 + 0 * * * * 0 * 3903 0 1033 + +E 23 5952 6850 * 3099 * * 4802 5475 3316 * 4903 * 4232 2872 1627 5880 4176 4193 * 5287 23 + 184 * 3059 * * * * 3943 0 0 + +A 24 4442 * 3710 2779 2989 4157 * 4702 * * * * 1565 * 5707 4382 3234 * * * 24 + 28 5707 * 0 * 0 * 3843 1000 1329 + +I 25 4358 * * * 6658 * 5279 3244 4611 2579 * 5958 * 4193 3566 3476 2812 2484 * 4816 25 + 0 * * * * * * 3943 0 0 + +C 26 * 31 * * * * * * * * * * * * * 5573 * * * * 26 + 0 * * * * * * 3943 0 0 + +A 27 560 6231 * * * 4233 * 3636 * 4029 4985 * * * * 3609 * * * * 27 + 0 * * * * * * 3943 0 0 + +T 28 2460 * 3819 * * 4273 * 5850 2738 4816 * 3203 * 4985 3239 2607 3596 * * * 28 + 0 * * * * * * 3943 0 0 + +Y 29 4929 * 5658 4919 2879 4802 4950 3737 4252 1699 5444 5287 5995 * 5262 * 3899 3734 * 4566 29 + 0 * * * * * * 3943 0 0 + +T 30 3911 1554 * * * * * * * * * * * * * 1040 3227 * * * 30 + 0 * * * * * * 3943 0 0 + +G 31 4193 * 3877 5850 * 811 * * 4816 * * * * * 2699 4815 4659 * * 5301 31 + 0 * * * * * * 3943 0 0 + +C 32 * 0 * * * * * * * * * * * * * * * * * * 32 + 47 * 4950 * * * * 3943 0 0 + +I 33 * * * 4672 * * * 2865 809 3414 * * * 4933 3475 * * 4789 * * 33 + 0 * * * * 0 * 3904 0 1044 + +I 34 * * 3573 4802 * * 3933 921 * 2779 * 3898 * * 4839 * 5475 5952 * * 34 + 0 * * * * * * 3943 0 0 + +I 35 * * 5287 4950 4358 * * 1603 * 3869 * * 4193 6130 * 3498 3006 2227 * * 35 + 46 * 4985 * * * * 3943 0 0 + +P 36 4173 * 2874 * 5250 * * 5139 * 4792 * * 4633 4645 5394 1055 4050 * * 3805 36 + 201 * 2941 * * * 0 3881 0 1045 + +G 37 5480 * 5485 3849 * 869 * * * * * * * 4548 5245 3819 4686 2648 * * 37 + 22 * 6064 * * 0 * 3796 0 1453 + +A 38 4564 * 4944 5167 * 1910 * * 4653 4470 * 3572 4946 * 5111 2927 1892 * * * 38 + 0 * * * * 0 * 3775 0 1000 + +T 39 * 4660 * 5366 4744 6406 * 3919 2193 * * 4099 * 5382 5207 3976 1348 * * 4782 39 + 0 * * * * * * 3764 0 0 + +C 40 * 58 * * * * * * 4660 * * * * * * * * * * * 40 + 0 * * * * * * 3764 0 0 + +P 41 * * 4306 * * * * * 3731 * * 6351 322 4587 * 5643 * * * * 41 + 0 * * * * * * 3764 0 0 + +G 42 4398 * 6402 * * 4592 * * 4079 * * 4374 1200 * 3395 2109 5063 * * * 42 + 0 * * * * * * 3764 0 0 + +D 43 * * 2024 6014 4769 1231 4660 * * * * 4386 3096 4744 * 4862 * * * * 43 + 0 * * * * * * 3764 0 0 + +Y 44 * * * * 2991 * 4830 * * 3705 * * * * * * * * 2806 691 44 + 0 * * * * * * 3718 0 0 + +A 45 4566 * 3231 5056 * * * 3467 * * * 5405 1034 * 5100 6384 2896 4532 * * 45 + 35 5366 * 4087 87 * * 3718 1002 0 + +N 46 * * 4370 * * * 2588 * 1723 * * 2200 * * 5953 * * * * 2008 46 + 0 * * 0 * * * 3699 0 0 + +// diff --git a/core/tests/data/aln_tpl/1crn_corrupted.hhm b/core/tests/data/aln_tpl/1crn_corrupted.hhm new file mode 100644 index 00000000..01765421 --- /dev/null +++ b/core/tests/data/aln_tpl/1crn_corrupted.hhm @@ -0,0 +1,185 @@ +####################################################################### +# THIS FILE IS CORRUPTED!!!!!!!!! # +# First three amino acids do not have the right number of frequencies # +####################################################################### + +HHsearch 1.5 +NAME c6a0deb50c4f69619d193f0fde0517c2 +FAM +FILE seq01 +COM /scicore/soft/apps/HH-suite/2.0.16-goolf-1.4.10/bin/hhmake -i /scratch/14369431.1.short.q/tmpqOaEJI/seq01.a3m -o /scratch/14369431.1.short.q/tmpqOaEJI/seq01.hhm +DATE Mon Mar 7 14:19:45 2016 +LENG 46 match states, 46 columns in multiple alignment +FILT 55 out of 57 sequences passed filter (-id 90 -cov 0 -qid 0 -qsc -20.00 -diff 100) +NEFF 3.8 +SEQ +>ss_pred PSIPRED predicted secondary structure +CCCCCCHHHHHHHHHCCCCCCCHHHHHHHCCCEEECCCCCCCCCCC +>ss_conf PSIPRED confidence values +9877783554421111168998455542048366208989999999 +>Consensus +xsCCpstxaRnxYnxCrxxgxsxxxCaxxsgCkixsgxxCPxxyxx +>c6a0deb50c4f69619d193f0fde0517c2 +TTCCPSIVARSNFNVCRLPGTPEAICATYTGCIIIPGATCPGDYAN +>gi|115605364|gb|ABJ15789.1| putative thionin precursor [Polygonum sibiricum] +-SCCQTTTARNIYNSCRLAGGSRERCASLSGCKHVTGNTCSPGWEK +>gi|4007745|emb|CAA65316.1| purothionin [Secale cereale] +-SCCKSTLGRNCYNLCRTRGAQK-LCANFCRCKLISSTSCPKEFPK +>gi|17381172|gb|AAL36398.1| putative thionin protein [Arabidopsis thaliana]gi|62320644|dbj|BAD95310.1| putative thionin [Arabidopsis thaliana] +-TCCPSQSTRKGFEDCISEGNLQILCSAESGCRDTYVGYCPSGFPY +>gi|802170|gb|AAB33011.1| crambin precursor=thionin variant Thi2Ca12 [Crambe abyssinica, seeds, Peptide Partial, 135 aa] +-SCCPTKSARNTFDVCRLTGTSMGLCAAISECKILSVTKCPSNLPY +>gi|21553588|gb|AAM62681.1| thionin Thi2.2 [Arabidopsis thaliana] +-ICCPTKDDRSVYFVCMLSVSSQFYCLLKSKCKNTSQTICPPGYTN +>gi|15218931|ref|NP_176784.1| thionin [Arabidopsis thaliana]gi|44888531|sp|Q9C8D6.1|THN24_ARATH RecName: Full=Probable thionin-2.4; Contains: RecName: Full=Probable thionin-2.4; Contains: RecName: Full=Acidic protein; Flags: Precursorgi|12322605|gb|AAG51299.1|AC026480_6 thionin, putative [Arabidopsis thaliana]gi|14190505|gb|AAK55733.1|AF380652_1 At1g66100/F15E12_20 [Arabidopsis thaliana]gi|15809774|gb|AAL06815.1| At1g66100/F15E12_20 [Arabidopsis thaliana]gi|332196341|gb|AEE34462.1| thionin [Arabidopsis thaliana] +-ICCPSIQARTFYNACLFAVGSPSSCIRNSSCLDISESTCPRGYTN +>gi|1729954|sp|Q05806.1|THN5_WHEAT RecName: Full=Type-5 thionin; Contains: RecName: Full=Type-5 thionin; AltName: Full=Type V thionin; Contains: RecName: Full=Acidic protein; Flags: Precursorgi|21885|emb|CAA43844.1| wheat type V thionin [Triticum aestivum]gi|21887|emb|CAA43845.1| wheat type V thionin [Triticum aestivum] +-DCGANPFKVACFNSCLLGPSTVFQCADFCACRLPAG--------- +>gi|120564556|gb|ABM30200.1| thionin [Brassica juncea] +-SCCPSTAARWAYYLCTNSWPLTPLCISHTGC-IESETTCPPGYPY +>gi|545031|gb|AAB29760.1| thionin precursor {clone Thi1Va1} [Viscum album=mistletoe, Peptide, 115 aa] +-ICCRAPAGKKCYNLCTA-lLSSE-TCANTCYCKDVSGETCPAD--- +# +NULL 3706 5728 4211 4064 4839 3729 4763 4308 4069 3323 5509 4640 4464 4937 4285 4423 3815 3783 6325 4665 +HMM A C D E F G H I K L M N P Q R S T V W Y + M->M M->I M->D I->M I->I D->M D->D Neff Neff_I Neff_D + 0 * * 0 * 0 * * * * +T 1 * * * * * * * * * * * * * * * * 0 * * + 0 * * * * * * 1000 0 0 + +T 2 * * 4186 * * 5283 * 2677 * 5264 * * * * * 833 3153 3986 * + 0 * * * * * * 3941 0 0 + +C 3 * 0 * * * * * * * * * * * * * * * * * + 0 * * * * * * 3941 0 0 + +C 4 * 122 * * 5259 4186 * * * * * * * * * * * * * * 4 + 0 * * * * * * 3941 0 0 + +P 5 4186 * * * * * * * 3250 * * * 549 5753 2856 * * * * * 5 + 0 * * * * * * 3941 0 0 + +S 6 5271 * 4429 * * * * * * * * 2329 * * 5941 866 2612 * * * 6 + 0 * * * * * * 3941 0 0 + +I 7 * * 5440 4820 * * * 3810 4212 * 5935 4983 3272 3386 4684 5283 1068 5174 * * 7 + 0 * * * * * * 3941 0 0 + +V 8 2614 * 4805 * 4186 * * 3036 * 3320 * * * 4094 * 2750 1838 5660 5873 * 8 + 0 * * * * * * 3941 0 0 + +A 9 646 * 4264 * * 2749 * * 4186 * * * * * * 3803 4903 * * * 9 + 0 * * * * * * 3941 0 0 + +R 10 * * * * * * * 5975 4265 * * * * * 189 * * 4186 * * 10 + 0 * * * * * * 3941 0 0 + +S 11 4186 * * 3803 * * * 5935 3608 * * 836 * * * 2953 5214 * 4935 5259 11 + 0 * * * * * * 3941 0 0 + +N 12 4935 1544 * 4787 5214 3942 * 2698 5264 * 4684 3736 * 5164 5174 * 3934 3625 * * 12 + 0 * * * * * * 3941 0 0 + +F 13 * * * * 2199 * * * * * * * * * * * * * * 354 13 + 0 * * * * * * 3941 0 0 + +N 14 * * 4222 3844 4805 * * * * 5264 * 510 * * * 4983 5283 5464 * 4935 14 + 0 * * * * * * 3941 0 0 + +V 15 2800 * 3844 * * * * 5271 * 2403 5174 * * * * 2663 3074 1902 * * 15 + 0 * * * * * * 3941 0 0 + +C 16 * 0 * * * * * * * * * * * * * * * * * * 16 + 0 * * * * * * 3941 0 0 + +R 17 * * * * * * 6200 2932 * 3611 4805 * * * 558 * 4093 * * * 17 + 0 * * * * * * 3941 0 0 + +L 18 3969 * * * 2171 * * 3956 * 1318 5174 4935 * * 5264 3115 6073 4979 * * 18 + 38 * 5271 * * * * 3941 0 0 + +P 19 3003 * * 4863 * 3441 * * * 5901 * * 2369 4784 2418 3812 2907 3735 * 4814 19 + 41 * 5155 0 * 0 * 3898 1025 1025 + +G 20 * * * * 4949 454 6123 * * 4542 * * 3604 * * * * 3948 4909 * 20 + 72 * 4354 * * * 0 3901 0 1033 + +T 21 2209 5224 * * * 2278 * * * * * 3752 4867 * * 2344 2028 * * * 21 + 39 5224 * 0 * 669 1430 3845 1023 1191 + +P 22 5624 * * * * * * 5870 * 3138 * * 1935 4054 * 1200 3439 * * * 22 + 0 * * * * 0 * 3903 0 1033 + +E 23 5952 6850 * 3099 * * 4802 5475 3316 * 4903 * 4232 2872 1627 5880 4176 4193 * 5287 23 + 184 * 3059 * * * * 3943 0 0 + +A 24 4442 * 3710 2779 2989 4157 * 4702 * * * * 1565 * 5707 4382 3234 * * * 24 + 28 5707 * 0 * 0 * 3843 1000 1329 + +I 25 4358 * * * 6658 * 5279 3244 4611 2579 * 5958 * 4193 3566 3476 2812 2484 * 4816 25 + 0 * * * * * * 3943 0 0 + +C 26 * 31 * * * * * * * * * * * * * 5573 * * * * 26 + 0 * * * * * * 3943 0 0 + +A 27 560 6231 * * * 4233 * 3636 * 4029 4985 * * * * 3609 * * * * 27 + 0 * * * * * * 3943 0 0 + +T 28 2460 * 3819 * * 4273 * 5850 2738 4816 * 3203 * 4985 3239 2607 3596 * * * 28 + 0 * * * * * * 3943 0 0 + +Y 29 4929 * 5658 4919 2879 4802 4950 3737 4252 1699 5444 5287 5995 * 5262 * 3899 3734 * 4566 29 + 0 * * * * * * 3943 0 0 + +T 30 3911 1554 * * * * * * * * * * * * * 1040 3227 * * * 30 + 0 * * * * * * 3943 0 0 + +G 31 4193 * 3877 5850 * 811 * * 4816 * * * * * 2699 4815 4659 * * 5301 31 + 0 * * * * * * 3943 0 0 + +C 32 * 0 * * * * * * * * * * * * * * * * * * 32 + 47 * 4950 * * * * 3943 0 0 + +I 33 * * * 4672 * * * 2865 809 3414 * * * 4933 3475 * * 4789 * * 33 + 0 * * * * 0 * 3904 0 1044 + +I 34 * * 3573 4802 * * 3933 921 * 2779 * 3898 * * 4839 * 5475 5952 * * 34 + 0 * * * * * * 3943 0 0 + +I 35 * * 5287 4950 4358 * * 1603 * 3869 * * 4193 6130 * 3498 3006 2227 * * 35 + 46 * 4985 * * * * 3943 0 0 + +P 36 4173 * 2874 * 5250 * * 5139 * 4792 * * 4633 4645 5394 1055 4050 * * 3805 36 + 201 * 2941 * * * 0 3881 0 1045 + +G 37 5480 * 5485 3849 * 869 * * * * * * * 4548 5245 3819 4686 2648 * * 37 + 22 * 6064 * * 0 * 3796 0 1453 + +A 38 4564 * 4944 5167 * 1910 * * 4653 4470 * 3572 4946 * 5111 2927 1892 * * * 38 + 0 * * * * 0 * 3775 0 1000 + +T 39 * 4660 * 5366 4744 6406 * 3919 2193 * * 4099 * 5382 5207 3976 1348 * * 4782 39 + 0 * * * * * * 3764 0 0 + +C 40 * 58 * * * * * * 4660 * * * * * * * * * * * 40 + 0 * * * * * * 3764 0 0 + +P 41 * * 4306 * * * * * 3731 * * 6351 322 4587 * 5643 * * * * 41 + 0 * * * * * * 3764 0 0 + +G 42 4398 * 6402 * * 4592 * * 4079 * * 4374 1200 * 3395 2109 5063 * * * 42 + 0 * * * * * * 3764 0 0 + +D 43 * * 2024 6014 4769 1231 4660 * * * * 4386 3096 4744 * 4862 * * * * 43 + 0 * * * * * * 3764 0 0 + +Y 44 * * * * 2991 * 4830 * * 3705 * * * * * * * * 2806 691 44 + 0 * * * * * * 3718 0 0 + +A 45 4566 * 3231 5056 * * * 3467 * * * 5405 1034 * 5100 6384 2896 4532 * * 45 + 35 5366 * 4087 87 * * 3718 1002 0 + +N 46 * * 4370 * * * 2588 * 1723 * * 2200 * * 5953 * * * * 2008 46 + 0 * * 0 * * * 3699 0 0 + +// diff --git a/core/tests/data/aln_tpl/5ua4_B.fasta b/core/tests/data/aln_tpl/5ua4_B.fasta new file mode 100644 index 00000000..92c2692c --- /dev/null +++ b/core/tests/data/aln_tpl/5ua4_B.fasta @@ -0,0 +1,4 @@ +>target +SESQEAVIRDIARHLARIGDRMEYGIRPGLVDSL +>pdb_id=4bd2, chain=B, assembly_id=1, offset=0 atoms +SESQEDIIRNIARHLAQVGDSMDRSIPPGL---- diff --git a/core/tests/data/aln_tpl/5ua4_B.hhm b/core/tests/data/aln_tpl/5ua4_B.hhm new file mode 100644 index 00000000..a038b721 --- /dev/null +++ b/core/tests/data/aln_tpl/5ua4_B.hhm @@ -0,0 +1,144 @@ +HHsearch 1.5 +NAME target +FAM +FILE query_hhblits +COM /scicore/soft/apps/HH-suite/2.0.16-goolf-1.4.10-Boost-1.53.0/bin/hhmake -i <101 characters> -o <101 characters> +DATE Sat Dec 31 05:49:23 2016 +LENG 34 match states, 34 columns in multiple alignment +FILT 11 out of 13 sequences passed filter (-id 90 -cov 0 -qid 0 -qsc -20.00 -diff 100) +NEFF 2.0 +SEQ +>ss_pred PSIPRED predicted secondary structure +CHHHHHHHHHHHHHHHHHHHHHHCCCCCHHHCCC +>ss_conf PSIPRED confidence values +9159999999999999975443036780111159 +>Consensus +SEsQEeiIxxiArxLAqiGDxmdxsIxPxlvxxL +>target +SESQEAVIRDIARHLARIGDRMEYGIRPGLVDSL +>gi|12083631|ref|NP_073175.1| BH3-interacting domain death agonist [Rattus norvegicus]gi|88909592|sp|Q9JLT6.2|BID_RAT RecName: Full=BH3-interacting domain death agonist; AltName: Full=p22 BID; Short=BID; Contains: RecName: Full=BH3-interacting domain death agonist p15; AltName: Full=p15 BID; Contains: RecName: Full=BH3-interacting domain death agonist p13; AltName: Full=p13 BID; Contains: RecName: Full=BH3-interacting domain death agonist p11; AltName: Full=p11 BIDgi|8050830|gb|AAF71759.1|AF259503_1 BID protein [Rattus norvegicus]gi|149049571|gb|EDM02025.1| BH3 interacting domain death agonist [Rattus norvegicus]gi|157041211|dbj|BAF79674.1| desmocollin type 4 [Rattus norvegicus] +SESQDEVIHNIARHLAQAGDELDHSIQPTLVRQL +>gi|115495761|ref|NP_001068914.1| BH3 interacting domain death agonist [Bos taurus]gi|109658413|gb|AAI18360.1| BH3 interacting domain death agonist [Bos taurus]gi|296486995|gb|DAA29108.1| BH3 interacting domain death agonist [Bos taurus] +SERQEEAVREVARQLAQIGDRLEGSIHPGMVAGL +>gi|281341033|gb|EFB16617.1| hypothetical protein PANDA_016465 [Ailuropoda melanoleuca] +SQGQEEIIQDIARQLAQIGDDMDHSIHPGLMNNL +>gi|307335680|gb|ADN43410.1| BH3 interacting domain death agonist [Lemur catta] +SESEEDLIQTLARQLAQIGDSMERSIPPRLVDHL +>gi|307335678|gb|ADN43409.1| BH3 interacting domain death agonist [Felis catus] +SESQEEIIQDIARQLAQIGDRMDHSIHPRVVNNL +>gi|307335682|gb|ADN43411.1| BH3 interacting domain death agonist [Saimiri boliviensis] +SESQEDIIRNIARQLAQVGDSMDRSIPPGLVNRL +>gi|291412643|ref|XP_002722593.1| PREDICTED: BH3 interacting domain death agonist-like [Oryctolagus cuniculus] +SESQEEVIRNIARRLAQIGDRMDHHIQPELLNNL +>gi|296191314|ref|XP_002743575.1| PREDICTED: hypothetical protein LOC100403496 [Callithrix jacchus] +SESQEDVIRSIARHLAHIGDSMDRSIPLDLVDRL +>gi|334348150|ref|XP_001373913.2| PREDICTED: envelope glycoprotein-like [Monodelphis domestica] +SESQEEIIQRIAMQLAKIGDNIENSIQPRVVDEL +# +NULL 3706 5728 4211 4064 4839 3729 4763 4308 4069 3323 5509 4640 4464 4937 4285 4423 3815 3783 6325 4665 +HMM A C D E F G H I K L M N P Q R S T V W Y + M->M M->I M->D I->M I->I D->M D->D Neff Neff_I Neff_D + 0 * * 0 * 0 * * * * +S 1 * * * * * * * * * * * * * * * 0 * * * * 1 + 0 * * * * * * 1973 0 0 + +E 2 * * * 143 * * * * * * * * * 3405 * * * * * * 2 + 0 * * * * * * 1973 0 0 + +S 3 * * * * * 3405 * * * * * * * * 2994 358 * * * * 3 + 0 * * * * * * 1973 0 0 + +Q 4 * * * 3344 * * * * * * * * * 150 * * * * * * 4 + 0 * * * * * * 1973 0 0 + +E 5 * * 3425 141 * * * * * * * * * * * * * * * * 5 + 0 * * * * * * 1973 0 0 + +A 6 3391 * 1939 635 * * * * * * * * * * * * * * * * 6 + 0 * * * * * * 1973 0 0 + +V 7 2994 * * * * * * 1285 * 3344 * * * * * * * 1452 * * 7 + 0 * * * * * * 1973 0 0 + +I 8 * * * * * * * 194 * * * * * * * * * 2994 * * 8 + 0 * * * * * * 1973 0 0 + +R 9 * * * * * * 2586 * * * * * * 1457 1092 * * * * * 9 + 0 * * * * * * 1973 0 0 + +D 10 * * 1981 2994 * * * * * * * 1628 * * 3212 3454 3344 * * * 10 + 0 * * * * * * 1973 0 0 + +I 11 * * * * * * * 366 * 3344 * * * * * * * 2994 * * 11 + 0 * * * * * * 1973 0 0 + +A 12 0 * * * * * * * * * * * * * * * * * * * 12 + 0 * * * * * * 1973 0 0 + +R 13 * * * * * * * * * * 3212 * * * 165 * * * * * 13 + 0 * * * * * * 1973 0 0 + +H 14 * * * * * * 1502 * * * * * * 834 3540 * * * * * 14 + 0 * * * * * * 1973 0 0 + +L 15 * * * * * * * * * 0 * * * * * * * * * * 15 + 0 * * * * * * 1973 0 0 + +A 16 0 * * * * * * * * * * * * * * * * * * * 16 + 0 * * * * * * 1973 0 0 + +R 17 * * * * * * 3454 * 3212 * * * * 503 3391 * * * * * 17 + 0 * * * * * * 1973 0 0 + +I 18 3425 * * * * * * 259 * * * * * * * * * 3816 * * 18 + 0 * * * * * * 1973 0 0 + +G 19 * * * * * 0 * * * * * * * * * * * * * * 19 + 0 * * * * * * 1973 0 0 + +D 20 * * 0 * * * * * * * * * * * * * * * * * 20 + 0 * * * * * * 1973 0 0 + +R 21 * * 3405 2586 * * * * * * * 3212 * * 1433 1939 * * * * 21 + 0 * * * * * * 1973 0 0 + +M 22 * * * * * * * 3212 * 2194 570 * * * * * * * * * 22 + 0 * * * * * * 1973 0 0 + +E 23 * * 804 1227 * * * * * * * * * * * * * * * * 23 + 0 * * * * * * 1973 0 0 + +Y 24 * * * * * 2994 1285 * * * * 3212 * * 1939 * * * * 3391 24 + 0 * * * * * * 1973 0 0 + +G 25 * * * * * 3391 3540 * * * * 3766 * * * 424 * * * * 25 + 0 * * * * * * 1973 0 0 + +I 26 * * * * * * * 0 * * * * * * * * * * * * 26 + 0 * * * * * * 1973 0 0 + +R 27 * * * * * * 1819 * * * * * 1939 1472 3391 * * * * * 27 + 0 * * * * * * 1973 0 0 + +P 28 * * * * * * * * * 3454 * * 138 * * * * * * * 28 + 0 * * * * * * 1973 0 0 + +G 29 * * 3454 3540 * 1372 * * * * * * * * 1889 * 2586 * * * 29 + 0 * * * * * * 1973 0 0 + +L 30 * * * * * * * * * 508 2994 * * * * * * 2544 * * 30 + 0 * * * * * * 1973 0 0 + +V 31 * * * * * * * * * 3540 3405 * * * * * * 287 * * 31 + 0 * * * * * * 1973 0 0 + +D 32 2994 * 1348 * * * * * * * * 1667 * * 2586 * * * * * 32 + 0 * * * * * * 1973 0 0 + +S 33 * * * 3212 * 2994 3344 * * * * 2036 * 2586 2624 3391 * * * * 33 + 0 * * * * * * 1973 0 0 + +L 34 * * * * * * * * * 0 * * * * * * * * * * 34 + 0 * * 0 * * * 1973 0 0 + +// diff --git a/core/tests/test_pm3argparse.py b/core/tests/test_pm3argparse.py index 0e0d617c..0b2310e8 100644 --- a/core/tests/test_pm3argparse.py +++ b/core/tests/test_pm3argparse.py @@ -1101,7 +1101,113 @@ class PM3ArgParseTests(unittest.TestCase): self.assertEqual(len(self.log.messages['ERROR']), 1) self.assertEqual(self.log.messages['ERROR'], ['Could not find chain with ID tpl (should be '+ - '<FILE>.<CHAIN>) to attach to sequence named tpl']) + '<FILE>.<CHAIN>) to attach to sequence named tpl']) + + def testProfileDoesNotExist(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + with self.assertRaises(SystemExit) as ecd: + parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-s', 'data/aln_tpl/IDoNotExist.hhm']) + self.assertEqual(ecd.exception.code, 51) + self.assertEqual(len(self.log.messages['ERROR']), 1) + + def testProfileIsFuckedUp(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + with self.assertRaises(SystemExit) as ecd: + parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-s', 'data/aln_tpl/1crn_corrupted.hhm']) + self.assertEqual(ecd.exception.code, 52) + self.assertEqual(len(self.log.messages['ERROR']), 1) + + def testAdditionalProfile(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + with self.assertRaises(SystemExit) as ecd: + parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-s', 'data/aln_tpl/1crn.hhm', + '-s', 'data/aln_tpl/5ua4_B.hhm']) + self.assertEqual(ecd.exception.code, 53) + self.assertEqual(len(self.log.messages['ERROR']), 1) + + def testUniqueProfiles(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + with self.assertRaises(SystemExit) as ecd: + parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-s', 'data/aln_tpl/1crn.hhm', + '-s', 'data/aln_tpl/1crn.hhm']) + self.assertEqual(ecd.exception.code, 54) + self.assertEqual(len(self.log.messages['ERROR']), 1) + + def testAllCovered(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + with self.assertRaises(SystemExit) as ecd: + parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-f', 'data/aln_tpl/5d52-1_cut_A.fasta', + '-s', 'data/aln_tpl/1crn.hhm']) + self.assertEqual(ecd.exception.code, 55) + self.assertEqual(len(self.log.messages['ERROR']), 1) + + def testAllGoodOneChain(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + + opts = parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-s', 'data/aln_tpl/1crn.hhm']) + + self.assertEqual(len(opts.alignments), 1) + self.assertEqual(len(opts.profiles), 1) + self.assertEqual(opts.alignments[0].GetSequence(0).GetGaplessString(), + opts.profiles[0].sequence) + + def testAllGoodHomoOligo(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + + opts = parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-f', 'data/aln_tpl/1crn.fasta', + '-s', 'data/aln_tpl/1crn.hhm']) + self.assertEqual(len(opts.alignments), 2) + self.assertEqual(len(opts.profiles), 2) + self.assertEqual(opts.alignments[0].GetSequence(0).GetGaplessString(), + opts.profiles[0].sequence) + self.assertEqual(opts.alignments[1].GetSequence(0).GetGaplessString(), + opts.profiles[1].sequence) + + def testAllGoodHeteroOligo(self): + parser = pm3argparse.PM3ArgumentParser(__doc__, action=False) + parser.AddAlignment() + parser.AddProfile() + parser.AssembleParser() + + opts = parser.Parse(['-f', 'data/aln_tpl/1crn.fasta', + '-f', 'data/aln_tpl/5ua4_B.fasta', + '-s', 'data/aln_tpl/1crn.hhm', + '-s', 'data/aln_tpl/5ua4_B.hhm']) + self.assertEqual(len(opts.alignments), 2) + self.assertEqual(len(opts.profiles), 2) + self.assertEqual(opts.alignments[0].GetSequence(0).GetGaplessString(), + opts.profiles[0].sequence) + self.assertEqual(opts.alignments[1].GetSequence(0).GetGaplessString(), + opts.profiles[1].sequence) + # test options: --disable-aln check (for amino acids) # test options: --disable-input-checks (for all) -- GitLab