diff --git a/read_sequencer_package/cli.py b/read_sequencer_package/cli.py index 1a973363d8074b1a76833766cdad8e42a68a3a8f..9a68267c115d328afa32e582d8c03e4aede2a076 100644 --- a/read_sequencer_package/cli.py +++ b/read_sequencer_package/cli.py @@ -1,5 +1,5 @@ import argparse -from modules import read_sequencer as rs +from modules import ReadSequencer import logging parser = argparse.ArgumentParser(prog='read_sequencer', @@ -11,21 +11,28 @@ parser.add_argument('--output_file_path', parser.add_argument('--read_length', help='read length for sequencing', type=int) +parser.add_argument('--random', action='store_true', default=False, + help='generate random sequences') +parser.add_argument('--n_random', default=100, type=int, help='n random sequences') +parser.add_argument('--mean_random', default=50, type=int, help='mean random sequences') +parser.add_argument('--sd_random', default=25, type=int, help='standard deviation random sequences') args = parser.parse_args() def main(): - LOG.info("Program started.") - read_sequencer = rs() - read_sequencer.read_fasta(args.input_file_path) + LOG.info("Read sequencer started.") + read_sequencer = ReadSequencer() + if args.random: + read_sequencer.add_random_sequences(n=args.n_random, mean=args.mean_random, sd=args.sd_random) + else: + read_sequencer.read_fasta(args.input_file_path) read_sequencer.run_sequencing(args.read_length) read_sequencer.write_fasta(args.output_file_path) - LOG.info("Program finished.") + LOG.info("Read sequencer finished.") if __name__ == '__main__': logging.basicConfig( format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', - level=logging.INFO, - ) + level=logging.INFO) LOG = logging.getLogger(__name__) main() diff --git a/read_sequencer_package/fasta_testfile/result.fasta b/read_sequencer_package/fasta_testfile/result.fasta index 7b9bb49f950ee668b19a8aa1dbddc18f9c62e58c..1aedc2c89f071f5db1dc1a74dde1cd6211a30a93 100644 --- a/read_sequencer_package/fasta_testfile/result.fasta +++ b/read_sequencer_package/fasta_testfile/result.fasta @@ -1,300 +1,100 @@ 1|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 481 bp -tgagcactcggtgccaagggcggggatacacagatggttggctgatacaaccgggactta -aattccctagactagatctgtgttggaacgcctctctacgagaaggcgaacgaactggcg -ccgaggcgatcgctaacatcttcgtctcgcttgaaccacacaatggatgattcctcccta -ggggtttgacaatcaacctggatagcgtttaatatagatggctggttgatttgtaaggcc -ttcacagactactcagagcaataagtgaccccccaacaatcagaggctgatcctctgctc +tgagcactcggtgccaagggcggggatacacagatggttggctgatacaa 2|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 495 bp -ctgaatcaggtgtaggttctttttacgtcgtttaaggagctacacggtatcttgttttca -gttaaggtgccacacccccgggtggatcatccgtcagctttcctacaattaggtaactgg -cgggatcatttagtcttgtattaagacgctcgcgcccggggcggccggcttgtttgtgga -gagaaacaacaagtctgagtatagattaaatacaactggtttactggcaagtcagcgcgt -aacaaccggtgagccgctgcgcatgcttactgcaatgaacatcttggcacgatcctgcga +ctgaatcaggtgtaggttctttttacgtcgtttaaggagctacacggtat 3|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 193 bp -acttcagtactggaaggatctaggaaccattaatgcgagtgtggtgacgccagacgaccc -ccggtgttctgccaccttctttggataggagaaccgtcactcgccccggaggccccacgg -ataagaagggtatcttgtgatcacgcgaatgactcacttgcgtaagtaatctaactttgt -ttttcgctataaaAAGCCGGGGTGTTGAAGTCTATCGTCGATGGCTGCAGTATTAAAGAT -CTTATGACTAATGCTCCAGCTTCCCCTCTGTGGTTAGAGACGAACTCGTCAGCCTCGATT +acttcagtactggaaggatctaggaaccattaatgcgagtgtggtgacgc 4|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 625 bp -acgtctggagcgtgggttgacccctgtacatggttctttccggatccttaacgtgccgat -acaactcaaaggtaactgtgcttaccacttccgaagctacatgcctctaacaaagtactt -tcgaggaggcactcaacccccggagatgctttgcgcggaagcagagatcgctgctcaaaa -tttggaatcactttcgtgcgagacccaaacaatttatggtggattcaagcgaacgagtca -tgattacagatctatcaatcgaggagaggacggcttcgccgtttccttttaatgtgaaac +acgtctggagcgtgggttgacccctgtacatggttctttccggatcctta 5|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 845 bp -agagcgtacggcgcgcatcgtataccctacgagggcggcgtgtggaggaacgctgggctg -acactgtagaagattagatacacttgtccctaaaattaacccttaaccgctattagccgt -gaacgcttcctaatatttcaagccgtatagctaagtggagaatgtggagccctggtcaaa -tcacgagccaattagccctagacggacagcacatctcgtcgcgttaagcggaacactcag -cttttattacctagtgctcagcctggtttccatatgctctaaccgaactgatgcatactt +agagcgtacggcgcgcatcgtataccctacgagggcggcgtgtggaggaa 6|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 703 bp -tgcagtcgatgtgctattcgttttaggcagtctacgcgcttagtaactcccacggccata -gacttatctcagacatggaccatgtcgatatcggacgccgtcttaccacatttttcatag -cccttcataaggcagcgtgctcttactgcccaataaggtggacgattccgaccctaggcg -aaccagcgctatagatggaccttctaattgatgcgcaacgtgattgtttccttggtctgg -gttagcatttcggtagcctaacagtcactccagttcgctaactggcctggatgagggccc +tgcagtcgatgtgctattcgttttaggcagtctacgcgcttagtaactcc 7|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 243 bp -actctttagaatgggtttcactaatagtacgtgcatacaatttcgtcagaaagggcgctt -gctaagggacacggatcaatgatgaccagacttatggtgtcaggtctcactatattacat -atccggaacccgtgcccgcaccacgcgctgggtctaggcgaccggtgcatcatctccgcg -tctctagaggattctctcggtaaatgctgaattgcgtgagatcaaatccgtatgccagtc -atgAGGCAAGGCGTATAGATCTTTCCTCAGCATGAACCGGAACATCCTACAATCGCACGC +actctttagaatgggtttcactaatagtacgtgcatacaatttcgtcaga 8|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 863 bp -attggcccggtccaggacagagccttatattgctactggtatgagaaccgttctgacgta -aacttgatggctttacgcctgcacgggcttcatacacacatgaccgtggacaaagtcgcc -caggccctcgaatagggtgtaatggttaacggttagtgccaccccaatgggtgcgaggca -gtaagagtgtcctatggcaaaactctcctcgtttcagaagggtcgctcctctagcctcct -tatcccccctataatagtactcgccgggtacgagccggagctccctcgagaagtcatcct +attggcccggtccaggacagagccttatattgctactggtatgagaaccg 9|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 494 bp -aagcgaaactcctagaacttcccatcaggcaatcgtgtcccacgaagcacggatactacg -ggcactagttgaatggggggtttttttcgtaggtcgtaataggtactcggatagtcggcc -cagagttatgcttaagaatgcgctgcttaattcaatgtgactgccgttgtctccgatcag -atccaggtgatgattgcgatcgcagcgacatatgtctcgaaagacgtgtcgtgaataagc -ctgtaagcccaatgcaacatggttccctcaccttgtagctgatgtaccgtgtttcaatct +aagcgaaactcctagaacttcccatcaggcaatcgtgtcccacgaagcac 10|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 86 bp -atcctagcgccaaagatttactgttatggggtcgacgaacactagccgataatgccgtcc -tgggatctctagcctagtattatgcgGACGTTTCCAGCCCCGCTGACCCTGAGTCGGACG -TGAAAGCGAATCATCATATAAAGCATACACTGACTTGCACAAGTTGAAATAAGAGGTTCG -CTTAGGCTTGCCTCTCTAGTGGCGCAGCAGTACTGAGTGGGTTCTACTTACTCTCTGGAC -TCATGATCGTGTACACCGTGAAAGACGGCCCATGTGCCGTATATCTACCGTGCATAACCT +atcctagcgccaaagatttactgttatggggtcgacgaacactagccgat 11|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 360 bp -cgcctgagggtcctaaatctgacgtatgatcgaagagattggaaggtcccggcgggtcac -cccacgttgcgatcatggccaaggccatggtttgctcaaaaatcccacattcgccgtctt -acgcgttaggacctcactatcccacagacggtgcgttaccttgtagttgacgcgggatcg -tggtgataacagctatttccgagacttcatattcttttacatagcggcttaccgtagtga -ctccatacattatttgcctattttgtagtgccccgaacagtaaggggaagccaactgccg +cgcctgagggtcctaaatctgacgtatgatcgaagagattggaaggtccc 12|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 140 bp -gaattcctggggatttactcacccccgaggcggacaagatttccagctggatcaccgagg -gttacttaatcccttcgatgctttcaaaggccctaatcagtattgagcaacgaaagcgga -gtcgttagtgtccaagttgcAAATGGTATCGCAGAGCGGTCGGATTCGCCAAAAAGCTTG -GCCAATGGCTAAAATTTAGCTCTCGCCCTCGCGGACATTGTGGACATGTGCCTGACTACC -CATCATTCTCGCTCGGCACTAGGCCACCAGGGTTAACGATTTCGTAACAACGACCCGCAT +gaattcctggggatttactcacccccgaggcggacaagatttccagctgg 13|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 832 bp -aatactctcgttgaagcgtcggacagtaaagtgagagatttcggcccacggtagtcggac -attctcagtggggagcgaagagttgcgcttagagccgacgtacacgatataacctcaatt -gaaaatcgctatgtgcatcgttagggcctccggcgtgctgtttcggcagctgagtgtgag -ggtataacttaccttcgacccgaattgtctcgcggaaatcctaggcaagtaatccacttt -tggtacgggggagctagttcctctaagacgaacaagtgcactcttcacgtatagtgccct +aatactctcgttgaagcgtcggacagtaaagtgagagatttcggcccacg 14|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 296 bp -atcggggtgcgaaatcccctgagctggttgactacatacgtaaccacgttccgtgcgtca -tctaagcgtatcggctcatactggtggtaactagacttggtgaaccctaggtgccggcat -atcgaggtccgcatccaaaataactatcgctatagctacatagacatttactcgcaatat -tacacgaaccgtacgtccctcggtattaacgtaatggttaaagtctctaattccgctgca -gagcggcgggataaagacgccggtgtggcctgaatggtggatctgtccgtagtaccACGT +atcggggtgcgaaatcccctgagctggttgactacatacgtaaccacgtt 15|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 515 bp -accttcaatttgttcgcccgggacaagtagaaattactgtaaactaaacttaacctattc -cttgttaaagtccgcaccaagtgtactgtaagaatggtcgctcgtaataataacgagaag -atcctcgagccgtggtctgctgcaactaccttgagcggtacatcgatgtcccactctggg -cggggatcaggggcgagacttgtggtgaggccaaagaatggcgcatatgtaggcaccata -cgtcgatacgttccaggagtagaggcctcgaacatacaccacgataagtctacagacgca +accttcaatttgttcgcccgggacaagtagaaattactgtaaactaaact 16|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 820 bp -ccggctcaatcctgtagaaccgcgtacaacacacccaagctataccgcacacggcgcctt -agcaaccactgcttatctgcgtattatacctttacaatcattacatttgatctatctgtg -taccggttttttttgattcaattcgctggattacgacctcccggccaaaaattctcaatt -catcgttaacagacgtatttgaagataatcattcaacgtgaactagcacttggtcacttg -gtacgccaaccaagctgtgctttggggcaaccctttataactcacatgccgtcctaggac +ccggctcaatcctgtagaaccgcgtacaacacacccaagctataccgcac 17|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 791 bp -attgttagggcctgtccggaaaagatcaacggaagatattcaccagcacctatgctgact -cacgtagttcccgacgttcagtcccctccaacgtggaaggtaggacccatctccttaacg -ggatcgatcggtcttcctgtgaaagttgctcagagtcctcaaggacgtttttgggtgcgt -gtacggtatggttatggtacgtgtctgtgacagagggtattcttactggttaagtgaccc -atatgaccacctgacgcccgagcatagacctgtaggggtcgacgcgagagatggcagctt +attgttagggcctgtccggaaaagatcaacggaagatattcaccagcacc 18|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 328 bp -accgattacaggcagtcggccttgtccgctcgtatatccagggatgttccaccgaaagtg -ggagtgtggcacttattggtaaaaggcatttttacgaacgacactgataggattgatcac -tcaagaaatgttctcgaccctgaggtaggagtcttaacagacggacatcctccgtagata -cgtgagaattaagggacgcatgtcgaaaacgcttggaatctactgtagtggcccacctta -cgcttcttccaataactcccttcatagtccggcaacctcggtgggggtttcccttaggcc +accgattacaggcagtcggccttgtccgctcgtatatccagggatgttcc 19|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp -ggagtggaaaattctgtagtccgttggcggcgaccgcaaaccagaataatatggtcacgt -taggccctcgggccccttcatatgtacggagtcattgaattagcattatactaccgttac -gcaagaccctatcccatccgcgactgtcaccactgctgtaaggttgcaaggctgtttcaa -tgtaaagtaggcgaattctgacgtgggctgataacgaatcccccgggttatctagtgcaa -gtgctatccCAGCAGTAGCTGTTGACTCGGAGGCGTCAGAACTTCCCTCTAGTGTGGCAA +ggagtggaaaattctgtagtccgttggcggcgaccgcaaaccagaataat 20|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 440 bp -atcttaaacagcccaatcggctcgccgaccaatttcccgcttcacagtacgcggaagaat -ctgcagatagaagtcagccctctcacgtcaataggaatgctgcccgtcatgtttaactac -tcaagttttaaggtgtcccttatcggttccaggatcatgtctgaaggaagatggtcgcaa -cgaaatctggagtggcatacatcgttcggtcgaagcataatctcagacgttatctataaa -gttagggcgctgtatggattgggattcaagctcgaagcctgttcctgccatacagcgcct +atcttaaacagcccaatcggctcgccgaccaatttcccgcttcacagtac 21|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 840 bp -cataactcgtgagtggccctgtacaagtcattgcatcacaatccttgcaatttgctcctt -tggccaagcgtacaagaccccggacccatacgctcccggctgataaactgctacagcatg -gtatatccggatgatgcccctgaaaactgcggaagtcaatttgttgatgaatccccgact -ttccgctgttcctgtggatggtcgaatgccaaatgaagagctgctccccccttctttaat -atcaagcactacaaagataaagcctgtttggctgacggcgagccctcccctatcgtacgc +cataactcgtgagtggccctgtacaagtcattgcatcacaatccttgcaa 22|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 234 bp -caccgcgaaagtgactcagttttcccggtcttatcacggtcgttgtcgtccagattccgg -ttgttaactgcgggagctataacacttattccttactgcgacggctgatccactaagaac -agttcatagagctcggctatataatttgaagacatagattccacggtacttgtagcccat -aaccgctgaggaggaacgtccaacggttcgcgcggagcatgtgacgcttaaaggGATGTA -AACAAAAGTTTCCAGGCGGCGGCGGTGTAGGCCGTTGCACTGTACTAGGCACAGCTACTG +caccgcgaaagtgactcagttttcccggtcttatcacggtcgttgtcgtc 23|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 917 bp -atcaagtgattacctggtaacccgccgctcttgcagtgttcaccctttgtgtcgtcttag -tgtttgtacacgttaaggaaaagcgttagcttaaccattacgccccccaaagcccggtgt -gtagttatctacatgccgtgtcaaagcggtgactaaatgtttatcaagttctgatgacaa -cgtgagctcttaaagccattgactagtataagcacggaacaatgataccaggcaagcttg -aatataggataaggcctctaagctcgaagcggatcttacggaggtgtgaatcaacagcac +atcaagtgattacctggtaacccgccgctcttgcagtgttcaccctttgt 24|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 676 bp -cacacggcatcgcaaagcgagctatccagagatgatacatgtggttgaaggtgattgcgt -caacatgggggttgctcagtttggttggtcaatcaacggtggcagaccatgcgataacga -tgatggtaagactgtaaggtaagttaaatactctcgtctgccagttgggtcgtcaacgct -gcagagacgccattcttcccagaaggtccgagctttctacagtgccgcggcgtcatgacc -aaaggggtccaacctcgcagtaaaatgtctatgcttctggtttggaatgagaccgggcca +cacacggcatcgcaaagcgagctatccagagatgatacatgtggttgaag 25|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 870 bp -ctgatcaccaatagcttgcgcttaacacacgcgccttacaattatatgacgcccttgcca -atgacagatagagccattaatcgtggaaaccaggcatttatacttgtccgatgtatcgat -tctcctctatctacagagcccggacatgcgaaatatcaaaattccatgtatactgaataa -atacattgggcaagccgggctcatgcagcaatcccagcgttgccttacgcaaagatatct -tacggagttgcctttagattaacagcacgtgttcaaaaacctagccaactctgtcggtct +ctgatcaccaatagcttgcgcttaacacacgcgccttacaattatatgac 26|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 751 bp -gggtgcgttatggggactaaagactgttactaccggtactccgccttatagagccgtcac -gtattaatcagctatcaacagatactatcgtcacagccctccttctggcgaaggatctga -gcatttgcaaagctataagttggtacgcaacggtagagggcttcgtagtcggggaaaggg -cttgcagtagtataggccgtaacttatctgttgcaacctcaaccgcacgaatcgattact -ctataactgccctcaatacagtatggttaccagtcaccttcacactgaagattaattcgc +gggtgcgttatggggactaaagactgttactaccggtactccgccttata 27|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 574 bp -gtactgcaccttgcactgctatctacaatgccgagggtcgccctagtgctttgcatgttt -ggcctctacctacgagtctacgcgggcgtttttaagcaagctacgatcatcttgatccaa -gggtacgaggccccgcagaccaatggaggtcgtgaccaccctcgtgtatgcctcgcacta -agcgagcattctggtatactgtctctctcctgtgataataacagtcggctcgatattcag -ttcacatgaaacagtatgttatataggtgggatggttataacacggaaaggtgaaaaaga +gtactgcaccttgcactgctatctacaatgccgagggtcgccctagtgct 28|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 169 bp -agctccctaaacaacacccgcgtaaaaccttcagttatggtgccgactaaccctgtggat -gtcttagcgctctcgttccgatgggtgctgatactagtaaatgagactcgagaccgagaa -cacgcaacggctacaacctggtcggttgttggggtttttataatcagtgTACGAAATAGT -AGACCTCGCCCTGTAGTTGGAAATTACCGTCATTGGTCTATACAAAGCGCGTGCTTCAGG -ACTCGGTGCCGAATCACTCCGTGCCAAATTAACGAAAGCTTCTGATGTGGATATGAGTTC +agctccctaaacaacacccgcgtaaaaccttcagttatggtgccgactaa 29|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 408 bp -tgcagtgatgcatcgataagaccgcatagttacctccttacaggtgacgctaggctaatt -gggagtgctggcacttgtgccctacagtcaagcgctcacgcggtgttctcctcccgcaat -cttagatattaggctctgtaccgcacgaaggatgaattttcttgactattggtccctgtt -tacgagggcttacctagagtgaggatgaacataaacaaggcctacttgacttaaggcttc -caaatcacttgagggcaaatgactcctcaaacgcgagtgccagtactatccgtgagggaa +tgcagtgatgcatcgataagaccgcatagttacctccttacaggtgacgc 30|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 52 bp -caaagcgattcgggttaacgcacttaagagttcgacgtaggttagtcccctcTGAAGCTG -GCCATAACAGGCAAGTTATTGATTGGACCTTACGACGTACATGGCATTGCGAGACACGGA -AGCTTGAGCACTCATAGATCTGCCAGTCCCAACGCAAGTTGTTTTTCCCGCATCATTCAG -AAGAAGAAGACGGATTGAACCAAACTGTCATATCAGTGGTTAGCTGCTGTAAATTCAGCG -GATTGAGGGATGGGTTTTTCGGAACATTATCTTAGCGGGAACAATAAATGCGAAAACCAG +caaagcgattcgggttaacgcacttaagagttcgacgtaggttagtcccc 31|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 581 bp -tgctctgacgtgtaagcgccttcgataacgtctttgcagcgccccacaaagtaaggaccg -gtctaacagggcttccgaatcaatagactgatagtaatgggatcctgaggctgggacccg -acacacggcatattttactagaaacgctgatttaaactccaattatccttgacgcactga -gccacagtcttagacgcagaatgtccgcaggagccctgtctttcccctaaatcattcgcg -gcatttgtttacgggttaagtcctgcggatcctagagtctgggccccgtacaaccaggaa +tgctctgacgtgtaagcgccttcgataacgtctttgcagcgccccacaaa 32|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp -gcggaactacctctctaagaccgcacaacaagtgtagtagatgaagatcacgcagagtgc -tcggcactgcatttttatacgtcgaatcagaaacgaggttcctcctctaggcttgttaaa -aatccgggcgcgatgggctggtaatctgtggccatgggagcctcgccatttaaagatttt -ggttaaggctcctctgttgtgtccatcacccttgaacgagcccgtacaaaccgtgtacga -tgttgacacTGCAGAGAGGGCGTTTCGTACCATAACATGCATCCTAGGCGGTCATATTGT +gcggaactacctctctaagaccgcacaacaagtgtagtagatgaagatca 33|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 297 bp -gccctcgcgccagcttacttttagaaaacatcgaccggtaagagatacctgggtgagctg -ggcttcacgacatgttcttaaatcaatactctaaatctgctttgtagcatgcctcaagta -aaaaaatgtgctggttccgcacaggtgtgacgattaacgttgcgcccgtttgcgtcagtc -cagatcaccgatcttccacaccaccggtgggctgccggactgcaggtaatgactcctggc -tgcattctctgacataaaggttgaatagaacggcgtccttgagaaggttatggaacgAAA +gccctcgcgccagcttacttttagaaaacatcgaccggtaagagatacct 34|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 573 bp -gcctaggggtcttgaccacagggagtacgagcattgatcattggagcaggtggctaatat -tgatagtggttagaccaccggcgcatcatcgtacgagcgcgggcgatacgtgtctttcac -cggcgcactaatcttatcttacttctcaagccccgacagcatgtacgccaagtgttgttc -tgatgaaactttcgaaatagcaactgttagtcagttatagttggggagggcagtgaatac -ctcaaatacacccaagaaataacttcgaagcggcgcctatatcacacccctgtttcttat +gcctaggggtcttgaccacagggagtacgagcattgatcattggagcagg 35|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 559 bp -gaaaaagtcgccccattcagttacaatcgtcttcagaagccagctcggttggggctatct -gcggggtaatgcaacagggggctaccagacggtaaaccagggtcttgctattggtgttac -gaaacaaaggagctatgcgacctcattagatcgagattactctcacaggcagctccggcc -atagcacaactaatttcgggtgtggagctcaccacaggaacatcttgtgcgtcctttgtt -atttaattgtgcattgtaatgcaccggaccccgggaacatacagccattatctgtgttgc +gaaaaagtcgccccattcagttacaatcgtcttcagaagccagctcggtt 36|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 187 bp -ctaagtccttatctatgatgcatctttcgttactgcgacaatatccgagacgagcagagt -tacacgccgaggtgtaaacgaatacgattgctatatgcaacgagttggttacacgcgtga -aggcgaatgtggatgctgcacttggagtcccattttaccggccgcacgtgctagctcact -caccttgCAGATATACACAGCTCGGGGCTTATGTGAGGGCTTTCTGTAGATCGGGAGCTT -ACCAAAATATTATGGCAAGGACTCACACTTTGATATACGCTTCACAATACTAAGTCCAGT +ctaagtccttatctatgatgcatctttcgttactgcgacaatatccgaga 37|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 549 bp -ttcatatggggatttggaatcgggtttgtgcggaatatgcccacgagactgcttatgtca -acgagacgacccattgtcacgttgtaaggccaccaataacacacaggtcttcgtttgctg -tctcagggcaatcgcatcgacaacatcgtatggataccgttttttatcagcttacggcgc -atcatactaataaggtgtttgagagggcgcagactcgaagcagtgtgatcttcccggttc -gaagatgcaaaaacggtcctatttcgatccaaaactcagcgcactagtccaatgcttttt +ttcatatggggatttggaatcgggtttgtgcggaatatgcccacgagact 38|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 916 bp -gtggcctaccataaatcaatttgggttaacgctctttgatctacgcactatgttgattca -cttaccccttgtcaccgggcagaagagagccagtttaggtgtggttgtatttgccaaacc -gcaaaccgcctaatgagctggatccggccatggaattaatcccgtcgtttgactcgaggt -gttcaaagactgtgcaacacgacgtgcattcatcactagaacttaatctagaccaggcct -tgtggccaggagaggcgacgtgatattgccctatacacagataattatatacccctcgcg +gtggcctaccataaatcaatttgggttaacgctctttgatctacgcacta 39|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 848 bp -accccggtcgctttggccggtcgtagccctaatcaattctgttcgtatcactaaagtaac -ggtttgaaatcctttgcaaacttgatctgggtatatgaaccggtatgcggggatagtggt -aaataagtagtttacgagctgagcgtggattatcccagagaagttgccttaggtccagag -cccgcacctacaatcactcgaggccggtcgagcgttgcgtggcaaggaaacccagccggt -caccctaccctcaaactcacgtcattgatccaatcatacatggcgtctctcacggtggtg +accccggtcgctttggccggtcgtagccctaatcaattctgttcgtatca 40|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 289 bp -agagcaaaagaaagtctgctccgcgtgacacacttgctcgttgtagtaactgcacgcgcc -gtctactcgacagggaccccccgtcggttcctctctatagcaatcgcggaagtggttccc -tgcctcccgcgcagaagttcaaactagtaatccttaatgacttgtggggggggagatcag -tttcttccacaatggagtaaacttatgcgagaatcaagatcgcagaggccattttttgat -gatactgtcagatatgtggttagccgtatcacgttaccgacgcagaattTGGTGAAATAG +agagcaaaagaaagtctgctccgcgtgacacacttgctcgttgtagtaac 41|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 642 bp -ctggggaatgaccgtaccgatctaattccccgtcgaaaaacttatgacgcgcagttgtcc -ttatgcttgagacatgaatccttgccccatattggcgatcttggccaatgagatctgtcg -aaagtactggaggccggtaaattgggggctctagaggtccgcccctgaaggactaacgtg -tgtgtgtgtctacgtgtcgggttatcagcgtgttggacgatggccgtggattcaacgcat -gctagagagctaatgatcctccgaagtcaaaagcctcagtgcttcgatttatgagcgcgt +ctggggaatgaccgtaccgatctaattccccgtcgaaaaacttatgacgc 42|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 993 bp -gcaagaagccaaaaaccttgcaggaggtcatttaagtttacccgcgcataagcagagacg -gacctctctgagatctcgcaccgcgcgcccggccggcactatcgatgctagactagggtt -ggtgactagcccgtcaaaaccagcctaaacgcaaagattgtaggcgctagtccggaactg -actgcttcgtgtcggtgggagcctagtatgtttccgggtctatgacccctaaaatcatag -acgtgtcttaatagctatacctgacttactttgaagtacttgccacgacgagtttatgag +gcaagaagccaaaaaccttgcaggaggtcatttaagtttacccgcgcata 43|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 473 bp -ggttgtccaggcgcgagcaagtagctgactcgctaatcttaacgagtattgcttaggact -tccaaatactccaagacgtcaatacgctttatctttgtgaagtcatcccggaccgagcgc -ttgggtcgtgatttaaaatcccctgtgatgtggctacaggtgcggcctatacagccgaga -agaaggccgtctttaggcgtccaatgaaccgttacagggacacaccaaactgcgccaact -gatcccacgggtcacggtacgctctaagaccagtcgggattctgacttaacatcgcagca +ggttgtccaggcgcgagcaagtagctgactcgctaatcttaacgagtatt 44|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 272 bp -gtagaacttgttccccatggacaatgctagttccgttaatgccaggtattcatgtgccaa -gcgcctgcctggggaatacgagcctctctacaaacttacggccaccatgcttaaagattc -ggtgacttcactaatgacctatacaagtaatgcggaggacgctgtcgcttattgctcttt -gctaaggccagttatgtccgtcagtcaacgatacgctgcggcggtgggtgacggcactag -accggaagcctgatgacaagttcgaatcaataGTCGCTTCCATTACACTTGCGCTATTCC +gtagaacttgttccccatggacaatgctagttccgttaatgccaggtatt 45|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 860 bp -aaaagcatcactctaacgacgctaccgtctgaatagatcaagattgctatcggttcgacc -ttgatcgcatgtgaacccgcccaaaaacccgtctcgacaaaagttacgtcgcatgggctg -cgccaccggatagctcctagcttatcttataaatcaggtagagctacaacatggtgctat -gacaactggagtgtcatcgctttggcgaaaccgtaaagggtgggaattgctgcattctca -actgggccgaactattccgcattcggctgctcacaaatcgtggaatgtgtccttgaacgt +aaaagcatcactctaacgacgctaccgtctgaatagatcaagattgctat 46|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 884 bp -aagcctctacaggctctgcggtttggctttacttaacggtgagtcaggaaaacattactg -ctacgttcaccgtgttcagagatagagagtacattagggaccaatcacaacgttcgccag -ggcaccgcctaatccgcgttgttagcaagagtacaggctctcgtatactttcagaccctt -caatactagacgacaaattgcagcccggggtcatcggtcgactcagatacgtgctaacga -gtaccaggtctaccgttgcaacgttggatgcgttatactcggcataaggcgatgcccttt +aagcctctacaggctctgcggtttggctttacttaacggtgagtcaggaa 47|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 888 bp -gcgccttgaagaggcgaggtctaaaggcaaaaatttagatccgccctatgagacggccga -cgcggagaattccctaaccactattgtcctctgcatcgatatcaggaataggcttacctg -caatctcttatggtgatagactgtttgggagctgaacctgagacgcgcacgaaatttgga -aggatcaaataggccccgcagtctctggtagacttctgccgagcggactagcttggctaa -ggtgtacaagcctaaatcgtttttcacatcaattttatagctgattatagaggaacgacg +gcgccttgaagaggcgaggtctaaaggcaaaaatttagatccgccctatg 48|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 588 bp -catcaagatgggttacgtaggaccgagattcagtctctgggttagagccgacagcggggc -cgctacatagtacacggcgaggaatgcggggttgggctgaaccgtacacagtgggctagc -tgcggtacctgccaccggcatgcgtttaaatcctttcctttggcgaagccaactgccgac -gtccgcaacagagactcgttttccgaccccgttactaaatcagctaactggcgcctgaat -cctcttacgtcggatgttaattagtgtatagaatatcggagggttgagtgcgacgcgctt +catcaagatgggttacgtaggaccgagattcagtctctgggttagagccg 49|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 626 bp -taacctcagtctcgttcccccctcggtagttcggacccttattcgcttatctcacattca -tcactgtagaccaaggaccgggcatacttgcggatatctaccaggactaggcacttaggg -atacgctgttgaatacgggtttcgtcccgtgtactcaagtgtagtttaagataggtacga -gtgctagtacatcgtacaatttacaactgacttaaacgagagtttattatgtcttgttca -cttgttgacacgcctgggaaaataataaaaggcaacgtctaatctcagacccgttgatta +taacctcagtctcgttcccccctcggtagttcggacccttattcgcttat 50|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 214 bp -taactgtcggtcactgctcatcccgactagttcggctcactagacttactcgcggaagcg -agaagtaggacgtcgtgtaatactccaacgtcgttacgcaatgttgtaaaacttcatcgc -attccgtgcatggcctaaacgtgcagcattatataacgctctttggtcttaatatccatc -gcgggagtaacgcgaaggggagacgtgtgcctgaACGAACGCTAAACTAGGTACTAAGTC -GTGAAGCTCGGGTGGAGACAGGTAAACTGATCGCAACGTATCAACCAATTCTGGACCCTA +taactgtcggtcactgctcatcccgactagttcggctcactagacttact diff --git a/read_sequencer_package/modules.py b/read_sequencer_package/modules.py index 7cf0c95555e885dd1066fc01b1c115c3c121b83b..4859423ffc81ba0d49f122733043313315f0192f 100644 --- a/read_sequencer_package/modules.py +++ b/read_sequencer_package/modules.py @@ -1,33 +1,8 @@ import logging LOG = logging.getLogger(__name__) -def generate_sequences(n, mean, sd): - """ - Generates random sequences. - - Args: - n (int): Amount of sequences to generate. - mean (int): mean length of sequence (gaussian distribution). - sd (float): standard deviation of length of sequence (gaussian distribution). - - Returns: - list: of n sequences - """ - from random import gauss, choice - LOG.info("Generating sequences.") - dict = {} - for i in range(n): - keys = range(n) - seq = "" - nt = ["A", "T", "C", "G"] - for value in range(abs(round(gauss(mean, sd)))): - seq = seq + choice(nt) - dict[keys[i]] = seq - return dict - - def read_in_fasta(file_path): - ''' + """ This function reads in FASTA files. Args: @@ -36,7 +11,7 @@ def read_in_fasta(file_path): Returns: Dict: It returns a dictionary with sequences. - ''' + """ LOG.info("Reading in FASTA files from destination.") sequences = {} f = open(file_path) @@ -52,7 +27,7 @@ def read_in_fasta(file_path): return sequences def read_sequence(seq, read_length): - ''' + """ This function reads a sequence of a specific read length and adds additional nucleotides if the sequence is smaller then the requested length or cuts the sequence if its longer. @@ -63,12 +38,11 @@ def read_sequence(seq, read_length): Returns: str: returns sequenced element - ''' - + """ from random import choice bases = ["A", "T", "C", "G"] sequenced = '' - if read_length >= len(seq): + if read_length > len(seq): for nt in range(len(seq)): sequenced += seq[nt] for nt in range(len(seq), read_length): @@ -94,10 +68,10 @@ def simulate_sequencing(sequences, read_length): results = {} for index, key in enumerate(sequences): results[key] = read_sequence(sequences[key], read_length=read_length) - + LOG.info("Sequencing was successfully executed.") return results -import random + def generate_sequences(n, mean, sd): """ Generates random sequences. @@ -110,16 +84,17 @@ def generate_sequences(n, mean, sd): Returns: dict: of n sequences """ + from random import choice, gauss LOG.info("Generating random sequences.") - dict1 = {} + dict = {} for i in range(n): - keys = range(n) seq = "" nt = ["A", "T", "C", "G"] - for value in range(round(random.gauss(mean, sd))): - seq = seq + random.choice(nt) - dict1[keys[i]] = seq - return dict1 + for value in range(abs(round(gauss(mean, sd)))): + seq = seq + choice(nt) + key = str(i) + ': length ' + str(len(seq)) + ' nt' + dict[key] = seq + return dict def write_fasta(sequences, file_path): """ @@ -138,8 +113,8 @@ def write_fasta(sequences, file_path): outfile.write(key + "\n") outfile.write("\n".join(wrap(value, 60))) outfile.write("\n") - LOG.info("Sequencing was successfully executed.") -class read_sequencer: + +class ReadSequencer: def __init__(self): self.sequences = {} self.reads = {}