From 200b8c6c03c00eebd5cb02c9088e6b0957878680 Mon Sep 17 00:00:00 2001 From: Christoph Harmel <christoph.harmel@unibas.ch> Date: Fri, 11 Nov 2022 15:58:38 +0100 Subject: [PATCH] feat: added read_sequencer class, updated main() in cli.py accordingly --- read_sequencer_package/cli.py | 11 +- .../fasta_testfile/result.fasta | 300 ++++++++++++++++++ read_sequencer_package/generate_sequences.py | 24 -- read_sequencer_package/modules.py | 47 ++- 4 files changed, 347 insertions(+), 35 deletions(-) create mode 100644 read_sequencer_package/fasta_testfile/result.fasta delete mode 100644 read_sequencer_package/generate_sequences.py diff --git a/read_sequencer_package/cli.py b/read_sequencer_package/cli.py index f242c4c..e786d78 100644 --- a/read_sequencer_package/cli.py +++ b/read_sequencer_package/cli.py @@ -1,8 +1,8 @@ import argparse -from modules import run_read_sequencer +from modules import read_sequencer as rs parser = argparse.ArgumentParser(prog='read_sequencer', - description='Simulates sequencing of a DNA sequences specified by an FASTA file.') + description='Simulates sequencing of DNA sequences specified by an FASTA file.') parser.add_argument('--input_file_path', help='path to FASTA file') parser.add_argument('--output_file_path', @@ -13,10 +13,11 @@ parser.add_argument('--read_length', args = parser.parse_args() - def main(): - run_read_sequencer(args.input_file_path, args.read_length, args.output_file_path) - + read_sequencer = rs() + read_sequencer.read_fasta(args.input_file_path) + read_sequencer.run_sequencing(args.read_length) + read_sequencer.write_fasta(args.output_file_path) if __name__ == '__main__': main() diff --git a/read_sequencer_package/fasta_testfile/result.fasta b/read_sequencer_package/fasta_testfile/result.fasta new file mode 100644 index 0000000..7b9bb49 --- /dev/null +++ b/read_sequencer_package/fasta_testfile/result.fasta @@ -0,0 +1,300 @@ +1|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 481 bp +tgagcactcggtgccaagggcggggatacacagatggttggctgatacaaccgggactta +aattccctagactagatctgtgttggaacgcctctctacgagaaggcgaacgaactggcg +ccgaggcgatcgctaacatcttcgtctcgcttgaaccacacaatggatgattcctcccta +ggggtttgacaatcaacctggatagcgtttaatatagatggctggttgatttgtaaggcc +ttcacagactactcagagcaataagtgaccccccaacaatcagaggctgatcctctgctc +2|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 495 bp +ctgaatcaggtgtaggttctttttacgtcgtttaaggagctacacggtatcttgttttca +gttaaggtgccacacccccgggtggatcatccgtcagctttcctacaattaggtaactgg +cgggatcatttagtcttgtattaagacgctcgcgcccggggcggccggcttgtttgtgga +gagaaacaacaagtctgagtatagattaaatacaactggtttactggcaagtcagcgcgt +aacaaccggtgagccgctgcgcatgcttactgcaatgaacatcttggcacgatcctgcga +3|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 193 bp +acttcagtactggaaggatctaggaaccattaatgcgagtgtggtgacgccagacgaccc +ccggtgttctgccaccttctttggataggagaaccgtcactcgccccggaggccccacgg +ataagaagggtatcttgtgatcacgcgaatgactcacttgcgtaagtaatctaactttgt +ttttcgctataaaAAGCCGGGGTGTTGAAGTCTATCGTCGATGGCTGCAGTATTAAAGAT +CTTATGACTAATGCTCCAGCTTCCCCTCTGTGGTTAGAGACGAACTCGTCAGCCTCGATT +4|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 625 bp +acgtctggagcgtgggttgacccctgtacatggttctttccggatccttaacgtgccgat +acaactcaaaggtaactgtgcttaccacttccgaagctacatgcctctaacaaagtactt +tcgaggaggcactcaacccccggagatgctttgcgcggaagcagagatcgctgctcaaaa +tttggaatcactttcgtgcgagacccaaacaatttatggtggattcaagcgaacgagtca +tgattacagatctatcaatcgaggagaggacggcttcgccgtttccttttaatgtgaaac +5|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 845 bp +agagcgtacggcgcgcatcgtataccctacgagggcggcgtgtggaggaacgctgggctg +acactgtagaagattagatacacttgtccctaaaattaacccttaaccgctattagccgt +gaacgcttcctaatatttcaagccgtatagctaagtggagaatgtggagccctggtcaaa +tcacgagccaattagccctagacggacagcacatctcgtcgcgttaagcggaacactcag +cttttattacctagtgctcagcctggtttccatatgctctaaccgaactgatgcatactt +6|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 703 bp +tgcagtcgatgtgctattcgttttaggcagtctacgcgcttagtaactcccacggccata +gacttatctcagacatggaccatgtcgatatcggacgccgtcttaccacatttttcatag +cccttcataaggcagcgtgctcttactgcccaataaggtggacgattccgaccctaggcg +aaccagcgctatagatggaccttctaattgatgcgcaacgtgattgtttccttggtctgg +gttagcatttcggtagcctaacagtcactccagttcgctaactggcctggatgagggccc +7|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 243 bp +actctttagaatgggtttcactaatagtacgtgcatacaatttcgtcagaaagggcgctt +gctaagggacacggatcaatgatgaccagacttatggtgtcaggtctcactatattacat +atccggaacccgtgcccgcaccacgcgctgggtctaggcgaccggtgcatcatctccgcg +tctctagaggattctctcggtaaatgctgaattgcgtgagatcaaatccgtatgccagtc +atgAGGCAAGGCGTATAGATCTTTCCTCAGCATGAACCGGAACATCCTACAATCGCACGC +8|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 863 bp +attggcccggtccaggacagagccttatattgctactggtatgagaaccgttctgacgta +aacttgatggctttacgcctgcacgggcttcatacacacatgaccgtggacaaagtcgcc +caggccctcgaatagggtgtaatggttaacggttagtgccaccccaatgggtgcgaggca +gtaagagtgtcctatggcaaaactctcctcgtttcagaagggtcgctcctctagcctcct +tatcccccctataatagtactcgccgggtacgagccggagctccctcgagaagtcatcct +9|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 494 bp +aagcgaaactcctagaacttcccatcaggcaatcgtgtcccacgaagcacggatactacg +ggcactagttgaatggggggtttttttcgtaggtcgtaataggtactcggatagtcggcc +cagagttatgcttaagaatgcgctgcttaattcaatgtgactgccgttgtctccgatcag +atccaggtgatgattgcgatcgcagcgacatatgtctcgaaagacgtgtcgtgaataagc +ctgtaagcccaatgcaacatggttccctcaccttgtagctgatgtaccgtgtttcaatct +10|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 86 bp +atcctagcgccaaagatttactgttatggggtcgacgaacactagccgataatgccgtcc +tgggatctctagcctagtattatgcgGACGTTTCCAGCCCCGCTGACCCTGAGTCGGACG +TGAAAGCGAATCATCATATAAAGCATACACTGACTTGCACAAGTTGAAATAAGAGGTTCG +CTTAGGCTTGCCTCTCTAGTGGCGCAGCAGTACTGAGTGGGTTCTACTTACTCTCTGGAC +TCATGATCGTGTACACCGTGAAAGACGGCCCATGTGCCGTATATCTACCGTGCATAACCT +11|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 360 bp +cgcctgagggtcctaaatctgacgtatgatcgaagagattggaaggtcccggcgggtcac +cccacgttgcgatcatggccaaggccatggtttgctcaaaaatcccacattcgccgtctt +acgcgttaggacctcactatcccacagacggtgcgttaccttgtagttgacgcgggatcg +tggtgataacagctatttccgagacttcatattcttttacatagcggcttaccgtagtga +ctccatacattatttgcctattttgtagtgccccgaacagtaaggggaagccaactgccg +12|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 140 bp +gaattcctggggatttactcacccccgaggcggacaagatttccagctggatcaccgagg +gttacttaatcccttcgatgctttcaaaggccctaatcagtattgagcaacgaaagcgga +gtcgttagtgtccaagttgcAAATGGTATCGCAGAGCGGTCGGATTCGCCAAAAAGCTTG +GCCAATGGCTAAAATTTAGCTCTCGCCCTCGCGGACATTGTGGACATGTGCCTGACTACC +CATCATTCTCGCTCGGCACTAGGCCACCAGGGTTAACGATTTCGTAACAACGACCCGCAT +13|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 832 bp +aatactctcgttgaagcgtcggacagtaaagtgagagatttcggcccacggtagtcggac +attctcagtggggagcgaagagttgcgcttagagccgacgtacacgatataacctcaatt +gaaaatcgctatgtgcatcgttagggcctccggcgtgctgtttcggcagctgagtgtgag +ggtataacttaccttcgacccgaattgtctcgcggaaatcctaggcaagtaatccacttt +tggtacgggggagctagttcctctaagacgaacaagtgcactcttcacgtatagtgccct +14|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 296 bp +atcggggtgcgaaatcccctgagctggttgactacatacgtaaccacgttccgtgcgtca +tctaagcgtatcggctcatactggtggtaactagacttggtgaaccctaggtgccggcat +atcgaggtccgcatccaaaataactatcgctatagctacatagacatttactcgcaatat +tacacgaaccgtacgtccctcggtattaacgtaatggttaaagtctctaattccgctgca +gagcggcgggataaagacgccggtgtggcctgaatggtggatctgtccgtagtaccACGT +15|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 515 bp +accttcaatttgttcgcccgggacaagtagaaattactgtaaactaaacttaacctattc +cttgttaaagtccgcaccaagtgtactgtaagaatggtcgctcgtaataataacgagaag +atcctcgagccgtggtctgctgcaactaccttgagcggtacatcgatgtcccactctggg +cggggatcaggggcgagacttgtggtgaggccaaagaatggcgcatatgtaggcaccata +cgtcgatacgttccaggagtagaggcctcgaacatacaccacgataagtctacagacgca +16|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 820 bp +ccggctcaatcctgtagaaccgcgtacaacacacccaagctataccgcacacggcgcctt +agcaaccactgcttatctgcgtattatacctttacaatcattacatttgatctatctgtg +taccggttttttttgattcaattcgctggattacgacctcccggccaaaaattctcaatt +catcgttaacagacgtatttgaagataatcattcaacgtgaactagcacttggtcacttg +gtacgccaaccaagctgtgctttggggcaaccctttataactcacatgccgtcctaggac +17|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 791 bp +attgttagggcctgtccggaaaagatcaacggaagatattcaccagcacctatgctgact +cacgtagttcccgacgttcagtcccctccaacgtggaaggtaggacccatctccttaacg +ggatcgatcggtcttcctgtgaaagttgctcagagtcctcaaggacgtttttgggtgcgt +gtacggtatggttatggtacgtgtctgtgacagagggtattcttactggttaagtgaccc +atatgaccacctgacgcccgagcatagacctgtaggggtcgacgcgagagatggcagctt +18|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 328 bp +accgattacaggcagtcggccttgtccgctcgtatatccagggatgttccaccgaaagtg +ggagtgtggcacttattggtaaaaggcatttttacgaacgacactgataggattgatcac +tcaagaaatgttctcgaccctgaggtaggagtcttaacagacggacatcctccgtagata +cgtgagaattaagggacgcatgtcgaaaacgcttggaatctactgtagtggcccacctta +cgcttcttccaataactcccttcatagtccggcaacctcggtgggggtttcccttaggcc +19|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp +ggagtggaaaattctgtagtccgttggcggcgaccgcaaaccagaataatatggtcacgt +taggccctcgggccccttcatatgtacggagtcattgaattagcattatactaccgttac +gcaagaccctatcccatccgcgactgtcaccactgctgtaaggttgcaaggctgtttcaa +tgtaaagtaggcgaattctgacgtgggctgataacgaatcccccgggttatctagtgcaa +gtgctatccCAGCAGTAGCTGTTGACTCGGAGGCGTCAGAACTTCCCTCTAGTGTGGCAA +20|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 440 bp +atcttaaacagcccaatcggctcgccgaccaatttcccgcttcacagtacgcggaagaat +ctgcagatagaagtcagccctctcacgtcaataggaatgctgcccgtcatgtttaactac +tcaagttttaaggtgtcccttatcggttccaggatcatgtctgaaggaagatggtcgcaa +cgaaatctggagtggcatacatcgttcggtcgaagcataatctcagacgttatctataaa +gttagggcgctgtatggattgggattcaagctcgaagcctgttcctgccatacagcgcct +21|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 840 bp +cataactcgtgagtggccctgtacaagtcattgcatcacaatccttgcaatttgctcctt +tggccaagcgtacaagaccccggacccatacgctcccggctgataaactgctacagcatg +gtatatccggatgatgcccctgaaaactgcggaagtcaatttgttgatgaatccccgact +ttccgctgttcctgtggatggtcgaatgccaaatgaagagctgctccccccttctttaat +atcaagcactacaaagataaagcctgtttggctgacggcgagccctcccctatcgtacgc +22|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 234 bp +caccgcgaaagtgactcagttttcccggtcttatcacggtcgttgtcgtccagattccgg +ttgttaactgcgggagctataacacttattccttactgcgacggctgatccactaagaac +agttcatagagctcggctatataatttgaagacatagattccacggtacttgtagcccat +aaccgctgaggaggaacgtccaacggttcgcgcggagcatgtgacgcttaaaggGATGTA +AACAAAAGTTTCCAGGCGGCGGCGGTGTAGGCCGTTGCACTGTACTAGGCACAGCTACTG +23|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 917 bp +atcaagtgattacctggtaacccgccgctcttgcagtgttcaccctttgtgtcgtcttag +tgtttgtacacgttaaggaaaagcgttagcttaaccattacgccccccaaagcccggtgt +gtagttatctacatgccgtgtcaaagcggtgactaaatgtttatcaagttctgatgacaa +cgtgagctcttaaagccattgactagtataagcacggaacaatgataccaggcaagcttg +aatataggataaggcctctaagctcgaagcggatcttacggaggtgtgaatcaacagcac +24|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 676 bp +cacacggcatcgcaaagcgagctatccagagatgatacatgtggttgaaggtgattgcgt +caacatgggggttgctcagtttggttggtcaatcaacggtggcagaccatgcgataacga +tgatggtaagactgtaaggtaagttaaatactctcgtctgccagttgggtcgtcaacgct +gcagagacgccattcttcccagaaggtccgagctttctacagtgccgcggcgtcatgacc +aaaggggtccaacctcgcagtaaaatgtctatgcttctggtttggaatgagaccgggcca +25|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 870 bp +ctgatcaccaatagcttgcgcttaacacacgcgccttacaattatatgacgcccttgcca +atgacagatagagccattaatcgtggaaaccaggcatttatacttgtccgatgtatcgat +tctcctctatctacagagcccggacatgcgaaatatcaaaattccatgtatactgaataa +atacattgggcaagccgggctcatgcagcaatcccagcgttgccttacgcaaagatatct +tacggagttgcctttagattaacagcacgtgttcaaaaacctagccaactctgtcggtct +26|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 751 bp +gggtgcgttatggggactaaagactgttactaccggtactccgccttatagagccgtcac +gtattaatcagctatcaacagatactatcgtcacagccctccttctggcgaaggatctga +gcatttgcaaagctataagttggtacgcaacggtagagggcttcgtagtcggggaaaggg +cttgcagtagtataggccgtaacttatctgttgcaacctcaaccgcacgaatcgattact +ctataactgccctcaatacagtatggttaccagtcaccttcacactgaagattaattcgc +27|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 574 bp +gtactgcaccttgcactgctatctacaatgccgagggtcgccctagtgctttgcatgttt +ggcctctacctacgagtctacgcgggcgtttttaagcaagctacgatcatcttgatccaa +gggtacgaggccccgcagaccaatggaggtcgtgaccaccctcgtgtatgcctcgcacta +agcgagcattctggtatactgtctctctcctgtgataataacagtcggctcgatattcag +ttcacatgaaacagtatgttatataggtgggatggttataacacggaaaggtgaaaaaga +28|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 169 bp +agctccctaaacaacacccgcgtaaaaccttcagttatggtgccgactaaccctgtggat +gtcttagcgctctcgttccgatgggtgctgatactagtaaatgagactcgagaccgagaa +cacgcaacggctacaacctggtcggttgttggggtttttataatcagtgTACGAAATAGT +AGACCTCGCCCTGTAGTTGGAAATTACCGTCATTGGTCTATACAAAGCGCGTGCTTCAGG +ACTCGGTGCCGAATCACTCCGTGCCAAATTAACGAAAGCTTCTGATGTGGATATGAGTTC +29|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 408 bp +tgcagtgatgcatcgataagaccgcatagttacctccttacaggtgacgctaggctaatt +gggagtgctggcacttgtgccctacagtcaagcgctcacgcggtgttctcctcccgcaat +cttagatattaggctctgtaccgcacgaaggatgaattttcttgactattggtccctgtt +tacgagggcttacctagagtgaggatgaacataaacaaggcctacttgacttaaggcttc +caaatcacttgagggcaaatgactcctcaaacgcgagtgccagtactatccgtgagggaa +30|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 52 bp +caaagcgattcgggttaacgcacttaagagttcgacgtaggttagtcccctcTGAAGCTG +GCCATAACAGGCAAGTTATTGATTGGACCTTACGACGTACATGGCATTGCGAGACACGGA +AGCTTGAGCACTCATAGATCTGCCAGTCCCAACGCAAGTTGTTTTTCCCGCATCATTCAG +AAGAAGAAGACGGATTGAACCAAACTGTCATATCAGTGGTTAGCTGCTGTAAATTCAGCG +GATTGAGGGATGGGTTTTTCGGAACATTATCTTAGCGGGAACAATAAATGCGAAAACCAG +31|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 581 bp +tgctctgacgtgtaagcgccttcgataacgtctttgcagcgccccacaaagtaaggaccg +gtctaacagggcttccgaatcaatagactgatagtaatgggatcctgaggctgggacccg +acacacggcatattttactagaaacgctgatttaaactccaattatccttgacgcactga +gccacagtcttagacgcagaatgtccgcaggagccctgtctttcccctaaatcattcgcg +gcatttgtttacgggttaagtcctgcggatcctagagtctgggccccgtacaaccaggaa +32|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 249 bp +gcggaactacctctctaagaccgcacaacaagtgtagtagatgaagatcacgcagagtgc +tcggcactgcatttttatacgtcgaatcagaaacgaggttcctcctctaggcttgttaaa +aatccgggcgcgatgggctggtaatctgtggccatgggagcctcgccatttaaagatttt +ggttaaggctcctctgttgtgtccatcacccttgaacgagcccgtacaaaccgtgtacga +tgttgacacTGCAGAGAGGGCGTTTCGTACCATAACATGCATCCTAGGCGGTCATATTGT +33|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 297 bp +gccctcgcgccagcttacttttagaaaacatcgaccggtaagagatacctgggtgagctg +ggcttcacgacatgttcttaaatcaatactctaaatctgctttgtagcatgcctcaagta +aaaaaatgtgctggttccgcacaggtgtgacgattaacgttgcgcccgtttgcgtcagtc +cagatcaccgatcttccacaccaccggtgggctgccggactgcaggtaatgactcctggc +tgcattctctgacataaaggttgaatagaacggcgtccttgagaaggttatggaacgAAA +34|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 573 bp +gcctaggggtcttgaccacagggagtacgagcattgatcattggagcaggtggctaatat +tgatagtggttagaccaccggcgcatcatcgtacgagcgcgggcgatacgtgtctttcac +cggcgcactaatcttatcttacttctcaagccccgacagcatgtacgccaagtgttgttc +tgatgaaactttcgaaatagcaactgttagtcagttatagttggggagggcagtgaatac +ctcaaatacacccaagaaataacttcgaagcggcgcctatatcacacccctgtttcttat +35|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 559 bp +gaaaaagtcgccccattcagttacaatcgtcttcagaagccagctcggttggggctatct +gcggggtaatgcaacagggggctaccagacggtaaaccagggtcttgctattggtgttac +gaaacaaaggagctatgcgacctcattagatcgagattactctcacaggcagctccggcc +atagcacaactaatttcgggtgtggagctcaccacaggaacatcttgtgcgtcctttgtt +atttaattgtgcattgtaatgcaccggaccccgggaacatacagccattatctgtgttgc +36|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 187 bp +ctaagtccttatctatgatgcatctttcgttactgcgacaatatccgagacgagcagagt +tacacgccgaggtgtaaacgaatacgattgctatatgcaacgagttggttacacgcgtga +aggcgaatgtggatgctgcacttggagtcccattttaccggccgcacgtgctagctcact +caccttgCAGATATACACAGCTCGGGGCTTATGTGAGGGCTTTCTGTAGATCGGGAGCTT +ACCAAAATATTATGGCAAGGACTCACACTTTGATATACGCTTCACAATACTAAGTCCAGT +37|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 549 bp +ttcatatggggatttggaatcgggtttgtgcggaatatgcccacgagactgcttatgtca +acgagacgacccattgtcacgttgtaaggccaccaataacacacaggtcttcgtttgctg +tctcagggcaatcgcatcgacaacatcgtatggataccgttttttatcagcttacggcgc +atcatactaataaggtgtttgagagggcgcagactcgaagcagtgtgatcttcccggttc +gaagatgcaaaaacggtcctatttcgatccaaaactcagcgcactagtccaatgcttttt +38|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 916 bp +gtggcctaccataaatcaatttgggttaacgctctttgatctacgcactatgttgattca +cttaccccttgtcaccgggcagaagagagccagtttaggtgtggttgtatttgccaaacc +gcaaaccgcctaatgagctggatccggccatggaattaatcccgtcgtttgactcgaggt +gttcaaagactgtgcaacacgacgtgcattcatcactagaacttaatctagaccaggcct +tgtggccaggagaggcgacgtgatattgccctatacacagataattatatacccctcgcg +39|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 848 bp +accccggtcgctttggccggtcgtagccctaatcaattctgttcgtatcactaaagtaac +ggtttgaaatcctttgcaaacttgatctgggtatatgaaccggtatgcggggatagtggt +aaataagtagtttacgagctgagcgtggattatcccagagaagttgccttaggtccagag +cccgcacctacaatcactcgaggccggtcgagcgttgcgtggcaaggaaacccagccggt +caccctaccctcaaactcacgtcattgatccaatcatacatggcgtctctcacggtggtg +40|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 289 bp +agagcaaaagaaagtctgctccgcgtgacacacttgctcgttgtagtaactgcacgcgcc +gtctactcgacagggaccccccgtcggttcctctctatagcaatcgcggaagtggttccc +tgcctcccgcgcagaagttcaaactagtaatccttaatgacttgtggggggggagatcag +tttcttccacaatggagtaaacttatgcgagaatcaagatcgcagaggccattttttgat +gatactgtcagatatgtggttagccgtatcacgttaccgacgcagaattTGGTGAAATAG +41|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 642 bp +ctggggaatgaccgtaccgatctaattccccgtcgaaaaacttatgacgcgcagttgtcc +ttatgcttgagacatgaatccttgccccatattggcgatcttggccaatgagatctgtcg +aaagtactggaggccggtaaattgggggctctagaggtccgcccctgaaggactaacgtg +tgtgtgtgtctacgtgtcgggttatcagcgtgttggacgatggccgtggattcaacgcat +gctagagagctaatgatcctccgaagtcaaaagcctcagtgcttcgatttatgagcgcgt +42|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 993 bp +gcaagaagccaaaaaccttgcaggaggtcatttaagtttacccgcgcataagcagagacg +gacctctctgagatctcgcaccgcgcgcccggccggcactatcgatgctagactagggtt +ggtgactagcccgtcaaaaccagcctaaacgcaaagattgtaggcgctagtccggaactg +actgcttcgtgtcggtgggagcctagtatgtttccgggtctatgacccctaaaatcatag +acgtgtcttaatagctatacctgacttactttgaagtacttgccacgacgagtttatgag +43|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 473 bp +ggttgtccaggcgcgagcaagtagctgactcgctaatcttaacgagtattgcttaggact +tccaaatactccaagacgtcaatacgctttatctttgtgaagtcatcccggaccgagcgc +ttgggtcgtgatttaaaatcccctgtgatgtggctacaggtgcggcctatacagccgaga +agaaggccgtctttaggcgtccaatgaaccgttacagggacacaccaaactgcgccaact +gatcccacgggtcacggtacgctctaagaccagtcgggattctgacttaacatcgcagca +44|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 272 bp +gtagaacttgttccccatggacaatgctagttccgttaatgccaggtattcatgtgccaa +gcgcctgcctggggaatacgagcctctctacaaacttacggccaccatgcttaaagattc +ggtgacttcactaatgacctatacaagtaatgcggaggacgctgtcgcttattgctcttt +gctaaggccagttatgtccgtcagtcaacgatacgctgcggcggtgggtgacggcactag +accggaagcctgatgacaagttcgaatcaataGTCGCTTCCATTACACTTGCGCTATTCC +45|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 860 bp +aaaagcatcactctaacgacgctaccgtctgaatagatcaagattgctatcggttcgacc +ttgatcgcatgtgaacccgcccaaaaacccgtctcgacaaaagttacgtcgcatgggctg +cgccaccggatagctcctagcttatcttataaatcaggtagagctacaacatggtgctat +gacaactggagtgtcatcgctttggcgaaaccgtaaagggtgggaattgctgcattctca +actgggccgaactattccgcattcggctgctcacaaatcgtggaatgtgtccttgaacgt +46|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 884 bp +aagcctctacaggctctgcggtttggctttacttaacggtgagtcaggaaaacattactg +ctacgttcaccgtgttcagagatagagagtacattagggaccaatcacaacgttcgccag +ggcaccgcctaatccgcgttgttagcaagagtacaggctctcgtatactttcagaccctt +caatactagacgacaaattgcagcccggggtcatcggtcgactcagatacgtgctaacga +gtaccaggtctaccgttgcaacgttggatgcgttatactcggcataaggcgatgcccttt +47|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 888 bp +gcgccttgaagaggcgaggtctaaaggcaaaaatttagatccgccctatgagacggccga +cgcggagaattccctaaccactattgtcctctgcatcgatatcaggaataggcttacctg +caatctcttatggtgatagactgtttgggagctgaacctgagacgcgcacgaaatttgga +aggatcaaataggccccgcagtctctggtagacttctgccgagcggactagcttggctaa +ggtgtacaagcctaaatcgtttttcacatcaattttatagctgattatagaggaacgacg +48|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 588 bp +catcaagatgggttacgtaggaccgagattcagtctctgggttagagccgacagcggggc +cgctacatagtacacggcgaggaatgcggggttgggctgaaccgtacacagtgggctagc +tgcggtacctgccaccggcatgcgtttaaatcctttcctttggcgaagccaactgccgac +gtccgcaacagagactcgttttccgaccccgttactaaatcagctaactggcgcctgaat +cctcttacgtcggatgttaattagtgtatagaatatcggagggttgagtgcgacgcgctt +49|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 626 bp +taacctcagtctcgttcccccctcggtagttcggacccttattcgcttatctcacattca +tcactgtagaccaaggaccgggcatacttgcggatatctaccaggactaggcacttaggg +atacgctgttgaatacgggtttcgtcccgtgtactcaagtgtagtttaagataggtacga +gtgctagtacatcgtacaatttacaactgacttaaacgagagtttattatgtcttgttca +cttgttgacacgcctgggaaaataataaaaggcaacgtctaatctcagacccgttgatta +50|random sequence|A: 0.25|C: 0.25|G: 0.25|T: 0.25|length: 214 bp +taactgtcggtcactgctcatcccgactagttcggctcactagacttactcgcggaagcg +agaagtaggacgtcgtgtaatactccaacgtcgttacgcaatgttgtaaaacttcatcgc +attccgtgcatggcctaaacgtgcagcattatataacgctctttggtcttaatatccatc +gcgggagtaacgcgaaggggagacgtgtgcctgaACGAACGCTAAACTAGGTACTAAGTC +GTGAAGCTCGGGTGGAGACAGGTAAACTGATCGCAACGTATCAACCAATTCTGGACCCTA diff --git a/read_sequencer_package/generate_sequences.py b/read_sequencer_package/generate_sequences.py deleted file mode 100644 index 0125bbf..0000000 --- a/read_sequencer_package/generate_sequences.py +++ /dev/null @@ -1,24 +0,0 @@ -import random - -def generate_sequences(n, mean, sd): - """ - Generates random sequences. - - Args: - n (int): Amount of sequences to generate. - mean (int): mean length of sequence (gaussian distribution). - sd (float): standart deviation of length of sequence (gaussian distribution). - - Returns: - list: of n sequences - """ - dict1 = {} - for i in range(n): - keys = range(n) - seq = "" - nt = ["A", "T", "C", "G"] - for value in range(round(random.gauss(mean, sd))): - seq = seq + random.choice(nt) - dict1[keys[i]] = seq - return dict1 - diff --git a/read_sequencer_package/modules.py b/read_sequencer_package/modules.py index ca84a64..854aaee 100644 --- a/read_sequencer_package/modules.py +++ b/read_sequencer_package/modules.py @@ -1,3 +1,26 @@ +def generate_sequences(n, mean, sd): + """ + Generates random sequences. + + Args: + n (int): Amount of sequences to generate. + mean (int): mean length of sequence (gaussian distribution). + sd (float): standard deviation of length of sequence (gaussian distribution). + + Returns: + list: of n sequences + """ + from random import gauss, choice + dict = {} + for i in range(n): + keys = range(n) + seq = "" + nt = ["A", "T", "C", "G"] + for value in range(abs(round(gauss(mean, sd)))): + seq = seq + choice(nt) + dict[keys[i]] = seq + return dict + def read_in_fasta(file_path): ''' @@ -36,7 +59,7 @@ def read_sequence(seq, read_length, padding_probabilities=None): if read_length >= len(seq): for nt in range(len(seq)): sequenced += seq[nt] - for nt in range(len(seq),read_length): + for nt in range(len(seq), read_length): sequenced += choice(bases) else: for nt in range(read_length): @@ -47,7 +70,7 @@ def read_sequence(seq, read_length, padding_probabilities=None): def simulate_sequencing(sequences, read_length): results = {} for index, key in enumerate(sequences): - results[key] = read_sequence(sequences[key],read_length=read_length) + results[key] = read_sequence(sequences[key], read_length=read_length) return results @@ -63,7 +86,19 @@ def write_fasta(sequences, file_path): outfile.write("\n".join(wrap(value, 60))) outfile.write("\n") -def run_read_sequencer(input_file_path, read_length, output_file_path): - sequences = read_in_fasta(input_file_path) - reads = simulate_sequencing(sequences, read_length) - write_fasta(reads, output_file_path) +class read_sequencer: + def __init__(self): + self.sequences = {} + self.reads = {} + + def add_random_sequences(self, n, mean, sd): + self.sequences = generate_sequences(n, mean, sd) + + def read_fasta(self, input_file): + self.sequences = read_in_fasta(input_file) + + def run_sequencing(self, read_length): + self.reads = simulate_sequencing(self.sequences, read_length) + + def write_fasta(self, output_file_path): + write_fasta(self.reads, output_file_path) -- GitLab