From e4c0da92d7cfb711e2e2fced78ba3ecc191ab86d Mon Sep 17 00:00:00 2001 From: Christoph Harmel <christoph.harmel@unibas.ch> Date: Mon, 28 Nov 2022 11:10:20 +0100 Subject: [PATCH] feat: rewrote ReadSequencer class, added batch processing --- read_sequencer_package/read_sequencer.py | 270 ++++++++++++----------- setup.py | 2 +- tests/fasta_testfile/result.fasta | 200 ----------------- 3 files changed, 145 insertions(+), 327 deletions(-) delete mode 100644 tests/fasta_testfile/result.fasta diff --git a/read_sequencer_package/read_sequencer.py b/read_sequencer_package/read_sequencer.py index aefc540..585e639 100644 --- a/read_sequencer_package/read_sequencer.py +++ b/read_sequencer_package/read_sequencer.py @@ -1,131 +1,149 @@ import logging -from random import choice, gauss -from textwrap import wrap +from random import choices +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from collections.abc import Generator, Iterator LOG = logging.getLogger(__name__) - -def read_in_fasta(file_path: str) -> dict[str,str]: - """ - This function reads in FASTA files. - - Args: - file_path: A file path directing to the fasta file. - - Returns: - Dict: It returns a dictionary with sequences. - """ - LOG.info("Reading in FASTA files from destination.") - sequences: dict[str,str] = {} - f = open(file_path) - for line in f: - if line[0] == '>': - def_line = line.strip() - def_line = def_line.replace('>', '') - else: - if def_line not in sequences: - sequences[def_line] = '' - sequences[def_line] += line.strip() - f.close() - return sequences - -def read_sequence(seq:str, read_length:int) -> str: - """ - This function reads a sequence of a specific read length and adds additional nucleotides if the sequence is - smaller than the requested length or cuts the sequence if its longer. - - Args: - seq: the sequence to read - read_length: length of reads - - Returns: - str: returns sequenced element - - """ - bases: list[str] = ["A", "T", "C", "G"] - sequenced: str = '' - if read_length > len(seq): - for nt in range(len(seq)): - sequenced += seq[nt] - for nt in range(len(seq), read_length): - sequenced += choice(bases) - else: - for nt in range(read_length): - sequenced += seq[nt] - - return sequenced - -def simulate_sequencing(sequences: dict[str,str], read_length: int) -> dict[str,str]: - """ - Simulates sequencing. - - Args: - sequences: Dictionary of sequences to sequence. - read_length: length of reads - - Returns: - dict: of n sequences as values - """ - LOG.info("Sequencing in progress....") - results: dict[str,str] = {} - for index, key in enumerate(sequences): - results[key] = read_sequence(sequences[key], read_length=read_length) - LOG.info("Sequencing was successfully executed.") - return results - -def generate_sequences(n: int, mean: int, sd: int) -> dict[str,str]: - """ - Generates random sequences. - - Args: - n: Amount of sequences to generate. - mean: mean length of sequence (gaussian distribution). - sd: standard deviation of length of sequence (gaussian distribution). - - Returns: - dict: of n sequences - """ - LOG.info("Generating random sequences.") - sequences: dict[str,str] = {} - for i in range(n): - seq: str = "" - bases: list[str] = ["A", "T", "C", "G"] - for nt in range(abs(round(gauss(mean, sd)))): - seq = seq + choice(bases) - key: str = str(i) + ': length ' + str(len(seq)) + ' nt' - sequences[key] = seq - return sequences - -def write_fasta(sequences: dict[str,str], file_path: str) -> None: - """ - Takes a dictionary and writes it to a fasta file. - Must specify the filename when calling the function. - - Args: - sequences: Dictionary of sequence. - file_path: A file path directing to the output folder. - - """ - LOG.info("Writing FASTA file.") - with open(file_path, "w") as outfile: - for key, value in sequences.items(): - outfile.write(key + "\n") - outfile.write("\n".join(wrap(value, 60))) - outfile.write("\n") - class ReadSequencer: - def __init__(self): - self.sequences: dict[str,str] = {} - self.reads: dict[str,str] = {} - - def add_random_sequences(self, n: int, mean: int, sd: int): - self.sequences: dict[str,str] = generate_sequences(n, mean, sd) - - def read_fasta(self, input_file): - self.sequences: dict[str,str] = read_in_fasta(input_file) - - def run_sequencing(self, read_length: int): - self.reads: dict[str,str] = simulate_sequencing(self.sequences, read_length) - - def write_fasta(self, output_file_path: str): - write_fasta(self.reads, output_file_path) + def __init__(self, fasta: str = None, output: str = None, read_length: int = 150, chunk_size: int = 10000) -> None: + """ ReadSequencer class + Args: + fasta: path fasta file + output: path output fasta file(s) + read_length: read length, defaults to 150. + chunk_size: batch size used for memory efficient processing, only used when number of sequences greater + than number of passed sequences. Defaults to 10000. + + Returns: + None + """ + self.fasta = fasta + self.output = output + self.read_length = read_length + self.chunk_size = chunk_size + self.random = False + self.bases = ('A', 'T', 'C', 'G') + self.n_sequences = None + + def get_n_sequences(self) -> None: + """ + Helper function to detect number of sequences present in set fasta file. + + Returns: + None + """ + self.n_sequences = len(list(SeqIO.parse(self.fasta, 'fasta'))) + LOG.info(self.n_sequences, ' sequences found in ', self.fasta) + + def define_random_sequences(self, n: int) -> None: + """ + Defines random sequences. + + Args: + n: number of random sequences to be generated + + Returns: + None + """ + LOG.info('Set mode to: generate random sequences') + LOG.info('N random sequences: ', n) + self.random = True + self.n_sequences = n + + def generate_random_sequence(self, length: int) -> Seq: + """ + Generates random sequence. + + Args: + length: length of sequence + + Returns: + random sequence of length n + """ + seq = choices(self.bases, k=length) + seq = Seq(''.join(seq)) + return seq + + def resize_sequence(self, record:SeqRecord) -> SeqRecord: + """ + Resizes sequence according to set read length. If sequence is shorter than read length, fills up + with random nucleotides. + + Args: + record: SeqRecord + + Returns: + resized SeqRecord + """ + if (len(record)) >= self.read_length: + record.seq = record.seq[0:self.read_length] + else: + n_add = self.read_length - len(record) + add_seq = self.generate_random_sequence(n_add) + record.seq = record.seq + add_seq + return record.seq + + def batch_iterator(self, iterator: Iterator, batch_size: int) -> Generator: + """ + This is a generator function, and it returns lists of the + entries from the supplied iterator. Each list will have + batch_size entries, although the final list may be shorter. + + Args: + iterator: iterator object generated with Bio.SeqIO.parse() + batch_size: batch size to use for the generator + + Returns: + list of entries from supplied iterator according to batch_size + """ + batch = [] + for entry in iterator: + batch.append(entry) + if len(batch) == batch_size: + yield batch + batch = [] + + def run_sequencing(self) -> None: + """ + Runs read sequencing of specified sequences from input fasta file or generates random sequences for a given + read length. If number of sequences exceeds chunk-size, it will switch to batch processing mode. + + Returns: + Writes processed sequences to output fasta file(s). + """ + if self.random: + if self.n_sequences < self.chunk_size: + with open(self.output, 'w') as output_handle: + for i in range(self.n_sequences): + record = SeqRecord( + self.generate_random_sequence(self.read_length), + id='random_seq: ' + str(i+1)) + SeqIO.write(record, output_handle, 'fasta') + else: + for i, batch in enumerate(self.batch_iterator(range(self.n_sequences), self.chunk_size)): + filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1) + with open(filename, 'w') as output_handle: + for j, k in enumerate(batch): + record = SeqRecord( + self.generate_random_sequence(self.read_length), + id='random_seq: ' + str(j+1)) + SeqIO.write(record, output_handle, 'fasta') + else: + if self.n_sequences < self.chunk_size: + with open(self.fasta) as input_handle, open( + self.output, 'w') as output_handle: + for record in SeqIO.parse(input_handle, 'fasta'): + record.seq = self.resize_sequence(record) + SeqIO.write(record, output_handle, 'fasta') + + else: + record_iter = SeqIO.parse(open(self.fasta), 'fasta') + for i, batch in enumerate(self.batch_iterator(record_iter, self.chunk_size)): + filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1) + for j, record in enumerate(batch): + batch[j].seq = self.resize_sequence(record) + with open(filename, 'w') as handle: + SeqIO.write(batch, handle, 'fasta') diff --git a/setup.py b/setup.py index 905f8ee..0c45a41 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,6 @@ setup( author_email='christoph.harmel@unibas.ch', description='Simulates sequencing with a specified read length from sequences specified by a FASTA file.', packages=find_packages(), - install_requires=['random','textwrap','argparse','logging'], + install_requires=['random','Bio','argparse','logging'], entry_points={'console_scripts': ['read_sequencer=read_sequencer_package.cli:main']} ) diff --git a/tests/fasta_testfile/result.fasta b/tests/fasta_testfile/result.fasta deleted file mode 100644 index 32258d6..0000000 --- a/tests/fasta_testfile/result.fasta +++ /dev/null @@ -1,200 +0,0 @@ -0: length 103 nt -ACGCCCTATGTCAGAGTGGTGTTAGTGCACAATTCATTCCGATATGCGAA -1: length 35 nt -ATAACGAGCGCGAGCCCTTAACGGGCAGGACTGCCGACTCACACCTTAAT -2: length 61 nt -GGTCTCATAAGCACTTCGGGTACATCCTCCTAAATGGGCCGCGACTTAGG -3: length 54 nt -TAGGTCTCGGATACGAGATATAACCGTAGCATGGGAGTGACACCCATGCC -4: length 64 nt -TCTGCGATACCCAATTTAGGGCATGGCATAAGGCCCGGCGGGTATTTCCG -5: length 85 nt -TTGTTAAGCGTGTTGGTAGCGCCCCCAAATTGTCCCAACTGGGCCACACC -6: length 53 nt -ATAGTACAAGTTAGACCTCATGCATTGAGCTGCCCGGTTCGGACCTTAAT -7: length 27 nt -TAAAATATTAAGGATAGCCGGAACGCGCTGGCGACTGACGACACTACACG -8: length 25 nt -TAGATTCCAGGGAATGACGTCACACCGTTCCGTATATCAGCGGCGCGGAA -9: length 52 nt -GGTGCACCTATTAACAGAGTGACTTTGGGGGAATCACTTATTCGCATATA -10: length 41 nt -AACGACAAGCTGAACGAGCTCTGGCGGCTACGCAGTGTTCGCGCAAGCGA -11: length 106 nt -GTAATAAACTAAACATCAGCTGAGGCGTCCGACCACTGCATCTTTTGTTG -12: length 5 nt -GTAATATGGTATAGCTCGACCCTCGTACAACGGGGGTCAGACAGCGTTGC -13: length 49 nt -TTACTTTGCACGCATGCATGGAGGCACCATGAAAGACGGGAAGCGGCTAA -14: length 22 nt -GCAAGGGGTTTGAGGGAGCAACGTAAGATCAGAGATAACAGTCTCAAGAG -15: length 109 nt -AATTGAAGTCAGCAAAGAATTGGGATTGCTGGAAGCAACACGGACACGTT -16: length 43 nt -GAACCATTCTAAACAGCACCACCCGCATCCCGCTTTGCGCAGCAGCCACT -17: length 85 nt -ATGTCTCACAGGGGAGGTCTCCAGGCTTCTACCAAGTTCCCAATTCCATG -18: length 16 nt -ATCTCATCAATTATGTTGTACACCTGCTCGTTCTAACTTGCAGACTGCAT -19: length 52 nt -GAGTATCGCAGTATACTGTTGGCGGACATTCCAGAATATTGGTATTCAGC -20: length 2 nt -ATATAGCTACCCCAGATTGTCTTAACGAGTCCCGCACCCCCCACCAAGAT -21: length 13 nt -GTATTTACCTGCCCACTAGTACAAGTCCCCGAACCTTAGCAATCCTAAGT -22: length 64 nt -TGGCCGTTTCAGAAGGATTCCTGCCCCTGTGATGAGTCACGTGGAGACAG -23: length 59 nt -CGAGGAGAGCTGTAAAATACTGCTGCCTGCTACATTGGGGCTTGGGTCTC -24: length 82 nt -GCACAGTAGCGAGTGACTCGGTTACTATGTGTTGCTGACTTTATATGTGC -25: length 6 nt -GAGTAAAGCTGAAACACCGAACCGGCAGTTAGAGCTAGTGGATTTTCTTA -26: length 87 nt -AGACATTTTCGGAACCAATGATCTTGTTGGAGTTGAGAGGCCGAACCAGG -27: length 136 nt -AACACGGTAATATCAGGTCGGCCTTAGTCACTTCGGAATTGGAGTTAGTA -28: length 68 nt -CACCGGTAGCTGAATGTAACGAATCTACGTTGACACGGTTAAAGGGATAT -29: length 84 nt -AGATTGTCATGGACTGACGGAGAACACCGTGCTTAAAGTGGAATAATGTG -30: length 51 nt -CGGCAAGGAGGAGGCTACTGACCAATCAGACAAACGTTCATGGCTAGAGA -31: length 51 nt -AGCAGCCGCTCAACTGTGACGTCGGCTGAGTCTACCAGCCCTCAGACTAC -32: length 73 nt -TCCGATCTTATAGTAGTGATCCTGATGGAACAATAACTGACCCAGTAAAG -33: length 58 nt -ACGTTTTATTGTACCTAGGTGTGGCACCATGAAAGGCAGGAGGCCCACCG -34: length 28 nt -AATCCGCAATCGTGCTACGCATTTTCCAGACAATATTTCGAGGTTACCGC -35: length 13 nt -AACACGAGGCCCTCTCGACGCTGCTATTTAACAGCGCGTGCCTATTGAAA -36: length 2 nt -ATACCGAAACTTACTAGTTGTGACAGCGGATTCCAAACTGCGGGTGAATA -37: length 71 nt -GTGAGGAGTACTCGGCACCGGGAACGTAGTAATGTGCTCTGATCCACTCT -38: length 46 nt -CGAACGTAAATAAGTCTGCTGCTGCCTCAGCAGAGGGATGGGAGGGGTCT -39: length 48 nt -CGCAAGAGTGTCGAGCTGGCAGACGCGTATTGCCTTTATATCCGTGAAAC -40: length 63 nt -TGACAAACATAAGTTTGTCGCCCGTTTCCACATAGTAGAATTCATTAGAA -41: length 11 nt -AGAAATAGTACGTAGTGTTGGCTGTGAATATTCGGAGATAAGGACTCATT -42: length 68 nt -GGTTCACAAATTGTCATAGATCATCGTGGTACGACGGGCGATCTTATTTC -43: length 7 nt -CGAGCGTGATGACGTAATGGAGTGAATAAGGAGTGCGAAATGGTGCGTTT -44: length 84 nt -TGGGACGATCATTCCCCCTGACCTTACGTACAATGCAATTGCGAGTGGCT -45: length 23 nt -TTGTACTTACGGCTTAAATTGCACAGAAGGGGTTCTCCGGAAGATAACTA -46: length 58 nt -GGCCCTTCGGTGCAGTCGGCCTAGGTATCAAGCTCGCCTGCCCGTGTTAT -47: length 50 nt -TATCGTGGTTATGTGCGGACCGCCTGATATTACAGCTGATGTGAGGGCAA -48: length 63 nt -TCTGCTAGGGATCCACTCCTGGGTCATTATTAAGCACTTACGGCCTGAGA -49: length 48 nt -GGAGAATTCGCGGCATAAGATCTCCATACGGGTAGTATGTGCCACTATTA -50: length 64 nt -AATGCGTACGGCCCAAAAACCTTCAGGATTACCATGAACCCGTAACTCAA -51: length 41 nt -AGTCATATAATGATTGAGTTCCCTCAACCCTCCGGACGTAAACACTCCGG -52: length 44 nt -TCCAGATGGCCTGGCAATCAGGTGTTGACGGGAACCGCACCCCTGCTGCT -53: length 62 nt -AGGTTGTCTGAATCACTCGGGAGACTTCCCCGCACGATATCGGCACCTGA -54: length 73 nt -CCGTACGGAACATCCTGCAGCTTCGCCCGGTCGAGAAGAAGGTAGAGATA -55: length 39 nt -TGCGCGGCGGACTCTGTCATTTATGACCTGCTTGAACGATGAGTGGGCCA -56: length 88 nt -CGATCCATAGGGCACGCTAGTTTGCATTTTAAGGAGTGCCAATTACGAGG -57: length 117 nt -TATTGCGTCCGTTGCCTGCTCCATCCGTTGCACTATGCCATAAGGCAACA -58: length 67 nt -TTAACGCACGATCAACGGTTCATAATCTGCGGTGATCCCAGAGCTCTACA -59: length 84 nt -CTGACCCTCGATGTGTTTAATAGTTGTATTTCGGCCCCAGTAGGCTCGAC -60: length 71 nt -CTGTCAGATGCGCATGTTCGTGGTCCATTGTCAGCCAAATTAAAGGTTGC -61: length 102 nt -TACATGCTCATATGAACTGACATTGAGTGGAAGGTAGCGACCGCGAGGTT -62: length 66 nt -ACATGATCTTTGTCGAATACTCAAAATGCACAGTGCCTAACTATTTGGGT -63: length 22 nt -GTCACGACACGCATCTACACTAAACAGCACCACATGTTAGAGATGAGATC -64: length 46 nt -CGTGGTCGTGAAGACCCACTATATTAGCAAGAGTCGCGATAGATGGGTGT -65: length 78 nt -CTAGGTAATTTTGAGAGACACCCCGACGCACGGATGTTCCGTACAAAGAT -66: length 32 nt -CAGTCTGAATGTATCAACTAAGAGAATCGTACGAAAATAGACCTAATCGA -67: length 42 nt -GCTCCGCACACTAGATAGCTGATGCCCACATACTCATTCATTATTCGATG -68: length 49 nt -GTCTGAACCAGACACGCACTACGGCTAGCACCTATCCTTATCCTAAAACG -69: length 41 nt -GAGTTACGACCGGACGGGTTGTTAGGTCACCCCGGAATCAGACGATTTAC -70: length 29 nt -GTAGGCCAACGCGAACTACTCCTCCCGCCCACAGTCTAGACTAAGCGTTG -71: length 94 nt -GGTTTAGACTGTCTGCAACTATGTTACTGGATATACCTAATGACAGGGGT -72: length 48 nt -TAAGAATGTTATTGGGCGGTAGCTTACTCTATTGGAATTGAGAAAATGTA -73: length 63 nt -CTAGTATGTTTTATTGGATGCTTGAGGTAGGAATAATCACATCGGGCCGA -74: length 70 nt -ACAAACCTCCTCCCCGGCGACGGGCAACGCCCGCATTGCAACATGCAAGA -75: length 58 nt -GGCTCCATTTCGCTACCCTCTCCCCTAAATCAGGTCCGCGGAGTTGTCGT -76: length 98 nt -ACATACCTATGCCTCAGATCGGTTCGGCTCTTGAGGCGACACCCGTGATA -77: length 34 nt -TTAGAAAGGAACTCATTTTAAGCAAGTTATGCCAGGTTAGCCCGCCCTGG -78: length 48 nt -AAGGCTCGCCGTGACCAGTAGTGCGCAACCTTTCAAGCCGTGTAATATTT -79: length 51 nt -CCTTAAGGGTGGTTTGCACCGTAAAGACCCTCTCTTCCACTTTCCCCGAA -80: length 30 nt -GCCTCGACGTGCCGGACGATGGGCGCTGATCGAAAGTCCATTGTCCCCCT -81: length 14 nt -AGGGCAGTGTACACACACGGCAAGGGTTGCGATGGGTAGAGGCTAAGACG -82: length 39 nt -AATGGGACTTAGTAAACAGGCTGGCCGCCTAGACTGTGTGTGAATGCCCA -83: length 29 nt -AGCTTGGACTTAGAGAAAATCAGTCTACAAGACATAGGCTTTAAAGAGGA -84: length 15 nt -ACGGAAAGTAAGTCTCTCGTCCCTGGTTAAAGGGTCAGTCCCTCCCACCT -85: length 20 nt -CCCAGAGCAAACTGGACCTCCCACCGGCAGATCATCGTTTATTATTATAC -86: length 64 nt -CGTGGCCAGTACATGCTGATGATCCCTATTACAGACCTCGCGTGTTGAGA -87: length 52 nt -GTTAGGCTGTACTGAATACCATGAGCATGTGGGTAGGTTGTACTGGAACC -88: length 60 nt -CGCCGGGCTCCCCGAAGTATAGAGCCAGCGGTTAAATAACTTATCTGATC -89: length 32 nt -CTTGTACGTCGAGTATGAGAGGGGTTACACTTTCTGCTTGCTAAGTTCAG -90: length 14 nt -AAACAGATTAATAACGTAATACCCCCGCTCTACGCCCCTGCATTACGGTT -91: length 21 nt -GTAGTTCGTAGCAGGGGATTGTCATAAATCAGCGCCTGATGGGAAGCGTT -92: length 47 nt -TGCCCAGATGTAACACATCCCGGCCCATTCTGGTAACTCCGTTTCTGGGT -93: length 47 nt -AATTAAGGGGTTGTCAAACCGGCCACTTTTGAACAATGGGCTTCGCCGCC -94: length 42 nt -TTCAACCTTTACGATGCAAACATAGACAAACTTGGACGTATTCGGACCCT -95: length 35 nt -GGCGCTTAGTGGAATGCCCCCCCGTATGCAGCATCGAGTATTCCGTAGCG -96: length 54 nt -TTGCTGATCTTTCGAATTTGGATCTGGGATTTTGCCAGATAGCGCCCGTA -97: length 34 nt -TACGTAGATATTTTCAGGAGGCACTGAACAGCCCAGAGCCTCCATCGGGC -98: length 15 nt -ATTGCGCAGGTATGGCTCAGAGACACAGTATACTATTACCTTCCCGCAAC -99: length 42 nt -CGATCTGGCATCGGCATCCGAGTAGCCCCTACGTATGGCTTTGCCCAACA -- GitLab