diff --git a/read_sequencer_package/cli.py b/read_sequencer_package/cli.py index c6dce09b4e30a35520040220358e868ea99e8081..0fdd614691291bd71000c36d30f36d796663885b 100644 --- a/read_sequencer_package/cli.py +++ b/read_sequencer_package/cli.py @@ -4,32 +4,34 @@ import logging parser = argparse.ArgumentParser(prog='read_sequencer', description='Simulates sequencing of DNA sequences specified by an FASTA file.') -parser.add_argument('--input_file_path', + +parser.add_argument('output', help='path to FASTA file') -parser.add_argument('--output_file_path', +parser.add_argument('-i','--input', default=None, help='path to FASTA file') -parser.add_argument('--read_length', +parser.add_argument('-r','--read-length', default=100, help='read length for sequencing', type=int) -parser.add_argument('--random', action='store_true', default=False, - help='generate random sequences') -parser.add_argument('--n_random', default=100, type=int, help='n random sequences') -parser.add_argument('--mean_random', default=50, type=int, help='mean random sequences') -parser.add_argument('--sd_random', default=25, type=int, help='standard deviation random sequences') +parser.add_argument('-n','--n_random', default=100, type=int, + help='n random sequences. Just used if input fasta file is not specified.') +parser.add_argument('-s','--chunk-size', default=10000, type=int, help='chunk_size for batch processing') args = parser.parse_args() def main(): LOG.info("Read sequencer started.") - read_sequencer = ReadSequencer() - if args.random: - read_sequencer.add_random_sequences(n=args.n_random, mean=args.mean_random, sd=args.sd_random) + if args.input is not None: + read_sequencer = ReadSequencer(fasta=args.input, output=args.output, read_length=args.read_length, chunk_size=args.chunk_size) + read_sequencer.get_n_sequences() else: - read_sequencer.read_fasta(args.input_file_path) - read_sequencer.run_sequencing(args.read_length) - read_sequencer.write_fasta(args.output_file_path) + read_sequencer = ReadSequencer(fasta=args.input, output=args.output, read_length=args.read_length, chunk_size=args.chunk_size) + read_sequencer.define_random_sequences(n=args.n_random) + + read_sequencer.run_sequencing() + LOG.info("Read sequencer finished.") + if __name__ == '__main__': logging.basicConfig( format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', diff --git a/read_sequencer_package/read_sequencer.py b/read_sequencer_package/read_sequencer.py index f92458982cf1d57721279b44344fad2f262f80ee..6d1a81a8d59de9d1e8d7a0de2c5a5208f5ce68a7 100644 --- a/read_sequencer_package/read_sequencer.py +++ b/read_sequencer_package/read_sequencer.py @@ -1,130 +1,146 @@ import logging -LOG = logging.getLogger(__name__) +from random import choices +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from collections.abc import Generator, Iterator -def read_in_fasta(file_path: str) -> dict[str,str]: - """ - This function reads in FASTA files. - - Args: - file_path (str): A file path directing to the fasta file. - - Returns: - Dict: It returns a dictionary with sequences. - """ - LOG.info("Reading in FASTA files from destination.") - sequences: dict[str,str] = {} - f = open(file_path) - for line in f: - if line[0] == '>': - def_line = line.strip() - def_line = def_line.replace('>', '') - else: - if def_line not in sequences: - sequences[def_line] = '' - sequences[def_line] += line.strip() - f.close() - return sequences - -def read_sequence(seq:str, read_length:int) -> str: - """ - This function reads a sequence of a specific read length and adds additional nucleotides if the sequence is - smaller than the requested length or cuts the sequence if its longer. - - Args: - seq (str): the sequence to read - read_length (int): length of reads - - Returns: - str: returns sequenced element - - """ - from random import choice - bases: list[str] = ["A", "T", "C", "G"] - sequenced: str = '' - if read_length > len(seq): - for nt in range(len(seq)): - sequenced += seq[nt] - for nt in range(len(seq), read_length): - sequenced += choice(bases) - else: - for nt in range(read_length): - sequenced += seq[nt] - - return sequenced - -def simulate_sequencing(sequences: dict[str,str], read_length: int) -> dict[str,str]: - """ - Simulates sequencing. - - Args: - sequences (dict): Dictionary of sequences to sequence. - read_length (int): length of reads - - Returns: - dict: of n sequences as values - """ - LOG.info("Sequencing in progress....") - results: dict[str,str] = {} - for index, key in enumerate(sequences): - results[key] = read_sequence(sequences[key], read_length=read_length) - LOG.info("Sequencing was successfully executed.") - return results - -def generate_sequences(n: int, mean: int, sd: int) -> dict[str,str]: - """ - Generates random sequences. - - Args: - n (int): Amount of sequences to generate. - mean (int): mean length of sequence (gaussian distribution). - sd (float): standard deviation of length of sequence (gaussian distribution). - - Returns: - dict: of n sequences - """ - from random import choice, gauss - LOG.info("Generating random sequences.") - sequences: dict[str,str] = {} - for i in range(n): - seq: str = "" - bases: list[str] = ["A", "T", "C", "G"] - for nt in range(abs(round(gauss(mean, sd)))): - seq = seq + choice(bases) - key: str = str(i) + ': length ' + str(len(seq)) + ' nt' - sequences[key] = seq - return sequences - -def write_fasta(sequences: dict[str,str], file_path: str): - """ - Takes a dictionary and writes it to a fasta file. - Must specify the filename when calling the function. - - Args: - sequences (dict): Dictionary of sequence. - file_path (str): A file path directing to the output folder. - - """ - LOG.info("Writing FASTA file.") - from textwrap import wrap - with open(file_path, "w") as outfile: - for key, value in sequences.items(): - outfile.write(key + "\n") - outfile.write("\n".join(wrap(value, 60))) - outfile.write("\n") +LOG = logging.getLogger(__name__) class ReadSequencer: - def __init__(self): - self.sequences: dict[str,str] = {} - self.reads: dict[str,str] = {} - - def add_random_sequences(self, n: int, mean: int, sd: int): - self.sequences: dict[str,str] = generate_sequences(n, mean, sd) - - def read_fasta(self, input_file): - self.sequences: dict[str,str] = read_in_fasta(input_file) - - def run_sequencing(self, read_length: int): - self.reads: dict[str,str] = simulate_sequencing(self.sequences, read_length) - - def write_fasta(self, output_file_path: str): - write_fasta(self.reads, output_file_path) + def __init__(self, fasta: str = None, output: str = None, read_length: int = 150, chunk_size: int = 10000) -> None: + """ ReadSequencer class + Args: + fasta: path fasta file + output: path output fasta file(s) + read_length: read length, defaults to 150. + chunk_size: batch size used for memory efficient processing, only used when number of sequences greater + than number of passed sequences. Defaults to 10000. + + Returns: + None + """ + self.fasta = fasta + self.output = output + self.read_length = read_length + self.chunk_size = chunk_size + self.random = False + self.bases = ('A', 'T', 'C', 'G') + self.n_sequences = None + + def get_n_sequences(self) -> None: + """ + Helper function to detect number of sequences present in set fasta file. + + Returns: + None + """ + self.n_sequences = len(list(SeqIO.parse(self.fasta, 'fasta'))) + + def define_random_sequences(self, n: int) -> None: + """ + Defines random sequences. + + Args: + n: number of random sequences to be generated + + Returns: + None + """ + self.random = True + self.n_sequences = n + + def generate_random_sequence(self, length: int) -> Seq: + """ + Generates random sequence. + + Args: + length: length of sequence + + Returns: + random sequence of length n + """ + seq = choices(self.bases, k=length) + seq = Seq(''.join(seq)) + return seq + + def resize_sequence(self, record:SeqRecord) -> SeqRecord: + """ + Resizes sequence according to set read length. If sequence is shorter than read length, fills up + with random nucleotides. + + Args: + record: SeqRecord + + Returns: + resized SeqRecord + """ + if (len(record)) >= self.read_length: + record.seq = record.seq[0:self.read_length] + else: + n_add = self.read_length - len(record) + add_seq = self.generate_random_sequence(n_add) + record.seq = record.seq + add_seq + return record.seq + + def batch_iterator(self, iterator: Iterator, batch_size: int) -> Generator: + """ + This is a generator function, and it returns lists of the + entries from the supplied iterator. Each list will have + batch_size entries, although the final list may be shorter. + + Args: + iterator: iterator object generated with Bio.SeqIO.parse() + batch_size: batch size to use for the generator + + Returns: + list of entries from supplied iterator according to batch_size + """ + batch = [] + for entry in iterator: + batch.append(entry) + if len(batch) == batch_size: + yield batch + batch = [] + + def run_sequencing(self) -> None: + """ + Runs read sequencing of specified sequences from input fasta file or generates random sequences for a given + read length. If number of sequences exceeds chunk-size, it will switch to batch processing mode. + + Returns: + Writes processed sequences to output fasta file(s). + """ + if self.random: + if self.n_sequences < self.chunk_size: + with open(self.output, 'w') as output_handle: + for i in range(self.n_sequences): + record = SeqRecord( + self.generate_random_sequence(self.read_length), + id='random_seq: ' + str(i+1)) + SeqIO.write(record, output_handle, 'fasta') + else: + for i, batch in enumerate(self.batch_iterator(range(self.n_sequences), self.chunk_size)): + filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1) + with open(filename, 'w') as output_handle: + for j, k in enumerate(batch): + record = SeqRecord( + self.generate_random_sequence(self.read_length), + id='random_seq: ' + str(j+1)) + SeqIO.write(record, output_handle, 'fasta') + else: + if self.n_sequences < self.chunk_size: + with open(self.fasta) as input_handle, open( + self.output, 'w') as output_handle: + for record in SeqIO.parse(input_handle, 'fasta'): + record.seq = self.resize_sequence(record) + SeqIO.write(record, output_handle, 'fasta') + + else: + record_iter = SeqIO.parse(open(self.fasta), 'fasta') + for i, batch in enumerate(self.batch_iterator(record_iter, self.chunk_size)): + filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1) + for j, record in enumerate(batch): + batch[j].seq = self.resize_sequence(record) + with open(filename, 'w') as handle: + SeqIO.write(batch, handle, 'fasta') diff --git a/setup.py b/setup.py index 905f8ee96fc23a232cf669563da1389e1228c9e5..0c45a41200918be3c3fc4c3eb2001c110507e822 100644 --- a/setup.py +++ b/setup.py @@ -9,6 +9,6 @@ setup( author_email='christoph.harmel@unibas.ch', description='Simulates sequencing with a specified read length from sequences specified by a FASTA file.', packages=find_packages(), - install_requires=['random','textwrap','argparse','logging'], + install_requires=['random','Bio','argparse','logging'], entry_points={'console_scripts': ['read_sequencer=read_sequencer_package.cli:main']} ) diff --git a/tests/fasta_testfile/result.fasta b/tests/fasta_testfile/result.fasta deleted file mode 100644 index 32258d65e587dd9e7501d585c137d799103c60f8..0000000000000000000000000000000000000000 --- a/tests/fasta_testfile/result.fasta +++ /dev/null @@ -1,200 +0,0 @@ -0: length 103 nt -ACGCCCTATGTCAGAGTGGTGTTAGTGCACAATTCATTCCGATATGCGAA -1: length 35 nt -ATAACGAGCGCGAGCCCTTAACGGGCAGGACTGCCGACTCACACCTTAAT -2: length 61 nt -GGTCTCATAAGCACTTCGGGTACATCCTCCTAAATGGGCCGCGACTTAGG -3: length 54 nt -TAGGTCTCGGATACGAGATATAACCGTAGCATGGGAGTGACACCCATGCC -4: length 64 nt -TCTGCGATACCCAATTTAGGGCATGGCATAAGGCCCGGCGGGTATTTCCG -5: length 85 nt -TTGTTAAGCGTGTTGGTAGCGCCCCCAAATTGTCCCAACTGGGCCACACC -6: length 53 nt -ATAGTACAAGTTAGACCTCATGCATTGAGCTGCCCGGTTCGGACCTTAAT -7: length 27 nt -TAAAATATTAAGGATAGCCGGAACGCGCTGGCGACTGACGACACTACACG -8: length 25 nt -TAGATTCCAGGGAATGACGTCACACCGTTCCGTATATCAGCGGCGCGGAA -9: length 52 nt -GGTGCACCTATTAACAGAGTGACTTTGGGGGAATCACTTATTCGCATATA -10: length 41 nt -AACGACAAGCTGAACGAGCTCTGGCGGCTACGCAGTGTTCGCGCAAGCGA -11: length 106 nt -GTAATAAACTAAACATCAGCTGAGGCGTCCGACCACTGCATCTTTTGTTG -12: length 5 nt -GTAATATGGTATAGCTCGACCCTCGTACAACGGGGGTCAGACAGCGTTGC -13: length 49 nt -TTACTTTGCACGCATGCATGGAGGCACCATGAAAGACGGGAAGCGGCTAA -14: length 22 nt -GCAAGGGGTTTGAGGGAGCAACGTAAGATCAGAGATAACAGTCTCAAGAG -15: length 109 nt -AATTGAAGTCAGCAAAGAATTGGGATTGCTGGAAGCAACACGGACACGTT -16: length 43 nt -GAACCATTCTAAACAGCACCACCCGCATCCCGCTTTGCGCAGCAGCCACT -17: length 85 nt -ATGTCTCACAGGGGAGGTCTCCAGGCTTCTACCAAGTTCCCAATTCCATG -18: length 16 nt -ATCTCATCAATTATGTTGTACACCTGCTCGTTCTAACTTGCAGACTGCAT -19: length 52 nt -GAGTATCGCAGTATACTGTTGGCGGACATTCCAGAATATTGGTATTCAGC -20: length 2 nt -ATATAGCTACCCCAGATTGTCTTAACGAGTCCCGCACCCCCCACCAAGAT -21: length 13 nt -GTATTTACCTGCCCACTAGTACAAGTCCCCGAACCTTAGCAATCCTAAGT -22: length 64 nt -TGGCCGTTTCAGAAGGATTCCTGCCCCTGTGATGAGTCACGTGGAGACAG -23: length 59 nt -CGAGGAGAGCTGTAAAATACTGCTGCCTGCTACATTGGGGCTTGGGTCTC -24: length 82 nt -GCACAGTAGCGAGTGACTCGGTTACTATGTGTTGCTGACTTTATATGTGC -25: length 6 nt -GAGTAAAGCTGAAACACCGAACCGGCAGTTAGAGCTAGTGGATTTTCTTA -26: length 87 nt -AGACATTTTCGGAACCAATGATCTTGTTGGAGTTGAGAGGCCGAACCAGG -27: length 136 nt -AACACGGTAATATCAGGTCGGCCTTAGTCACTTCGGAATTGGAGTTAGTA -28: length 68 nt -CACCGGTAGCTGAATGTAACGAATCTACGTTGACACGGTTAAAGGGATAT -29: length 84 nt -AGATTGTCATGGACTGACGGAGAACACCGTGCTTAAAGTGGAATAATGTG -30: length 51 nt -CGGCAAGGAGGAGGCTACTGACCAATCAGACAAACGTTCATGGCTAGAGA -31: length 51 nt -AGCAGCCGCTCAACTGTGACGTCGGCTGAGTCTACCAGCCCTCAGACTAC -32: length 73 nt -TCCGATCTTATAGTAGTGATCCTGATGGAACAATAACTGACCCAGTAAAG -33: length 58 nt -ACGTTTTATTGTACCTAGGTGTGGCACCATGAAAGGCAGGAGGCCCACCG -34: length 28 nt -AATCCGCAATCGTGCTACGCATTTTCCAGACAATATTTCGAGGTTACCGC -35: length 13 nt -AACACGAGGCCCTCTCGACGCTGCTATTTAACAGCGCGTGCCTATTGAAA -36: length 2 nt -ATACCGAAACTTACTAGTTGTGACAGCGGATTCCAAACTGCGGGTGAATA -37: length 71 nt -GTGAGGAGTACTCGGCACCGGGAACGTAGTAATGTGCTCTGATCCACTCT -38: length 46 nt -CGAACGTAAATAAGTCTGCTGCTGCCTCAGCAGAGGGATGGGAGGGGTCT -39: length 48 nt -CGCAAGAGTGTCGAGCTGGCAGACGCGTATTGCCTTTATATCCGTGAAAC -40: length 63 nt -TGACAAACATAAGTTTGTCGCCCGTTTCCACATAGTAGAATTCATTAGAA -41: length 11 nt -AGAAATAGTACGTAGTGTTGGCTGTGAATATTCGGAGATAAGGACTCATT -42: length 68 nt -GGTTCACAAATTGTCATAGATCATCGTGGTACGACGGGCGATCTTATTTC -43: length 7 nt -CGAGCGTGATGACGTAATGGAGTGAATAAGGAGTGCGAAATGGTGCGTTT -44: length 84 nt -TGGGACGATCATTCCCCCTGACCTTACGTACAATGCAATTGCGAGTGGCT -45: length 23 nt -TTGTACTTACGGCTTAAATTGCACAGAAGGGGTTCTCCGGAAGATAACTA -46: length 58 nt -GGCCCTTCGGTGCAGTCGGCCTAGGTATCAAGCTCGCCTGCCCGTGTTAT -47: length 50 nt -TATCGTGGTTATGTGCGGACCGCCTGATATTACAGCTGATGTGAGGGCAA -48: length 63 nt -TCTGCTAGGGATCCACTCCTGGGTCATTATTAAGCACTTACGGCCTGAGA -49: length 48 nt -GGAGAATTCGCGGCATAAGATCTCCATACGGGTAGTATGTGCCACTATTA -50: length 64 nt -AATGCGTACGGCCCAAAAACCTTCAGGATTACCATGAACCCGTAACTCAA -51: length 41 nt -AGTCATATAATGATTGAGTTCCCTCAACCCTCCGGACGTAAACACTCCGG -52: length 44 nt -TCCAGATGGCCTGGCAATCAGGTGTTGACGGGAACCGCACCCCTGCTGCT -53: length 62 nt -AGGTTGTCTGAATCACTCGGGAGACTTCCCCGCACGATATCGGCACCTGA -54: length 73 nt -CCGTACGGAACATCCTGCAGCTTCGCCCGGTCGAGAAGAAGGTAGAGATA -55: length 39 nt -TGCGCGGCGGACTCTGTCATTTATGACCTGCTTGAACGATGAGTGGGCCA -56: length 88 nt -CGATCCATAGGGCACGCTAGTTTGCATTTTAAGGAGTGCCAATTACGAGG -57: length 117 nt -TATTGCGTCCGTTGCCTGCTCCATCCGTTGCACTATGCCATAAGGCAACA -58: length 67 nt -TTAACGCACGATCAACGGTTCATAATCTGCGGTGATCCCAGAGCTCTACA -59: length 84 nt -CTGACCCTCGATGTGTTTAATAGTTGTATTTCGGCCCCAGTAGGCTCGAC -60: length 71 nt -CTGTCAGATGCGCATGTTCGTGGTCCATTGTCAGCCAAATTAAAGGTTGC -61: length 102 nt -TACATGCTCATATGAACTGACATTGAGTGGAAGGTAGCGACCGCGAGGTT -62: length 66 nt -ACATGATCTTTGTCGAATACTCAAAATGCACAGTGCCTAACTATTTGGGT -63: length 22 nt -GTCACGACACGCATCTACACTAAACAGCACCACATGTTAGAGATGAGATC -64: length 46 nt -CGTGGTCGTGAAGACCCACTATATTAGCAAGAGTCGCGATAGATGGGTGT -65: length 78 nt -CTAGGTAATTTTGAGAGACACCCCGACGCACGGATGTTCCGTACAAAGAT -66: length 32 nt -CAGTCTGAATGTATCAACTAAGAGAATCGTACGAAAATAGACCTAATCGA -67: length 42 nt -GCTCCGCACACTAGATAGCTGATGCCCACATACTCATTCATTATTCGATG -68: length 49 nt -GTCTGAACCAGACACGCACTACGGCTAGCACCTATCCTTATCCTAAAACG -69: length 41 nt -GAGTTACGACCGGACGGGTTGTTAGGTCACCCCGGAATCAGACGATTTAC -70: length 29 nt -GTAGGCCAACGCGAACTACTCCTCCCGCCCACAGTCTAGACTAAGCGTTG -71: length 94 nt -GGTTTAGACTGTCTGCAACTATGTTACTGGATATACCTAATGACAGGGGT -72: length 48 nt -TAAGAATGTTATTGGGCGGTAGCTTACTCTATTGGAATTGAGAAAATGTA -73: length 63 nt -CTAGTATGTTTTATTGGATGCTTGAGGTAGGAATAATCACATCGGGCCGA -74: length 70 nt -ACAAACCTCCTCCCCGGCGACGGGCAACGCCCGCATTGCAACATGCAAGA -75: length 58 nt -GGCTCCATTTCGCTACCCTCTCCCCTAAATCAGGTCCGCGGAGTTGTCGT -76: length 98 nt -ACATACCTATGCCTCAGATCGGTTCGGCTCTTGAGGCGACACCCGTGATA -77: length 34 nt -TTAGAAAGGAACTCATTTTAAGCAAGTTATGCCAGGTTAGCCCGCCCTGG -78: length 48 nt -AAGGCTCGCCGTGACCAGTAGTGCGCAACCTTTCAAGCCGTGTAATATTT -79: length 51 nt -CCTTAAGGGTGGTTTGCACCGTAAAGACCCTCTCTTCCACTTTCCCCGAA -80: length 30 nt -GCCTCGACGTGCCGGACGATGGGCGCTGATCGAAAGTCCATTGTCCCCCT -81: length 14 nt -AGGGCAGTGTACACACACGGCAAGGGTTGCGATGGGTAGAGGCTAAGACG -82: length 39 nt -AATGGGACTTAGTAAACAGGCTGGCCGCCTAGACTGTGTGTGAATGCCCA -83: length 29 nt -AGCTTGGACTTAGAGAAAATCAGTCTACAAGACATAGGCTTTAAAGAGGA -84: length 15 nt -ACGGAAAGTAAGTCTCTCGTCCCTGGTTAAAGGGTCAGTCCCTCCCACCT -85: length 20 nt -CCCAGAGCAAACTGGACCTCCCACCGGCAGATCATCGTTTATTATTATAC -86: length 64 nt -CGTGGCCAGTACATGCTGATGATCCCTATTACAGACCTCGCGTGTTGAGA -87: length 52 nt -GTTAGGCTGTACTGAATACCATGAGCATGTGGGTAGGTTGTACTGGAACC -88: length 60 nt -CGCCGGGCTCCCCGAAGTATAGAGCCAGCGGTTAAATAACTTATCTGATC -89: length 32 nt -CTTGTACGTCGAGTATGAGAGGGGTTACACTTTCTGCTTGCTAAGTTCAG -90: length 14 nt -AAACAGATTAATAACGTAATACCCCCGCTCTACGCCCCTGCATTACGGTT -91: length 21 nt -GTAGTTCGTAGCAGGGGATTGTCATAAATCAGCGCCTGATGGGAAGCGTT -92: length 47 nt -TGCCCAGATGTAACACATCCCGGCCCATTCTGGTAACTCCGTTTCTGGGT -93: length 47 nt -AATTAAGGGGTTGTCAAACCGGCCACTTTTGAACAATGGGCTTCGCCGCC -94: length 42 nt -TTCAACCTTTACGATGCAAACATAGACAAACTTGGACGTATTCGGACCCT -95: length 35 nt -GGCGCTTAGTGGAATGCCCCCCCGTATGCAGCATCGAGTATTCCGTAGCG -96: length 54 nt -TTGCTGATCTTTCGAATTTGGATCTGGGATTTTGCCAGATAGCGCCCGTA -97: length 34 nt -TACGTAGATATTTTCAGGAGGCACTGAACAGCCCAGAGCCTCCATCGGGC -98: length 15 nt -ATTGCGCAGGTATGGCTCAGAGACACAGTATACTATTACCTTCCCGCAAC -99: length 42 nt -CGATCTGGCATCGGCATCCGAGTAGCCCCTACGTATGGCTTTGCCCAACA