Skip to content
Snippets Groups Projects
Commit 28cc31f1 authored by Christoph Harmel's avatar Christoph Harmel
Browse files

Merge branch 'rewrite_class' into 'main'

feat: ReadSequencer class rewritten, updated CLI

See merge request !26
parents ea45d048 f4f2c586
No related branches found
No related tags found
1 merge request!26feat: ReadSequencer class rewritten, updated CLI
...@@ -4,32 +4,34 @@ import logging ...@@ -4,32 +4,34 @@ import logging
parser = argparse.ArgumentParser(prog='read_sequencer', parser = argparse.ArgumentParser(prog='read_sequencer',
description='Simulates sequencing of DNA sequences specified by an FASTA file.') description='Simulates sequencing of DNA sequences specified by an FASTA file.')
parser.add_argument('--input_file_path',
parser.add_argument('output',
help='path to FASTA file') help='path to FASTA file')
parser.add_argument('--output_file_path', parser.add_argument('-i','--input', default=None,
help='path to FASTA file') help='path to FASTA file')
parser.add_argument('--read_length', parser.add_argument('-r','--read-length', default=100,
help='read length for sequencing', help='read length for sequencing',
type=int) type=int)
parser.add_argument('--random', action='store_true', default=False, parser.add_argument('-n','--n_random', default=100, type=int,
help='generate random sequences') help='n random sequences. Just used if input fasta file is not specified.')
parser.add_argument('--n_random', default=100, type=int, help='n random sequences') parser.add_argument('-s','--chunk-size', default=10000, type=int, help='chunk_size for batch processing')
parser.add_argument('--mean_random', default=50, type=int, help='mean random sequences')
parser.add_argument('--sd_random', default=25, type=int, help='standard deviation random sequences')
args = parser.parse_args() args = parser.parse_args()
def main(): def main():
LOG.info("Read sequencer started.") LOG.info("Read sequencer started.")
read_sequencer = ReadSequencer() if args.input is not None:
if args.random: read_sequencer = ReadSequencer(fasta=args.input, output=args.output, read_length=args.read_length, chunk_size=args.chunk_size)
read_sequencer.add_random_sequences(n=args.n_random, mean=args.mean_random, sd=args.sd_random) read_sequencer.get_n_sequences()
else: else:
read_sequencer.read_fasta(args.input_file_path) read_sequencer = ReadSequencer(fasta=args.input, output=args.output, read_length=args.read_length, chunk_size=args.chunk_size)
read_sequencer.run_sequencing(args.read_length) read_sequencer.define_random_sequences(n=args.n_random)
read_sequencer.write_fasta(args.output_file_path)
read_sequencer.run_sequencing()
LOG.info("Read sequencer finished.") LOG.info("Read sequencer finished.")
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig( logging.basicConfig(
format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
......
import logging import logging
LOG = logging.getLogger(__name__) from random import choices
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections.abc import Generator, Iterator
def read_in_fasta(file_path: str) -> dict[str,str]: LOG = logging.getLogger(__name__)
"""
This function reads in FASTA files.
Args:
file_path (str): A file path directing to the fasta file.
Returns:
Dict: It returns a dictionary with sequences.
"""
LOG.info("Reading in FASTA files from destination.")
sequences: dict[str,str] = {}
f = open(file_path)
for line in f:
if line[0] == '>':
def_line = line.strip()
def_line = def_line.replace('>', '')
else:
if def_line not in sequences:
sequences[def_line] = ''
sequences[def_line] += line.strip()
f.close()
return sequences
def read_sequence(seq:str, read_length:int) -> str:
"""
This function reads a sequence of a specific read length and adds additional nucleotides if the sequence is
smaller than the requested length or cuts the sequence if its longer.
Args:
seq (str): the sequence to read
read_length (int): length of reads
Returns:
str: returns sequenced element
"""
from random import choice
bases: list[str] = ["A", "T", "C", "G"]
sequenced: str = ''
if read_length > len(seq):
for nt in range(len(seq)):
sequenced += seq[nt]
for nt in range(len(seq), read_length):
sequenced += choice(bases)
else:
for nt in range(read_length):
sequenced += seq[nt]
return sequenced
def simulate_sequencing(sequences: dict[str,str], read_length: int) -> dict[str,str]:
"""
Simulates sequencing.
Args:
sequences (dict): Dictionary of sequences to sequence.
read_length (int): length of reads
Returns:
dict: of n sequences as values
"""
LOG.info("Sequencing in progress....")
results: dict[str,str] = {}
for index, key in enumerate(sequences):
results[key] = read_sequence(sequences[key], read_length=read_length)
LOG.info("Sequencing was successfully executed.")
return results
def generate_sequences(n: int, mean: int, sd: int) -> dict[str,str]:
"""
Generates random sequences.
Args:
n (int): Amount of sequences to generate.
mean (int): mean length of sequence (gaussian distribution).
sd (float): standard deviation of length of sequence (gaussian distribution).
Returns:
dict: of n sequences
"""
from random import choice, gauss
LOG.info("Generating random sequences.")
sequences: dict[str,str] = {}
for i in range(n):
seq: str = ""
bases: list[str] = ["A", "T", "C", "G"]
for nt in range(abs(round(gauss(mean, sd)))):
seq = seq + choice(bases)
key: str = str(i) + ': length ' + str(len(seq)) + ' nt'
sequences[key] = seq
return sequences
def write_fasta(sequences: dict[str,str], file_path: str):
"""
Takes a dictionary and writes it to a fasta file.
Must specify the filename when calling the function.
Args:
sequences (dict): Dictionary of sequence.
file_path (str): A file path directing to the output folder.
"""
LOG.info("Writing FASTA file.")
from textwrap import wrap
with open(file_path, "w") as outfile:
for key, value in sequences.items():
outfile.write(key + "\n")
outfile.write("\n".join(wrap(value, 60)))
outfile.write("\n")
class ReadSequencer: class ReadSequencer:
def __init__(self): def __init__(self, fasta: str = None, output: str = None, read_length: int = 150, chunk_size: int = 10000) -> None:
self.sequences: dict[str,str] = {} """ ReadSequencer class
self.reads: dict[str,str] = {} Args:
fasta: path fasta file
def add_random_sequences(self, n: int, mean: int, sd: int): output: path output fasta file(s)
self.sequences: dict[str,str] = generate_sequences(n, mean, sd) read_length: read length, defaults to 150.
chunk_size: batch size used for memory efficient processing, only used when number of sequences greater
def read_fasta(self, input_file): than number of passed sequences. Defaults to 10000.
self.sequences: dict[str,str] = read_in_fasta(input_file)
Returns:
def run_sequencing(self, read_length: int): None
self.reads: dict[str,str] = simulate_sequencing(self.sequences, read_length) """
self.fasta = fasta
def write_fasta(self, output_file_path: str): self.output = output
write_fasta(self.reads, output_file_path) self.read_length = read_length
self.chunk_size = chunk_size
self.random = False
self.bases = ('A', 'T', 'C', 'G')
self.n_sequences = None
def get_n_sequences(self) -> None:
"""
Helper function to detect number of sequences present in set fasta file.
Returns:
None
"""
self.n_sequences = len(list(SeqIO.parse(self.fasta, 'fasta')))
def define_random_sequences(self, n: int) -> None:
"""
Defines random sequences.
Args:
n: number of random sequences to be generated
Returns:
None
"""
self.random = True
self.n_sequences = n
def generate_random_sequence(self, length: int) -> Seq:
"""
Generates random sequence.
Args:
length: length of sequence
Returns:
random sequence of length n
"""
seq = choices(self.bases, k=length)
seq = Seq(''.join(seq))
return seq
def resize_sequence(self, record:SeqRecord) -> SeqRecord:
"""
Resizes sequence according to set read length. If sequence is shorter than read length, fills up
with random nucleotides.
Args:
record: SeqRecord
Returns:
resized SeqRecord
"""
if (len(record)) >= self.read_length:
record.seq = record.seq[0:self.read_length]
else:
n_add = self.read_length - len(record)
add_seq = self.generate_random_sequence(n_add)
record.seq = record.seq + add_seq
return record.seq
def batch_iterator(self, iterator: Iterator, batch_size: int) -> Generator:
"""
This is a generator function, and it returns lists of the
entries from the supplied iterator. Each list will have
batch_size entries, although the final list may be shorter.
Args:
iterator: iterator object generated with Bio.SeqIO.parse()
batch_size: batch size to use for the generator
Returns:
list of entries from supplied iterator according to batch_size
"""
batch = []
for entry in iterator:
batch.append(entry)
if len(batch) == batch_size:
yield batch
batch = []
def run_sequencing(self) -> None:
"""
Runs read sequencing of specified sequences from input fasta file or generates random sequences for a given
read length. If number of sequences exceeds chunk-size, it will switch to batch processing mode.
Returns:
Writes processed sequences to output fasta file(s).
"""
if self.random:
if self.n_sequences < self.chunk_size:
with open(self.output, 'w') as output_handle:
for i in range(self.n_sequences):
record = SeqRecord(
self.generate_random_sequence(self.read_length),
id='random_seq: ' + str(i+1))
SeqIO.write(record, output_handle, 'fasta')
else:
for i, batch in enumerate(self.batch_iterator(range(self.n_sequences), self.chunk_size)):
filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1)
with open(filename, 'w') as output_handle:
for j, k in enumerate(batch):
record = SeqRecord(
self.generate_random_sequence(self.read_length),
id='random_seq: ' + str(j+1))
SeqIO.write(record, output_handle, 'fasta')
else:
if self.n_sequences < self.chunk_size:
with open(self.fasta) as input_handle, open(
self.output, 'w') as output_handle:
for record in SeqIO.parse(input_handle, 'fasta'):
record.seq = self.resize_sequence(record)
SeqIO.write(record, output_handle, 'fasta')
else:
record_iter = SeqIO.parse(open(self.fasta), 'fasta')
for i, batch in enumerate(self.batch_iterator(record_iter, self.chunk_size)):
filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1)
for j, record in enumerate(batch):
batch[j].seq = self.resize_sequence(record)
with open(filename, 'w') as handle:
SeqIO.write(batch, handle, 'fasta')
...@@ -9,6 +9,6 @@ setup( ...@@ -9,6 +9,6 @@ setup(
author_email='christoph.harmel@unibas.ch', author_email='christoph.harmel@unibas.ch',
description='Simulates sequencing with a specified read length from sequences specified by a FASTA file.', description='Simulates sequencing with a specified read length from sequences specified by a FASTA file.',
packages=find_packages(), packages=find_packages(),
install_requires=['random','textwrap','argparse','logging'], install_requires=['random','Bio','argparse','logging'],
entry_points={'console_scripts': ['read_sequencer=read_sequencer_package.cli:main']} entry_points={'console_scripts': ['read_sequencer=read_sequencer_package.cli:main']}
) )
0: length 103 nt
ACGCCCTATGTCAGAGTGGTGTTAGTGCACAATTCATTCCGATATGCGAA
1: length 35 nt
ATAACGAGCGCGAGCCCTTAACGGGCAGGACTGCCGACTCACACCTTAAT
2: length 61 nt
GGTCTCATAAGCACTTCGGGTACATCCTCCTAAATGGGCCGCGACTTAGG
3: length 54 nt
TAGGTCTCGGATACGAGATATAACCGTAGCATGGGAGTGACACCCATGCC
4: length 64 nt
TCTGCGATACCCAATTTAGGGCATGGCATAAGGCCCGGCGGGTATTTCCG
5: length 85 nt
TTGTTAAGCGTGTTGGTAGCGCCCCCAAATTGTCCCAACTGGGCCACACC
6: length 53 nt
ATAGTACAAGTTAGACCTCATGCATTGAGCTGCCCGGTTCGGACCTTAAT
7: length 27 nt
TAAAATATTAAGGATAGCCGGAACGCGCTGGCGACTGACGACACTACACG
8: length 25 nt
TAGATTCCAGGGAATGACGTCACACCGTTCCGTATATCAGCGGCGCGGAA
9: length 52 nt
GGTGCACCTATTAACAGAGTGACTTTGGGGGAATCACTTATTCGCATATA
10: length 41 nt
AACGACAAGCTGAACGAGCTCTGGCGGCTACGCAGTGTTCGCGCAAGCGA
11: length 106 nt
GTAATAAACTAAACATCAGCTGAGGCGTCCGACCACTGCATCTTTTGTTG
12: length 5 nt
GTAATATGGTATAGCTCGACCCTCGTACAACGGGGGTCAGACAGCGTTGC
13: length 49 nt
TTACTTTGCACGCATGCATGGAGGCACCATGAAAGACGGGAAGCGGCTAA
14: length 22 nt
GCAAGGGGTTTGAGGGAGCAACGTAAGATCAGAGATAACAGTCTCAAGAG
15: length 109 nt
AATTGAAGTCAGCAAAGAATTGGGATTGCTGGAAGCAACACGGACACGTT
16: length 43 nt
GAACCATTCTAAACAGCACCACCCGCATCCCGCTTTGCGCAGCAGCCACT
17: length 85 nt
ATGTCTCACAGGGGAGGTCTCCAGGCTTCTACCAAGTTCCCAATTCCATG
18: length 16 nt
ATCTCATCAATTATGTTGTACACCTGCTCGTTCTAACTTGCAGACTGCAT
19: length 52 nt
GAGTATCGCAGTATACTGTTGGCGGACATTCCAGAATATTGGTATTCAGC
20: length 2 nt
ATATAGCTACCCCAGATTGTCTTAACGAGTCCCGCACCCCCCACCAAGAT
21: length 13 nt
GTATTTACCTGCCCACTAGTACAAGTCCCCGAACCTTAGCAATCCTAAGT
22: length 64 nt
TGGCCGTTTCAGAAGGATTCCTGCCCCTGTGATGAGTCACGTGGAGACAG
23: length 59 nt
CGAGGAGAGCTGTAAAATACTGCTGCCTGCTACATTGGGGCTTGGGTCTC
24: length 82 nt
GCACAGTAGCGAGTGACTCGGTTACTATGTGTTGCTGACTTTATATGTGC
25: length 6 nt
GAGTAAAGCTGAAACACCGAACCGGCAGTTAGAGCTAGTGGATTTTCTTA
26: length 87 nt
AGACATTTTCGGAACCAATGATCTTGTTGGAGTTGAGAGGCCGAACCAGG
27: length 136 nt
AACACGGTAATATCAGGTCGGCCTTAGTCACTTCGGAATTGGAGTTAGTA
28: length 68 nt
CACCGGTAGCTGAATGTAACGAATCTACGTTGACACGGTTAAAGGGATAT
29: length 84 nt
AGATTGTCATGGACTGACGGAGAACACCGTGCTTAAAGTGGAATAATGTG
30: length 51 nt
CGGCAAGGAGGAGGCTACTGACCAATCAGACAAACGTTCATGGCTAGAGA
31: length 51 nt
AGCAGCCGCTCAACTGTGACGTCGGCTGAGTCTACCAGCCCTCAGACTAC
32: length 73 nt
TCCGATCTTATAGTAGTGATCCTGATGGAACAATAACTGACCCAGTAAAG
33: length 58 nt
ACGTTTTATTGTACCTAGGTGTGGCACCATGAAAGGCAGGAGGCCCACCG
34: length 28 nt
AATCCGCAATCGTGCTACGCATTTTCCAGACAATATTTCGAGGTTACCGC
35: length 13 nt
AACACGAGGCCCTCTCGACGCTGCTATTTAACAGCGCGTGCCTATTGAAA
36: length 2 nt
ATACCGAAACTTACTAGTTGTGACAGCGGATTCCAAACTGCGGGTGAATA
37: length 71 nt
GTGAGGAGTACTCGGCACCGGGAACGTAGTAATGTGCTCTGATCCACTCT
38: length 46 nt
CGAACGTAAATAAGTCTGCTGCTGCCTCAGCAGAGGGATGGGAGGGGTCT
39: length 48 nt
CGCAAGAGTGTCGAGCTGGCAGACGCGTATTGCCTTTATATCCGTGAAAC
40: length 63 nt
TGACAAACATAAGTTTGTCGCCCGTTTCCACATAGTAGAATTCATTAGAA
41: length 11 nt
AGAAATAGTACGTAGTGTTGGCTGTGAATATTCGGAGATAAGGACTCATT
42: length 68 nt
GGTTCACAAATTGTCATAGATCATCGTGGTACGACGGGCGATCTTATTTC
43: length 7 nt
CGAGCGTGATGACGTAATGGAGTGAATAAGGAGTGCGAAATGGTGCGTTT
44: length 84 nt
TGGGACGATCATTCCCCCTGACCTTACGTACAATGCAATTGCGAGTGGCT
45: length 23 nt
TTGTACTTACGGCTTAAATTGCACAGAAGGGGTTCTCCGGAAGATAACTA
46: length 58 nt
GGCCCTTCGGTGCAGTCGGCCTAGGTATCAAGCTCGCCTGCCCGTGTTAT
47: length 50 nt
TATCGTGGTTATGTGCGGACCGCCTGATATTACAGCTGATGTGAGGGCAA
48: length 63 nt
TCTGCTAGGGATCCACTCCTGGGTCATTATTAAGCACTTACGGCCTGAGA
49: length 48 nt
GGAGAATTCGCGGCATAAGATCTCCATACGGGTAGTATGTGCCACTATTA
50: length 64 nt
AATGCGTACGGCCCAAAAACCTTCAGGATTACCATGAACCCGTAACTCAA
51: length 41 nt
AGTCATATAATGATTGAGTTCCCTCAACCCTCCGGACGTAAACACTCCGG
52: length 44 nt
TCCAGATGGCCTGGCAATCAGGTGTTGACGGGAACCGCACCCCTGCTGCT
53: length 62 nt
AGGTTGTCTGAATCACTCGGGAGACTTCCCCGCACGATATCGGCACCTGA
54: length 73 nt
CCGTACGGAACATCCTGCAGCTTCGCCCGGTCGAGAAGAAGGTAGAGATA
55: length 39 nt
TGCGCGGCGGACTCTGTCATTTATGACCTGCTTGAACGATGAGTGGGCCA
56: length 88 nt
CGATCCATAGGGCACGCTAGTTTGCATTTTAAGGAGTGCCAATTACGAGG
57: length 117 nt
TATTGCGTCCGTTGCCTGCTCCATCCGTTGCACTATGCCATAAGGCAACA
58: length 67 nt
TTAACGCACGATCAACGGTTCATAATCTGCGGTGATCCCAGAGCTCTACA
59: length 84 nt
CTGACCCTCGATGTGTTTAATAGTTGTATTTCGGCCCCAGTAGGCTCGAC
60: length 71 nt
CTGTCAGATGCGCATGTTCGTGGTCCATTGTCAGCCAAATTAAAGGTTGC
61: length 102 nt
TACATGCTCATATGAACTGACATTGAGTGGAAGGTAGCGACCGCGAGGTT
62: length 66 nt
ACATGATCTTTGTCGAATACTCAAAATGCACAGTGCCTAACTATTTGGGT
63: length 22 nt
GTCACGACACGCATCTACACTAAACAGCACCACATGTTAGAGATGAGATC
64: length 46 nt
CGTGGTCGTGAAGACCCACTATATTAGCAAGAGTCGCGATAGATGGGTGT
65: length 78 nt
CTAGGTAATTTTGAGAGACACCCCGACGCACGGATGTTCCGTACAAAGAT
66: length 32 nt
CAGTCTGAATGTATCAACTAAGAGAATCGTACGAAAATAGACCTAATCGA
67: length 42 nt
GCTCCGCACACTAGATAGCTGATGCCCACATACTCATTCATTATTCGATG
68: length 49 nt
GTCTGAACCAGACACGCACTACGGCTAGCACCTATCCTTATCCTAAAACG
69: length 41 nt
GAGTTACGACCGGACGGGTTGTTAGGTCACCCCGGAATCAGACGATTTAC
70: length 29 nt
GTAGGCCAACGCGAACTACTCCTCCCGCCCACAGTCTAGACTAAGCGTTG
71: length 94 nt
GGTTTAGACTGTCTGCAACTATGTTACTGGATATACCTAATGACAGGGGT
72: length 48 nt
TAAGAATGTTATTGGGCGGTAGCTTACTCTATTGGAATTGAGAAAATGTA
73: length 63 nt
CTAGTATGTTTTATTGGATGCTTGAGGTAGGAATAATCACATCGGGCCGA
74: length 70 nt
ACAAACCTCCTCCCCGGCGACGGGCAACGCCCGCATTGCAACATGCAAGA
75: length 58 nt
GGCTCCATTTCGCTACCCTCTCCCCTAAATCAGGTCCGCGGAGTTGTCGT
76: length 98 nt
ACATACCTATGCCTCAGATCGGTTCGGCTCTTGAGGCGACACCCGTGATA
77: length 34 nt
TTAGAAAGGAACTCATTTTAAGCAAGTTATGCCAGGTTAGCCCGCCCTGG
78: length 48 nt
AAGGCTCGCCGTGACCAGTAGTGCGCAACCTTTCAAGCCGTGTAATATTT
79: length 51 nt
CCTTAAGGGTGGTTTGCACCGTAAAGACCCTCTCTTCCACTTTCCCCGAA
80: length 30 nt
GCCTCGACGTGCCGGACGATGGGCGCTGATCGAAAGTCCATTGTCCCCCT
81: length 14 nt
AGGGCAGTGTACACACACGGCAAGGGTTGCGATGGGTAGAGGCTAAGACG
82: length 39 nt
AATGGGACTTAGTAAACAGGCTGGCCGCCTAGACTGTGTGTGAATGCCCA
83: length 29 nt
AGCTTGGACTTAGAGAAAATCAGTCTACAAGACATAGGCTTTAAAGAGGA
84: length 15 nt
ACGGAAAGTAAGTCTCTCGTCCCTGGTTAAAGGGTCAGTCCCTCCCACCT
85: length 20 nt
CCCAGAGCAAACTGGACCTCCCACCGGCAGATCATCGTTTATTATTATAC
86: length 64 nt
CGTGGCCAGTACATGCTGATGATCCCTATTACAGACCTCGCGTGTTGAGA
87: length 52 nt
GTTAGGCTGTACTGAATACCATGAGCATGTGGGTAGGTTGTACTGGAACC
88: length 60 nt
CGCCGGGCTCCCCGAAGTATAGAGCCAGCGGTTAAATAACTTATCTGATC
89: length 32 nt
CTTGTACGTCGAGTATGAGAGGGGTTACACTTTCTGCTTGCTAAGTTCAG
90: length 14 nt
AAACAGATTAATAACGTAATACCCCCGCTCTACGCCCCTGCATTACGGTT
91: length 21 nt
GTAGTTCGTAGCAGGGGATTGTCATAAATCAGCGCCTGATGGGAAGCGTT
92: length 47 nt
TGCCCAGATGTAACACATCCCGGCCCATTCTGGTAACTCCGTTTCTGGGT
93: length 47 nt
AATTAAGGGGTTGTCAAACCGGCCACTTTTGAACAATGGGCTTCGCCGCC
94: length 42 nt
TTCAACCTTTACGATGCAAACATAGACAAACTTGGACGTATTCGGACCCT
95: length 35 nt
GGCGCTTAGTGGAATGCCCCCCCGTATGCAGCATCGAGTATTCCGTAGCG
96: length 54 nt
TTGCTGATCTTTCGAATTTGGATCTGGGATTTTGCCAGATAGCGCCCGTA
97: length 34 nt
TACGTAGATATTTTCAGGAGGCACTGAACAGCCCAGAGCCTCCATCGGGC
98: length 15 nt
ATTGCGCAGGTATGGCTCAGAGACACAGTATACTATTACCTTCCCGCAAC
99: length 42 nt
CGATCTGGCATCGGCATCCGAGTAGCCCCTACGTATGGCTTTGCCCAACA
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment