Skip to content
Snippets Groups Projects
Commit e4c0da92 authored by Christoph Harmel's avatar Christoph Harmel
Browse files

feat: rewrote ReadSequencer class, added batch processing

parent 1e148db1
No related branches found
No related tags found
1 merge request!26feat: ReadSequencer class rewritten, updated CLI
import logging
from random import choice, gauss
from textwrap import wrap
from random import choices
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections.abc import Generator, Iterator
LOG = logging.getLogger(__name__)
def read_in_fasta(file_path: str) -> dict[str,str]:
"""
This function reads in FASTA files.
Args:
file_path: A file path directing to the fasta file.
Returns:
Dict: It returns a dictionary with sequences.
"""
LOG.info("Reading in FASTA files from destination.")
sequences: dict[str,str] = {}
f = open(file_path)
for line in f:
if line[0] == '>':
def_line = line.strip()
def_line = def_line.replace('>', '')
else:
if def_line not in sequences:
sequences[def_line] = ''
sequences[def_line] += line.strip()
f.close()
return sequences
def read_sequence(seq:str, read_length:int) -> str:
"""
This function reads a sequence of a specific read length and adds additional nucleotides if the sequence is
smaller than the requested length or cuts the sequence if its longer.
Args:
seq: the sequence to read
read_length: length of reads
Returns:
str: returns sequenced element
"""
bases: list[str] = ["A", "T", "C", "G"]
sequenced: str = ''
if read_length > len(seq):
for nt in range(len(seq)):
sequenced += seq[nt]
for nt in range(len(seq), read_length):
sequenced += choice(bases)
else:
for nt in range(read_length):
sequenced += seq[nt]
return sequenced
def simulate_sequencing(sequences: dict[str,str], read_length: int) -> dict[str,str]:
"""
Simulates sequencing.
Args:
sequences: Dictionary of sequences to sequence.
read_length: length of reads
Returns:
dict: of n sequences as values
"""
LOG.info("Sequencing in progress....")
results: dict[str,str] = {}
for index, key in enumerate(sequences):
results[key] = read_sequence(sequences[key], read_length=read_length)
LOG.info("Sequencing was successfully executed.")
return results
def generate_sequences(n: int, mean: int, sd: int) -> dict[str,str]:
"""
Generates random sequences.
Args:
n: Amount of sequences to generate.
mean: mean length of sequence (gaussian distribution).
sd: standard deviation of length of sequence (gaussian distribution).
Returns:
dict: of n sequences
"""
LOG.info("Generating random sequences.")
sequences: dict[str,str] = {}
for i in range(n):
seq: str = ""
bases: list[str] = ["A", "T", "C", "G"]
for nt in range(abs(round(gauss(mean, sd)))):
seq = seq + choice(bases)
key: str = str(i) + ': length ' + str(len(seq)) + ' nt'
sequences[key] = seq
return sequences
def write_fasta(sequences: dict[str,str], file_path: str) -> None:
"""
Takes a dictionary and writes it to a fasta file.
Must specify the filename when calling the function.
Args:
sequences: Dictionary of sequence.
file_path: A file path directing to the output folder.
"""
LOG.info("Writing FASTA file.")
with open(file_path, "w") as outfile:
for key, value in sequences.items():
outfile.write(key + "\n")
outfile.write("\n".join(wrap(value, 60)))
outfile.write("\n")
class ReadSequencer:
def __init__(self):
self.sequences: dict[str,str] = {}
self.reads: dict[str,str] = {}
def add_random_sequences(self, n: int, mean: int, sd: int):
self.sequences: dict[str,str] = generate_sequences(n, mean, sd)
def read_fasta(self, input_file):
self.sequences: dict[str,str] = read_in_fasta(input_file)
def run_sequencing(self, read_length: int):
self.reads: dict[str,str] = simulate_sequencing(self.sequences, read_length)
def write_fasta(self, output_file_path: str):
write_fasta(self.reads, output_file_path)
def __init__(self, fasta: str = None, output: str = None, read_length: int = 150, chunk_size: int = 10000) -> None:
""" ReadSequencer class
Args:
fasta: path fasta file
output: path output fasta file(s)
read_length: read length, defaults to 150.
chunk_size: batch size used for memory efficient processing, only used when number of sequences greater
than number of passed sequences. Defaults to 10000.
Returns:
None
"""
self.fasta = fasta
self.output = output
self.read_length = read_length
self.chunk_size = chunk_size
self.random = False
self.bases = ('A', 'T', 'C', 'G')
self.n_sequences = None
def get_n_sequences(self) -> None:
"""
Helper function to detect number of sequences present in set fasta file.
Returns:
None
"""
self.n_sequences = len(list(SeqIO.parse(self.fasta, 'fasta')))
LOG.info(self.n_sequences, ' sequences found in ', self.fasta)
def define_random_sequences(self, n: int) -> None:
"""
Defines random sequences.
Args:
n: number of random sequences to be generated
Returns:
None
"""
LOG.info('Set mode to: generate random sequences')
LOG.info('N random sequences: ', n)
self.random = True
self.n_sequences = n
def generate_random_sequence(self, length: int) -> Seq:
"""
Generates random sequence.
Args:
length: length of sequence
Returns:
random sequence of length n
"""
seq = choices(self.bases, k=length)
seq = Seq(''.join(seq))
return seq
def resize_sequence(self, record:SeqRecord) -> SeqRecord:
"""
Resizes sequence according to set read length. If sequence is shorter than read length, fills up
with random nucleotides.
Args:
record: SeqRecord
Returns:
resized SeqRecord
"""
if (len(record)) >= self.read_length:
record.seq = record.seq[0:self.read_length]
else:
n_add = self.read_length - len(record)
add_seq = self.generate_random_sequence(n_add)
record.seq = record.seq + add_seq
return record.seq
def batch_iterator(self, iterator: Iterator, batch_size: int) -> Generator:
"""
This is a generator function, and it returns lists of the
entries from the supplied iterator. Each list will have
batch_size entries, although the final list may be shorter.
Args:
iterator: iterator object generated with Bio.SeqIO.parse()
batch_size: batch size to use for the generator
Returns:
list of entries from supplied iterator according to batch_size
"""
batch = []
for entry in iterator:
batch.append(entry)
if len(batch) == batch_size:
yield batch
batch = []
def run_sequencing(self) -> None:
"""
Runs read sequencing of specified sequences from input fasta file or generates random sequences for a given
read length. If number of sequences exceeds chunk-size, it will switch to batch processing mode.
Returns:
Writes processed sequences to output fasta file(s).
"""
if self.random:
if self.n_sequences < self.chunk_size:
with open(self.output, 'w') as output_handle:
for i in range(self.n_sequences):
record = SeqRecord(
self.generate_random_sequence(self.read_length),
id='random_seq: ' + str(i+1))
SeqIO.write(record, output_handle, 'fasta')
else:
for i, batch in enumerate(self.batch_iterator(range(self.n_sequences), self.chunk_size)):
filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1)
with open(filename, 'w') as output_handle:
for j, k in enumerate(batch):
record = SeqRecord(
self.generate_random_sequence(self.read_length),
id='random_seq: ' + str(j+1))
SeqIO.write(record, output_handle, 'fasta')
else:
if self.n_sequences < self.chunk_size:
with open(self.fasta) as input_handle, open(
self.output, 'w') as output_handle:
for record in SeqIO.parse(input_handle, 'fasta'):
record.seq = self.resize_sequence(record)
SeqIO.write(record, output_handle, 'fasta')
else:
record_iter = SeqIO.parse(open(self.fasta), 'fasta')
for i, batch in enumerate(self.batch_iterator(record_iter, self.chunk_size)):
filename = self.output.replace('.fasta','') + '_chunk_%i.fasta' % (i + 1)
for j, record in enumerate(batch):
batch[j].seq = self.resize_sequence(record)
with open(filename, 'w') as handle:
SeqIO.write(batch, handle, 'fasta')
......@@ -9,6 +9,6 @@ setup(
author_email='christoph.harmel@unibas.ch',
description='Simulates sequencing with a specified read length from sequences specified by a FASTA file.',
packages=find_packages(),
install_requires=['random','textwrap','argparse','logging'],
install_requires=['random','Bio','argparse','logging'],
entry_points={'console_scripts': ['read_sequencer=read_sequencer_package.cli:main']}
)
0: length 103 nt
ACGCCCTATGTCAGAGTGGTGTTAGTGCACAATTCATTCCGATATGCGAA
1: length 35 nt
ATAACGAGCGCGAGCCCTTAACGGGCAGGACTGCCGACTCACACCTTAAT
2: length 61 nt
GGTCTCATAAGCACTTCGGGTACATCCTCCTAAATGGGCCGCGACTTAGG
3: length 54 nt
TAGGTCTCGGATACGAGATATAACCGTAGCATGGGAGTGACACCCATGCC
4: length 64 nt
TCTGCGATACCCAATTTAGGGCATGGCATAAGGCCCGGCGGGTATTTCCG
5: length 85 nt
TTGTTAAGCGTGTTGGTAGCGCCCCCAAATTGTCCCAACTGGGCCACACC
6: length 53 nt
ATAGTACAAGTTAGACCTCATGCATTGAGCTGCCCGGTTCGGACCTTAAT
7: length 27 nt
TAAAATATTAAGGATAGCCGGAACGCGCTGGCGACTGACGACACTACACG
8: length 25 nt
TAGATTCCAGGGAATGACGTCACACCGTTCCGTATATCAGCGGCGCGGAA
9: length 52 nt
GGTGCACCTATTAACAGAGTGACTTTGGGGGAATCACTTATTCGCATATA
10: length 41 nt
AACGACAAGCTGAACGAGCTCTGGCGGCTACGCAGTGTTCGCGCAAGCGA
11: length 106 nt
GTAATAAACTAAACATCAGCTGAGGCGTCCGACCACTGCATCTTTTGTTG
12: length 5 nt
GTAATATGGTATAGCTCGACCCTCGTACAACGGGGGTCAGACAGCGTTGC
13: length 49 nt
TTACTTTGCACGCATGCATGGAGGCACCATGAAAGACGGGAAGCGGCTAA
14: length 22 nt
GCAAGGGGTTTGAGGGAGCAACGTAAGATCAGAGATAACAGTCTCAAGAG
15: length 109 nt
AATTGAAGTCAGCAAAGAATTGGGATTGCTGGAAGCAACACGGACACGTT
16: length 43 nt
GAACCATTCTAAACAGCACCACCCGCATCCCGCTTTGCGCAGCAGCCACT
17: length 85 nt
ATGTCTCACAGGGGAGGTCTCCAGGCTTCTACCAAGTTCCCAATTCCATG
18: length 16 nt
ATCTCATCAATTATGTTGTACACCTGCTCGTTCTAACTTGCAGACTGCAT
19: length 52 nt
GAGTATCGCAGTATACTGTTGGCGGACATTCCAGAATATTGGTATTCAGC
20: length 2 nt
ATATAGCTACCCCAGATTGTCTTAACGAGTCCCGCACCCCCCACCAAGAT
21: length 13 nt
GTATTTACCTGCCCACTAGTACAAGTCCCCGAACCTTAGCAATCCTAAGT
22: length 64 nt
TGGCCGTTTCAGAAGGATTCCTGCCCCTGTGATGAGTCACGTGGAGACAG
23: length 59 nt
CGAGGAGAGCTGTAAAATACTGCTGCCTGCTACATTGGGGCTTGGGTCTC
24: length 82 nt
GCACAGTAGCGAGTGACTCGGTTACTATGTGTTGCTGACTTTATATGTGC
25: length 6 nt
GAGTAAAGCTGAAACACCGAACCGGCAGTTAGAGCTAGTGGATTTTCTTA
26: length 87 nt
AGACATTTTCGGAACCAATGATCTTGTTGGAGTTGAGAGGCCGAACCAGG
27: length 136 nt
AACACGGTAATATCAGGTCGGCCTTAGTCACTTCGGAATTGGAGTTAGTA
28: length 68 nt
CACCGGTAGCTGAATGTAACGAATCTACGTTGACACGGTTAAAGGGATAT
29: length 84 nt
AGATTGTCATGGACTGACGGAGAACACCGTGCTTAAAGTGGAATAATGTG
30: length 51 nt
CGGCAAGGAGGAGGCTACTGACCAATCAGACAAACGTTCATGGCTAGAGA
31: length 51 nt
AGCAGCCGCTCAACTGTGACGTCGGCTGAGTCTACCAGCCCTCAGACTAC
32: length 73 nt
TCCGATCTTATAGTAGTGATCCTGATGGAACAATAACTGACCCAGTAAAG
33: length 58 nt
ACGTTTTATTGTACCTAGGTGTGGCACCATGAAAGGCAGGAGGCCCACCG
34: length 28 nt
AATCCGCAATCGTGCTACGCATTTTCCAGACAATATTTCGAGGTTACCGC
35: length 13 nt
AACACGAGGCCCTCTCGACGCTGCTATTTAACAGCGCGTGCCTATTGAAA
36: length 2 nt
ATACCGAAACTTACTAGTTGTGACAGCGGATTCCAAACTGCGGGTGAATA
37: length 71 nt
GTGAGGAGTACTCGGCACCGGGAACGTAGTAATGTGCTCTGATCCACTCT
38: length 46 nt
CGAACGTAAATAAGTCTGCTGCTGCCTCAGCAGAGGGATGGGAGGGGTCT
39: length 48 nt
CGCAAGAGTGTCGAGCTGGCAGACGCGTATTGCCTTTATATCCGTGAAAC
40: length 63 nt
TGACAAACATAAGTTTGTCGCCCGTTTCCACATAGTAGAATTCATTAGAA
41: length 11 nt
AGAAATAGTACGTAGTGTTGGCTGTGAATATTCGGAGATAAGGACTCATT
42: length 68 nt
GGTTCACAAATTGTCATAGATCATCGTGGTACGACGGGCGATCTTATTTC
43: length 7 nt
CGAGCGTGATGACGTAATGGAGTGAATAAGGAGTGCGAAATGGTGCGTTT
44: length 84 nt
TGGGACGATCATTCCCCCTGACCTTACGTACAATGCAATTGCGAGTGGCT
45: length 23 nt
TTGTACTTACGGCTTAAATTGCACAGAAGGGGTTCTCCGGAAGATAACTA
46: length 58 nt
GGCCCTTCGGTGCAGTCGGCCTAGGTATCAAGCTCGCCTGCCCGTGTTAT
47: length 50 nt
TATCGTGGTTATGTGCGGACCGCCTGATATTACAGCTGATGTGAGGGCAA
48: length 63 nt
TCTGCTAGGGATCCACTCCTGGGTCATTATTAAGCACTTACGGCCTGAGA
49: length 48 nt
GGAGAATTCGCGGCATAAGATCTCCATACGGGTAGTATGTGCCACTATTA
50: length 64 nt
AATGCGTACGGCCCAAAAACCTTCAGGATTACCATGAACCCGTAACTCAA
51: length 41 nt
AGTCATATAATGATTGAGTTCCCTCAACCCTCCGGACGTAAACACTCCGG
52: length 44 nt
TCCAGATGGCCTGGCAATCAGGTGTTGACGGGAACCGCACCCCTGCTGCT
53: length 62 nt
AGGTTGTCTGAATCACTCGGGAGACTTCCCCGCACGATATCGGCACCTGA
54: length 73 nt
CCGTACGGAACATCCTGCAGCTTCGCCCGGTCGAGAAGAAGGTAGAGATA
55: length 39 nt
TGCGCGGCGGACTCTGTCATTTATGACCTGCTTGAACGATGAGTGGGCCA
56: length 88 nt
CGATCCATAGGGCACGCTAGTTTGCATTTTAAGGAGTGCCAATTACGAGG
57: length 117 nt
TATTGCGTCCGTTGCCTGCTCCATCCGTTGCACTATGCCATAAGGCAACA
58: length 67 nt
TTAACGCACGATCAACGGTTCATAATCTGCGGTGATCCCAGAGCTCTACA
59: length 84 nt
CTGACCCTCGATGTGTTTAATAGTTGTATTTCGGCCCCAGTAGGCTCGAC
60: length 71 nt
CTGTCAGATGCGCATGTTCGTGGTCCATTGTCAGCCAAATTAAAGGTTGC
61: length 102 nt
TACATGCTCATATGAACTGACATTGAGTGGAAGGTAGCGACCGCGAGGTT
62: length 66 nt
ACATGATCTTTGTCGAATACTCAAAATGCACAGTGCCTAACTATTTGGGT
63: length 22 nt
GTCACGACACGCATCTACACTAAACAGCACCACATGTTAGAGATGAGATC
64: length 46 nt
CGTGGTCGTGAAGACCCACTATATTAGCAAGAGTCGCGATAGATGGGTGT
65: length 78 nt
CTAGGTAATTTTGAGAGACACCCCGACGCACGGATGTTCCGTACAAAGAT
66: length 32 nt
CAGTCTGAATGTATCAACTAAGAGAATCGTACGAAAATAGACCTAATCGA
67: length 42 nt
GCTCCGCACACTAGATAGCTGATGCCCACATACTCATTCATTATTCGATG
68: length 49 nt
GTCTGAACCAGACACGCACTACGGCTAGCACCTATCCTTATCCTAAAACG
69: length 41 nt
GAGTTACGACCGGACGGGTTGTTAGGTCACCCCGGAATCAGACGATTTAC
70: length 29 nt
GTAGGCCAACGCGAACTACTCCTCCCGCCCACAGTCTAGACTAAGCGTTG
71: length 94 nt
GGTTTAGACTGTCTGCAACTATGTTACTGGATATACCTAATGACAGGGGT
72: length 48 nt
TAAGAATGTTATTGGGCGGTAGCTTACTCTATTGGAATTGAGAAAATGTA
73: length 63 nt
CTAGTATGTTTTATTGGATGCTTGAGGTAGGAATAATCACATCGGGCCGA
74: length 70 nt
ACAAACCTCCTCCCCGGCGACGGGCAACGCCCGCATTGCAACATGCAAGA
75: length 58 nt
GGCTCCATTTCGCTACCCTCTCCCCTAAATCAGGTCCGCGGAGTTGTCGT
76: length 98 nt
ACATACCTATGCCTCAGATCGGTTCGGCTCTTGAGGCGACACCCGTGATA
77: length 34 nt
TTAGAAAGGAACTCATTTTAAGCAAGTTATGCCAGGTTAGCCCGCCCTGG
78: length 48 nt
AAGGCTCGCCGTGACCAGTAGTGCGCAACCTTTCAAGCCGTGTAATATTT
79: length 51 nt
CCTTAAGGGTGGTTTGCACCGTAAAGACCCTCTCTTCCACTTTCCCCGAA
80: length 30 nt
GCCTCGACGTGCCGGACGATGGGCGCTGATCGAAAGTCCATTGTCCCCCT
81: length 14 nt
AGGGCAGTGTACACACACGGCAAGGGTTGCGATGGGTAGAGGCTAAGACG
82: length 39 nt
AATGGGACTTAGTAAACAGGCTGGCCGCCTAGACTGTGTGTGAATGCCCA
83: length 29 nt
AGCTTGGACTTAGAGAAAATCAGTCTACAAGACATAGGCTTTAAAGAGGA
84: length 15 nt
ACGGAAAGTAAGTCTCTCGTCCCTGGTTAAAGGGTCAGTCCCTCCCACCT
85: length 20 nt
CCCAGAGCAAACTGGACCTCCCACCGGCAGATCATCGTTTATTATTATAC
86: length 64 nt
CGTGGCCAGTACATGCTGATGATCCCTATTACAGACCTCGCGTGTTGAGA
87: length 52 nt
GTTAGGCTGTACTGAATACCATGAGCATGTGGGTAGGTTGTACTGGAACC
88: length 60 nt
CGCCGGGCTCCCCGAAGTATAGAGCCAGCGGTTAAATAACTTATCTGATC
89: length 32 nt
CTTGTACGTCGAGTATGAGAGGGGTTACACTTTCTGCTTGCTAAGTTCAG
90: length 14 nt
AAACAGATTAATAACGTAATACCCCCGCTCTACGCCCCTGCATTACGGTT
91: length 21 nt
GTAGTTCGTAGCAGGGGATTGTCATAAATCAGCGCCTGATGGGAAGCGTT
92: length 47 nt
TGCCCAGATGTAACACATCCCGGCCCATTCTGGTAACTCCGTTTCTGGGT
93: length 47 nt
AATTAAGGGGTTGTCAAACCGGCCACTTTTGAACAATGGGCTTCGCCGCC
94: length 42 nt
TTCAACCTTTACGATGCAAACATAGACAAACTTGGACGTATTCGGACCCT
95: length 35 nt
GGCGCTTAGTGGAATGCCCCCCCGTATGCAGCATCGAGTATTCCGTAGCG
96: length 54 nt
TTGCTGATCTTTCGAATTTGGATCTGGGATTTTGCCAGATAGCGCCCGTA
97: length 34 nt
TACGTAGATATTTTCAGGAGGCACTGAACAGCCCAGAGCCTCCATCGGGC
98: length 15 nt
ATTGCGCAGGTATGGCTCAGAGACACAGTATACTATTACCTTCCCGCAAC
99: length 42 nt
CGATCTGGCATCGGCATCCGAGTAGCCCCTACGTATGGCTTTGCCCAACA
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment