diff --git a/build/lib/read_sequencer_package/__init__.py b/build/lib/read_sequencer_package/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/build/lib/read_sequencer_package/cli.py b/build/lib/read_sequencer_package/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..e786d78de5391ea2b035c058f4de2cd16162a4d6
--- /dev/null
+++ b/build/lib/read_sequencer_package/cli.py
@@ -0,0 +1,40 @@
+"""Command-line interface for the read sequencer."""
+import argparse
+
+try:
+    from .modules import read_sequencer as rs
+except ImportError:
+    # Run as a plain script (no parent package): use the sibling-module import.
+    from modules import read_sequencer as rs
+
+parser = argparse.ArgumentParser(prog='read_sequencer',
+                                 description='Simulates sequencing of DNA sequences specified by an FASTA file.')
+parser.add_argument('--input_file_path',
+                    help='path to FASTA file',
+                    required=True)
+parser.add_argument('--output_file_path',
+                    help='path to FASTA file',
+                    required=True)
+parser.add_argument('--read_length',
+                    help='read length for sequencing',
+                    type=int,
+                    required=True)
+
+
+def main(argv=None):
+    """
+    Parses the command line and runs the sequencing simulation.
+
+    Args:
+        argv (list): Arguments to parse; defaults to ``sys.argv[1:]``.
+    """
+    # Parse here rather than at import time so importing this module has no side effects.
+    args = parser.parse_args(argv)
+    sequencer = rs()
+    sequencer.read_fasta(args.input_file_path)
+    sequencer.run_sequencing(args.read_length)
+    sequencer.write_fasta(args.output_file_path)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/build/lib/read_sequencer_package/modules.py b/build/lib/read_sequencer_package/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..39a686616817f6b496328d460fe06931f9670da8
--- /dev/null
+++ b/build/lib/read_sequencer_package/modules.py
@@ -0,0 +1,143 @@
+"""Functions and a small wrapper class for simulating DNA sequencing."""
+import random
+
+NUCLEOTIDES = ("A", "T", "C", "G")
+FASTA_LINE_WIDTH = 60
+
+
+def generate_sequences(n, mean, sd):
+    """
+    Generates random sequences.
+
+    The length of each sequence is drawn from a gaussian distribution and
+    rounded; a negative draw produces an empty sequence.
+
+    Args:
+        n (int): Amount of sequences to generate.
+        mean (int): mean length of sequence (gaussian distribution).
+        sd (float): standard deviation of length of sequence (gaussian distribution).
+
+    Returns:
+        dict: of n sequences, keyed by their index (0 to n - 1)
+    """
+    sequences = {}
+    for index in range(n):
+        length = max(0, round(random.gauss(mean, sd)))
+        sequences[index] = "".join(random.choice(NUCLEOTIDES) for _ in range(length))
+    return sequences
+
+
+def read_in_fasta(file_path):
+    '''
+    This function reads in FASTA files.
+
+    Args:
+        file_path (str): A file path directing to the fasta file.
+
+    Returns:
+        Dict: It returns a dictionary with sequences, keyed by their header
+            line without the leading '>'.
+
+    Raises:
+        ValueError: If sequence data appears before the first header line.
+    '''
+    sequences = {}
+    defline = None
+    # The context manager closes the file even if parsing fails.
+    with open(file_path) as fasta_file:
+        for line in fasta_file:
+            if line.startswith('>'):
+                # Drop only the leading marker; a '>' inside the header is kept.
+                defline = line[1:].strip()
+            elif defline is not None:
+                sequences[defline] = sequences.get(defline, '') + line.strip()
+            elif line.strip():
+                raise ValueError('sequence data found before the first FASTA header')
+    return sequences
+
+
+def read_sequence(seq, read_length):
+    '''
+    This function reads a sequence of a specific read length and adds additional nucleotides if the sequence is
+    smaller than the requested length or cuts the sequence if it is longer.
+
+    Args:
+        seq (str): the sequence to read
+        read_length (int): length of reads
+
+    Returns:
+        str: returns sequenced element
+
+    '''
+    missing = read_length - len(seq)
+    if missing >= 0:
+        # Pad short sequences with random nucleotides up to the read length.
+        return seq + "".join(random.choice(NUCLEOTIDES) for _ in range(missing))
+    # max() keeps a negative read length from slicing relative to the end.
+    return seq[:max(read_length, 0)]
+
+
+def simulate_sequencing(sequences, read_length):
+    """
+    Simulates sequencing.
+
+    Args:
+        sequences (dict): Dictionary of sequences to sequence.
+        read_length (int): length of reads
+
+    Returns:
+        dict: of n sequences as values
+    """
+    return {key: read_sequence(seq, read_length) for key, seq in sequences.items()}
+
+
+def write_fasta(sequences, file_path):
+    """
+    Takes a dictionary and writes it to a fasta file.
+    Must specify the filename when calling the function.
+
+    Each key is written as a header line starting with '>', followed by the
+    sequence wrapped at 60 characters per line.
+
+    Args:
+        sequences (dict): Dictionary of sequence.
+        file_path (str): A file path directing to the output file.
+
+    """
+    with open(file_path, "w") as outfile:
+        for key, value in sequences.items():
+            # str.format also accepts the integer keys of generate_sequences().
+            outfile.write(">{}\n".format(key))
+            chunks = [value[start:start + FASTA_LINE_WIDTH]
+                      for start in range(0, len(value), FASTA_LINE_WIDTH)]
+            outfile.write("\n".join(chunks) + "\n")
+
+
+class read_sequencer:
+    """
+    Holds a set of sequences and the reads produced by sequencing them.
+
+    Attributes:
+        sequences (dict): Sequences to sequence.
+        reads (dict): Reads produced by the last call to run_sequencing().
+    """
+
+    def __init__(self):
+        self.sequences = {}
+        self.reads = {}
+
+    def add_random_sequences(self, n, mean, sd):
+        """Replaces the stored sequences with n randomly generated ones."""
+        self.sequences = generate_sequences(n, mean, sd)
+
+    def read_fasta(self, input_file):
+        """Replaces the stored sequences with those read from a FASTA file."""
+        self.sequences = read_in_fasta(input_file)
+
+    def run_sequencing(self, read_length):
+        """Sequences the stored sequences and stores the resulting reads."""
+        self.reads = simulate_sequencing(self.sequences, read_length)
+
+    def write_fasta(self, output_file_path):
+        """Writes the stored reads to a FASTA file."""
+        write_fasta(self.reads, output_file_path)
diff --git a/dist/read_sequencer-0.1.1-py3-none-any.whl b/dist/read_sequencer-0.1.1-py3-none-any.whl
new file mode 100644
index 0000000000000000000000000000000000000000..234b5227d12d9f5e410d26b2e131fed34c2596fc
Binary files /dev/null and b/dist/read_sequencer-0.1.1-py3-none-any.whl differ
diff --git a/dist/read_sequencer-0.1.1.tar.gz b/dist/read_sequencer-0.1.1.tar.gz
new file mode 100644
index 0000000000000000000000000000000000000000..7eb03d4a5f331e1050f6b5f6c8cb5ee356fc2595
Binary files /dev/null and b/dist/read_sequencer-0.1.1.tar.gz differ