diff --git a/src/parameter_parser.py b/src/parameter_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..61af08e707a9cf3485432b50007ea516dc8f15ae --- /dev/null +++ b/src/parameter_parser.py @@ -0,0 +1,72 @@ +"""Module containing functionalities to store run parameters. + +Class: + ParamParse: Take as input a file containing the parameters + and stores them in its attributes. +""" +import logging +from pathlib import Path + +LOG = logging.getLogger(__name__) + + +class ParamParse: + """Class holding the parameters of the run. + + Args: + param_file: Path to file with parameter values. + + Attributes: + param_file: File with parameter values. + transcripts_file: File with transcript abundances. + genome_ref_file: Reference genome file. + annotations_file: Transcripts annotations. + output_path: Output folder. + n_reads: Number of reads to be simulated. + n_cells: Number of cells to be simulated. + rna_avg_length: average RNA fragment length. + rna_sd_length: RNA fragment length standard deviation. + read_length: Read length. + intron_rate: Constant probability of retaining an intron. + add_poly_a: Boolean option to add a poly A tail. + poly_a_func: Function to add a poly_a tail. + primer_seq: Sequence of the primer. + priming_func: Function that evaluates internal priming. + """ + + def __init__(self, param_file: Path) -> None: + """Class constructor.""" + self.param_file: Path = Path(param_file) + with open(param_file) as f: + LOG.info("Loading parameters...") + for line in f: + s = line.split(':') + if s[0] == 'Csv transcripts file': + self.transcripts_file: Path = Path(s[1].strip()) + elif s[0] == 'Reference genome file': + self.genome_ref_file: Path = Path(s[1].strip()) + elif s[0] == 'Transcripts annotation file': + self.annotations_file: Path = Path(s[1].strip()) + elif s[0] == 'Output folder': + self.output_path: Path = Path(s[1].strip()) + elif s[0] == 'Number of reads': + self.n_reads: int = int(s[1].strip()) + elif s[0] == 'Number of cells': + self.n_cells: int = int(s[1].strip()) + elif s[0] == 'Average RNA fragments length': + self.rna_avg: float = float(s[1].strip()) + elif s[0] == 'RNA fragment length standard deviation': + self.rna_sd_length: float = float(s[1].strip()) + elif s[0] == 'Reads length': + self.read_length: int = int(s[1].strip()) + elif s[0] == 'Intron retaining probability': + self.intron_rate: float = float(s[1].strip()) + elif s[0] == 'Add poly A tail': + self.add_poly_a: bool = bool(s[1].strip()) + elif s[0] == 'Function to add poly A tail': + self.poly_a_func: str = str(s[1].strip()) + elif s[0] == 'Primer sequence': + self.primer_seq: str = str(s[1].strip()) + elif s[0] == 'Function to evaluate internal priming': + self.priming_func: str = str(s[1].strip()) + LOG.info("Parameters loaded.") diff --git a/src/read_sequencing.py b/src/read_sequencing.py new file mode 100644 index 0000000000000000000000000000000000000000..dfd7da99999727ce48b87ad7403a26a8b9ddf36c --- /dev/null +++ b/src/read_sequencing.py @@ -0,0 +1,98 @@ +"""Read Sequencing. + +Simulate the sequencing of reads on the template of terminal fragments and simulates reads of these fragments. +Author: Kathleen Moriarty +""" +# Imports from built-in modules +from random import choices +from typing import List +from pathlib import Path + + +def read_sequencing( + frag_file_name: Path, + output_file_name: Path = Path.cwd() / 'output_reads.txt', + num_reads: int = 1000, + read_len: int = 80, +) -> None: + """Reads a fasta-formatted file of terminal fragments and simulates reads. + + Simulate the sequencing of reads on the template of terminal + fragments. Reads are copies of fixed length starting + from the 5' end of fragments. If the desired read length + is larger than the fragment length, sequencing would in + principle proceed into the 3' adaptor and then would perhaps + yield random bases. For simplicity, here we assume that random + nucleotides are introduced in this case. Saves a fasta-formatted + file of reads of identical length, representing 5’ + ends of the terminal fragments as .txt. + + Args: + frag_file_name: input file path of terminal fragments + output_file_name: output file path where to store the output + num_reads: number of total reads to simulate + read_len: integer of identical read length + """ + # Read data from terminal fragment file + # Store fragment descriptions in a list + frag_desc = [] # type: List[str] + + with open(frag_file_name, 'r') as f: + frag_line = f.readline() + # Store all fragments as a list to parse later + frag_list = [] # type: List[str] + # Store combined fragment lines + frag_str = "" + while frag_line != "": + # To stop when the end of file is reached + if frag_line.startswith('>'): + # Determine if this is the first fragment in the file + # Ignore the description line (starting with >) of the first fragment + if not (len(frag_list) == 0 and frag_str == ""): + # Not the first fragment. Append to list. + frag_list.append(frag_str) + frag_str = "" + # Store description line for output file + frag_desc.append(frag_line) + else: + frag_str = frag_str + frag_line.rstrip("\n") + # Read next line + frag_line = f.readline() + frag_list.append(frag_str) + + # Store list of random nucleotides from which to sample when read length is too short + nucleotides = ['A', 'C', 'G', 'T'] + + # Calculate sum of all lengths to determine the relative abundance for that fragment + sum_frags = sum(map(len, frag_list)) + + # Open the file to save the reads + with open(output_file_name, 'w') as fw: + + # Loop through fasta fragments that start with 5' + for frag in frag_list: + # Determine number of reads to create from this fragment + # This might not always provide an exact number of reads that were asked + # TODO resolve this issue + num_frag_reads = round((len(frag)/sum_frags) * num_reads) + + for i in range(0, num_frag_reads): + # If the read length is less than the required length given by the parameter, + # then add random nucleotides + if len(frag) < read_len: + + # Calculate number of random nucleotides to add to the end of the read + diff = read_len - len(frag) + + # Select random nucleotides from list of possible + rand_samp = choices(nucleotides, k=diff) + + # Add the random list to the read and save + tmp_read = frag[0:len(frag)] + ''.join(rand_samp) + else: + # Save subset of fragment as read + tmp_read = frag[0:read_len] + + # Write read to file and original fragment description + fw.write(frag_desc[frag_list.index(frag)]) + fw.write(tmp_read + "\n\n") diff --git a/tests/resources/Param_test.txt b/tests/resources/Param_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..6af6df2c32ab69c6a3d776a5cc0909a890bea1a0 --- /dev/null +++ b/tests/resources/Param_test.txt @@ -0,0 +1,27 @@ +Csv transcripts file: ./transcripts.csv + +Reference genome file: ./home/ref.ref + +Transcripts annotation file: ./home/annotations.ann + +Output folder: ./home/output + +Number of reads: 10023 + +Number of cells: 34 + +Average RNA fragments length: 150 + +RNA fragment length standard deviation: 10 + +Reads length: 100 + +Intron retaining probability: 0.2 + +Add poly A tail: TRUE + +Function to add poly A tail: generate_poly_a + +Primer sequence: ACCTGATCGTACG + +Function to evaluate internal priming: internal_priming \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py new file mode 100644 index 0000000000000000000000000000000000000000..364ce9b8dfad69cf60749b7daf78e9c05c326667 --- /dev/null +++ b/tests/test_parser.py @@ -0,0 +1,25 @@ +"""Tests the parameter parser class.""" + +import pytest +from pathlib import Path +from src import parameter_parser as pp +from src import poly_a + +def test_parser(): + """Tests the attributes of the class.""" + par=pp.ParamParse('./tests/resources/Param_test.txt') + assert par.param_file == Path('./tests/resources/Param_test.txt') + assert par.transcripts_file == Path('./transcripts.csv') + assert par.genome_ref_file == Path('./home/ref.ref') + assert par.annotations_file == Path('./home/annotations.ann') + assert par.output_path == Path('./home/output') + assert par.n_reads == 10023 + assert par.n_cells == 34 + assert par.rna_avg == 150 + assert par.rna_sd_length == 10 + assert par.read_length == 100 + assert par.intron_rate == 0.2 + assert par.add_poly_a == bool('TRUE') + assert par.poly_a_func == 'generate_poly_a' + assert par.primer_seq == 'ACCTGATCGTACG' + assert par.priming_func == 'internal_priming' \ No newline at end of file diff --git a/tests/test_sampleinput.py b/tests/test_sampleinput.py index f73dfb9dedc3d9436a585f75f356685f59e30e9a..3f81651fa79f0e6cedb59c24ec8322141870c9af 100644 --- a/tests/test_sampleinput.py +++ b/tests/test_sampleinput.py @@ -1,4 +1,4 @@ -"""Placeholder test for pipeline.""" +"""Tests the transcriptome abundance file input reader.""" import pytest import pandas as pd