From 93ef9e435bf95646fb269a3214cb5737732179d6 Mon Sep 17 00:00:00 2001 From: "hugo.madgeleon" <hugo.madgeleon@stud.unibas.ch> Date: Fri, 9 Dec 2022 15:03:18 +0100 Subject: [PATCH] Use Biopython FASTA parser + batches split --- terminal-fragment-selector/fragmentation.py | 31 +---------- terminal-fragment-selector/main.py | 57 ++++++++++++++++----- 2 files changed, 45 insertions(+), 43 deletions(-) diff --git a/terminal-fragment-selector/fragmentation.py b/terminal-fragment-selector/fragmentation.py index ff7c73f..6c509c7 100644 --- a/terminal-fragment-selector/fragmentation.py +++ b/terminal-fragment-selector/fragmentation.py @@ -5,35 +5,7 @@ import numpy as np import pandas as pd -def fasta_process(fasta_file): - """ - Pre-process FASTA file. - - Args: - fasta_file (fasta): FASTA file with cDNA sequences - - Returns: - dict: Dictionary of gene sequence IDs and their sequence - """ - with open(fasta_file, "r") as f: - lines = f.readlines() - - # Tanya, try \\S instead of \S and see if that works - ident_pattern = re.compile('>(\S+)') - seq_pattern = re.compile('^(\S+)$') - - genes = {} - for line in lines: - if ident_pattern.search(line): - seq_id = (ident_pattern.search(line)).group(1) - elif seq_id in genes.keys(): - genes[seq_id] += (seq_pattern.search(line)).group(1) - else: - genes[seq_id] = (seq_pattern.search(line)).group(1) - return genes - - -def fragmentation(fasta_file, counts_file, mean_length, std, +def fragmentation(fasta, counts_file, mean_length, std, a_prob, t_prob, g_prob, c_prob): """ Fragment cDNA sequences and select terminal fragment. @@ -51,7 +23,6 @@ def fragmentation(fasta_file, counts_file, mean_length, std, Returns: list: list of selected terminal fragments """ - fasta = fasta_process(fasta_file) seq_counts = pd.read_csv(counts_file, names=["seqID", "count"]) diff --git a/terminal-fragment-selector/main.py b/terminal-fragment-selector/main.py index b91fb2f..865705e 100644 --- a/terminal-fragment-selector/main.py +++ b/terminal-fragment-selector/main.py @@ -1,5 +1,8 @@ """Receive command line arguments, fragment, and output fragments.""" import argparse +import logging +from Bio import SeqIO +import numpy as np from fragmentation import fragmentation from utils import check_positive, extant_file, check_prob @@ -13,13 +16,29 @@ def main(args): Args: args (parser): list of arguments from CLI. """ - fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob = args + fasta, seq_counts, output, mean_length, std = args[0:5] + a_prob, t_prob, g_prob, c_prob, chunk_size = args[5:] - term_frags = fragmentation(fasta, seq_counts, mean_length, std, - a_prob, t_prob, g_prob, c_prob) - with open('terminal_frags.txt', 'w') as f: - for line in term_frags: - f.write(f"{line}\n") + # Create or wipe output file + open(output, "w").close() + + logger.info(f"Fragmentation of {fasta}") + + fasta_parse = {} + for record in SeqIO.parse(fasta, "fasta"): + fasta_parse[record.id] = record.seq + splits = np.arange(0, len(list(fasta_parse))+chunk_size, chunk_size) + + for i, split in enumerate(splits): + fasta_dict = fasta_parse[split:splits[i+1]] + term_frags = fragmentation(fasta_dict, seq_counts, output, + mean_length, std, + a_prob, t_prob, g_prob, c_prob) + + logger.info(f"Writing batch {i} sequences to {output}") + with open(output, 'a') as f: + for line in term_frags: + f.write(f"{line}\n") def parse_arguments(): @@ -30,7 +49,7 @@ def parse_arguments(): parser: list of arguments from CLI. """ parser = argparse.ArgumentParser(description="""Takes as input FASTA file - of cDNA sequences, a CSV with sequence + of cDNA sequences, a CSV/TSV with sequence counts, and mean and std. dev. of fragment lengths and 4 nucleotide probabilities for the cuts. Outputs most terminal @@ -38,9 +57,11 @@ def parse_arguments(): for each sequence.""") parser.add_argument('--fasta', required=True, type=extant_file, - help="FASTA file with cDNA sequences") + help="Path to FASTA file with cDNA sequences") parser.add_argument('--counts', required=True, type=extant_file, - help="CSV file with sequence counts") + help="Path to CSV/TSV file with sequence counts") + parser.add_argument('-o', '--output', required=True, type=extant_file, + help="output file path") parser.add_argument('--mean', required=False, default=300, type=check_positive, help="Mean fragment length (default: 10)") @@ -48,23 +69,33 @@ def parse_arguments(): type=check_positive, help="Standard deviation fragment length \ (defafult: 1)") - parser.add_argument('--A_prob', required=False, default=0.22, + parser.add_argument('-a', '--A_prob', required=False, default=0.22, type=check_prob, help="Probability cut happens after nucleotide A") - parser.add_argument('--T_prob', required=False, default=0.25, + parser.add_argument('-t', '--T_prob', required=False, default=0.25, type=check_prob, help="Probability cut happens after nucleotide T") - parser.add_argument('--G_prob', required=False, default=0.25, + parser.add_argument('-g', '--G_prob', required=False, default=0.25, type=check_prob, help="Probability cut happens after nucleotide G") - parser.add_argument('--C_prob', required=False, default=0.28, + parser.add_argument('-c', '--C_prob', required=False, default=0.28, type=check_prob, help="Probability cut happens after nucleotide C") + parser.add_argument('-s', '--size', required=False, default=10000, + type=check_positive, + help="Chunk size for batch processing") args = parser.parse_args() return args if __name__ == '__main__': + logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s \ + (module "%(module)s")', + level=logging.INFO, + ) + logger = logging.getLogger(__name__) + arguments = parse_arguments() main(arguments) -- GitLab