Skip to content
Snippets Groups Projects
Commit 93ef9e43 authored by Hugo Madge Leon's avatar Hugo Madge Leon
Browse files

Use Biopython FASTA parser + batches split

parent 995f7d7b
No related branches found
No related tags found
2 merge requests!33remove weird setup.py,!32Fix all issues from review except frag theory
...@@ -5,35 +5,7 @@ import numpy as np ...@@ -5,35 +5,7 @@ import numpy as np
import pandas as pd import pandas as pd
def fasta_process(fasta_file): def fragmentation(fasta, counts_file, mean_length, std,
"""
Pre-process FASTA file.
Args:
fasta_file (fasta): FASTA file with cDNA sequences
Returns:
dict: Dictionary of gene sequence IDs and their sequence
"""
with open(fasta_file, "r") as f:
lines = f.readlines()
# Tanya, try \\S instead of \S and see if that works
ident_pattern = re.compile('>(\S+)')
seq_pattern = re.compile('^(\S+)$')
genes = {}
for line in lines:
if ident_pattern.search(line):
seq_id = (ident_pattern.search(line)).group(1)
elif seq_id in genes.keys():
genes[seq_id] += (seq_pattern.search(line)).group(1)
else:
genes[seq_id] = (seq_pattern.search(line)).group(1)
return genes
def fragmentation(fasta_file, counts_file, mean_length, std,
a_prob, t_prob, g_prob, c_prob): a_prob, t_prob, g_prob, c_prob):
""" """
Fragment cDNA sequences and select terminal fragment. Fragment cDNA sequences and select terminal fragment.
...@@ -51,7 +23,6 @@ def fragmentation(fasta_file, counts_file, mean_length, std, ...@@ -51,7 +23,6 @@ def fragmentation(fasta_file, counts_file, mean_length, std,
Returns: Returns:
list: list of selected terminal fragments list: list of selected terminal fragments
""" """
fasta = fasta_process(fasta_file)
seq_counts = pd.read_csv(counts_file, seq_counts = pd.read_csv(counts_file,
names=["seqID", "count"]) names=["seqID", "count"])
......
"""Receive command line arguments, fragment, and output fragments.""" """Receive command line arguments, fragment, and output fragments."""
import argparse import argparse
import logging
from Bio import SeqIO
import numpy as np
from fragmentation import fragmentation from fragmentation import fragmentation
from utils import check_positive, extant_file, check_prob from utils import check_positive, extant_file, check_prob
...@@ -13,13 +16,29 @@ def main(args): ...@@ -13,13 +16,29 @@ def main(args):
Args: Args:
args (parser): list of arguments from CLI. args (parser): list of arguments from CLI.
""" """
fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob = args fasta, seq_counts, output, mean_length, std = args[0:5]
a_prob, t_prob, g_prob, c_prob, chunk_size = args[5:]
term_frags = fragmentation(fasta, seq_counts, mean_length, std, # Create or wipe output file
a_prob, t_prob, g_prob, c_prob) open(output, "w").close()
with open('terminal_frags.txt', 'w') as f:
for line in term_frags: logger.info(f"Fragmentation of {fasta}")
f.write(f"{line}\n")
fasta_parse = {}
for record in SeqIO.parse(fasta, "fasta"):
fasta_parse[record.id] = record.seq
splits = np.arange(0, len(list(fasta_parse))+chunk_size, chunk_size)
for i, split in enumerate(splits):
fasta_dict = fasta_parse[split:splits[i+1]]
term_frags = fragmentation(fasta_dict, seq_counts, output,
mean_length, std,
a_prob, t_prob, g_prob, c_prob)
logger.info(f"Writing batch {i} sequences to {output}")
with open(output, 'a') as f:
for line in term_frags:
f.write(f"{line}\n")
def parse_arguments(): def parse_arguments():
...@@ -30,7 +49,7 @@ def parse_arguments(): ...@@ -30,7 +49,7 @@ def parse_arguments():
parser: list of arguments from CLI. parser: list of arguments from CLI.
""" """
parser = argparse.ArgumentParser(description="""Takes as input FASTA file parser = argparse.ArgumentParser(description="""Takes as input FASTA file
of cDNA sequences, a CSV with sequence of cDNA sequences, a CSV/TSV with sequence
counts, and mean and std. dev. of fragment counts, and mean and std. dev. of fragment
lengths and 4 nucleotide probabilities lengths and 4 nucleotide probabilities
for the cuts. Outputs most terminal for the cuts. Outputs most terminal
...@@ -38,9 +57,11 @@ def parse_arguments(): ...@@ -38,9 +57,11 @@ def parse_arguments():
for each sequence.""") for each sequence.""")
parser.add_argument('--fasta', required=True, type=extant_file, parser.add_argument('--fasta', required=True, type=extant_file,
help="FASTA file with cDNA sequences") help="Path to FASTA file with cDNA sequences")
parser.add_argument('--counts', required=True, type=extant_file, parser.add_argument('--counts', required=True, type=extant_file,
help="CSV file with sequence counts") help="Path to CSV/TSV file with sequence counts")
parser.add_argument('-o', '--output', required=True, type=extant_file,
help="output file path")
parser.add_argument('--mean', required=False, default=300, parser.add_argument('--mean', required=False, default=300,
type=check_positive, type=check_positive,
help="Mean fragment length (default: 10)") help="Mean fragment length (default: 10)")
...@@ -48,23 +69,33 @@ def parse_arguments(): ...@@ -48,23 +69,33 @@ def parse_arguments():
type=check_positive, type=check_positive,
help="Standard deviation fragment length \ help="Standard deviation fragment length \
(defafult: 1)") (defafult: 1)")
parser.add_argument('--A_prob', required=False, default=0.22, parser.add_argument('-a', '--A_prob', required=False, default=0.22,
type=check_prob, type=check_prob,
help="Probability cut happens after nucleotide A") help="Probability cut happens after nucleotide A")
parser.add_argument('--T_prob', required=False, default=0.25, parser.add_argument('-t', '--T_prob', required=False, default=0.25,
type=check_prob, type=check_prob,
help="Probability cut happens after nucleotide T") help="Probability cut happens after nucleotide T")
parser.add_argument('--G_prob', required=False, default=0.25, parser.add_argument('-g', '--G_prob', required=False, default=0.25,
type=check_prob, type=check_prob,
help="Probability cut happens after nucleotide G") help="Probability cut happens after nucleotide G")
parser.add_argument('--C_prob', required=False, default=0.28, parser.add_argument('-c', '--C_prob', required=False, default=0.28,
type=check_prob, type=check_prob,
help="Probability cut happens after nucleotide C") help="Probability cut happens after nucleotide C")
parser.add_argument('-s', '--size', required=False, default=10000,
type=check_positive,
help="Chunk size for batch processing")
args = parser.parse_args() args = parser.parse_args()
return args return args
if __name__ == '__main__': if __name__ == '__main__':
logging.basicConfig(
format='[%(asctime)s: %(levelname)s] %(message)s \
(module "%(module)s")',
level=logging.INFO,
)
logger = logging.getLogger(__name__)
arguments = parse_arguments() arguments = parse_arguments()
main(arguments) main(arguments)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment