Skip to content
Snippets Groups Projects
Commit 93ef9e43 authored by Hugo Madge Leon's avatar Hugo Madge Leon
Browse files

Use Biopython FASTA parser + batches split

parent 995f7d7b
No related branches found
No related tags found
2 merge requests!33remove weird setup.py,!32Fix all issues from review except frag theory
......@@ -5,35 +5,7 @@ import numpy as np
import pandas as pd
def fasta_process(fasta_file):
"""
Pre-process FASTA file.
Args:
fasta_file (fasta): FASTA file with cDNA sequences
Returns:
dict: Dictionary of gene sequence IDs and their sequence
"""
with open(fasta_file, "r") as f:
lines = f.readlines()
# Tanya, try \\S instead of \S and see if that works
ident_pattern = re.compile('>(\S+)')
seq_pattern = re.compile('^(\S+)$')
genes = {}
for line in lines:
if ident_pattern.search(line):
seq_id = (ident_pattern.search(line)).group(1)
elif seq_id in genes.keys():
genes[seq_id] += (seq_pattern.search(line)).group(1)
else:
genes[seq_id] = (seq_pattern.search(line)).group(1)
return genes
def fragmentation(fasta_file, counts_file, mean_length, std,
def fragmentation(fasta, counts_file, mean_length, std,
a_prob, t_prob, g_prob, c_prob):
"""
Fragment cDNA sequences and select terminal fragment.
......@@ -51,7 +23,6 @@ def fragmentation(fasta_file, counts_file, mean_length, std,
Returns:
list: list of selected terminal fragments
"""
fasta = fasta_process(fasta_file)
seq_counts = pd.read_csv(counts_file,
names=["seqID", "count"])
......
"""Receive command line arguments, fragment, and output fragments."""
import argparse
import logging
from Bio import SeqIO
import numpy as np
from fragmentation import fragmentation
from utils import check_positive, extant_file, check_prob
......@@ -13,13 +16,29 @@ def main(args):
Args:
args (parser): list of arguments from CLI.
"""
fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob = args
fasta, seq_counts, output, mean_length, std = args[0:5]
a_prob, t_prob, g_prob, c_prob, chunk_size = args[5:]
term_frags = fragmentation(fasta, seq_counts, mean_length, std,
a_prob, t_prob, g_prob, c_prob)
with open('terminal_frags.txt', 'w') as f:
for line in term_frags:
f.write(f"{line}\n")
# Create or wipe output file
open(output, "w").close()
logger.info(f"Fragmentation of {fasta}")
fasta_parse = {}
for record in SeqIO.parse(fasta, "fasta"):
fasta_parse[record.id] = record.seq
splits = np.arange(0, len(list(fasta_parse))+chunk_size, chunk_size)
for i, split in enumerate(splits):
fasta_dict = fasta_parse[split:splits[i+1]]
term_frags = fragmentation(fasta_dict, seq_counts, output,
mean_length, std,
a_prob, t_prob, g_prob, c_prob)
logger.info(f"Writing batch {i} sequences to {output}")
with open(output, 'a') as f:
for line in term_frags:
f.write(f"{line}\n")
def parse_arguments():
......@@ -30,7 +49,7 @@ def parse_arguments():
parser: list of arguments from CLI.
"""
parser = argparse.ArgumentParser(description="""Takes as input FASTA file
of cDNA sequences, a CSV with sequence
of cDNA sequences, a CSV/TSV with sequence
counts, and mean and std. dev. of fragment
lengths and 4 nucleotide probabilities
for the cuts. Outputs most terminal
......@@ -38,9 +57,11 @@ def parse_arguments():
for each sequence.""")
parser.add_argument('--fasta', required=True, type=extant_file,
help="FASTA file with cDNA sequences")
help="Path to FASTA file with cDNA sequences")
parser.add_argument('--counts', required=True, type=extant_file,
help="CSV file with sequence counts")
help="Path to CSV/TSV file with sequence counts")
parser.add_argument('-o', '--output', required=True, type=extant_file,
help="output file path")
parser.add_argument('--mean', required=False, default=300,
type=check_positive,
help="Mean fragment length (default: 10)")
......@@ -48,23 +69,33 @@ def parse_arguments():
type=check_positive,
help="Standard deviation fragment length \
(defafult: 1)")
parser.add_argument('--A_prob', required=False, default=0.22,
parser.add_argument('-a', '--A_prob', required=False, default=0.22,
type=check_prob,
help="Probability cut happens after nucleotide A")
parser.add_argument('--T_prob', required=False, default=0.25,
parser.add_argument('-t', '--T_prob', required=False, default=0.25,
type=check_prob,
help="Probability cut happens after nucleotide T")
parser.add_argument('--G_prob', required=False, default=0.25,
parser.add_argument('-g', '--G_prob', required=False, default=0.25,
type=check_prob,
help="Probability cut happens after nucleotide G")
parser.add_argument('--C_prob', required=False, default=0.28,
parser.add_argument('-c', '--C_prob', required=False, default=0.28,
type=check_prob,
help="Probability cut happens after nucleotide C")
parser.add_argument('-s', '--size', required=False, default=10000,
type=check_positive,
help="Chunk size for batch processing")
args = parser.parse_args()
return args
if __name__ == '__main__':
logging.basicConfig(
format='[%(asctime)s: %(levelname)s] %(message)s \
(module "%(module)s")',
level=logging.INFO,
)
logger = logging.getLogger(__name__)
arguments = parse_arguments()
main(arguments)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment