Skip to content
Snippets Groups Projects
Commit 37dbac77 authored by Tanya Santosh Nandan's avatar Tanya Santosh Nandan
Browse files

Merge branch 'tanya2' into 'main'

frag-changes

See merge request !25
parents a36f728b 15bb61ef
No related branches found
No related tags found
1 merge request!25frag-changes
File added
File added
......@@ -21,12 +21,13 @@ def fasta_process(fasta_file):
genes[seq_id] = (seq_pattern.search(line)).group(1)
return genes
def fragmentation(fasta_file, counts_file, mean_length, std):
def fragmentation(fasta_file, counts_file, mean_length, std, a_prob, t_prob, g_prob, c_prob):
fasta = fasta_process(fasta_file)
seq_counts = pd.read_csv(counts_file, names = ["seqID", "count"])
nucs = ['A','T','G','C']
mononuc_freqs = [0.22, 0.25, 0.23, 0.30]
# nucs = ['A','T','G','C']
# mononuc_freqs = [0.22, 0.25, 0.23, 0.30]
nuc_probs = {'A':a_prob, 'T':t_prob, 'G':g_prob, 'C':c_prob} # calculated using https://www.nature.com/articles/srep04532#MOESM1
term_frags = []
for seq_id, seq in fasta.items():
......@@ -37,7 +38,7 @@ def fragmentation(fasta_file, counts_file, mean_length, std):
# non-uniformly random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1
# assume fragmentation by sonication for NGS workflow
cuts = []
cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs)
cut_nucs = np.random.choice(list(nuc_probs.keys()), n_cuts, p=list(nuc_probs.values()))
for nuc in cut_nucs:
nuc_pos = [x.start() for x in re.finditer(nuc, seq)]
pos = np.random.choice(nuc_pos)
......
import argparse
from fragmentation_v2 import fragmentation
from utils import check_positive, extant_file
from utils import check_positive, extant_file, check_prob
def main(args):
fasta, seq_counts, mean_length, std = args
fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob = args
term_frags = fragmentation(fasta, seq_counts, mean_length, std)
term_frags = fragmentation(fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob)
with open('terminal_frags.txt', 'w') as f:
for line in term_frags:
f.write(line)
......@@ -15,15 +15,19 @@ def main(args):
# Parse command-line arguments
def parse_arguments():
parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths. Outputs most terminal fragment (within desired length range) for each sequence.")
parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths and 4 nucleotide probabilities for the cuts. Outputs most terminal fragment (within desired length range) for each sequence.")
parser.add_argument('--fasta', required=True, type=extant_file, help="FASTA file with cDNA sequences")
parser.add_argument('--counts', required=True, type=extant_file, help="CSV file with sequence counts")
parser.add_argument('--mean', required = False, default = 10, type = check_positive, help="Mean fragment length (default: 10)")
parser.add_argument('--std', required = False, default = 1, type = check_positive, help="Standard deviation fragment length (defafult: 1)")
parser.add_argument('--mean', required = False, default = 300, type = check_positive, help="Mean fragment length (default: 10)")
parser.add_argument('--std', required = False, default = 60, type = check_positive, help="Standard deviation fragment length (defafult: 1)")
parser.add_argument('--A_prob', required=False, default = 0.22, type=check_prob, help="Probability cut happens after nucleotide A")
parser.add_argument('--T_prob', required=False, default = 0.25, type=check_prob, help="Probability cut happens after nucleotide T")
parser.add_argument('--G_prob', required=False, default = 0.25, type=check_prob, help="Probability cut happens after nucleotide G")
parser.add_argument('--C_prob', required=False, default = 0.28, type=check_prob, help="Probability cut happens after nucleotide C")
args = parser.parse_args()
return args.fasta, args.counts, args.mean, args.std
return args
if __name__ == '__main__':
......
......@@ -22,3 +22,8 @@ def check_positive(value):
raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value)
return ivalue
def check_prob(value):
pvalue = float(value)
if pvalue <= 0 or pvalue>1:
raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value)
return pvalue
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment