diff --git a/frag_package/__pycache__/fragmentation_v2.cpython-37.pyc b/frag_package/__pycache__/fragmentation_v2.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f120fab7808139ce0bf84f3cf45b3a645cc61b0a Binary files /dev/null and b/frag_package/__pycache__/fragmentation_v2.cpython-37.pyc differ diff --git a/frag_package/__pycache__/utils.cpython-37.pyc b/frag_package/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55be3016626b94985831a019e58cfa8b7598bb47 Binary files /dev/null and b/frag_package/__pycache__/utils.cpython-37.pyc differ diff --git a/frag_package/fragmentation_v2.py b/frag_package/fragmentation_v2.py index bce9ca91cc627b90cdd85e53fb00519d7c3460a0..80762e2d70cb97184b2a58f3f6975e3d8cc40104 100644 --- a/frag_package/fragmentation_v2.py +++ b/frag_package/fragmentation_v2.py @@ -21,12 +21,13 @@ def fasta_process(fasta_file): genes[seq_id] = (seq_pattern.search(line)).group(1) return genes -def fragmentation(fasta_file, counts_file, mean_length, std): +def fragmentation(fasta_file, counts_file, mean_length, std, a_prob, t_prob, g_prob, c_prob): fasta = fasta_process(fasta_file) seq_counts = pd.read_csv(counts_file, names = ["seqID", "count"]) - nucs = ['A','T','G','C'] - mononuc_freqs = [0.22, 0.25, 0.23, 0.30] + # nucs = ['A','T','G','C'] + # mononuc_freqs = [0.22, 0.25, 0.23, 0.30] + nuc_probs = {'A':a_prob, 'T':t_prob, 'G':g_prob, 'C':c_prob} # calculated using https://www.nature.com/articles/srep04532#MOESM1 term_frags = [] for seq_id, seq in fasta.items(): @@ -37,7 +38,7 @@ def fragmentation(fasta_file, counts_file, mean_length, std): # non-uniformly random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1 # assume fragmentation by sonication for NGS workflow cuts = [] - cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs) + cut_nucs = np.random.choice(list(nuc_probs.keys()), n_cuts, p=list(nuc_probs.values())) for nuc in cut_nucs: nuc_pos = [x.start() for x in re.finditer(nuc, seq)] pos = np.random.choice(nuc_pos) diff --git a/frag_package/main.py b/frag_package/main.py index 661c8f370c844f633823dc44ba22f6cb0a57c6a1..0088de46e17fdd97a4a52c2d33aee373c85c2b86 100644 --- a/frag_package/main.py +++ b/frag_package/main.py @@ -1,13 +1,13 @@ import argparse from fragmentation_v2 import fragmentation -from utils import check_positive, extant_file +from utils import check_positive, extant_file, check_prob def main(args): - fasta, seq_counts, mean_length, std = args + fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob = args - term_frags = fragmentation(fasta, seq_counts, mean_length, std) + term_frags = fragmentation(fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob) with open('terminal_frags.txt', 'w') as f: for line in term_frags: f.write(line) @@ -15,15 +15,19 @@ def main(args): # Parse command-line arguments def parse_arguments(): - parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths. Outputs most terminal fragment (within desired length range) for each sequence.") + parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths and 4 nucleotide probabilities for the cuts. Outputs most terminal fragment (within desired length range) for each sequence.") parser.add_argument('--fasta', required=True, type=extant_file, help="FASTA file with cDNA sequences") parser.add_argument('--counts', required=True, type=extant_file, help="CSV file with sequence counts") - parser.add_argument('--mean', required = False, default = 10, type = check_positive, help="Mean fragment length (default: 10)") - parser.add_argument('--std', required = False, default = 1, type = check_positive, help="Standard deviation fragment length (defafult: 1)") + parser.add_argument('--mean', required = False, default = 300, type = check_positive, help="Mean fragment length (default: 10)") + parser.add_argument('--std', required = False, default = 60, type = check_positive, help="Standard deviation fragment length (defafult: 1)") + parser.add_argument('--A_prob', required=False, default = 0.22, type=check_prob, help="Probability cut happens after nucleotide A") + parser.add_argument('--T_prob', required=False, default = 0.25, type=check_prob, help="Probability cut happens after nucleotide T") + parser.add_argument('--G_prob', required=False, default = 0.25, type=check_prob, help="Probability cut happens after nucleotide G") + parser.add_argument('--C_prob', required=False, default = 0.28, type=check_prob, help="Probability cut happens after nucleotide C") args = parser.parse_args() - return args.fasta, args.counts, args.mean, args.std + return args if __name__ == '__main__': diff --git a/frag_package/utils.py b/frag_package/utils.py index 2212bbc61d3db2fd0c55f8d85f52866fb31c2d07..aced683d70d2d8ad1f99824062143f533efe035f 100644 --- a/frag_package/utils.py +++ b/frag_package/utils.py @@ -22,3 +22,8 @@ def check_positive(value): raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) return ivalue +def check_prob(value): + pvalue = float(value) + if pvalue <= 0 or pvalue>1: + raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) + return pvalue \ No newline at end of file