From 2c8cbd0dbf8600ae4b0ddbc793c83588c6459a59 Mon Sep 17 00:00:00 2001 From: "hugo.madgeleon" <hugo.madgeleon@stud.unibas.ch> Date: Sun, 13 Nov 2022 11:14:54 +0100 Subject: [PATCH] Clean package imports --- frag_package/fragmentation.py | 3 +- frag_package/fragmentation_2.py | 57 ++++++++++++++++++++++++++++++++- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/frag_package/fragmentation.py b/frag_package/fragmentation.py index 6ea8cdf..7dba609 100644 --- a/frag_package/fragmentation.py +++ b/frag_package/fragmentation.py @@ -1,6 +1,5 @@ -import pandas as pd import random -import numpy as np + dna_seq = { "ATAACATGTGGATGGCCAGTGGTCGGTTGTTACACGCCTACCGCGATGCTGAATGACCCGGACTAGAGTGGCGAAATTTATGGCGTGTGACCCGTTATGC": 100, diff --git a/frag_package/fragmentation_2.py b/frag_package/fragmentation_2.py index 4a20a85..69329ef 100644 --- a/frag_package/fragmentation_2.py +++ b/frag_package/fragmentation_2.py @@ -1 +1,56 @@ -import pandas as pd import random import numpy as np import re dna_seq = { "ATAACATGTGGATGGCCAGTGGTCGGTTGTTACACGCCTACCGCGATGCTGAATGACCCGGACTAGAGTGGCGAAATTTATGGCGTGTGACCCGTTATGC": 100, "TCCATTTCGGTCAGTGGGTCATTGCTAGTAGTCGATTGCATTGCCATTCTCCGAGTGATTTAGCGTGACAGCCGCAGGGAACCCATAAAATGCAATCGTA": 100 } nucs = ['A','T','G','C'] mononuc_freqs = [0.22, 0.25, 0.23, 0.30] # A_dinuc_freqs = {'AA':0.27,'AT':0.25, 'AG':0.22, 'AC':0.26} # T_dinuc_freqs = {'TA':0.27, 'TT':0.24, 'TG':0.24, 'TC':0.25} # C_dinuc_freqs = {'CA':0.23, 'CT':0.21, 'CG':0.35, 'CC':0.21} # G_dinuc_freqs = {'GA':0.25, 'GT':0.25, 'GG':0.23, 'GC':0.27} mean_length = 12 std = 1 term_frags = [] for seq, counts in dna_seq.items(): for _ in range(counts): n_cuts = int(len(seq)/mean_length) # non-random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1 # assume fragmentation by sonication for NGS workflow cuts = [] cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs) for nuc in cut_nucs: nuc_pos = [x.start() for x in re.finditer(nuc, seq)] pos = np.random.choice(nuc_pos) while pos in cuts: pos = np.random.choice(nuc_pos) cuts.append(pos) cuts.sort() cuts.insert(0,0) term_frag = "" for i, val in enumerate(cuts): if i == len(cuts)-1: fragment = seq[val+1:cuts[-1]] else: fragment = seq[val:cuts[i+1]] if mean_length-std <= len(fragment) <= mean_length+std: term_frag = fragment if term_frag == "": continue else: term_frags.append(term_frag) with open('terminal_frags.txt', 'w') as f: for line in term_frags: f.write(line) f.write('\n') \ No newline at end of file +import re + +import numpy as np + +dna_seq = { + "ATAACATGTGGATGGCCAGTGGTCGGTTGTTACACGCCTACCGCGATGCTGAATGACCCGGACTAGAGTGGCGAAATTTATGGCGTGTGACCCGTTATGC": 100, + "TCCATTTCGGTCAGTGGGTCATTGCTAGTAGTCGATTGCATTGCCATTCTCCGAGTGATTTAGCGTGACAGCCGCAGGGAACCCATAAAATGCAATCGTA": 100 +} +nucs = ['A','T','G','C'] +mononuc_freqs = [0.22, 0.25, 0.23, 0.30] + +# A_dinuc_freqs = {'AA':0.27,'AT':0.25, 'AG':0.22, 'AC':0.26} +# T_dinuc_freqs = {'TA':0.27, 'TT':0.24, 'TG':0.24, 'TC':0.25} +# C_dinuc_freqs = {'CA':0.23, 'CT':0.21, 'CG':0.35, 'CC':0.21} +# G_dinuc_freqs = {'GA':0.25, 'GT':0.25, 'GG':0.23, 'GC':0.27} + +mean_length = 12 +std = 1 + +term_frags = [] +for seq, counts in dna_seq.items(): + for _ in range(counts): + n_cuts = int(len(seq)/mean_length) + + # non-random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1 + # assume fragmentation by sonication for NGS workflow + cuts = [] + cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs) + for nuc in cut_nucs: + nuc_pos = [x.start() for x in re.finditer(nuc, seq)] + pos = np.random.choice(nuc_pos) + while pos in cuts: + pos = np.random.choice(nuc_pos) + cuts.append(pos) + + cuts.sort() + cuts.insert(0,0) + term_frag = "" + for i, val in enumerate(cuts): + if i == len(cuts)-1: + fragment = seq[val+1:cuts[-1]] + else: + fragment = seq[val:cuts[i+1]] + if mean_length-std <= len(fragment) <= mean_length+std: + term_frag = fragment + if term_frag == "": + continue + else: + term_frags.append(term_frag) + +with open('terminal_frags.txt', 'w') as f: + for line in term_frags: + f.write(line) + f.write('\n') + + -- GitLab