From 69515d326906f3a815c419e16ebeb96ca961c676 Mon Sep 17 00:00:00 2001 From: Hugo Madge Leon <hugo.madgeleon@stud.unibas.ch> Date: Wed, 7 Dec 2022 08:35:19 +0000 Subject: [PATCH] Hope this doesn't break --- frag_package/fragmentation.py | 64 ------------------- frag_package/setup.py => setup.py | 0 .../main.py | 0 .../utils.py | 0 4 files changed, 64 deletions(-) delete mode 100644 frag_package/fragmentation.py rename frag_package/setup.py => setup.py (100%) rename {frag_package => terminal-fragment-selector}/main.py (100%) rename {frag_package => terminal-fragment-selector}/utils.py (100%) diff --git a/frag_package/fragmentation.py b/frag_package/fragmentation.py deleted file mode 100644 index 80762e2..0000000 --- a/frag_package/fragmentation.py +++ /dev/null @@ -1,64 +0,0 @@ -import re - -import numpy as np -import pandas as pd - - -def fasta_process(fasta_file): - with open(fasta_file, "r") as f: - lines = f.readlines() - - ident_pattern = re.compile('>(\S+)') - seq_pattern = re.compile('^(\S+)$') - - genes = {} - for line in lines: - if ident_pattern.search(line): - seq_id = (ident_pattern.search(line)).group(1) - elif seq_id in genes.keys(): - genes[seq_id] += (seq_pattern.search(line)).group(1) - else: - genes[seq_id] = (seq_pattern.search(line)).group(1) - return genes - -def fragmentation(fasta_file, counts_file, mean_length, std, a_prob, t_prob, g_prob, c_prob): - fasta = fasta_process(fasta_file) - seq_counts = pd.read_csv(counts_file, names = ["seqID", "count"]) - - # nucs = ['A','T','G','C'] - # mononuc_freqs = [0.22, 0.25, 0.23, 0.30] - nuc_probs = {'A':a_prob, 'T':t_prob, 'G':g_prob, 'C':c_prob} # calculated using https://www.nature.com/articles/srep04532#MOESM1 - - term_frags = [] - for seq_id, seq in fasta.items(): - counts = seq_counts[seq_counts["seqID"] == seq_id]["count"] - for _ in range(counts): - n_cuts = int(len(seq)/mean_length) - - # non-uniformly random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1 - # assume fragmentation by sonication for NGS workflow - cuts = [] - cut_nucs = np.random.choice(list(nuc_probs.keys()), n_cuts, p=list(nuc_probs.values())) - for nuc in cut_nucs: - nuc_pos = [x.start() for x in re.finditer(nuc, seq)] - pos = np.random.choice(nuc_pos) - while pos in cuts: - pos = np.random.choice(nuc_pos) - cuts.append(pos) - - cuts.sort() - cuts.insert(0,0) - term_frag = "" - for i, val in enumerate(cuts): - if i == len(cuts)-1: - fragment = seq[val+1:cuts[-1]] - else: - fragment = seq[val:cuts[i+1]] - if mean_length-std <= len(fragment) <= mean_length+std: - term_frag = fragment - if term_frag == "": - continue - else: - term_frags.append(term_frag) - return term_frags - diff --git a/frag_package/setup.py b/setup.py similarity index 100% rename from frag_package/setup.py rename to setup.py diff --git a/frag_package/main.py b/terminal-fragment-selector/main.py similarity index 100% rename from frag_package/main.py rename to terminal-fragment-selector/main.py diff --git a/frag_package/utils.py b/terminal-fragment-selector/utils.py similarity index 100% rename from frag_package/utils.py rename to terminal-fragment-selector/utils.py -- GitLab