From 15bb61efca8b070c828dd263d1c8351ac88c2ae8 Mon Sep 17 00:00:00 2001 From: Tanya Nandan <tanyanandan@student-net-cx-1659.intern.ethz.ch> Date: Wed, 23 Nov 2022 14:31:59 +0100 Subject: [PATCH] frag-changes --- .../fragmentation_v2.cpython-37.pyc | Bin 0 -> 1808 bytes frag_package/__pycache__/utils.cpython-37.pyc | Bin 0 -> 1043 bytes frag_package/fragmentation_v2.py | 9 +++++---- frag_package/main.py | 18 +++++++++++------- frag_package/utils.py | 5 +++++ 5 files changed, 21 insertions(+), 11 deletions(-) create mode 100644 frag_package/__pycache__/fragmentation_v2.cpython-37.pyc create mode 100644 frag_package/__pycache__/utils.cpython-37.pyc diff --git a/frag_package/__pycache__/fragmentation_v2.cpython-37.pyc b/frag_package/__pycache__/fragmentation_v2.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f120fab7808139ce0bf84f3cf45b3a645cc61b0a GIT binary patch literal 1808 zcmZ8hO>7)B6t+FT`<I{1CQXD?iK>S#rCrnu2t_HhK?PJI+7yDKrlZN&JDu+C%&hIL zlE_mIX^*Ah#(~2w;=+Y9XU;vq)kio*<;tZOc%E&df+ascKfm|<?DyVxt<k6=7=P9t zMPF86HJg(`fcXHXcpC^s6!+2gbj91G+*{i=CDeM1wjF9y2UeHjP1JUe!2)eW*eo!7 znBW~<hbevlB9Q`rhveBe1y*>BC#WEjj4etgxUfpj0?tlQ;p}x4Sw}nf3RhWUTe%a= zZ=Y~uxEJ)i;T7d6XY5W;&*e%v%5%^PQub7Ts`W0i&?*AW|45Clk#3<ac-t*>8-0b8 z18;CPH0B5H+J(ED7uyHky>dN&Y@XEiw4LQF)fLV{+D}p@G~rBpQ8vhvKGUvXA&>5B zC+68G*LIKXi?*+;-B8Fd=o%H=)y~wikx-V(AP=QvJk>S9hNl}RX2!W~G>z9`kkFqH zUw|N9e%<*(FfKYWO!vbyOlg>Q)|u$ZEbn}D=i|<2n^AE6+O^K-Jd1fa7$j*NbTb~@ zOuB3{N>~~(FlHPM!v3W$590yEaY?W~i)6+-MgoCEy)b6w#vF$-$<pBAmCO0QZcLw( z^9(!*&*XL)2*P#jlPazf8w15n*q<fmFkghT$Tk`<mSCE)C`;mJ_*GL9hmdlK9}tBp zrX+^adP<72w5UT}>V0n)Zj1`=2=5(08m$RhM@M+4f0q<~QGrxC6hqqm9$7&YW>E#r zo6=U*LtJXE6*cKdS5*$NB5#=c>fo;-y$Ak-imWPkT$_MY)l0r^c<Mj3ikZFJY6gn% z(_)rZX!S9Yjq!|{Rh0>pCU}`M9>x_l2gNwQx2)z>0}Anv*QLYfTSzs=vub{VF&h3@ z?&c;48|gf)(b|MyMC(O!?~!b(hKYEG!^;+mmRf+AK|&kLs7H=4Usn~?qKz|POJ`Ql z6Z6)^v}L5Kp%%=^iBu6~EhDgBJj6D%Ko5hxGSV|szLnY-rM6lunn!rpg1DNrUf%NE zaCf+?Jczkj#=N_QhVQ|-2kkMtp&h8lPu8^)WusKKv3AmMz(m}>`|Q`hFFd~*n=~Wz z+tv85Tfdxr^5XZaZCm3HHNK<q4UMmZ1<p?)!hnsbjZK)M1tf-g*r}KBfF|f$GPF0R zg>P<xHsZ2PG`aka#(QNyOo_34SzYZXLYih=yJ*}Q-!3|M^Q6YhBEG!Z&!Vs|)_^(D z=BB$f;ub8&|G;Q3lvS%qPPK2EJ&43Z?Ie;7gm!qC#!OpDDz()II3Q`RT@cc2pxx+x zmPE|7u8`b-tacxU{Sgz~H5<R1q!j#c?S?sE3Dvg9xYTZvLRU*&Vd-eVcqmzWiPwxp zt^8*(g^${Rg=qkn#`3<lgrwRjyHQsSNK7@iE9oc-pgnhmu9`*<%m)bGG~O)GgCI@n zgvNn((;ylFYnRjq8i2LX7EtY#jxyNgKq{xcG)}YyuY%XSf{*{D0x;s$LHXwf9>n`V z5cYA?o&khd!Yx3EIornP04tUNE2?-7&k!Ft8%~;diL~&A5=m^*#QX+Wx4r2nI`BvN uK7R{D+RY6F3Vt3oWztH`*Wxjt$xVG|e|3=2QJ<}ubPIz*KHTS9{`|jmJn-BA literal 0 HcmV?d00001 diff --git a/frag_package/__pycache__/utils.cpython-37.pyc b/frag_package/__pycache__/utils.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55be3016626b94985831a019e58cfa8b7598bb47 GIT binary patch literal 1043 zcmb7CK~EGh6mDl`hsnYsU}Dq*X)sFG$c{%3#3)E&j1m*@vX@Pnc6Y~}olai62;<@b z{UOWIzcg15hQHv+*A56ITx`?6_PzGK_r33JwOUPr<M+b1WE>Fk%U&*rhlf|V^fTN9 z5i}#+TGFm3JmLQ!U0(#Efie)Gm_ykRbdR*7bNoSe6KVwxD;v1Bap_Im4EaoF-jtkC zy=wefAn2@NZ)fzB<`1W2777n-FPzQ4vP}tk56X8<hGTvIxRC3tuYhqF6dbf<8!Q>h zWTcrHa>M#*CYd}+wbATgVwg};vs@Xb3YmADlXB(j^OGxsd))TRu+!(-@X|+F2AxD7 zmTUH5TK_jg$r1%1lZzmj#F(4*T;(ZU1qC<5Dy#!n(PeVg@@p7O#xgfH@f`pKRak$l zBAE;QB{jn~t?1ED0+yCPCjO))=-BZ5nCCnfJdfW<Ju<3@clO`LANG>o_SRPX5!3+q zc%0^g9y;v3Py2E&NoAf${44<tJlp64AK2&{T4tEIg4k+$1y4qNAmfQiGu<hUt9f~Z z0Zq?VFSLxg76}&o722dr6c$kTk6PO#*f=${Yj`kZO1{$*I;Db+D6ASJS|HW;kls_& zB7y!c*p4V4J=V6MJZEWs$g@<iLh00`hY}Trp=u)Ae&yo@Z2N%CrtMV?XX%Eilde?t zHjnmd(RD!Y($!tm=?FJMBb<vA?%?_M{GX!bX8!s<;_yZRJaIVu8#sa*$N7JNffk0i z4h9xc(G81Rp)@a`fI4`9&;G$!vM`oW(+-`lYEB_@*mPj&a0EG&?#;FOSR%m{DcT$> SG0Egho4fOi#R;NT6#fO(BLnyV literal 0 HcmV?d00001 diff --git a/frag_package/fragmentation_v2.py b/frag_package/fragmentation_v2.py index bce9ca9..80762e2 100644 --- a/frag_package/fragmentation_v2.py +++ b/frag_package/fragmentation_v2.py @@ -21,12 +21,13 @@ def fasta_process(fasta_file): genes[seq_id] = (seq_pattern.search(line)).group(1) return genes -def fragmentation(fasta_file, counts_file, mean_length, std): +def fragmentation(fasta_file, counts_file, mean_length, std, a_prob, t_prob, g_prob, c_prob): fasta = fasta_process(fasta_file) seq_counts = pd.read_csv(counts_file, names = ["seqID", "count"]) - nucs = ['A','T','G','C'] - mononuc_freqs = [0.22, 0.25, 0.23, 0.30] + # nucs = ['A','T','G','C'] + # mononuc_freqs = [0.22, 0.25, 0.23, 0.30] + nuc_probs = {'A':a_prob, 'T':t_prob, 'G':g_prob, 'C':c_prob} # calculated using https://www.nature.com/articles/srep04532#MOESM1 term_frags = [] for seq_id, seq in fasta.items(): @@ -37,7 +38,7 @@ def fragmentation(fasta_file, counts_file, mean_length, std): # non-uniformly random DNA fragmentation implementation based on https://www.nature.com/articles/srep04532#Sec1 # assume fragmentation by sonication for NGS workflow cuts = [] - cut_nucs = np.random.choice(nucs, n_cuts, p=mononuc_freqs) + cut_nucs = np.random.choice(list(nuc_probs.keys()), n_cuts, p=list(nuc_probs.values())) for nuc in cut_nucs: nuc_pos = [x.start() for x in re.finditer(nuc, seq)] pos = np.random.choice(nuc_pos) diff --git a/frag_package/main.py b/frag_package/main.py index 661c8f3..0088de4 100644 --- a/frag_package/main.py +++ b/frag_package/main.py @@ -1,13 +1,13 @@ import argparse from fragmentation_v2 import fragmentation -from utils import check_positive, extant_file +from utils import check_positive, extant_file, check_prob def main(args): - fasta, seq_counts, mean_length, std = args + fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob = args - term_frags = fragmentation(fasta, seq_counts, mean_length, std) + term_frags = fragmentation(fasta, seq_counts, mean_length, std, a_prob, t_prob, g_prob, c_prob) with open('terminal_frags.txt', 'w') as f: for line in term_frags: f.write(line) @@ -15,15 +15,19 @@ def main(args): # Parse command-line arguments def parse_arguments(): - parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths. Outputs most terminal fragment (within desired length range) for each sequence.") + parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths and 4 nucleotide probabilities for the cuts. Outputs most terminal fragment (within desired length range) for each sequence.") parser.add_argument('--fasta', required=True, type=extant_file, help="FASTA file with cDNA sequences") parser.add_argument('--counts', required=True, type=extant_file, help="CSV file with sequence counts") - parser.add_argument('--mean', required = False, default = 10, type = check_positive, help="Mean fragment length (default: 10)") - parser.add_argument('--std', required = False, default = 1, type = check_positive, help="Standard deviation fragment length (defafult: 1)") + parser.add_argument('--mean', required = False, default = 300, type = check_positive, help="Mean fragment length (default: 10)") + parser.add_argument('--std', required = False, default = 60, type = check_positive, help="Standard deviation fragment length (defafult: 1)") + parser.add_argument('--A_prob', required=False, default = 0.22, type=check_prob, help="Probability cut happens after nucleotide A") + parser.add_argument('--T_prob', required=False, default = 0.25, type=check_prob, help="Probability cut happens after nucleotide T") + parser.add_argument('--G_prob', required=False, default = 0.25, type=check_prob, help="Probability cut happens after nucleotide G") + parser.add_argument('--C_prob', required=False, default = 0.28, type=check_prob, help="Probability cut happens after nucleotide C") args = parser.parse_args() - return args.fasta, args.counts, args.mean, args.std + return args if __name__ == '__main__': diff --git a/frag_package/utils.py b/frag_package/utils.py index 2212bbc..aced683 100644 --- a/frag_package/utils.py +++ b/frag_package/utils.py @@ -22,3 +22,8 @@ def check_positive(value): raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) return ivalue +def check_prob(value): + pvalue = float(value) + if pvalue <= 0 or pvalue>1: + raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value) + return pvalue \ No newline at end of file -- GitLab