diff --git a/EM-algorithm/README.txt b/EM-algorithm/README.txt index 8a0e7d1dbb6431e50c769a464e56e585948470a2..4f17da58540dcd486a3f1b9f4bfd5e1570ea00df 100644 --- a/EM-algorithm/README.txt +++ b/EM-algorithm/README.txt @@ -1,5 +1,5 @@ ----------------------------------------------- -Niels Schlusser, July, 28th 2022 +Niels Schlusser, April, 22th 2024 ----------------------------------------------- This directory contains the (procedural) c++ code that runs the actual expectation maximization procedure optimizing the model parameters. diff --git a/Markov-model/README.txt b/Markov-model/README.txt index 2534243fa43c413fb7aca4601353799bbe3ed1ac..77316398d3e76022d01b127528a1f062a7bfec30 100644 --- a/Markov-model/README.txt +++ b/Markov-model/README.txt @@ -1,5 +1,5 @@ ----------------------------------------------- -Niels Schlusser, July, 28th 2022 +Niels Schlusser, April, 22th 2024 ----------------------------------------------- This directory contains the python code and bash scripts to turn *.fastq.gz files into the desired format for our EM optimization of binding sites. To this end, an the background frequencies need to be constructed from a Markov model of highest possible degree. diff --git a/Markov-model/construct_background.py b/Markov-model/construct_background.py index 0ef9be7ba11c4af8fa044ba303a9fb152c5c55a6..412acde832a1d25ce559ed45f2bb49c1a22a8e96 100644 --- a/Markov-model/construct_background.py +++ b/Markov-model/construct_background.py @@ -2,6 +2,8 @@ import Bio import sys import math from Bio import SeqIO +import numpy as np +from scipy.optimize import fsolve #declare input background_files = [] @@ -9,8 +11,8 @@ foreground_files = [] b_kmer_len=0 #initialize input -if len(sys.argv) > 4: - b_kmer_len=int(sys.argv[1]) +if len(sys.argv) > 2: + b_kmer_len = int(sys.argv[1]) for ind in range(3,3+int(sys.argv[2])): background_files.append(sys.argv[ind]) for ind in range(3+int(sys.argv[2]),len(sys.argv)): @@ -18,103 +20,183 @@ if len(sys.argv) > 4: else: print("Insufficient input\n") +min_len=b_kmer_len -#define nucleic acid dictionary -nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 } -inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" } #save original output channel original_stdout = sys.stdout -#create array of background counts -background_frequency = [0.0 for i in range(0,pow(4,b_kmer_len))] -#define function which calculates the background frequency of a given read S -def construct_frequency(S): +''' +define function +which calculates the background log frequency +of a given read S +''' +def construct_frequency(S,background_log_freq_dict): ret = 0.0 - for kmer_start in range(0, len(S)-b_kmer_len+1): - tmp_pos = 0 - - for kmer_rel_pos in range(0,b_kmer_len): - tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos) - - ret += background_frequency[tmp_pos] + for kmer_start in range(0,len(S)-b_kmer_len+1): + kmer = S[kmer_start:kmer_start+b_kmer_len] + if kmer in background_log_freq_dict: + ret += background_log_freq_dict[kmer] + else: + return 0.0 - return ret + if ret < -1000000.0: + return 0.0 + + return np.exp(ret,dtype='float64') + +''' +define method +that constructs background sequence counts dictionary +from background pool +''' +def background_sequence_dict(files): + background_dict = {} -#construct background frequency array -normalization = 0 -for data_file in background_files: - for read in SeqIO.parse(data_file+".fastq","fastq"): - s = read.seq - - for kmer_start in range(0,len(s)-b_kmer_len+1): - tmp_cnt = 0 - is_N = 0 - - for kmer_rel_pos in range(0,b_kmer_len): - if(s[kmer_start+kmer_rel_pos] != 'N'): - tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos) + for data_file in files: + for read in SeqIO.parse(data_file+".fastq","fastq"): + s = read.seq + + if 'N' not in s and len(s) >= min_len: + if s not in background_dict: + background_dict[s] = 1 else: - is_N = 1 - - #only use sequence if there is no N in it - if( is_N == 0 ): - background_frequency[tmp_cnt] += 1.0 - normalization += 1 - - - + background_dict[s] += 1 -#normalize and print log of background likelihood of kmer -with open("background_frequencies_"+str(b_kmer_len)+"mer.dat","w") as f: - sys.stdout = f - for i in range(0,pow(4,b_kmer_len)): - background_frequency[i] = background_frequency[i]/normalization - s = "" - tmpI = i - for j in range(0,b_kmer_len): - tmpN = tmpI % 4 - s = s + inv_nucl_dict[tmpN] - tmpI -= tmpN - tmpI /= 4 - print(s,background_frequency[i]) - sys.stdout = original_stdout - - - -#create foreground sequence dictionaries, print in data format for RBAPBindnSeq -background_normalization = 0 -number_kmers = 0 -for data_file in foreground_files: - foreground_dict = {} - - #create foreground dictionary - for seq_record in SeqIO.parse(data_file+".fastq","fastq"): - if seq_record.seq in foreground_dict: - foreground_dict[seq_record.seq] += 1 - else: - foreground_dict[seq_record.seq] = 1 - - - #print read files with background frequencies - with open(data_file+"_"+str(b_kmer_len)+"mer.dat","w") as f: + return background_dict + + + +''' +define method +that constructs background kmer counts dictionary +from background sequence dict +''' +def background_kmer_dict(background_dict): + kmer_dict = {} + + for s,m in background_dict.items(): + for kmer_start in range(0,len(s)-b_kmer_len+1): + kmer = s[kmer_start:kmer_start+b_kmer_len] + + if kmer not in kmer_dict: + kmer_dict[kmer] = m + else: + kmer_dict[kmer] += m + + return kmer_dict + + + +polynomial = {} +''' +callable for normalization polynomial +''' +def poly(N): + powers = np.array(list(polynomial.keys()),dtype='int32') + coefficients = np.array(list(polynomial.values()),dtype='float64') + + ret = sum(coefficients*np.power(N,powers)) - 1.0 + + return ret + + + +''' +define method +that normalizes the kmer frequencies +such that sum_S f_S = 1 for S in background is enforced + +involves solving nonlinear equation +polynomial needs to be defined globally +since python does not allow function parameters to be set on the fly :/ +''' +def normalize_background_kmer_freq_dict(background_sequence_dict,background_kmer_dict): + global polynomial + + norm = sum(background_kmer_dict.values()) + norm_log_counts = {k : np.log(v/norm) for k,v in background_kmer_dict.items()} + + for s in background_sequence_dict.keys(): + tmp_bgr = construct_frequency(s,norm_log_counts) + + N_kmers = len(s) - b_kmer_len + 1 + + if tmp_bgr > 0.0 and N_kmers > 0: + if N_kmers not in polynomial: + polynomial[N_kmers] = tmp_bgr + else: + polynomial[N_kmers] += tmp_bgr + + #rescale polynomial, necessary! + maxpow = max(polynomial.keys()) + maxpow_coeff = polynomial[maxpow] + + fact = np.power(maxpow_coeff,-1.0/maxpow) + + polynomial = {p : c*np.power(fact,p) for p,c in polynomial.items()} + + norm = fsolve(poly, 0.5, maxfev=2000)[0] + norm *= fact + + background_kmer_log_freq = {k : (v + np.log(norm)) for k,v in norm_log_counts.items()} + + return background_kmer_log_freq + + + +''' +print background likelihood of kmer +''' +def print_kmer_dict(kmer_dict): + with open("background_log_kmer_frequencies_"+str(b_kmer_len)+"mer.dat","w") as f: sys.stdout = f - for s in foreground_dict: - if('N' not in s): - tmp_bgr = construct_frequency(s) + for k,f in kmer_dict.items(): + print(k,f) + sys.stdout = original_stdout + + +''' +create foreground sequence dictionaries, print in data format for RBAPBindnSeq +''' +def prepare_foreground_data(foreground_files,background_kmer_log_freq): + for data_file in foreground_files: + foreground_dict = {} + + #create foreground dictionary + for seq_record in SeqIO.parse(data_file+".fastq","fastq"): + s = seq_record.seq + + if 'N' not in s and len(s) >= min_len: + if s in foreground_dict: + foreground_dict[s] += 1 + else: + foreground_dict[s] = 1 + + + + #print read files with background frequencies + with open(data_file+"_"+str(b_kmer_len)+"mer.dat","w") as f: + sys.stdout = f + for s in foreground_dict: + tmp_bgr = construct_frequency(s,background_kmer_log_freq) if tmp_bgr > 0.0 and len(s) >= b_kmer_len: - background_normalization += foreground_dict[s]*math.log(tmp_bgr) - number_kmers += foreground_dict[s]*(len(s) - b_kmer_len + 1) print(len(s),foreground_dict[s],tmp_bgr,s) - sys.stdout = original_stdout - - -print("background normalization for ", b_kmer_len, "mer: ", background_normalization, "\nNumber of possible ", b_kmer_len, "mers: ", number_kmers) + sys.stdout = original_stdout + + +back_seq_dict = background_sequence_dict(background_files) +back_kmer_dict = background_kmer_dict(back_seq_dict) + +norm_background_dict = normalize_background_kmer_freq_dict(back_seq_dict,back_kmer_dict) + +print_kmer_dict(norm_background_dict) +prepare_foreground_data(foreground_files,norm_background_dict) diff --git a/Markov-model/execute.sh b/Markov-model/execute.sh index 02d7d5dd3108ed42acb5ec75252dcba451eb9122..8b73bac8d8a3158685e22a504d3958ec6af021b6 100755 --- a/Markov-model/execute.sh +++ b/Markov-model/execute.sh @@ -1,11 +1,11 @@ #!/bin/bash -Lw=14 #kmer-length -nbr_bfiles=2 #number of background files +Lw=4 #kmer-length +nbr_bfiles=1 #number of background files #input file names -background_files=("ENCFF002DGO" "ENCFF002DGY") -foreground_files=("ENCFF002DGP" "ENCFF002DGQ" "ENCFF002DGR" "ENCFF002DGS" "ENCFF002DGT" "ENCFF002DGU" "ENCFF002DGV" "ENCFF002DGW" "ENCFF002DGX") +background_files=("ENCFF295WRQ") +foreground_files=("ENCFF278GIB" "ENCFF374UTI" "ENCFF400JXE" "ENCFF815CIK" "ENCFF897ONM") @@ -13,6 +13,7 @@ foreground_files=("ENCFF002DGP" "ENCFF002DGQ" "ENCFF002DGR" "ENCFF002DGS" "ENCFF ####START SCRIPT#### #################### +#download and cut adapter mode if [[ $1 == "dlct" ]] then #download files @@ -44,7 +45,7 @@ else #calculates background frequencies based on Markov model module purge ml Biopython - python construct_background.py ${Lw} ${nbr_bfiles} $(echo "${background_files[*]}") $(echo "${foreground_files[*]}") + python3 construct_background.py ${Lw} ${nbr_bfiles} $(echo "${background_files[*]}") $(echo "${foreground_files[*]}") for filename in ${foreground_files[@]} do diff --git a/Markov-model/submit.sh b/Markov-model/submit.sh new file mode 100755 index 0000000000000000000000000000000000000000..c09240e847a721b8c35cc060ae782ba271fdc8b7 --- /dev/null +++ b/Markov-model/submit.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +#SBATCH --job-name=construct_kMM #This is the name of your job +#SBATCH --ntasks=1 +#SBATCH --cpus-per-task=1 #This is the number of cores reserved +#SBATCH --mem=16G #This is the memory reserved. +#SBATCH --time=1-00:00:00 #This is the time that your task will run +#SBATCH --qos=1day #You will run in this queue +#SBATCH --output=output_%j.dat #Output file +#SBATCH --error=error_%j.err #Error file + + +#add your command lines below +bash execute.sh diff --git a/scripts_postprocessing/README.txt b/scripts_postprocessing/README.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6da07a932e793d1a4f16d0fb7b6b57c3684140b --- /dev/null +++ b/scripts_postprocessing/README.txt @@ -0,0 +1,6 @@ +----------------------------------------------- +Niels Schlusser, April, 22th 2024 +----------------------------------------------- + +This directory contains the python scripts for postprocessing after the EM procedure, including ranking of the extracted PWMs according to their relative KD and plotting a summary plot for run statistics as well as binding motifs. + diff --git a/scripts_postprocessing/histogram_overview_convergence.py b/scripts_postprocessing/histogram_overview_convergence.py new file mode 100644 index 0000000000000000000000000000000000000000..f878b23a42cb4ca30227a869abf88730283bae28 --- /dev/null +++ b/scripts_postprocessing/histogram_overview_convergence.py @@ -0,0 +1,164 @@ +#!/usr/bin/python +''' +script plots histogram +input needs form like + python histogram.py +''' +from sys import argv +import os +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + + +plt.rcParams['font.size'] = 28 + +spec_threshold = 0.0 + +''' +define class of PWMs with score +''' +class PWM_Score(): + def __init__(self, PWM, fg_log_lh, diff_log_lh, iterations, E0): + self.PWM = PWM + self.fg_log_lh = fg_log_lh + self.diff_log_lh = diff_log_lh + self.iterations = iterations + self.E0 = E0 + + + +''' +START SCRIPT +''' +number_PWMs = {} +number_convergent_PWMs = {} +number_convergent_specific_PWMs = {} + +for RBP in sorted(["A1CF","HNRNPC","RBM24","AKAP8L","HNRNPCL1","PABPC3","RBM25","SUCLG1","APOBEC3C","HNRNPD","PABPN1L","RBM3","SYNCRIP","BOLL","HNRNPDL","RBM4","TAF15","HNRNPF","PCBP1","RBM41","TARDBP","CELF1","HNRNPH2","PCBP2","RBM45","TDRD10","CNOT4","HNRNPK","PCBP4","RBM47","THUMPD1","HNRNPL","RBM4B","TIA1","CPEB1","IGF2BP1","RBM5","TRA2A","CSDE1","IGF2BP2","RBM6","TRA2B","DAZ3","IGF2BP3","RBMS2","TRNAU1AP","DAZAP1","ILF2","PPP1R10","RBMS3","TROVE2","EIF3D","PRR3","RC3H1","UNK","EIF4G2","KHDRBS2","PTBP3","SAFB2","EIF4H","KHDRBS3","PUF60","SF1","XRCC6","ELAVL4","KHSRP","PUM1","SF3B6","XRN2","ESPR1","RALYL","SFPQ","YBX2","EWSR1","LIN28B","SNRPB2","ZC3H10","EXOSC4","SRSF10","ZC3H18","FUBP1","RBFOX2","SRSF11","ZCRB1","FUBP3","MBNL1","RBFOX3","SRSF2","ZFP36","FUS","MSI1","RBM11","SRSF4","ZFP36L1","NOVA1","RBM15B","SRSF5","ZFP36L2","HNRNPA0","NSUN2","RBM20","SRSF8","ZNF326","HNRNPA1L2","NUPL2","RBM22","SRSF9","ZRANB2","HNRNPA2B1","RBM23","SNRPA"],key=str.lower): + #fill in zeros + number_PWMs[RBP] = 0 + number_convergent_PWMs[RBP] = 0 + number_convergent_specific_PWMs[RBP] = 0 + + for fi in os.listdir(os.getcwd()+"/"+RBP): + filename = RBP+"/"+os.fsdecode(fi) + + if ((not os.fsdecode(fi).startswith("ENCFF")) and filename.endswith("mer.dat") and os.fsdecode(fi).startswith(RBP)): + f = open(filename,'r') + + while True: + # Get next PWM from file + line1 = f.readline() #skip first line + line1 = f.readline() + line2 = f.readline() + line3 = f.readline() + line4 = f.readline() + line5 = f.readline() + + # if two lines are empty + # end of file is reached + if not (line1): + break + + #extract words from lines + val1 = line1.split() + val2 = line2.split() + val3 = line3.split() + val4 = line4.split() + val5 = line5.split() + + #skip free line after each PWM + line1 = f.readline() + + Lstart = float(val5[1]) + + line1 = f.readline() #skip first line + line1 = f.readline() + line2 = f.readline() + line3 = f.readline() + line4 = f.readline() + line5 = f.readline() + + # if two lines are empty + # end of file is reached + if not (line1): + break + + #extract words from lines + val1 = line1.split() + val2 = line2.split() + val3 = line3.split() + val4 = line4.split() + val5 = line5.split() + + #skip free line after each PWM + line1 = f.readline() + + Lw = len(val1) - 1 + Lstop = float(val5[1]) + E0 = float(val5[0]) + + number_PWMs[RBP] += 1 + + if (Lstart < Lstop): + number_convergent_PWMs[RBP] += 1 + + if E0 < spec_threshold: + number_convergent_specific_PWMs[RBP] += 1 + + f.close() + + +frequency_convergent_PWMs = {} +frequency_convergent_specific_PWMs = {} +for p in number_PWMs.keys(): + if number_PWMs[p] > 0.0: + frequency_convergent_PWMs[p] = number_convergent_PWMs[p] / number_PWMs[p] + frequency_convergent_specific_PWMs[p] = number_convergent_specific_PWMs[p] / number_PWMs[p] + else: + frequency_convergent_PWMs[p] = 0.0 + frequency_convergent_specific_PWMs[p] = 0.0 + + +fig,(ax1,ax2,ax3) = plt.subplots(3,figsize=(30,32)) + +ax3.set_xlabel("RBP") +ax1.set_ylabel("fraction of runs") +ax2.set_ylabel("fraction of runs") +ax3.set_ylabel("fraction of runs") + +RBP_names = [*frequency_convergent_PWMs.keys()] +RBPs_conv_freq = [*frequency_convergent_PWMs.values()] +RBPs_conv_spec_freq = [*frequency_convergent_specific_PWMs.values()] +N_RBPs = len(RBP_names) + +ax1.set_xlim([-0.5,(N_RBPs//3)-0.5]) +ax2.set_xlim([(N_RBPs//3)-0.5,2*(N_RBPs//3)-0.5]) +ax3.set_xlim([2*(N_RBPs//3)-0.5,N_RBPs-0.5]) + +ax1.set_xticks(range(0,(N_RBPs//3)),RBP_names[:(N_RBPs//3)],rotation="vertical") +ax2.set_xticks(range((N_RBPs//3),2*(N_RBPs//3)),RBP_names[(N_RBPs//3):2*(N_RBPs//3)],rotation="vertical") +ax3.set_xticks(range(2*(N_RBPs//3),N_RBPs),RBP_names[2*(N_RBPs//3):],rotation="vertical") + +ax1.set_ylim([0.0,1.0]) +ax2.set_ylim([0.0,1.0]) +ax3.set_ylim([0.0,1.0]) +ax1.set_yticks([0.0,0.2,0.4,0.6,0.8,1.0]) +ax2.set_yticks([0.0,0.2,0.4,0.6,0.8,1.0]) +ax3.set_yticks([0.0,0.2,0.4,0.6,0.8,1.0]) + +ax1.bar(range(0,(N_RBPs//3)),RBPs_conv_freq[:(N_RBPs//3)],width=0.7,label="convergent",color="blue") +ax1.bar(range(0,(N_RBPs//3)),RBPs_conv_spec_freq[:(N_RBPs//3)],width=0.7,label="convergent and specific",color="red") + +ax2.bar(range((N_RBPs//3),2*(N_RBPs//3)),RBPs_conv_freq[(N_RBPs//3):2*(N_RBPs//3)],width=0.7,label="convergent",color="blue") +ax2.bar(range((N_RBPs//3),2*(N_RBPs//3)),RBPs_conv_spec_freq[(N_RBPs//3):2*(N_RBPs//3)],width=0.7,label="convergent and specific",color="red") + +ax3.bar(range(2*(N_RBPs//3),N_RBPs),RBPs_conv_freq[2*(N_RBPs//3):],width=0.7,label="convergent",color="blue") +ax3.bar(range(2*(N_RBPs//3),N_RBPs),RBPs_conv_spec_freq[2*(N_RBPs//3):],width=0.7,label="convergent and specific",color="red") + +plt.subplots_adjust(hspace=0.4) + +ax1.legend(loc='upper right',labelspacing=0.1) + +plt.savefig("histogram_convergence.pdf",bbox_inches="tight") diff --git a/scripts_postprocessing/plot_motifs.py b/scripts_postprocessing/plot_motifs.py new file mode 100644 index 0000000000000000000000000000000000000000..d26db0b561cd54f63755af937fd7dbe863692fea --- /dev/null +++ b/scripts_postprocessing/plot_motifs.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python +import numpy as np +import pandas as pd +import matplotlib +import matplotlib.pyplot as plt +import logomaker + + +#matplotlib.rcParams.update({'font.size': 16}) + + +color_scheme = { 'A' : '#00873E', \ + 'C' : '#1175A8', \ + 'G' : '#F6BE00', \ + 'T' : '#B8293D', \ +} + +plt.rcParams['font.size'] = 20 + + +f = open("PWMs_ranked_KD.dat","r") +ctr = 1 + +while True: + # Get next PWM from file + line1 = f.readline() #skip first line + line1 = f.readline() + line2 = f.readline() + line3 = f.readline() + line4 = f.readline() + line5 = f.readline() + + # if two lines are empty + # end of file is reached + if not (line1): + break + + #extract words from lines + val1 = line1.split() + val2 = line2.split() + val3 = line3.split() + val4 = line4.split() + + + #remove nucleotide labels + val1.pop(0) + val2.pop(0) + val3.pop(0) + val4.pop(0) + + #skip free line after each PWM + line1 = f.readline() + + motif = pd.DataFrame(columns=['A','C','G','T']) + + for a,c,g,t in zip(val1,val2,val3,val4): + tmp = np.array([a,c,g,t],dtype='float32') + + tmp += 0.01 #add pseudocount + + tmp /= 1.04 #renormalize + + row_ft = -sum(tmp * np.log2(tmp)) + + motif.loc[len(motif)] = tmp * (2.0 - row_ft) + + fig,ax = plt.subplots() + + motif_logo = logomaker.Logo(motif,color_scheme=color_scheme,vpad=0,width=0.95,ax=ax) + + motif_logo.ax.set_xlim([-0.5,len(motif)-0.5]) + motif_logo.ax.set_ylim([0, 2]) + motif_logo.ax.set_xticks([]) + motif_logo.ax.set_yticks([0,2]) + motif_logo.ax.set_ylabel("Information content [bits]",rotation="vertical") + motif_logo.style_spines(visible=False) + + plt.show() + plt.savefig("motifs/motif"+str(ctr)+".pdf",bbox_inches = "tight") + plt.close() + + ctr += 1 + +f.close() + diff --git a/scripts_postprocessing/postprocessing.sh b/scripts_postprocessing/postprocessing.sh new file mode 100755 index 0000000000000000000000000000000000000000..f821c67232a594b2cb435e1e195978c5d85cc166 --- /dev/null +++ b/scripts_postprocessing/postprocessing.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +for RBP_name in A1CF HNRNPC RBM24 AKAP8L HNRNPCL1 PABPC3 RBM25 SUCLG1 APOBEC3C HNRNPD PABPN1L RBM3 SYNCRIP BOLL HNRNPDL RBM4 TAF15 HNRNPF PCBP1 RBM41 TARDBP CELF1 HNRNPH2 PCBP2 RBM45 TDRD10 CNOT4 HNRNPK PCBP4 RBM47 THUMPD1 HNRNPL RBM4B TIA1 CPEB1 IGF2BP1 RBM5 TRA2A CSDE1 IGF2BP2 RBM6 TRA2B DAZ3 IGF2BP3 RBMS2 TRNAU1AP DAZAP1 ILF2 PPP1R10 RBMS3 TROVE2 EIF3D PRR3 RC3H1 UNK EIF4G2 KHDRBS2 PTBP3 SAFB2 EIF4H KHDRBS3 PUF60 SF1 XRCC6 ELAVL4 KHSRP PUM1 SF3B6 XRN2 ESPR1 RALYL SFPQ YBX2 EWSR1 LIN28B SNRPB2 ZC3H10 EXOSC4 SRSF10 ZC3H18 FUBP1 RBFOX2 SRSF11 ZCRB1 FUBP3 MBNL1 RBFOX3 SRSF2 ZFP36 FUS MSI1 RBM11 SRSF4 ZFP36L1 NOVA1 RBM15B SRSF5 ZFP36L2 HNRNPA0 NSUN2 RBM20 SRSF8 ZNF326 HNRNPA1L2 NUPL2 RBM22 SRSF9 ZRANB2 HNRNPA2B1 RBM23 SNRPA +do + cd ${RBP_name} + + if [[ $1 == "rank" ]] + then + cp ../rank_kmers_KD.py . + + conda run -n base python3 rank_kmers_KD.py ${RBP_name} + + rm rank_kmers_KD.py + elif [[ $1 == "consensus" ]] + then + cp ../consensus_rank_kmers_KD.py . + + rm *.err + + cat consensus_${RBP_name}_5mer_*.dat > consensus_${RBP_name}_5mer.dat + cat consensus_${RBP_name}_6mer_*.dat > consensus_${RBP_name}_6mer.dat + + conda run -n base python3 consensus_rank_kmers_KD.py consensus_${RBP_name} + + rm consensus_rank_kmers_KD.py + else + cp ../plot_motifs.py . + + rm -r motifs/ + + mkdir motifs/ + + conda run -n base python3 plot_motifs.py + + rm plot_motifs.py + fi + + cd ../ +done diff --git a/scripts_postprocessing/rank_kmers_KD.py b/scripts_postprocessing/rank_kmers_KD.py new file mode 100755 index 0000000000000000000000000000000000000000..bf5e0f328d25d6110a03f54840981976b1936687 --- /dev/null +++ b/scripts_postprocessing/rank_kmers_KD.py @@ -0,0 +1,274 @@ +''' +script searches output file for PWMs +subtracts the respective background probability +prints a ranked list of accordingly ranked PWMs +''' +import os +import sys +import pandas as pd +import numpy as np + +#specify prefix for files +prefix = str(sys.argv[-1]) + +#define thresholds +spec_threshold = 0.0 +sq_precision = 1e-3 + +reference_PWM_length = 5 + +#save original output channel +original_stdout = sys.stdout + +#define nucleic acid dictionary +nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 } +inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" } + + + +''' +define class of PWMs with score +''' +class PWM_Score(): + def __init__(self, PWM, fg_log_lh, diff_log_lh, iterations, E0): + self.PWM = PWM + self.fg_log_lh = fg_log_lh + self.bg_log_lh = 0.0 + self.diff_log_lh = diff_log_lh + self.iterations = iterations + self.E0 = E0 + self.norm = 1.0 + self.len_corr_fact = 0.0 + self.log_KD_rel = 0.0 + + + + +''' +method gives distance between two PWMs and E0s +''' +def distance(p1,p2): + PWM1_length = len(p1.PWM) + PWM2_length = len(p2.PWM) + + if PWM1_length == PWM2_length: + d = 0.0 + for a,b in zip(p1.PWM,p2.PWM): + d += np.power((a-b),2) + + d /= PWM1_length + + d += np.power((p1.E0 - p2.E0),2) + + d /= 2 + + return d + else: + return np.inf + + + +''' +method removes duplicates +''' +def remove_duplicates(pArray): + deduplicated_PWM_array = [] + + for p in pArray: + isind = False + + for d in deduplicated_PWM_array: + if distance(p,d) < sq_precision: + isind = True + + if isind == False: + deduplicated_PWM_array.append(p) + + return deduplicated_PWM_array + + + +''' +Method calculates the baseline P(D|N...N) of a kmer of unit distribution of given length, as well as length correction factor and overall normalization. +Iterates over all files starting with "ENCFF" and ending with "mer.dat" in directory. +Assumes infinitely specific binding (E_0 to -infinity) + +Args: + length of PWM + length of reference PWM + +Returns: + PD of the reference PWM + correction factor for length difference + overall library size normalization factor +''' +def calculate_PD_baseline(PWM_length,ref_PWM_length): + ret_ref = 0.0 + ret_len = 0.0 + norm = 0.0 + + for fi in os.listdir(os.getcwd()): + filename = os.fsdecode(fi) + + if filename.startswith("ENCFF") and filename.endswith("mer.dat"): + df = pd.read_csv(filename,sep=" ",usecols=[0,1,2],names=['len','nbr','freq']) + + df = df[(df['len'] >= max(PWM_length,ref_PWM_length)) & (df['freq'] > 0.0)] + + L_arr = df['len'].to_numpy() + n_arr = df['nbr'].to_numpy() + f_arr = df['freq'].to_numpy() + + t1 = sum( n_arr * np.log(f_arr * (L_arr - ref_PWM_length + 1)) ) + t2 = sum(n_arr) * np.log( sum(f_arr * (L_arr - ref_PWM_length + 1))) + + ret_ref += t1 - t2 + + ret_len += sum( n_arr * np.log((L_arr - PWM_length + 1) / (L_arr - ref_PWM_length + 1)) ) + + norm += sum(n_arr) + + return ret_ref,ret_len,norm + + + +''' +function reads the PWMs from results file in directory +''' +def read_list_PWMs(): + PWM_score_array = [] + + for fi in os.listdir(os.getcwd()): + filename = os.fsdecode(fi) + + if ((not filename.startswith("ENCFF")) and filename.endswith("mer.dat") and filename.startswith(prefix)): + + foreground_file = filename + + #open files + f = open(foreground_file,'r') + + while True: + # Get next PWM from file + line1 = f.readline() #skip first line + line1 = f.readline() + line2 = f.readline() + line3 = f.readline() + line4 = f.readline() + line5 = f.readline() + + # if two lines are empty + # end of file is reached + if not (line1): + break + + #extract words from lines + val1 = line1.split() + val2 = line2.split() + val3 = line3.split() + val4 = line4.split() + val5 = line5.split() + + #skip free line after each PWM + line1 = f.readline() + + Lstart = float(val5[1]) + + line1 = f.readline() #skip first line + line1 = f.readline() + line2 = f.readline() + line3 = f.readline() + line4 = f.readline() + line5 = f.readline() + + # if two lines are empty + # end of file is reached + if not (line1): + break + + #extract words from lines + val1 = line1.split() + val2 = line2.split() + val3 = line3.split() + val4 = line4.split() + val5 = line5.split() + + #skip free line after each PWM + line1 = f.readline() + + Lstop = float(val5[1]) + + if (Lstart < Lstop and float(val5[0]) < spec_threshold): + Lw = len(val1) - 1 + + pPWM = [0.0 for j in range(0,4*Lw)] + + for i in range(0,Lw): + pPWM[0*Lw+i] = float(val1[i+1]) + for i in range(0,Lw): + pPWM[1*Lw+i] = float(val2[i+1]) + for i in range(0,Lw): + pPWM[2*Lw+i] = float(val3[i+1]) + for i in range(0,Lw): + pPWM[3*Lw+i] = float(val4[i+1]) + + tmp_obj = PWM_Score(PWM = pPWM, fg_log_lh = Lstop, diff_log_lh = Lstop - Lstart, iterations = int(val5[2]), E0 = float(val5[0])) + + PWM_score_array.append(tmp_obj) + + + f.close() + + return PWM_score_array + + + +''' +set relative logarithmic KDs +Args: + list of convergent PWMs +''' +def set_rel_log_KDs(PWM_list): + for p in PWM_list: + p.bg_log_lh, p.len_corr_fact, p.norm = calculate_PD_baseline(len(p.PWM)//4,reference_PWM_length) + p.log_KD_rel = (p.bg_log_lh - p.fg_log_lh + p.len_corr_fact) / p.norm + return PWM_list + + + +''' +prints the PWMs into a file +Args: + filename + PWM_list +''' +def print_PWMs(PWM_list,filename): + with open(filename,"w") as f: + sys.stdout = f + + #print ranked consensus PWMs + for p in sorted(PWM_list, key= lambda item: item.log_KD_rel, reverse= False): + print("", end = "\t") + for i in range(0,len(p.PWM)//4): + print((i+1), end = "\t\t") + print("") + for i in range(0,4): + print(inv_nucl_dict[i], end = "\t") + for j in range(0,len(p.PWM)//4): + print(p.PWM[i*len(p.PWM)//4+j], end = "\t") + print("") + print(p.E0, p.fg_log_lh, p.log_KD_rel, p.iterations) + print("") + sys.stdout = original_stdout + +''' +BEGIN SCRIPT +''' +PWM_list = read_list_PWMs() +PWM_list = remove_duplicates(PWM_list) +PWM_list = set_rel_log_KDs(PWM_list) +print_PWMs(PWM_list,"ranked_KD.dat") + + + + diff --git a/scripts_postprocessing/vertical_bar_summary_plot.py b/scripts_postprocessing/vertical_bar_summary_plot.py new file mode 100644 index 0000000000000000000000000000000000000000..56a62044b7cdf3b25d6e324796c3062eb6dbeb6c --- /dev/null +++ b/scripts_postprocessing/vertical_bar_summary_plot.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python +import os.path +import numpy as np +import pandas as pd +import matplotlib +import matplotlib.pyplot as plt +import seaborn as sns +import matplotlib.image as image +from matplotlib.offsetbox import (OffsetImage, AnnotationBbox) +import logomaker +from mpl_toolkits.axes_grid1.inset_locator import (inset_axes, InsetPosition, mark_inset) + + +barwidth=0.15 +bar_offset=0.2 + + +color_scheme = { 'A' : '#00873E', \ + 'C' : '#1175A8', \ + 'G' : '#F6BE00', \ + 'T' : '#B8293D', \ +} + +plt.rcParams['font.size'] = 28 + +''' +reads the PWM +''' +def read_motif(filename): + motif = pd.read_csv(filename, sep='\t',header=None) + + motif = motif.dropna(axis=1) + + motif = motif.set_index(0) + + if len(motif) == 0: + return None + + #add pseudocount + motif = motif.astype('float64') + motif = motif + 0.01 + + #normalize + motif = motif.divide(1.04, axis='rows') + motif = motif.astype('float64') + + motif1 = pd.DataFrame() + + # convert the pwm to bits info for the logo + for col in motif.columns: + tmp = motif[col].to_numpy() + + row_ft = -sum(tmp * np.log2(tmp)) + + motif1[col] = tmp * (2.0 - row_ft) + + motif = motif1.transpose() + + motif.columns = ['A','C','G','T'] + + return motif + + +lowest_KD_rbps = pd.read_csv("lowest_KD/lowest_KD_motifs.tsv",sep="\t",low_memory=False,usecols=['RBP','log_rel_K_D','iterations']) +highest_KD_rbps = pd.read_csv("highest_KD/highest_KD_motifs.tsv",sep="\t",low_memory=False,usecols=['RBP','log_rel_K_D','iterations']) +consensus_KD_rbps = pd.read_csv("consensus_KD/consensus_motifs.tsv",sep="\t",low_memory=False,usecols=['RBP','log_rel_K_D','iterations']) + +lowest_KD_rbps = lowest_KD_rbps[lowest_KD_rbps['iterations'] > 0] +highest_KD_rbps = highest_KD_rbps[highest_KD_rbps['iterations'] > 0] +consensus_KD_rbps = consensus_KD_rbps[consensus_KD_rbps['iterations'] > 0] + +lowest_KD_rbps = pd.merge(lowest_KD_rbps,highest_KD_rbps,on="RBP",how='left') +lowest_KD_rbps = pd.merge(lowest_KD_rbps,consensus_KD_rbps,on="RBP",how='left') + +lowest_KD_rbps.columns = ['RBP','log_rel_K_D_lowest','iterations_lowest','log_rel_K_D_highest','iterations_highest','log_rel_K_D_consensus','iterations_consensus'] + +lowest_KD_rbps = lowest_KD_rbps.sort_values('RBP',ascending=True) + +N = len(lowest_KD_rbps) + +bar_loc = np.arange(0,N) + +bar_names = lowest_KD_rbps['RBP'].to_numpy() +bar_length_lowest = lowest_KD_rbps['log_rel_K_D_lowest'].to_numpy() +bar_length_highest = lowest_KD_rbps['log_rel_K_D_highest'].to_numpy() +bar_length_consensus = lowest_KD_rbps['log_rel_K_D_consensus'].to_numpy() + + +fig, (ax1,ax2,ax3,ax4) = plt.subplots(4,figsize=(30,38)) + +ax1.bar(bar_loc[:N//4]-bar_offset, bar_length_lowest[:N//4], width=barwidth, align='center', color='blue', label='random initialization, lowest $K_{D}$') +ax1.bar(bar_loc[:N//4], bar_length_highest[:N//4], width=barwidth, align='center', color='red', label='random initialization, highest $K_{D}$') +ax1.bar(bar_loc[:N//4]+bar_offset, bar_length_consensus[:N//4], width=barwidth, align='center', color='green', label='consensus initialization, lowest $K_{D}$') + +ax1.set_xticks(bar_loc[:N//4],labels=bar_names[:N//4],rotation='vertical') + +ax1.set_xlim([-0.5,N//4-0.5]) +ax1.set_ylabel('$log_{10}$ $K_{D}^{rel}$',labelpad=0) +ax1.set_ylim([-4,7]) + +ax1.axhline(y=0, color='black',lw=0.8) + +for x, r in zip(np.arange(0,N//4),bar_names[:N//4]): + pwm = read_motif("lowest_KD/"+r+".dat") + + axins = ax1.inset_axes(((x+0.15)/(N//4),0.86,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if os.path.isfile("highest_KD/"+r+".dat"): + pwm = read_motif("highest_KD/"+r+".dat") + + axins = ax1.inset_axes(((x+0.15)/(N//4),0.73,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + if os.path.isfile("consensus_KD/"+r+".dat"): + pwm = read_motif("consensus_KD/"+r+".dat") + + axins = ax1.inset_axes(((x+0.15)/(N//4),0.6,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if np.isnan(bar_length_consensus[x]) == True: + ax1.text(x+bar_offset-0.07,0.07,'?') + +ax1.legend(loc='lower left',labelspacing=0.0) + +ax2.bar(bar_loc[N//4:2*(N//4)]-bar_offset, bar_length_lowest[N//4:2*(N//4)], width=barwidth, align='center', color='blue', label='random initialization, lowest $K_{D}$') +ax2.bar(bar_loc[N//4:2*(N//4)], bar_length_highest[N//4:2*(N//4)], width=barwidth, align='center', color='red', label='random initialization, highest $K_{D}$') +ax2.bar(bar_loc[N//4:2*(N//4)]+bar_offset, bar_length_consensus[N//4:2*(N//4)], width=barwidth, align='center', color='green', label='consensus initialization, lowest $K_{D}$') + +ax2.set_xticks(bar_loc[N//4:2*(N//4)],labels=bar_names[N//4:2*(N//4)],rotation='vertical') + +ax2.set_xlim([N//4-0.5,2*(N//4)-0.5]) +ax2.set_ylabel('$log_{10}$ $K_{D}^{rel}$',labelpad=0) +ax2.set_ylim([-4,7]) + +ax2.axhline(y=0, color='black',lw=1.0) + +for x, r in zip(np.arange(N//4,2*(N//4)),bar_names[N//4:2*(N//4)]): + pwm = read_motif("lowest_KD/"+r+".dat") + + axins = ax2.inset_axes(((x+0.15-N//4)/(N//4),0.86,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if os.path.isfile("highest_KD/"+r+".dat"): + pwm = read_motif("highest_KD/"+r+".dat") + + axins = ax2.inset_axes(((x+0.15-N//4)/(N//4),0.73,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + if os.path.isfile("consensus_KD/"+r+".dat"): + pwm = read_motif("consensus_KD/"+r+".dat") + + axins = ax2.inset_axes(((x+0.15-N//4)/(N//4),0.6,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if np.isnan(bar_length_consensus[x]) == True: + ax2.text(x+bar_offset-0.07,0.07,'?') + +ax3.bar(bar_loc[2*(N//4):3*(N//4)]-bar_offset, bar_length_lowest[2*(N//4):3*(N//4)], width=barwidth, align='center', color='blue') +ax3.bar(bar_loc[2*(N//4):3*(N//4)], bar_length_highest[2*(N//4):3*(N//4)], width=barwidth, align='center', color='red') +ax3.bar(bar_loc[2*(N//4):3*(N//4)]+bar_offset, bar_length_consensus[2*(N//4):3*(N//4)], width=barwidth, align='center', color='green') + +ax3.set_xticks(bar_loc[2*(N//4):3*(N//4)],labels=bar_names[2*(N//4):3*(N//4)],rotation='vertical') + +ax3.set_xlim([2*(N//4)-0.5,3*(N//4)-0.5]) +ax3.set_ylabel('$log_{10}$ $K_{D}^{rel}$',labelpad=0) +ax3.set_ylim([-4,7]) + +ax3.axhline(y=0, color='black',lw=1.0) + +for x, r in zip(np.arange(2*(N//4),3*(N//4)),bar_names[2*(N//4):3*(N//4)]): + pwm = read_motif("lowest_KD/"+r+".dat") + + axins = ax3.inset_axes(((x+0.15-2*(N//4))/(N//4),0.86,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if os.path.isfile("highest_KD/"+r+".dat"): + pwm = read_motif("highest_KD/"+r+".dat") + + axins = ax3.inset_axes(((x+0.15-2*(N//4))/(N//4),0.73,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + if os.path.isfile("consensus_KD/"+r+".dat"): + pwm = read_motif("consensus_KD/"+r+".dat") + + axins = ax3.inset_axes(((x+0.15-2*(N//4))/(N//4),0.6,0.8/(N//4),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if np.isnan(bar_length_consensus[x]) == True: + ax3.text(x+bar_offset-0.07,0.05,'?') + + +ax4.bar(bar_loc[3*(N//4):]-bar_offset, bar_length_lowest[3*(N//4):], width=barwidth, align='center', color='blue') +ax4.bar(bar_loc[3*(N//4):], bar_length_highest[3*(N//4):], width=barwidth, align='center', color='red') +ax4.bar(bar_loc[3*(N//4):]+bar_offset, bar_length_consensus[3*(N//4):], width=barwidth, align='center', color='green') + +ax4.set_xticks(bar_loc[3*(N//4):],labels=bar_names[3*(N//4):],rotation='vertical') + +ax4.set_xlabel('RBP',labelpad=10) +ax4.set_xlim([3*(N//4)-0.5,N-0.5]) +ax4.set_ylabel('$log_{10}$ $K_{D}^{rel}$',labelpad=0) +ax4.set_ylim([-4,7]) + +ax4.axhline(y=0, color='black',lw=1.0) + +for x, r in zip(np.arange(3*(N//4),N),bar_names[3*(N//4):]): + pwm = read_motif("lowest_KD/"+r+".dat") + + axins = ax4.inset_axes(((x+0.15-3*(N//4))/(N - 3*(N//4)),0.86,0.8/(N - 3*(N//4)),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if os.path.isfile("highest_KD/"+r+".dat"): + pwm = read_motif("highest_KD/"+r+".dat") + + axins = ax4.inset_axes(((x+0.15-3*(N//4))/(N - 3*(N//4)),0.73,0.8/(N - 3*(N//4)),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if os.path.isfile("consensus_KD/"+r+".dat"): + pwm = read_motif("consensus_KD/"+r+".dat") + + axins = ax4.inset_axes(((x+0.15-3*(N//4))/(N - 3*(N//4)),0.6,0.8/(N - 3*(N//4)),0.13)) + + motif_logo = logomaker.Logo(pwm,color_scheme=color_scheme,vpad=0,width=0.95,ax=axins) + + motif_logo.ax.set_ylim([0, 2]) + motif_logo.style_spines(visible=False) + motif_logo.ax.set_yticks([]) + motif_logo.ax.set_xticks([]) + + + if np.isnan(bar_length_consensus[x]) == True: + ax4.text(x+bar_offset-0.07,0.07,'?') + + +plt.subplots_adjust(hspace=0.41) + +plt.savefig('vertical_bar_plot.pdf', bbox_inches = "tight")