diff --git a/Makefile b/EM-algorithm/Makefile
similarity index 100%
rename from Makefile
rename to EM-algorithm/Makefile
diff --git a/README.txt b/EM-algorithm/README.txt
similarity index 100%
rename from README.txt
rename to EM-algorithm/README.txt
diff --git a/cluster_main.cpp b/EM-algorithm/cluster_main.cpp
similarity index 100%
rename from cluster_main.cpp
rename to EM-algorithm/cluster_main.cpp
diff --git a/inout.h b/EM-algorithm/inout.h
similarity index 100%
rename from inout.h
rename to EM-algorithm/inout.h
diff --git a/main.cpp b/EM-algorithm/main.cpp
similarity index 100%
rename from main.cpp
rename to EM-algorithm/main.cpp
diff --git a/params.h b/EM-algorithm/params.h
similarity index 100%
rename from params.h
rename to EM-algorithm/params.h
diff --git a/update.h b/EM-algorithm/update.h
similarity index 100%
rename from update.h
rename to EM-algorithm/update.h
diff --git a/Markov-model/README.txt b/Markov-model/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2534243fa43c413fb7aca4601353799bbe3ed1ac
--- /dev/null
+++ b/Markov-model/README.txt
@@ -0,0 +1,22 @@
+-----------------------------------------------
+Niels Schlusser, July 28th, 2022
+-----------------------------------------------
+
+This directory contains the python code and bash scripts that turn *.fastq.gz files into the desired format for our EM optimization of binding sites. To this end, the background frequencies need to be constructed from a Markov model of the highest possible degree.
+
+The code was run on the following system configuration:
+ - Ubuntu 20.04.1
+ - python 3.8.10
+ - Biopython 1.79
+ - Bash 5.0.7(1)
+ - cutadapt 3.4
+
+The code is organized in two parts: the actual Markov model is run in a python script ("construct_background.py", not parallelized), whereas the overarching tasks, including the execution of the python script itself, are organized in the bash script "execute.sh".
+
+All parameters are specified in the bash script "execute.sh":
+ - the degree of the Markov model. It should be as high as possible; in practice, a degree of 14 has proven feasible at a reasonable RAM consumption (~16 GB).
+ - the number of background files
+ - the (ENCODE-format) filenames (without the *.fastq.gz extension) for foreground and background
+
+For downloading and adapter trimming, execute the script with <./execute.sh dlct>. This yields *.fastq files with trimmed adapters. After this, run <./execute.sh> for the file preparation using the Markov model.
+
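Background on the indexing used throughout: the scripts read a k-mer as a base-4 number (A=0, C=1, G=2, T=3) with the first position as the least-significant digit, so a degree of 14 means one array entry for each of the 4^14 = 268,435,456 possible 14-mers, which is what drives the RAM requirement quoted above. A minimal sketch of the encoding and its inverse, with illustrative function names (the actual implementation lives in construct_background.py below):

    nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
    inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}

    def kmer_to_index(kmer):
        #base-4 value of the k-mer; position 0 contributes 4^0
        idx = 0
        for pos in range(0, len(kmer)):
            idx += nucl_dict[kmer[pos]] * pow(4, pos)
        return idx

    def index_to_kmer(idx, k):
        #inverse mapping, as used when background_frequencies_<k>mer.dat is written
        s = ""
        for j in range(0, k):
            s = s + inv_nucl_dict[idx % 4]
            idx //= 4
        return s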
diff --git a/Markov-model/construct_background.py b/Markov-model/construct_background.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ef9be7ba11c4af8fa044ba303a9fb152c5c55a6
--- /dev/null
+++ b/Markov-model/construct_background.py
@@ -0,0 +1,120 @@
+import Bio
+import sys
+import math
+from Bio import SeqIO
+
+#declare input
+background_files = []
+foreground_files = []
+b_kmer_len=0
+
+#initialize input
+if len(sys.argv) > 4:
+    b_kmer_len=int(sys.argv[1])
+    for ind in range(3,3+int(sys.argv[2])):
+        background_files.append(sys.argv[ind])
+    for ind in range(3+int(sys.argv[2]),len(sys.argv)):
+        foreground_files.append(sys.argv[ind])
+else:
+    sys.exit("Insufficient input")
+
+
+#define nucleic acid dictionary
+nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 }
+inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" }
+
+
+#save original output channel
+original_stdout = sys.stdout
+
+#create array of background counts
+background_frequency = [0.0 for i in range(0,pow(4,b_kmer_len))]
+
+
+
+#define function which calculates the background frequency of a given read S
+def construct_frequency(S):
+    ret = 0.0
+
+    for kmer_start in range(0, len(S)-b_kmer_len+1):
+        tmp_pos = 0
+
+        for kmer_rel_pos in range(0,b_kmer_len):
+            tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos)
+
+        ret += background_frequency[tmp_pos]
+
+    return ret
+
+
+
+#construct background frequency array
+normalization = 0
+for data_file in background_files:
+    for read in SeqIO.parse(data_file+".fastq","fastq"):
+        s = read.seq
+
+        for kmer_start in range(0,len(s)-b_kmer_len+1):
+            tmp_cnt = 0
+            is_N = 0
+
+            for kmer_rel_pos in range(0,b_kmer_len):
+                if(s[kmer_start+kmer_rel_pos] != 'N'):
+                    tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos)
+                else:
+                    is_N = 1
+
+            #only count the kmer if there is no N in it
+            if( is_N == 0 ):
+                background_frequency[tmp_cnt] += 1.0
+                normalization += 1
+
+
+
+
+#normalize and print the background frequency of each kmer
+with open("background_frequencies_"+str(b_kmer_len)+"mer.dat","w") as f:
+    sys.stdout = f
+    for i in range(0,pow(4,b_kmer_len)):
+        background_frequency[i] = background_frequency[i]/normalization
+        s = ""
+        tmpI = i
+        for j in range(0,b_kmer_len):
+            tmpN = tmpI % 4
+            s = s + inv_nucl_dict[tmpN]
+            tmpI -= tmpN
+            tmpI //= 4
+        print(s,background_frequency[i])
+    sys.stdout = original_stdout
+
+
+
+#create foreground sequence dictionaries, print in data format for RBAPBindnSeq
+background_normalization = 0
+number_kmers = 0
+for data_file in foreground_files:
+    foreground_dict = {}
+
+    #create foreground dictionary
+    for seq_record in SeqIO.parse(data_file+".fastq","fastq"):
+        if seq_record.seq in foreground_dict:
+            foreground_dict[seq_record.seq] += 1
+        else:
+            foreground_dict[seq_record.seq] = 1
+
+
+    #print read files with background frequencies
+    with open(data_file+"_"+str(b_kmer_len)+"mer.dat","w") as f:
+        sys.stdout = f
+        for s in foreground_dict:
+            if('N' not in s):
+                tmp_bgr = construct_frequency(s)
+                if tmp_bgr > 0.0 and len(s) >= b_kmer_len:
+                    background_normalization += foreground_dict[s]*math.log(tmp_bgr)
+                    number_kmers += foreground_dict[s]*(len(s) - b_kmer_len + 1)
+                    print(len(s),foreground_dict[s],tmp_bgr,s)
+        sys.stdout = original_stdout
+
+
+print("background normalization for ", b_kmer_len, "mer: ", background_normalization, "\nNumber of possible ", b_kmer_len, "mers: ", number_kmers)
+
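Each line of the foreground files written above (<name>_<k>mer.dat) carries four whitespace-separated columns: read length, read multiplicity, the summed Markov background frequency of the read, and the sequence itself. A minimal sketch of a consumer of that format, assuming the column layout just described (read_dat is an illustrative name, not part of the repository):

    def read_dat(path):
        #parse lines of the form: length  count  background_frequency  sequence
        records = []
        with open(path) as f:
            for line in f:
                val = line.split()
                if len(val) == 4:
                    records.append((int(val[0]), int(val[1]), float(val[2]), val[3]))
        return records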
diff --git a/Markov-model/execute.sh b/Markov-model/execute.sh
new file mode 100755
index 0000000000000000000000000000000000000000..02d7d5dd3108ed42acb5ec75252dcba451eb9122
--- /dev/null
+++ b/Markov-model/execute.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+Lw=14 #kmer-length
+nbr_bfiles=2 #number of background files
+
+#input file names
+background_files=("ENCFF002DGO" "ENCFF002DGY")
+foreground_files=("ENCFF002DGP" "ENCFF002DGQ" "ENCFF002DGR" "ENCFF002DGS" "ENCFF002DGT" "ENCFF002DGU" "ENCFF002DGV" "ENCFF002DGW" "ENCFF002DGX")
+
+
+
+####################
+####START SCRIPT####
+####################
+
+if [[ $1 == "dlct" ]]
+then
+    #download files
+    for filename in ${background_files[@]}
+    do
+        curl -O -L https://www.encodeproject.org/files/${filename}/@@download/${filename}.fastq.gz
+    done
+
+    for filename in ${foreground_files[@]}
+    do
+        curl -O -L https://www.encodeproject.org/files/${filename}/@@download/${filename}.fastq.gz
+    done
+
+
+    #turn into fastq and cut adapter
+    module purge
+    ml cutadapt
+
+    for filename in ${background_files[@]}
+    do
+        cutadapt -j 4 -a TGGAATTCTCGGGTGTCAAGG -o ${filename}.fastq ${filename}.fastq.gz
+    done
+
+    for filename in ${foreground_files[@]}
+    do
+        cutadapt -j 4 -a TGGAATTCTCGGGTGTCAAGG -o ${filename}.fastq ${filename}.fastq.gz
+    done
+else
+    #calculate background frequencies based on the Markov model
+    module purge
+    ml Biopython
+    python construct_background.py ${Lw} ${nbr_bfiles} ${background_files[@]} ${foreground_files[@]}
+
+    #report the maximum read length per foreground file (column 1 of the prepared .dat files)
+    for filename in ${foreground_files[@]}
+    do
+        awk 'BEGIN{max=0}{if($1>max) max=$1}END{print "'${filename}'",max}' ${filename}_${Lw}mer.dat
+    done
+
+    rm background_frequencies_${Lw}mer.dat
+fi
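With the defaults above, the python call in the else branch expands to the command line below; the first argument is the Markov degree, the second tells construct_background.py how many of the following filenames are background files, and the remaining arguments are the foreground files:

    python construct_background.py 14 2 ENCFF002DGO ENCFF002DGY ENCFF002DGP ENCFF002DGQ ENCFF002DGR ENCFF002DGS ENCFF002DGT ENCFF002DGU ENCFF002DGV ENCFF002DGW ENCFF002DGX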
diff --git a/Markov-model/move_wdir.sh b/Markov-model/move_wdir.sh
new file mode 100755
index 0000000000000000000000000000000000000000..35936a7c462b28f5caa3eb2dacf3b54cb68af502
--- /dev/null
+++ b/Markov-model/move_wdir.sh
@@ -0,0 +1,4 @@
+for lett in 002DGP 002DGQ 002DGR 002DGS 002DGT 002DGU 002DGV 002DGW 002DGX
+do
+    mv ENCFF${lett}_14mer.dat ../RBAPBindnSeq/ENCFF${lett}.dat
+done
diff --git a/python_scripts/background_log_likelihood.py b/python_scripts/background_log_likelihood.py
deleted file mode 100644
index 7972f68ac01819f8738fc972d73689bf4e4d2f16..0000000000000000000000000000000000000000
--- a/python_scripts/background_log_likelihood.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import Bio
-import sys
-import math
-from Bio import SeqIO
-
-
-#declare input
-background_files = ["ENCFF002DGZ", "ENCFF002DHJ"]
-eE0 = math.exp(2.848039)
-PWM = [0.99377,0.002394,0.000393,0.000303,0.999999,0.000407,#
-3.7e-05,9e-06,0.0,0.999696,0.0,0.001929,#
-0.0,0.0,0.999607,0.0,0.0,3e-06,#
-0.006192,0.997597,0.0,0.0,1e-06,0.997661]
-L_w = len(PWM) // 4
-
-
-#define nucleic acid dictionary
-nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
-inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}
-
-#save original output channel
-original_stdout = sys.stdout
-
-
-#create array of background counts
-background_frequency = [0.0 for i in range(0,pow(4,L_w))]
-
-
-#define function which calculates the background frequency of a given read S
-def construct_frequency(S):
-    ret = 0.0
-
-    for kmer_start in range(0,len(S)-L_w+1):
-        tmp_pos = 0
-
-        for kmer_rel_pos in range(0,L_w):
-            tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos)
-
-        ret += background_frequency[tmp_pos]
-
-    return ret
-
-
-#function determines exp(E(S))
-#position-specific binding likelihood
-def eES(read):
-    ret = 0.0
-    for kmer_start in range(0,len(read) - L_w + 1):
-        aux = 1.0
-        for kmer_rel_pos in range(0,L_w):
-            aux *= PWM[nucl_dict[read[kmer_start+kmer_rel_pos]]*L_w + kmer_rel_pos]
-        if 'N' not in read[kmer_start:kmer_start+L_w]:
-            ret += aux
-    return ret
-
-
-
-
-#-------------------------------
-#START OF THE SCRIPT
-#-------------------------------
-#construct background frequency array
-#normalization = 0
-#for data_file in background_files:
-    #for read in SeqIO.parse(data_file+".fastq","fastq"):
-        #s = read.seq
-
-        #for kmer_start in range(0,len(s)-L_w+1):
-            #tmp_cnt = 0
-            #is_N = 0
-
-            #for kmer_rel_pos in range(0,L_w):
-                #if(s[kmer_start+kmer_rel_pos] != 'N'):
-                    #tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos)
-                #else:
-                    #is_N = 1
-
-            ##only use sequence if there is no N in it
-            #if( is_N == 0 ):
-                #background_frequency[tmp_cnt] += 1.0
-                #normalization += 1
-
-
-
-
-##normalize
-#for i in range(0,pow(4,L_w)):
-    #background_frequency[i] = background_frequency[i]/normalization
-
-
-##create background sequence dictionaries, print in data format for RBAPBindnSeq
-#number_kmers = 0
-#for data_file in background_files:
-    ##print read files with background frequencies
-    #with open(data_file+".dat","w") as f:
-        #sys.stdout = f
-        #for seq_record in SeqIO.parse(data_file+".fastq","fastq"):
-            #if('N' not in seq_record.seq AND len(seq_record.seq) >= L_w):
-                #print(len(seq_record.seq),1,construct_frequency(seq_record.seq),seq_record.seq)
-        #sys.stdout = original_stdout
-
-
-bgr_log_likelihood = 0.0
-norm1 = 0
-norm2 = 0.0
-for data_file in background_files:
-    f = open(data_file+'.dat','r')
-
-    while True:
-        line = f.readline()
-
-        if not (line):
-            break
-
-        val = line.split()
-
-        peES = 0.0
-        if len(val) == 4:
-            peES = eES(val[3])
-            peES *= float(val[2])
-            peES += eE0*(int(val[0]) - L_w + 1)
-
-        if peES > 0.0:
-            bgr_log_likelihood += int(val[1])*math.log(peES)
-            norm1 += int(val[1])
-            norm2 += peES
-
-
-
-print("Background normalization",bgr_log_likelihood-norm1*math.log(norm2))
-
diff --git a/python_scripts/compare_kmers.py b/python_scripts/compare_kmers.py
deleted file mode 100755
index d3f834cc96b7703c24965e3e34db5e2f7db9eaae..0000000000000000000000000000000000000000
--- a/python_scripts/compare_kmers.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#script returns similarity scores
-#of PWMs
-
-import math
-import sys
-
-
-#give input filenames
-PWM_file_1 = "no_mask_HNRNPK_7mer_ranked.dat"
-PWM_file_2 = "no_mask_HNRNPK_8mer_ranked.dat"
-
-#define thresholds
-spec_threshhold = 20.0
-
-
-#auxiliary method
-#computes the Euclidean distance
-#for an alignment of rows
-#ind1 in PWM1
-#ind2 in PWM2
-#if ind2 is out of range of PWM2
-#assume equal distribution
-def distance_row(ind1,PWM1,ind2,PWM2):
-    ret = 0.0
-    for ind in range(0,4):
-        if 0 <= ind2 < int(len(PWM2)/4):
-            ret += math.pow(PWM1[ind*int(len(PWM1)/4) + ind1] - PWM2[ind*int(len(PWM2)/4) + ind2],2)
-        else:
-            ret += math.pow(PWM1[ind*int(len(PWM1)/4) + ind1] - 0.25,2)
-    return ret
-
-
-#auxiliary method
-#computes maximum Euclidean
-#similarity score
-#len(PWM1) <= len(PWM2) is assumed
-#assumes "half" mismatch for non-overlapping parts of PWM
-def max_sim(PWM1,PWM2):
-    ret = float("inf")
-    ptr = 0
-    for offset in range(-int(len(PWM1)/4)+1,int(len(PWM2)/4)):
-        tmp = 0.0
-        for i in range(0,int(len(PWM1)/4)):
-            tmp += distance_row(i,PWM1,i+offset,PWM2)
-        if(ret > tmp/int(len(PWM1)/4)):
-            ret = tmp/int(len(PWM1)/4)
-            ptr = offset
-    return ret,ptr
-
-
-#auxiliary method
-#returns float list with next PWM
-#extracted from in_file
-def get_PWM(in_file):
-    while True:
-        line1 = in_file.readline() #skip first line
-        line1 = in_file.readline()
-        line2 = in_file.readline()
-        line3 = in_file.readline()
-        line4 = in_file.readline()
-        line5 = in_file.readline()
-
-        # if two lines are empty
-        # end of file is reached
-        if not (line1):
-            return None
-
-        #extract words from lines
-        val1 = line1.split()
-        val2 = line2.split()
-        val3 = line3.split()
-        val4 = line4.split()
-        val5 = line5.split()
-
-        #skip free line after each PWM
-        line1 = in_file.readline()
-
-        if (int(float(val5[3])) > 0 and float(val5[0]) < spec_threshhold):
-            Lw = len(val1) - 1
-            PWM = [0.0 for j in range(0,4*Lw)]
-            for i in range(0,Lw):
-                PWM[0*Lw+i] = float(val1[i+1])
-            for i in range(0,Lw):
-                PWM[1*Lw+i] = float(val2[i+1])
-            for i in range(0,Lw):
-                PWM[2*Lw+i] = float(val3[i+1])
-            for i in range(0,Lw):
-                PWM[3*Lw+i] = float(val4[i+1])
-            return PWM
-
-        return None
-
-
-#define class of comparisons
-#of two PWMs
-class PWM_Comparison():
-    def __init__(self, index1, index2, distance, offset, PWM1, PWM2):
-        self.index1 = index1
-        self.index2 = index2
-        self.distance = distance
-        self.offset = offset
-        self.PWM1 = PWM1
-        self.PWM2 = PWM2
-
-
-#-------------------------------
-#START OF THE SCRIPT
-#-------------------------------
-
-f2 = open(PWM_file_2,'r')
-ind2 = -1
-while True:
-    pPWM2 = get_PWM(f2)
-
-    if pPWM2 is None:
-        break
-    else:
-        ind2 += 1
-
-    comparison_array = []
-    f1 = open(PWM_file_1,'r')
-    ind1 = -1
-    while True:
-        pPWM1 = get_PWM(f1)
-
-        if pPWM1 is None:
-            break
-        else:
-            ind1 += 1
-        di,of = max_sim(pPWM1,pPWM2)
-        tmp_comp = PWM_Comparison(index1 = ind1, index2 = ind2, distance = di, offset = of, PWM1 = pPWM1, PWM2 = pPWM2)
-        comparison_array.append(tmp_comp)
-
-    f1.close()
-    min_comp = min(comparison_array, key=lambda item: item.distance)
-    for i in range(0,4):
-        for j in range(0,int(len(min_comp.PWM1)/4)):
-            print(min_comp.PWM1[i*int(len(min_comp.PWM1)/4)+j], end = "\t")
-        print("")
-    print("")
-    for i in range(0,4):
-        for j in range(0,int(len(min_comp.PWM2)/4)):
-            print(min_comp.PWM2[i*int(len(min_comp.PWM2)/4)+j], end = "\t")
-        print("")
-    print(min_comp.index1,min_comp.index2,min_comp.distance,min_comp.offset)
-    print("")
-
-f2.close()
diff --git a/python_scripts/count_possible_kmers.sh b/python_scripts/count_possible_kmers.sh
deleted file mode 100755
index d4d430a7348652b0d261ba7df916d0cc75569d0e..0000000000000000000000000000000000000000
--- a/python_scripts/count_possible_kmers.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-for kmer_length in 4 5 6 7 8 9 10
-do
-    for file in ENCFF002DHA.dat ENCFF002DHB.dat ENCFF002DHC.dat ENCFF002DHD.dat ENCFF002DHE.dat ENCFF002DHF.dat ENCFF002DHG.dat ENCFF002DHH.dat ENCFF002DHI.dat
-    do
-        awk 'BEGIN{sum=0}{sum+=$2*($1 - '${kmer_length}' + 1)}END{printf("%d\n",sum)}' ${file} >> tmp.dat
-    done
-    awk 'BEGIN{sum=0}{sum+=$1}END{printf("%d\t%d\n",'${kmer_length}',sum)}' tmp.dat
-    rm tmp.dat
-done
diff --git a/python_scripts/nucleotide_bias.py b/python_scripts/nucleotide_bias.py
deleted file mode 100644
index 8edbea9fb94ef2544c0cfbbbbf9dd656738ee09c..0000000000000000000000000000000000000000
--- a/python_scripts/nucleotide_bias.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import Bio
-import sys
-import math
-from Bio import SeqIO
-
-
-#declare input
-#files = ["ENCFF002DHJ"]
-files = ["ENCFF002DHA", "ENCFF002DHB", "ENCFF002DHC", "ENCFF002DHD", "ENCFF002DHE", "ENCFF002DHF", "ENCFF002DHG", "ENCFF002DHH", "ENCFF002DHI"]
-sequence = "TGCATG"
-reach=3
-
-L_w=len(sequence)
-
-
-#define nucleic acid dictionary
-nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
-inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}
-
-
-
-#create arrays for nucleotide counts
-upstream_counts = [0.0 for i in range(0,4*reach)]
-downstream_counts = [0.0 for i in range(0,4*reach)]
-
-
-
-#-------------------------------
-#START OF THE SCRIPT
-#-------------------------------
-for data_file in files:
-    for read in SeqIO.parse(data_file+".fastq","fastq"):
-        s = read.seq
-
-        ind = s.find(sequence)
-        if ind == -1 or "N" in s:
-            continue
-        elif reach <= ind < len(s) - L_w - reach:
-            for r in range(0,reach):
-                upstream_counts[nucl_dict[s[ind-reach+r]]*reach+r] += 1.0
-                downstream_counts[nucl_dict[s[ind+L_w+r]]*reach+r] += 1.0
-
-
-
-
-#normalize
-for i in range(0,reach):
-    upstream_norm = 0.0
-    downstream_norm = 0.0
-    for j in range(0,4):
-        upstream_norm += upstream_counts[j*reach+i]
-        downstream_norm += downstream_counts[j*reach+i]
-    for j in range(0,4):
-        upstream_counts[j*reach+i] /= upstream_norm
-        downstream_counts[j*reach+i] /= downstream_norm
-
-
-#print result
-print("Sequence bias around "+sequence+":")
-
-for j in range(0,4):
-    for i in range(0,reach):
-        print(upstream_counts[j*reach+i],end="\t")
-    for i in range(0,reach):
-        print(downstream_counts[j*reach+i],end="\t")
-    print("")
diff --git a/python_scripts/plot.sh b/python_scripts/plot.sh
deleted file mode 100755
index 73985fc9f1f77c7c1c22fec876f86f9a95c2a3a8..0000000000000000000000000000000000000000
--- a/python_scripts/plot.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-#plots all PWMs from a given input file
-
-filename=PWMs_ranked.dat
-
-
-#split files
-split -l 7 -d --additional-suffix=.dat ${filename} pmat
-
-#safe number of digits for leading zeros
-di="$(ls -dq pmat*.dat | wc -l)"
-digits="${#di}"
-
-i=0
-for wdfile in ${PWD}/pmat*.dat;
-do
-    ((i++))
-    head -n 5 ${wdfile} > mat$(printf "%0${digits}d" ${i}).dat
-done
-
-rm pmat*.dat
-
-
-#load library
-echo "library(seqLogo)" >> next.plot.r
-
-#load tables
-i=0
-for wdfile in ${PWD}/mat*.dat;
-do
-    ((i++))
-    echo 'm'$(printf "%0${digits}d" ${i})' <- read.table("'${wdfile}'", header=TRUE, stringsAsFactors=FALSE)' >> next.plot.r
-    echo 'p'$(printf "%0${digits}d" ${i})' <- makePWM(m'$(printf "%0${digits}d" ${i})')' >> next.plot.r
-done
-
-
-#export plots to PDF, once with info content scaling, once without
-echo 'pdf("plots.pdf")' >> next.plot.r
-
-i=0
-for wdfile in ${PWD}/mat*.dat;
-do
-    ((i++))
-    echo 'seqLogo(p'$(printf "%0${digits}d" ${i})')' >> next.plot.r
-done
-
-echo 'dev.off()' >> next.plot.r
-
-echo 'q()' >> next.plot.r
-echo 'n' >> next.plot.r
-
-Rscript next.plot.r
-
-rm mat*.dat
-rm next.plot.r
diff --git a/python_scripts/rank_kmers.py b/python_scripts/rank_kmers.py
deleted file mode 100755
index c707d5edb970e12ddb976a08212e5101664363fb..0000000000000000000000000000000000000000
--- a/python_scripts/rank_kmers.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#script searches output file for PWMs
-#subtracts the respective background probabilities
-#prints a ranked list of accordingly ranked PWMs
-
-import math
-import os
-import sys
-
-#specify prefix for files
-prefix = "data"
-
-#define thresholds
-spec_threshhold = 50.0
-
-#give background values
-normalization_dict = {4 : 6044491827, 5 : 5880088810, 6 : 5715685929, 7 : 5551283214}
-
-#save original output channel
-original_stdout = sys.stdout
-
-#define nucleic acid dictionary
-nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 }
-inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" }
-
-
-
-#define class of PWMs with score
-class PWM_Score():
-    def __init__(self, PWM, foreground_log_likelihood, iterations, E0):
-        self.PWM = PWM
-        self.foreground_log_likelihood = foreground_log_likelihood
-        self.iterations = iterations
-        self.E0 = E0
-
-
-#--------------------
-#START SCRIPT
-#--------------------
-
-#initialize PWM collection
-PWM_score_array = []
-
-for fi in os.listdir(os.getcwd()):
-    filename = os.fsdecode(fi)
-
-    if ((not filename.startswith("ENCFF")) and filename.endswith("mer.dat") and filename.startswith(prefix)):
-
-        foreground_file = filename
-
-        #open files
-        f = open(foreground_file,'r')
-
-        while True:
-            # Get next PWM from file
-            line1 = f.readline() #skip first line
-            line1 = f.readline()
-            line2 = f.readline()
-            line3 = f.readline()
-            line4 = f.readline()
-            line5 = f.readline()
-
-            # if two lines are empty
-            # end of file is reached
-            if not (line1):
-                break
-
-            #extract words from lines
-            val1 = line1.split()
-            val2 = line2.split()
-            val3 = line3.split()
-            val4 = line4.split()
-            val5 = line5.split()
-
-            #skip free line after each PWM
-            line1 = f.readline()
-
-            Lstart = float(val5[1])
-
-            line1 = f.readline() #skip first line
-            line1 = f.readline()
-            line2 = f.readline()
-            line3 = f.readline()
-            line4 = f.readline()
-            line5 = f.readline()
-
-            # if two lines are empty
-            # end of file is reached
-            if not (line1):
-                break
-
-            #extract words from lines
-            val1 = line1.split()
-            val2 = line2.split()
-            val3 = line3.split()
-            val4 = line4.split()
-            val5 = line5.split()
-
-            #skip free line after each PWM
-            line1 = f.readline()
-
-            Lstop = float(val5[1])
-
-            if (Lstart < Lstop and float(val5[0]) < spec_threshhold):
-                Lw = len(val1) - 1
-                pPWM = [0.0 for j in range(0,4*Lw)]
-                for i in range(0,Lw):
-                    pPWM[0*Lw+i] = float(val1[i+1])
-                for i in range(0,Lw):
-                    pPWM[1*Lw+i] = float(val2[i+1])
-                for i in range(0,Lw):
-                    pPWM[2*Lw+i] = float(val3[i+1])
-                for i in range(0,Lw):
-                    pPWM[3*Lw+i] = float(val4[i+1])
-                tmp_foreground = Lstop #/ normalization_dict[Lw]
-                tmp_iterations = int(float(val5[2]))
-                tmp_E0 = float(val5[0])
-                tmp_obj = PWM_Score(PWM = pPWM, foreground_log_likelihood = tmp_foreground, iterations = tmp_iterations, E0 = tmp_E0)
-                PWM_score_array.append(tmp_obj)
-
-
-
-        f.close()
-
-#with open(filename[:-4]+"_ranked.dat","w") as f:
-with open("PWMs_ranked.dat","w") as f:
-    sys.stdout = f
-    #print ranked consensus PWMs
-    for p in sorted(PWM_score_array, key= lambda item: item.foreground_log_likelihood, reverse= True):
-        print("", end = "\t")
-        for i in range(0,int(len(p.PWM)/4)):
-            print((i+1), end = "\t\t")
-        print("")
-        for i in range(0,4):
-            print(inv_nucl_dict[i], end = "\t")
-            for j in range(0,int(len(p.PWM)/4)):
-                print(p.PWM[i*int(len(p.PWM)/4)+j], end = "\t")
-            print("")
-        print(p.E0, p.foreground_log_likelihood, p.iterations)
-        print("")
-    sys.stdout = original_stdout