diff --git a/Makefile b/EM-algorithm/Makefile
similarity index 100%
rename from Makefile
rename to EM-algorithm/Makefile
diff --git a/README.txt b/EM-algorithm/README.txt
similarity index 100%
rename from README.txt
rename to EM-algorithm/README.txt
diff --git a/cluster_main.cpp b/EM-algorithm/cluster_main.cpp
similarity index 100%
rename from cluster_main.cpp
rename to EM-algorithm/cluster_main.cpp
diff --git a/inout.h b/EM-algorithm/inout.h
similarity index 100%
rename from inout.h
rename to EM-algorithm/inout.h
diff --git a/main.cpp b/EM-algorithm/main.cpp
similarity index 100%
rename from main.cpp
rename to EM-algorithm/main.cpp
diff --git a/params.h b/EM-algorithm/params.h
similarity index 100%
rename from params.h
rename to EM-algorithm/params.h
diff --git a/update.h b/EM-algorithm/update.h
similarity index 100%
rename from update.h
rename to EM-algorithm/update.h
diff --git a/Markov-model/README.txt b/Markov-model/README.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2534243fa43c413fb7aca4601353799bbe3ed1ac
--- /dev/null
+++ b/Markov-model/README.txt
@@ -0,0 +1,22 @@
+-----------------------------------------------
+Niels Schlusser, July 28th, 2022
+-----------------------------------------------
+
+This directory contains the python code and bash scripts that turn *.fastq.gz files into the desired format for our EM optimization of binding sites. To this end, the background frequencies need to be constructed from a Markov model of the highest possible degree.
+
+The code was run on the following system configuration:
+ - Ubuntu 20.04.1
+ - python 3.8.10
+ - Biopython 1.79
+ - Bash 5.0.7(1)
+ - cutadapt 3.4
+
+The code is organized in two parts: the actual Markov model is run in a python script ("construct_background.py", not parallelized), whereas the overarching tasks, including the execution of the python script itself, are organized in the bash script "execute.sh".
+
+All parameters are specified in the bash script "execute.sh":
+ - the degree of the Markov model. It should be as high as possible; in practice, a degree of 14 has proven feasible at a reasonable RAM consumption (~16 GB).
+ - the number of background files
+ - the (ENCODE-format) filenames (without the *.fastq.gz extension) for foreground and background
+
+For downloading and adapter trimming, execute the script with <./execute.sh dlct>. This yields *.fastq files with trimmed adapters. After this, run <./execute.sh> for the file preparation using the Markov model.
+
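Background on the indexing used throughout: the scripts read a k-mer as a base-4 number (A=0, C=1, G=2, T=3) with the first position as the least-significant digit, so a degree of 14 means one array entry for each of the 4^14 = 268,435,456 possible 14-mers, which is what drives the RAM requirement quoted above. A minimal sketch of the encoding and its inverse, with illustrative function names (the actual implementation lives in construct_background.py below):

    nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
    inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}

    def kmer_to_index(kmer):
        #base-4 value of the k-mer; position 0 contributes 4^0
        idx = 0
        for pos in range(0, len(kmer)):
            idx += nucl_dict[kmer[pos]] * pow(4, pos)
        return idx

    def index_to_kmer(idx, k):
        #inverse mapping, as used when background_frequencies_<k>mer.dat is written
        s = ""
        for j in range(0, k):
            s = s + inv_nucl_dict[idx % 4]
            idx //= 4
        return s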
diff --git a/Markov-model/construct_background.py b/Markov-model/construct_background.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ef9be7ba11c4af8fa044ba303a9fb152c5c55a6
--- /dev/null
+++ b/Markov-model/construct_background.py
@@ -0,0 +1,120 @@
+import Bio
+import sys
+import math
+from Bio import SeqIO
+
+#declare input
+background_files = []
+foreground_files = []
+b_kmer_len=0
+
+#initialize input
+if len(sys.argv) > 4:
+    b_kmer_len=int(sys.argv[1])
+    for ind in range(3,3+int(sys.argv[2])):
+        background_files.append(sys.argv[ind])
+    for ind in range(3+int(sys.argv[2]),len(sys.argv)):
+        foreground_files.append(sys.argv[ind])
+else:
+    sys.exit("Insufficient input")
+
+
+#define nucleic acid dictionary
+nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 }
+inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" }
+
+
+#save original output channel
+original_stdout = sys.stdout
+
+#create array of background counts
+background_frequency = [0.0 for i in range(0,pow(4,b_kmer_len))]
+
+
+
+#define function which calculates the background frequency of a given read S
+def construct_frequency(S):
+    ret = 0.0
+
+    for kmer_start in range(0, len(S)-b_kmer_len+1):
+        tmp_pos = 0
+
+        for kmer_rel_pos in range(0,b_kmer_len):
+            tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos)
+
+        ret += background_frequency[tmp_pos]
+
+    return ret
+
+
+
+#construct background frequency array
+normalization = 0
+for data_file in background_files:
+    for read in SeqIO.parse(data_file+".fastq","fastq"):
+        s = read.seq
+
+        for kmer_start in range(0,len(s)-b_kmer_len+1):
+            tmp_cnt = 0
+            is_N = 0
+
+            for kmer_rel_pos in range(0,b_kmer_len):
+                if(s[kmer_start+kmer_rel_pos] != 'N'):
+                    tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos)
+                else:
+                    is_N = 1
+
+            #only count the kmer if there is no N in it
+            if( is_N == 0 ):
+                background_frequency[tmp_cnt] += 1.0
+                normalization += 1
+
+
+
+
+#normalize and print the background frequency of each kmer
+with open("background_frequencies_"+str(b_kmer_len)+"mer.dat","w") as f:
+    sys.stdout = f
+    for i in range(0,pow(4,b_kmer_len)):
+        background_frequency[i] = background_frequency[i]/normalization
+        s = ""
+        tmpI = i
+        for j in range(0,b_kmer_len):
+            tmpN = tmpI % 4
+            s = s + inv_nucl_dict[tmpN]
+            tmpI -= tmpN
+            tmpI //= 4
+        print(s,background_frequency[i])
+    sys.stdout = original_stdout
+
+
+
+#create foreground sequence dictionaries, print in data format for RBAPBindnSeq
+background_normalization = 0
+number_kmers = 0
+for data_file in foreground_files:
+    foreground_dict = {}
+
+    #create foreground dictionary
+    for seq_record in SeqIO.parse(data_file+".fastq","fastq"):
+        if seq_record.seq in foreground_dict:
+            foreground_dict[seq_record.seq] += 1
+        else:
+            foreground_dict[seq_record.seq] = 1
+
+
+    #print read files with background frequencies
+    with open(data_file+"_"+str(b_kmer_len)+"mer.dat","w") as f:
+        sys.stdout = f
+        for s in foreground_dict:
+            if('N' not in s):
+                tmp_bgr = construct_frequency(s)
+                if tmp_bgr > 0.0 and len(s) >= b_kmer_len:
+                    background_normalization += foreground_dict[s]*math.log(tmp_bgr)
+                    number_kmers += foreground_dict[s]*(len(s) - b_kmer_len + 1)
+                    print(len(s),foreground_dict[s],tmp_bgr,s)
+        sys.stdout = original_stdout
+
+
+print("background normalization for ", b_kmer_len, "mer: ", background_normalization, "\nNumber of possible ", b_kmer_len, "mers: ", number_kmers)
+
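Each line of the foreground files written above (<name>_<k>mer.dat) carries four whitespace-separated columns: read length, read multiplicity, the summed Markov background frequency of the read, and the sequence itself. A minimal sketch of a consumer of that format, assuming the column layout just described (read_dat is an illustrative name, not part of the repository):

    def read_dat(path):
        #parse lines of the form: length  count  background_frequency  sequence
        records = []
        with open(path) as f:
            for line in f:
                val = line.split()
                if len(val) == 4:
                    records.append((int(val[0]), int(val[1]), float(val[2]), val[3]))
        return records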
diff --git a/Markov-model/execute.sh b/Markov-model/execute.sh
new file mode 100755
index 0000000000000000000000000000000000000000..02d7d5dd3108ed42acb5ec75252dcba451eb9122
--- /dev/null
+++ b/Markov-model/execute.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+Lw=14 #kmer-length
+nbr_bfiles=2 #number of background files
+
+#input file names
+background_files=("ENCFF002DGO" "ENCFF002DGY")
+foreground_files=("ENCFF002DGP" "ENCFF002DGQ" "ENCFF002DGR" "ENCFF002DGS" "ENCFF002DGT" "ENCFF002DGU" "ENCFF002DGV" "ENCFF002DGW" "ENCFF002DGX")
+
+
+
+####################
+####START SCRIPT####
+####################
+
+if [[ $1 == "dlct" ]]
+then
+    #download files
+    for filename in ${background_files[@]}
+    do
+        curl -O -L https://www.encodeproject.org/files/${filename}/@@download/${filename}.fastq.gz
+    done
+
+    for filename in ${foreground_files[@]}
+    do
+        curl -O -L https://www.encodeproject.org/files/${filename}/@@download/${filename}.fastq.gz
+    done
+
+
+    #turn into fastq and cut adapter
+    module purge
+    ml cutadapt
+
+    for filename in ${background_files[@]}
+    do
+        cutadapt -j 4 -a TGGAATTCTCGGGTGTCAAGG -o ${filename}.fastq ${filename}.fastq.gz
+    done
+
+    for filename in ${foreground_files[@]}
+    do
+        cutadapt -j 4 -a TGGAATTCTCGGGTGTCAAGG -o ${filename}.fastq ${filename}.fastq.gz
+    done
+else
+    #calculate background frequencies based on the Markov model
+    module purge
+    ml Biopython
+    python construct_background.py ${Lw} ${nbr_bfiles} ${background_files[@]} ${foreground_files[@]}
+
+    #report the maximum read length per foreground file (column 1 of the prepared .dat files)
+    for filename in ${foreground_files[@]}
+    do
+        awk 'BEGIN{max=0}{if($1>max) max=$1}END{print "'${filename}'",max}' ${filename}_${Lw}mer.dat
+    done
+
+    rm background_frequencies_${Lw}mer.dat
+fi
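With the defaults above, the python call in the else branch expands to the command line below; the first argument is the Markov degree, the second tells construct_background.py how many of the following filenames are background files, and the remaining arguments are the foreground files:

    python construct_background.py 14 2 ENCFF002DGO ENCFF002DGY ENCFF002DGP ENCFF002DGQ ENCFF002DGR ENCFF002DGS ENCFF002DGT ENCFF002DGU ENCFF002DGV ENCFF002DGW ENCFF002DGX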
diff --git a/Markov-model/move_wdir.sh b/Markov-model/move_wdir.sh
new file mode 100755
index 0000000000000000000000000000000000000000..35936a7c462b28f5caa3eb2dacf3b54cb68af502
--- /dev/null
+++ b/Markov-model/move_wdir.sh
@@ -0,0 +1,4 @@
+for lett in 002DGP 002DGQ 002DGR 002DGS 002DGT 002DGU 002DGV 002DGW 002DGX
+do
+    mv ENCFF${lett}_14mer.dat ../RBAPBindnSeq/ENCFF${lett}.dat
+done
diff --git a/python_scripts/background_log_likelihood.py b/python_scripts/background_log_likelihood.py
deleted file mode 100644
index 7972f68ac01819f8738fc972d73689bf4e4d2f16..0000000000000000000000000000000000000000
--- a/python_scripts/background_log_likelihood.py
+++ /dev/null
@@ -1,131 +0,0 @@
-import Bio
-import sys
-import math
-from Bio import SeqIO
-
-
-#declare input
-background_files = ["ENCFF002DGZ", "ENCFF002DHJ"]
-eE0 = math.exp(2.848039)
-PWM = [0.99377,0.002394,0.000393,0.000303,0.999999,0.000407,#
-3.7e-05,9e-06,0.0,0.999696,0.0,0.001929,#
-0.0,0.0,0.999607,0.0,0.0,3e-06,#
-0.006192,0.997597,0.0,0.0,1e-06,0.997661]
-L_w = len(PWM) // 4
-
-
-#define nucleic acid dictionary
-nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
-inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}
-
-#save original output channel
-original_stdout = sys.stdout
-
-
-#create array of background counts
-background_frequency = [0.0 for i in range(0,pow(4,L_w))]
-
-
-#define function which calculates the background frequency of a given read S
-def construct_frequency(S):
-    ret = 0.0
-
-    for kmer_start in range(0,len(S)-L_w+1):
-        tmp_pos = 0
-
-        for kmer_rel_pos in range(0,L_w):
-            tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos)
-
-        ret += background_frequency[tmp_pos]
-
-    return ret
-
-
-#function determines exp(E(S))
-#position-specific binding likelihood
-def eES(read):
-    ret = 0.0
-    for kmer_start in range(0,len(read) - L_w + 1):
-        aux = 1.0
-        for kmer_rel_pos in range(0,L_w):
-            aux *= PWM[nucl_dict[read[kmer_start+kmer_rel_pos]]*L_w + kmer_rel_pos]
-        if 'N' not in read[kmer_start:kmer_start+L_w]:
-            ret += aux
-    return ret
-
-
-
-
-#-------------------------------
-#START OF THE SCRIPT
-#-------------------------------
-#construct background frequency array
-#normalization = 0
-#for data_file in background_files:
-    #for read in SeqIO.parse(data_file+".fastq","fastq"):
-        #s = read.seq
-
-        #for kmer_start in range(0,len(s)-L_w+1):
-            #tmp_cnt = 0
-            #is_N = 0
-
-            #for kmer_rel_pos in range(0,L_w):
-                #if(s[kmer_start+kmer_rel_pos] != 'N'):
-                    #tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos)
-                #else:
-                    #is_N = 1
-
-            ##only use sequence if there is no N in it
-            #if( is_N == 0 ):
-                #background_frequency[tmp_cnt] += 1.0
-                #normalization += 1
-
-
-
-
-##normalize
-#for i in range(0,pow(4,L_w)):
-    #background_frequency[i] = background_frequency[i]/normalization
-
-
-##create background sequence dictionaries, print in data format for RBAPBindnSeq
-#number_kmers = 0
-#for data_file in background_files:
-    ##print read files with background frequencies
-    #with open(data_file+".dat","w") as f:
-        #sys.stdout = f
-        #for seq_record in SeqIO.parse(data_file+".fastq","fastq"):
-            #if('N' not in seq_record.seq AND len(seq_record.seq) >= L_w):
-                #print(len(seq_record.seq),1,construct_frequency(seq_record.seq),seq_record.seq)
-        #sys.stdout = original_stdout
-
-
-bgr_log_likelihood = 0.0
-norm1 = 0
-norm2 = 0.0
-for data_file in background_files:
-    f = open(data_file+'.dat','r')
-
-    while True:
-        line = f.readline()
-
-        if not (line):
-            break
-
-        val = line.split()
-
-        peES = 0.0
-        if len(val) == 4:
-            peES = eES(val[3])
-            peES *= float(val[2])
-            peES += eE0*(int(val[0]) - L_w + 1)
-
-        if peES > 0.0:
-            bgr_log_likelihood += int(val[1])*math.log(peES)
-            norm1 += int(val[1])
-            norm2 += peES
-
-
-
-print("Background normalization",bgr_log_likelihood-norm1*math.log(norm2))
-
diff --git a/python_scripts/compare_kmers.py b/python_scripts/compare_kmers.py
deleted file mode 100755
index d3f834cc96b7703c24965e3e34db5e2f7db9eaae..0000000000000000000000000000000000000000
--- a/python_scripts/compare_kmers.py
+++ /dev/null
@@ -1,148 +0,0 @@
-#script returns similarity scores
-#of PWMs
-
-import math
-import sys
-
-
-#give input filenames
-PWM_file_1 = "no_mask_HNRNPK_7mer_ranked.dat"
-PWM_file_2 = "no_mask_HNRNPK_8mer_ranked.dat"
-
-#define thresholds
-spec_threshhold = 20.0
-
-
-#auxiliary method
-#computes the Euclidean distance
-#for an alignment of rows
-#ind1 in PWM1
-#ind2 in PWM2
-#if ind2 is out of range of PWM2
-#assume equal distribution
-def distance_row(ind1,PWM1,ind2,PWM2):
-    ret = 0.0
-    for ind in range(0,4):
-        if 0 <= ind2 < int(len(PWM2)/4):
-            ret += math.pow(PWM1[ind*int(len(PWM1)/4) + ind1] - PWM2[ind*int(len(PWM2)/4) + ind2],2)
-        else:
-            ret += math.pow(PWM1[ind*int(len(PWM1)/4) + ind1] - 0.25,2)
-    return ret
-
-
-#auxiliary method
-#computes maximum Euclidean
-#similarity score
-#len(PWM1) <= len(PWM2) is assumed
-#assumes "half" mismatch for non-overlapping parts of PWM
-def max_sim(PWM1,PWM2):
-    ret = float("inf")
-    ptr = 0
-    for offset in range(-int(len(PWM1)/4)+1,int(len(PWM2)/4)):
-        tmp = 0.0
-        for i in range(0,int(len(PWM1)/4)):
-            tmp += distance_row(i,PWM1,i+offset,PWM2)
-        if(ret > tmp/int(len(PWM1)/4)):
-            ret = tmp/int(len(PWM1)/4)
-            ptr = offset
-    return ret,ptr
-
-
-#auxiliary method
-#returns float list with next PWM
-#extracted from in_file
-def get_PWM(in_file):
-    while True:
-        line1 = in_file.readline() #skip first line
-        line1 = in_file.readline()
-        line2 = in_file.readline()
-        line3 = in_file.readline()
-        line4 = in_file.readline()
-        line5 = in_file.readline()
-
-        # if two lines are empty
-        # end of file is reached
-        if not (line1):
-            return None
-
-        #extract words from lines
-        val1 = line1.split()
-        val2 = line2.split()
-        val3 = line3.split()
-        val4 = line4.split()
-        val5 = line5.split()
-
-        #skip free line after each PWM
-        line1 = in_file.readline()
-
-        if (int(float(val5[3])) > 0 and float(val5[0]) < spec_threshhold):
-            Lw = len(val1) - 1
-            PWM = [0.0 for j in range(0,4*Lw)]
-            for i in range(0,Lw):
-                PWM[0*Lw+i] = float(val1[i+1])
-            for i in range(0,Lw):
-                PWM[1*Lw+i] = float(val2[i+1])
-            for i in range(0,Lw):
-                PWM[2*Lw+i] = float(val3[i+1])
-            for i in range(0,Lw):
-                PWM[3*Lw+i] = float(val4[i+1])
-            return PWM
-
-        return None
-
-
-#define class of comparisons
-#of two PWMs
-class PWM_Comparison():
-    def __init__(self, index1, index2, distance, offset, PWM1, PWM2):
-        self.index1 = index1
-        self.index2 = index2
-        self.distance = distance
-        self.offset = offset
-        self.PWM1 = PWM1
-        self.PWM2 = PWM2
-
-
-#-------------------------------
-#START OF THE SCRIPT
-#-------------------------------
-
-f2 = open(PWM_file_2,'r')
-ind2 = -1
-while True:
-    pPWM2 = get_PWM(f2)
-
-    if pPWM2 is None:
-        break
-    else:
-        ind2 += 1
-
-    comparison_array = []
-    f1 = open(PWM_file_1,'r')
-    ind1 = -1
-    while True:
-        pPWM1 = get_PWM(f1)
-
-        if pPWM1 is None:
-            break
-        else:
-            ind1 += 1
-        di,of = max_sim(pPWM1,pPWM2)
-        tmp_comp = PWM_Comparison(index1 = ind1, index2 = ind2, distance = di, offset = of, PWM1 = pPWM1, PWM2 = pPWM2)
-        comparison_array.append(tmp_comp)
-
-    f1.close()
-    min_comp = min(comparison_array, key=lambda item: item.distance)
-    for i in range(0,4):
-        for j in range(0,int(len(min_comp.PWM1)/4)):
-            print(min_comp.PWM1[i*int(len(min_comp.PWM1)/4)+j], end = "\t")
-        print("")
-    print("")
-    for i in range(0,4):
-        for j in range(0,int(len(min_comp.PWM2)/4)):
-            print(min_comp.PWM2[i*int(len(min_comp.PWM2)/4)+j], end = "\t")
-        print("")
-    print(min_comp.index1,min_comp.index2,min_comp.distance,min_comp.offset)
-    print("")
-
-f2.close()
diff --git a/python_scripts/count_possible_kmers.sh b/python_scripts/count_possible_kmers.sh
deleted file mode 100755
index d4d430a7348652b0d261ba7df916d0cc75569d0e..0000000000000000000000000000000000000000
--- a/python_scripts/count_possible_kmers.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-for kmer_length in 4 5 6 7 8 9 10
-do
-    for file in ENCFF002DHA.dat ENCFF002DHB.dat ENCFF002DHC.dat ENCFF002DHD.dat ENCFF002DHE.dat ENCFF002DHF.dat ENCFF002DHG.dat ENCFF002DHH.dat ENCFF002DHI.dat
-    do
-        awk 'BEGIN{sum=0}{sum+=$2*($1 - '${kmer_length}' + 1)}END{printf("%d\n",sum)}' ${file} >> tmp.dat
-    done
-    awk 'BEGIN{sum=0}{sum+=$1}END{printf("%d\t%d\n",'${kmer_length}',sum)}' tmp.dat
-    rm tmp.dat
-done
diff --git a/python_scripts/nucleotide_bias.py b/python_scripts/nucleotide_bias.py
deleted file mode 100644
index 8edbea9fb94ef2544c0cfbbbbf9dd656738ee09c..0000000000000000000000000000000000000000
--- a/python_scripts/nucleotide_bias.py
+++ /dev/null
@@ -1,66 +0,0 @@
-import Bio
-import sys
-import math
-from Bio import SeqIO
-
-
-#declare input
-#files = ["ENCFF002DHJ"]
-files = ["ENCFF002DHA", "ENCFF002DHB", "ENCFF002DHC", "ENCFF002DHD", "ENCFF002DHE", "ENCFF002DHF", "ENCFF002DHG", "ENCFF002DHH", "ENCFF002DHI"]
-sequence = "TGCATG"
-reach=3
-
-L_w=len(sequence)
-
-
-#define nucleic acid dictionary
-nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
-inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}
-
-
-
-#create arrays for nucleotide counts
-upstream_counts = [0.0 for i in range(0,4*reach)]
-downstream_counts = [0.0 for i in range(0,4*reach)]
-
-
-
-#-------------------------------
-#START OF THE SCRIPT
-#-------------------------------
-for data_file in files:
-    for read in SeqIO.parse(data_file+".fastq","fastq"):
-        s = read.seq
-
-        ind = s.find(sequence)
-        if ind == -1 or "N" in s:
-            continue
-        elif reach <= ind < len(s) - L_w - reach:
-            for r in range(0,reach):
-                upstream_counts[nucl_dict[s[ind-reach+r]]*reach+r] += 1.0
-                downstream_counts[nucl_dict[s[ind+L_w+r]]*reach+r] += 1.0
-
-
-
-
-#normalize
-for i in range(0,reach):
-    upstream_norm = 0.0
-    downstream_norm = 0.0
-    for j in range(0,4):
-        upstream_norm += upstream_counts[j*reach+i]
-        downstream_norm += downstream_counts[j*reach+i]
-    for j in range(0,4):
-        upstream_counts[j*reach+i] /= upstream_norm
-        downstream_counts[j*reach+i] /= downstream_norm
-
-
-#print result
-print("Sequence bias around "+sequence+":")
-
-for j in range(0,4):
-    for i in range(0,reach):
-        print(upstream_counts[j*reach+i],end="\t")
-    for i in range(0,reach):
-        print(downstream_counts[j*reach+i],end="\t")
-    print("")
diff --git a/python_scripts/plot.sh b/python_scripts/plot.sh
deleted file mode 100755
index 73985fc9f1f77c7c1c22fec876f86f9a95c2a3a8..0000000000000000000000000000000000000000
--- a/python_scripts/plot.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-#plots all PWMs from a given input file
-
-filename=PWMs_ranked.dat
-
-
-#split files
-split -l 7 -d --additional-suffix=.dat ${filename} pmat
-
-#safe number of digits for leading zeros
-di="$(ls -dq pmat*.dat | wc -l)"
-digits="${#di}"
-
-i=0
-for wdfile in ${PWD}/pmat*.dat;
-do
-    ((i++))
-    head -n 5 ${wdfile} > mat$(printf "%0${digits}d" ${i}).dat
-done
-
-rm pmat*.dat
-
-
-#load library
-echo "library(seqLogo)" >> next.plot.r
-
-#load tables
-i=0
-for wdfile in ${PWD}/mat*.dat;
-do
-    ((i++))
-    echo 'm'$(printf "%0${digits}d" ${i})' <- read.table("'${wdfile}'", header=TRUE, stringsAsFactors=FALSE)' >> next.plot.r
-    echo 'p'$(printf "%0${digits}d" ${i})' <- makePWM(m'$(printf "%0${digits}d" ${i})')' >> next.plot.r
-done
-
-
-#export plots to PDF, once with info content scaling, once without
-echo 'pdf("plots.pdf")' >> next.plot.r
-
-i=0
-for wdfile in ${PWD}/mat*.dat;
-do
-    ((i++))
-    echo 'seqLogo(p'$(printf "%0${digits}d" ${i})')' >> next.plot.r
-done
-
-echo 'dev.off()' >> next.plot.r
-
-echo 'q()' >> next.plot.r
-echo 'n' >> next.plot.r
-
-Rscript next.plot.r
-
-rm mat*.dat
-rm next.plot.r
diff --git a/python_scripts/rank_kmers.py b/python_scripts/rank_kmers.py
deleted file mode 100755
index c707d5edb970e12ddb976a08212e5101664363fb..0000000000000000000000000000000000000000
--- a/python_scripts/rank_kmers.py
+++ /dev/null
@@ -1,140 +0,0 @@
-#script searches output file for PWMs
-#subtracts the respective background probabilities
-#prints a ranked list of accordingly ranked PWMs
-
-import math
-import os
-import sys
-
-#specify prefix for files
-prefix = "data"
-
-#define thresholds
-spec_threshhold = 50.0
-
-#give background values
-normalization_dict = {4 : 6044491827, 5 : 5880088810, 6 : 5715685929, 7 : 5551283214}
-
-#save original output channel
-original_stdout = sys.stdout
-
-#define nucleic acid dictionary
-nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 }
-inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" }
-
-
-
-#define class of PWMs with score
-class PWM_Score():
-    def __init__(self, PWM, foreground_log_likelihood, iterations, E0):
-        self.PWM = PWM
-        self.foreground_log_likelihood = foreground_log_likelihood
-        self.iterations = iterations
-        self.E0 = E0
-
-
-#--------------------
-#START SCRIPT
-#--------------------
-
-#initialize PWM collection
-PWM_score_array = []
-
-for fi in os.listdir(os.getcwd()):
-    filename = os.fsdecode(fi)
-
-    if ((not filename.startswith("ENCFF")) and filename.endswith("mer.dat") and filename.startswith(prefix)):
-
-        foreground_file = filename
-
-        #open files
-        f = open(foreground_file,'r')
-
-        while True:
-            # Get next PWM from file
-            line1 = f.readline() #skip first line
-            line1 = f.readline()
-            line2 = f.readline()
-            line3 = f.readline()
-            line4 = f.readline()
-            line5 = f.readline()
-
-            # if two lines are empty
-            # end of file is reached
-            if not (line1):
-                break
-
-            #extract words from lines
-            val1 = line1.split()
-            val2 = line2.split()
-            val3 = line3.split()
-            val4 = line4.split()
-            val5 = line5.split()
-
-            #skip free line after each PWM
-            line1 = f.readline()
-
-            Lstart = float(val5[1])
-
-            line1 = f.readline() #skip first line
-            line1 = f.readline()
-            line2 = f.readline()
-            line3 = f.readline()
-            line4 = f.readline()
-            line5 = f.readline()
-
-            # if two lines are empty
-            # end of file is reached
-            if not (line1):
-                break
-
-            #extract words from lines
-            val1 = line1.split()
-            val2 = line2.split()
-            val3 = line3.split()
-            val4 = line4.split()
-            val5 = line5.split()
-
-            #skip free line after each PWM
-            line1 = f.readline()
-
-            Lstop = float(val5[1])
-
-            if (Lstart < Lstop and float(val5[0]) < spec_threshhold):
-                Lw = len(val1) - 1
-                pPWM = [0.0 for j in range(0,4*Lw)]
-                for i in range(0,Lw):
-                    pPWM[0*Lw+i] = float(val1[i+1])
-                for i in range(0,Lw):
-                    pPWM[1*Lw+i] = float(val2[i+1])
-                for i in range(0,Lw):
-                    pPWM[2*Lw+i] = float(val3[i+1])
-                for i in range(0,Lw):
-                    pPWM[3*Lw+i] = float(val4[i+1])
-                tmp_foreground = Lstop #/ normalization_dict[Lw]
-                tmp_iterations = int(float(val5[2]))
-                tmp_E0 = float(val5[0])
-                tmp_obj = PWM_Score(PWM = pPWM, foreground_log_likelihood = tmp_foreground, iterations = tmp_iterations, E0 = tmp_E0)
-                PWM_score_array.append(tmp_obj)
-
-
-
-        f.close()
-
-#with open(filename[:-4]+"_ranked.dat","w") as f:
-with open("PWMs_ranked.dat","w") as f:
-    sys.stdout = f
-    #print ranked consensus PWMs
-    for p in sorted(PWM_score_array, key= lambda item: item.foreground_log_likelihood, reverse= True):
-        print("", end = "\t")
-        for i in range(0,int(len(p.PWM)/4)):
-            print((i+1), end = "\t\t")
-        print("")
-        for i in range(0,4):
-            print(inv_nucl_dict[i], end = "\t")
-            for j in range(0,int(len(p.PWM)/4)):
-                print(p.PWM[i*int(len(p.PWM)/4)+j], end = "\t")
-            print("")
-        print(p.E0, p.foreground_log_likelihood, p.iterations)
-        print("")
-    sys.stdout = original_stdout