Skip to content
Snippets Groups Projects
Commit d9c69e2f authored by Niels Schlusser's avatar Niels Schlusser
Browse files

Included the Markov model, deleted obsolete scripts

parent 43a208c7
No related branches found
No related tags found
No related merge requests found
Showing
with 201 additions and 549 deletions
File moved
File moved
File moved
File moved
File moved
File moved
File moved
-----------------------------------------------
Niels Schlusser, July, 28th 2022
-----------------------------------------------
This directory contains the python code and bash scripts to turn *.fastq.gz files into the desired format for our EM optimization of binding sites. To this end, an the background frequencies need to be constructed from a Markov model of highest possible degree.
The code was ran on the following system configuration
- Ubuntu 20.04.1
- python 3.8.10
- Biopython 1.79
- Bash 5.0.7(1)
- cutadapt 3.4
The code is organized in two parts: The actual Markov model is run in a python script ("construct_background.py", not parallelized), whereas the overarching tasks, including the execution of the python script itself, are organized in the bash script "execute.sh".
Parameters need to be specified in the bash script "execute.sh", only:
- the degree of the Markov model. It should be as long as possible. In practice, a length of 14 has proven feasible with a decent RAM consumption (~16GB).
- the number of background files
- the (ENCODE-format) filenames (without *.fastq.gz extension) for foreground and background
For downloading and adapter trimming, execute the script with <./execute.sh dlct>. This will yield *.fastq files with trimmed adapters. After this, run <./execute.sh> for the file preparation using the Markov model.
import Bio
import sys
import math
from Bio import SeqIO
#declare input
background_files = []
foreground_files = []
b_kmer_len=0
#initialize input
if len(sys.argv) > 4:
b_kmer_len=int(sys.argv[1])
for ind in range(3,3+int(sys.argv[2])):
background_files.append(sys.argv[ind])
for ind in range(3+int(sys.argv[2]),len(sys.argv)):
foreground_files.append(sys.argv[ind])
else:
print("Insufficient input\n")
#define nucleic acid dictionary
nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 }
inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" }
#save original output channel
original_stdout = sys.stdout
#create array of background counts
background_frequency = [0.0 for i in range(0,pow(4,b_kmer_len))]
#define function which calculates the background frequency of a given read S
def construct_frequency(S):
ret = 0.0
for kmer_start in range(0, len(S)-b_kmer_len+1):
tmp_pos = 0
for kmer_rel_pos in range(0,b_kmer_len):
tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos)
ret += background_frequency[tmp_pos]
return ret
#construct background frequency array
normalization = 0
for data_file in background_files:
for read in SeqIO.parse(data_file+".fastq","fastq"):
s = read.seq
for kmer_start in range(0,len(s)-b_kmer_len+1):
tmp_cnt = 0
is_N = 0
for kmer_rel_pos in range(0,b_kmer_len):
if(s[kmer_start+kmer_rel_pos] != 'N'):
tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos)
else:
is_N = 1
#only use sequence if there is no N in it
if( is_N == 0 ):
background_frequency[tmp_cnt] += 1.0
normalization += 1
#normalize and print log of background likelihood of kmer
with open("background_frequencies_"+str(b_kmer_len)+"mer.dat","w") as f:
sys.stdout = f
for i in range(0,pow(4,b_kmer_len)):
background_frequency[i] = background_frequency[i]/normalization
s = ""
tmpI = i
for j in range(0,b_kmer_len):
tmpN = tmpI % 4
s = s + inv_nucl_dict[tmpN]
tmpI -= tmpN
tmpI /= 4
print(s,background_frequency[i])
sys.stdout = original_stdout
#create foreground sequence dictionaries, print in data format for RBAPBindnSeq
background_normalization = 0
number_kmers = 0
for data_file in foreground_files:
foreground_dict = {}
#create foreground dictionary
for seq_record in SeqIO.parse(data_file+".fastq","fastq"):
if seq_record.seq in foreground_dict:
foreground_dict[seq_record.seq] += 1
else:
foreground_dict[seq_record.seq] = 1
#print read files with background frequencies
with open(data_file+"_"+str(b_kmer_len)+"mer.dat","w") as f:
sys.stdout = f
for s in foreground_dict:
if('N' not in s):
tmp_bgr = construct_frequency(s)
if tmp_bgr > 0.0 and len(s) >= b_kmer_len:
background_normalization += foreground_dict[s]*math.log(tmp_bgr)
number_kmers += foreground_dict[s]*(len(s) - b_kmer_len + 1)
print(len(s),foreground_dict[s],tmp_bgr,s)
sys.stdout = original_stdout
print("background normalization for ", b_kmer_len, "mer: ", background_normalization, "\nNumber of possible ", b_kmer_len, "mers: ", number_kmers)
#!/bin/bash
Lw=14 #kmer-length
nbr_bfiles=2 #number of background files
#input file names
background_files=("ENCFF002DGO" "ENCFF002DGY")
foreground_files=("ENCFF002DGP" "ENCFF002DGQ" "ENCFF002DGR" "ENCFF002DGS" "ENCFF002DGT" "ENCFF002DGU" "ENCFF002DGV" "ENCFF002DGW" "ENCFF002DGX")
####################
####START SCRIPT####
####################
if [[ $1 == "dlct" ]]
then
#download files
for filename in ${background_files[@]}
do
curl -O -L https://www.encodeproject.org/files/${filename}/@@download/${filename}.fastq.gz
done
for filename in ${foreground_files[@]}
do
curl -O -L https://www.encodeproject.org/files/${filename}/@@download/${filename}.fastq.gz
done
#turn into fastq and cut adapter
module purge
ml cutadapt
for filename in ${background_files[@]}
do
cutadapt -j 4 -a TGGAATTCTCGGGTGTCAAGG -o ${filename}.fastq ${filename}.fastq.gz
done
for filename in ${foreground_files[@]}
do
cutadapt -j 4 -a TGGAATTCTCGGGTGTCAAGG -o ${filename}.fastq ${filename}.fastq.gz
done
else
#calculates background frequencies based on Markov model
module purge
ml Biopython
python construct_background.py ${Lw} ${nbr_bfiles} $(echo "${background_files[*]}") $(echo "${foreground_files[*]}")
for filename in ${foreground_files[@]}
do
awk 'BEGIN{max=0}{if($1>max) max=$1}END{print "'${filename}'",max}' ${filename}.fastq.gz
done
rm background_frequencies_${Lw}mer.dat
fi
for lett in 002DGP 002DGQ 002DGR 002DGS 002DGT 002DGU 002DGV 002DGW 002DGX
do
mv ENCFF${lett}_14mer.dat ../RBAPBindnSeq/ENCFF${lett}.dat
done
import Bio
import sys
import math
from Bio import SeqIO
#declare input
background_files = ["ENCFF002DGZ", "ENCFF002DHJ"]
eE0 = math.exp(2.848039)
PWM = [0.99377,0.002394,0.000393,0.000303,0.999999,0.000407,#
3.7e-05,9e-06,0.0,0.999696,0.0,0.001929,#
0.0,0.0,0.999607,0.0,0.0,3e-06,#
0.006192,0.997597,0.0,0.0,1e-06,0.997661]
L_w = len(PWM) // 4
#define nucleic acid dictionary
nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}
#save original output channel
original_stdout = sys.stdout
#create array of background counts
background_frequency = [0.0 for i in range(0,pow(4,L_w))]
#define function which calculates the background frequency of a given read S
def construct_frequency(S):
ret = 0.0
for kmer_start in range(0,len(S)-L_w+1):
tmp_pos = 0
for kmer_rel_pos in range(0,L_w):
tmp_pos += nucl_dict[ S[kmer_start+kmer_rel_pos] ] * pow(4,kmer_rel_pos)
ret += background_frequency[tmp_pos]
return ret
#function determines exp(E(S))
#position-specific binding likelihood
def eES(read):
ret = 0.0
for kmer_start in range(0,len(read) - L_w + 1):
aux = 1.0
for kmer_rel_pos in range(0,L_w):
aux *= PWM[nucl_dict[read[kmer_start+kmer_rel_pos]]*L_w + kmer_rel_pos]
if 'N' not in read[kmer_start:kmer_start+L_w]:
ret += aux
return ret
#-------------------------------
#START OF THE SCRIPT
#-------------------------------
#construct background frequency array
#normalization = 0
#for data_file in background_files:
#for read in SeqIO.parse(data_file+".fastq","fastq"):
#s = read.seq
#for kmer_start in range(0,len(s)-L_w+1):
#tmp_cnt = 0
#is_N = 0
#for kmer_rel_pos in range(0,L_w):
#if(s[kmer_start+kmer_rel_pos] != 'N'):
#tmp_cnt += nucl_dict[ s[kmer_start + kmer_rel_pos] ] * pow(4,kmer_rel_pos)
#else:
#is_N = 1
##only use sequence if there is no N in it
#if( is_N == 0 ):
#background_frequency[tmp_cnt] += 1.0
#normalization += 1
##normalize
#for i in range(0,pow(4,L_w)):
#background_frequency[i] = background_frequency[i]/normalization
##create background sequence dictionaries, print in data format for RBAPBindnSeq
#number_kmers = 0
#for data_file in background_files:
##print read files with background frequencies
#with open(data_file+".dat","w") as f:
#sys.stdout = f
#for seq_record in SeqIO.parse(data_file+".fastq","fastq"):
#if('N' not in seq_record.seq AND len(seq_record.seq) >= L_w):
#print(len(seq_record.seq),1,construct_frequency(seq_record.seq),seq_record.seq)
#sys.stdout = original_stdout
bgr_log_likelihood = 0.0
norm1 = 0
norm2 = 0.0
for data_file in background_files:
f = open(data_file+'.dat','r')
while True:
line = f.readline()
if not (line):
break
val = line.split()
peES = 0.0
if len(val) == 4:
peES = eES(val[3])
peES *= float(val[2])
peES += eE0*(int(val[0]) - L_w + 1)
if peES > 0.0:
bgr_log_likelihood += int(val[1])*math.log(peES)
norm1 += int(val[1])
norm2 += peES
print("Background normalization",bgr_log_likelihood-norm1*math.log(norm2))
#script returns similarity scores
#of PWMs
import math
import sys
#give input filenames
PWM_file_1 = "no_mask_HNRNPK_7mer_ranked.dat"
PWM_file_2 = "no_mask_HNRNPK_8mer_ranked.dat"
#define thresholds
spec_threshhold = 20.0
#auxiliary method
#computes the Euclidean distance
#for an alignment of rows
#ind1 in PWM1
#ind2 in PWM2
#if ind2 is out of range of PWM2
#assume equal distribution
def distance_row(ind1,PWM1,ind2,PWM2):
ret = 0.0
for ind in range(0,4):
if 0 <= ind2 < int(len(PWM2)/4):
ret += math.pow(PWM1[ind*int(len(PWM1)/4) + ind1] - PWM2[ind*int(len(PWM2)/4) + ind2],2)
else:
ret += math.pow(PWM1[ind*int(len(PWM1)/4) + ind1] - 0.25,2)
return ret
#auxiliary method
#computes maximum Euclidean
#similarity score
#len(PWM1) <= len(PWM2) is assumed
#assumes "half" mismatch for non-overlapping parts of PWM
def max_sim(PWM1,PWM2):
ret = float("inf")
ptr = 0
for offset in range(-int(len(PWM1)/4)+1,int(len(PWM2)/4)):
tmp = 0.0
for i in range(0,int(len(PWM1)/4)):
tmp += distance_row(i,PWM1,i+offset,PWM2)
if(ret > tmp/int(len(PWM1)/4)):
ret = tmp/int(len(PWM1)/4)
ptr = offset
return ret,ptr
#auxiliary method
#returns float list with next PWM
#extracted from in_file
def get_PWM(in_file):
while True:
line1 = in_file.readline() #skip first line
line1 = in_file.readline()
line2 = in_file.readline()
line3 = in_file.readline()
line4 = in_file.readline()
line5 = in_file.readline()
# if two lines are empty
# end of file is reached
if not (line1):
return None
#extract words from lines
val1 = line1.split()
val2 = line2.split()
val3 = line3.split()
val4 = line4.split()
val5 = line5.split()
#skip free line after each PWM
line1 = in_file.readline()
if (int(float(val5[3])) > 0 and float(val5[0]) < spec_threshhold):
Lw = len(val1) - 1
PWM = [0.0 for j in range(0,4*Lw)]
for i in range(0,Lw):
PWM[0*Lw+i] = float(val1[i+1])
for i in range(0,Lw):
PWM[1*Lw+i] = float(val2[i+1])
for i in range(0,Lw):
PWM[2*Lw+i] = float(val3[i+1])
for i in range(0,Lw):
PWM[3*Lw+i] = float(val4[i+1])
return PWM
return None
#define class of comparisons
#of two PWMs
class PWM_Comparison():
def __init__(self, index1, index2, distance, offset, PWM1, PWM2):
self.index1 = index1
self.index2 = index2
self.distance = distance
self.offset = offset
self.PWM1 = PWM1
self.PWM2 = PWM2
#-------------------------------
#START OF THE SCRIPT
#-------------------------------
f2 = open(PWM_file_2,'r')
ind2 = -1
while True:
pPWM2 = get_PWM(f2)
if pPWM2 is None:
break
else:
ind2 += 1
comparison_array = []
f1 = open(PWM_file_1,'r')
ind1 = -1
while True:
pPWM1 = get_PWM(f1)
if pPWM1 is None:
break
else:
ind1 += 1
di,of = max_sim(pPWM1,pPWM2)
tmp_comp = PWM_Comparison(index1 = ind1, index2 = ind2, distance = di, offset = of, PWM1 = pPWM1, PWM2 = pPWM2)
comparison_array.append(tmp_comp)
f1.close()
min_comp = min(comparison_array, key=lambda item: item.distance)
for i in range(0,4):
for j in range(0,int(len(min_comp.PWM1)/4)):
print(min_comp.PWM1[i*int(len(min_comp.PWM1)/4)+j], end = "\t")
print("")
print("")
for i in range(0,4):
for j in range(0,int(len(min_comp.PWM2)/4)):
print(min_comp.PWM2[i*int(len(min_comp.PWM2)/4)+j], end = "\t")
print("")
print(min_comp.index1,min_comp.index2,min_comp.distance,min_comp.offset)
print("")
f2.close()
for kmer_length in 4 5 6 7 8 9 10
do
for file in ENCFF002DHA.dat ENCFF002DHB.dat ENCFF002DHC.dat ENCFF002DHD.dat ENCFF002DHE.dat ENCFF002DHF.dat ENCFF002DHG.dat ENCFF002DHH.dat ENCFF002DHI.dat
do
awk 'BEGIN{sum=0}{sum+=$2*($1 - '${kmer_length}' + 1)}END{printf("%d\n",sum)}' ${file} >> tmp.dat
done
awk 'BEGIN{sum=0}{sum+=$1}END{printf("%d\t%d\n",'${kmer_length}',sum)}' tmp.dat
rm tmp.dat
done
import Bio
import sys
import math
from Bio import SeqIO
#declare input
#files = ["ENCFF002DHJ"]
files = ["ENCFF002DHA", "ENCFF002DHB", "ENCFF002DHC", "ENCFF002DHD", "ENCFF002DHE", "ENCFF002DHF", "ENCFF002DHG", "ENCFF002DHH", "ENCFF002DHI"]
sequence = "TGCATG"
reach=3
L_w=len(sequence)
#define nucleic acid dictionary
nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3}
inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T"}
#create arrays for nucleotide counts
upstream_counts = [0.0 for i in range(0,4*reach)]
downstream_counts = [0.0 for i in range(0,4*reach)]
#-------------------------------
#START OF THE SCRIPT
#-------------------------------
for data_file in files:
for read in SeqIO.parse(data_file+".fastq","fastq"):
s = read.seq
ind = s.find(sequence)
if ind == -1 or "N" in s:
continue
elif reach <= ind < len(s) - L_w - reach:
for r in range(0,reach):
upstream_counts[nucl_dict[s[ind-reach+r]]*reach+r] += 1.0
downstream_counts[nucl_dict[s[ind+L_w+r]]*reach+r] += 1.0
#normalize
for i in range(0,reach):
upstream_norm = 0.0
downstream_norm = 0.0
for j in range(0,4):
upstream_norm += upstream_counts[j*reach+i]
downstream_norm += downstream_counts[j*reach+i]
for j in range(0,4):
upstream_counts[j*reach+i] /= upstream_norm
downstream_counts[j*reach+i] /= downstream_norm
#print result
print("Sequence bias around "+sequence+":")
for j in range(0,4):
for i in range(0,reach):
print(upstream_counts[j*reach+i],end="\t")
for i in range(0,reach):
print(downstream_counts[j*reach+i],end="\t")
print("")
#!/bin/bash
#plots all PWMs from a given input file
filename=PWMs_ranked.dat
#split files
split -l 7 -d --additional-suffix=.dat ${filename} pmat
#safe number of digits for leading zeros
di="$(ls -dq pmat*.dat | wc -l)"
digits="${#di}"
i=0
for wdfile in ${PWD}/pmat*.dat;
do
((i++))
head -n 5 ${wdfile} > mat$(printf "%0${digits}d" ${i}).dat
done
rm pmat*.dat
#load library
echo "library(seqLogo)" >> next.plot.r
#load tables
i=0
for wdfile in ${PWD}/mat*.dat;
do
((i++))
echo 'm'$(printf "%0${digits}d" ${i})' <- read.table("'${wdfile}'", header=TRUE, stringsAsFactors=FALSE)' >> next.plot.r
echo 'p'$(printf "%0${digits}d" ${i})' <- makePWM(m'$(printf "%0${digits}d" ${i})')' >> next.plot.r
done
#export plots to PDF, once with info content scaling, once without
echo 'pdf("plots.pdf")' >> next.plot.r
i=0
for wdfile in ${PWD}/mat*.dat;
do
((i++))
echo 'seqLogo(p'$(printf "%0${digits}d" ${i})')' >> next.plot.r
done
echo 'dev.off()' >> next.plot.r
echo 'q()' >> next.plot.r
echo 'n' >> next.plot.r
Rscript next.plot.r
rm mat*.dat
rm next.plot.r
#script searches output file for PWMs
#subtracts the respective background probabilities
#prints a ranked list of accordingly ranked PWMs
import math
import os
import sys
#specify prefix for files
prefix = "data"
#define thresholds
spec_threshhold = 50.0
#give background values
normalization_dict = {4 : 6044491827, 5 : 5880088810, 6 : 5715685929, 7 : 5551283214}
#save original output channel
original_stdout = sys.stdout
#define nucleic acid dictionary
nucl_dict = {"A" : 0, "C" : 1, "G" : 2, "T" : 3 }
inv_nucl_dict = {0 : "A", 1 : "C", 2 : "G", 3 : "T" }
#define class of PWMs with score
class PWM_Score():
def __init__(self, PWM, foreground_log_likelihood, iterations, E0):
self.PWM = PWM
self.foreground_log_likelihood = foreground_log_likelihood
self.iterations = iterations
self.E0 = E0
#--------------------
#START SCRIPT
#--------------------
#initialize PWM collection
PWM_score_array = []
for fi in os.listdir(os.getcwd()):
filename = os.fsdecode(fi)
if ((not filename.startswith("ENCFF")) and filename.endswith("mer.dat") and filename.startswith(prefix)):
foreground_file = filename
#open files
f = open(foreground_file,'r')
while True:
# Get next PWM from file
line1 = f.readline() #skip first line
line1 = f.readline()
line2 = f.readline()
line3 = f.readline()
line4 = f.readline()
line5 = f.readline()
# if two lines are empty
# end of file is reached
if not (line1):
break
#extract words from lines
val1 = line1.split()
val2 = line2.split()
val3 = line3.split()
val4 = line4.split()
val5 = line5.split()
#skip free line after each PWM
line1 = f.readline()
Lstart = float(val5[1])
line1 = f.readline() #skip first line
line1 = f.readline()
line2 = f.readline()
line3 = f.readline()
line4 = f.readline()
line5 = f.readline()
# if two lines are empty
# end of file is reached
if not (line1):
break
#extract words from lines
val1 = line1.split()
val2 = line2.split()
val3 = line3.split()
val4 = line4.split()
val5 = line5.split()
#skip free line after each PWM
line1 = f.readline()
Lstop = float(val5[1])
if (Lstart < Lstop and float(val5[0]) < spec_threshhold):
Lw = len(val1) - 1
pPWM = [0.0 for j in range(0,4*Lw)]
for i in range(0,Lw):
pPWM[0*Lw+i] = float(val1[i+1])
for i in range(0,Lw):
pPWM[1*Lw+i] = float(val2[i+1])
for i in range(0,Lw):
pPWM[2*Lw+i] = float(val3[i+1])
for i in range(0,Lw):
pPWM[3*Lw+i] = float(val4[i+1])
tmp_foreground = Lstop #/ normalization_dict[Lw]
tmp_iterations = int(float(val5[2]))
tmp_E0 = float(val5[0])
tmp_obj = PWM_Score(PWM = pPWM, foreground_log_likelihood = tmp_foreground, iterations = tmp_iterations, E0 = tmp_E0)
PWM_score_array.append(tmp_obj)
f.close()
#with open(filename[:-4]+"_ranked.dat","w") as f:
with open("PWMs_ranked.dat","w") as f:
sys.stdout = f
#print ranked consensus PWMs
for p in sorted(PWM_score_array, key= lambda item: item.foreground_log_likelihood, reverse= True):
print("", end = "\t")
for i in range(0,int(len(p.PWM)/4)):
print((i+1), end = "\t\t")
print("")
for i in range(0,4):
print(inv_nucl_dict[i], end = "\t")
for j in range(0,int(len(p.PWM)/4)):
print(p.PWM[i*int(len(p.PWM)/4)+j], end = "\t")
print("")
print(p.E0, p.foreground_log_likelihood, p.iterations)
print("")
sys.stdout = original_stdout
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment