Commit 90c6ef59 authored by BIOPZ-Schmidt Ralf

Fix bug in merging samples after background filtering; fix bug in rs-bam2bed.py

parent 1342c0d7
@@ -788,42 +788,15 @@ rule create_noBG_3pSites_table:
        filtered = expand( config["results_dir"] + "/filteredSites/{sample}.filtered.tsv", sample=config['samples']),
        raw_table = config["results_dir"] + "/3pSites.PAS.tsv.gz"
    output:
-        temp(config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz")
+        table_adjusted = temp(config["results_dir"] + "/3pSites.PAS.filtered.tsv.gz")
    params:
        cluster_log = config["log_dir"] + "/cluster_logs/create_noBG_3pSites_table.log"
    resources:
        mem = 10
    log:
        config["log_dir"] + "/create_noBG_3pSites_table.log"
-    run:
-        import gzip
-        sites = []
-        reads = {}
-        header_lines = []
-        with gzip.open(input.raw_table, "rt") as infile:
-            for line in infile:
-                if line.startswith("#"):
-                    header_lines.append(line)
-                    continue
-                F = line.rstrip().split("\t")
-                site_id = ":".join(F[0:3])
-                sites.append(site_id)
-                reads[site_id] = [F[0], F[1], F[2], F[-2], F[-1]]
-        for in_f in input.filtered:
-            with open(in_f, "r") as ifile:
-                for line in ifile:
-                    line_list = line.rstrip().split("\t")
-                    curr_id = ":".join(line_list[0:3])
-                    reads[curr_id].insert(-2, line_list[3])
-        with gzip.open(output[0], "wt") as out_file:
-            for h in header_lines:
-                out_file.write("%s" % h)
-            for s in sites:
-                out_file.write("%s\n" % "\t".join( reads[s] ) )
+    script:
+        "scripts/merge-sample-bg-files.py"
#-------------------------------------------------------------------------------
# delete 3' end sites without cutoff-corrected read support from any sample
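The removed run: block merged the per-sample filtered counts with insert(-2, ...), which places each sample's column in whatever order input.filtered happens to be iterated, not in the column order declared in the table header; the new helper script fixes this with an explicit sample-to-column map. A minimal sketch of the mis-ordering (made-up values; the row layout mirrors the rule above):

# insert(-2, ...) orders sample columns by iteration order,
# not by the sample-to-column assignment in the table header
row = ["chr1", "1000", "+", "pasA", "0.9"]  # site fields + two trailing columns
for count in ["7", "3"]:  # e.g. sample2's count arrives before sample1's
    row.insert(-2, count)
print(row)  # ['chr1', '1000', '+', '7', '3', 'pasA', '0.9']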
scripts/merge-sample-bg-files.py (new file)
import gzip

col_map = {}       # sample name -> column index in the 3' end sites table
sites = []         # site ids in input order
reads = {}         # site id -> output row
header_lines = []

# Get info from the original 3pSites file
with gzip.open(snakemake.input.raw_table, "rt") as infile:
    for line in infile:
        # Header lines assign a table column to each sample
        if line.startswith("#"):
            fields = line.rstrip().split(";")
            col = int(fields[0].lstrip("#"))
            sample = fields[1].split("/")[-1].split(".")[0]
            # map the sample to its column
            col_map[sample] = col
            header_lines.append(line)
            continue
        F = line.rstrip().split("\t")
        site_id = ":".join(F[0:3])
        sites.append(site_id)
        # For each site, store a row with enough slots to accommodate all
        # samples between the site fields and the two trailing columns
        reads[site_id] = [F[0], F[1], F[2]] + [None] * len(col_map) + [F[-2], F[-1]]

# Fill in each sample's filtered counts at the column assigned in the header
for in_f in snakemake.input.filtered:
    # Infer the sample id from the file name
    sample = in_f.rstrip().split("/")[-1].split(".")[0]
    with open(in_f, "r") as ifile:
        for line in ifile:
            line_list = line.rstrip().split("\t")
            curr_id = ":".join(line_list[0:3])
            reads[curr_id][col_map[sample]] = line_list[3]

with gzip.open(snakemake.output.table_adjusted, "wt") as out_file:
    for h in header_lines:
        out_file.write("%s" % h)
    for s in sites:
        out_file.write("%s\n" % "\t".join(reads[s]))
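For reference, a minimal sketch of the header convention the script relies on: each header line is assumed to have the form "#<column>;<path/to/sample>" (the sample path below is hypothetical), from which column index and sample name are derived exactly as above:

header = "#3;samples/WT_rep1.filtered.bam"  # hypothetical header line
fields = header.rstrip().split(";")
col = int(fields[0].lstrip("#"))                 # -> 3
sample = fields[1].split("/")[-1].split(".")[0]  # -> "WT_rep1"

Note that the rows are pre-filled with None placeholders, so the script assumes every filtered file reports a count for each site; a site missing from one sample would leave a None behind and make the final "\t".join() fail.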
@@ -163,6 +163,9 @@ def generate_bed_line(sam):
number = int(exploded_MID[i*2])
type = exploded_MID[i*2 +1]
+# no need to process type "N" at this point, because the genomic
+# sequence currently consists only of "?" characters anyway
if type == "M":
# read and reference were aligned
for _ in itertools.repeat(None, number):
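exploded_MID is presumably the alignment's CIGAR string split into alternating lengths and operation codes; a minimal sketch of producing such a list, assuming standard SAM CIGAR operators (the variable name reuse and the example string are assumptions, not taken from the script):

import re

cigar = "10M2N5M"  # hypothetical CIGAR string
exploded_MID = [t for t in re.split(r"([MIDNSHP=X])", cigar) if t]
# consumed pairwise as (length, op): [(10, 'M'), (2, 'N'), (5, 'M')]
pairs = [(int(exploded_MID[i * 2]), exploded_MID[i * 2 + 1])
         for i in range(len(exploded_MID) // 2)]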
@@ -196,7 +199,7 @@ def generate_bed_line(sam):
if c == '':
continue # Skip c and this iteration when this happens
-# Skip the next G[pointer] that is not a minus strand
+# Skip the next G[pointer] that is not a dash
if c.isdigit():
# perfect match here
value = int(c)
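The empty-string check and the digit branch look like iteration over the result of an re.split() on an MD-style mismatch string, where a run of digits encodes a perfectly matching stretch; a sketch under that assumption (the MD value and the loop framing are hypothetical, since the surrounding code is not fully shown):

import re

md = "10A5^AC6"  # hypothetical MD-style string
for c in re.split(r"(\d+)", md):
    if c == '':
        continue  # re.split yields empty strings at the boundaries
    if c.isdigit():
        value = int(c)  # length of a perfectly matching stretch
    # non-digit tokens would mark mismatched or deleted reference bases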