"""Merge per-sample background-filtered 3' end site counts into one table.

Executed by Snakemake via the ``script:`` directive of rule
``create_noBG_3pSites_table``.  The per-sample column layout of the
original gzipped 3pSites table is recovered from its ``#`` header lines
("#<column>;<path/to/sample.file>"), and each sample's cutoff-corrected
counts are written back into the matching column, so the result no
longer depends on the order of the filtered input files.
"""

import gzip


def _parse_raw_table(raw_table):
    """Read the gzipped 3pSites table.

    Returns ``(header_lines, sites, reads, sample_to_col)`` where
    ``sites`` preserves the row order of the input, ``reads`` maps a
    ``chrom:pos:strand`` id to a partially filled output row (one
    ``None`` slot per sample until the filtered values are merged in),
    and ``sample_to_col`` maps a sample name to its column index.
    """
    sample_to_col = {}  # renamed from `map`, which shadowed the builtin
    header_lines = []
    sites = []
    reads = {}
    with gzip.open(raw_table, "rt") as infile:
        for line in infile:
            if line.startswith("#"):
                # Header format: "#<column>;<path>"; the sample name is
                # the file basename up to the first dot.
                col_field, path_field = line.rstrip().split(";")[:2]
                column = int(col_field.lstrip("#"))
                sample = path_field.split("/")[-1].split(".")[0]
                sample_to_col[sample] = column
                header_lines.append(line)
                continue
            fields = line.rstrip().split("\t")
            site_id = ":".join(fields[0:3])
            sites.append(site_id)
            # coords + one yet-unknown slot per sample + the two
            # trailing annotation columns of the raw table
            reads[site_id] = (
                fields[0:3] + [None] * len(sample_to_col) + fields[-2:]
            )
    return header_lines, sites, reads, sample_to_col


def _merge_filtered(filtered_files, reads, sample_to_col):
    """Fill each site's per-sample slot from that sample's filtered file."""
    for path in filtered_files:
        # Sample name is the basename up to the first dot,
        # e.g. ".../sampleX.filtered.tsv" -> "sampleX".
        sample = path.rstrip().split("/")[-1].split(".")[0]
        column = sample_to_col[sample]
        with open(path, "r") as ifile:
            for line in ifile:
                fields = line.rstrip().split("\t")
                site_id = ":".join(fields[0:3])
                reads[site_id][column] = fields[3]


def _write_table(out_path, header_lines, sites, reads):
    """Write the merged, gzipped output table: headers first, then rows.

    Raises ``ValueError`` when a site is missing a value for some sample
    (previously this surfaced only as a cryptic ``TypeError`` from
    ``str.join`` on a ``None`` slot).
    """
    with gzip.open(out_path, "wt") as out_file:
        for header in header_lines:
            out_file.write(header)
        for site_id in sites:
            row = reads[site_id]
            if None in row:
                raise ValueError(
                    "site %s is missing a filtered value for at least "
                    "one sample" % site_id
                )
            out_file.write("%s\n" % "\t".join(row))


def merge_bg_files(raw_table, filtered_files, out_path):
    """Merge per-sample filtered counts into the raw 3pSites table.

    raw_table      -- gzipped 3pSites.PAS table with per-sample headers
    filtered_files -- per-sample TSVs (chrom, pos, strand, value)
    out_path       -- gzipped output table to create
    """
    header_lines, sites, reads, sample_to_col = _parse_raw_table(raw_table)
    _merge_filtered(filtered_files, reads, sample_to_col)
    _write_table(out_path, header_lines, sites, reads)


try:
    # ``snakemake`` is injected into the namespace when this file is run
    # through a Snakemake ``script:`` directive; the guard lets the
    # module also be imported (e.g. for testing) without it.
    _snakemake = snakemake
except NameError:
    _snakemake = None

if _snakemake is not None:
    merge_bg_files(
        _snakemake.input.raw_table,
        _snakemake.input.filtered,
        _snakemake.output.table_adjusted,
    )