diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile
index d804326f3ef12fd72cb07ed485add73acafcf701..d74b9588097051464cd5df317d3189d574d6ba02 100644
--- a/snakemake/process_data/Snakefile
+++ b/snakemake/process_data/Snakefile
@@ -1,21 +1,20 @@
 configfile: "config.yaml"
 #from snakemake.utils import listfiles

-localrules: create_output_and_log_directories, remove_multimappers, read_length_histogram, count_reads, determine_p_site_offset, finish
+localrules: create_output_and_log_directories, remove_multimappers, read_length_histogram, count_reads, determine_p_site_offset, filter_reads_based_on_read_lengths_and_offsets, bam_sort_and_index, finish

-#################################################################################
+################################################################################
 ### Finish rule
-#################################################################################
+################################################################################

 rule finish:
     input:
-        p_site_offsets = expand(os.path.join(config["output_dir"], "{sample}/p_site_offsets/alignment_offset.json"), sample=config["sample"]),
         pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
-        counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"])
-
-#################################################################################
+        counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]),
+        bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"])
+################################################################################
 ### Create output and log directories
-#################################################################################
+################################################################################

 rule create_output_and_log_directories:
     output:
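The new `bai` target in the `finish` rule is what pulls the whole A-site profile branch into the workflow DAG. For readers unfamiliar with Snakemake's `expand()`, a minimal sketch of how these target lists are generated (the sample names here are hypothetical):

    from snakemake.io import expand

    # expand() substitutes each value of the keyword argument into the pattern
    expand("output/{sample}/counts.tsv", sample=["WT_1", "KO_1"])
    # -> ['output/WT_1/counts.tsv', 'output/KO_1/counts.tsv']
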
@@ -32,9 +31,9 @@
         mkdir -p {output.sample_dir}; \
         touch {output.flag};"

-#################################################################################
+################################################################################
 ### Clipping reads
-#################################################################################
+################################################################################

 rule clip_reads:
     input:
@@ -63,9 +62,9 @@
         -i <(zcat {input.reads}) \
         -o {output.reads}) &> {log}"

-#################################################################################
+################################################################################
 ### Trimming reads
-#################################################################################
+################################################################################

 rule trim_reads:
     input:
@@ -93,9 +92,9 @@
         -i <(zcat {input.reads}) \
         -o {output.reads}) &> {log}"

-#################################################################################
+################################################################################
 ### Filtering reads
-#################################################################################
+################################################################################

 rule filter_reads:
     input:
@@ -123,9 +122,9 @@
         -i <(zcat {input.reads}) \
         -o {output.reads}) &> {log}"

-#################################################################################
+################################################################################
 ### Convert fastq to fasta
-#################################################################################
+################################################################################

 rule fastq_to_fasta:
     input:
@@ -149,9 +148,9 @@
         -i <(zcat {input.reads}) \
         -o {output.reads}) &> {log}"

-#################################################################################
+################################################################################
 ### Map reads to other genes (rRNA, tRNA, etc...)
-#################################################################################
+################################################################################

 rule map_to_other_genes:
     input:
@@ -181,9 +180,9 @@
         -o {output.sam} \
         -u {output.reads} ) &> {log}"

-#################################################################################
-### Map reads to other genes (rRNA, tRNA, etc...)
-#################################################################################
+################################################################################
+### Map reads to transcripts
+################################################################################

 rule map_to_transcripts:
     input:
@@ -322,5 +321,49 @@ rule determine_p_site_offset:
     --outdir {params.outdir}) &> {log}"

 ################################################################################
-### Filter reads based on selected read lengths
+### Filter reads based on selected read lengths and offsets
+### (create A-site profile)
+################################################################################
+
+rule filter_reads_based_on_read_lengths_and_offsets:
+    input:
+        bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"),
+        p_site_offsets = os.path.join(
+            config["output_dir"],
+            "{sample}/p_site_offsets/alignment_offset.json"
+        )
+    output:
+        bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.bam")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "filter_reads_based_on_read_lengths_and_offsets_{sample}.log")
+    log:
+        os.path.join(config["local_log"], "filter_reads_based_on_read_lengths_and_offsets_{sample}.log")
+    threads: 1
+    singularity:
+        "docker://fgypas/python_pysam:3.6.5_0.15.1"
+    shell:
+        "(python scripts/filter_reads_based_on_read_lengths_and_offsets.py \
+        --bam {input.bam} \
+        --p_site_offsets {input.p_site_offsets} \
+        --bam_out {output.bam}) &> {log}"
+
+################################################################################
+### Sort and index BAM (A-site profile)
 ################################################################################
+
+rule bam_sort_and_index:
+    input:
+        bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.bam")
+    output:
+        bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam"),
+        bai = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "bam_sort_and_index_{sample}.log")
+    log:
+        os.path.join(config["local_log"], "bam_sort_and_index_{sample}.log")
+    threads: 1
+    singularity:
+        "docker://zavolab/samtools:1.8"
+    shell:
+        "(samtools sort {input.bam} > {output.bam}; \
+        samtools index {output.bam}) &> {log}"
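The `bam_sort_and_index` rule runs samtools inside the `zavolab/samtools:1.8` container. For quick testing outside the container, the same two steps can be sketched with pysam's bundled samtools wrappers (file names are placeholders; assumes pysam is installed):

    import pysam

    # coordinate-sort, then index; mirrors the rule's shell command
    pysam.sort("-o", "a_site_profile.sorted.bam", "a_site_profile.bam")
    pysam.index("a_site_profile.sorted.bam")
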
diff --git a/snakemake/process_data/scripts/determine_p_site_offsets.py b/snakemake/process_data/scripts/determine_p_site_offsets.py
index 8e7979a8a8a6852d09afb9b4352053f4169de099..4fe2f04e61f43704009f567524d993bc40520706 100755
--- a/snakemake/process_data/scripts/determine_p_site_offsets.py
+++ b/snakemake/process_data/scripts/determine_p_site_offsets.py
@@ -195,7 +195,7 @@ def main():
     sys.stdout.write(str(alignment_offset) + os.linesep)

     with open(os.path.join(options.outdir, "alignment_offset.json"), 'w') as file:
-        file.write(json.dumps(alignment_offset))
+        json.dump(alignment_offset, file)

 # _____________________________________________________________________________
 # -----------------------------------------------------------------------------
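The switch to `json.dump` above pairs with the `int(k)` cast in the new script below: JSON object keys are always strings, so the read lengths written by `determine_p_site_offsets.py` come back as `"28"`, `"29"`, ... and must be cast back to integers on load. A small round-trip sketch with made-up offsets:

    import json

    offsets = {28: 12, 29: 12, 30: 13}  # read length -> P-site offset (hypothetical)
    with open("alignment_offset.json", "w") as fh:
        json.dump(offsets, fh)  # serialized as {"28": 12, "29": 12, "30": 13}

    with open("alignment_offset.json") as fh:
        loaded = {int(k): int(v) for k, v in json.load(fh).items()}
    assert loaded == offsets
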
sys.stdout.write("and parsing {} {}".format(options.bam, os.linesep)) + + # Open alignment file + bam = pysam.AlignmentFile(options.bam, "rb") + + bam_out = pysam.AlignmentFile(options.bam_out, "wb", header=bam.header) + + for read in bam.fetch(): + # keep only + strand + if read.is_reverse: + continue + read_len = len(str(read.seq)) + + if read_len in p_site_offsets: + + a_site_start = p_site_offsets[read_len] + 3 + + read.reference_start += a_site_start + read.cigar = [(0, 3)] + read.seq = read.seq[a_site_start: 3] + + bam_out.write(read) + + # Close alignment files + bam_out.close() + bam.close() + + + # alignment_offset = {} + # + # for key, value in readsDict.items(): + # best_offset = 0 + # best_offset_score = 0 + # for offset in possible_offsets: + # raw_counts = readsDict[key][offset][0:3] + # if sum(raw_counts) > 0: + # if ((sum(raw_counts) > 200) \ + # and (raw_counts[0] > raw_counts[1]) \ + # and (raw_counts[0] > raw_counts[2])): + # if ((raw_counts[0] > best_offset_score) \ + # and ((not options.filterReadsForPeriodicity) \ + # or (float(raw_counts[0])/sum(raw_counts) > 0.4))): + # best_offset = offset + # best_offset_score = raw_counts[0] + # if best_offset != 0: + # alignment_offset[key] = best_offset + # # self.filtered_alignments += (readsDict[rl]["reads"]) + # sys.stderr.write("PEAK CALL:" + \ + # str(key) + \ + # ": peak found at position " + \ + # str(best_offset) \ + # + os.linesep) + # else: + # sys.stderr.write("PEAK CALL:" + \ + # str(key) + \ + # ": peak not found" + \ + # os.linesep) + # + # sys.stdout.write(str(alignment_offset) + os.linesep) + # + # with open(os.path.join(options.outdir, "alignment_offset.json"), 'w') as file: + # file.write(json.dumps(alignment_offset)) + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0)