diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile
index 6805b3e3f2880b35375aa77601e414ffa08a1562..d804326f3ef12fd72cb07ed485add73acafcf701 100644
--- a/snakemake/process_data/Snakefile
+++ b/snakemake/process_data/Snakefile
@@ -1,7 +1,7 @@
 configfile: "config.yaml"
 
 #from snakemake.utils import listfiles
-localrules: create_output_and_log_directories, remove_multimappers, read_length_histogram, count_reads, sfinish
+localrules: create_output_and_log_directories, remove_multimappers, read_length_histogram, count_reads, determine_p_site_offset, finish
 
 #################################################################################
 ### Finish rule
@@ -9,6 +9,7 @@ localrules: create_output_and_log_directories, remove_multimappers, read_length_
 
 rule finish:
     input:
+        p_site_offsets = expand(os.path.join(config["output_dir"], "{sample}/p_site_offsets/alignment_offset.json"), sample=config["sample"]),
         pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
         counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"])
 
@@ -227,6 +228,28 @@ rule remove_multimappers:
     shell:
         "(grep -P \"^@|\tNH:i:1\t\" {input.sam} > {output.sam}) &> {log}"
 
+################################################################################
+### SAM to BAM, sort and index
+################################################################################
+
+rule sam2bam_sort_and_index:
+    input:
+        sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam")
+    output:
+        bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"),
+        bai = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam.bai")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "sam2bam_sort_and_index_{sample}.log")
+    log:
+        os.path.join(config["local_log"], "sam2bam_sort_and_index_{sample}.log")
+    threads: 1
+    singularity:
+        "docker://zavolab/samtools:1.8"
+    shell:
+        "(samtools view -bS {input.sam} \
+        | samtools sort - > {output.bam}; \
+        samtools index {output.bam};) &> {log}"
+
 ################################################################################
 ### Read length histogram
 ################################################################################
@@ -258,6 +281,8 @@ rule count_reads:
         transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"]
     output:
         counts = os.path.join(config["output_dir"], "{sample}/counts.tsv")
+    log:
+        os.path.join(config["local_log"], "count_reads_{sample}.log")
     threads: 1
     singularity:
         "docker://perl:5.24-slim"
@@ -266,3 +291,34 @@
         {input.sam} \
         {input.transcript_id_gene_id_CDS} \
-        > {output.counts})"
+        > {output.counts}) &> {log}"
+
+################################################################################
+### Determine P-site offset
+################################################################################
+
+rule determine_p_site_offset:
+    input:
+        bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"),
+        transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"]
+    output:
+        p_site_offsets = os.path.join(
+            config["output_dir"],
+            "{sample}/p_site_offsets/alignment_offset.json"
+        )
+    params:
+        outdir = os.path.join(config["output_dir"], "{sample}/p_site_offsets"),
+        cluster_log = os.path.join(config["cluster_log"], "determine_p_site_offset_{sample}.log")
+    log:
+        os.path.join(config["local_log"], "determine_p_site_offset_{sample}.log")
+    threads: 1
+    singularity:
+        "docker://fgypas/python_pysam:3.6.5_0.15.1"
+    shell:
+        "(python scripts/determine_p_site_offsets.py \
+        --bam {input.bam} \
+        --cds_coordinates {input.transcript_id_gene_id_CDS} \
+        --outdir {params.outdir}) &> {log}"
+
+################################################################################
+### Filter reads based on selected read lengths
+################################################################################
diff --git a/snakemake/process_data/cluster.json b/snakemake/process_data/cluster.json
index 25fe29fd44b9fcb486624fc8bbf4b712d48ef2ce..79460b66581f15141f995a41de93fd9fa2d9a852 100644
--- a/snakemake/process_data/cluster.json
+++ b/snakemake/process_data/cluster.json
@@ -19,5 +19,12 @@
 "time": "06:00:00",
 "threads":"8",
 "mem":"50G"
+},
+"sam2bam_sort_and_index":
+{
+"queue":"6hours",
+"time": "06:00:00",
+"threads":"1",
+"mem":"10G"
 }
 }
diff --git a/snakemake/process_data/scripts/determine_p_site_offsets.py b/snakemake/process_data/scripts/determine_p_site_offsets.py
new file mode 100755
index 0000000000000000000000000000000000000000..8e7979a8a8a6852d09afb9b4352053f4169de099
--- /dev/null
+++ b/snakemake/process_data/scripts/determine_p_site_offsets.py
@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# import needed (external) modules
+# -----------------------------------------------------------------------------
+
+from argparse import ArgumentParser, RawTextHelpFormatter
+import sys
+import os
+import pysam
+import json
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Main function
+# -----------------------------------------------------------------------------
+
+def main():
+    """ Main function """
+
+    __doc__ = "Determine P-site offset"
+    __version__ = "0.1"
+
+    parser = ArgumentParser(
+        description=__doc__,
+        formatter_class=RawTextHelpFormatter
+    )
+
+    parser.add_argument(
+        "--bam",
+        dest="bam",
+        help="Input bam file (sorted and indexed)",
+        required=True,
+        metavar="FILE"
+    )
+
+    parser.add_argument(
+        "--cds_coordinates",
+        dest="cds_coordinates",
+        help="CDS coordinates tsv table",
+        required=True,
+        metavar="FILE"
+    )
+
+    parser.add_argument(
+        "--filterReadsForPeriodicity",
+        dest="filterReadsForPeriodicity",
+        help="Filter reads for periodicity",
+        required=False,
+        default=False,
+        action='store_true'
+    )
+
+    parser.add_argument(
+        "--outdir",
+        dest="outdir",
+        help="Output directory",
+        required=True,
+        metavar="DIR"
+    )
+
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        default=False,
+        required=False,
+        help="Be verbose"
+    )
+
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=__version__
+    )
+
+    # _________________________________________________________________________
+    # -------------------------------------------------------------------------
+    # get the arguments
+    # -------------------------------------------------------------------------
+    try:
+        options = parser.parse_args()
+    except Exception:
+        parser.print_help()
+        sys.exit(1)
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    if options.verbose:
+        sys.stdout.write("Creating output directory: {} {}".format(options.outdir, os.linesep))
+
+    if not os.path.exists(options.outdir):
+        os.mkdir(options.outdir)
+
+    if options.verbose:
+        sys.stdout.write("Parsing {} {}".format(options.cds_coordinates, os.linesep))
+        sys.stdout.write("and parsing {} {}".format(options.bam, os.linesep))
+
+    # Open alignment file
+    bam = pysam.AlignmentFile(options.bam, "rb")
+
+    ### Calculating P-site offset
+    possible_offsets = [10, 11, 12, 13, 14, 15, 16]
+    readsDict = {}
+
+    # loop over CDS coordinates
+    with open(options.cds_coordinates) as cds_coordinates:
+        for line in cds_coordinates:
+            sp_line = line.strip().split("\t")
+
+            transcript_id = sp_line[0]
+            cds_start = int(sp_line[2]) - 1
+            cds_end = int(sp_line[3])
+
+            for read in bam.fetch(transcript_id,
+                                  cds_start,
+                                  cds_end):
+
+                # keep only + strand
+                if read.is_reverse:
+                    continue
+
+                read_len = len(str(read.seq))
+
+                # Create a dictionary to score the different offsets for the distinct read lengths
+                if read_len not in readsDict:
+                    readsDict[read_len] = {}
+                    for offset in possible_offsets:
+                        readsDict[read_len][offset] = [0, 0, 0, 0, 0]  # 1st, 2nd, 3rd, UTR, CDS
+
+                read_start = read.reference_start
+
+                for offset in possible_offsets:
+                    psite_pos = (read_start + offset)  # P-site
+                    asite_pos = psite_pos + 3  # A-site
+                    rel_pos_psite = psite_pos - cds_start
+                    rel_pos_asite = asite_pos - cds_start - 3
+
+                    if options.filterReadsForPeriodicity:
+                        # periodicity mode: no start-codon peak is assumed; score frame bias in the CDS body
+                        if rel_pos_asite < 0:
+                            readsDict[read_len][offset][3] += 1
+                        elif rel_pos_asite > (cds_end - cds_start):
+                            readsDict[read_len][offset][3] += 1
+                        else:
+                            if rel_pos_asite > 60:  # ignore first 20 codons
+                                frame_rel_pos = rel_pos_asite % 3
+                                readsDict[read_len][offset][frame_rel_pos] += 1
+                    else:
+                        # default mode: anchor on the peak at the start codon
+                        if rel_pos_psite < 0:
+                            readsDict[read_len][offset][3] += 1
+                        elif rel_pos_psite > 2:
+                            readsDict[read_len][offset][4] += 1
+                        elif rel_pos_psite in [0, 1, 2]:  # anchor at start codon
+                            readsDict[read_len][offset][rel_pos_psite] += 1
+
+    # Close alignment file
+    bam.close()
+
+    alignment_offset = {}
+
+    for key in readsDict:
+        best_offset = 0
+        best_offset_score = 0
+        for offset in possible_offsets:
+            raw_counts = readsDict[key][offset][0:3]
+            if sum(raw_counts) > 0:
+                if ((sum(raw_counts) > 200) \
+                and (raw_counts[0] > raw_counts[1]) \
+                and (raw_counts[0] > raw_counts[2])):
+                    if ((raw_counts[0] > best_offset_score) \
+                    and ((not options.filterReadsForPeriodicity) \
+                    or (float(raw_counts[0]) / sum(raw_counts) > 0.4))):
+                        best_offset = offset
+                        best_offset_score = raw_counts[0]
+        if best_offset != 0:
+            alignment_offset[key] = best_offset
+            sys.stderr.write("PEAK CALL:" + \
+                             str(key) + \
+                             ": peak found at position " + \
+                             str(best_offset) + \
+                             os.linesep)
+        else:
+            sys.stderr.write("PEAK CALL:" + \
+                             str(key) + \
+                             ": peak not found" + \
+                             os.linesep)
+
+    sys.stdout.write(str(alignment_offset) + os.linesep)
+
+    with open(os.path.join(options.outdir, "alignment_offset.json"), 'w') as file:
+        file.write(json.dumps(alignment_offset))
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Call the Main function and catch keyboard interrupts
+# -----------------------------------------------------------------------------
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!" + os.linesep)
+        sys.exit(0)
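For reference: the JSON written by determine_p_site_offsets.py maps each read length to its selected P-site offset, and json.dumps stringifies the integer read-length keys. Below is a minimal sketch of how a downstream step might apply these offsets to place read 5' ends on P-site coordinates, assuming the file layout produced by the rules above; the transcript id is a placeholder and this snippet is not part of the patch.

import json
import pysam

# Load the offsets; cast the stringified read-length keys back to int.
with open("p_site_offsets/alignment_offset.json") as handle:
    offsets = {int(k): v for k, v in json.load(handle).items()}

# For each plus-strand read whose length has a confident offset,
# shift the 5' end to obtain the transcript coordinate of the P-site.
bam = pysam.AlignmentFile("transcripts.mapped.unique.sorted.bam", "rb")
p_sites = []
for read in bam.fetch("ENST00000371588"):  # placeholder transcript id
    if read.is_reverse:
        continue
    offset = offsets.get(read.query_length)
    if offset is None:
        continue  # no peak was called for this read length
    p_sites.append(read.reference_start + offset)
bam.close()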