Addition of scripts that determines the p-site offset based on different read lengths

fc115178 · BIOPZ-Gypas Foivos · 2b1aaec6 · fc115178 · fc115178 · fc115178
Commit fc115178 authored 6 years ago by BIOPZ-Gypas Foivos
--- a/snakemake/process_data/Snakefile
+++ b/snakemake/process_data/Snakefile
 configfile: "config.yaml"
 #from snakemake.utils import listfiles
-localrules: create_output_and_log_directories, remove_multimappers, read_length_histogram, count_reads, sfinish
+localrules: create_output_and_log_directories, remove_multimappers, read_length_histogram, count_reads, determine_p_site_offset, finish
 #################################################################################
 ### Finish rule
@@ -9,6 +9,7 @@ localrules: create_output_and_log_directories, remove_multimappers, read_length_
 rule finish:
 	input:
+		p_site_offsets = expand(os.path.join(config["output_dir"], "{sample}/p_site_offsets/alignment_offset.json"), sample=config["sample"]),
 		pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
 		counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"])
@@ -227,6 +228,28 @@ rule remove_multimappers:
 	shell:
 		"(grep -P \"^@|\tNH:i:1\t\" {input.sam} > {output.sam}) &> {log}"
+################################################################################
+### SAM to BAM sort and index
+################################################################################
+rule sam2bam_sort_and_index:
+	input:
+		sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.sam")
+	output:
+		bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"),
+		bai = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam.bai")
+	params:
+		cluster_log = os.path.join(config["cluster_log"], "sam2bam_sort_and_index_{sample}.log")
+	log:
+		os.path.join(config["local_log"], "sam2bam_sort_and_index_{sample}.log")
+	threads:	1
+	singularity:
+		"docker://zavolab/samtools:1.8"
+	shell:
+		"(samtools view -bS {input.sam} \
+		 | samtools sort - > {output.bam}; \
+		 samtools index {output.bam};) &> {log}"
 ################################################################################
 ### Read length histogram
 ################################################################################
@@ -258,6 +281,8 @@ rule count_reads:
 		transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"]
 	output:
 		counts = os.path.join(config["output_dir"], "{sample}/counts.tsv")
+	log:
+		os.path.join(config["local_log"], "count_reads_{sample}.log")
 	threads:	1
 	singularity:
 		"docker://perl:5.24-slim"
@@ -266,3 +291,36 @@ rule count_reads:
 		{input.sam} \
 		{input.transcript_id_gene_id_CDS} \
 		> {output.counts})"
+################################################################################
+### Determine P-site offset
+################################################################################
+rule determine_p_site_offset:
+	input:
+		bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"),
+		transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"]
+	output:
+		p_site_offsets = os.path.join(
+			config["output_dir"],
+			"{sample}/p_site_offsets/alignment_offset.json"
+		),
+		p_site_offset = os.path.join(config["output_dir"],
+									 "{sample}/p_site_offsets")
+	params:
+		outdir = os.path.join(config["output_dir"], "{sample}/p_site_offsets"),
+		cluster_log = os.path.join(config["cluster_log"], "determine_p_site_offset_{sample}.log")
+	log:
+		os.path.join(config["local_log"], "determine_p_site_offset_{sample}.log")
+	threads:	1
+	singularity:
+		"docker://fgypas/python_pysam:3.6.5_0.15.1"
+	shell:
+		"(python scripts/determine_p_site_offsets.py \
+		--bam {input.bam} \
+		--cds_coordinates {input.transcript_id_gene_id_CDS} \
+		--outdir {params.outdir}) &> {log}"
+################################################################################
+### Filter reads based on selected read lengths
+################################################################################
--- a/snakemake/process_data/cluster.json
+++ b/snakemake/process_data/cluster.json
@@ -19,5 +19,12 @@
 "time": "06:00:00",
 "threads":"8",
 "mem":"50G"
+},
+"sam2bam_sort_and_index":
+{
+"queue":"6hours",
+"time": "06:00:00",
+"threads":"1",
+"mem":"10G"
 }
 }
--- a/snakemake/process_data/scripts/determine_p_site_offsets.py
+++ b/snakemake/process_data/scripts/determine_p_site_offsets.py
+#!/usr/bin/env python
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# import needed (external) modules
+# -----------------------------------------------------------------------------
+from argparse import ArgumentParser, RawTextHelpFormatter
+import itertools
+import sys
+import os
+import pysam
+import json
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Main function
+# -----------------------------------------------------------------------------
+def main():
+    """ Main function """
+    __doc__ = "Determine p-site offset"
+    __version__ = "0.1"
+    parser = ArgumentParser(
+        description=__doc__,
+        formatter_class=RawTextHelpFormatter
+    )
+    parser.add_argument(
+        "--bam",
+        dest="bam",
+        help="Input bam file (sorted and indexed)",
+        required=True,
+        metavar="FILE"
+    )
+    parser.add_argument(
+        "--cds_coordinates",
+        dest="cds_coordinates",
+        help="CDS coordinates tsv table",
+        required=True,
+        metavar="FILE"
+    )
+    parser.add_argument(
+        "--filterReadsForPeriodicity",
+        dest="filterReadsForPeriodicity",
+        help="Filter reads for periodicity",
+        required=False,
+        default=False,
+        action='store_true'
+    )
+    parser.add_argument(
+        "--outdir",
+        dest="outdir",
+        help="Output directory",
+        required=True,
+        metavar="FILE"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        default=False,
+        required=False,
+        help="Be verbose"
+    )
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=__version__
+    )
+    # _________________________________________________________________________
+    # -------------------------------------------------------------------------
+    # get the arguments
+    # -------------------------------------------------------------------------
+    try:
+        options = parser.parse_args()
+    except Exception:
+        parser.print_help()
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+    if options.verbose:
+        sys.stdout.write("Creating output directory: {} {}".format(options.outdir, os.linesep))
+    if not os.path.exists(options.outdir):
+        os.mkdir(options.outdir)
+    if options.verbose:
+        sys.stdout.write("Parsing {} {}".format(options.cds_coordinates, os.linesep))
+        sys.stdout.write("and parsing {} {}".format(options.bam, os.linesep))
+    # Open alignment file
+    bam = pysam.AlignmentFile(options.bam, "rb")
+    ### Calculating P-site offset
+    possible_offsets = [10, 11, 12, 13, 14, 15, 16]
+    readsDict = {}
+    # loop over CDS coordinates
+    with open(options.cds_coordinates) as cds_coordinates:
+        for line in cds_coordinates:
+            sp_line = line.strip().split("\t")
+            transcript_id = sp_line[0]
+            cds_start = int(sp_line[2]) - 1
+            cds_end = int(sp_line[3])
+            for read in bam.fetch(transcript_id,
+                                  cds_start,
+                                  cds_end):
+                # keep only + strand
+                if read.is_reverse:
+                    continue
+                read_len = len(str(read.seq))
+                # Create a dictionary to score the different offsets for the distinct read lenghts
+                if read_len not in readsDict.keys():
+                    readsDict[read_len] = {}
+                    for offset in possible_offsets:
+                        readsDict[read_len][offset] = [0, 0, 0, 0, 0] #1st,2nd,3rd,UTR,CDS
+                        readsDict[read_len]["reads"] = []
+                read_start = read.reference_start
+                for offset in possible_offsets:
+                    psite_pos = (read_start + offset) #P-site
+                    asite_pos = psite_pos + 3         #A-site
+                    rel_pos_psite = psite_pos - cds_start
+                    rel_pos_asite = asite_pos - cds_start - 3
+                    if options.filterReadsForPeriodicity:
+                        #There's NO peak at start codon
+                        if rel_pos_asite < 0:
+                            readsDict[read_len][offset][3] += 1
+                        elif rel_pos_asite > (cds_end - cds_start):
+                            readsDict[read_len][offset][3] += 1
+                        else:
+                            if rel_pos_asite > 60: #ignore first 20 codons
+                                frame_rel_pos = rel_pos_asite % 3
+                                readsDict[read_len][offset][frame_rel_pos] += 1
+                    else:
+                        #There's a peak at start codon
+                        if rel_pos_psite < 0:
+                            readsDict[read_len][offset][3] += 1
+                        elif rel_pos_psite > 2:
+                            readsDict[read_len][offset][4] += 1
+                        elif rel_pos_psite in [0, 1, 2]: #anchor at start codon
+                            readsDict[read_len][offset][rel_pos_psite] += 1
+    # Close alignment file
+    bam.close()
+    alignment_offset = {}
+    for key, value in readsDict.items():
+        best_offset = 0
+        best_offset_score = 0
+        for offset in possible_offsets:
+            raw_counts = readsDict[key][offset][0:3]
+            if sum(raw_counts) > 0:
+                if ((sum(raw_counts) > 200) \
+                and (raw_counts[0] > raw_counts[1]) \
+                and (raw_counts[0] > raw_counts[2])):
+                    if ((raw_counts[0] > best_offset_score) \
+                    and ((not options.filterReadsForPeriodicity) \
+                    or (float(raw_counts[0])/sum(raw_counts) > 0.4))):
+                        best_offset = offset
+                        best_offset_score = raw_counts[0]
+        if best_offset != 0:
+            alignment_offset[key] = best_offset
+            # self.filtered_alignments += (readsDict[rl]["reads"])
+            sys.stderr.write("PEAK CALL:" + \
+                             str(key) + \
+                             ": peak found at position " + \
+                             str(best_offset) \
+                             + os.linesep)
+        else:
+            sys.stderr.write("PEAK CALL:" + \
+                             str(key) + \
+                             ": peak not found" + \
+                             os.linesep)
+    sys.stdout.write(str(alignment_offset) +  os.linesep)
+    with open(os.path.join(options.outdir, "alignment_offset.json"), 'w') as file:
+        file.write(json.dumps(alignment_offset))
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Call the Main function and catch Keyboard interrups
+# -----------------------------------------------------------------------------
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!" + os.linesep)
+        sys.exit(0)