From 5dc673b57d44deb160c8c675e1e178a782864cd1 Mon Sep 17 00:00:00 2001
From: BIOPZ-Gypas Foivos <foivos.gypas@unibas.ch>
Date: Tue, 29 Jan 2019 16:01:33 +0100
Subject: [PATCH] Many changes: count of oligos, identification of
 overrepresented sequences, trimming of bases in the beginning in case of the
 smarter-kit protocol.

---
 .gitignore                                    |  10 ++
 snakemake/prepare_annotation/config.yaml      |   6 +-
 snakemake/process_data/Snakefile              |  64 ++++++++-
 snakemake/process_data/config.yaml            |  13 +-
 .../scripts/find_overepresented_sequences.py  | 121 ++++++++++++++++++
 snakemake/process_data/scripts/scanAdapter.sh |  24 ++++
 6 files changed, 227 insertions(+), 11 deletions(-)
 create mode 100755 snakemake/process_data/scripts/find_overepresented_sequences.py
 create mode 100755 snakemake/process_data/scripts/scanAdapter.sh

diff --git a/.gitignore b/.gitignore
index d425535..3edaa91 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,13 @@
 snakemake/annotation/
 .DS_Store
 ._.DS_Store
+snakemake/prepare_annotation/.snakemake/
+snakemake/prepare_annotation/logs/
+snakemake/prepare_annotation/results/
+snakemake/prepare_annotation/nohup.out
+snakemake/process_data/._dag.png
+snakemake/process_data/.snakemake/
+snakemake/process_data/dag.png
+snakemake/process_data/logs/
+snakemake/process_data/nohup.out
+snakemake/process_data/results/
diff --git a/snakemake/prepare_annotation/config.yaml b/snakemake/prepare_annotation/config.yaml
index 02a2810..854ad37 100644
--- a/snakemake/prepare_annotation/config.yaml
+++ b/snakemake/prepare_annotation/config.yaml
@@ -2,9 +2,9 @@
   ##############################################################################
   ### Annotation
   ##############################################################################
-  genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
-  gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf"
-  other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
+  genome: "../annotation/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa"
+  gtf: "../annotation/Mus_musculus.GRCm38.90.chr.gtf"
+  other_RNAs_sequence: "../annotation/mm10_rrnas.fa"
   ##############################################################################
   ### Output and log directory
   ##############################################################################
diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile
index ba16c62..b660645 100644
--- a/snakemake/process_data/Snakefile
+++ b/snakemake/process_data/Snakefile
@@ -11,8 +11,46 @@ rule finish:
 	input:
 		pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
 		counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]),
-		bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"])
+		bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]),
+		oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]),
+		overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"])
 
+################################################################################
+### Count oligos
+################################################################################
+rule count_oligos:
+	input:
+		reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
+		oligos = config["oligos"],
+		script = "scripts/scanAdapter.sh"
+	output:
+		oligos_counts = os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt")
+	log:
+		os.path.join(config["local_log"], "count_oligos_{sample}.log")
+	shell:
+		"({input.script} {input.reads} {input.oligos} > {output.oligos_counts}) &> {log}"
+
+#################################################################################
+### Trim first bases (For Smarter-Kit ONLY)
+#################################################################################
+
+rule trim_first_bases:
+	input:
+		reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"])
+	output:
+		reads = os.path.join(config["output_dir"], "{sample}/trimmed_first_bases.fastq.gz"),
+	params:
+		cut = lambda wildcards: config[wildcards.sample]['trim_bases_5_prime'],
+		minimum_length = "20"
+	log:
+		os.path.join(config["local_log"], "trim_first_bases_{sample}.log")
+	singularity:
+		"docker://zavolab/cutadapt:1.16"
+	shell:
+		"(cutadapt \
+		--cut {params.cut} \
+		--minimum-length {params.minimum_length} \
+		{input.reads} | gzip > {output.reads}) &> {log}"
 
 ################################################################################
 ### Clipping reads
@@ -20,13 +58,14 @@ rule finish:
 
 rule clip_reads:
 	input:
-		reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"])
+		reads = os.path.join(config["output_dir"], "{sample}/trimmed_first_bases.fastq.gz"),
 	output:
 		reads = os.path.join(config["output_dir"], "{sample}", "pro.clipped.fastq.gz")
 	params:
 		v = "-v",
 		n = "-n",
 		l = "20",
+		c = "-c",
 		adapter = lambda wildcards: config[wildcards.sample]['adapter'],
 		z = "-z"
 	log:
@@ -38,6 +77,7 @@ rule clip_reads:
 		{params.v} \
 		{params.n} \
 		-l {params.l} \
+		{params.c} \
 		{params.z} \
 		-a {params.adapter} \
 		-i <(zcat {input.reads}) \
@@ -157,6 +197,26 @@ rule map_to_other_genes:
 		-o {output.sam} \
 		-u {output.reads} ) &> {log}"
 
+################################################################################
+### Count overrepresented sequences from other rRNAs (e.g. rRNA)
+################################################################################
+
+rule count_overrepresented_sequences_other:
+	input:
+		sam = os.path.join(config["output_dir"], "{sample}/other_genes.mapped.sam"),
+		script = "scripts/find_overepresented_sequences.py"
+	output:
+		overrepresented_sequences_counts = os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt")
+	log:
+		os.path.join(config["local_log"], "count_overrepresented_sequences_other_{sample}.log")
+	singularity:
+		"docker://fgypas/python_pysam:3.6.5_0.15.1"
+	shell:
+		"(python {input.script} \
+		--sam {input.sam} \
+		--out {output.overrepresented_sequences_counts}) &> {log}"
+		# "(grep -P -v \"^@\" {input.sam} | cut -f 10 | sort | uniq -c | sort -n -r > {output.overrepresented_sequences_counts}) &> {log}"
+
 ################################################################################
 ### Map reads to other genes (rRNA, tRNA, etc...)
 ################################################################################
diff --git a/snakemake/process_data/config.yaml b/snakemake/process_data/config.yaml
index 25adcf2..4635bd2 100644
--- a/snakemake/process_data/config.yaml
+++ b/snakemake/process_data/config.yaml
@@ -2,11 +2,12 @@
   ##############################################################################
   ### Annotation
   ##############################################################################
-  other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
+  other_RNAs_sequence: "../annotation/mm10_rrnas.fa"
   other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx"
   transcripts_sequence: "../prepare_annotation/results/longest_pc_transcript_per_gene.fa"
   transcripts_index: "../prepare_annotation/results/longest_pc_transcript_per_gene.idx"
   transcript_id_gene_id_CDS: "../prepare_annotation/results/transcript_id_gene_id_CDS.tsv"
+  oligos: "../annotation/oligos.txt"
   ##############################################################################
   ### Output and log directory
   ##############################################################################
@@ -18,9 +19,9 @@
   ##############################################################################
   input_dir: "samples"
   input_reads_pattern: ".fastq.gz"
-  sample: ["example", "example2", "SRR1536304", "SRR1536305"]
-  example: {adapter: GATCGGAAGAGCACA, minimum_quality: 20, quality_type: 33}
-  example2: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 64}
-  SRR1536304: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 33}
-  SRR1536305: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 33}
+  sample: ["BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1", "BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1", "BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1", "BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1"]
+  BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
+  BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
+  BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
+  BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
 ...
diff --git a/snakemake/process_data/scripts/find_overepresented_sequences.py b/snakemake/process_data/scripts/find_overepresented_sequences.py
new file mode 100755
index 0000000..a268002
--- /dev/null
+++ b/snakemake/process_data/scripts/find_overepresented_sequences.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# import needed (external) modules
+# -----------------------------------------------------------------------------
+
+from argparse import ArgumentParser, RawTextHelpFormatter
+import sys
+import os
+import pysam
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Main function
+# -----------------------------------------------------------------------------
+
+def main():
+    """ Main function """
+
+    __doc__ = "Find overepresented sequences from a SAM file."
+    __version__ = "0.1"
+
+    parser = ArgumentParser(
+        description=__doc__,
+        formatter_class=RawTextHelpFormatter
+    )
+
+    parser.add_argument(
+        "--sam",
+        dest="sam",
+        help="Input bam file (sorted and indexed)",
+        required=True,
+        metavar="FILE"
+    )
+
+    parser.add_argument(
+        "--out",
+        dest="out",
+        help="Output file",
+        required=True,
+        metavar="FILE"
+    )
+
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        dest="verbose",
+        default=False,
+        required=False,
+        help="Be verbose"
+    )
+
+    parser.add_argument(
+        '--version',
+        action='version',
+        version=__version__
+    )
+
+    # _________________________________________________________________________
+    # -------------------------------------------------------------------------
+    # get the arguments
+    # -------------------------------------------------------------------------
+    try:
+        options = parser.parse_args()
+    except Exception:
+        parser.print_help()
+
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    if options.verbose:
+        sys.stdout.write("and parsing {} {}".format(options.sam, os.linesep))
+
+    # Init dictionaries
+    # observed_reads: Store observed_reads to count multimappers only once
+    # sequence_counts: Number of times a sequence was observed
+
+    # key:read id, value: read_id
+    observed_reads = {}
+    # key: sequence, value: count
+    sequence_counts = {}
+
+    # Open alignment file
+    sam = pysam.AlignmentFile(options.sam, "rb")
+
+    for read in sam.fetch():
+
+        sequence = read.seq
+        qname = read.qname
+
+        if qname in observed_reads:
+            continue
+        else:
+            observed_reads[qname] = qname
+
+        if sequence not in sequence_counts:
+            sequence_counts[sequence] = 1
+        else:
+            sequence_counts[sequence] += 1
+
+    sam.close()
+
+    fh = open(options.out, "w")
+    for w in sorted(sequence_counts, key=sequence_counts.get, reverse=True):
+        fh.write(str(w) + "\t" + str(sequence_counts[w]) + "\n")
+    fh.close()
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Call the Main function and catch Keyboard interrups
+# -----------------------------------------------------------------------------
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!" + os.linesep)
+        sys.exit(0)
diff --git a/snakemake/process_data/scripts/scanAdapter.sh b/snakemake/process_data/scripts/scanAdapter.sh
new file mode 100755
index 0000000..bd2e7c0
--- /dev/null
+++ b/snakemake/process_data/scripts/scanAdapter.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# Alexander Kanitz
+# 04-JAN-2015
+
+# Update: 24-JAN-2010 (Foivos Gypas)
+
+# [USAGE]
+# scanAdapter.sh <FASTQ> <ADAPTER_FILE> (<NUMBER_OF_SEQUENCES>)
+
+# [DESCRIPTION]
+# The script scans a part of a specified FASTQ file (gzipped or uncompressed) for the presence of 
+# adapter sequences present in a separate file and the location of the adapter file. Optionally, 
+# the number of sequences to scan (default: 10,000).
+
+seqFile=$1
+adapterFile=${2}
+seqNumber=$((${3:-10000}*4))
+
+
+while read line; do
+    echo -n "${line}: "
+    grep -c "$line" <(zcat -f -- "$seqFile")
+done < "$adapterFile"
-- 
GitLab