From 5dc673b57d44deb160c8c675e1e178a782864cd1 Mon Sep 17 00:00:00 2001 From: BIOPZ-Gypas Foivos <foivos.gypas@unibas.ch> Date: Tue, 29 Jan 2019 16:01:33 +0100 Subject: [PATCH] Many changes: count of oligos, identification of overrepresented sequences, trimming of bases in the beginning in case of the smarter-kit protocol. --- .gitignore | 10 ++ snakemake/prepare_annotation/config.yaml | 6 +- snakemake/process_data/Snakefile | 64 ++++++++- snakemake/process_data/config.yaml | 13 +- .../scripts/find_overepresented_sequences.py | 121 ++++++++++++++++++ snakemake/process_data/scripts/scanAdapter.sh | 24 ++++ 6 files changed, 227 insertions(+), 11 deletions(-) create mode 100755 snakemake/process_data/scripts/find_overepresented_sequences.py create mode 100755 snakemake/process_data/scripts/scanAdapter.sh diff --git a/.gitignore b/.gitignore index d425535..3edaa91 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,13 @@ snakemake/annotation/ .DS_Store ._.DS_Store +snakemake/prepare_annotation/.snakemake/ +snakemake/prepare_annotation/logs/ +snakemake/prepare_annotation/results/ +snakemake/prepare_annotation/nohup.out +snakemake/process_data/._dag.png +snakemake/process_data/.snakemake/ +snakemake/process_data/dag.png +snakemake/process_data/logs/ +snakemake/process_data/nohup.out +snakemake/process_data/results/ diff --git a/snakemake/prepare_annotation/config.yaml b/snakemake/prepare_annotation/config.yaml index 02a2810..854ad37 100644 --- a/snakemake/prepare_annotation/config.yaml +++ b/snakemake/prepare_annotation/config.yaml @@ -2,9 +2,9 @@ ############################################################################## ### Annotation ############################################################################## - genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa" - gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf" - other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" + genome: "../annotation/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa" + gtf: "../annotation/Mus_musculus.GRCm38.90.chr.gtf" + other_RNAs_sequence: "../annotation/mm10_rrnas.fa" ############################################################################## ### Output and log directory ############################################################################## diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile index ba16c62..b660645 100644 --- a/snakemake/process_data/Snakefile +++ b/snakemake/process_data/Snakefile @@ -11,8 +11,46 @@ rule finish: input: pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]), counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]), - bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]) + bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]), + oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]), + overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"]) +################################################################################ +### Count oligos +################################################################################ +rule count_oligos: + input: + reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]), + oligos = config["oligos"], + script = "scripts/scanAdapter.sh" + output: + oligos_counts = os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt") + log: + os.path.join(config["local_log"], "count_oligos_{sample}.log") + shell: + "({input.script} {input.reads} {input.oligos} > {output.oligos_counts}) &> {log}" + +################################################################################# +### Trim first bases (For Smarter-Kit ONLY) +################################################################################# + +rule trim_first_bases: + input: + reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]) + output: + reads = os.path.join(config["output_dir"], "{sample}/trimmed_first_bases.fastq.gz"), + params: + cut = lambda wildcards: config[wildcards.sample]['trim_bases_5_prime'], + minimum_length = "20" + log: + os.path.join(config["local_log"], "trim_first_bases_{sample}.log") + singularity: + "docker://zavolab/cutadapt:1.16" + shell: + "(cutadapt \ + --cut {params.cut} \ + --minimum-length {params.minimum_length} \ + {input.reads} | gzip > {output.reads}) &> {log}" ################################################################################ ### Clipping reads @@ -20,13 +58,14 @@ rule finish: rule clip_reads: input: - reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]) + reads = os.path.join(config["output_dir"], "{sample}/trimmed_first_bases.fastq.gz"), output: reads = os.path.join(config["output_dir"], "{sample}", "pro.clipped.fastq.gz") params: v = "-v", n = "-n", l = "20", + c = "-c", adapter = lambda wildcards: config[wildcards.sample]['adapter'], z = "-z" log: @@ -38,6 +77,7 @@ rule clip_reads: {params.v} \ {params.n} \ -l {params.l} \ + {params.c} \ {params.z} \ -a {params.adapter} \ -i <(zcat {input.reads}) \ @@ -157,6 +197,26 @@ rule map_to_other_genes: -o {output.sam} \ -u {output.reads} ) &> {log}" +################################################################################ +### Count overrepresented sequences from other rRNAs (e.g. rRNA) +################################################################################ + +rule count_overrepresented_sequences_other: + input: + sam = os.path.join(config["output_dir"], "{sample}/other_genes.mapped.sam"), + script = "scripts/find_overepresented_sequences.py" + output: + overrepresented_sequences_counts = os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt") + log: + os.path.join(config["local_log"], "count_overrepresented_sequences_other_{sample}.log") + singularity: + "docker://fgypas/python_pysam:3.6.5_0.15.1" + shell: + "(python {input.script} \ + --sam {input.sam} \ + --out {output.overrepresented_sequences_counts}) &> {log}" + # "(grep -P -v \"^@\" {input.sam} | cut -f 10 | sort | uniq -c | sort -n -r > {output.overrepresented_sequences_counts}) &> {log}" + ################################################################################ ### Map reads to other genes (rRNA, tRNA, etc...) ################################################################################ diff --git a/snakemake/process_data/config.yaml b/snakemake/process_data/config.yaml index 25adcf2..4635bd2 100644 --- a/snakemake/process_data/config.yaml +++ b/snakemake/process_data/config.yaml @@ -2,11 +2,12 @@ ############################################################################## ### Annotation ############################################################################## - other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" + other_RNAs_sequence: "../annotation/mm10_rrnas.fa" other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx" transcripts_sequence: "../prepare_annotation/results/longest_pc_transcript_per_gene.fa" transcripts_index: "../prepare_annotation/results/longest_pc_transcript_per_gene.idx" transcript_id_gene_id_CDS: "../prepare_annotation/results/transcript_id_gene_id_CDS.tsv" + oligos: "../annotation/oligos.txt" ############################################################################## ### Output and log directory ############################################################################## @@ -18,9 +19,9 @@ ############################################################################## input_dir: "samples" input_reads_pattern: ".fastq.gz" - sample: ["example", "example2", "SRR1536304", "SRR1536305"] - example: {adapter: GATCGGAAGAGCACA, minimum_quality: 20, quality_type: 33} - example2: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 64} - SRR1536304: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 33} - SRR1536305: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 33} + sample: ["BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1", "BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1", "BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1", "BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1"] + BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} + BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} + BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} + BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} ... diff --git a/snakemake/process_data/scripts/find_overepresented_sequences.py b/snakemake/process_data/scripts/find_overepresented_sequences.py new file mode 100755 index 0000000..a268002 --- /dev/null +++ b/snakemake/process_data/scripts/find_overepresented_sequences.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# import needed (external) modules +# ----------------------------------------------------------------------------- + +from argparse import ArgumentParser, RawTextHelpFormatter +import sys +import os +import pysam + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Main function +# ----------------------------------------------------------------------------- + +def main(): + """ Main function """ + + __doc__ = "Find overepresented sequences from a SAM file." + __version__ = "0.1" + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter + ) + + parser.add_argument( + "--sam", + dest="sam", + help="Input bam file (sorted and indexed)", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--out", + dest="out", + help="Output file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--verbose", + action="store_true", + dest="verbose", + default=False, + required=False, + help="Be verbose" + ) + + parser.add_argument( + '--version', + action='version', + version=__version__ + ) + + # _________________________________________________________________________ + # ------------------------------------------------------------------------- + # get the arguments + # ------------------------------------------------------------------------- + try: + options = parser.parse_args() + except Exception: + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + if options.verbose: + sys.stdout.write("and parsing {} {}".format(options.sam, os.linesep)) + + # Init dictionaries + # observed_reads: Store observed_reads to count multimappers only once + # sequence_counts: Number of times a sequence was observed + + # key:read id, value: read_id + observed_reads = {} + # key: sequence, value: count + sequence_counts = {} + + # Open alignment file + sam = pysam.AlignmentFile(options.sam, "rb") + + for read in sam.fetch(): + + sequence = read.seq + qname = read.qname + + if qname in observed_reads: + continue + else: + observed_reads[qname] = qname + + if sequence not in sequence_counts: + sequence_counts[sequence] = 1 + else: + sequence_counts[sequence] += 1 + + sam.close() + + fh = open(options.out, "w") + for w in sorted(sequence_counts, key=sequence_counts.get, reverse=True): + fh.write(str(w) + "\t" + str(sequence_counts[w]) + "\n") + fh.close() + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) diff --git a/snakemake/process_data/scripts/scanAdapter.sh b/snakemake/process_data/scripts/scanAdapter.sh new file mode 100755 index 0000000..bd2e7c0 --- /dev/null +++ b/snakemake/process_data/scripts/scanAdapter.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Alexander Kanitz +# 04-JAN-2015 + +# Update: 24-JAN-2010 (Foivos Gypas) + +# [USAGE] +# scanAdapter.sh <FASTQ> <ADAPTER_FILE> (<NUMBER_OF_SEQUENCES>) + +# [DESCRIPTION] +# The script scans a part of a specified FASTQ file (gzipped or uncompressed) for the presence of +# adapter sequences present in a separate file and the location of the adapter file. Optionally, +# the number of sequences to scan (default: 10,000). + +seqFile=$1 +adapterFile=${2} +seqNumber=$((${3:-10000}*4)) + + +while read line; do + echo -n "${line}: " + grep -c "$line" <(zcat -f -- "$seqFile") +done < "$adapterFile" -- GitLab