Skip to content
Snippets Groups Projects
Commit 5dc673b5 authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

Many changes: count of oligos, identification of overrepresented sequences,...

Many changes: count of oligos, identification of overrepresented sequences, trimming of bases in the beginning in case of the smarter-kit protocol.
parent 88f36c60
No related branches found
No related tags found
No related merge requests found
snakemake/annotation/ snakemake/annotation/
.DS_Store .DS_Store
._.DS_Store ._.DS_Store
snakemake/prepare_annotation/.snakemake/
snakemake/prepare_annotation/logs/
snakemake/prepare_annotation/results/
snakemake/prepare_annotation/nohup.out
snakemake/process_data/._dag.png
snakemake/process_data/.snakemake/
snakemake/process_data/dag.png
snakemake/process_data/logs/
snakemake/process_data/nohup.out
snakemake/process_data/results/
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
############################################################################## ##############################################################################
### Annotation ### Annotation
############################################################################## ##############################################################################
genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa" genome: "../annotation/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa"
gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf" gtf: "../annotation/Mus_musculus.GRCm38.90.chr.gtf"
other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" other_RNAs_sequence: "../annotation/mm10_rrnas.fa"
############################################################################## ##############################################################################
### Output and log directory ### Output and log directory
############################################################################## ##############################################################################
......
...@@ -11,8 +11,46 @@ rule finish: ...@@ -11,8 +11,46 @@ rule finish:
input: input:
pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]), pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]), counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]),
bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]) bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]),
oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]),
overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"])
################################################################################
### Count oligos
################################################################################
rule count_oligos:
input:
reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
oligos = config["oligos"],
script = "scripts/scanAdapter.sh"
output:
oligos_counts = os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt")
log:
os.path.join(config["local_log"], "count_oligos_{sample}.log")
shell:
"({input.script} {input.reads} {input.oligos} > {output.oligos_counts}) &> {log}"
#################################################################################
### Trim first bases (For Smarter-Kit ONLY)
#################################################################################
rule trim_first_bases:
input:
reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"])
output:
reads = os.path.join(config["output_dir"], "{sample}/trimmed_first_bases.fastq.gz"),
params:
cut = lambda wildcards: config[wildcards.sample]['trim_bases_5_prime'],
minimum_length = "20"
log:
os.path.join(config["local_log"], "trim_first_bases_{sample}.log")
singularity:
"docker://zavolab/cutadapt:1.16"
shell:
"(cutadapt \
--cut {params.cut} \
--minimum-length {params.minimum_length} \
{input.reads} | gzip > {output.reads}) &> {log}"
################################################################################ ################################################################################
### Clipping reads ### Clipping reads
...@@ -20,13 +58,14 @@ rule finish: ...@@ -20,13 +58,14 @@ rule finish:
rule clip_reads: rule clip_reads:
input: input:
reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]) reads = os.path.join(config["output_dir"], "{sample}/trimmed_first_bases.fastq.gz"),
output: output:
reads = os.path.join(config["output_dir"], "{sample}", "pro.clipped.fastq.gz") reads = os.path.join(config["output_dir"], "{sample}", "pro.clipped.fastq.gz")
params: params:
v = "-v", v = "-v",
n = "-n", n = "-n",
l = "20", l = "20",
c = "-c",
adapter = lambda wildcards: config[wildcards.sample]['adapter'], adapter = lambda wildcards: config[wildcards.sample]['adapter'],
z = "-z" z = "-z"
log: log:
...@@ -38,6 +77,7 @@ rule clip_reads: ...@@ -38,6 +77,7 @@ rule clip_reads:
{params.v} \ {params.v} \
{params.n} \ {params.n} \
-l {params.l} \ -l {params.l} \
{params.c} \
{params.z} \ {params.z} \
-a {params.adapter} \ -a {params.adapter} \
-i <(zcat {input.reads}) \ -i <(zcat {input.reads}) \
...@@ -157,6 +197,26 @@ rule map_to_other_genes: ...@@ -157,6 +197,26 @@ rule map_to_other_genes:
-o {output.sam} \ -o {output.sam} \
-u {output.reads} ) &> {log}" -u {output.reads} ) &> {log}"
################################################################################
### Count overrepresented sequences from other rRNAs (e.g. rRNA)
################################################################################
rule count_overrepresented_sequences_other:
input:
sam = os.path.join(config["output_dir"], "{sample}/other_genes.mapped.sam"),
script = "scripts/find_overepresented_sequences.py"
output:
overrepresented_sequences_counts = os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt")
log:
os.path.join(config["local_log"], "count_overrepresented_sequences_other_{sample}.log")
singularity:
"docker://fgypas/python_pysam:3.6.5_0.15.1"
shell:
"(python {input.script} \
--sam {input.sam} \
--out {output.overrepresented_sequences_counts}) &> {log}"
# "(grep -P -v \"^@\" {input.sam} | cut -f 10 | sort | uniq -c | sort -n -r > {output.overrepresented_sequences_counts}) &> {log}"
################################################################################ ################################################################################
### Map reads to other genes (rRNA, tRNA, etc...) ### Map reads to other genes (rRNA, tRNA, etc...)
################################################################################ ################################################################################
......
...@@ -2,11 +2,12 @@ ...@@ -2,11 +2,12 @@
############################################################################## ##############################################################################
### Annotation ### Annotation
############################################################################## ##############################################################################
other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" other_RNAs_sequence: "../annotation/mm10_rrnas.fa"
other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx" other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx"
transcripts_sequence: "../prepare_annotation/results/longest_pc_transcript_per_gene.fa" transcripts_sequence: "../prepare_annotation/results/longest_pc_transcript_per_gene.fa"
transcripts_index: "../prepare_annotation/results/longest_pc_transcript_per_gene.idx" transcripts_index: "../prepare_annotation/results/longest_pc_transcript_per_gene.idx"
transcript_id_gene_id_CDS: "../prepare_annotation/results/transcript_id_gene_id_CDS.tsv" transcript_id_gene_id_CDS: "../prepare_annotation/results/transcript_id_gene_id_CDS.tsv"
oligos: "../annotation/oligos.txt"
############################################################################## ##############################################################################
### Output and log directory ### Output and log directory
############################################################################## ##############################################################################
...@@ -18,9 +19,9 @@ ...@@ -18,9 +19,9 @@
############################################################################## ##############################################################################
input_dir: "samples" input_dir: "samples"
input_reads_pattern: ".fastq.gz" input_reads_pattern: ".fastq.gz"
sample: ["example", "example2", "SRR1536304", "SRR1536305"] sample: ["BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1", "BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1", "BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1", "BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1"]
example: {adapter: GATCGGAAGAGCACA, minimum_quality: 20, quality_type: 33} BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
example2: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 64} BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
SRR1536304: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 33} BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
SRR1536305: {adapter: CTGTAGGCACCATCA, minimum_quality: 20, quality_type: 33} BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
... ...
#!/usr/bin/env python
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
from argparse import ArgumentParser, RawTextHelpFormatter
import sys
import os
import pysam
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
""" Main function """
__doc__ = "Find overepresented sequences from a SAM file."
__version__ = "0.1"
parser = ArgumentParser(
description=__doc__,
formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"--sam",
dest="sam",
help="Input bam file (sorted and indexed)",
required=True,
metavar="FILE"
)
parser.add_argument(
"--out",
dest="out",
help="Output file",
required=True,
metavar="FILE"
)
parser.add_argument(
"--verbose",
action="store_true",
dest="verbose",
default=False,
required=False,
help="Be verbose"
)
parser.add_argument(
'--version',
action='version',
version=__version__
)
# _________________________________________________________________________
# -------------------------------------------------------------------------
# get the arguments
# -------------------------------------------------------------------------
try:
options = parser.parse_args()
except Exception:
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
if options.verbose:
sys.stdout.write("and parsing {} {}".format(options.sam, os.linesep))
# Init dictionaries
# observed_reads: Store observed_reads to count multimappers only once
# sequence_counts: Number of times a sequence was observed
# key:read id, value: read_id
observed_reads = {}
# key: sequence, value: count
sequence_counts = {}
# Open alignment file
sam = pysam.AlignmentFile(options.sam, "rb")
for read in sam.fetch():
sequence = read.seq
qname = read.qname
if qname in observed_reads:
continue
else:
observed_reads[qname] = qname
if sequence not in sequence_counts:
sequence_counts[sequence] = 1
else:
sequence_counts[sequence] += 1
sam.close()
fh = open(options.out, "w")
for w in sorted(sequence_counts, key=sequence_counts.get, reverse=True):
fh.write(str(w) + "\t" + str(sequence_counts[w]) + "\n")
fh.close()
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!" + os.linesep)
sys.exit(0)
#!/bin/bash
# Alexander Kanitz
# 04-JAN-2015
# Update: 24-JAN-2010 (Foivos Gypas)
# [USAGE]
# scanAdapter.sh <FASTQ> <ADAPTER_FILE> (<NUMBER_OF_SEQUENCES>)
# [DESCRIPTION]
# The script scans a part of a specified FASTQ file (gzipped or uncompressed) for the presence of
# adapter sequences present in a separate file and the location of the adapter file. Optionally,
# the number of sequences to scan (default: 10,000).
seqFile=$1
adapterFile=${2}
seqNumber=$((${3:-10000}*4))
while read line; do
echo -n "${line}: "
grep -c "$line" <(zcat -f -- "$seqFile")
done < "$adapterFile"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment