From 52c1de75dfc3122d2172d062afc924b7a3b19273 Mon Sep 17 00:00:00 2001 From: BIOPZ-Gypas Foivos <foivos.gypas@unibas.ch> Date: Tue, 19 Feb 2019 18:50:02 +0100 Subject: [PATCH] Addition of basic steps for process data --- .gitignore | 3 +- process_data/.gitignore | 7 + process_data/Snakefile | 182 +++++++++++++++++++++ process_data/cluster.json | 41 +++++ process_data/config.yaml | 24 +++ process_data/create_snakemake_flowchart.sh | 1 + process_data/run_snakefile.sh | 13 ++ 7 files changed, 269 insertions(+), 2 deletions(-) create mode 100644 process_data/.gitignore create mode 100644 process_data/Snakefile create mode 100644 process_data/cluster.json create mode 100644 process_data/config.yaml create mode 100755 process_data/create_snakemake_flowchart.sh create mode 100755 process_data/run_snakefile.sh diff --git a/.gitignore b/.gitignore index ea553e2..8b13789 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ -download_annotation -process_data + diff --git a/process_data/.gitignore b/process_data/.gitignore new file mode 100644 index 0000000..58c3d7f --- /dev/null +++ b/process_data/.gitignore @@ -0,0 +1,7 @@ +.* +Log.out +results +logs +dag.png +nohup.out +samples diff --git a/process_data/Snakefile b/process_data/Snakefile new file mode 100644 index 0000000..edd9baa --- /dev/null +++ b/process_data/Snakefile @@ -0,0 +1,182 @@ +configfile: "config.yaml" + +localrules: finish + +################################################################################# +### Final rule +################################################################################# + +rule finish: + input: + fastqc = expand(os.path.join(config["output_dir"], "{sample}", "fastqc"), sample=config["sample"]), + htseq_qa = expand(os.path.join(config["output_dir"], "{sample}", "htseq_qa", "htseq_quality.pdf"), sample=config["sample"]), + gn_estimates = expand(os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.genes.sf"), sample=config["sample"]), + bam = expand(os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam"), sample=config["sample"]) + +################################################################################## +### Fastqc +################################################################################## + +rule fastqc: + input: + reads = os.path.join(config["input_dir"], "{sample}.fastq.gz") + output: + outdir = os.path.join(config["output_dir"], "{sample}", "fastqc") + singularity: + "docker://zavolab/fastqc:0.11.8" + log: + os.path.join(config["local_log"], "fastqc_{sample}.log") + shell: + "(mkdir -p {output.outdir}; \ + fastqc \ + --outdir {output.outdir} \ + {input.reads}) &> {log}" + +################################################################################## +### HTSeq quality assessment of the fastq file +################################################################################## + +rule htseq_qa: + input: + reads = os.path.join(config["input_dir"], "{sample}.fastq.gz") + output: + qual_pdf = os.path.join(config["output_dir"], "{sample}", "htseq_qa", "htseq_quality.pdf") + singularity: + "docker://zavolab/python_htseq:3.6.5_0.10.0" + log: + os.path.join(config["local_log"], "htseq_qa_{sample}.log") + shell: + "(htseq-qa \ + -t fastq \ + -o {output.qual_pdf} \ + {input.reads} ) &> {log}" + +################################################################################## +### Map to other RNAs with Segemehl +################################################################################## + +rule map_to_other_RNAs: + input: + reads = os.path.join(config["input_dir"], "{sample}.fastq.gz"), + index = config["other_RNAs_index"], + sequence = config["other_RNAs_sequence"] + output: + sam = os.path.join(config["output_dir"], "{sample}", "other_genes.mapped.sam"), + reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz") + params: + reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq"), + silent = "--silent", + accuracy = "90" + log: + os.path.join(config["local_log"], "map_to_other_genes_{sample}.log") + threads: 8 + singularity: + "docker://zavolab/segemehl:0.2.0" + shell: + "(segemehl.x \ + {params.silent} \ + -i {input.index} \ + -d {input.sequence} \ + -q {input.reads} \ + --accuracy {params.accuracy} \ + --threads {threads} \ + -o {output.sam} \ + -u {params.reads}; \ + gzip -c {params.reads} > {output.reads}; \ + rm {params.reads}) &> {log}" + +################################################################################## +### salmon quant +################################################################################## + +rule salmon_quant: + input: + reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz"), + gtf = config["annotation_filtered"], + index = config["salmon_index"] + output: + output_dir = os.path.join(config["output_dir"], "{sample}", "salmon_quant"), + gn_estimates = os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.genes.sf"), + tr_estimates = os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.sf") + params: + libType = lambda wildcards: config[wildcards.sample]['libType'], + fldMean = lambda wildcards: config[wildcards.sample]['fldMean'], + fldSD = lambda wildcards: config[wildcards.sample]['fldSD'], + log: + os.path.join(config["local_log"], "salmon_quant_{sample}.log") + threads: 6 + singularity: + "docker://zavolab/salmon:0.11.0" + shell: + "(salmon quant \ + --index {input.index} \ + --libType {params.libType} \ + --unmatedReads <(zcat {input.reads}) \ + --seqBias \ + --geneMap {input.gtf} \ + --fldMean {params.fldMean} \ + --fldSD {params.fldSD} \ + --threads {threads} \ + --output {output.output_dir}) &> {log}" + +################################################################################# +### Align reads STAR +################################################################################# + +rule align_reads_STAR: + input: + index = config["STAR_index"], + reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz"), + gtf = config["annotation"] + output: + outputfile = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam") + params: + outFileNamePrefix = os.path.join(config["output_dir"], "{sample}", "STAR_") + log: + os.path.join(config["local_log"],"align_reads_STAR_{sample}.log") + threads: 8 + singularity: + "docker://zavolab/star:2.6.0a" + shell: + "(STAR --runMode alignReads \ + --twopassMode Basic \ + --runThreadN {threads} \ + --genomeDir {input.index} \ + --sjdbGTFfile {input.gtf} \ + --readFilesIn {input.reads} \ + --readFilesCommand zcat \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMtype BAM Unsorted) &> {log}" + +################################################################################ +### Sort alignment file +################################################################################ + +rule sort_bam: + input: + bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam") + output: + bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam") + threads: 8 + log: + os.path.join(config["local_log"],"sort_bam_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools sort -@ {threads} {input.bam} > {output.bam}) &> {log}" + +################################################################################ +### Index alignment file +################################################################################ + +rule samtools_index: + input: + bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam") + output: + bai = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam.bai") + log: + os.path.join(config["local_log"],"samtools_index_{sample}.log") + singularity: + "docker://zavolab/samtools:1.8" + shell: + "(samtools index {input.bam} > {output.bai}) &> {log}" diff --git a/process_data/cluster.json b/process_data/cluster.json new file mode 100644 index 0000000..eb681fc --- /dev/null +++ b/process_data/cluster.json @@ -0,0 +1,41 @@ +{ + "__default__" : + { + "queue": "6hours", + "time": "01:00:00", + "threads": "1", + "mem": "4G", + "name": "{rule}.{wildcards}", + "out": "$PWD/logs/cluster_log/{rule}.{wildcards}-%j-%N.out" + }, + "generate_segemehl_index_other_RNAs": + { + "time": "06:00:00", + "threads":"8", + "mem":"50G" + }, + "map_to_other_RNAs": + { + "time": "06:00:00", + "threads":"8", + "mem":"50G" + }, + "index_genome_STAR": + { + "time": "06:00:00", + "threads":"8", + "mem":"75G" + }, + "align_reads_STAR": + { + "time": "06:00:00", + "threads":"8", + "mem":"75G" + }, + "salmon_quant": + { + "time": "02:00:00", + "threads":"6", + "mem":"32G" + } +} diff --git a/process_data/config.yaml b/process_data/config.yaml new file mode 100644 index 0000000..d2c34ba --- /dev/null +++ b/process_data/config.yaml @@ -0,0 +1,24 @@ +--- + ############################################################################## + ### Annotation + ############################################################################## + annotation: "../prepare_annotation/results/annotation.gtf" + genome: "../prepare_annotation/results/genome.fa" + annotation_filtered: "../prepare_annotation/results/filtered_transcripts.gtf" + STAR_index: "../prepare_annotation/results/STAR_index/" + other_RNAs_sequence: "../prepare_annotation/other.fa" + other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx" + salmon_index: "../prepare_annotation/results/filtered_transcripts_salmon.idx/" + ############################################################################## + ### Output and log directories + ############################################################################## + output_dir: "results" + local_log: "logs/local_log" + cluster_log: "logs/cluster_log" + ############################################################################## + ### Sample info + ############################################################################## + input_dir: "samples" + sample: ["test"] + test: {libType: A, fldMean: 300, fldSD: 100} +... diff --git a/process_data/create_snakemake_flowchart.sh b/process_data/create_snakemake_flowchart.sh new file mode 100755 index 0000000..ce71a9a --- /dev/null +++ b/process_data/create_snakemake_flowchart.sh @@ -0,0 +1 @@ +snakemake --dag -np | dot -Tpng > dag.png diff --git a/process_data/run_snakefile.sh b/process_data/run_snakefile.sh new file mode 100755 index 0000000..ceec8a8 --- /dev/null +++ b/process_data/run_snakefile.sh @@ -0,0 +1,13 @@ +# set -e + +mkdir -p logs/cluster_log +mkdir -p logs/local_log + +snakemake \ +--cluster-config cluster.json \ +--cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --job-name={cluster.name} -o {cluster.out} -p scicore" \ +--cores 256 \ +-p \ +--rerun-incomplete \ +--use-singularity \ +--singularity-args "--bind ${PWD}" -- GitLab