Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • zavolan_group/pipelines/zarp
1 result
Show changes
Commits on Source (4)
Showing
with 1297 additions and 880 deletions
......@@ -4,7 +4,7 @@ before_script:
- apt update && apt install -y gcc
- conda init bash && source ~/.bashrc && echo $CONDA_DEFAULT_ENV
- conda env create -f install/environment.root.yml
- conda activate rnaseq_pipeline && echo $CONDA_DEFAULT_ENV
- conda activate rhea && echo $CONDA_DEFAULT_ENV
- conda env update -f install/environment.dev.yml
test:
......
# RNA-Seq pipeline
# Rhea pipeline
[Snakemake][snakemake] workflow for general purpose RNA-Seq library annotation
developed by the [Zavolan lab][zavolan-lab].
......@@ -29,8 +29,8 @@ Traverse to the desired path on your file system, then clone the repository and
move into it with:
```bash
git clone ssh://git@git.scicore.unibas.ch:2222/zavolan_group/pipelines/rnaseqpipeline.git
cd rnaseqpipeline
git clone ssh://git@git.scicore.unibas.ch:2222/zavolan_group/pipelines/rhea.git
cd rhea
```
### Installing Conda
......@@ -87,7 +87,7 @@ conda env create -f install/environment.root.yml
Activate the Conda environment with:
```bash
conda activate rnaseq_pipeline
conda activate rhea
```
### Installing non-essential dependencies
......
......@@ -5,6 +5,8 @@ import sys
import pandas as pd
import shutil
import glob
from zipfile import ZipFile
# Get sample table
samples_table = pd.read_csv(
......@@ -17,7 +19,8 @@ samples_table = pd.read_csv(
)
# Global config
localrules: finish, rename_star_rpm_for_alfa
localrules: finish, rename_star_rpm_for_alfa, prepare_files_for_report, \
prepare_MultiQC_config
# Create log directories
os.makedirs(
......@@ -44,76 +47,11 @@ rule finish:
Rule for collecting outputs
"""
input:
outdir1 = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"mate1_fastqc"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
pseudoalignment = expand(
MultiQC_report = expand(
os.path.join(
config['output_dir'],
"{seqmode}",
"{sample}",
"quant_kallisto",
"{sample}.kallisto.pseudo.sam"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
TIN_boxplot_PNG = os.path.join(
config['output_dir'],
"TIN_scores_boxplot.png"),
TIN_boxplot_PDF = os.path.join(
config['output_dir'],
"TIN_scores_boxplot.pdf"),
salmon_merge_genes = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"genes_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
salmon_merge_transcripts = expand(
os.path.join(
config["output_dir"],
"summary_salmon",
"quantmerge",
"transcripts_{salmon_merge_on}.tsv"),
salmon_merge_on=["tpm", "numreads"]),
star_rpm = expand(
os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"STAR_coverage",
"{sample}_Signal.UniqueMultiple.str1.out.bg"),
zip,
sample=[i for i in list(samples_table.index.values)],
seqmode=[samples_table.loc[i, 'seqmode']
for i in list(samples_table.index.values)]),
alfa_reports = expand(os.path.join(
config["output_dir"],
"{seqmode}",
"{sample}",
"ALFA",
"ALFA_plots.Biotypes.pdf"),
zip,
sample= [i for i in list(samples_table.index.values)],
seqmode= [
samples_table.loc[i,"seqmode"]
for i in list(samples_table.index.values)]),
alfa_all_samples = os.path.join(
config["output_dir"],
"ALFA",
"ALFA_plots.Categories.pdf")
"multiqc_summary"),
output_dir=config["output_dir"])
rule create_index_star:
"""
......@@ -921,3 +859,377 @@ rule alfa_qc_all_samples:
"""
(alfa -c {input.tables} -o {params.out_dir}) &> {log}
"""
rule prepare_files_for_report:
    '''
    Re-structure the results and add comments for MultiQC parsing

    Steps performed by the run block, in order:
      1. Merge "<dir>/paired_end/*" and "<dir>/single_end/*" into
         "<dir>/samples/" for both the results and the log directory and
         remove the per-sequencing-mode directories.
      2. Nest each sample's "salmon_quant" content one level deeper, in a
         sub-directory named after the sample.
      3. Unpack the FastQC archives, prefix the "Filename" entry of
         "fastqc_data.txt" with the sample name and delete the archives.
      4. Write a ".MODIFIED" copy of each Kallisto stderr log with one
         dot-separated field of its ninth line removed.
      5. Append the sample name as a "#" comment to the second line of each
         Cutadapt stdout log.
      6. Rename the TIN boxplots to "TIN scores_mqc.{png,pdf}".
    '''
    input:
        outdir1 = expand(
            os.path.join(
                config['output_dir'],
                "{seqmode}",
                "{sample}",
                "mate1_fastqc"),
            zip,
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)]),
        pseudoalignment = expand(
            os.path.join(
                config['output_dir'],
                "{seqmode}",
                "{sample}",
                "quant_kallisto",
                "{sample}.kallisto.pseudo.sam"),
            zip,
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)]),
        TIN_boxplot_PNG = os.path.join(
            config['output_dir'],
            "TIN_scores_boxplot.png"),
        TIN_boxplot_PDF = os.path.join(
            config['output_dir'],
            "TIN_scores_boxplot.pdf"),
        salmon_merge_genes = expand(
            os.path.join(
                config["output_dir"],
                "summary_salmon",
                "quantmerge",
                "genes_{salmon_merge_on}.tsv"),
            salmon_merge_on=["tpm", "numreads"]),
        salmon_merge_transcripts = expand(
            os.path.join(
                config["output_dir"],
                "summary_salmon",
                "quantmerge",
                "transcripts_{salmon_merge_on}.tsv"),
            salmon_merge_on=["tpm", "numreads"]),
        star_rpm = expand(
            os.path.join(
                config["output_dir"],
                "{seqmode}",
                "{sample}",
                "STAR_coverage",
                "{sample}_Signal.UniqueMultiple.str1.out.bg"),
            zip,
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, 'seqmode']
                     for i in list(samples_table.index.values)]),
        alfa_reports = expand(
            os.path.join(
                config["output_dir"],
                "{seqmode}",
                "{sample}",
                "ALFA",
                "ALFA_plots.Biotypes.pdf"),
            zip,
            sample=[i for i in list(samples_table.index.values)],
            seqmode=[samples_table.loc[i, "seqmode"]
                     for i in list(samples_table.index.values)]),
        alfa_all_samples = os.path.join(
            config["output_dir"],
            "ALFA",
            "ALFA_plots.Categories.pdf")
    output:
        samples_dir = directory(os.path.join(
            "{output_dir}",
            "samples"))
    params:
        results_dir = config["output_dir"],
        log_dir = config["log_dir"],
        log_samples_dir = os.path.join(
            config["log_dir"],
            "samples")
    log:
        LOG_local_log = os.path.join(
            "{output_dir}",
            "local_log",
            "prepare_files_for_report.log")
    run:
        # NOTE(review): the log file declared above is never written to by
        # this run block.

        # 1. Remove "single/paired end" from the results and logs
        #    directories: gather all samples under "<base>/samples".
        relocations = (
            (output.samples_dir, params.results_dir),
            (params.log_samples_dir, params.log_dir),
        )
        for samples_dir, base_dir in relocations:
            # Tolerate a directory left over from an interrupted run.
            os.makedirs(samples_dir, exist_ok=True)
            for seqmode in ("paired_end", "single_end"):
                seqmode_dir = os.path.join(base_dir, seqmode)
                # A run may contain samples of one sequencing mode only;
                # without this guard rmtree() would raise for the missing
                # directory of the other mode.
                if not os.path.isdir(seqmode_dir):
                    continue
                for sample_dir in glob.glob(os.path.join(seqmode_dir, "*")):
                    shutil.copytree(
                        sample_dir,
                        os.path.join(
                            base_dir,
                            "samples",
                            os.path.basename(sample_dir)))
                shutil.rmtree(seqmode_dir)

        # 2. Encapsulate salmon quantification results:
        #    <sample>/salmon_quant/* -> <sample>/salmon_quant/<sample>/*
        for sample_dir in glob.glob(
                os.path.join(params.results_dir, "samples", "*")):
            sample_name = os.path.basename(sample_dir)
            salmon_dir = os.path.join(sample_dir, "salmon_quant")
            staging_dir = os.path.join(sample_dir, sample_name)
            # A directory cannot be moved into itself, hence the detour via
            # a temporary sibling directory.
            shutil.move(salmon_dir, staging_dir)
            os.mkdir(salmon_dir)
            shutil.move(staging_dir, os.path.join(salmon_dir, sample_name))

        # 3. Adjust FastQC results 'Filename' field
        fastqc_archives = glob.glob(
            os.path.join(
                params.results_dir,
                "samples",
                "*",
                "*_fastqc",
                "*_fastqc.zip"))
        for zip_path in fastqc_archives:
            # .../samples/<sample>/<mate>_fastqc/<name>_fastqc.zip
            sample_name = zip_path.split("/")[-3]
            # dirname() keeps the leading "/" of absolute paths; re-joining
            # the "/"-split chunks would silently turn them into relative
            # paths.
            with ZipFile(zip_path, 'r') as zip_f:
                zip_f.extractall(os.path.dirname(zip_path))
            fastqc_data_f = os.path.join(
                zip_path[:-4],  # strip ".zip"
                "fastqc_data.txt")
            with open(fastqc_data_f) as f:
                log_lines = f.read().splitlines()
            # Fourth line is rewritten as "Filename\t<sample>|<old value>".
            log_lines[3] = (
                "Filename\t" + sample_name + "|" + log_lines[3].split("\t")[1])
            with open(fastqc_data_f, "w") as f:
                f.writelines(line + "\n" for line in log_lines)
            os.remove(zip_path)

        # 4. Adjust Kallisto quantification logs
        kallisto_logs = glob.glob(
            os.path.join(
                params.log_dir,
                "samples",
                "*",
                "genome_quantification_kallisto.stderr.log"))
        for kallisto_log in kallisto_logs:
            with open(kallisto_log) as f:
                log_lines = f.read().splitlines()
            # Keep dot-separated fields 0, 2 and 3 of the ninth line, i.e.
            # drop field 1.
            temp = log_lines[8].split(".")
            log_lines[8] = temp[0] + "." + temp[2] + "." + temp[3]
            # The original log is kept; the edited copy gets a new suffix.
            with open(kallisto_log + ".MODIFIED", "w") as f:
                f.writelines(line + "\n" for line in log_lines)

        # 5. Add #-comment to all cutadapt logs
        cutadapt_logs = glob.glob(
            os.path.join(
                params.log_dir,
                "samples",
                "*",
                "remove_*_cutadapt.stdout.log"))
        for cutadapt_log in cutadapt_logs:
            sample_name = cutadapt_log.split("/")[-2]
            with open(cutadapt_log) as f:
                log_lines = f.read().splitlines()
            log_lines[1] = log_lines[1] + " # " + sample_name
            with open(cutadapt_log, "w") as f:
                f.writelines(line + "\n" for line in log_lines)

        # 6. Adjust TIN boxplots filenames for MultiQC recognition
        os.rename(
            input.TIN_boxplot_PNG,
            os.path.join(
                params.results_dir,
                "TIN scores_mqc.png"))
        os.rename(
            input.TIN_boxplot_PDF,
            os.path.join(
                params.results_dir,
                "TIN scores_mqc.pdf"))
# Writes a static MultiQC configuration file in YAML format to
# output.multiqc_config. The file content is hard-coded below; only the logo
# path is taken from params.
rule prepare_MultiQC_config:
'''
Prepare config for the MultiQC
'''
# The input directory is not read by the run block; it matches the output
# path pattern of rule prepare_files_for_report, so presumably it only
# enforces execution order - TODO confirm.
input:
multiqc_input_dir = os.path.join(
"{output_dir}",
"samples")
output:
multiqc_config = os.path.join(
"{output_dir}",
"MultiQC_config.yaml")
# NOTE(review): logo_path is a relative path; which directory it is resolved
# against at report time is not visible here - verify against the working
# directory used for the MultiQC call.
# NOTE(review): params.results_dir is declared but not used by the run block.
params:
logo_path = os.path.join(
"..",
"..",
"images",
"logo.128px.png"),
results_dir = config["output_dir"]
# NOTE(review): the log file is declared but never written to by the run block.
log:
LOG_local_log = \
os.path.join("{output_dir}", "local_log", \
"prepare_MultiQC_config.log")
run:
with open(output.multiqc_config, "w") as YAML:
# Report header: title, subtitle, intro text and logo.
YAML.write("---\n\n")
YAML.write("title: \"Rhea\"\n")
YAML.write("subtitle: \"RNA-Seq processing pipeline developed by the members of Zavolan Lab\"\n")
# NOTE(review): "config[analysis_title]" is written literally into the file;
# no value is substituted from the workflow config.
YAML.write("intro_text: \"Short analysis title from config[analysis_title]\"\n")
YAML.write("custom_logo: \""+params.logo_path+"\"\n")
YAML.write("custom_logo_url: \"https://www.biozentrum.unibas.ch/research/researchgroups/overview/unit/zavolan/research-group-mihaela-zavolan/\"\n")
YAML.write("custom_logo_title: \"ZavoLab\"\n\n")
YAML.write("report_header_info:\n")
YAML.write(" - Project Type: \"Snakemake workflow\"\n")
YAML.write(" - Analysis Type: \"RNA-seq\"\n")
# NOTE(review): "config[author_name]" and "config[author_email]" are likewise
# written literally, not replaced by config values.
YAML.write(" - Analysis Author: \"config[author_name]\"\n")
YAML.write(" - Contact E-mail: \"config[author_email]\"\n\n")
# Module list: one entry per tool, each restricted to files matching its
# path_filters patterns.
# NOTE(review): the nesting of the YAML keys below depends on the leading
# whitespace inside these string literals - verify that the written file
# parses as intended.
YAML.write("top_modules:\n\n")
YAML.write(" - fastqc:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/mate1_fastqc/*\"\n")
YAML.write(" - \"*/mate2_fastqc/*\"\n")
YAML.write("\n")
# Cutadapt is listed twice, once per log file name, with separate section names.
YAML.write(" - cutadapt:\n")
YAML.write(" name: \"Cutadapt: adapter removal\"\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/remove_adapters_cutadapt.stdout.log\"\n")
YAML.write("\n")
YAML.write(" - cutadapt:\n")
YAML.write(" name: \"Cutadapt: polyA tails removal\"\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/remove_polya_cutadapt.stdout.log\"\n")
YAML.write("\n")
YAML.write(" - star:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/map_genome/*\"\n")
YAML.write("\n")
# File name matches the renamed TIN boxplot written by rule
# prepare_files_for_report.
YAML.write(" - TIN_scores:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/TIN scores_mqc.png\"\n")
YAML.write("\n")
YAML.write(" - salmon:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/salmon_quant/*\"\n")
YAML.write("\n")
# Only the ".MODIFIED" copies of the Kallisto logs are matched.
YAML.write(" - kallisto:\n")
YAML.write(" path_filters:\n")
YAML.write(" - \"*/genome_quantification_kallisto.stderr.log.MODIFIED\"\n")
YAML.write("\n")
# "..." is the YAML end-of-document marker.
YAML.write("...")
# Runs MultiQC on the results and log directories, using the given config
# file, and writes the report into the output directory.
rule MULTIQC_report:
'''
Create report with MultiQC
'''
# NOTE(review): the input path is built from config["output_dir"], whereas
# output and log use the "{output_dir}" wildcard; both agree only when the
# wildcard resolves to config["output_dir"].
input:
multiqc_config = os.path.join(
config["output_dir"],
"MultiQC_config.yaml")
output:
MultiQC_report = \
directory(os.path.join("{output_dir}", "multiqc_summary"))
# Directories passed to MultiQC as positional arguments (searched for files).
params:
results_dir = config["output_dir"],
log_dir = config["log_dir"]
log:
LOG_local_log = \
os.path.join("{output_dir}", "local_log", \
"MULTIQC_report.log")
# Container image providing MultiQC 1.7.
singularity:
"docker://ewels/multiqc:1.7"
# stdout and stderr of the multiqc call are both redirected to the log file.
shell:
"""
multiqc \
--outdir {output.MultiQC_report} \
--config {input.multiqc_config} \
{params.results_dir} \
{params.log_dir} \
&> {log.LOG_local_log};
"""
\ No newline at end of file
This diff is collapsed.
images/logo.128px.png

7.84 KiB

This diff is collapsed.
name: rnaseq_pipeline
name: rhea
channels:
- bioconda
- conda-forge
......
name: rnaseq_pipeline
name: rhea
channels:
- conda-forge
- defaults
......
name: rnaseq_pipeline
name: rhea
channels:
- defaults
dependencies:
......
# RNAseq pipeline documentation
# Rhea workflow documentation
This document describes the individual rules of the pipeline for information purposes. For instructions on installation and usage please refer to the [README](README.md).
## Overview
......@@ -14,11 +14,16 @@ This document describes the individual rules of the pipeline for information pur
* **star_rpm**
* **rename_star_rpm_for_alfa**
* **calculate_TIN_scores**
* **merge_TIN_scores**
* **plot_TIN_scores**
* **salmon_quantmerge_genes**
* **salmon_quantmerge_transcripts**
* **generate_alfa_index**
* **alfa_qc**
* **alfa_qc_all_samples**
* **prepare_files_for_report**
* **prepare_MultiQC_config**
* **MULTIQC_report**
### Sequencing mode specific
* **(pe_)fastqc**
......@@ -160,6 +165,22 @@ Given a set of BAM files and a gene annotation BED file, calculates the Transcri
#### merge_TIN_scores
Concatenates the tsv files of all samples into one wider table.
**Input:** TIN score tsv files per sample
**Output:** TIN score tsv file for all samples
#### plot_TIN_scores
Generates sample-wise [boxplots](https://en.wikipedia.org/wiki/Box_plot) of TIN scores.
**Input:** TIN score tsv file for all samples
**Output:** .pdf and .png files with boxplots
#### salmon_quantmerge_genes
Merge the salmon quantification *gene* results for all samples of same sequencing mode into a single file. Do this for tpm and number of reads separately.
......@@ -197,7 +218,29 @@ The main output of ALFA are two plots, `ALFA_Biotypes.pdf` and `ALFA_Categories.
Combine the output of all samples into one plot generated by [ALFA](https://github.com/biocompibens/ALFA).
**Input:** ALFA_feature_counts.tsv from each sample in `samples.tsv`
**Output:** ALFA_Biotypes.pdf and ALFA_Categories.pdf for all samples together
**Output:** ALFA_Biotypes.pdf and ALFA_Categories.pdf for all samples together
#### prepare_files_for_report
This is an internal rule with a `run` directive. It gathers all the output files, restructures the `log` and `results` directories and modifies some `stdout` and `stderr` streams of previous rules so that sample names are parsed properly in the final report.
#### prepare_MultiQC_config
Prepares a dedicated config file for [MultiQC](https://multiqc.info/).
**Input:** Currently, the directories created by `prepare_files_for_report` serve as input.
**Output:** Config file in .yaml format
#### MULTIQC_report
Creates an interactive report after the pipeline is finished. [MultiQC](https://multiqc.info/) gathers results and logs from distinct bioinformatics tools, parses them and presents the output graphically in an HTML file.
**Input:** Config file for MultiQC in .yaml format
**Output:** Directory with automatically generated HTML report
### Sequencing mode specific rules
......@@ -315,3 +358,4 @@ Spliced Transcripts Alignment to a Reference; Read the [Publication](https://www
* -l: fragment length, user specified as `mean`
* -s: fragment length SD, user specified as `sd`
......@@ -31,7 +31,7 @@ snakemake \
--printshellcmds \
--rerun-incomplete \
--use-singularity \
--singularity-args="--bind ${PWD}/../input_files" \
--singularity-args="--bind ${PWD}/../input_files,${PWD}/../../images" \
--verbose \
results/ALFA/ALFA_plots.Categories.pdf
......
......@@ -16,66 +16,66 @@ results/star_indexes/homo_sapiens/75/STAR_index/sjdbInfo.txt
results/star_indexes/homo_sapiens/75/STAR_index/sjdbList.fromGTF.out.tab
results/star_indexes/homo_sapiens/75/STAR_index/sjdbList.out.tab
results/star_indexes/homo_sapiens/75/STAR_index/transcriptInfo.tab
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_adapters_mate1.fastq
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_adapters_mate2.fastq
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_polya_mate1.fastq
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_polya_mate2.fastq
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/map_genome/synthetic_10_reads_paired_synthetic_10_reads_paired_SJ.out.tab
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/fastqc_data.txt
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/fastqc.fo
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/summary.txt
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/adapter_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/duplication_levels.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_n_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_quality.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_sequence_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_gc_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_quality.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_tile_quality.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/sequence_length_distribution.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/fastqc_data.txt
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/fastqc.fo
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/summary.txt
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/adapter_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/duplication_levels.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_base_n_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_base_quality.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_base_sequence_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_sequence_gc_content.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_sequence_quality.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_tile_quality.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/sequence_length_distribution.png
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/quant_kallisto/abundance.tsv
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/quant_kallisto/pseudoalignments.bam
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/quant_kallisto/synthetic_10_reads_paired_synthetic_10_reads_paired.kallisto.pseudo.sam
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/lib_format_counts.json
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/ambig_info.tsv
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/expected_bias
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/observed_bias
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/observed_bias_3p
results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/aux_info/unmapped_names.txt
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_adapters_mate1.fastq
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_polya_mate1.fastq
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_SJ.out.tab
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/fastqc_data.txt
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/fastqc.fo
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/summary.txt
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/adapter_content.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/duplication_levels.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_n_content.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_quality.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_sequence_content.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_gc_content.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_quality.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_tile_quality.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/sequence_length_distribution.png
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/abundance.tsv
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/pseudoalignments.bam
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.kallisto.pseudo.sam
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/lib_format_counts.json
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/aux_info/ambig_info.tsv
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/aux_info/expected_bias
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/aux_info/observed_bias
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/aux_info/observed_bias_3p
results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/aux_info/unmapped_names.txt
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_adapters_mate1.fastq
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_adapters_mate2.fastq
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_polya_mate1.fastq
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/synthetic_10_reads_paired_synthetic_10_reads_paired.remove_polya_mate2.fastq
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/map_genome/synthetic_10_reads_paired_synthetic_10_reads_paired_SJ.out.tab
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/fastqc_data.txt
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/fastqc.fo
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/summary.txt
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/adapter_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/duplication_levels.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_n_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_quality.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_sequence_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_gc_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_quality.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_tile_quality.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate1_fastqc/synthetic.mate_1_fastqc/Images/sequence_length_distribution.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/fastqc_data.txt
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/fastqc.fo
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/summary.txt
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/adapter_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/duplication_levels.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_base_n_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_base_quality.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_base_sequence_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_sequence_gc_content.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_sequence_quality.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/per_tile_quality.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/mate2_fastqc/synthetic.mate_2_fastqc/Images/sequence_length_distribution.png
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/quant_kallisto/abundance.tsv
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/quant_kallisto/pseudoalignments.bam
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/quant_kallisto/synthetic_10_reads_paired_synthetic_10_reads_paired.kallisto.pseudo.sam
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/lib_format_counts.json
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/aux_info/ambig_info.tsv
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/aux_info/expected_bias
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/aux_info/observed_bias
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/aux_info/observed_bias_3p
results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/aux_info/unmapped_names.txt
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_adapters_mate1.fastq
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.remove_polya_mate1.fastq
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_SJ.out.tab
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/fastqc_data.txt
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/fastqc.fo
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/summary.txt
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/adapter_content.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/duplication_levels.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_n_content.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_quality.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_base_sequence_content.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_gc_content.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_sequence_quality.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/per_tile_quality.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/mate1_fastqc/synthetic.mate_1_fastqc/Images/sequence_length_distribution.png
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/abundance.tsv
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/pseudoalignments.bam
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant_kallisto/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1.kallisto.pseudo.sam
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/lib_format_counts.json
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/aux_info/ambig_info.tsv
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/aux_info/expected_bias
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/aux_info/observed_bias
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/aux_info/observed_bias_3p
results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/aux_info/unmapped_names.txt
results/transcriptome/homo_sapiens/transcriptome.fa
......@@ -29,7 +29,7 @@ snakemake \
--printshellcmds \
--rerun-incomplete \
--use-singularity \
--singularity-args="--bind ${PWD}/../input_files" \
--singularity-args="--bind ${PWD}/../input_files,${PWD}/../../images" \
--verbose
# Check md5 sum of some output files
......@@ -55,7 +55,7 @@ md5sum --check "expected_output.md5"
echo "Verifying STAR output"
result=$(bedtools intersect -F 1 -v -bed \
-a ../input_files/synthetic.mate_1.bed \
-b results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Aligned.sortedByCoord.out.bam \
-b results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Aligned.sortedByCoord.out.bam \
| wc -l)
if [ $result != "0" ]; then
echo "Alignments for mate 1 reads are not consistent with ground truth"
......@@ -63,7 +63,7 @@ if [ $result != "0" ]; then
fi
result=$(bedtools intersect -F 1 -v -bed \
-a <(cat ../input_files/synthetic.mate_1.bed ../input_files/synthetic.mate_2.bed) \
-b results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/map_genome/synthetic_10_reads_paired_synthetic_10_reads_paired_Aligned.sortedByCoord.out.bam \
-b results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/map_genome/synthetic_10_reads_paired_synthetic_10_reads_paired_Aligned.sortedByCoord.out.bam \
| wc -l)
if [ $result != "0" ]; then
echo "Alignments for mate 1 reads are not consistent with ground truth"
......@@ -73,9 +73,9 @@ fi
# Check whether Salmon assigns reads to expected genes
echo "Verifying Salmon output"
diff \
<(cat results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat ../input_files/synthetic.mate_1.bed | cut -f7 | sort | uniq -c | sort -k2nr | awk '{printf($2"\t"$1"\n")}')
diff \
<(cat results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat ../input_files/synthetic.mate_1.bed | cut -f7 | sort | uniq -c | sort -k2nr | awk '{printf($2"\t"$1"\n")}')
......@@ -31,7 +31,7 @@ snakemake \
--printshellcmds \
--rerun-incomplete \
--use-singularity \
--singularity-args="--bind ${PWD}/../input_files" \
--singularity-args="--bind ${PWD}/../input_files,${PWD}/../../images" \
--verbose
# Check md5 sum of some output files
......@@ -57,7 +57,7 @@ md5sum --check "expected_output.md5"
echo "Verifying STAR output"
result=$(bedtools intersect -F 1 -v -bed \
-a ../input_files/synthetic.mate_1.bed \
-b results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Aligned.sortedByCoord.out.bam \
-b results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/map_genome/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1_Aligned.sortedByCoord.out.bam \
| wc -l)
if [ $result != "0" ]; then
echo "Alignments for mate 1 reads are not consistent with ground truth"
......@@ -65,7 +65,7 @@ if [ $result != "0" ]; then
fi
result=$(bedtools intersect -F 1 -v -bed \
-a <(cat ../input_files/synthetic.mate_1.bed ../input_files/synthetic.mate_2.bed) \
-b results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/map_genome/synthetic_10_reads_paired_synthetic_10_reads_paired_Aligned.sortedByCoord.out.bam \
-b results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/map_genome/synthetic_10_reads_paired_synthetic_10_reads_paired_Aligned.sortedByCoord.out.bam \
| wc -l)
if [ $result != "0" ]; then
echo "Alignments for mate 1 reads are not consistent with ground truth"
......@@ -75,9 +75,9 @@ fi
# Check whether Salmon assigns reads to expected genes
echo "Verifying Salmon output"
diff \
<(cat results/single_end/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat results/samples/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/salmon_quant/synthetic_10_reads_mate_1_synthetic_10_reads_mate_1/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat ../input_files/synthetic.mate_1.bed | cut -f7 | sort | uniq -c | sort -k2nr | awk '{printf($2"\t"$1"\n")}')
diff \
<(cat results/paired_end/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat results/samples/synthetic_10_reads_paired_synthetic_10_reads_paired/salmon_quant/synthetic_10_reads_paired_synthetic_10_reads_paired/quant.genes.sf | cut -f1,5 | tail -n +2 | sort -k1,1) \
<(cat ../input_files/synthetic.mate_1.bed | cut -f7 | sort | uniq -c | sort -k2nr | awk '{printf($2"\t"$1"\n")}')
......@@ -177,7 +177,7 @@ rule pe_remove_polya_cutadapt:
-o {output.reads1} \
-p {output.reads2} \
{input.reads1} \
{input.reads2};) \
{input.reads2}) \
1> {log.stdout} 2>{log.stderr}"
......
......@@ -84,7 +84,7 @@ rule remove_adapters_cutadapt:
-a {params.adapters_3} \
-g {params.adapters_5} \
-o {output.reads} \
{input.reads};) \
{input.reads}) \
1> {log.stdout} 2> {log.stderr}"
......