From 1bef02ccc87ba049114c42fc2a3c86ea42655277 Mon Sep 17 00:00:00 2001 From: Christoph Stritt <stritt0001@login01.cluster.bc2.ch> Date: Thu, 19 Oct 2023 13:10:00 +0200 Subject: [PATCH] Paths fixed, works on sciCORE now --- .gitignore | 4 +- assembly/README.md | 65 ++++++++++++++++--------- assembly/workflow/Snakefile | 4 +- assembly/workflow/rules/annotate.smk | 11 +++-- assembly/workflow/rules/circularize.smk | 6 +-- assembly/workflow/rules/common.smk | 1 - assembly/workflow/rules/mapreads.smk | 24 ++------- assembly/workflow/rules/summarize.smk | 6 +-- 8 files changed, 63 insertions(+), 58 deletions(-) diff --git a/.gitignore b/.gitignore index d8d6f29..7e6e344 100755 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ pangenome assembly/container/assemblySC.sif assembly/.snakemake +assembly/.cache assembly/rulegraph.pdf -facienda.md \ No newline at end of file +assembly/resources/bakta_db +facienda.md diff --git a/assembly/README.md b/assembly/README.md index f3b9227..36e5237 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -1,43 +1,50 @@ # Genome assembly workflow -The user needs to modify two things to run the workflow on her samples: +The genome assembly workflow includes the following tools/steps: + - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, + - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly. + - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA. + - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly. + - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies. -- the file config/samples.tsv, which contains the sample names and the corresponding paths to the HiFi consensus reads. -- the file config/config.yaml, which contains global options for the analysis +# Run the pipeline +The user needs to provide two things to run the workflow on her samples: +- a config file with some global options for the analysis +- a tab separate table, without header, that contains the sample names and the corresponding paths to the HiFi consensus reads. -## samples.tsv -This is a tab-separated table with a header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. - -## config.yaml +## config.yml In the file config/config.yaml some global parameters can be set: ```yaml - -samples: samples.inhouse.hifi.test.csv - -results_directory: ./results +samples: config/samples.tsv +outdir: ./results output_prefix: pb_bernese ref: genome_size: 4.4m gbf: resources/H37Rv.gbf -threads: 4 +bakta_db: resources/bakta_db + +threads_per_job: 4 keep_intermediate: "Yes" ``` +## samples.tsv +This is a tab-separated table with no header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. + + +## Snakemake dry run -## Dry-run ``` -snakemake -n +snakemake -n --configfile /path/to/config.yml ``` - -## Run the workflow on scicore +## Run the workflow on sciCORE Important: singularity containers most be given access to the file locations through the --bind argument. E.g. if the long reads are on /scicore/home/jean-jacques/reads/, add this location in the snakemake command (see also the full command below): @@ -60,12 +67,11 @@ ml snakemake/6.6.1-foss-2021a # Real run snakemake \ - --config samples=1.5 \ - --config outdir=./results \ - - --jobs 4 - --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/ --bind /scratch" \ - --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=pbassembly.o%j --error=pbassembly.e%j" + --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \ + --jobs 4 \ + --latency-wait 60 \ + --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \ + --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j" # Leave the screen: CTRL+a+d @@ -73,4 +79,17 @@ snakemake \ # Return to the screen screen -r assembly -``` \ No newline at end of file +``` + +# Output +For each sample defined in the samples table, a folder is generated in the output directory. It contains: + +``` +assembly.circularized.renamed.fasta +bakta/ +circlator/ +flye/ +longqc/ +remapping/ + +``` diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile index aa6153d..bd247ba 100755 --- a/assembly/workflow/Snakefile +++ b/assembly/workflow/Snakefile @@ -1,5 +1,5 @@ -containerized: "container/assemblySMK.sif" +containerized: "container/assemblySC.sif" include: "rules/common.smk" @@ -19,5 +19,5 @@ rule all: input: config["outdir"] + "/read_summaries.tsv", config["outdir"] + "/assembly_summaries.tsv", - expand(config["outdir"] + "/{sample}/{sample}.gff", sample = samples.keys()), + expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()), expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) diff --git a/assembly/workflow/rules/annotate.smk b/assembly/workflow/rules/annotate.smk index b8a55bf..22bc5a2 100755 --- a/assembly/workflow/rules/annotate.smk +++ b/assembly/workflow/rules/annotate.smk @@ -1,24 +1,25 @@ rule bakta: input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" - output: config["outdir"] + "/{sample}/{sample}.gff" + output: config["outdir"] + "/{sample}/bakta/{sample}.gff3" params: bakta_db = config["bakta_db"], refgbf = config["ref"]["gbf"], asm = "{sample}", - outdir = config["outdir"] + "/{sample}/", + outdir = config["outdir"] + "/{sample}/bakta", threads = config["threads_per_job"] shell: """ bakta {input} \ --db {params.bakta_db} \ + --output {params.outdir} \ --prefix {params.asm} \ --genus Mycobacterium \ --species tuberculosis \ - --strain {params.sample} \ - --locus-tag {params.sample} \ - --complete --compatible \ + --strain {params.asm} \ + --locus-tag {params.asm} \ + --complete --compliant --force \ --proteins {params.refgbf} \ --keep-contig-headers \ --threads {params.threads} diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk index befa762..8d1faf0 100755 --- a/assembly/workflow/rules/circularize.smk +++ b/assembly/workflow/rules/circularize.smk @@ -1,7 +1,7 @@ rule circlator_mapreads: input: - assembly = config["outdir"] + "/{sample}/assembly.fasta", + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq) output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam" params: @@ -29,7 +29,7 @@ rule circlator_localassembly: output: config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta" params: outdir = config["outdir"] + "/{sample}/circlator/03.assemble", - threads = config["threads"] + threads = config["threads_per_job"] shell: """ flye --pacbio-hifi {input} --out-dir {params.outdir} --genome-size 100k --threads {params.threads} @@ -38,7 +38,7 @@ rule circlator_localassembly: rule circlator_merge: input: - assembly = config["outdir"] + "/{sample}/assembly.fasta", + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", localassembly = config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta" output: config["outdir"] + "/{sample}/circlator/04.merge.fasta" params: diff --git a/assembly/workflow/rules/common.smk b/assembly/workflow/rules/common.smk index eac2fbc..b26c43f 100755 --- a/assembly/workflow/rules/common.smk +++ b/assembly/workflow/rules/common.smk @@ -8,7 +8,6 @@ class sample: samples = {} with open(config["samples"]) as f: - next(f) for line in f: fields = line.strip().split("\t") samples[fields[0]] = sample(fields) diff --git a/assembly/workflow/rules/mapreads.smk b/assembly/workflow/rules/mapreads.smk index c511055..3fd0440 100755 --- a/assembly/workflow/rules/mapreads.smk +++ b/assembly/workflow/rules/mapreads.smk @@ -1,29 +1,13 @@ -rule map_SR: - input: - reads = lambda wildcards: samples[wildcards.sample].shortread_fastq, - assembly = "results/{sample}/assembly.circularized.renamed.fasta" - output: "results/{sample}/remapping/shortreads.bam" - params: - intermediate = "results/{sample}/remapping/shortreads.raw.bam" - threads: config["threads"] - shell: - """ - minimap2 -ax sr {input.assembly} {input.reads} | samtools view -hu > {params.intermediate} - samtools sort -@ {threads} {params.intermediate} -o {output} - samtools index {output} - rm {params.intermediate} - - """ rule map_LR: input: reads = lambda wildcards: samples[wildcards.sample].longread_fastq, - assembly = "results/{sample}/assembly.circularized.renamed.fasta" - output: "results/{sample}/remapping/longreads.bam" + assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + output: config["outdir"] + "/{sample}/remapping/longreads.bam" params: - intermediate = "results/{sample}/remapping/longreads.raw.bam" - threads: config["threads"] + intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam" + threads: config["threads_per_job"] shell: """ minimap2 -ax map-hifi {input.assembly} {input.reads} | samtools view -hu > {params.intermediate} diff --git a/assembly/workflow/rules/summarize.smk b/assembly/workflow/rules/summarize.smk index c64e64a..e58fa97 100755 --- a/assembly/workflow/rules/summarize.smk +++ b/assembly/workflow/rules/summarize.smk @@ -10,7 +10,7 @@ rule read_summary: for FILE in input: - sample = FILE.split("/")[1] + sample = FILE.split("/")[-3] d = json.load(open(FILE)) d_flat = {} @@ -52,7 +52,7 @@ rule read_summary: rule assembly_summaries: - input: expand("{outdir}/{sample}/assembly_info.txt", outdir = config["outdir"], sample = samples.keys()) + input: expand("{outdir}/{sample}/flye/assembly_info.txt", outdir = config["outdir"], sample = samples.keys()) output: summaries = config["outdir"] + "/assembly_summaries.tsv", single_contig_assemblies = config["outdir"] + "/single_contig_samples.txt" @@ -65,7 +65,7 @@ rule assembly_summaries: for FILE in input: - sample = FILE.split("/")[1] + sample = FILE.split("/")[-3] contig_count = 0 with open(FILE) as f: -- GitLab