diff --git a/.gitignore b/.gitignore index d8d6f2916ca0c3747f240c99d386278cb4c3dda5..7e6e3444c93e30a2373e62f129b8848f13bc3170 100755 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ pangenome assembly/container/assemblySC.sif assembly/.snakemake +assembly/.cache assembly/rulegraph.pdf -facienda.md \ No newline at end of file +assembly/resources/bakta_db +facienda.md diff --git a/assembly/README.md b/assembly/README.md index f3b9227a112a4dd84e6ada1cfc39d7b57e94ef67..36e523782949dc9e37ad3413ca033c7fde5a9172 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -1,43 +1,50 @@ # Genome assembly workflow -The user needs to modify two things to run the workflow on her samples: +The genome assembly workflow includes the following tools/steps: + - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, + - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly. + - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA. + - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly. + - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies. -- the file config/samples.tsv, which contains the sample names and the corresponding paths to the HiFi consensus reads. -- the file config/config.yaml, which contains global options for the analysis +# Run the pipeline +The user needs to provide two things to run the workflow on her samples: +- a config file with some global options for the analysis +- a tab separate table, without header, that contains the sample names and the corresponding paths to the HiFi consensus reads. -## samples.tsv -This is a tab-separated table with a header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. - -## config.yaml +## config.yml In the file config/config.yaml some global parameters can be set: ```yaml - -samples: samples.inhouse.hifi.test.csv - -results_directory: ./results +samples: config/samples.tsv +outdir: ./results output_prefix: pb_bernese ref: genome_size: 4.4m gbf: resources/H37Rv.gbf -threads: 4 +bakta_db: resources/bakta_db + +threads_per_job: 4 keep_intermediate: "Yes" ``` +## samples.tsv +This is a tab-separated table with no header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. + + +## Snakemake dry run -## Dry-run ``` -snakemake -n +snakemake -n --configfile /path/to/config.yml ``` - -## Run the workflow on scicore +## Run the workflow on sciCORE Important: singularity containers most be given access to the file locations through the --bind argument. E.g. if the long reads are on /scicore/home/jean-jacques/reads/, add this location in the snakemake command (see also the full command below): @@ -60,12 +67,11 @@ ml snakemake/6.6.1-foss-2021a # Real run snakemake \ - --config samples=1.5 \ - --config outdir=./results \ - - --jobs 4 - --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/ --bind /scratch" \ - --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=pbassembly.o%j --error=pbassembly.e%j" + --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \ + --jobs 4 \ + --latency-wait 60 \ + --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \ + --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j" # Leave the screen: CTRL+a+d @@ -73,4 +79,17 @@ snakemake \ # Return to the screen screen -r assembly -``` \ No newline at end of file +``` + +# Output +For each sample defined in the samples table, a folder is generated in the output directory. It contains: + +``` +assembly.circularized.renamed.fasta +bakta/ +circlator/ +flye/ +longqc/ +remapping/ + +``` diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile index aa6153d077ed89bd3db43f6c1613b52272aa1fbc..bd247bace07ed543a7b3cc1d62159df53d4c795a 100755 --- a/assembly/workflow/Snakefile +++ b/assembly/workflow/Snakefile @@ -1,5 +1,5 @@ -containerized: "container/assemblySMK.sif" +containerized: "container/assemblySC.sif" include: "rules/common.smk" @@ -19,5 +19,5 @@ rule all: input: config["outdir"] + "/read_summaries.tsv", config["outdir"] + "/assembly_summaries.tsv", - expand(config["outdir"] + "/{sample}/{sample}.gff", sample = samples.keys()), + expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()), expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) diff --git a/assembly/workflow/rules/annotate.smk b/assembly/workflow/rules/annotate.smk index b8a55bf4fca81a628427bb6343c9ab6802833173..22bc5a204c317bebbc54b36dc77a9019b0a053f2 100755 --- a/assembly/workflow/rules/annotate.smk +++ b/assembly/workflow/rules/annotate.smk @@ -1,24 +1,25 @@ rule bakta: input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" - output: config["outdir"] + "/{sample}/{sample}.gff" + output: config["outdir"] + "/{sample}/bakta/{sample}.gff3" params: bakta_db = config["bakta_db"], refgbf = config["ref"]["gbf"], asm = "{sample}", - outdir = config["outdir"] + "/{sample}/", + outdir = config["outdir"] + "/{sample}/bakta", threads = config["threads_per_job"] shell: """ bakta {input} \ --db {params.bakta_db} \ + --output {params.outdir} \ --prefix {params.asm} \ --genus Mycobacterium \ --species tuberculosis \ - --strain {params.sample} \ - --locus-tag {params.sample} \ - --complete --compatible \ + --strain {params.asm} \ + --locus-tag {params.asm} \ + --complete --compliant --force \ --proteins {params.refgbf} \ --keep-contig-headers \ --threads {params.threads} diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk index befa76253cbe02261ff3dd43b3addbd7845f5d9a..8d1faf0c346125809cdcb4b064dac91fd2d145ec 100755 --- a/assembly/workflow/rules/circularize.smk +++ b/assembly/workflow/rules/circularize.smk @@ -1,7 +1,7 @@ rule circlator_mapreads: input: - assembly = config["outdir"] + "/{sample}/assembly.fasta", + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq) output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam" params: @@ -29,7 +29,7 @@ rule circlator_localassembly: output: config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta" params: outdir = config["outdir"] + "/{sample}/circlator/03.assemble", - threads = config["threads"] + threads = config["threads_per_job"] shell: """ flye --pacbio-hifi {input} --out-dir {params.outdir} --genome-size 100k --threads {params.threads} @@ -38,7 +38,7 @@ rule circlator_localassembly: rule circlator_merge: input: - assembly = config["outdir"] + "/{sample}/assembly.fasta", + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", localassembly = config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta" output: config["outdir"] + "/{sample}/circlator/04.merge.fasta" params: diff --git a/assembly/workflow/rules/common.smk b/assembly/workflow/rules/common.smk index eac2fbc0f20255a5105cdacfd40dd26496a3afe4..b26c43f13c757fd49d4a788592560e220ef2cef6 100755 --- a/assembly/workflow/rules/common.smk +++ b/assembly/workflow/rules/common.smk @@ -8,7 +8,6 @@ class sample: samples = {} with open(config["samples"]) as f: - next(f) for line in f: fields = line.strip().split("\t") samples[fields[0]] = sample(fields) diff --git a/assembly/workflow/rules/mapreads.smk b/assembly/workflow/rules/mapreads.smk index c5110553d587dc034d9fe3de669c8c2288ef9022..3fd0440e0b8a9da99a63ec0dc5fcc62a7c912507 100755 --- a/assembly/workflow/rules/mapreads.smk +++ b/assembly/workflow/rules/mapreads.smk @@ -1,29 +1,13 @@ -rule map_SR: - input: - reads = lambda wildcards: samples[wildcards.sample].shortread_fastq, - assembly = "results/{sample}/assembly.circularized.renamed.fasta" - output: "results/{sample}/remapping/shortreads.bam" - params: - intermediate = "results/{sample}/remapping/shortreads.raw.bam" - threads: config["threads"] - shell: - """ - minimap2 -ax sr {input.assembly} {input.reads} | samtools view -hu > {params.intermediate} - samtools sort -@ {threads} {params.intermediate} -o {output} - samtools index {output} - rm {params.intermediate} - - """ rule map_LR: input: reads = lambda wildcards: samples[wildcards.sample].longread_fastq, - assembly = "results/{sample}/assembly.circularized.renamed.fasta" - output: "results/{sample}/remapping/longreads.bam" + assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + output: config["outdir"] + "/{sample}/remapping/longreads.bam" params: - intermediate = "results/{sample}/remapping/longreads.raw.bam" - threads: config["threads"] + intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam" + threads: config["threads_per_job"] shell: """ minimap2 -ax map-hifi {input.assembly} {input.reads} | samtools view -hu > {params.intermediate} diff --git a/assembly/workflow/rules/summarize.smk b/assembly/workflow/rules/summarize.smk index c64e64a93fc6b81c80e7f3bf65b844de9e3be41f..e58fa9773bfcdde0f1877b07e36b8fb0e86fe578 100755 --- a/assembly/workflow/rules/summarize.smk +++ b/assembly/workflow/rules/summarize.smk @@ -10,7 +10,7 @@ rule read_summary: for FILE in input: - sample = FILE.split("/")[1] + sample = FILE.split("/")[-3] d = json.load(open(FILE)) d_flat = {} @@ -52,7 +52,7 @@ rule read_summary: rule assembly_summaries: - input: expand("{outdir}/{sample}/assembly_info.txt", outdir = config["outdir"], sample = samples.keys()) + input: expand("{outdir}/{sample}/flye/assembly_info.txt", outdir = config["outdir"], sample = samples.keys()) output: summaries = config["outdir"] + "/assembly_summaries.tsv", single_contig_assemblies = config["outdir"] + "/single_contig_samples.txt" @@ -65,7 +65,7 @@ rule assembly_summaries: for FILE in input: - sample = FILE.split("/")[1] + sample = FILE.split("/")[-3] contig_count = 0 with open(FILE) as f: