From fe8fafc542f0da2370ffa482194ce9fc7ded7a92 Mon Sep 17 00:00:00 2001 From: Christoph Stritt <stritt0001@login01.cluster.bc2.ch> Date: Tue, 24 Oct 2023 13:53:52 +0200 Subject: [PATCH] variantcalling works now --- .gitignore | 2 + assembly/README.md | 30 +++++---- assembly/workflow/Snakefile | 23 +++++-- assembly/workflow/rules/annotate.smk | 2 +- assembly/workflow/rules/circularize.smk | 2 +- assembly/workflow/rules/mapreads.smk | 2 +- variantcalling/README.md | 24 +++---- variantcalling/workflow/Snakefile | 64 ++++++++----------- .../workflow/scripts/combine_assemblies.py | 7 +- 9 files changed, 79 insertions(+), 77 deletions(-) diff --git a/.gitignore b/.gitignore index 7e6e344..479190f 100755 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ assembly/.cache assembly/rulegraph.pdf assembly/resources/bakta_db facienda.md +variantcalling/container/pggb_latest.sif +variantcalling/.snakemake diff --git a/assembly/README.md b/assembly/README.md index 36e5237..670400b 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -17,19 +17,22 @@ The user needs to provide two things to run the workflow on her samples: In the file config/config.yaml some global parameters can be set: ```yaml -samples: config/samples.tsv -outdir: ./results -output_prefix: pb_bernese +# REQUIRED +samples: config/samples.tsv # Path to sample table, no header, tab-separated +outdir: ./results # Path to output directory -ref: - genome_size: 4.4m - gbf: resources/H37Rv.gbf +# OPTIONAL +annotate: "Yes" # Annotate assembly with bakta Yes/No -bakta_db: resources/bakta_db +ref: + genome_size: 4.4m # + gbf: resources/H37Rv.gbf # Used for bakta annotation step -threads_per_job: 4 +bakta_db: resources/bakta_db # Used for bakta annotation step -keep_intermediate: "Yes" +threads_per_job: 4 # Should match cpus-per-task in the snakemake command + +keep_intermediate: "Yes" # Not implemented yet... ``` @@ -57,21 +60,24 @@ It's convenient to run snakemake in a screen, so we can do other things on scico ``` # Open screen -screen -r assembly +screen -R assembly # Load Snakemake module ml snakemake/6.6.1-foss-2021a # Dry run +snakemake -n \ + --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/config.yml # Real run snakemake \ - --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \ + --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/config.yml \ --jobs 4 \ + --keep-going \ --latency-wait 60 \ --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \ - --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j" + --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/pbassembly.e%j" # Leave the screen: CTRL+a+d diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile index bd247ba..d47aae1 100755 --- a/assembly/workflow/Snakefile +++ b/assembly/workflow/Snakefile @@ -9,15 +9,24 @@ include: "rules/assemble.smk" include: "rules/circularize.smk" -include: "rules/annotate.smk" +if config["annotate"] == "Yes": + include: "rules/annotate.smk" include: "rules/summarize.smk" include: "rules/mapreads.smk" -rule all: - input: - config["outdir"] + "/read_summaries.tsv", - config["outdir"] + "/assembly_summaries.tsv", - expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()), - expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) +if config["annotate"] == "Yes": + rule all: + input: + config["outdir"] + "/read_summaries.tsv", + config["outdir"] + "/assembly_summaries.tsv", + expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()), + expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) + +else: + rule all: + input: + config["outdir"] + "/read_summaries.tsv", + config["outdir"] + "/assembly_summaries.tsv", + expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) diff --git a/assembly/workflow/rules/annotate.smk b/assembly/workflow/rules/annotate.smk index 22bc5a2..feeae2e 100755 --- a/assembly/workflow/rules/annotate.smk +++ b/assembly/workflow/rules/annotate.smk @@ -1,6 +1,6 @@ rule bakta: - input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + input: config["outdir"] + "/{sample}/{sample}.fasta" output: config["outdir"] + "/{sample}/bakta/{sample}.gff3" params: bakta_db = config["bakta_db"], diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk index 8d1faf0..89f2f78 100755 --- a/assembly/workflow/rules/circularize.smk +++ b/assembly/workflow/rules/circularize.smk @@ -79,7 +79,7 @@ rule rename: prefix = "{sample}", keep_intermediate = config["keep_intermediate"] - output: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + output: config["outdir"] + "/{sample}/{sample}.fasta" run: diff --git a/assembly/workflow/rules/mapreads.smk b/assembly/workflow/rules/mapreads.smk index 3fd0440..b9ef631 100755 --- a/assembly/workflow/rules/mapreads.smk +++ b/assembly/workflow/rules/mapreads.smk @@ -3,7 +3,7 @@ rule map_LR: input: reads = lambda wildcards: samples[wildcards.sample].longread_fastq, - assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + assembly = config["outdir"] + "/{sample}/{sample}.fasta" output: config["outdir"] + "/{sample}/remapping/longreads.bam" params: intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam" diff --git a/variantcalling/README.md b/variantcalling/README.md index 5d5c070..1b8a2da 100755 --- a/variantcalling/README.md +++ b/variantcalling/README.md @@ -13,28 +13,24 @@ singularity pull docker://ghcr.io/pangenome/pggb:latest ## config/config.yaml ```yaml - -reference: - -threads: 4 - -output_dir: ./results - -pggb: - p: 99 - s: 5k - +outdir: /home/cristobal/TB/projects/pacbio_microscale/results/variants/assembly +samples: /home/cristobal/TB/projects/pacbio_microscale/results/variants/assembly/samples.tsv +reference: N1426 +threads: 20 ``` ## Dry run -snakemake -n +``` +snakemake -n --configfile +``` ## Run ``` snakemake \ --jobs 1 \ + --configfile ~/TB/projects/pacbio_microscale/results/variants/assembly/config.yml \ --latency-wait 60 \ --use-conda --use-envmodules \ - --use-singularity --singularity-args "--bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/workflows --bind /scratch" \ - --cluster "sbatch --job-name=bernese_variants --cpus-per-task=20 --mem-per-cpu=1G --time=06:00:00 --qos=6hours --output=bernese_variants.o%j --error=bernese_variants.e%j" + --use-singularity --singularity-args "--bind /scicore/home/gagneux/stritt0001 --bind /scratch" \ + --cluster "sbatch --job-name=pggb --cpus-per-task=20 --mem-per-cpu=1G --time=06:00:00 --qos=6hours --output=pggb.o%j --error=pggb.e%j" ``` diff --git a/variantcalling/workflow/Snakefile b/variantcalling/workflow/Snakefile index cc39245..0b62ac0 100755 --- a/variantcalling/workflow/Snakefile +++ b/variantcalling/workflow/Snakefile @@ -1,51 +1,39 @@ -samples = [ -"PB000150", # whichever reference -"PB000152", -"PB000153", -"PB000155", # Basal strain in both ML SNP tree and tip dated BEAST tree (P034) -#"PB000175", # ignore outgroup for variant calling, to avoid complications with nested variants -"PB000177", -"PB000178", -"PB000180", -"PB000182", -"PB000183", -"PB000184", -"PB000186", -"PB000187", -"PB000189", -"PB000190", -"PB000191", -"PB000192", -#"PB000193", -"PB000194" -] +configfile: "config/config.yml" + +import pandas as pd + +samples = pd.read_table(config["samples"], header=None ) + rule all: input: - "results/vg/bernese.variants.vcf", - "results/raxml-ng/core_gene_alignment.aln.raxml.bestTree" + config["outdir"] + "/variants.vcf" rule combine_assemblies: - input: expand('/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/workflows/assemblySMK/results/{sample}/assembly.circularized.renamed.fasta', sample = samples) + input: list(samples[1]) output: - fasta = "results/single_contig_assemblies.fasta", - discarded = "results/discarded_assemblies.txt" - conda: "config/biopython.yaml" + fasta = config["outdir"] + "/single_contig_assemblies.fasta", + discarded = config["outdir"] + "/discarded_assemblies.txt" + params: + outdir = config["outdir"] + conda: "../config/biopython.yaml" shell: """ - python workflow/scripts/combine_assemblies.py {input} + python workflow/scripts/combine_assemblies.py {params.outdir} {input} """ rule pggb: - input: "results/single_contig_assemblies.fasta" - output: "results/pggb/bernese.smooth.final.gfa" + input: config["outdir"] + "/single_contig_assemblies.fasta" + output: config["outdir"] + "/graph.smooth.final.gfa" threads: 20 params: - nr_strains = len(samples) - singularity: "./pggb_latest.sif" + nr_strains = len(samples), + outdir = config["outdir"], + outgraph = config["outdir"] + "/*smooth.final.gfa" + singularity: "container/pggb_latest.sif" shell: """ @@ -53,22 +41,22 @@ rule pggb: samtools faidx {input}.gz pggb -i {input}.gz \ - -o ./results/pggb \ + -o {params.outdir} \ -t {threads} \ -n {params.nr_strains} \ -p 99 \ -s 5k - cp results/pggb/*smooth.final.gfa {output} + cp {params.outgraph} {output} """ rule call_variants: - input: "results/pggb/bernese.smooth.final.gfa" - output: "results/vg/bernese.variants.vcf" + input: config["outdir"] + "/graph.smooth.final.gfa" + output: config["outdir"] + "/variants.vcf" params: - reference = "PB000155_1" - singularity: "./pggb_latest.sif" + reference = config["reference"] + singularity: "container/pggb_latest.sif" shell: """ diff --git a/variantcalling/workflow/scripts/combine_assemblies.py b/variantcalling/workflow/scripts/combine_assemblies.py index 3932157..d9b6ae1 100755 --- a/variantcalling/workflow/scripts/combine_assemblies.py +++ b/variantcalling/workflow/scripts/combine_assemblies.py @@ -6,10 +6,11 @@ import sys from Bio import SeqIO -assemblies = sys.argv[1:] +outdir = sys.argv[1] +assemblies = sys.argv[2:] -fasta_handle = open("results_sim/single_contig_assemblies.fasta", "w") -discarded = open("results_sim/discarded_assemblies.txt", "w") +fasta_handle = open(outdir + "/single_contig_assemblies.fasta", "w") +discarded = open(outdir + "/discarded_assemblies.txt", "w") for ASSEMBLY in assemblies: -- GitLab