diff --git a/.gitignore b/.gitignore index 7e6e3444c93e30a2373e62f129b8848f13bc3170..479190f97f1a8c16a3319dceee47caf8be1a0178 100755 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,5 @@ assembly/.cache assembly/rulegraph.pdf assembly/resources/bakta_db facienda.md +variantcalling/container/pggb_latest.sif +variantcalling/.snakemake diff --git a/assembly/README.md b/assembly/README.md index 36e523782949dc9e37ad3413ca033c7fde5a9172..670400b106a079dbb8b5fc99d08142e88af4e9c9 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -17,19 +17,22 @@ The user needs to provide two things to run the workflow on her samples: In the file config/config.yaml some global parameters can be set: ```yaml -samples: config/samples.tsv -outdir: ./results -output_prefix: pb_bernese +# REQUIRED +samples: config/samples.tsv # Path to sample table, no header, tab-separated +outdir: ./results # Path to output directory -ref: - genome_size: 4.4m - gbf: resources/H37Rv.gbf +# OPTIONAL +annotate: "Yes" # Annotate assembly with bakta Yes/No -bakta_db: resources/bakta_db +ref: + genome_size: 4.4m # + gbf: resources/H37Rv.gbf # Used for bakta annotation step -threads_per_job: 4 +bakta_db: resources/bakta_db # Used for bakta annotation step -keep_intermediate: "Yes" +threads_per_job: 4 # Should match cpus-per-task in the snakemake command + +keep_intermediate: "Yes" # Not implemented yet... ``` @@ -57,21 +60,24 @@ It's convenient to run snakemake in a screen, so we can do other things on scico ``` # Open screen -screen -r assembly +screen -R assembly # Load Snakemake module ml snakemake/6.6.1-foss-2021a # Dry run +snakemake -n \ + --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/config.yml # Real run snakemake \ - --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \ + --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/config.yml \ --jobs 4 \ + --keep-going \ --latency-wait 60 \ --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \ - --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j" + --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/pbassembly.e%j" # Leave the screen: CTRL+a+d diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile index bd247bace07ed543a7b3cc1d62159df53d4c795a..d47aae17958fe3368d27171f4e3d5d969858e714 100755 --- a/assembly/workflow/Snakefile +++ b/assembly/workflow/Snakefile @@ -9,15 +9,24 @@ include: "rules/assemble.smk" include: "rules/circularize.smk" -include: "rules/annotate.smk" +if config["annotate"] == "Yes": + include: "rules/annotate.smk" include: "rules/summarize.smk" include: "rules/mapreads.smk" -rule all: - input: - config["outdir"] + "/read_summaries.tsv", - config["outdir"] + "/assembly_summaries.tsv", - expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()), - expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) +if config["annotate"] == "Yes": + rule all: + input: + config["outdir"] + "/read_summaries.tsv", + config["outdir"] + "/assembly_summaries.tsv", + expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()), + expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) + +else: + rule all: + input: + config["outdir"] + "/read_summaries.tsv", + config["outdir"] + "/assembly_summaries.tsv", + expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys()) diff --git a/assembly/workflow/rules/annotate.smk b/assembly/workflow/rules/annotate.smk index 22bc5a204c317bebbc54b36dc77a9019b0a053f2..feeae2eed8d1dca8d0930a359a6cabc71c9f8c92 100755 --- a/assembly/workflow/rules/annotate.smk +++ b/assembly/workflow/rules/annotate.smk @@ -1,6 +1,6 @@ rule bakta: - input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + input: config["outdir"] + "/{sample}/{sample}.fasta" output: config["outdir"] + "/{sample}/bakta/{sample}.gff3" params: bakta_db = config["bakta_db"], diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk index 8d1faf0c346125809cdcb4b064dac91fd2d145ec..89f2f783376deb23fa141fdaddd49b69c669e514 100755 --- a/assembly/workflow/rules/circularize.smk +++ b/assembly/workflow/rules/circularize.smk @@ -79,7 +79,7 @@ rule rename: prefix = "{sample}", keep_intermediate = config["keep_intermediate"] - output: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + output: config["outdir"] + "/{sample}/{sample}.fasta" run: diff --git a/assembly/workflow/rules/mapreads.smk b/assembly/workflow/rules/mapreads.smk index 3fd0440e0b8a9da99a63ec0dc5fcc62a7c912507..b9ef631790821f0ef68c232d82371992d3053537 100755 --- a/assembly/workflow/rules/mapreads.smk +++ b/assembly/workflow/rules/mapreads.smk @@ -3,7 +3,7 @@ rule map_LR: input: reads = lambda wildcards: samples[wildcards.sample].longread_fastq, - assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta" + assembly = config["outdir"] + "/{sample}/{sample}.fasta" output: config["outdir"] + "/{sample}/remapping/longreads.bam" params: intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam" diff --git a/variantcalling/README.md b/variantcalling/README.md index 5d5c07062c9a0280ca3fd240d834f4153a34f915..1b8a2daab5dbf670cfe0588fe16a8bb3e6eaef2b 100755 --- a/variantcalling/README.md +++ b/variantcalling/README.md @@ -13,28 +13,24 @@ singularity pull docker://ghcr.io/pangenome/pggb:latest ## config/config.yaml ```yaml - -reference: - -threads: 4 - -output_dir: ./results - -pggb: - p: 99 - s: 5k - +outdir: /home/cristobal/TB/projects/pacbio_microscale/results/variants/assembly +samples: /home/cristobal/TB/projects/pacbio_microscale/results/variants/assembly/samples.tsv +reference: N1426 +threads: 20 ``` ## Dry run -snakemake -n +``` +snakemake -n --configfile +``` ## Run ``` snakemake \ --jobs 1 \ + --configfile ~/TB/projects/pacbio_microscale/results/variants/assembly/config.yml \ --latency-wait 60 \ --use-conda --use-envmodules \ - --use-singularity --singularity-args "--bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/workflows --bind /scratch" \ - --cluster "sbatch --job-name=bernese_variants --cpus-per-task=20 --mem-per-cpu=1G --time=06:00:00 --qos=6hours --output=bernese_variants.o%j --error=bernese_variants.e%j" + --use-singularity --singularity-args "--bind /scicore/home/gagneux/stritt0001 --bind /scratch" \ + --cluster "sbatch --job-name=pggb --cpus-per-task=20 --mem-per-cpu=1G --time=06:00:00 --qos=6hours --output=pggb.o%j --error=pggb.e%j" ``` diff --git a/variantcalling/workflow/Snakefile b/variantcalling/workflow/Snakefile index cc39245ce400f64986af595a1ab0d4464165df94..0b62ac0878a1ace7c0f2a735bcff299584bb61b0 100755 --- a/variantcalling/workflow/Snakefile +++ b/variantcalling/workflow/Snakefile @@ -1,51 +1,39 @@ -samples = [ -"PB000150", # whichever reference -"PB000152", -"PB000153", -"PB000155", # Basal strain in both ML SNP tree and tip dated BEAST tree (P034) -#"PB000175", # ignore outgroup for variant calling, to avoid complications with nested variants -"PB000177", -"PB000178", -"PB000180", -"PB000182", -"PB000183", -"PB000184", -"PB000186", -"PB000187", -"PB000189", -"PB000190", -"PB000191", -"PB000192", -#"PB000193", -"PB000194" -] +configfile: "config/config.yml" + +import pandas as pd + +samples = pd.read_table(config["samples"], header=None ) + rule all: input: - "results/vg/bernese.variants.vcf", - "results/raxml-ng/core_gene_alignment.aln.raxml.bestTree" + config["outdir"] + "/variants.vcf" rule combine_assemblies: - input: expand('/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/workflows/assemblySMK/results/{sample}/assembly.circularized.renamed.fasta', sample = samples) + input: list(samples[1]) output: - fasta = "results/single_contig_assemblies.fasta", - discarded = "results/discarded_assemblies.txt" - conda: "config/biopython.yaml" + fasta = config["outdir"] + "/single_contig_assemblies.fasta", + discarded = config["outdir"] + "/discarded_assemblies.txt" + params: + outdir = config["outdir"] + conda: "../config/biopython.yaml" shell: """ - python workflow/scripts/combine_assemblies.py {input} + python workflow/scripts/combine_assemblies.py {params.outdir} {input} """ rule pggb: - input: "results/single_contig_assemblies.fasta" - output: "results/pggb/bernese.smooth.final.gfa" + input: config["outdir"] + "/single_contig_assemblies.fasta" + output: config["outdir"] + "/graph.smooth.final.gfa" threads: 20 params: - nr_strains = len(samples) - singularity: "./pggb_latest.sif" + nr_strains = len(samples), + outdir = config["outdir"], + outgraph = config["outdir"] + "/*smooth.final.gfa" + singularity: "container/pggb_latest.sif" shell: """ @@ -53,22 +41,22 @@ rule pggb: samtools faidx {input}.gz pggb -i {input}.gz \ - -o ./results/pggb \ + -o {params.outdir} \ -t {threads} \ -n {params.nr_strains} \ -p 99 \ -s 5k - cp results/pggb/*smooth.final.gfa {output} + cp {params.outgraph} {output} """ rule call_variants: - input: "results/pggb/bernese.smooth.final.gfa" - output: "results/vg/bernese.variants.vcf" + input: config["outdir"] + "/graph.smooth.final.gfa" + output: config["outdir"] + "/variants.vcf" params: - reference = "PB000155_1" - singularity: "./pggb_latest.sif" + reference = config["reference"] + singularity: "container/pggb_latest.sif" shell: """ diff --git a/variantcalling/workflow/scripts/combine_assemblies.py b/variantcalling/workflow/scripts/combine_assemblies.py index 39321570a9e9d355b9e249789fc125b40a2c4a34..d9b6ae169eb12efec93db947ac5b89ce196cc18c 100755 --- a/variantcalling/workflow/scripts/combine_assemblies.py +++ b/variantcalling/workflow/scripts/combine_assemblies.py @@ -6,10 +6,11 @@ import sys from Bio import SeqIO -assemblies = sys.argv[1:] +outdir = sys.argv[1] +assemblies = sys.argv[2:] -fasta_handle = open("results_sim/single_contig_assemblies.fasta", "w") -discarded = open("results_sim/discarded_assemblies.txt", "w") +fasta_handle = open(outdir + "/single_contig_assemblies.fasta", "w") +discarded = open(outdir + "/discarded_assemblies.txt", "w") for ASSEMBLY in assemblies: