Skip to content
Snippets Groups Projects
Commit fe8fafc5 authored by Christoph Stritt's avatar Christoph Stritt
Browse files

variantcalling works now

parent 1bef02cc
No related branches found
No related tags found
No related merge requests found
......@@ -5,3 +5,5 @@ assembly/.cache
assembly/rulegraph.pdf
assembly/resources/bakta_db
facienda.md
variantcalling/container/pggb_latest.sif
variantcalling/.snakemake
......@@ -17,19 +17,22 @@ The user needs to provide two things to run the workflow on her samples:
In the file config/config.yaml some global parameters can be set:
```yaml
samples: config/samples.tsv
outdir: ./results
output_prefix: pb_bernese
# REQUIRED
samples: config/samples.tsv # Path to sample table, no header, tab-separated
outdir: ./results # Path to output directory
ref:
genome_size: 4.4m
gbf: resources/H37Rv.gbf
# OPTIONAL
annotate: "Yes" # Annotate assembly with bakta Yes/No
bakta_db: resources/bakta_db
ref:
genome_size: 4.4m #
gbf: resources/H37Rv.gbf # Used for bakta annotation step
threads_per_job: 4
bakta_db: resources/bakta_db # Used for bakta annotation step
keep_intermediate: "Yes"
threads_per_job: 4 # Should match cpus-per-task in the snakemake command
keep_intermediate: "Yes" # Not implemented yet...
```
......@@ -57,21 +60,24 @@ It's convenient to run snakemake in a screen, so we can do other things on scico
```
# Open screen
screen -r assembly
screen -R assembly
# Load Snakemake module
ml snakemake/6.6.1-foss-2021a
# Dry run
snakemake -n \
--configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/config.yml
# Real run
snakemake \
--configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \
--configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/config.yml \
--jobs 4 \
--keep-going \
--latency-wait 60 \
--use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \
--cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j"
--cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/demo/pbassembly.e%j"
# Leave the screen: CTRL+a+d
......
......@@ -9,15 +9,24 @@ include: "rules/assemble.smk"
include: "rules/circularize.smk"
include: "rules/annotate.smk"
if config["annotate"] == "Yes":
include: "rules/annotate.smk"
include: "rules/summarize.smk"
include: "rules/mapreads.smk"
rule all:
input:
config["outdir"] + "/read_summaries.tsv",
config["outdir"] + "/assembly_summaries.tsv",
expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()),
expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys())
if config["annotate"] == "Yes":
rule all:
input:
config["outdir"] + "/read_summaries.tsv",
config["outdir"] + "/assembly_summaries.tsv",
expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()),
expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys())
else:
rule all:
input:
config["outdir"] + "/read_summaries.tsv",
config["outdir"] + "/assembly_summaries.tsv",
expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys())
rule bakta:
input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
input: config["outdir"] + "/{sample}/{sample}.fasta"
output: config["outdir"] + "/{sample}/bakta/{sample}.gff3"
params:
bakta_db = config["bakta_db"],
......
......@@ -79,7 +79,7 @@ rule rename:
prefix = "{sample}",
keep_intermediate = config["keep_intermediate"]
output: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
output: config["outdir"] + "/{sample}/{sample}.fasta"
run:
......
......@@ -3,7 +3,7 @@
rule map_LR:
input:
reads = lambda wildcards: samples[wildcards.sample].longread_fastq,
assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
assembly = config["outdir"] + "/{sample}/{sample}.fasta"
output: config["outdir"] + "/{sample}/remapping/longreads.bam"
params:
intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam"
......
......@@ -13,28 +13,24 @@ singularity pull docker://ghcr.io/pangenome/pggb:latest
## config/config.yaml
```yaml
reference:
threads: 4
output_dir: ./results
pggb:
p: 99
s: 5k
outdir: /home/cristobal/TB/projects/pacbio_microscale/results/variants/assembly
samples: /home/cristobal/TB/projects/pacbio_microscale/results/variants/assembly/samples.tsv
reference: N1426
threads: 20
```
## Dry run
snakemake -n
```
snakemake -n --configfile
```
## Run
```
snakemake \
--jobs 1 \
--configfile ~/TB/projects/pacbio_microscale/results/variants/assembly/config.yml \
--latency-wait 60 \
--use-conda --use-envmodules \
--use-singularity --singularity-args "--bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/workflows --bind /scratch" \
--cluster "sbatch --job-name=bernese_variants --cpus-per-task=20 --mem-per-cpu=1G --time=06:00:00 --qos=6hours --output=bernese_variants.o%j --error=bernese_variants.e%j"
--use-singularity --singularity-args "--bind /scicore/home/gagneux/stritt0001 --bind /scratch" \
--cluster "sbatch --job-name=pggb --cpus-per-task=20 --mem-per-cpu=1G --time=06:00:00 --qos=6hours --output=pggb.o%j --error=pggb.e%j"
```
samples = [
"PB000150", # whichever reference
"PB000152",
"PB000153",
"PB000155", # Basal strain in both ML SNP tree and tip dated BEAST tree (P034)
#"PB000175", # ignore outgroup for variant calling, to avoid complications with nested variants
"PB000177",
"PB000178",
"PB000180",
"PB000182",
"PB000183",
"PB000184",
"PB000186",
"PB000187",
"PB000189",
"PB000190",
"PB000191",
"PB000192",
#"PB000193",
"PB000194"
]
configfile: "config/config.yml"
import pandas as pd
samples = pd.read_table(config["samples"], header=None )
rule all:
input:
"results/vg/bernese.variants.vcf",
"results/raxml-ng/core_gene_alignment.aln.raxml.bestTree"
config["outdir"] + "/variants.vcf"
rule combine_assemblies:
input: expand('/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/workflows/assemblySMK/results/{sample}/assembly.circularized.renamed.fasta', sample = samples)
input: list(samples[1])
output:
fasta = "results/single_contig_assemblies.fasta",
discarded = "results/discarded_assemblies.txt"
conda: "config/biopython.yaml"
fasta = config["outdir"] + "/single_contig_assemblies.fasta",
discarded = config["outdir"] + "/discarded_assemblies.txt"
params:
outdir = config["outdir"]
conda: "../config/biopython.yaml"
shell:
"""
python workflow/scripts/combine_assemblies.py {input}
python workflow/scripts/combine_assemblies.py {params.outdir} {input}
"""
rule pggb:
input: "results/single_contig_assemblies.fasta"
output: "results/pggb/bernese.smooth.final.gfa"
input: config["outdir"] + "/single_contig_assemblies.fasta"
output: config["outdir"] + "/graph.smooth.final.gfa"
threads: 20
params:
nr_strains = len(samples)
singularity: "./pggb_latest.sif"
nr_strains = len(samples),
outdir = config["outdir"],
outgraph = config["outdir"] + "/*smooth.final.gfa"
singularity: "container/pggb_latest.sif"
shell:
"""
......@@ -53,22 +41,22 @@ rule pggb:
samtools faidx {input}.gz
pggb -i {input}.gz \
-o ./results/pggb \
-o {params.outdir} \
-t {threads} \
-n {params.nr_strains} \
-p 99 \
-s 5k
cp results/pggb/*smooth.final.gfa {output}
cp {params.outgraph} {output}
"""
rule call_variants:
input: "results/pggb/bernese.smooth.final.gfa"
output: "results/vg/bernese.variants.vcf"
input: config["outdir"] + "/graph.smooth.final.gfa"
output: config["outdir"] + "/variants.vcf"
params:
reference = "PB000155_1"
singularity: "./pggb_latest.sif"
reference = config["reference"]
singularity: "container/pggb_latest.sif"
shell:
"""
......
......@@ -6,10 +6,11 @@ import sys
from Bio import SeqIO
assemblies = sys.argv[1:]
outdir = sys.argv[1]
assemblies = sys.argv[2:]
fasta_handle = open("results_sim/single_contig_assemblies.fasta", "w")
discarded = open("results_sim/discarded_assemblies.txt", "w")
fasta_handle = open(outdir + "/single_contig_assemblies.fasta", "w")
discarded = open(outdir + "/discarded_assemblies.txt", "w")
for ASSEMBLY in assemblies:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment