From 1bef02ccc87ba049114c42fc2a3c86ea42655277 Mon Sep 17 00:00:00 2001
From: Christoph Stritt <stritt0001@login01.cluster.bc2.ch>
Date: Thu, 19 Oct 2023 13:10:00 +0200
Subject: [PATCH] Paths fixed, works on sciCORE now

---
 .gitignore                              |  4 +-
 assembly/README.md                      | 65 ++++++++++++++++---------
 assembly/workflow/Snakefile             |  4 +-
 assembly/workflow/rules/annotate.smk    | 11 +++--
 assembly/workflow/rules/circularize.smk |  6 +--
 assembly/workflow/rules/common.smk      |  1 -
 assembly/workflow/rules/mapreads.smk    | 24 ++-------
 assembly/workflow/rules/summarize.smk   |  6 +--
 8 files changed, 63 insertions(+), 58 deletions(-)

diff --git a/.gitignore b/.gitignore
index d8d6f29..7e6e344 100755
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,7 @@
 pangenome
 assembly/container/assemblySC.sif
 assembly/.snakemake
+assembly/.cache
 assembly/rulegraph.pdf
-facienda.md
\ No newline at end of file
+assembly/resources/bakta_db
+facienda.md
diff --git a/assembly/README.md b/assembly/README.md
index f3b9227..36e5237 100755
--- a/assembly/README.md
+++ b/assembly/README.md
@@ -1,43 +1,50 @@
 # Genome assembly workflow
 
-The user needs to modify two things to run the workflow on her samples:
+The genome assembly workflow includes the following tools/steps:
+  - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, 
+  - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly.
+  - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA.
+  - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly.
+  - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies.  
 
-- the file config/samples.tsv, which contains the sample names and the corresponding paths to the HiFi consensus reads. 
-- the file config/config.yaml, which contains global options for the analysis 
 
+# Run the pipeline
+The user needs to provide two things to run the workflow on her samples:
+- a config file with some global options for the analysis
+- a tab separate table, without header, that contains the sample names and the corresponding paths to the HiFi consensus reads. 
 
-## samples.tsv
-This is a tab-separated table with a header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. 
-
-## config.yaml
+## config.yml
 In the file config/config.yaml some global parameters can be set:
 
 ```yaml
-
-samples: samples.inhouse.hifi.test.csv
-
-results_directory: ./results
+samples: config/samples.tsv
+outdir: ./results
 output_prefix: pb_bernese
 
 ref:
   genome_size: 4.4m
   gbf: resources/H37Rv.gbf
 
-threads: 4
+bakta_db: resources/bakta_db
+
+threads_per_job: 4
 
 keep_intermediate: "Yes"
 
 ```
 
+## samples.tsv
+This is a tab-separated table with no header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. 
+
+
 
+## Snakemake dry run
 
-## Dry-run
 ```
-snakemake -n
+snakemake -n --configfile /path/to/config.yml
 ```
 
-
-## Run the workflow on scicore
+## Run the workflow on sciCORE
 
 Important: singularity containers most be given access to the file locations through the --bind argument. E.g. if the long reads are on /scicore/home/jean-jacques/reads/, add this location in the snakemake command (see also the full command below): 
 
@@ -60,12 +67,11 @@ ml snakemake/6.6.1-foss-2021a
 
 # Real run 
 snakemake \
- --config samples=1.5 \
- --config outdir=./results \
-
- --jobs 4 
- --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/ --bind /scratch" \
- --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=pbassembly.o%j --error=pbassembly.e%j"
+ --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \
+ --jobs 4 \
+ --latency-wait 60 \
+ --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \
+ --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j"
 
 
 # Leave the screen: CTRL+a+d
@@ -73,4 +79,17 @@ snakemake \
 # Return to the screen 
 screen -r assembly
 
-```
\ No newline at end of file
+```
+
+# Output
+For each sample defined in the samples table, a folder is generated in the output directory. It contains: 
+
+```
+assembly.circularized.renamed.fasta
+bakta/
+circlator/
+flye/
+longqc/
+remapping/
+
+```
diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile
index aa6153d..bd247ba 100755
--- a/assembly/workflow/Snakefile
+++ b/assembly/workflow/Snakefile
@@ -1,5 +1,5 @@
 
-containerized: "container/assemblySMK.sif"
+containerized: "container/assemblySC.sif"
 
 include: "rules/common.smk"
 
@@ -19,5 +19,5 @@ rule all:
     input:
         config["outdir"] + "/read_summaries.tsv",
         config["outdir"] + "/assembly_summaries.tsv", 
-        expand(config["outdir"] + "/{sample}/{sample}.gff", sample = samples.keys()),
+        expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()),
         expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys())
diff --git a/assembly/workflow/rules/annotate.smk b/assembly/workflow/rules/annotate.smk
index b8a55bf..22bc5a2 100755
--- a/assembly/workflow/rules/annotate.smk
+++ b/assembly/workflow/rules/annotate.smk
@@ -1,24 +1,25 @@
 
 rule bakta:
   input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
-  output: config["outdir"] + "/{sample}/{sample}.gff"
+  output: config["outdir"] + "/{sample}/bakta/{sample}.gff3"
   params:
     bakta_db = config["bakta_db"],
     refgbf = config["ref"]["gbf"],
     asm = "{sample}",
-    outdir = config["outdir"] + "/{sample}/",
+    outdir = config["outdir"] + "/{sample}/bakta",
     threads = config["threads_per_job"]
 
   shell:
     """
     bakta {input} \
     --db {params.bakta_db} \
+    --output {params.outdir} \
     --prefix {params.asm} \
     --genus Mycobacterium \
     --species tuberculosis \
-    --strain {params.sample} \
-    --locus-tag {params.sample} \
-    --complete --compatible \
+    --strain {params.asm} \
+    --locus-tag {params.asm} \
+    --complete --compliant --force \
     --proteins {params.refgbf} \
     --keep-contig-headers \
     --threads  {params.threads}
diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk
index befa762..8d1faf0 100755
--- a/assembly/workflow/rules/circularize.smk
+++ b/assembly/workflow/rules/circularize.smk
@@ -1,7 +1,7 @@
 
 rule circlator_mapreads:
     input: 
-        assembly = config["outdir"] + "/{sample}/assembly.fasta",
+        assembly = config["outdir"] + "/{sample}/flye/assembly.fasta",
         reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq)
     output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam"
     params:
@@ -29,7 +29,7 @@ rule circlator_localassembly:
     output: config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta"
     params:
         outdir = config["outdir"] + "/{sample}/circlator/03.assemble",
-        threads = config["threads"]
+        threads = config["threads_per_job"]
     shell:
         """
         flye --pacbio-hifi {input} --out-dir {params.outdir} --genome-size 100k --threads {params.threads}
@@ -38,7 +38,7 @@ rule circlator_localassembly:
 
 rule circlator_merge:
     input: 
-        assembly = config["outdir"]  + "/{sample}/assembly.fasta",
+        assembly = config["outdir"]  + "/{sample}/flye/assembly.fasta",
         localassembly = config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta"
     output: config["outdir"] + "/{sample}/circlator/04.merge.fasta"
     params:
diff --git a/assembly/workflow/rules/common.smk b/assembly/workflow/rules/common.smk
index eac2fbc..b26c43f 100755
--- a/assembly/workflow/rules/common.smk
+++ b/assembly/workflow/rules/common.smk
@@ -8,7 +8,6 @@ class sample:
 samples = {}
 
 with open(config["samples"]) as f:
-    next(f)
     for line in f:
         fields = line.strip().split("\t")
         samples[fields[0]] = sample(fields) 
diff --git a/assembly/workflow/rules/mapreads.smk b/assembly/workflow/rules/mapreads.smk
index c511055..3fd0440 100755
--- a/assembly/workflow/rules/mapreads.smk
+++ b/assembly/workflow/rules/mapreads.smk
@@ -1,29 +1,13 @@
 
-rule map_SR:
-    input:
-        reads = lambda wildcards: samples[wildcards.sample].shortread_fastq, 
-        assembly = "results/{sample}/assembly.circularized.renamed.fasta"
-    output: "results/{sample}/remapping/shortreads.bam"
-    params:
-        intermediate = "results/{sample}/remapping/shortreads.raw.bam"
-    threads: config["threads"]
-    shell:
-        """
-        minimap2 -ax sr {input.assembly} {input.reads} | samtools view -hu > {params.intermediate}
-        samtools sort -@ {threads} {params.intermediate} -o {output}
-        samtools index {output}
-        rm {params.intermediate}
-
-        """
 
 rule map_LR:
     input:
         reads = lambda wildcards: samples[wildcards.sample].longread_fastq,  
-        assembly = "results/{sample}/assembly.circularized.renamed.fasta"
-    output: "results/{sample}/remapping/longreads.bam"
+        assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
+    output: config["outdir"] + "/{sample}/remapping/longreads.bam"
     params:
-        intermediate = "results/{sample}/remapping/longreads.raw.bam"
-    threads: config["threads"]
+        intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam"
+    threads: config["threads_per_job"]
     shell:
         """
         minimap2 -ax map-hifi {input.assembly} {input.reads} | samtools view -hu > {params.intermediate}
diff --git a/assembly/workflow/rules/summarize.smk b/assembly/workflow/rules/summarize.smk
index c64e64a..e58fa97 100755
--- a/assembly/workflow/rules/summarize.smk
+++ b/assembly/workflow/rules/summarize.smk
@@ -10,7 +10,7 @@ rule read_summary:
 
         for FILE in input:
 
-            sample = FILE.split("/")[1]
+            sample = FILE.split("/")[-3]
 
             d = json.load(open(FILE))
             d_flat = {}
@@ -52,7 +52,7 @@ rule read_summary:
 
 
 rule assembly_summaries:
-  input: expand("{outdir}/{sample}/assembly_info.txt", outdir = config["outdir"], sample = samples.keys())
+  input: expand("{outdir}/{sample}/flye/assembly_info.txt", outdir = config["outdir"], sample = samples.keys())
   output: 
     summaries = config["outdir"] + "/assembly_summaries.tsv",
     single_contig_assemblies = config["outdir"] + "/single_contig_samples.txt"
@@ -65,7 +65,7 @@ rule assembly_summaries:
 
     for FILE in input:
 
-      sample = FILE.split("/")[1]
+      sample = FILE.split("/")[-3]
       contig_count = 0
 
       with open(FILE) as f:
-- 
GitLab