Paths fixed, works on sciCORE now

1bef02cc · Christoph Stritt · 0507ed5e · 1bef02cc · 1bef02cc · 1bef02cc
Commit 1bef02cc authored 1 year ago by Christoph Stritt
--- a/.gitignore
+++ b/.gitignore
 pangenome
 assembly/container/assemblySC.sif
 assembly/.snakemake
+assembly/.cache
 assembly/rulegraph.pdf
-facienda.md
\ No newline at end of file
+assembly/resources/bakta_db
+facienda.md
--- a/assembly/README.md
+++ b/assembly/README.md
 # Genome assembly workflow

-The user needs to modify two things to run the workflow on her samples:
+The genome assembly workflow includes the following tools/steps:
+  - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, 
+  - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly.
+  - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA.
+  - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly.
+  - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies.  

- the file config/samples.tsv, which contains the sample names and the corresponding paths to the HiFi consensus reads. 
- the file config/config.yaml, which contains global options for the analysis 

+# Run the pipeline
+The user needs to provide two things to run the workflow on her samples:
+- a config file with some global options for the analysis
+- a tab separate table, without header, that contains the sample names and the corresponding paths to the HiFi consensus reads. 

-## samples.tsv
-This is a tab-separated table with a header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. 
-
-## config.yaml
+## config.yml
 In the file config/config.yaml some global parameters can be set:

 ```yaml
-
-samples: samples.inhouse.hifi.test.csv
-
-results_directory: ./results
+samples: config/samples.tsv
+outdir: ./results
 output_prefix: pb_bernese

 ref:
  genome_size: 4.4m
  gbf: resources/H37Rv.gbf

-threads: 4
+bakta_db: resources/bakta_db
+
+threads_per_job: 4

 keep_intermediate: "Yes"

 ```

+## samples.tsv
+This is a tab-separated table with no header and two colums (see example): the first containing the sample names, which will be used to name the assemblies; the second with the absolute paths to the fastq files. 
+
+

+## Snakemake dry run

-## Dry-run
 ```
-snakemake -n
+snakemake -n --configfile /path/to/config.yml
 ```

-
-## Run the workflow on scicore
+## Run the workflow on sciCORE

 Important: singularity containers most be given access to the file locations through the --bind argument. E.g. if the long reads are on /scicore/home/jean-jacques/reads/, add this location in the snakemake command (see also the full command below): 

@@ -60,12 +67,11 @@ ml snakemake/6.6.1-foss-2021a

 # Real run 
 snakemake \
- --config samples=1.5 \
- --config outdir=./results \
-
- --jobs 4 
- --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/ --bind /scratch" \
- --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=pbassembly.o%j --error=pbassembly.e%j"
+ --configfile /scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/config.yml \
+ --jobs 4 \
+ --latency-wait 60 \
+ --use-singularity --singularity-args "--bind /scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux --bind /scicore/home/gagneux/stritt0001 --bind /scratch" \
+ --cluster "sbatch --job-name=pbassembly --cpus-per-task=4 --mem-per-cpu=4G --time=06:00:00 --qos=6hours --output=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.o%j --error=/scicore/home/gagneux/stritt0001/TB/projects/pacbio_microscale/results/assembly/pbassembly.e%j"


 # Leave the screen: CTRL+a+d
@@ -73,4 +79,17 @@ snakemake \
 # Return to the screen 
 screen -r assembly

-```
\ No newline at end of file
+```
+
+# Output
+For each sample defined in the samples table, a folder is generated in the output directory. It contains: 
+
+```
+assembly.circularized.renamed.fasta
+bakta/
+circlator/
+flye/
+longqc/
+remapping/
+
+```
--- a/assembly/workflow/Snakefile
+++ b/assembly/workflow/Snakefile

-containerized: "container/assemblySMK.sif"
+containerized: "container/assemblySC.sif"

 include: "rules/common.smk"

@@ -19,5 +19,5 @@ rule all:
    input:
        config["outdir"] + "/read_summaries.tsv",
        config["outdir"] + "/assembly_summaries.tsv", 
-        expand(config["outdir"] + "/{sample}/{sample}.gff", sample = samples.keys()),
+        expand(config["outdir"] + "/{sample}/bakta/{sample}.gff3", sample = samples.keys()),
        expand(config["outdir"] + "/{sample}/remapping/longreads.bam", sample = samples.keys())
--- a/assembly/workflow/rules/annotate.smk
+++ b/assembly/workflow/rules/annotate.smk

 rule bakta:
  input: config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
-  output: config["outdir"] + "/{sample}/{sample}.gff"
+  output: config["outdir"] + "/{sample}/bakta/{sample}.gff3"
  params:
    bakta_db = config["bakta_db"],
    refgbf = config["ref"]["gbf"],
    asm = "{sample}",
-    outdir = config["outdir"] + "/{sample}/",
+    outdir = config["outdir"] + "/{sample}/bakta",
    threads = config["threads_per_job"]

  shell:
    """
    bakta {input} \
    --db {params.bakta_db} \
+    --output {params.outdir} \
    --prefix {params.asm} \
    --genus Mycobacterium \
    --species tuberculosis \
-    --strain {params.sample} \
-    --locus-tag {params.sample} \
-    --complete --compatible \
+    --strain {params.asm} \
+    --locus-tag {params.asm} \
+    --complete --compliant --force \
    --proteins {params.refgbf} \
    --keep-contig-headers \
    --threads  {params.threads}

--- a/assembly/workflow/rules/circularize.smk
+++ b/assembly/workflow/rules/circularize.smk

 rule circlator_mapreads:
    input: 
-        assembly = config["outdir"] + "/{sample}/assembly.fasta",
+        assembly = config["outdir"] + "/{sample}/flye/assembly.fasta",
        reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq)
    output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam"
    params:
@@ -29,7 +29,7 @@ rule circlator_localassembly:
    output: config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta"
    params:
        outdir = config["outdir"] + "/{sample}/circlator/03.assemble",
-        threads = config["threads"]
+        threads = config["threads_per_job"]
    shell:
        """
        flye --pacbio-hifi {input} --out-dir {params.outdir} --genome-size 100k --threads {params.threads}
@@ -38,7 +38,7 @@ rule circlator_localassembly:

 rule circlator_merge:
    input: 
-        assembly = config["outdir"]  + "/{sample}/assembly.fasta",
+        assembly = config["outdir"]  + "/{sample}/flye/assembly.fasta",
        localassembly = config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta"
    output: config["outdir"] + "/{sample}/circlator/04.merge.fasta"
    params:

--- a/assembly/workflow/rules/common.smk
+++ b/assembly/workflow/rules/common.smk
@@ -8,7 +8,6 @@ class sample:
 samples = {}

 with open(config["samples"]) as f:
-    next(f)
    for line in f:
        fields = line.strip().split("\t")
        samples[fields[0]] = sample(fields) 
--- a/assembly/workflow/rules/mapreads.smk
+++ b/assembly/workflow/rules/mapreads.smk

-rule map_SR:
-    input:
-        reads = lambda wildcards: samples[wildcards.sample].shortread_fastq, 
-        assembly = "results/{sample}/assembly.circularized.renamed.fasta"
-    output: "results/{sample}/remapping/shortreads.bam"
-    params:
-        intermediate = "results/{sample}/remapping/shortreads.raw.bam"
-    threads: config["threads"]
-    shell:
-        """
-        minimap2 -ax sr {input.assembly} {input.reads} | samtools view -hu > {params.intermediate}
-        samtools sort -@ {threads} {params.intermediate} -o {output}
-        samtools index {output}
-        rm {params.intermediate}
-
-        """

 rule map_LR:
    input:
        reads = lambda wildcards: samples[wildcards.sample].longread_fastq,  
-        assembly = "results/{sample}/assembly.circularized.renamed.fasta"
-    output: "results/{sample}/remapping/longreads.bam"
+        assembly = config["outdir"] + "/{sample}/assembly.circularized.renamed.fasta"
+    output: config["outdir"] + "/{sample}/remapping/longreads.bam"
    params:
-        intermediate = "results/{sample}/remapping/longreads.raw.bam"
-    threads: config["threads"]
+        intermediate = config["outdir"] + "/{sample}/remapping/longreads.raw.bam"
+    threads: config["threads_per_job"]
    shell:
        """
        minimap2 -ax map-hifi {input.assembly} {input.reads} | samtools view -hu > {params.intermediate}

--- a/assembly/workflow/rules/summarize.smk
+++ b/assembly/workflow/rules/summarize.smk
@@ -10,7 +10,7 @@ rule read_summary:

        for FILE in input:

-            sample = FILE.split("/")[1]
+            sample = FILE.split("/")[-3]

            d = json.load(open(FILE))
            d_flat = {}
@@ -52,7 +52,7 @@ rule read_summary:


 rule assembly_summaries:
-  input: expand("{outdir}/{sample}/assembly_info.txt", outdir = config["outdir"], sample = samples.keys())
+  input: expand("{outdir}/{sample}/flye/assembly_info.txt", outdir = config["outdir"], sample = samples.keys())
  output: 
    summaries = config["outdir"] + "/assembly_summaries.tsv",
    single_contig_assemblies = config["outdir"] + "/single_contig_samples.txt"
@@ -65,7 +65,7 @@ rule assembly_summaries:

    for FILE in input:

-      sample = FILE.split("/")[1]
+      sample = FILE.split("/")[-3]
      contig_count = 0

      with open(FILE) as f: