From bc6a3bd2b3c7088d4bdbd3e56d110ed1aa2e5d81 Mon Sep 17 00:00:00 2001
From: cstritt <crstp.strt@gmail.com>
Date: Tue, 19 Mar 2024 15:22:40 +0100
Subject: [PATCH] Assembly pipeline including a short-read polishing step, for
 older PacBio Reads

---
 assembly/README.md                      |  4 +++
 assembly/workflow/Snakefile             |  2 ++
 assembly/workflow/rules/assemble.smk    |  2 +-
 assembly/workflow/rules/circularize.smk |  2 +-
 assembly/workflow/rules/polish.smk      | 46 +++++++++++++++++++++++++
 assembly/workflow/rules/readQC.smk      |  2 +-
 6 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 assembly/workflow/rules/polish.smk

diff --git a/assembly/README.md b/assembly/README.md
index 8c339e5..2ad7833 100755
--- a/assembly/README.md
+++ b/assembly/README.md
@@ -1,8 +1,12 @@
 # Genome assembly workflow
 
+Adapted for older PacBio reads: includes a short-read polishing step. The sample table (samples.tsv) requires to additional columns, sr1 and sr2, indicating paths to forward and reverse short reads.
+
+
 The genome assembly workflow includes the following tools/steps:
   - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, 
   - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly.
+  - [Polypolish](https://github.com/rrwick/Polypolish): Polishing.
   - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA.
   - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly.
   - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies.  
diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile
index 8dbb248..350324b 100755
--- a/assembly/workflow/Snakefile
+++ b/assembly/workflow/Snakefile
@@ -6,6 +6,8 @@ include: "rules/readQC.smk"
 
 include: "rules/assemble.smk"
 
+include: "rules/polish.smk"
+
 include: "rules/circularize.smk"
 
 if config["annotate"] == "Yes":
diff --git a/assembly/workflow/rules/assemble.smk b/assembly/workflow/rules/assemble.smk
index 64fa1af..584e9e3 100755
--- a/assembly/workflow/rules/assemble.smk
+++ b/assembly/workflow/rules/assemble.smk
@@ -13,7 +13,7 @@ rule flye:
   shell:
     """
     flye \
-      --pacbio-hifi {input} \
+      --pacbio-raw {input} \
       --genome-size {params.genome_size} \
       --iterations {params.iterations} \
       --threads {threads} \
diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk
index 89f2f78..2f4d1ad 100755
--- a/assembly/workflow/rules/circularize.smk
+++ b/assembly/workflow/rules/circularize.smk
@@ -1,7 +1,7 @@
 
 rule circlator_mapreads:
     input: 
-        assembly = config["outdir"] + "/{sample}/flye/assembly.fasta",
+        assembly = config["outdir"] + "/{sample}/polypolish/assembly.polished.fasta",
         reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq)
     output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam"
     params:
diff --git a/assembly/workflow/rules/polish.smk b/assembly/workflow/rules/polish.smk
new file mode 100644
index 0000000..9084839
--- /dev/null
+++ b/assembly/workflow/rules/polish.smk
@@ -0,0 +1,46 @@
+rule align_SR:
+    input: 
+        assembly = config["outdir"] + "/{sample}/flye/assembly.fasta",
+        sr1 = lambda wildcards: samples[wildcards.sample].sr1,
+        sr2 = lambda wildcards: samples[wildcards.sample].sr2
+
+    output: 
+        aln1 = config["outdir"] + "/{sample}/polypolish/aln_R1.sam",
+        aln2 = config["outdir"] + "/{sample}/polypolish/aln_R2.sam"
+    threads: config["threads_per_job"]
+    shell:
+        """
+        bwa index {input.assembly}
+        bwa mem -t {threads} -a {input.assembly} {input.sr1} > {output.aln1}
+        bwa mem -t {threads} -a {input.assembly} {input.sr2} > {output.aln2}
+
+        """
+
+
+
+rule insert_filter:
+    input: 
+        aln1 = config["outdir"] + "/{sample}/polypolish/aln_R1.sam",
+        aln2 = config["outdir"] + "/{sample}/polypolish/aln_R2.sam"
+    output: 
+        alnfilt1 = config["outdir"] + "/{sample}/polypolish/aln_R1.filtered.sam",
+        alnfitl2 = config["outdir"] + "/{sample}/polypolish/aln_R2.filtered.sam"
+    shell:
+        """
+        polypolish filter --in1 {input.aln1} --in2 {input.aln2} --out1 {output.alnfilt1} --out2 {output.alnfilt2}
+
+        """
+
+
+rule polypolish:
+    input:
+        assembly = config["outdir"] + "/{sample}/flye/assembly.fasta",
+        alnfil1 = config["outdir"] + "/{sample}/polypolish/aln_R1.filtered.sam",
+        alnfil2 = config["outdir"] + "/{sample}/polypolish/aln_R2.filtered.sam"
+
+    output: config["outdir"] + "/{sample}/polypolish/assembly.polished.fasta"
+    shell:
+        """
+        polypolish polish {input.assembly} {input.alnfilt1} {input.alnfilt2} > {output}
+    
+        """
\ No newline at end of file
diff --git a/assembly/workflow/rules/readQC.smk b/assembly/workflow/rules/readQC.smk
index da08f9a..811cfaa 100755
--- a/assembly/workflow/rules/readQC.smk
+++ b/assembly/workflow/rules/readQC.smk
@@ -8,7 +8,7 @@ rule longQC:
     shell:
         """
         rm -r {params.outfolder}
-        python /LongQC/longQC.py sampleqc -x pb-sequel -o {params.outfolder}/ -p {params.threads} {input}
+        python /LongQC/longQC.py sampleqc -x pb-r2 -o {params.outfolder}/ -p {params.threads} {input}
 
         """
 
-- 
GitLab