From bc6a3bd2b3c7088d4bdbd3e56d110ed1aa2e5d81 Mon Sep 17 00:00:00 2001 From: cstritt <crstp.strt@gmail.com> Date: Tue, 19 Mar 2024 15:22:40 +0100 Subject: [PATCH] Assembly pipeline including a short-read polishing step, for older PacBio Reads --- assembly/README.md | 4 +++ assembly/workflow/Snakefile | 2 ++ assembly/workflow/rules/assemble.smk | 2 +- assembly/workflow/rules/circularize.smk | 2 +- assembly/workflow/rules/polish.smk | 46 +++++++++++++++++++++++++ assembly/workflow/rules/readQC.smk | 2 +- 6 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 assembly/workflow/rules/polish.smk diff --git a/assembly/README.md b/assembly/README.md index 8c339e5..2ad7833 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -1,8 +1,12 @@ # Genome assembly workflow +Adapted for older PacBio reads: includes a short-read polishing step. The sample table (samples.tsv) requires to additional columns, sr1 and sr2, indicating paths to forward and reverse short reads. + + The genome assembly workflow includes the following tools/steps: - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly. + - [Polypolish](https://github.com/rrwick/Polypolish): Polishing. - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA. - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly. - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies. diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile index 8dbb248..350324b 100755 --- a/assembly/workflow/Snakefile +++ b/assembly/workflow/Snakefile @@ -6,6 +6,8 @@ include: "rules/readQC.smk" include: "rules/assemble.smk" +include: "rules/polish.smk" + include: "rules/circularize.smk" if config["annotate"] == "Yes": diff --git a/assembly/workflow/rules/assemble.smk b/assembly/workflow/rules/assemble.smk index 64fa1af..584e9e3 100755 --- a/assembly/workflow/rules/assemble.smk +++ b/assembly/workflow/rules/assemble.smk @@ -13,7 +13,7 @@ rule flye: shell: """ flye \ - --pacbio-hifi {input} \ + --pacbio-raw {input} \ --genome-size {params.genome_size} \ --iterations {params.iterations} \ --threads {threads} \ diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk index 89f2f78..2f4d1ad 100755 --- a/assembly/workflow/rules/circularize.smk +++ b/assembly/workflow/rules/circularize.smk @@ -1,7 +1,7 @@ rule circlator_mapreads: input: - assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", + assembly = config["outdir"] + "/{sample}/polypolish/assembly.polished.fasta", reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq) output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam" params: diff --git a/assembly/workflow/rules/polish.smk b/assembly/workflow/rules/polish.smk new file mode 100644 index 0000000..9084839 --- /dev/null +++ b/assembly/workflow/rules/polish.smk @@ -0,0 +1,46 @@ +rule align_SR: + input: + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", + sr1 = lambda wildcards: samples[wildcards.sample].sr1, + sr2 = lambda wildcards: samples[wildcards.sample].sr2 + + output: + aln1 = config["outdir"] + "/{sample}/polypolish/aln_R1.sam", + aln2 = config["outdir"] + "/{sample}/polypolish/aln_R2.sam" + threads: config["threads_per_job"] + shell: + """ + bwa index {input.assembly} + bwa mem -t {threads} -a {input.assembly} {input.sr1} > {output.aln1} + bwa mem -t {threads} -a {input.assembly} {input.sr2} > {output.aln2} + + """ + + + +rule insert_filter: + input: + aln1 = config["outdir"] + "/{sample}/polypolish/aln_R1.sam", + aln2 = config["outdir"] + "/{sample}/polypolish/aln_R2.sam" + output: + alnfilt1 = config["outdir"] + "/{sample}/polypolish/aln_R1.filtered.sam", + alnfitl2 = config["outdir"] + "/{sample}/polypolish/aln_R2.filtered.sam" + shell: + """ + polypolish filter --in1 {input.aln1} --in2 {input.aln2} --out1 {output.alnfilt1} --out2 {output.alnfilt2} + + """ + + +rule polypolish: + input: + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", + alnfil1 = config["outdir"] + "/{sample}/polypolish/aln_R1.filtered.sam", + alnfil2 = config["outdir"] + "/{sample}/polypolish/aln_R2.filtered.sam" + + output: config["outdir"] + "/{sample}/polypolish/assembly.polished.fasta" + shell: + """ + polypolish polish {input.assembly} {input.alnfilt1} {input.alnfilt2} > {output} + + """ \ No newline at end of file diff --git a/assembly/workflow/rules/readQC.smk b/assembly/workflow/rules/readQC.smk index da08f9a..811cfaa 100755 --- a/assembly/workflow/rules/readQC.smk +++ b/assembly/workflow/rules/readQC.smk @@ -8,7 +8,7 @@ rule longQC: shell: """ rm -r {params.outfolder} - python /LongQC/longQC.py sampleqc -x pb-sequel -o {params.outfolder}/ -p {params.threads} {input} + python /LongQC/longQC.py sampleqc -x pb-r2 -o {params.outfolder}/ -p {params.threads} {input} """ -- GitLab