diff --git a/assembly/README.md b/assembly/README.md index 8c339e5721c86fa4741a2b0c236bea8d961d7190..2ad78334637b0e110bf06971a11ae53094540c44 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -1,8 +1,12 @@ # Genome assembly workflow +Adapted for older PacBio reads: includes a short-read polishing step. The sample table (samples.tsv) requires to additional columns, sr1 and sr2, indicating paths to forward and reverse short reads. + + The genome assembly workflow includes the following tools/steps: - [LongQC](https://doi.org/10.1534/g3.119.400864): Get some read summary statistics. The reads are not modified in any way before assembly, - [Flye](https://doi.org/10.1038/s41587-019-0072-8): Assembly. + - [Polypolish](https://github.com/rrwick/Polypolish): Polishing. - [circlator](https://doi.org/10.1186/s13059-015-0849-0): Reorient the assembly such that it begins with dnaA. - [bakta](https://doi.org/10.1099/mgen.0.000685): Annotate the reoriented assembly. - [minimap2](https://doi.org/10.1093/bioinformatics/bty191): Map the long reads back against the assembly. The resulting alignments can be used to check for inconsistencies between reads and assemblies. diff --git a/assembly/workflow/Snakefile b/assembly/workflow/Snakefile index 8dbb2482198a0f109f40862b950d99b35605dc65..350324b53e8eb4f3dbf85348852a1094a4b1413c 100755 --- a/assembly/workflow/Snakefile +++ b/assembly/workflow/Snakefile @@ -6,6 +6,8 @@ include: "rules/readQC.smk" include: "rules/assemble.smk" +include: "rules/polish.smk" + include: "rules/circularize.smk" if config["annotate"] == "Yes": diff --git a/assembly/workflow/rules/assemble.smk b/assembly/workflow/rules/assemble.smk index 64fa1af9bef3385e2d7c310300081a71dd285c79..584e9e38ebb423aca6fbae21c7cb8e91a9bc4240 100755 --- a/assembly/workflow/rules/assemble.smk +++ b/assembly/workflow/rules/assemble.smk @@ -13,7 +13,7 @@ rule flye: shell: """ flye \ - --pacbio-hifi {input} \ + --pacbio-raw {input} \ --genome-size {params.genome_size} \ --iterations {params.iterations} \ --threads {threads} \ diff --git a/assembly/workflow/rules/circularize.smk b/assembly/workflow/rules/circularize.smk index 89f2f783376deb23fa141fdaddd49b69c669e514..2f4d1adfc23452fc8af0acf090a3717d00fee625 100755 --- a/assembly/workflow/rules/circularize.smk +++ b/assembly/workflow/rules/circularize.smk @@ -1,7 +1,7 @@ rule circlator_mapreads: input: - assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", + assembly = config["outdir"] + "/{sample}/polypolish/assembly.polished.fasta", reads = lambda wildcards: expand(samples[wildcards.sample].longread_fastq) output: config["outdir"] + "/{sample}/circlator/01.mapreads.bam" params: diff --git a/assembly/workflow/rules/polish.smk b/assembly/workflow/rules/polish.smk new file mode 100644 index 0000000000000000000000000000000000000000..908483967109804af61e3000be897103f828b35b --- /dev/null +++ b/assembly/workflow/rules/polish.smk @@ -0,0 +1,46 @@ +rule align_SR: + input: + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", + sr1 = lambda wildcards: samples[wildcards.sample].sr1, + sr2 = lambda wildcards: samples[wildcards.sample].sr2 + + output: + aln1 = config["outdir"] + "/{sample}/polypolish/aln_R1.sam", + aln2 = config["outdir"] + "/{sample}/polypolish/aln_R2.sam" + threads: config["threads_per_job"] + shell: + """ + bwa index {input.assembly} + bwa mem -t {threads} -a {input.assembly} {input.sr1} > {output.aln1} + bwa mem -t {threads} -a {input.assembly} {input.sr2} > {output.aln2} + + """ + + + +rule insert_filter: + input: + aln1 = config["outdir"] + "/{sample}/polypolish/aln_R1.sam", + aln2 = config["outdir"] + "/{sample}/polypolish/aln_R2.sam" + output: + alnfilt1 = config["outdir"] + "/{sample}/polypolish/aln_R1.filtered.sam", + alnfitl2 = config["outdir"] + "/{sample}/polypolish/aln_R2.filtered.sam" + shell: + """ + polypolish filter --in1 {input.aln1} --in2 {input.aln2} --out1 {output.alnfilt1} --out2 {output.alnfilt2} + + """ + + +rule polypolish: + input: + assembly = config["outdir"] + "/{sample}/flye/assembly.fasta", + alnfil1 = config["outdir"] + "/{sample}/polypolish/aln_R1.filtered.sam", + alnfil2 = config["outdir"] + "/{sample}/polypolish/aln_R2.filtered.sam" + + output: config["outdir"] + "/{sample}/polypolish/assembly.polished.fasta" + shell: + """ + polypolish polish {input.assembly} {input.alnfilt1} {input.alnfilt2} > {output} + + """ \ No newline at end of file diff --git a/assembly/workflow/rules/readQC.smk b/assembly/workflow/rules/readQC.smk index da08f9a54eb68d76e472dd338e4d04b732dbf285..811cfaa25a317fe594d9fa78e804ecd134bc80df 100755 --- a/assembly/workflow/rules/readQC.smk +++ b/assembly/workflow/rules/readQC.smk @@ -8,7 +8,7 @@ rule longQC: shell: """ rm -r {params.outfolder} - python /LongQC/longQC.py sampleqc -x pb-sequel -o {params.outfolder}/ -p {params.threads} {input} + python /LongQC/longQC.py sampleqc -x pb-r2 -o {params.outfolder}/ -p {params.threads} {input} """