From dca40749bedfdd1f424639a5db4c6d2df20a2abc Mon Sep 17 00:00:00 2001
From: Christoph Stritt <stritt0001@login01.cluster.bc2.ch>
Date: Fri, 5 Jul 2024 11:44:52 +0200
Subject: [PATCH] Duplicate read removal step added in circularize rule

---
 cluster/config.yaml            |  2 +-
 config/config.yaml             |  8 +++++---
 workflow/rules/circularize.smk | 21 ++++++++++++++++++++-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/cluster/config.yaml b/cluster/config.yaml
index e4ec829..381cd8c 100644
--- a/cluster/config.yaml
+++ b/cluster/config.yaml
@@ -16,7 +16,7 @@ default-resources:
 restart-times: 3
 max-jobs-per-second: 10
 max-status-checks-per-second: 1
-local-cores: 1
+local-cores: 20
 latency-wait: 60
 jobs: 500
 keep-going: True
diff --git a/config/config.yaml b/config/config.yaml
index eb11d3b..7feb5f7 100755
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -2,8 +2,10 @@
 #
 ##############################
 
-samples: config/samples.tsv
-outdir: ./results
+samples: config/samples.tsv # overwritten by run_assembly_pipeline.py
+outdir: ./results # overwritten by run_assembly_pipeline.py
+
+annotate: "No"
 
 ref:
   genome_size: 4.4m
@@ -12,7 +14,7 @@ ref:
   bakta_db: /scicore/home/gagneux/GROUP/PacbioSnake_resources/databases/bakta_db
 
 container: /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/assemblySC.sif
-threads_per_job: 4
+threads_per_job: 10 # Max. 20
 
 assembly_iterations: 3
 
diff --git a/workflow/rules/circularize.smk b/workflow/rules/circularize.smk
index 89f2f78..94d3a62 100755
--- a/workflow/rules/circularize.smk
+++ b/workflow/rules/circularize.smk
@@ -24,8 +24,27 @@ rule circlator_bam2reads:
         """
 
 
+
+# Drop reads that occur more than once (same FASTA id) in the bam2reads output.
+rule circlator_removeduplicates:
+    input: config["outdir"] + "/{sample}/circlator/02.bam2reads.fasta"
+    output: config["outdir"] + "/{sample}/circlator/02.bam2reads.nodup.fasta"
+    run:
+        from Bio import SeqIO
+
+        # Key records by id so each id is written once: a repeated id replaces
+        # the earlier record in place. SeqIO.to_dict() is not usable here
+        # because it raises on duplicate ids.
+        record_dict = {
+            record.id: record for record in SeqIO.parse(input[0], "fasta")
+        }
+
+        with open(output[0], "w") as output_handle:
+            SeqIO.write(record_dict.values(), output_handle, "fasta")
+
+
 rule circlator_localassembly:
-    input: config["outdir"] + "/{sample}/circlator/02.bam2reads.fasta"
+    input: config["outdir"] + "/{sample}/circlator/02.bam2reads.nodup.fasta"
     output: config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta"
     params:
         outdir = config["outdir"] + "/{sample}/circlator/03.assemble",
-- 
GitLab