From dca40749bedfdd1f424639a5db4c6d2df20a2abc Mon Sep 17 00:00:00 2001
From: Christoph Stritt <stritt0001@login01.cluster.bc2.ch>
Date: Fri, 5 Jul 2024 11:44:52 +0200
Subject: [PATCH] Duplicate read removal step added in circularize rule

---
 cluster/config.yaml            |  2 +-
 config/config.yaml             |  8 +++++---
 workflow/rules/circularize.smk | 21 ++++++++++++++++++++-
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/cluster/config.yaml b/cluster/config.yaml
index e4ec829..381cd8c 100644
--- a/cluster/config.yaml
+++ b/cluster/config.yaml
@@ -16,7 +16,7 @@ default-resources:
 restart-times: 3
 max-jobs-per-second: 10
 max-status-checks-per-second: 1
-local-cores: 1
+local-cores: 20
 latency-wait: 60
 jobs: 500
 keep-going: True
diff --git a/config/config.yaml b/config/config.yaml
index eb11d3b..7feb5f7 100755
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -2,8 +2,10 @@
 #
 ##############################
 
-samples: config/samples.tsv
-outdir: ./results
+samples: config/samples.tsv # overwritten by run_assembly_pipeline.py
+outdir: ./results # overwritten by run_assembly_pipeline.py
+
+annotate: "No"
 
 ref:
   genome_size: 4.4m
@@ -12,7 +14,7 @@ ref:
 bakta_db: /scicore/home/gagneux/GROUP/PacbioSnake_resources/databases/bakta_db
 container: /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/assemblySC.sif
 
-threads_per_job: 4
+threads_per_job: 10 # Max. 20
 
 assembly_iterations: 3
 
diff --git a/workflow/rules/circularize.smk b/workflow/rules/circularize.smk
index 89f2f78..94d3a62 100755
--- a/workflow/rules/circularize.smk
+++ b/workflow/rules/circularize.smk
@@ -24,8 +24,27 @@ rule circlator_bam2reads:
 
         """
 
+
+rule circlator_removeduplicates:
+    # Remove reads with duplicate IDs from the bam2reads FASTA before local assembly.
+    input: config["outdir"] + "/{sample}/circlator/02.bam2reads.fasta"
+    output: config["outdir"] + "/{sample}/circlator/02.bam2reads.nodup.fasta"
+    run:
+        from Bio import SeqIO
+
+        # Key records by ID: a later record with the same ID replaces the earlier
+        # one, while dict insertion order keeps the position of the first occurrence.
+        # SeqIO.to_dict() cannot be used here: it raises ValueError on duplicate keys.
+        record_dict = {}
+        for record in SeqIO.parse(input[0], "fasta"):
+            record_dict[record.id] = record
+
+        with open(output[0], "w") as output_handle:
+            SeqIO.write(record_dict.values(), output_handle, "fasta")
+
+
 rule circlator_localassembly:
-    input: config["outdir"] + "/{sample}/circlator/02.bam2reads.fasta"
+    input: config["outdir"] + "/{sample}/circlator/02.bam2reads.nodup.fasta"
     output: config["outdir"] + "/{sample}/circlator/03.assemble/assembly.fasta"
     params:
         outdir = config["outdir"] + "/{sample}/circlator/03.assemble",
-- 
GitLab