From 501332dfb18799cb69b598b92320859a016467f4 Mon Sep 17 00:00:00 2001
From: Samuel Mondal <samuel.mondal@unibas.ch>
Date: Sat, 26 Nov 2022 22:34:50 +0100
Subject: [PATCH] New function that reads fasta file one line at a time rather
 than loading the whole file

---
 sequence_extractor/exon_concatenation.py | 32 +++++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/sequence_extractor/exon_concatenation.py b/sequence_extractor/exon_concatenation.py
index 909a741..27f0c2a 100644
--- a/sequence_extractor/exon_concatenation.py
+++ b/sequence_extractor/exon_concatenation.py
@@ -1,4 +1,4 @@
-def exon_concatenation(
+def exon_concatenation_old(
 	filename: str
 ) -> list:
 	"""Concatenates all sequences in fasta file with the same transcript ID header and then outputs a list containing sequence headers (Transcript ID) and sequences that have been concatenated.
@@ -27,3 +27,33 @@ def exon_concatenation(
 	to_write_to_file.append(annotation)
 	to_write_to_file.append(read)
 	return to_write_to_file
+
+def exon_concatenation(
+    post_bedtools_fasta: str
+) -> list:
+    """Concatenate consecutive exon sequences that share a transcript ID.
+
+    The file is streamed as (header, sequence) line pairs, never loaded whole; the first 16 header characters (">" plus transcript ID) are the grouping key.
+
+    Args:
+        post_bedtools_fasta: The name of the fasta file obtained after bedtools has been run; expected to hold one header line followed by one sequence line per record.
+
+    Returns:
+        A list with transcript ID headers in even indices and the corresponding concatenated exons in odd indices; an empty list for an empty file.
+    """
+    fasta_format_list = []
+    current_id, exons = None, []
+    with open(post_bedtools_fasta, 'r') as fa:
+        # zip(fa, fa) draws two lines per iteration from the same file iterator.
+        for header, sequence in zip(fa, fa):
+            transcript_id = header.rstrip('\r\n')[:16]
+            if transcript_id != current_id:
+                if current_id is not None:
+                    # Header changed: flush the transcript collected so far.
+                    fasta_format_list.extend((current_id, ''.join(exons)))
+                current_id = transcript_id
+                exons = []
+            # Strip only line endings so an unterminated last line keeps its final base.
+            exons.append(sequence.rstrip('\r\n'))
+    if current_id is not None:
+        fasta_format_list.extend((current_id, ''.join(exons)))
+    return fasta_format_list
-- 
GitLab