def exon_concatenation(
    post_bedtools_fasta: str,
    *,
    id_length: int = 16,
) -> list:
    """Concatenate consecutive exon sequences that share a transcript ID.

    The file is consumed as (header, sequence) line pairs, one pair at a
    time, so the whole file is never held in memory. Consecutive records
    whose headers start with the same ``id_length`` characters are merged
    into a single sequence. A trailing header without a sequence line is
    ignored.

    Args:
        post_bedtools_fasta: Path of the two-line-per-record FASTA file
            obtained after bedtools has been run.
        id_length: Number of leading header characters (including the
            ``>``) that identify a transcript. The default of 16 keeps the
            prefix length that was previously hard-coded.

    Returns:
        A flat list with transcript IDs at even indices and the
        corresponding concatenated exon sequences at odd indices. The list
        is empty when the file contains no complete record.
    """
    fasta_format_list = []
    current_id = None
    exon_parts = []

    with open(post_bedtools_fasta, "r") as fa:
        # zip(fa, fa) pulls two successive lines from the same iterator,
        # giving (header, sequence) pairs without loading the file.
        for header, sequence in zip(fa, fa):
            # Strip the line ending before slicing so that a header shorter
            # than id_length does not keep its newline inside the ID.
            transcript_id = header.rstrip("\r\n")[:id_length]

            if transcript_id != current_id:
                # A new transcript starts: flush the one collected so far.
                if current_id is not None:
                    fasta_format_list.append(current_id)
                    fasta_format_list.append("".join(exon_parts))
                current_id = transcript_id
                exon_parts = []

            # rstrip instead of [:-1]: the last line of a file may lack a
            # trailing newline, and slicing would then drop a real base.
            exon_parts.append(sequence.rstrip("\r\n"))

    # Flush the final transcript; nothing to flush for an empty file.
    if current_id is not None:
        fasta_format_list.append(current_id)
        fasta_format_list.append("".join(exon_parts))

    return fasta_format_list