def exon_concatenation(
    post_bedtools_fasta: str
) -> list:
    """Concatenate consecutive sequences that share a transcript ID.

    The file is read as (header line, sequence line) pairs. The first 16
    characters of each header line are used as the transcript ID; the
    sequences of consecutive records with the same ID are joined, in file
    order, into one string. Records with the same ID that are not adjacent
    in the file are NOT merged with each other.

    Args:
        post_bedtools_fasta: The name of the fasta file obtained after
            bedtools has been run. Each record is expected to occupy
            exactly two lines (header, then sequence); a trailing unpaired
            line is ignored.

    Returns:
        A list with transcript IDs in even indices and the corresponding
        concatenated exons in odd indices. An empty file yields an empty
        list.
    """
    # Number of leading header characters that identify a transcript.
    # NOTE(review): presumably ">" plus a 15-character ID - confirm against
    # the headers bedtools actually emits.
    header_length = 16

    fasta_format_list = []
    current_id = None
    exon_parts = []

    with open(post_bedtools_fasta, 'r') as fa:
        # zip(fa, fa) draws two lines per step from the same file iterator,
        # yielding (header, sequence) pairs.
        for header_line, sequence_line in zip(fa, fa):
            transcript_id = header_line[:header_length]
            if transcript_id != current_id:
                # A new transcript starts: flush the one collected so far.
                if current_id is not None:
                    fasta_format_list.append(current_id)
                    fasta_format_list.append("".join(exon_parts))
                current_id = transcript_id
                exon_parts = []
            # Strip only the line terminator so that a final line without a
            # trailing newline does not lose its last base.
            exon_parts.append(sequence_line.rstrip("\n"))

    # Flush the last transcript; skipped entirely when the file had no records.
    if current_id is not None:
        fasta_format_list.append(current_id)
        fasta_format_list.append("".join(exon_parts))
    return fasta_format_list