Skip to content
Snippets Groups Projects
Commit 3d754385 authored by Samuel Mondal's avatar Samuel Mondal
Browse files

minor changes based on pylint

parent 6f5e54c4
No related branches found
No related tags found
1 merge request!61minor changes based on pylint
import pandas as pd
import argparse
from gtfparse import read_gtf
"""This script defines a BED from exon annotation in a GTF, to get exon coordinates for use in bedtools. It also ensures that the concatenation happens in the correct order, regardless of the strandedness of the transcript.
Args:
......@@ -11,6 +7,10 @@ from gtfparse import read_gtf
BED file with the format: chr, start, end, transcript_id, score, strand, gene_id
"""
import argparse
import pandas as pd
from gtfparse import read_gtf
parser = argparse.ArgumentParser(
prog = 'pre_bedtools',
description = 'extracts ordered information from gtf file and for transcripts in the negative strand, flips the order in which exons are ordered.')
......@@ -31,4 +31,3 @@ gtf_df_pos = gtf_exons[gtf_exons["strand"] == "+"]
gtf_df_pos = gtf_df_pos.sort_values(['transcript_id','start'],ascending=True).groupby('transcript_id').head(len(gtf_df_pos. transcript_id))
pd.concat([gtf_df_pos, gtf_df_neg]).to_csv(args.output_bed_file,sep="\t",index=False) #gtf_df_pos and gtf_df_neg must be dataframes
""" command line script to be run on output fasta file from bedtools getfasta """
import argparse
import logging
from exon_concatenation import exon_concatenation
......@@ -14,10 +15,18 @@ parser.add_argument('--output_file_name',
args = parser.parse_args()
def main():
"""Runs on the output from bedtools and concatenates the exons together and adds a polyA tail and outputs a fasta file.
Args:
None: this will run on its own by taking the information from argparse
Returns:
A fasta file with a single entry for each transcript ID, with a polyA tail added to the 3' end of the sequence.
"""
LOG.info("sequence_extractor begins")
fasta_list = exon_concatenation(args.input_fasta_file)
final_list = poly_a_addition_to_fasta_list(fasta_list)
with open(args.output_file_name, 'w') as fasta_out:
with open(args.output_file_name, 'w', encoding="utf-8") as fasta_out:
fasta_out.write('\n'.join('%s\n%s' % x for x in final_list))
LOG.info("sequence_extractor ends")
......
"""Script containing the function to concatenate exons and output the results in a list of tuples"""
def exon_concatenation(
post_bedtools_fasta: str
post_bedtools_fasta: str,
) -> list:
"""Concatenate all sequences starting with identical transcripit ID and outputs it as a list with sequence header (Transcript ID) and concatenated sequences as tuples.
......@@ -9,10 +10,10 @@ def exon_concatenation(
Returns:
A list containing transcript ID and concatenated exons in tuples.
"""
with open(post_bedtools_fasta,'r') as fa:
with open(post_bedtools_fasta,'r', encoding="utf-8") as fasta:
annotation = []
fasta_format_list = []
for line1,line2 in zip(fa,fa):
for line1,line2 in zip(fasta,fasta):
if len(annotation) == 0:
annotation.append(line1[0:16])
read = line2[:-1]
......
""" This script contains two functions and the first function is called by the second function and used to add poly A tail to the concatenated exon"""
import numpy as np
# TODO: take the nucleotide probabilities from the user and raise an error if they do not sum to 1
def poly_a_generator(
......@@ -11,9 +12,9 @@ def poly_a_generator(
Returns:
RNA with polyA tail added to its 3' end.
"""
listA = ['A','T','G','C']
polyA = ''.join(np.random.choice(listA,250,p=[0.914,0.028,0.025,0.033]))
return (exon+polyA)
list_of_nucleotides = ['A','T','G','C']
poly_a_string = ''.join(np.random.choice(list_of_nucleotides,250,p=[0.914,0.028,0.025,0.033]))
return exon+poly_a_string
def poly_a_addition_to_fasta_list(
fasta_list: list,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please to comment