diff --git a/tests/test_functions.py b/tests/test_functions.py index 943dc255971e67b9871af42fde5b881c3edfba5b..a0f2f1ad3d9630cc613012398e9aa045d13fb2a6 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -4,6 +4,7 @@ import pandas as pd import numpy as np +# pylint: disable=C0103 def find_path(filename: str) -> str: """Find the path to a file. diff --git a/tests/test_match_reptrans_explvl.py b/tests/test_match_reptrans_explvl.py index ae3bb896aeecb7c5448e5da0540c29cbf94242b5..abef23b98d6aae2bf75b1aaf7ba5ddca4c2d96ca 100644 --- a/tests/test_match_reptrans_explvl.py +++ b/tests/test_match_reptrans_explvl.py @@ -223,11 +223,7 @@ class TestMatchReptrans: "at least one row contain NA values" def test_output_tsv(): - """ - This function test if a tsv file is generated from a pandas - dataframe in the right format. - """ - + """Test if a tsv file is generated from a df in the right format.""" dict_repr_test = { 'ENSMUSG00000079415': 'ENSMUST00000112933', "ENSMUSG00000024691": "ENSMUST00000025595", @@ -251,7 +247,7 @@ class TestMatchReptrans: ref_path = tFun.find_path("test_ref_output.tsv") output_path = tFun.find_output() - with open(ref_path, 'r') as t1, open(output_path, 'r') as t2: + with open(ref_path, 'r', encoding="utf-8") as t1, open(output_path, 'r') as t2: fileRef = t1.readlines() fileOutput = t2.readlines() diff --git a/transcript_sampler/find_reptrans.py b/transcript_sampler/find_reptrans.py index e24746e280bb09c4fb6d100f2b739ebc012161b1..46af0ba6d35fde24fea74d108e80c18dfd66f85c 100644 --- a/transcript_sampler/find_reptrans.py +++ b/transcript_sampler/find_reptrans.py @@ -1,10 +1,11 @@ """Find representative transcripts.""" - import logging +from typing import Union LOG = logging.getLogger(__name__) +# pylint: disable=R0912,R0915 class FindRepTrans: """Find representative transcripts.""" @@ -12,7 +13,7 @@ class FindRepTrans: """Initiate.""" @staticmethod - def attributes_converter(attributes: str) -> list: + def attributes_converter(attributes): """Attributes converter function. This funtion converts the "unstructured" ;-seperated part of @@ -23,7 +24,7 @@ class FindRepTrans: Input: attributes = str() # the unstructured part of the entry Output: - attributes = list() # cleaned list with the \ + attributes = list() # cleaned list with the characteristics described above """ attributes = ( @@ -96,7 +97,7 @@ class FindRepTrans: ValueError: If an unexpected entry is encountered in the GTF file. """ # setting default variables - rep_transcripts = dict() + rep_transcripts: dict = {} cur_g_id = "" # [transcript_id, transcript_support_level, transcript_length] cur_best_trans = ["", 100, 0] @@ -122,11 +123,11 @@ class FindRepTrans: if cur_g_id != attributes[1]: LOG.error("Exon from an unexpected gene") raise ValueError("Exon from an unexpected gene") - elif ( + if ( self.find_in_attributes( attributes, "transcript_id" - ) != cur_tID - ): + ) != cur_t_ID + ): LOG.error("Exon from an unexpected transcript") raise ValueError("Exon from an unexpected transcript") @@ -148,10 +149,10 @@ class FindRepTrans: raise ValueError("Transcript from an unexpected gene") # finding the transcript id and the support level - cur_tID = self.find_in_attributes( + cur_t_ID = self.find_in_attributes( attributes, "transcript_id" ) - t_supp_lvl = self.find_in_attributes( + t_supp_lvl: Union[int, str] = self.find_in_attributes( attributes, "transcript_support_level" ) @@ -161,7 +162,7 @@ class FindRepTrans: if t_supp_lvl == "NA": t_supp_lvl = 100 else: - if t_supp_lvl.isdigit(): + if isinstance(t_supp_lvl, str) and t_supp_lvl.isdigit(): t_supp_lvl = int(t_supp_lvl) else: t_supp_lvl = 100 @@ -169,11 +170,11 @@ class FindRepTrans: # decides if the transcript has potential to become the # representative transcript if t_supp_lvl < cur_best_trans[1] or cur_best_trans[0] == "": - cur_best_trans = [cur_tID, t_supp_lvl, 0] + cur_best_trans = [cur_t_ID, t_supp_lvl, 0] pot_best_trans = False ignor_trans = False elif t_supp_lvl == cur_best_trans[1]: - pot_best_trans = [cur_tID, t_supp_lvl, 0] + pot_best_trans = [cur_t_ID, t_supp_lvl, 0] else: ignor_trans = True @@ -203,7 +204,7 @@ class FindRepTrans: if cur_g_id in rep_transcripts: if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] or (rep_transcripts[cur_g_id][1] == cur_best_trans[1] and - rep_transcripts[cur_g_id][2] < cur_best_trans[2])): + rep_transcripts[cur_g_id][2] < cur_best_trans[2])): rep_transcripts[cur_g_id] = cur_best_trans else: rep_transcripts[cur_g_id] = cur_best_trans diff --git a/transcript_sampler/match_reptrans_explvl.py b/transcript_sampler/match_reptrans_explvl.py index 5bc73833a9a9d32fb5e1b646ace9c185af6e426d..654f8dc95c04ac41822508b82e04bbe85277e114 100644 --- a/transcript_sampler/match_reptrans_explvl.py +++ b/transcript_sampler/match_reptrans_explvl.py @@ -1,4 +1,4 @@ -"""Match representative transcript with expression level""" +"""Match representative transcript with expression level.""" # Made by Hugo Gillet # import logging @@ -40,12 +40,15 @@ class MatchReptransExplvl: return df_gtf @staticmethod - def dict_repr_trans_to_df(dict_reprTrans: "dict[str, str]") -> pd.DataFrame: + def dict_repr_trans_to_df( + dict_repr_trans: "dict[str, str]" + ) -> pd.DataFrame: """ - Convert a dictionary of genes and their representative transcript into a DataFrame. + Convert a dict of genes and their representative transcript into a df. Args: - dict_reprTrans (dict): {'Gene': ['transcriptA', 'transcriptB'], ...} + dict_repr_trans (dict): + {'Gene': ['transcriptA', 'transcriptB'], ...} Returns: Pandas DataFrame with 'Gene' and 'Transcript' as columns. @@ -55,22 +58,32 @@ class MatchReptransExplvl: TypeError: Keys should be strings. TypeError: Values should be strings. """ - if not isinstance(dict_reprTrans, dict): + if not isinstance(dict_repr_trans, dict): LOG.error("Only dictionaries are allowed") raise TypeError("Only dictionaries are allowed") - if not all(isinstance(key, str) for key in dict_reprTrans.keys()): + if not all( + isinstance(key, str) for key in dict_repr_trans.keys() + ): LOG.error("Keys should be strings") raise TypeError("Keys should be strings") - if not all(isinstance(value, str) for value in dict_reprTrans.values()): + if not all( + isinstance(value, str) for value in dict_repr_trans.values() + ): LOG.error("Values should be strings") raise TypeError("Values should be strings") - df_reprTrans = pd.DataFrame.from_dict(dict_reprTrans, orient="index", columns=["reprTranscript"]) - df_reprTrans = df_reprTrans.reset_index() - df_reprTrans.columns = ["Gene", "reprTrans"] - df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace(r"\.[1-9]", "", regex=True) + df_repr_trans = pd.DataFrame.from_dict( + dict_repr_trans, orient="index", columns=["reprTranscript"] + ) + df_repr_trans = df_repr_trans.reset_index() + column_names = ["Gene", "reprTrans"] + df_repr_trans.columns = pd.Index(column_names) + # pylint: disable=E1136,E1137 + df_repr_trans["reprTrans"] = df_repr_trans["reprTrans"].str.replace( + r"\.[1-9]", "", regex=True + ) - return df_reprTrans + return df_repr_trans @staticmethod def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame: @@ -99,85 +112,111 @@ class MatchReptransExplvl: @staticmethod def expr_level_by_gene( - df_exprTranscript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame + df_expr_transcript: pd.DataFrame, df_output_gtf_selection: pd.DataFrame ) -> pd.DataFrame: - """ - Find the gene of each transcript given by the expression level CSV/TSV file - and sum the expression level of all transcripts from the same gene. - + """Sum expression levels. + + Find the gene of each transcript given by the expression level + CSV/TSV file and sum the expression level of all transcripts + from the same gene. + Args: - df_exprTranscript (pd.DataFrame): Pandas DataFrame containing transcripts and their expression levels, - generated by the "tsv_or_csv_to_df" function. - df_output_gtf_selection (pd.DataFrame): Pandas DataFrame containing genes and transcripts, - generated by the "transcripts_by_gene_inDf" function. - + df_expr_transcript (pd.DataFrame): + Pandas DataFrame containing transcripts and their + expression levels, generated by the + "tsv_or_csv_to_df" function. + df_output_gtf_selection (pd.DataFrame): + Pandas DataFrame containing genes and transcripts, + generated by the "transcripts_by_gene_inDf" function. + Returns: - Pandas DataFrame having 'Gene' and sum of its transcript expression levels. - + Pandas DataFrame having 'Gene' and sum of its + transcript expression levels. + Raises: None """ - df_merged = pd.merge(df_output_gtf_selection, df_exprTranscript, how="inner", on="Transcript") - df_sum = df_merged.groupby("Gene")["Expression_level"].sum().reset_index() + df_merged = pd.merge( + df_output_gtf_selection, df_expr_transcript, + how="inner", on="Transcript") + df_sum = df_merged.groupby("Gene")["Expression_level"].sum( + ).reset_index() return df_sum @staticmethod def match_by_gene( - df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame + df_repr_transcript: pd.DataFrame, + df_expression_level_by_gene: pd.DataFrame ) -> pd.DataFrame: """ Find matching genes between the two DataFrames. - + Args: - df_reprTranscript (pd.DataFrame): Pandas DataFrame containing genes and their representative transcripts, - generated by the "dict_repr_trans_to_df()" function. - df_expressionLevel_byGene (pd.DataFrame): Pandas DataFrame containing genes and their expression levels, - generated by the "transcript_by_gene_inDf()" function. - + df_repr_transcript (pd.DataFrame): Pandas DataFrame + containing genes and their representative transcripts, + generated by the "dict_repr_trans_to_df()" function. + df_expression_level_by_gene (pd.DataFrame): Pandas DataFrame + containing genes and their expression levels, + generated by the "transcript_by_gene_inDf()" function. + Returns: - Pandas DataFrame having representative transcripts and their expression levels. - + Pandas DataFrame having representative transcripts and + their expression levels. + Raises: None """ - df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene, how="inner", on="Gene") + df_merged = pd.merge( + df_repr_transcript, df_expression_level_by_gene, + how="inner", on="Gene" + ) df_clean = df_merged.loc[:, ["reprTrans", "Expression_level"]] return df_clean def match_repr_transcript_expression_level( - self, exprTrans: str, dict_reprTrans: dict, gtf_file: str, + self, expr_trans: str, dict_repr_trans: dict, gtf_file: str, ): - """ - Combine functions to replace transcripts from an expression level CSV/TSV file with representative transcripts. + """Replace expression level with representative transcripts. + + Combine functions to replace transcripts from an expression level + CSV/TSV file with representative transcripts. Args: - exprTrans (str): CSV or TSV file containing transcripts and their expression level. - dict_reprTrans (dict): Dictionary of genes and their representative transcripts. + expr_trans (str): CSV or TSV file containing transcripts + and their expression level. + dict_repr_trans (dict): Dictionary of genes + and their representative transcripts. gtf_file (str): Path to the GTF file. Returns: - Pandas DataFrame of representative transcripts and their expression level. + Pandas DataFrame of representative transcripts + and their expression level. Raises: None """ df_gene_transcript = self.gtf_to_df(gtf_file) - df_exprTrans = self.tsv_or_csv_to_df(exprTrans) - df_reprTrans = self.dict_repr_trans_to_df(dict_reprTrans) - df_expr_level_by_gene = self.expr_level_by_gene(df_exprTrans, df_gene_transcript) - df_match = self.match_by_gene(df_reprTrans, df_expr_level_by_gene) - df_match.rename(columns={"reprTrans": "id", "Expression_level": "level"}, inplace=True) + df_expr_trans = self.tsv_or_csv_to_df(expr_trans) + df_repr_trans = self.dict_repr_trans_to_df(dict_repr_trans) + df_expr_level_by_gene = self.expr_level_by_gene( + df_expr_trans, df_gene_transcript + ) + df_match = self.match_by_gene(df_repr_trans, df_expr_level_by_gene) + df_match.rename( + columns={"reprTrans": "id", "Expression_level": "level"}, + inplace=True + ) return df_match - -# def dict_repr_trans_to_df(dict_reprTrans: "dict[str, str]") -> pd.DataFrame: +# def dict_repr_trans_to_df(dict_repr_trans: "dict[str, str]") -> pd.DataFrame: # """Convert a dictionary of genes and their representative # transcript into a dataframe # Args: -# dict_reprTrans (dict): {'Gene':['transcriptA', 'transcriptB'], ...} +# dict_repr_trans (dict): +# {'Gene':['transcriptA', 'transcriptB'], ...} # Returns: # Pandas dataframe having Gene and transcript as columns @@ -189,22 +228,22 @@ class MatchReptransExplvl: # """ # pass -# if not type(dict_reprTrans) is dict: +# if not type(dict_repr_trans) is dict: # raise TypeError("Only dict are allowed") -# if type(list(dict_reprTrans.keys())[0]) is not str: +# if type(list(dict_repr_trans.keys())[0]) is not str: # raise TypeError("Key should be strings") -# if type(list(dict_reprTrans.values())[0]) is not str: +# if type(list(dict_repr_trans.values())[0]) is not str: # raise TypeError("Values should be strings") -# df_reprTrans = pd.DataFrame.from_dict( -# dict_reprTrans, orient="index", columns=["reprTranscript"] +# df_repr_trans = pd.DataFrame.from_dict( +# dict_repr_trans, orient="index", columns=["reprTranscript"] # ) -# df_reprTrans = df_reprTrans.reset_index(level=0) -# df_reprTrans.columns = ["Gene", "reprTrans"] -# df_reprTrans["reprTrans"] = df_reprTrans["reprTrans"].str.replace( +# df_repr_trans = df_repr_trans.reset_index(level=0) +# df_repr_trans.columns = ["Gene", "reprTrans"] +# df_repr_trans["reprTrans"] = df_repr_trans["reprTrans"].str.replace( # r"\.[1-9]", "", regex=True # ) -# return df_reprTrans +# return df_repr_trans # def gene_and_transcript(gtf_file: str) -> pd.DataFrame: @@ -259,7 +298,7 @@ class MatchReptransExplvl: # file, and summ expression level of all transcipts from the same gene. # Args: -# df_exprTranscript: pandas df containing transcript and +# df_expr_transcript: pandas df containing transcript and # their exp level generated by "tsv_or_csv_to_df" function # df_output_gtf_selection : pandas df containing genes and # transcripts, generated by "transcripts_by_gene_inDf" function @@ -282,15 +321,16 @@ class MatchReptransExplvl: # def match_by_gene( -# df_reprTranscript: pd.DataFrame, df_expressionLevel_byGene: pd.DataFrame +# df_repr_transcript: pd.DataFrame, +# df_expression_level_by_gene: pd.DataFrame # ) -> pd.DataFrame: # """Find matching genes bewteen the 2 args # Args: -# df_reprTranscript : pandas Dataframe containing genes +# df_repr_transcript : pandas Dataframe containing genes # and their representative transcript, generated by # "dict_repr_trans_to_df()" -# df_expressionLevel_byGene : pandas Dataframe containing +# df_expression_level_by_gene : pandas Dataframe containing # genes and their expression level generated by # "transcript_by_gene_inDf()" @@ -303,7 +343,8 @@ class MatchReptransExplvl: # """ # pass # df_merged = pd.merge( -# df_reprTranscript, df_expressionLevel_byGene, how="outer", on="Gene" +# df_repr_transcript, df_expression_level_by_gene, +# how="outer", on="Gene" # ) # df_clean = df_merged.dropna(axis=0) # df_clean = df_clean.loc[:, ["reprTrans", "Expression_level"]] @@ -312,15 +353,15 @@ class MatchReptransExplvl: # # functions to run this part of the programm # def match_repr_transcript_expression_level( -# exprTrans: str, dict_reprTrans: dict, gtf_file: str, +# expr_trans: str, dict_repr_trans: dict, gtf_file: str, # ): -# """Combine functions to replace transcripts from an exp level csv/tsv file +# """Combine functions to replace transcripts from exp level csv/tsv file # with representative transcripts # Args: -# exprTrans (str): csv or tsv file containing transcripts +# expr_trans (str): csv or tsv file containing transcripts # and their expression level -# dict_reprTrans (dict) : dict of genes and their +# dict_repr_trans (dict) : dict of genes and their # representative transcipt # intemediate_file (str) : txt file containing genes, transcript # and their expression level from the transkript_extractor function @@ -333,12 +374,12 @@ class MatchReptransExplvl: # None # """ # df_gene_transcript = gene_and_transcript(gtf_file) -# df_exprTrans = tsv_or_csv_to_df(exprTrans) -# df_reprTrans = dict_repr_trans_to_df(dict_reprTrans) +# df_expr_trans = tsv_or_csv_to_df(expr_trans) +# df_repr_trans = dict_repr_trans_to_df(dict_repr_trans) # df_expr_level_by_gene = expr_level_by_gene( -# df_exprTrans, df_gene_transcript +# df_expr_trans, df_gene_transcript # ) # error here -# df_match = match_by_gene(df_reprTrans, df_expr_level_by_gene) +# df_match = match_by_gene(df_repr_trans, df_expr_level_by_gene) # df_match.rename(columns={'reprTrans': 'id', 'Expression_level': 'level'}, # inplace=True) # return df_match