test_match.py

import pandas as pd
import json
import re
import match_reprtranscript_expressionlevel as match
import os
import pytest
import test_Functions as tFun
import numpy as np
import representative as repr
from pandas.testing import assert_frame_equal

def test_dict_reprTrans_to_df():
    """
    This function test if a dict of {gene: representativeTranscript}
    is converted in a dataframe in the right format 
    """
    dict_repr_test = {"ENSMUSG00000079415":"ENSMUST00000112933", 
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
    dict_mixed = {"a":2, "b":3}
    str_random = "jflkajflkaelfha"
    dict_int = {12:34, 13:66}
    df = match.dict_reprTrans_to_df(dict_repr_test)
    datatype={'Gene': np.dtype('O'), 'reprTrans': np.dtype('O')}

    with pytest.raises(TypeError, match=r"Only dict are allowed"):
        match.dict_reprTrans_to_df(str_random) 
    with pytest.raises(TypeError, match=r"Key should be strings"):
        match.dict_reprTrans_to_df(dict_int) 
    with pytest.raises(TypeError, match=r"Values should be strings"):
        match.dict_reprTrans_to_df(dict_mixed)
    assert tFun.column_number(df)==2, "number of columns is not equal to 2"
    assert tFun.column_dType(df)==datatype, "at least one column has the wrong datatype"
    assert tFun.duplicated_rows(df).empty, "at least one row are duplicated "
    assert tFun.NA_value(df) == 0, "at least one row contain NA values "


def test_txt_to_dict():
    path = tFun.find_path("test_dict_repr_trans.txt")
    dico = match.txt_to_dict(path)
    dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', 
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
    assert dico == dict_test

def test_transcripts_by_gene_inDf():
    """
    This function test if a dataframe generated from 
    the intermediate file is converted in another 
    dataframe without the support level column.
    """
    path = tFun.find_path_intermediateFile()
    df = repr.import_gtfSelection_to_df(path)
    df_gene = match.transcripts_by_gene_inDf(df)
    datatype={'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
    assert tFun.column_number(df_gene)==2, "number of columns is not equal to 2"
    assert tFun.column_dType(df_gene)==datatype, "at least one column has the wrong datatype"
    assert tFun.duplicated_rows(df_gene).empty, "at least one row are duplicated "
    assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values "


def test_tsv_or_csv_to_df():
    """
    This function test if the function tsv_or_csv_to_df() cans take 
    csv and tsv file as input and return a pandas dataframe in the 
    right format 
    """
    path_tsv = tFun.find_path(r"test_gene_exprL")
    df_tsv = match.tsv_or_csv_to_df(path_tsv)
    path_csv = tFun.find_path(r"test_gene_exprL_csv.csv")
    df_csv = match.tsv_or_csv_to_df(path_csv)
    datatype ={'Transcript': np.dtype('O'), 'Expression_level': np.dtype('float64')}
    assert tFun.column_number(df_tsv)==2, "number of columns is not equal to 2"
    assert tFun.column_dType(df_tsv)==datatype, "at least one column has the wrong datatype"
    assert tFun.duplicated_rows(df_tsv).empty, "at least one row are duplicated "
    assert tFun.NA_value(df_tsv) == 0, "at least one row contain NA values "
    assert_frame_equal(df_tsv, df_csv), "csv and tsv import doesn't match"
    

def test_exprLevel_byGene():
    """
    This function test if the function exprLevel_byGene can find the gene of 
    each transcipt given by the expression level csv/tsv file and sum their 
    expression level 
    """
    path_tsv = tFun.find_path(r"test_gene_exprL")
    df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)

    path_intermediate = tFun.find_path_intermediateFile()
    df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
    df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)

    df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript)

    datatype ={'Expression_level': np.dtype('float64')}
    assert tFun.column_number(df_exprLevel)==1, "number of columns is not equal to 1"
    assert tFun.column_dType(df_exprLevel)==datatype, "at least one column has the wrong datatype"
    assert tFun.duplicated_rows(df_exprLevel).empty, "at least one row are duplicated "
    assert tFun.NA_value(df_exprLevel) == 0, "at least one row contain NA values "
    assert tFun.duplicated_index(df_exprLevel).empty, "at least one index element is duplicated"
    
def test_match_byGene():
    """
    This function test if the function "match_byGene()" can 
    create a pandas dataframe matching representative transcript
    and their expression level based on their gene in the 
    correct pandas dataframe format. 
    """


    dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', 
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
    df_dict_reprTrans = match.dict_reprTrans_to_df(dict_repr_test)


    path_tsv = tFun.find_path(r"test_gene_exprL")
    df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
    path_intermediate = tFun.find_path_intermediateFile()
    df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
    df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
    df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript)

    df_match = match.match_byGene(df_dict_reprTrans, df_exprLevel)
    datatype = {'reprTrans': np.dtype('O'), 'Expression_level': np.dtype('float64')}

    assert tFun.column_number(df_match)==2, "number of columns is not equal to 2"
    assert tFun.column_dType(df_match)==datatype, "at least one column has the wrong datatype"
    assert tFun.duplicated_rows(df_match).empty, "at least one row are duplicated "
    assert tFun.NA_value(df_match) == 0, "at least one row contain NA values "
    assert tFun.duplicated_index(df_match).empty, "at least one index element is duplicated"

def test_output_tsv(): 
    """
    This function test if a tsv file is generated from a pandas
    dataframe in the right format. 
    """

    dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', 
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
    df_dict_reprTrans = match.dict_reprTrans_to_df(dict_repr_test)


    path_tsv = tFun.find_path(r"test_gene_exprL")
    df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
    path_intermediate = tFun.find_path_intermediateFile()
    df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
    df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)

    df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript)

    df_match = match.match_byGene(df_dict_reprTrans, df_exprLevel)

    match.output_tsv(df_match)

    ref_path=tFun.find_path("test_ref_output.tsv")
    output_path = tFun.find_output()

    with open(ref_path, 'r') as t1, open(output_path, 'r') as t2:
        fileRef = t1.readlines()
        fileOutput = t2.readlines()


    assert sorted(fileRef) == sorted(fileOutput), "the output does't match the expected tsv file"
    
 
def test_match_reprTranscript_expressionLevel():
    input_path = tFun.find_path("test_gene_exprL")
    intermediate_path = tFun.find_path_intermediateFile()
    dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933', 
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}

    match.match_reprTranscript_expressionLevel(input_path, dict_repr_test, intermediate_path)

    ref_path=tFun.find_path("test_ref_output.tsv")
    output_path = tFun.find_output()
    

    with open(ref_path, 'r') as t1,\
         open(output_path, 'r') as t2,\
         open(input_path, 'r') as t3 :
        fileRef = t1.readlines()
        fileOutput = t2.readlines()
        fileInput = t3.readlines()

    assert sorted(fileRef) == sorted(fileOutput), "the output does't match the expected tsv file"
    assert sorted(fileRef) != sorted(fileInput), "the output does't match the expected tsv file"
    
         
test_dict_reprTrans_to_df()
test_txt_to_dict()
test_transcripts_by_gene_inDf()
test_tsv_or_csv_to_df()
test_exprLevel_byGene()
test_match_byGene()
test_output_tsv()
test_match_reprTranscript_expressionLevel()

print("test_match is done ! No error was found")