Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
import pandas as pd
import json
import re
import match_reprtranscript_expressionlevel as match
import os
import pytest
import test_Functions as tFun
import numpy as np
import representative as repr
from pandas.testing import assert_frame_equal
def test_dict_reprTrans_to_df():
"""
This function test if a dict of {gene: representativeTranscript}
is converted in a dataframe in the right format
"""
dict_repr_test = {"ENSMUSG00000079415":"ENSMUST00000112933",
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
dict_mixed = {"a":2, "b":3}
str_random = "jflkajflkaelfha"
dict_int = {12:34, 13:66}
df = match.dict_reprTrans_to_df(dict_repr_test)
datatype={'Gene': np.dtype('O'), 'reprTrans': np.dtype('O')}
with pytest.raises(TypeError, match=r"Only dict are allowed"):
match.dict_reprTrans_to_df(str_random)
with pytest.raises(TypeError, match=r"Key should be strings"):
match.dict_reprTrans_to_df(dict_int)
with pytest.raises(TypeError, match=r"Values should be strings"):
match.dict_reprTrans_to_df(dict_mixed)
assert tFun.column_number(df)==2, "number of columns is not equal to 2"
assert tFun.column_dType(df)==datatype, "at least one column has the wrong datatype"
assert tFun.duplicated_rows(df).empty, "at least one row are duplicated "
assert tFun.NA_value(df) == 0, "at least one row contain NA values "
def test_txt_to_dict():
path = tFun.find_path("test_dict_repr_trans.txt")
dico = match.txt_to_dict(path)
dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
assert dico == dict_test
def test_transcripts_by_gene_inDf():
"""
This function test if a dataframe generated from
the intermediate file is converted in another
dataframe without the support level column.
"""
path = tFun.find_path_intermediateFile()
df = repr.import_gtfSelection_to_df(path)
df_gene = match.transcripts_by_gene_inDf(df)
datatype={'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
assert tFun.column_number(df_gene)==2, "number of columns is not equal to 2"
assert tFun.column_dType(df_gene)==datatype, "at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_gene).empty, "at least one row are duplicated "
assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values "
def test_tsv_or_csv_to_df():
"""
This function test if the function tsv_or_csv_to_df() cans take
csv and tsv file as input and return a pandas dataframe in the
right format
"""
path_tsv = tFun.find_path(r"test_gene_exprL")
df_tsv = match.tsv_or_csv_to_df(path_tsv)
path_csv = tFun.find_path(r"test_gene_exprL_csv.csv")
df_csv = match.tsv_or_csv_to_df(path_csv)
datatype ={'Transcript': np.dtype('O'), 'Expression_level': np.dtype('float64')}
assert tFun.column_number(df_tsv)==2, "number of columns is not equal to 2"
assert tFun.column_dType(df_tsv)==datatype, "at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_tsv).empty, "at least one row are duplicated "
assert tFun.NA_value(df_tsv) == 0, "at least one row contain NA values "
assert_frame_equal(df_tsv, df_csv), "csv and tsv import doesn't match"
def test_exprLevel_byGene():
"""
This function test if the function exprLevel_byGene can find the gene of
each transcipt given by the expression level csv/tsv file and sum their
expression level
"""
path_tsv = tFun.find_path(r"test_gene_exprL")
df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
path_intermediate = tFun.find_path_intermediateFile()
df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript)
datatype ={'Expression_level': np.dtype('float64')}
assert tFun.column_number(df_exprLevel)==1, "number of columns is not equal to 1"
assert tFun.column_dType(df_exprLevel)==datatype, "at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_exprLevel).empty, "at least one row are duplicated "
assert tFun.NA_value(df_exprLevel) == 0, "at least one row contain NA values "
assert tFun.duplicated_index(df_exprLevel).empty, "at least one index element is duplicated"
def test_match_byGene():
"""
This function test if the function "match_byGene()" can
create a pandas dataframe matching representative transcript
and their expression level based on their gene in the
correct pandas dataframe format.
"""
dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
df_dict_reprTrans = match.dict_reprTrans_to_df(dict_repr_test)
path_tsv = tFun.find_path(r"test_gene_exprL")
df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
path_intermediate = tFun.find_path_intermediateFile()
df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript)
df_match = match.match_byGene(df_dict_reprTrans, df_exprLevel)
datatype = {'reprTrans': np.dtype('O'), 'Expression_level': np.dtype('float64')}
assert tFun.column_number(df_match)==2, "number of columns is not equal to 2"
assert tFun.column_dType(df_match)==datatype, "at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_match).empty, "at least one row are duplicated "
assert tFun.NA_value(df_match) == 0, "at least one row contain NA values "
assert tFun.duplicated_index(df_match).empty, "at least one index element is duplicated"
def test_output_tsv():
"""
This function test if a tsv file is generated from a pandas
dataframe in the right format.
"""
dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
df_dict_reprTrans = match.dict_reprTrans_to_df(dict_repr_test)
path_tsv = tFun.find_path(r"test_gene_exprL")
df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
path_intermediate = tFun.find_path_intermediateFile()
df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
df_exprLevel = match.exprLevel_byGene(df_tsv_exprL, df_gene_transcript)
df_match = match.match_byGene(df_dict_reprTrans, df_exprLevel)
match.output_tsv(df_match)
ref_path=tFun.find_path("test_ref_output.tsv")
output_path = tFun.find_output()
with open(ref_path, 'r') as t1, open(output_path, 'r') as t2:
fileRef = t1.readlines()
fileOutput = t2.readlines()
assert sorted(fileRef) == sorted(fileOutput), "the output does't match the expected tsv file"
def test_match_reprTranscript_expressionLevel():
input_path = tFun.find_path("test_gene_exprL")
intermediate_path = tFun.find_path_intermediateFile()
dict_repr_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
"ENSMUSG00000024691" : "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
match.match_reprTranscript_expressionLevel(input_path, dict_repr_test, intermediate_path)
ref_path=tFun.find_path("test_ref_output.tsv")
output_path = tFun.find_output()
with open(ref_path, 'r') as t1,\
open(output_path, 'r') as t2,\
open(input_path, 'r') as t3 :
fileRef = t1.readlines()
fileOutput = t2.readlines()
fileInput = t3.readlines()
assert sorted(fileRef) == sorted(fileOutput), "the output does't match the expected tsv file"
assert sorted(fileRef) != sorted(fileInput), "the output does't match the expected tsv file"
test_dict_reprTrans_to_df()
test_txt_to_dict()
test_transcripts_by_gene_inDf()
test_tsv_or_csv_to_df()
test_exprLevel_byGene()
test_match_byGene()
test_output_tsv()
test_match_reprTranscript_expressionLevel()
print("test_match is done ! No error was found")