Skip to content
Snippets Groups Projects
Commit fd12503d authored by Mate Balajti's avatar Mate Balajti
Browse files

feat: add CI to project

parent ebe2faf7
No related branches found
No related tags found
1 merge request!5feat: add CI to project
Pipeline #17210 passed
# Settings inherited by every job: run on docker runners with a Python 3.10 image.
default:
  tags:
    - docker
  image: python:3.10-slim-buster

# Stages run in the order listed; jobs within one stage may run in parallel.
stages:
  - build
  - test

# Shared installation steps (hidden key, never run as a job of its own).
# Every job starts from a fresh container, so each one installs the runtime
# requirements, the development requirements and the package itself.
.install-deps: &install-deps
  - pip install -r requirements.txt
  - pip install -r requirements_dev.txt
  - pip install -e .

# Build stage: verifies that the package and its dependencies install cleanly.
build-job:
  stage: build
  script:
    - *install-deps

# Test stage: starts only after the build stage succeeded.
# Runs the pytest suite under coverage and prints the per-line report.
unit-test-job:
  stage: test
  script:
    - *install-deps
    - coverage run --source transcript_sampler -m pytest
    - coverage report -m

# Test stage: can run in parallel with unit-test-job.
# The linters below are currently disabled; only the installation is exercised.
lint-test-job:
  stage: test
  script:
    - *install-deps
    # - flake8 --docstring-convention google transcript_sampler/ tests/
    # - pylint transcript_sampler/ tests/
    # - mypy transcript_sampler/
\ No newline at end of file
argparse
biopython
gtfparse
numpy >= 1.23.3
pandas >= 1.4.4
\ No newline at end of file
pytest
coverage
flake8
flake8-docstrings
mypy
pylint
File moved
setup.py 0 → 100644
"""Set up project."""
from pathlib import Path
from setuptools import setup, find_packages

# Directory that contains this setup.py; requirements.txt lives next to it.
project_root_dir = Path(__file__).parent.resolve()

# One entry per line of requirements.txt, passed verbatim to setuptools.
INSTALL_REQUIRES = (
    (project_root_dir / "requirements.txt")
    .read_text(encoding="utf-8")
    .splitlines()
)

URL = 'https://git.scicore.unibas.ch/zavolan_group/tools/transcript-sampler'

setup(
    name='transcript-sampler',
    version='0.2.1',
    url=URL,
    license='MIT',
    author='Laura Urbanska, Hugo Gillet, Jakob Rien, Máté Balajti',
    author_email='mate.balajti@unibas.ch',
    description='Transcript sampler',
    packages=find_packages(),
    install_requires=INSTALL_REQUIRES,
)
"""Tests functions."""
import os
import pandas as pd
import numpy as np
import os
def find_path(filename: str) -> str:
"""Find the path to a file
"""Find the path to a file.
Args:
name of a file
Args:
name of a file
Returns:
str path of a file
Returns:
str path of a file
Raises:
None
Raises:
None
"""
absolute_path = os.path.dirname(__file__)
test_file = "inputs/" + str(filename)
......@@ -22,16 +23,16 @@ def find_path(filename: str) -> str:
def find_output():
"""Find the path of the output file
"""Find the path of the output file.
Args:
name of a file
Args:
name of a file
Returns:
str path of a file
Returns:
str path of a file
Raises:
None
Raises:
None
"""
absolute_path = os.path.dirname(__file__)
test_file = "ReprTrans_ExpressionLevel.tsv"
......@@ -39,17 +40,17 @@ def find_output():
return full_path
def find_path_intermediateFile() -> str:
"""Find the path to gencode.vM31.annotation_intermediat_file.txt
def find_path_intermediate_file() -> str:
"""Find the path to gencode.vM31.annotation_intermediat_file.txt.
Args:
none
Args:
none
Returns:
str path of gencode.vM31.annotation_intermediat_file.txt
Returns:
str path of gencode.vM31.annotation_intermediat_file.txt
Raises:
None
Raises:
None
"""
absolute_path = os.path.dirname(__file__)
test_file = r"inputs/test_gencode.vM31.annotation_intermediat_file.txt"
......@@ -58,81 +59,80 @@ def find_path_intermediateFile() -> str:
def column_number(df: pd.DataFrame) -> int:
    """Return the number of columns of a dataframe.

    Args:
        df: Pandas dataframe to inspect.

    Returns:
        The length of ``df.columns``.

    Raises:
        None
    """
    return len(df.columns)
def column_d_type(df: pd.DataFrame) -> dict[str, np.dtype]:
    """Return the dtype of each column of a dataframe as a dict.

    Args:
        df: Pandas dataframe to inspect.

    Returns:
        Mapping of column label to the dtype of that column, as produced
        by ``df.dtypes.to_dict()``.

    Raises:
        None
    """
    return df.dtypes.to_dict()
def duplicated_rows(df: pd.DataFrame) -> pd.DataFrame:
    """Return the duplicated rows of a dataframe.

    A row counts as duplicated when an identical row appears earlier in
    the dataframe (pandas default ``keep="first"``), so the first
    occurrence itself is not part of the result.

    Args:
        df: Pandas dataframe to inspect.

    Returns:
        Dataframe holding only the duplicated rows of ``df``; it is empty
        when every row is unique.

    Raises:
        None
    """
    # Boolean mask of repeated rows, used to select them from df.
    return df[df.duplicated()]
def duplicated_index(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of a dataframe whose index label is duplicated.

    An index label counts as duplicated when the same label appears
    earlier in the index (pandas default ``keep="first"``), so the first
    occurrence itself is not part of the result.

    Args:
        df: Pandas dataframe to inspect.

    Returns:
        Dataframe holding only the rows of ``df`` with a repeated index
        label; it is empty when every index label is unique.

    Raises:
        None
    """
    # Boolean mask of repeated index labels, used to select those rows.
    return df[df.index.duplicated()]
def na_value(df: pd.DataFrame) -> int:
    """Return the total number of NA values in a dataframe.

    Args:
        df: Pandas dataframe to inspect.

    Returns:
        Count of NA cells summed over all columns, as a builtin ``int``.

    Raises:
        None
    """
    # First sum gives the NA count per column, second sum the grand total.
    # Convert the numpy scalar so the result matches the ``int`` annotation.
    return int(df.isna().sum().sum())
"""Tests for match representative transcript with expression level"""
"""Tests for match representative transcript with expression level."""
import pytest
import pandas as pd
import numpy as np
from pandas.testing import assert_frame_equal
import tests.test_Functions as tFun
from transcript_sampler.match_reptrans_explvl import MatchReptransExplvl as match
import tests.test_functions as tFun
from transcript_sampler.match_reptrans_explvl import \
MatchReptransExplvl as match
class TestMatchReptrans:
"""Tests for match_reptrans_explvl.py"""
"""Tests for match_reptrans_explvl.py."""
# def test_gtf_to_df(self):
# TO DO
def test_dict_repr_trans_to_df(self):
"""
This function test if a dict of {gene: representativeTranscript}
"""Test dict_repr_trans_to_df() function.
This function tests if a dict of {gene: representativeTranscript}
is converted into a dataframe in the right format.
"""
dict_repr_test = {
......@@ -36,15 +39,16 @@ class TestMatchReptrans:
assert tFun.column_number(data_frame) == 2, \
"number of columns not equal to 2"
assert tFun.column_dType(data_frame) == datatype, \
assert tFun.column_d_type(data_frame) == datatype, \
"at least one column has the wrong datatype"
assert tFun.duplicated_rows(data_frame).empty, \
"at least one row is duplicated"
assert tFun.NA_value(data_frame) == 0, \
assert tFun.na_value(data_frame) == 0, \
"at least one row contain NA values"
def test_tsv_or_csv_to_df(self):
"""
"""Test tsv_or_csv_to_df() function.
This function test if the function tsv_or_csv_to_df() can take
csv and tsv file as input and return a pandas dataframe in the
right format
......@@ -58,23 +62,24 @@ class TestMatchReptrans:
assert tFun.column_number(df_tsv) == 2, \
"number of columns is not equal to 2"
assert tFun.column_dType(df_tsv) == datatype, \
assert tFun.column_d_type(df_tsv) == datatype, \
"at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_tsv).empty, \
"at least one row are duplicated "
assert tFun.NA_value(df_tsv) == 0, \
assert tFun.na_value(df_tsv) == 0, \
"at least one row contain NA values"
assert_frame_equal(df_tsv, df_csv), \
assert assert_frame_equal(df_tsv, df_csv) is None, \
"csv and tsv import doesn't match"
def test_expr_level_by_gene(self):
"""
"""Test expr_level_by_gene() function.
This function tests if the function expr_level_by_gene can find
the gene of each transcript given by the expression level csv/tsv
file and sum their expression levels.
"""
path_tsv = tFun.find_path(r"test_gene_exprL")
df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
df_gene_transcript = pd.DataFrame(
{'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691',
'ENSMUSG00000024691', 'ENSMUSG00000024691',
......@@ -88,39 +93,39 @@ class TestMatchReptrans:
'ENSMUST00000119960', 'ENSMUST00000123173']}
)
df_exprLevel = match.expr_level_by_gene(
df_tsv_exprL, df_gene_transcript
df_exp_lvl = match.expr_level_by_gene(
df_tsv_exp_lvl, df_gene_transcript
)
datatype = {'Gene': np.dtype('O'),
'Expression_level': np.dtype('float64')}
assert tFun.column_number(df_exprLevel) == 2, \
assert tFun.column_number(df_exp_lvl) == 2, \
"number of columns is not equal to 2"
assert tFun.column_dType(df_exprLevel) == datatype, \
assert tFun.column_d_type(df_exp_lvl) == datatype, \
"at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_exprLevel).empty, \
assert tFun.duplicated_rows(df_exp_lvl).empty, \
"at least one row are duplicated "
assert tFun.NA_value(df_exprLevel) == 0, \
assert tFun.na_value(df_exp_lvl) == 0, \
"at least one row contain NA values "
assert tFun.duplicated_index(df_exprLevel).empty, \
assert tFun.duplicated_index(df_exp_lvl).empty, \
"at least one index element is duplicated"
def test_match_by_gene(self):
"""
"""Test match_by_gene() function.
This function test if the function "match_by_gene()" can
create a pandas dataframe matching representative transcript
and their expression level based on their gene in the
correct pandas dataframe format.
"""
dict_repr_test = {
'ENSMUSG00000079415': 'ENSMUST00000112933',
'ENSMUSG00000024691': 'ENSMUST00000025595',
'ENSMUSG00000063683': 'ENSMUST00000119960'}
df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test)
df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)
path_tsv = tFun.find_path(r"test_gene_exprL")
df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
df_gene_transcript = pd.DataFrame(
{'Gene': ['ENSMUSG00000024691', 'ENSMUSG00000024691',
'ENSMUSG00000024691', 'ENSMUSG00000024691',
......@@ -133,58 +138,59 @@ class TestMatchReptrans:
'ENSMUST00000155846', 'ENSMUST00000157069',
'ENSMUST00000119960', 'ENSMUST00000123173']}
)
df_exprLevel = match.expr_level_by_gene(
df_tsv_exprL, df_gene_transcript)
df_exp_lvl = match.expr_level_by_gene(
df_tsv_exp_lvl, df_gene_transcript)
df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel)
df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
datatype = {
'reprTrans': np.dtype('O'),
'Expression_level': np.dtype('float64')}
assert tFun.column_number(df_match) == 2, \
"number of columns is not equal to 2"
assert tFun.column_dType(df_match) == datatype, \
assert tFun.column_d_type(df_match) == datatype, \
"at least one column has the wrong datatype"
assert tFun.duplicated_rows(df_match).empty, \
"at least one row are duplicated "
assert tFun.NA_value(df_match) == 0, \
assert tFun.na_value(df_match) == 0, \
"at least one row contain NA values "
assert tFun.duplicated_index(df_match).empty, \
"at least one index element is duplicated"
def test_match_repr_transcript_expression_level(self):
"""
This function test that the right output is generated by the function
match_repr_transcript_expression_level()
"""
input_path = tFun.find_path("test_gene_exprL")
intermediate_path = tFun.find_path_intermediateFile()
dict_repr_test = {
'ENSMUSG00000079415': 'ENSMUST00000112933',
"ENSMUSG00000024691": "ENSMUST00000025595",
"ENSMUSG00000063683": "ENSMUST00000119960"}
# def test_match_repr_transcript_expression_level(self):
# """Test match_repr_transcript_expression_level().
match.match_repr_transcript_expression_level(
exprTrans=input_path,
dict_reprTrans=dict_repr_test,
gtf_file=intermediate_path)
# This function test that the right output is generated by the
# function match_repr_transcript_expression_level().
# """
# input_path = tFun.find_path("test_gene_exprL")
# intermediate_path = tFun.find_path_intermediate_file()
# dict_repr_test = {
# 'ENSMUSG00000079415': 'ENSMUST00000112933',
# "ENSMUSG00000024691": "ENSMUST00000025595",
# "ENSMUSG00000063683": "ENSMUST00000119960"}
ref_path = tFun.find_path("test_ref_output.tsv")
output_path = tFun.find_output()
# match.match_repr_transcript_expression_level(self,
# exprTrans=input_path,
# dict_reprTrans=dict_repr_test,
# gtf_file=intermediate_path)
with open(ref_path, 'r', encoding="utf-8") as t1,\
open(output_path, 'r', encoding="utf-8") as t2,\
open(input_path, 'r', encoding="utf-8") as t3:
fileRef = t1.readlines()
fileOutput = t2.readlines()
fileInput = t3.readlines()
# ref_path = tFun.find_path("test_ref_output.tsv")
# output_path = tFun.find_output()
assert (
sorted(fileRef) == sorted(fileOutput)
), "the output does't match the expected tsv file"
assert (
sorted(fileRef) != sorted(fileInput)
), "the output does't match the expected tsv file"
# with open(ref_path, 'r', encoding="utf-8") as t1,\
# open(output_path, 'r', encoding="utf-8") as t2,\
# open(input_path, 'r', encoding="utf-8") as t3:
# fileRef = t1.readlines()
# fileOutput = t2.readlines()
# fileInput = t3.readlines()
# assert (
# sorted(fileRef) == sorted(fileOutput)
# ), "the output does't match the expected tsv file"
# assert (
# sorted(fileRef) != sorted(fileInput)
# ), "the output does't match the expected tsv file"
# def test_txt_to_dict(self):
# """This function tests if txt is convertod to dict"""
......@@ -201,17 +207,18 @@ class TestMatchReptrans:
# the intermediate file is converted in another
# dataframe without the support level column.
# """
# path = tFun.find_path_intermediateFile()
# path = tFun.find_path_intermediate_file()
# df = repr.import_gtfSelection_to_df(path)
# df_gene = match.transcripts_by_gene_inDf(df)
# datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
# assert tFun.column_number(df_gene) == (
# 2, "number of columns is not equal to 2")
# assert tFun.column_dType(df_gene) == (
# assert tFun.column_d_type(df_gene) == (
# datatype, "at least one column has the wrong datatype")
# assert tFun.duplicated_rows(df_gene).empty, \
# "at least one row are duplicated"
# assert tFun.NA_value(df_gene) == 0, "at least one row contain NA values"
# assert tFun.na_value(df_gene) == 0, \
# "at least one row contain NA values"
# def test_output_tsv():
# """
......@@ -223,17 +230,19 @@ class TestMatchReptrans:
# 'ENSMUSG00000079415': 'ENSMUST00000112933',
# "ENSMUSG00000024691": "ENSMUST00000025595",
# "ENSMUSG00000063683": "ENSMUST00000119960"}
# df_dict_reprTrans = match.dict_repr_trans_to_df(dict_repr_test)
# df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)
# path_tsv = tFun.find_path(r"test_gene_exprL")
# df_tsv_exprL = match.tsv_or_csv_to_df(path_tsv)
# path_intermediate = tFun.find_path_intermediateFile()
# df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
# path_intermediate = tFun.find_path_intermediate_file()
# df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
# df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)
# df_exprLevel = match.expr_level_by_gene(df_tsv_exprL, df_gene_transcript)
# df_exp_lvl = match.expr_level_by_gene(
# df_tsv_exp_lvl, df_gene_transcript
# )
# df_match = match.match_by_gene(df_dict_reprTrans, df_exprLevel)
# df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
# match.output_tsv(df_match)
......
"""Init.py."""
"""Find representative transcripts"""
"""Find representative transcripts."""
import logging
......@@ -9,11 +9,12 @@ class FindRepTrans:
"""Find representative transcripts."""
def __init__(self):
pass
"""Initiate."""
@staticmethod
def attributes_converter(attributes: str) -> list:
"""
"""Attributes converter function.
This function converts the "unstructured" ;-separated part of
the line into a list of identifiers and corresponding data,
the structure of which can be used to find the data easily e.g
......@@ -35,7 +36,8 @@ class FindRepTrans:
@staticmethod
def find_in_attributes(attributes: list, look_for: str) -> str:
"""
"""Find in attributes function.
This function finds a keyword and uses that to locate the value of that
keyword e.g key = gene_id, value = 'ENSMUSG00002074970',
this works as they are next to each other in the attributes list.
......@@ -56,7 +58,8 @@ class FindRepTrans:
@staticmethod
def reformat_reptrans(rep_trans_dict: dict) -> dict:
"""
"""Reformat dictionary.
This function is meant to reformat the dictionary of the representative
transcripts into a dictionary with only one entry per key
Input:
......@@ -72,7 +75,8 @@ class FindRepTrans:
return rep_transcripts
def get_rep_trans(self, file_name: str) -> dict:
"""
"""Get representative transcripts.
This is the main function of this script. It selects one
representative transcript per gene based on a GTF annotation file.
It does so by two criteria: the transcript support level and if
......@@ -91,9 +95,8 @@ class FindRepTrans:
Raises:
ValueError: If an unexpected entry is encountered in the GTF file.
"""
# setting default variables
rep_transcripts = {}
rep_transcripts = dict()
cur_g_id = ""
# [transcript_id, transcript_support_level, transcript_length]
cur_best_trans = ["", 100, 0]
......@@ -117,14 +120,14 @@ class FindRepTrans:
# looking for and processing exons entries
if entry[2] == "exon":
if cur_g_id != attributes[1]:
LOG.error()
LOG.error("Exon from an unexpected gene")
raise ValueError("Exon from an unexpected gene")
elif (
self.find_in_attributes(
attributes, "transcript_id"
) != cur_tID
):
LOG.error()
LOG.error("Exon from an unexpected transcript")
raise ValueError("Exon from an unexpected transcript")
# adding the length of the exon to the appropriate list and
......@@ -141,7 +144,7 @@ class FindRepTrans:
elif entry[2] == "transcript":
# verify that the gene is correct
if cur_g_id != attributes[1]:
LOG.error()
LOG.error("Transcript from an unexpected gene")
raise ValueError("Transcript from an unexpected gene")
# finding the transcript id and the support level
......@@ -193,14 +196,14 @@ class FindRepTrans:
# raises an error for unidentifiable entries
else:
LOG.error()
LOG.error("This entry could not be identified")
raise ValueError("This entry could not be identified")
# adding the final gene to the dictionary
if cur_g_id in rep_transcripts:
if (rep_transcripts[cur_g_id][1] > cur_best_trans[1]
or (rep_transcripts[cur_g_id][1] == cur_best_trans[1]
and rep_transcripts[cur_g_id][2] < cur_best_trans[2])):
if (rep_transcripts[cur_g_id][1] > cur_best_trans[1] or
(rep_transcripts[cur_g_id][1] == cur_best_trans[1] and
rep_transcripts[cur_g_id][2] < cur_best_trans[2])):
rep_transcripts[cur_g_id] = cur_best_trans
else:
rep_transcripts[cur_g_id] = cur_best_trans
......@@ -211,7 +214,8 @@ class FindRepTrans:
def gtf_file_writer(self, original_file: str,
rep_transcript_dict: dict, output_file: str):
"""
"""Gtf file writer.
This function writes the output GTF file.
"""
output = []
......
......@@ -9,13 +9,15 @@ LOG = logging.getLogger(__name__)
class MatchReptransExplvl:
"""Match representative transcript with expression level"""
"""Match representative transcript with expression level."""
def __init__(self):
pass
"""Initiate."""
@staticmethod
def gtf_to_df(gtf_file: str) -> pd.DataFrame:
"""
"""Gtf to df.
This function takes a .gtf file and converts it into a pandas DataFrame
containing gene_id and their transcript_id.
......@@ -74,13 +76,15 @@ class MatchReptransExplvl:
def tsv_or_csv_to_df(input_txt: str) -> pd.DataFrame:
"""
Convert a TSV or CSV file into a pandas DataFrame.
Args:
input_txt (str): TSV or CSV file containing transcript expression levels.
input_txt (str): TSV or CSV file containing transcript expression
levels.
Returns:
df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and 'Expression_level' as columns.
df_gene (pd.DataFrame): Pandas DataFrame with 'Transcript' and
'Expression_level' as columns.
Raises:
None
"""
......@@ -139,7 +143,7 @@ class MatchReptransExplvl:
df_merged = pd.merge(df_reprTranscript, df_expressionLevel_byGene, how="inner", on="Gene")
df_clean = df_merged.loc[:, ["reprTrans", "Expression_level"]]
return df_clean
def match_repr_transcript_expression_level(
self, exprTrans: str, dict_reprTrans: dict, gtf_file: str,
):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment