Commit 60430a41 authored by Mate Balajti

refactor: update gitignore, requirements, cli

parent 3a28cd35
1 merge request: !7 feat: add tests
Pipeline #17349 failed

.gitignore (the hand-written ignore rules are replaced by the generated macOS/VS Code template; the project-specific entries move to the end):

-# ignore ALL .log files
-*.log
-# ignore ALL files in ANY directory named temp
-temp/
-__pycache__
-output_files
+# Created by https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
+# Edit at https://www.toptal.com/developers/gitignore?templates=macos,visualstudiocode
+### macOS ###
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+### macOS Patch ###
+# iCloud generated files
+*.icloud
+### VisualStudioCode ###
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+# Local History for Visual Studio Code
+.history/
+# Built Visual Studio Code Extensions
+*.vsix
+### VisualStudioCode Patch ###
+# Ignore all local history of files
+.history
+.ionide
+.vscode
+# End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
+__pycache__/
 *_cache
 *egg-info/
 .coverage
 build/
 */play.py
+*.log
+temp/
+output_files

requirements (the polars pin is the line this diff touches):

argparse
biopython
gtfparse
polars == 0.16.17
numpy >= 1.23.3
pandas >= 1.4.4

Transcript expression test data (values switch from whitespace-separated to comma-separated):

-ENST00000472194 0.8914783511010855
-ENST00000308647 1.0887715239511602
-ENST00000442483 0.8381441606416928
-ENST00000511072 0.9145581387636652
+ENST00000472194,0.8914783511010855
+ENST00000308647,1.0887715239511602
+ENST00000442483,0.8381441606416928
+ENST00000511072,0.9145581387636652
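
The tests below read this kind of file through match.tsv_or_csv_to_df, whose implementation is not part of this diff. A minimal sketch of a loader that accepts either separator, with a hypothetical function name and the id/level column names used elsewhere in the repository:

# Illustrative sketch only; the real tsv_or_csv_to_df is not shown in this commit.
import pandas as pd


def load_expression_table(path):
    """Read a two-column transcript/expression file, comma- or whitespace-separated."""
    with open(path, "r", encoding="utf-8") as handle:
        first_line = handle.readline()
    # Pick the separator from the first line: comma for CSV, any whitespace otherwise.
    sep = "," if "," in first_line else r"\s+"
    return pd.read_csv(path, sep=sep, header=None,
                       names=["id", "level"], engine="python")

Either fixture format above would then yield the same two-column table.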

Test module for matching representative transcripts to expression levels (new version shown; the previously commented-out tests are re-enabled):

@@ -157,105 +157,107 @@ class TestMatchReptrans:
        assert tFun.duplicated_index(df_match).empty, \
            "at least one index element is duplicated"

    def test_match_repr_transcript_expression_level(self):
        """Test match_repr_transcript_expression_level().

        This function tests that the right output is generated by the
        function match_repr_transcript_expression_level().
        """
        input_path = tFun.find_path("test_gene_exprL")
        intermediate_path = tFun.find_path_intermediate_file()
        dict_repr_test = {
            'ENSMUSG00000079415': 'ENSMUST00000112933',
            "ENSMUSG00000024691": "ENSMUST00000025595",
            "ENSMUSG00000063683": "ENSMUST00000119960"}
        match.match_repr_transcript_expression_level(
            self,
            exprTrans=input_path,
            dict_reprTrans=dict_repr_test,
            gtf_file=intermediate_path
        )

        ref_path = tFun.find_path("test_ref_output.tsv")
        output_path = tFun.find_output()

        with open(ref_path, 'r', encoding="utf-8") as t1,\
                open(output_path, 'r', encoding="utf-8") as t2,\
                open(input_path, 'r', encoding="utf-8") as t3:
            fileRef = t1.readlines()
            fileOutput = t2.readlines()
            fileInput = t3.readlines()

        assert (
            sorted(fileRef) == sorted(fileOutput)
        ), "the output doesn't match the expected tsv file"
        assert (
            sorted(fileRef) != sorted(fileInput)
        ), "the output should differ from the input file"

    def test_txt_to_dict(self):
        """This function tests if a txt file is converted to a dict."""
        path = tFun.find_path("test_dict_repr_trans.txt")
        dico = match.txt_to_dict(path)
        dict_test = {'ENSMUSG00000079415': 'ENSMUST00000112933',
                     "ENSMUSG00000024691": "ENSMUST00000025595",
                     "ENSMUSG00000063683": "ENSMUST00000119960"}
        assert dico == dict_test

    def test_transcripts_by_gene_inDf(self):
        """
        This function tests if a dataframe generated from
        the intermediate file is converted into another
        dataframe without the support level column.
        """
        path = tFun.find_path_intermediate_file()
        df = repr.import_gtfSelection_to_df(path)
        df_gene = match.transcripts_by_gene_inDf(df)
        datatype = {'Gene': np.dtype('O'), 'Transcript': np.dtype('O')}
        assert tFun.column_number(df_gene) == (
            2, "number of columns is not equal to 2")
        assert tFun.column_d_type(df_gene) == (
            datatype, "at least one column has the wrong datatype")
        assert tFun.duplicated_rows(df_gene).empty, \
            "at least one row is duplicated"
        assert tFun.na_value(df_gene) == 0, \
            "at least one row contains NA values"

    def test_output_tsv(self):
        """
        This function tests if a tsv file is generated from a pandas
        dataframe in the right format.
        """
        dict_repr_test = {
            'ENSMUSG00000079415': 'ENSMUST00000112933',
            "ENSMUSG00000024691": "ENSMUST00000025595",
            "ENSMUSG00000063683": "ENSMUST00000119960"}
        df_dict_repr_trans = match.dict_repr_trans_to_df(dict_repr_test)

        path_tsv = tFun.find_path(r"test_gene_exprL")
        df_tsv_exp_lvl = match.tsv_or_csv_to_df(path_tsv)
        path_intermediate = tFun.find_path_intermediate_file()
        df_intermediate = repr.import_gtfSelection_to_df(path_intermediate)
        df_gene_transcript = match.transcripts_by_gene_inDf(df_intermediate)

        df_exp_lvl = match.expr_level_by_gene(
            df_tsv_exp_lvl, df_gene_transcript
        )

        df_match = match.match_by_gene(df_dict_repr_trans, df_exp_lvl)
        match.output_tsv(df_match)

        ref_path = tFun.find_path("test_ref_output.tsv")
        output_path = tFun.find_output()

        with open(ref_path, 'r') as t1, open(output_path, 'r') as t2:
            fileRef = t1.readlines()
            fileOutput = t2.readlines()

        assert (
            sorted(fileRef) == sorted(fileOutput)
        ), "the output doesn't match the expected tsv file"

    # test_dict_repr_trans_to_df()
    # test_txt_to_dict()

CLI entry point (new version shown; the options gain short flags and the CSV input now comes first):

@@ -25,31 +25,31 @@ def main():
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-ic", "--input_csv", required=True, default=None,
        help="CSV or TSV file with transcripts and their expression level"
    )
    parser.add_argument(
        "-ig", "--input_gtf", required=True, default=None,
        help="GTF file with genome annotation"
    )
    parser.add_argument(
        "-oc", "--output_csv", required=True, default=None,
        help="Output path for the new CSV file of representative transcripts "
             "and their sampled number"
    )
    parser.add_argument(
        "-og", "--output_gtf", required=True, default=None,
        help="Output path for the new GTF file of representative transcripts"
    )
    parser.add_argument(
        "-n", "--n_to_sample", required=True, default=None,
        help="Total number of transcripts to sample"
    )
    args = parser.parse_args()

    log = logging.getLogger("main")
    start = time.time()
    log.info("Started transcript sampler.")

    dict_repr_trans = find_rep_trans.get_rep_trans(args.input_gtf)
    df_repr = match_reptrs_explvl.match_repr_transcript_expression_level(
        dict_reprTrans=dict_repr_trans,
@@ -57,20 +57,20 @@ def main():
        gtf_file=args.input_gtf
    )
    log.info(
        "Finding match between representative transcripts "
        "and expression level file..."
    )
    log.info("Poisson sampling of transcripts...")
    poisson_sample.transcript_sampling(
        args.n_to_sample, df_repr, args.output_csv)
    log.info("Output CSV file ready.")
    log.info("Writing output GTF file...")
    find_rep_trans.gtf_file_writer(
        args.input_gtf, dict_repr_trans, args.output_gtf)
    end = time.time()
    log.info("Script executed in %s sec.", round(end - start, 2))


if __name__ == "__main__":
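
With the renamed options, a call to the sampler could look roughly like the following; only the flags and their meanings come from this commit, while the script name and file paths are placeholders:

# Hypothetical invocation; "cli.py" and the file names are placeholders.
import subprocess

subprocess.run(
    [
        "python", "cli.py",
        "-ic", "expression_levels.csv",    # transcripts and their expression level (CSV or TSV)
        "-ig", "annotation.gtf",           # genome annotation
        "-oc", "sampled_transcripts.csv",  # representative transcripts and sampled numbers
        "-og", "representative.gtf",       # GTF of representative transcripts
        "-n", "1000",                      # total number of transcripts to sample
    ],
    check=True,
)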
"""Sample transcripts by Poisson-sampling""" """Sample transcripts by Poisson-sampling."""
import pandas as pd import pandas as pd
import numpy as np import numpy as np
# pylint: disable=R0903
class SampleTranscript: class SampleTranscript:
''' """Sample transcript.
Sample transcript
This part of the code does Poisson sampling proportionally This part of the code does Poisson sampling proportionally
to gene expression levels for each gene. to gene expression levels for each gene.
...@@ -17,10 +17,11 @@ class SampleTranscript: ...@@ -17,10 +17,11 @@ class SampleTranscript:
output: csv file with gene id and count output: csv file with gene id and count
gtf file with transcript samples gtf file with transcript samples
''' """
@staticmethod @staticmethod
def transcript_sampling(total_transcript_number, df_repr, output_csv): def transcript_sampling(total_transcript_number, df_repr, output_csv):
"""Samples transcript based on Poisson-sampling""" """Sample transcript based on Poisson-sampling."""
total = df_repr["level"].sum() total = df_repr["level"].sum()
total_transcript_number = int(total_transcript_number) total_transcript_number = int(total_transcript_number)
normalized = total_transcript_number / total normalized = total_transcript_number / total
...@@ -28,7 +29,7 @@ class SampleTranscript: ...@@ -28,7 +29,7 @@ class SampleTranscript:
transcript_numbers = pd.DataFrame({ transcript_numbers = pd.DataFrame({
"id": df_repr["id"], "count": levels "id": df_repr["id"], "count": levels
}) })
transcript_numbers.to_csv(output_csv, index=False) transcript_numbers.to_csv(output_csv, index=False, header=False)
# python_version = "3.7.13" # python_version = "3.7.13"
......
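
The line that actually draws the Poisson counts falls outside the hunks shown above. A minimal sketch of the full step, assuming the hidden line is a plain numpy Poisson draw on the normalized expression levels (the exact call is not visible in this diff):

import numpy as np
import pandas as pd


def poisson_sample_counts(df_repr, total_transcript_number):
    """Draw per-transcript counts proportional to expression level (sketch)."""
    total = df_repr["level"].sum()
    # Scale levels so the expected total equals the requested number of transcripts.
    normalized = int(total_transcript_number) / total
    levels = np.random.poisson(df_repr["level"] * normalized)
    return pd.DataFrame({"id": df_repr["id"], "count": levels})


# Example: with levels 3.0 and 1.0 and 100 requested transcripts,
# the counts average roughly 75 and 25.
# demo = pd.DataFrame({"id": ["ENST01", "ENST02"], "level": [3.0, 1.0]})
# print(poisson_sample_counts(demo, 100))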