Skip to content
Snippets Groups Projects
Commit d4e83766 authored by Eric Boittier's avatar Eric Boittier
Browse files

clean up

parent a884c9a7
No related branches found
No related tags found
No related merge requests found
Pipeline #14905 failed
...@@ -6,6 +6,5 @@ default: ...@@ -6,6 +6,5 @@ default:
my_tests: my_tests:
# Good to put a small description here of what this job does # Good to put a small description here of what this job does
script: script:
- pip install -r requirements_dev.txt - pip install -e -r requirements_dev.txt
- black cdna
- pylint cdna - pylint cdna
...@@ -45,6 +45,9 @@ def seq_compliment(sequence: str) -> str: ...@@ -45,6 +45,9 @@ def seq_compliment(sequence: str) -> str:
class CDNAGen: class CDNAGen:
"""
Module that performs the cDNA synthesis.
"""
def __init__(self, ifasta: str, igtf: str, icpn: str, ofasta: str, ocsv: str): def __init__(self, ifasta: str, igtf: str, icpn: str, ofasta: str, ocsv: str):
# inputs # inputs
self.fasta = ifasta self.fasta = ifasta
...@@ -56,7 +59,7 @@ class CDNAGen: ...@@ -56,7 +59,7 @@ class CDNAGen:
# variables # variables
self.fasta_dict = None self.fasta_dict = None
self.fasta_records = None self.fasta_records = None
self.df_input_GTF = None self.gtf_df = None
self.run() self.run()
def run(self) -> None: def run(self) -> None:
...@@ -77,7 +80,7 @@ class CDNAGen: ...@@ -77,7 +80,7 @@ class CDNAGen:
def add_records(self) -> None: def add_records(self) -> None:
self.fasta_records = [] self.fasta_records = []
for index, row in self.df_input_GTF.iterrows(): for index, row in self.gtf_df.iterrows():
if row["compliment"] is not None: if row["compliment"] is not None:
copy_number = row["Transcript_Copy_Number"] copy_number = row["Transcript_Copy_Number"]
record = SeqRecord( record = SeqRecord(
...@@ -94,7 +97,7 @@ class CDNAGen: ...@@ -94,7 +97,7 @@ class CDNAGen:
Returns: None Returns: None
""" """
self.df_input_GTF["priming_site"] = self.df_input_GTF.apply( self.gtf_df["priming_site"] = self.gtf_df.apply(
lambda row: self.read_primingsite(row["seqname"], row["start"]), lambda row: self.read_primingsite(row["seqname"], row["start"]),
axis=1, axis=1,
) )
...@@ -105,7 +108,7 @@ class CDNAGen: ...@@ -105,7 +108,7 @@ class CDNAGen:
Returns: None Returns: None
""" """
self.df_input_GTF["compliment"] = self.df_input_GTF["priming_site"].apply( self.gtf_df["compliment"] = self.gtf_df["priming_site"].apply(
lambda x: seq_compliment(x) lambda x: seq_compliment(x)
) )
...@@ -147,11 +150,11 @@ class CDNAGen: ...@@ -147,11 +150,11 @@ class CDNAGen:
Returns: None Returns: None
""" """
df_input_CSV = pd.read_csv(self.cpn, index_col=False) df_csv = pd.read_csv(self.cpn, index_col=False)
df_input_CSV = ( df_csv = (
df_input_CSV.reset_index() df_csv.reset_index()
) # make sure indexes pair with number of rows ) # make sure indexes pair with number of rows
self.df_input_CSV = df_input_CSV self.csv_df = df_csv
def read_gtf(self) -> None: def read_gtf(self) -> None:
"""Read and process the GTF file. """Read and process the GTF file.
...@@ -163,43 +166,43 @@ class CDNAGen: ...@@ -163,43 +166,43 @@ class CDNAGen:
""" """
# returns GTF with essential columns such as "feature", "seqname", "start", "end" # returns GTF with essential columns such as "feature", "seqname", "start", "end"
# alongside the names of any optional keys which appeared in the attribute column # alongside the names of any optional keys which appeared in the attribute column
df_input_GTF = read_gtf(self.gtf) gtf_df = read_gtf(self.gtf)
df_input_GTF["Binding_Probability"] = pd.to_numeric( gtf_df["Binding_Probability"] = pd.to_numeric(
df_input_GTF["Binding_Probability"] gtf_df["Binding_Probability"]
) # convert to numeric ) # convert to numeric
df_normalization_bind_probablility = df_input_GTF.groupby("seqname")[ df_normalization_bind_probablility = gtf_df.groupby("seqname")[
"Binding_Probability" "Binding_Probability"
].sum() # extract binding probablility ].sum() # extract binding probability
count = 0 count = 0
prev_id = None prev_id = None
# Adds Normalized_Binding_Probability and Transcript_Copy_Number to each transcript in the dataframe # Adds Normalized_Binding_Probability and Transcript_Copy_Number to each transcript in the dataframe
for index, row in df_input_GTF.iterrows(): for index, row in gtf_df.iterrows():
# GTF transcript ID # GTF transcript ID
id_GTF = str(row["seqname"]) id_ = str(row["seqname"])
if id_GTF == prev_id: if id_ == prev_id:
count += 1 count += 1
else: else:
prev_id = None prev_id = None
count = 0 count = 0
# CSV transcript ID # CSV transcript ID
id_CSV = str(row["seqname"]).split("_")[1] id_csv = str(row["seqname"]).split("_")[1]
# Calculate Normalized_Binding_Probability and add to GTF dataframe # Calculate Normalized_Binding_Probability and add to GTF dataframe
df_input_GTF.loc[index, "Normalized_Binding_Probability"] = ( gtf_df.loc[index, "Normalized_Binding_Probability"] = (
row["Binding_Probability"] / df_normalization_bind_probablility[id_GTF] row["Binding_Probability"] / df_normalization_bind_probablility[id_]
) )
# Calculate Transcript_Copy_Number and add to GTF dataframe # Calculate Transcript_Copy_Number and add to GTF dataframe
csv_transcript_copy_number = self.df_input_CSV.loc[ csv_transcript_copy_number = self.csv_df.loc[
self.df_input_CSV["ID of transcript"] == int(id_CSV), self.csv_df["ID of transcript"] == int(id_csv),
"Transcript copy number", "Transcript copy number",
].iloc[0] ].iloc[0]
df_input_GTF.loc[index, "Transcript_Copy_Number"] = round( gtf_df.loc[index, "Transcript_Copy_Number"] = round(
csv_transcript_copy_number csv_transcript_copy_number
* df_input_GTF.loc[index, "Normalized_Binding_Probability"] * gtf_df.loc[index, "Normalized_Binding_Probability"]
) )
df_input_GTF.loc[index, "cdna_ID"] = f"{id_GTF}_{count}" gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}"
prev_id = id_GTF prev_id = id_
self.df_input_GTF = df_input_GTF self.gtf_df = gtf_df
def write_fasta(self) -> None: def write_fasta(self) -> None:
"""Writes cDNA fasta records to file. """Writes cDNA fasta records to file.
...@@ -220,7 +223,7 @@ class CDNAGen: ...@@ -220,7 +223,7 @@ class CDNAGen:
Returns: None Returns: None
""" """
self.df_input_GTF[["cdna_ID", "Transcript_Copy_Number"]].to_csv( self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]].to_csv(
self.output_csv, index=False self.output_csv, index=False
) )
print(f"Copy number csv file successfully written to: {self.output_csv}") print(f"Copy number csv file successfully written to: {self.output_csv}")
...@@ -4,7 +4,14 @@ import logging ...@@ -4,7 +4,14 @@ import logging
from cdna import CDNAGen from cdna import CDNAGen
def parser(): def parser() -> None:
""" Parser for cDNA generator
Parses command line arguments for cDNA generation.
Returns: None
"""
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
prog="cDNA generator", prog="cDNA generator",
description="Generate cDNA sequences based on primer probabilities.", description="Generate cDNA sequences based on primer probabilities.",
...@@ -48,4 +55,4 @@ if __name__ == "__main__": ...@@ -48,4 +55,4 @@ if __name__ == "__main__":
print("**********************") print("**********************")
print("Running cDNA generator") print("Running cDNA generator")
print("**********************") print("**********************")
cnda_object = parser() parser()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment