diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 61961d58c63a045ccb55e7426f53d6001180ccad..304d7c4aaf0ab0320455d66fc3f3e61a6a519f57 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,6 +6,5 @@ default: my_tests: # Good to put a small description here of what this job does script: - - pip install -r requirements_dev.txt - - black cdna + - pip install -e -r requirements_dev.txt - pylint cdna diff --git a/cdna/cdna.py b/cdna/cdna.py index 326d90584a7d616329e0191378142d423bc61115..c88b39188b859fa0315b4585154405b2895c3ff5 100644 --- a/cdna/cdna.py +++ b/cdna/cdna.py @@ -45,6 +45,9 @@ def seq_compliment(sequence: str) -> str: class CDNAGen: + """ + Module that performs the cDNA synthesis. + """ def __init__(self, ifasta: str, igtf: str, icpn: str, ofasta: str, ocsv: str): # inputs self.fasta = ifasta @@ -56,7 +59,7 @@ class CDNAGen: # variables self.fasta_dict = None self.fasta_records = None - self.df_input_GTF = None + self.gtf_df = None self.run() def run(self) -> None: @@ -77,7 +80,7 @@ class CDNAGen: def add_records(self) -> None: self.fasta_records = [] - for index, row in self.df_input_GTF.iterrows(): + for index, row in self.gtf_df.iterrows(): if row["compliment"] is not None: copy_number = row["Transcript_Copy_Number"] record = SeqRecord( @@ -94,7 +97,7 @@ class CDNAGen: Returns: None """ - self.df_input_GTF["priming_site"] = self.df_input_GTF.apply( + self.gtf_df["priming_site"] = self.gtf_df.apply( lambda row: self.read_primingsite(row["seqname"], row["start"]), axis=1, ) @@ -105,7 +108,7 @@ class CDNAGen: Returns: None """ - self.df_input_GTF["compliment"] = self.df_input_GTF["priming_site"].apply( + self.gtf_df["compliment"] = self.gtf_df["priming_site"].apply( lambda x: seq_compliment(x) ) @@ -147,11 +150,11 @@ class CDNAGen: Returns: None """ - df_input_CSV = pd.read_csv(self.cpn, index_col=False) - df_input_CSV = ( - df_input_CSV.reset_index() + df_csv = pd.read_csv(self.cpn, index_col=False) + df_csv = ( + df_csv.reset_index() ) # make sure indexes pair with number of rows - self.df_input_CSV = df_input_CSV + self.csv_df = df_csv def read_gtf(self) -> None: """Read and process the GTF file. @@ -163,43 +166,43 @@ class CDNAGen: """ # returns GTF with essential columns such as "feature", "seqname", "start", "end" # alongside the names of any optional keys which appeared in the attribute column - df_input_GTF = read_gtf(self.gtf) - df_input_GTF["Binding_Probability"] = pd.to_numeric( - df_input_GTF["Binding_Probability"] + gtf_df = read_gtf(self.gtf) + gtf_df["Binding_Probability"] = pd.to_numeric( + gtf_df["Binding_Probability"] ) # convert to numeric - df_normalization_bind_probablility = df_input_GTF.groupby("seqname")[ + df_normalization_bind_probablility = gtf_df.groupby("seqname")[ "Binding_Probability" - ].sum() # extract binding probablility + ].sum() # extract binding probability count = 0 prev_id = None # Adds Normalized_Binding_Probability and Transcript_Copy_Number to each transcript in the dataframe - for index, row in df_input_GTF.iterrows(): + for index, row in gtf_df.iterrows(): # GTF transcript ID - id_GTF = str(row["seqname"]) - if id_GTF == prev_id: + id_ = str(row["seqname"]) + if id_ == prev_id: count += 1 else: prev_id = None count = 0 # CVS transcript ID - id_CSV = str(row["seqname"]).split("_")[1] + id_csv = str(row["seqname"]).split("_")[1] # Calculate Normalized_Binding_Probability and add to GTF dataframe - df_input_GTF.loc[index, "Normalized_Binding_Probability"] = ( - row["Binding_Probability"] / df_normalization_bind_probablility[id_GTF] + gtf_df.loc[index, "Normalized_Binding_Probability"] = ( + row["Binding_Probability"] / df_normalization_bind_probablility[id_] ) # Calculate Normalized_Binding_Probability and add to GTF dataframe - csv_transcript_copy_number = self.df_input_CSV.loc[ - self.df_input_CSV["ID of transcript"] == int(id_CSV), + csv_transcript_copy_number = self.csv_df.loc[ + self.csv_df["ID of transcript"] == int(id_csv), "Transcript copy number", ].iloc[0] - df_input_GTF.loc[index, "Transcript_Copy_Number"] = round( + gtf_df.loc[index, "Transcript_Copy_Number"] = round( csv_transcript_copy_number - * df_input_GTF.loc[index, "Normalized_Binding_Probability"] + * gtf_df.loc[index, "Normalized_Binding_Probability"] ) - df_input_GTF.loc[index, "cdna_ID"] = f"{id_GTF}_{count}" - prev_id = id_GTF + gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}" + prev_id = id_ - self.df_input_GTF = df_input_GTF + self.gtf_df = gtf_df def write_fasta(self) -> None: """Writes cDNA fasta records to file. @@ -220,7 +223,7 @@ class CDNAGen: Returns: None """ - self.df_input_GTF[["cdna_ID", "Transcript_Copy_Number"]].to_csv( + self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]].to_csv( self.output_csv, index=False ) print(f"Copy number csv file successfully written to: {self.output_csv}") diff --git a/cdna/cli.py b/cdna/cli.py index e8e2cd7e3ab674cf8ed90a7b8b2c8cdb077bbeac..daf4ebfa6f28756705687aa161f5a633476af8f3 100644 --- a/cdna/cli.py +++ b/cdna/cli.py @@ -4,7 +4,14 @@ import logging from cdna import CDNAGen -def parser(): +def parser() -> None: + """ Parser for cDNA generator + + Parses command line arguments for cDNA generation. + + Returns: None + + """ parser = argparse.ArgumentParser( prog="cDNA generator", description="Generate cDNA sequences based on primer probabilities.", @@ -48,4 +55,4 @@ if __name__ == "__main__": print("**********************") print("Running cDNA generator") print("**********************") - cnda_object = parser() + parser()