"""Command-line interface client."""

import argparse


def main(argv=None) -> None:
    """Entry point for CLI executable.

    Parses the three input paths, runs the cDNA generation and prints
    ``Done`` when finished.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is the
            behaviour of the original zero-argument ``main()``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when one of the
            required options is missing.
    """
    parser = argparse.ArgumentParser(description="cDNA generator")

    # All three inputs are mandatory: without ``required=True`` a missing
    # option is passed on as ``None`` and only fails later inside ``open``.
    parser.add_argument(
        "-rna",
        type=str,
        metavar="",
        required=True,
        help="Path file to fasta file with RNA sequence")
    parser.add_argument(
        "-gtf",
        type=str,
        metavar="",
        required=True,
        help="Path file to gtf file")
    parser.add_argument(
        "-cnr",
        type=str,
        metavar="",
        required=True,
        help="Path file to copy number file")

    args = parser.parse_args(argv)

    # Imported after argument parsing so that usage errors are reported
    # even when the package cannot be imported.
    # ``generatecDNA`` can resolve either to the package (whose __init__
    # only defines __version__) or, when this script is run from inside
    # the package directory, to the sibling module itself; support both.
    try:
        from generatecDNA.generatecDNA import GeneratecDNA
    except ImportError:
        from generatecDNA import GeneratecDNA

    generator = GeneratecDNA(
        fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
    generator.generatecDNA(
        fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
    print("Done")


if __name__ == '__main__':
    main()
import logging
import random
from collections import Counter

logger = logging.getLogger(__name__)


class GeneratecDNA:
    """Generate cDNA copies of RNA transcripts from priming sites.

    Args:
        fastaFile: path to fasta-file (RNA_ID & RNA_Seq)
        gtf: path to gtf-file (RNA_ID & Priming sites & Probability)
        cp_nr: path to csv-file (RNA_ID & copy number)

    Attributes:
        fastaFile: RNA_ID & RNA_Seq
        gtf: RNA_ID & Priming sites & Probability
        cp_nr: RNA_ID & copy number
    """

    # Priming sites whose start coordinate is <= this value are discarded.
    MIN_PRIMING_START = 40
    # RNA base -> complementary DNA base.
    _COMPLEMENT = str.maketrans("AUGC", "TACG")
    _RNA_BASES = frozenset("AUGC")

    def __init__(self, fastaFile, gtf, cp_nr) -> None:
        """Store the three input paths."""
        self.fastaFile = fastaFile
        self.gtf = gtf
        self.cp_nr = cp_nr

    @staticmethod
    def _read_priming_sites(gtf_path):
        """Return {seqname: [[start, end, score], ...]} read from a GTF."""
        sites = {}
        with open(gtf_path, 'r') as handle:
            for line in handle:
                # Blank lines and '#' comment lines carry no feature.
                if not line.strip() or line.startswith('#'):
                    continue
                fields = line.split('\t')
                sites.setdefault(fields[0], []).append(
                    [int(fields[3]), int(fields[4]), float(fields[5])])
        return sites

    @staticmethod
    def _read_copy_numbers(csv_path):
        """Return {transcript_id: copy number} read from the CSV file.

        The ID is taken from the first column and the count from the
        third one. A repeated ID overwrites the earlier entry.
        """
        counts = {}
        with open(csv_path, 'r') as handle:
            for line in handle:
                if not line.strip():
                    continue
                fields = line.split(',')
                counts[fields[0]] = int(fields[2])
        return counts

    @staticmethod
    def _read_fasta(fasta_path):
        """Return {sequence_id: sequence} read from a FASTA file.

        Sequences may span several lines; the lines are concatenated
        without surrounding whitespace. Lines starting with ';' and blank
        lines are skipped, as is anything before the first header.

        Raises:
            AssertionError: a header is not followed by any sequence.
        """
        records = {}
        current_id = None
        chunks = []
        with open(fasta_path, 'r') as handle:
            for raw in handle:
                if not raw.strip() or raw.startswith(';'):
                    continue
                if raw.startswith('>'):
                    if current_id is not None:
                        if not chunks:
                            raise AssertionError(
                                "FASTA: No Sequence after headline")
                        records[current_id] = "".join(chunks)
                    current_id = raw.rstrip("\n").replace(">", "")
                    chunks = []
                elif current_id is not None:
                    chunks.append(raw.strip())
        if current_id is not None:
            if not chunks:
                raise AssertionError("FASTA: No Sequence after headline")
            records[current_id] = "".join(chunks)
        return records

    @classmethod
    def _reverse_transcribe(cls, rna_seq, begin, stop, label):
        """Return the 5'->3' cDNA templated by ``rna_seq[begin:stop]``.

        Raises:
            AssertionError: the segment runs past the end of the
                transcript or contains a character other than A, U, G, C.
        """
        segment = rna_seq[begin:stop]
        if len(segment) != max(stop - begin, 0):
            raise AssertionError(
                "cDNA synthesis failed, priming site end {} exceeds "
                "transcript length {} ({})".format(
                    stop, len(rna_seq), label))
        if not cls._RNA_BASES.issuperset(segment):
            raise AssertionError(
                "cDNA synthesis failed, position is not A,U,G or C "
                "in transcript ({})".format(label))
        # Complement gives the 3'->5' strand; reversing yields 5'->3'.
        return segment.translate(cls._COMPLEMENT)[::-1]

    def generatecDNA(self, fastaFile=None, gtf=None, cp_nr=None, *,
                     out_fasta="cDNA.fasta", out_csv="cDNA.csv"):
        """Generate cDNA.

        For every transcript in the FASTA file the copy number is
        distributed over its eligible priming sites by weighted random
        sampling (weights are the GTF scores). Identical cDNA sequences
        are merged and their counts summed.

        Args:
            fastaFile (str): RNA_ID & RNA_Seq; defaults to the path given
                to the constructor.
            gtf (str): RNA_ID & Priming sites & Probability; defaults to
                the path given to the constructor.
            cp_nr (str): RNA_ID & copy number; defaults to the path given
                to the constructor.
            out_fasta (str): output path for cDNA_ID & cDNA sequence.
            out_csv (str): output path for cDNA_ID & copy number.

        Returns:
            Tuple of the two (already closed) file objects that were
            written, FASTA first and CSV second.

        Raises:
            AssertionError: a FASTA ID is missing from the GTF or CSV
                input, or cDNA synthesis hits an invalid position.
        """
        fastaFile = self.fastaFile if fastaFile is None else fastaFile
        gtf = self.gtf if gtf is None else gtf
        cp_nr = self.cp_nr if cp_nr is None else cp_nr

        # READING INPUT FILES / PART I
        priming_sites = self._read_priming_sites(gtf)
        logger.debug("gtf input: %s", priming_sites)
        copy_numbers = self._read_copy_numbers(cp_nr)
        logger.debug("csv input: %s", copy_numbers)
        transcripts = self._read_fasta(fastaFile)
        logger.debug("fasta input: %s", transcripts)

        # COMPUTATION OF INPUT FILES / PART II
        cdna_ids = {}     # cDNA sequence -> ID of first cDNA producing it
        cdna_counts = {}  # cDNA sequence -> summed copy number
        for transcript_id, rna_seq in transcripts.items():
            # Explicit raises instead of ``assert False`` so that the
            # checks are still performed when Python runs with -O.
            if transcript_id not in priming_sites:
                raise AssertionError(
                    "Fasta-ID from fasta-file not found in gtf-file")
            if transcript_id not in copy_numbers:
                raise AssertionError(
                    "Fasta-ID from fasta-file not found in csv-file")

            # Build a new filtered list; removing entries from the list
            # being iterated skips the element following each removal.
            sites = sorted(
                (site for site in priming_sites[transcript_id]
                 if site[0] > self.MIN_PRIMING_START),
                key=lambda site: site[0])
            n_copies = copy_numbers[transcript_id]
            if not sites or n_copies <= 0:
                logger.warning(
                    "No cDNA generated for %s (eligible priming sites: "
                    "%d, copy number: %d)",
                    transcript_id, len(sites), n_copies)
                continue

            # Sample site *indices* so that two sites with identical
            # coordinates and score are still counted separately.
            drawn = random.choices(
                range(len(sites)),
                weights=[site[2] for site in sites],
                k=n_copies)
            hits_per_site = Counter(drawn)
            logger.debug("counts: %s", hits_per_site)

            # NOTE(review): every site after the first starts its cDNA at
            # first_end + 1, not at the end of the directly preceding
            # site. This is kept as in the previous implementation;
            # confirm that it is the intended extension model.
            first_end = sites[0][1]
            for index, site in enumerate(sites):
                hits = hits_per_site[index]
                if not hits:
                    continue
                cdna_id = "-".join(
                    [transcript_id, "cDNA", str(index + 1)])
                if index == 0:
                    begin, stop = 0, first_end
                else:
                    begin, stop = first_end + 1, site[1]
                cdna = self._reverse_transcribe(
                    rna_seq, begin, stop, cdna_id)
                if cdna in cdna_counts:
                    cdna_counts[cdna] += hits
                else:
                    cdna_ids[cdna] = cdna_id
                    cdna_counts[cdna] = hits

        # WRITING OUTPUT FILES / PART III
        # Records are newline-separated without a trailing newline.
        fasta_text = "\n".join(
            ">{}\n{}".format(cdna_id, seq)
            for seq, cdna_id in cdna_ids.items())
        csv_text = "\n".join(
            "{},{}".format(cdna_id, cdna_counts[seq])
            for seq, cdna_id in cdna_ids.items())
        with open(out_fasta, 'w') as myFa, open(out_csv, 'w') as myCO:
            myFa.write(fasta_text)
            myCO.write(csv_text)
        return myFa, myCO
https://git.scicore.unibas.ch/zavolan_group/pipelines/scrna-seq-simulation.git Author: Suvarnan Selliah and Ruth Eneida Montano Crespo -Author-email: r.montanocrespo@unibas.ch +Author-email: s.selliah@unibas.ch,r.montanocrespo@unibas.ch License: MIT Platform: UNKNOWN License-File: LICENSE.md diff --git a/generatecDNA/__pycache__/__init__.cpython-39.pyc b/generatecDNA/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5645f9ca76b976ab3f1bb9645b08638f1aa6a6fa Binary files /dev/null and b/generatecDNA/__pycache__/__init__.cpython-39.pyc differ diff --git a/generatecDNA/__pycache__/generatecDNA.cpython-39-pytest-6.2.5.pyc b/generatecDNA/__pycache__/generatecDNA.cpython-39-pytest-6.2.5.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a0d7794b293b572e92a4216f10ab30a25418c393 Binary files /dev/null and b/generatecDNA/__pycache__/generatecDNA.cpython-39-pytest-6.2.5.pyc differ diff --git a/generatecDNA/__pycache__/generatecDNA.cpython-39.pyc b/generatecDNA/__pycache__/generatecDNA.cpython-39.pyc index f4322b4d9bb4b2f764b84c23b40cd483e44a35fc..9866867732759db27f44e64bd5c8800e4213d946 100644 Binary files a/generatecDNA/__pycache__/generatecDNA.cpython-39.pyc and b/generatecDNA/__pycache__/generatecDNA.cpython-39.pyc differ diff --git a/generatecDNA/generatecDNA.py b/generatecDNA/generatecDNA.py index 0f470ab53458d2cf6d823df859ec6d7009683592..c134b51d4426863fd9e5c00d163473d2e8befd97 100644 --- a/generatecDNA/generatecDNA.py +++ b/generatecDNA/generatecDNA.py @@ -1,5 +1,4 @@ -"""Package contains utilities to generate cDNA - as part of the workflow to simulate scRNAseq. +"""Module to generate cDNA copies. 
Class: GeneratecDNA: contains one method diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000000000000000000000000000000000000..24a31c7aff5d360e64e3aeaa4c78c08beee5b8c2 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,4 @@ +coverage==6.2 +flake8-docstrings==1.6.0 +flake8==4.0.1 +pytest==6.2.5 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..65750d91965f080d4504025cc6de6968b4582e46 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +coverage==6.2 +flake8-docstrings==1.6.0 +flake8==4.0.1 +pytest==6.2.5 diff --git a/tests/.DS_Store b/tests/.DS_Store index 9f52ee6408fed3d2e48fd284a905747184f46c18..84fa1862bbbd180beb9a94b147bfa4b03c799777 100644 Binary files a/tests/.DS_Store and b/tests/.DS_Store differ diff --git a/tests/.coverage b/tests/.coverage new file mode 100644 index 0000000000000000000000000000000000000000..929bab39819360bc383b3a3b737bcc57517146dd Binary files /dev/null and b/tests/.coverage differ diff --git a/tests/__pycache__/test_generatecDNA.cpython-39-pytest-6.2.5.pyc b/tests/__pycache__/test_generatecDNA.cpython-39-pytest-6.2.5.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53f3f8e42efcd93b67840bce64f6db4c5f79e007 Binary files /dev/null and b/tests/__pycache__/test_generatecDNA.cpython-39-pytest-6.2.5.pyc differ diff --git a/tests/test_generatecDNA.py b/tests/test_generatecDNA.py new file mode 100644 index 0000000000000000000000000000000000000000..76f848e22a12168d9730875f0a20da22b7506395 --- /dev/null +++ b/tests/test_generatecDNA.py @@ -0,0 +1,23 @@ +"""Testing generatecDNA module. + +Test script for generatecDNA module, +from the workflow to simulate scRNAseq. 
from generatecDNA.generatecDNA import GeneratecDNA

fastaFile = "fasta_example.fasta"
gtfFile = "gtf_example.gtf"
nrcFile = "nrcopies_example.csv"
# The second argument is the GTF path; passing the FASTA path twice left
# the ``gtf`` attribute pointing at the wrong file.
gn = GeneratecDNA(fastaFile, gtfFile, nrcFile)


def test_generatecDNA():
    """Testing main function of generatecDNA module.

    Checks the type (extension) of the input files required to generate
    cDNA copies and that the constructor stores each path on the
    matching attribute.
    """
    assert fastaFile.endswith(".fasta")
    assert gtfFile.endswith(".gtf")
    assert nrcFile.endswith(".csv")
    assert gn.fastaFile == fastaFile
    assert gn.gtf == gtfFile
    assert gn.cp_nr == nrcFile