Final Version 1

55588f1d · Suvarnan Selliah · 6fc6fa37 · 55588f1d · 55588f1d · 55588f1d
Commit 55588f1d authored 3 years ago by Suvarnan Selliah
--- a/.DS_Store
+++ b/.DS_Store
--- a/build/lib/generatecDNA/__init__.py
+++ b/build/lib/generatecDNA/__init__.py
+"""This is the __init__ function."""
+
+__version__ = "0.1.0"
--- a/build/lib/generatecDNA/generatecDNA-cli.py
+++ b/build/lib/generatecDNA/generatecDNA-cli.py
+"""Command-line interface client."""
+
+import argparse
+import generatecDNA as gn
+
+
+def main() -> None:
+    """Entry point for CLI executable."""
+    parser = argparse.ArgumentParser(description="cDNA generator")
+
+    parser.add_argument(
+        "-rna",
+        type=str,
+        metavar="",
+        help="Path file to fasta file with RNA sequence")
+    parser.add_argument(
+        "-gtf",
+        type=str,
+        metavar="",
+        help="Path file to gtf file")
+    parser.add_argument(
+        "-cnr",
+        type=str,
+        metavar="",
+        help="Path file to copy number file")
+
+    args = parser.parse_args()
+
+    Generator = gn.GeneratecDNA(
+        fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
+    Generator.generatecDNA(
+        fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
+    print("Done")
+
+
+if __name__ == '__main__':
+    main()
--- a/build/lib/generatecDNA/generatecDNA.py
+++ b/build/lib/generatecDNA/generatecDNA.py
+"""Module to generate cDNA copies.
+
+Class:
+    GeneratecDNA: contains one method
+        generatecDNA: takes as input fasta-formatted file &
+        gtf-formatted-file & csv-formatted file,
+        outputs fasta-formatted file with cDNA ID and unique cDNA sequence &
+        csv-formatted file with cDNA ID and copy number
+"""
+
+import random
+
+
+class GeneratecDNA:
+    """Contains function to generate cDNA.
+
+    Args:
+        input files: path to fasta-file (RNA_ID & RNA_Seq),
+        gtf-file (RNA_ID & Priming sites & Probability),
+        csv-file (RNA_ID & copy number)
+
+    Attributes:
+        fastaFile: RNA_ID & RNA_Seq
+        gtf: RNA_ID & Priming sites & Probability
+        cp_nr: RNA_ID & copy number
+    """
+
+    def __init__(self, fastaFile, gtf, cp_nr) -> str:
+        """Class intructor."""
+        self.fastaFile = fastaFile
+        self.gtf = gtf
+        self.cp_nr = cp_nr
+
+    def generatecDNA(self, fastaFile, gtf, cp_nr):
+        """Generate cDNA.
+
+        Args:
+            fastaFile (str): RNA_ID & RNA_Seq
+            gtf (str): RNA_ID & Priming sites & Probability
+            cp_nr (str): RNA_ID & copy number
+
+        Returns:
+            cDNA.fasta: cDNA_ID & cDNA sequence
+            cDNA.csv: cDNA_ID & copy number
+        """
+        # defining global variables
+        gtfFileInputDict = {}
+        csvFileInputDict = {}
+        fastaInputDict = {}
+        # READING INPUT FILES / PART I
+        # open gtf file
+        with open(gtf, 'r') as gt:
+            # read gtf file
+            for mygtfline in gt:
+                currentGTFString = mygtfline
+                gtf_list = currentGTFString.split('\t')
+                gtf_seqname = gtf_list[0]
+                gtf_start = gtf_list[3]
+                gtf_end = gtf_list[4]
+                gtf_score = gtf_list[5]
+                my_temp_list_1 = [int(gtf_start),
+                                  int(gtf_end), float(gtf_score)]
+                if gtf_seqname in gtfFileInputDict:
+                    my_temp_list_2 = gtfFileInputDict[gtf_seqname]
+                    my_temp_list_2.append(my_temp_list_1)
+                    gtfFileInputDict[gtf_seqname] = my_temp_list_2
+                else:
+                    gtfFileInputDict[gtf_seqname] = [my_temp_list_1]
+        print(gtfFileInputDict)
+        # open csv file
+        with open(cp_nr, 'r') as cp:
+            # read csv file
+            for mycsvline in cp:
+                currentcsvstring = mycsvline
+                csv_list = currentcsvstring.split(',')
+                csv_trans_id = csv_list[0]
+                csv_count = csv_list[2]
+                csv_count = csv_count.replace("\n", "")
+                """ trans id should be always new,
+                    otherwise unhash csv_current_count
+                    in defining variables section.
+                if csv_trans_id in csvFileInputDict:
+                    csv_current_count = csvFileInputDict[csv_trans_id]
+                    csv_current_count += csv_count
+                    csvFileInputDict[csv_trans_id] = csv_current_count
+                else:
+                    csvFileInputDict[csv_trans_id] = csv_count
+                """
+                csvFileInputDict[csv_trans_id] = int(csv_count)
+        print(csvFileInputDict)
+        # open fasta file
+        with open(fastaFile, 'r') as fa:
+            # defining variables
+            fasta_id = ""
+            fasta_seq = ""
+            fasta_id_found = False
+            fasta_seq_found = False
+            # read fasta file
+            for myfastaline in fa:
+                currentfastastring = myfastaline
+                # find fasta ID
+                if not fasta_id_found and not fasta_seq_found:
+                    position_of_start = currentfastastring.find('>')
+                    if position_of_start != 0:
+                        continue
+                    elif position_of_start == 0:
+                        fasta_id = myfastaline
+                        fasta_id = fasta_id.replace(">", "")
+                        fasta_id = fasta_id.replace("\n", "")
+                        # I don't know, how the sequence id is formatted and
+                        # which part thereof is equal to the transcript ID
+                        # in the csv-formatted file and gtf-formatted file
+                        # temp_fasta_list_1 = fasta_id.split('\t')
+                        # fasta_id = temp_fasta_list_1[0]
+                        fasta_id_found = True
+                        continue
+                    else:
+                        print("FASTA: Start position in fasta file not found")
+                        break
+                # find fasta sequence
+                if fasta_id_found and not fasta_seq_found:
+                    while not fasta_seq_found:
+                        zero_position = currentfastastring[0]
+                        if zero_position == ";":
+                            currentfastastring = fa.readline()
+                        elif zero_position == ">":
+                            assert False, "FASTA: No Sequence after headline"
+                        else:
+                            fasta_seq = currentfastastring
+                            fasta_seq_found = True
+                if fasta_id_found and fasta_seq_found:
+                    fastaInputDict[fasta_id] = fasta_seq
+                    fasta_id_found = False
+                    fasta_seq_found = False
+                    fasta_id = ""
+                    fasta_seq = ""
+        print(fastaInputDict)
+        # COMPUTATION OF INPUT FILES / PART II
+        outputFastaDict = {}
+        outputCSVDict = {}
+        # starting Loop1: read fasta dict
+        for (k, v) in fastaInputDict.items():
+            rna_seq = v
+            # search for transcript ID in gtf-file to get
+            # priming sites and scores
+            if k in gtfFileInputDict:
+                gtfList = gtfFileInputDict[k]
+            else:
+                assert False, "Fasta-ID from fasta-file not found in gtf-file"
+            # Excluding priming sites within 40 bases
+            # at the beginning of the transcript and
+            # ordering priming sites on the RNA sequence in gtf-dict
+            # sorting
+            gtfList.sort(key=lambda x: x[0])
+            # elimination
+            for i in gtfList:
+                if i[0] <= 40:
+                    gtfList.remove(i)
+            # search for transcript ID in csv-file
+            # to get copy number of transcript
+            if k in csvFileInputDict:
+                actual_count = csvFileInputDict[k]
+            else:
+                assert False, "Fasta-ID from fasta-file not found in csv-file"
+            # random choosing
+            scores = []
+            for i in gtfList:
+                scores.append(i[2])
+            print("gtfList: ", gtfList)
+            print("scores: ", scores)
+            my_weighted_list = random.choices(
+                gtfList, weights=scores, k=actual_count)
+            # counts per priming site
+            counts_per_priming_site = []
+            for i in range(0, len(gtfList)):
+                counts_per_priming_site.append(0)
+            for i in range(0, len(gtfList)):
+                counts_per_priming_site[i] = my_weighted_list.count(gtfList[i])
+            print("counts: ", counts_per_priming_site)
+            # Loop2: through gtfList to create cDNA starting on priming sites
+            # according to counts per priming sites
+            counter_cDNA = 0
+            for i in gtfList:
+                cDNA_3_5 = ""
+                counter_cDNA += 1
+                cDNA_ID = "-".join([k, "cDNA", str(counter_cDNA)])
+                if counter_cDNA == 1:
+                    end = i[1]
+                    # create 3' to 5' cDNA
+                    for j in range(0, int(end)):
+                        if rna_seq[j] == "A":
+                            cDNA_3_5 = cDNA_3_5 + "T"
+                        elif rna_seq[j] == "U":
+                            cDNA_3_5 = cDNA_3_5 + "A"
+                        elif rna_seq[j] == "G":
+                            cDNA_3_5 = cDNA_3_5 + "C"
+                        elif rna_seq[j] == "C":
+                            cDNA_3_5 = cDNA_3_5 + "G"
+                        else:
+                            print(
+                                k, rna_seq, gtfList, i,
+                                cDNA_ID, counts_per_priming_site)
+                            assert False, "cDNA synthesis failed, position " \
+                                          "is not A,U,G or C in transcript"
+                else:
+                    previous_end = end + 1
+                    this_end = i[1]
+                    # create 3' to 5' cDNA
+                    for j in range(int(previous_end), int(this_end)):
+                        if rna_seq[j] == "A":
+                            cDNA_3_5 = cDNA_3_5 + "T"
+                        elif rna_seq[j] == "U":
+                            cDNA_3_5 = cDNA_3_5 + "A"
+                        elif rna_seq[j] == "G":
+                            cDNA_3_5 = cDNA_3_5 + "C"
+                        elif rna_seq[j] == "C":
+                            cDNA_3_5 = cDNA_3_5 + "G"
+                        else:
+                            print(
+                                k, rna_seq, gtfList, i,
+                                cDNA_ID, counts_per_priming_site)
+                            assert False, "cDNA synthesis failed, " \
+                                          "position is not A,U,G or C " \
+                                          "in transcript"
+                # reverse sequence to 5' to 3'
+                cDNA_5_3 = cDNA_3_5[::-1]
+                if counts_per_priming_site[(counter_cDNA - 1)] == 0:
+                    continue
+                elif cDNA_5_3 in outputCSVDict:
+                    new_count = outputCSVDict[cDNA_5_3]
+                    new_count += counts_per_priming_site[(counter_cDNA - 1)]
+                    outputCSVDict[cDNA_5_3] = new_count
+                else:
+                    outputFastaDict[cDNA_5_3] = cDNA_ID
+                    outputCSVDict[cDNA_5_3] = \
+                        counts_per_priming_site[(counter_cDNA - 1)]
+        # WRITING OUTPUT FILES / PART III
+        # write fasta-file and csv-formatted file
+        with open("cDNA.fasta", 'w') as myFa, open("cDNA.csv", 'w') as myCO:
+            firstLine = True
+            for (k, v) in outputFastaDict.items():
+                headline = "".join([">", v])
+                csvLine = ",".join([v, str(outputCSVDict[k])])
+                if firstLine:
+                    myFa.write(headline)
+                    myFa.write("\n")
+                    myFa.write(k)
+                    myCO.write(csvLine)
+                    firstLine = False
+                else:
+                    myFa.write("\n")
+                    myFa.write(headline)
+                    myFa.write("\n")
+                    myFa.write(k)
+                    myCO.write("\n")
+                    myCO.write(csvLine)
+        return myFa, myCO
--- a/generatecDNA.egg-info/PKG-INFO
+++ b/generatecDNA.egg-info/PKG-INFO
 Metadata-Version: 2.1
 Name: generatecDNA
 Version: 0.1.0
-Summary: Generates cDNA copies of RNA transcript from internal priming sites
+Summary: Generates cDNA copies of RNA transcriptfrom internal priming sites
 Home-page: https://git.scicore.unibas.ch/zavolan_group/pipelines/scrna-seq-simulation.git
 Author: Suvarnan Selliah and Ruth Eneida Montano Crespo
-Author-email: r.montanocrespo@unibas.ch
+Author-email: s.selliah@unibas.ch,r.montanocrespo@unibas.ch
 License: MIT
 Platform: UNKNOWN
 License-File: LICENSE.md

--- a/generatecDNA/__pycache__/__init__.cpython-39.pyc
+++ b/generatecDNA/__pycache__/__init__.cpython-39.pyc
--- a/generatecDNA/__pycache__/generatecDNA.cpython-39-pytest-6.2.5.pyc
+++ b/generatecDNA/__pycache__/generatecDNA.cpython-39-pytest-6.2.5.pyc
--- a/generatecDNA/__pycache__/generatecDNA.cpython-39.pyc
+++ b/generatecDNA/__pycache__/generatecDNA.cpython-39.pyc
--- a/generatecDNA/generatecDNA.py
+++ b/generatecDNA/generatecDNA.py
-"""Package contains utilities to generate cDNA
- as part of the workflow to simulate scRNAseq.
+"""Module to generate cDNA copies.

 Class:
    GeneratecDNA: contains one method

--- a/requirements-dev.txt
+++ b/requirements-dev.txt
+coverage==6.2
+flake8-docstrings==1.6.0
+flake8==4.0.1
+pytest==6.2.5
\ No newline at end of file
--- a/requirements.txt
+++ b/requirements.txt
+coverage==6.2
+flake8-docstrings==1.6.0
+flake8==4.0.1
+pytest==6.2.5
--- a/tests/.DS_Store
+++ b/tests/.DS_Store
--- a/tests/.coverage
+++ b/tests/.coverage
--- a/tests/__pycache__/test_generatecDNA.cpython-39-pytest-6.2.5.pyc
+++ b/tests/__pycache__/test_generatecDNA.cpython-39-pytest-6.2.5.pyc
--- a/tests/test_generatecDNA.py
+++ b/tests/test_generatecDNA.py
+"""Testing generatecDNA module.
+
+Test script for generatecDNA module,
+from the workflow to simulate scRNAseq.
+"""
+
+from generatecDNA.generatecDNA import GeneratecDNA
+
+fastaFile = "fasta_example.fasta"
+gtfFile = "gtf_example.gtf"
+nrcFile = "nrcopies_example.csv"
+gn = GeneratecDNA(fastaFile, fastaFile, nrcFile)
+
+
+def test_generatecDNA():
+    """Testing main function of generatecDNA module.
+
+    Function to test type of input files required
+    to generate cDNA copies
+    """
+    assert fastaFile.endswith(".fasta")
+    assert gtfFile.endswith(".gtf")
+    assert nrcFile.endswith(".csv")