Skip to content
Snippets Groups Projects
Commit 55588f1d authored by Suvarnan Selliah's avatar Suvarnan Selliah
Browse files

Final Version 1

parent 6fc6fa37
No related branches found
No related tags found
1 merge request!27Issue #5
Showing
with 331 additions and 4 deletions
No preview for this file type
"""This is the __init__ function."""
__version__ = "0.1.0"
"""Command-line interface client."""
import argparse
import generatecDNA as gn
def main() -> None:
"""Entry point for CLI executable."""
parser = argparse.ArgumentParser(description="cDNA generator")
parser.add_argument(
"-rna",
type=str,
metavar="",
help="Path file to fasta file with RNA sequence")
parser.add_argument(
"-gtf",
type=str,
metavar="",
help="Path file to gtf file")
parser.add_argument(
"-cnr",
type=str,
metavar="",
help="Path file to copy number file")
args = parser.parse_args()
Generator = gn.GeneratecDNA(
fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
Generator.generatecDNA(
fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
print("Done")
if __name__ == '__main__':
main()
"""Module to generate cDNA copies.
Class:
GeneratecDNA: contains one method
generatecDNA: takes as input fasta-formatted file &
gtf-formatted-file & csv-formatted file,
outputs fasta-formatted file with cDNA ID and unique cDNA sequence &
csv-formatted file with cDNA ID and copy number
"""
import random
class GeneratecDNA:
"""Contains function to generate cDNA.
Args:
input files: path to fasta-file (RNA_ID & RNA_Seq),
gtf-file (RNA_ID & Priming sites & Probability),
csv-file (RNA_ID & copy number)
Attributes:
fastaFile: RNA_ID & RNA_Seq
gtf: RNA_ID & Priming sites & Probability
cp_nr: RNA_ID & copy number
"""
def __init__(self, fastaFile, gtf, cp_nr) -> str:
"""Class intructor."""
self.fastaFile = fastaFile
self.gtf = gtf
self.cp_nr = cp_nr
def generatecDNA(self, fastaFile, gtf, cp_nr):
"""Generate cDNA.
Args:
fastaFile (str): RNA_ID & RNA_Seq
gtf (str): RNA_ID & Priming sites & Probability
cp_nr (str): RNA_ID & copy number
Returns:
cDNA.fasta: cDNA_ID & cDNA sequence
cDNA.csv: cDNA_ID & copy number
"""
# defining global variables
gtfFileInputDict = {}
csvFileInputDict = {}
fastaInputDict = {}
# READING INPUT FILES / PART I
# open gtf file
with open(gtf, 'r') as gt:
# read gtf file
for mygtfline in gt:
currentGTFString = mygtfline
gtf_list = currentGTFString.split('\t')
gtf_seqname = gtf_list[0]
gtf_start = gtf_list[3]
gtf_end = gtf_list[4]
gtf_score = gtf_list[5]
my_temp_list_1 = [int(gtf_start),
int(gtf_end), float(gtf_score)]
if gtf_seqname in gtfFileInputDict:
my_temp_list_2 = gtfFileInputDict[gtf_seqname]
my_temp_list_2.append(my_temp_list_1)
gtfFileInputDict[gtf_seqname] = my_temp_list_2
else:
gtfFileInputDict[gtf_seqname] = [my_temp_list_1]
print(gtfFileInputDict)
# open csv file
with open(cp_nr, 'r') as cp:
# read csv file
for mycsvline in cp:
currentcsvstring = mycsvline
csv_list = currentcsvstring.split(',')
csv_trans_id = csv_list[0]
csv_count = csv_list[2]
csv_count = csv_count.replace("\n", "")
""" trans id should be always new,
otherwise unhash csv_current_count
in defining variables section.
if csv_trans_id in csvFileInputDict:
csv_current_count = csvFileInputDict[csv_trans_id]
csv_current_count += csv_count
csvFileInputDict[csv_trans_id] = csv_current_count
else:
csvFileInputDict[csv_trans_id] = csv_count
"""
csvFileInputDict[csv_trans_id] = int(csv_count)
print(csvFileInputDict)
# open fasta file
with open(fastaFile, 'r') as fa:
# defining variables
fasta_id = ""
fasta_seq = ""
fasta_id_found = False
fasta_seq_found = False
# read fasta file
for myfastaline in fa:
currentfastastring = myfastaline
# find fasta ID
if not fasta_id_found and not fasta_seq_found:
position_of_start = currentfastastring.find('>')
if position_of_start != 0:
continue
elif position_of_start == 0:
fasta_id = myfastaline
fasta_id = fasta_id.replace(">", "")
fasta_id = fasta_id.replace("\n", "")
# I don't know, how the sequence id is formatted and
# which part thereof is equal to the transcript ID
# in the csv-formatted file and gtf-formatted file
# temp_fasta_list_1 = fasta_id.split('\t')
# fasta_id = temp_fasta_list_1[0]
fasta_id_found = True
continue
else:
print("FASTA: Start position in fasta file not found")
break
# find fasta sequence
if fasta_id_found and not fasta_seq_found:
while not fasta_seq_found:
zero_position = currentfastastring[0]
if zero_position == ";":
currentfastastring = fa.readline()
elif zero_position == ">":
assert False, "FASTA: No Sequence after headline"
else:
fasta_seq = currentfastastring
fasta_seq_found = True
if fasta_id_found and fasta_seq_found:
fastaInputDict[fasta_id] = fasta_seq
fasta_id_found = False
fasta_seq_found = False
fasta_id = ""
fasta_seq = ""
print(fastaInputDict)
# COMPUTATION OF INPUT FILES / PART II
outputFastaDict = {}
outputCSVDict = {}
# starting Loop1: read fasta dict
for (k, v) in fastaInputDict.items():
rna_seq = v
# search for transcript ID in gtf-file to get
# priming sites and scores
if k in gtfFileInputDict:
gtfList = gtfFileInputDict[k]
else:
assert False, "Fasta-ID from fasta-file not found in gtf-file"
# Excluding priming sites within 40 bases
# at the beginning of the transcript and
# ordering priming sites on the RNA sequence in gtf-dict
# sorting
gtfList.sort(key=lambda x: x[0])
# elimination
for i in gtfList:
if i[0] <= 40:
gtfList.remove(i)
# search for transcript ID in csv-file
# to get copy number of transcript
if k in csvFileInputDict:
actual_count = csvFileInputDict[k]
else:
assert False, "Fasta-ID from fasta-file not found in csv-file"
# random choosing
scores = []
for i in gtfList:
scores.append(i[2])
print("gtfList: ", gtfList)
print("scores: ", scores)
my_weighted_list = random.choices(
gtfList, weights=scores, k=actual_count)
# counts per priming site
counts_per_priming_site = []
for i in range(0, len(gtfList)):
counts_per_priming_site.append(0)
for i in range(0, len(gtfList)):
counts_per_priming_site[i] = my_weighted_list.count(gtfList[i])
print("counts: ", counts_per_priming_site)
# Loop2: through gtfList to create cDNA starting on priming sites
# according to counts per priming sites
counter_cDNA = 0
for i in gtfList:
cDNA_3_5 = ""
counter_cDNA += 1
cDNA_ID = "-".join([k, "cDNA", str(counter_cDNA)])
if counter_cDNA == 1:
end = i[1]
# create 3' to 5' cDNA
for j in range(0, int(end)):
if rna_seq[j] == "A":
cDNA_3_5 = cDNA_3_5 + "T"
elif rna_seq[j] == "U":
cDNA_3_5 = cDNA_3_5 + "A"
elif rna_seq[j] == "G":
cDNA_3_5 = cDNA_3_5 + "C"
elif rna_seq[j] == "C":
cDNA_3_5 = cDNA_3_5 + "G"
else:
print(
k, rna_seq, gtfList, i,
cDNA_ID, counts_per_priming_site)
assert False, "cDNA synthesis failed, position " \
"is not A,U,G or C in transcript"
else:
previous_end = end + 1
this_end = i[1]
# create 3' to 5' cDNA
for j in range(int(previous_end), int(this_end)):
if rna_seq[j] == "A":
cDNA_3_5 = cDNA_3_5 + "T"
elif rna_seq[j] == "U":
cDNA_3_5 = cDNA_3_5 + "A"
elif rna_seq[j] == "G":
cDNA_3_5 = cDNA_3_5 + "C"
elif rna_seq[j] == "C":
cDNA_3_5 = cDNA_3_5 + "G"
else:
print(
k, rna_seq, gtfList, i,
cDNA_ID, counts_per_priming_site)
assert False, "cDNA synthesis failed, " \
"position is not A,U,G or C " \
"in transcript"
# reverse sequence to 5' to 3'
cDNA_5_3 = cDNA_3_5[::-1]
if counts_per_priming_site[(counter_cDNA - 1)] == 0:
continue
elif cDNA_5_3 in outputCSVDict:
new_count = outputCSVDict[cDNA_5_3]
new_count += counts_per_priming_site[(counter_cDNA - 1)]
outputCSVDict[cDNA_5_3] = new_count
else:
outputFastaDict[cDNA_5_3] = cDNA_ID
outputCSVDict[cDNA_5_3] = \
counts_per_priming_site[(counter_cDNA - 1)]
# WRITING OUTPUT FILES / PART III
# write fasta-file and csv-formatted file
with open("cDNA.fasta", 'w') as myFa, open("cDNA.csv", 'w') as myCO:
firstLine = True
for (k, v) in outputFastaDict.items():
headline = "".join([">", v])
csvLine = ",".join([v, str(outputCSVDict[k])])
if firstLine:
myFa.write(headline)
myFa.write("\n")
myFa.write(k)
myCO.write(csvLine)
firstLine = False
else:
myFa.write("\n")
myFa.write(headline)
myFa.write("\n")
myFa.write(k)
myCO.write("\n")
myCO.write(csvLine)
return myFa, myCO
Metadata-Version: 2.1
Name: generatecDNA
Version: 0.1.0
Summary: Generates cDNA copies of RNA transcript from internal priming sites
Summary: Generates cDNA copies of RNA transcriptfrom internal priming sites
Home-page: https://git.scicore.unibas.ch/zavolan_group/pipelines/scrna-seq-simulation.git
Author: Suvarnan Selliah and Ruth Eneida Montano Crespo
Author-email: r.montanocrespo@unibas.ch
Author-email: s.selliah@unibas.ch,r.montanocrespo@unibas.ch
License: MIT
Platform: UNKNOWN
License-File: LICENSE.md
......
File added
File added
No preview for this file type
"""Package contains utilities to generate cDNA
as part of the workflow to simulate scRNAseq.
"""Module to generate cDNA copies.
Class:
GeneratecDNA: contains one method
......
coverage==6.2
flake8-docstrings==1.6.0
flake8==4.0.1
pytest==6.2.5
\ No newline at end of file
coverage==6.2
flake8-docstrings==1.6.0
flake8==4.0.1
pytest==6.2.5
No preview for this file type
File added
File added
"""Testing generatecDNA module.
Test script for generatecDNA module,
from the workflow to simulate scRNAseq.
"""
from generatecDNA.generatecDNA import GeneratecDNA
fastaFile = "fasta_example.fasta"
gtfFile = "gtf_example.gtf"
nrcFile = "nrcopies_example.csv"
gn = GeneratecDNA(fastaFile, fastaFile, nrcFile)
def test_generatecDNA():
"""Testing main function of generatecDNA module.
Function to test type of input files required
to generate cDNA copies
"""
assert fastaFile.endswith(".fasta")
assert gtfFile.endswith(".gtf")
assert nrcFile.endswith(".csv")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment