diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..0be791f97a0ff41cfcb9194d1ac6fbdea9665f90 Binary files /dev/null and b/.DS_Store differ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..26d33521af10bcc7fd8cea344038eaaeb78d0ef5 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/Generate_cDNA.iml b/.idea/Generate_cDNA.iml new file mode 100644 index 0000000000000000000000000000000000000000..d0876a78d06ac03b5d78c8dcdb95570281c6f1d6 --- /dev/null +++ b/.idea/Generate_cDNA.iml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<module type="PYTHON_MODULE" version="4"> + <component name="NewModuleRootManager"> + <content url="file://$MODULE_DIR$" /> + <orderEntry type="inheritedJdk" /> + <orderEntry type="sourceFolder" forTests="false" /> + </component> +</module> \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000000000000000000000000000000000000..105ce2da2d6447d11dfe32bfb846c3d5b199fc99 --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ +<component name="InspectionProjectProfileManager"> + <settings> + <option name="USE_PROJECT_PROFILE" value="false" /> + <version value="1.0" /> + </settings> +</component> \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000000000000000000000000000000000..7ba73c25da2261de84065478d9e30daadbe7ae3c --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7" project-jdk-type="Python SDK" /> +</project> \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 
0000000000000000000000000000000000000000..2859f751a6abdb8dc90e753aac87246408d80272 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="ProjectModuleManager"> + <modules> + <module fileurl="file://$PROJECT_DIR$/.idea/Generate_cDNA.iml" filepath="$PROJECT_DIR$/.idea/Generate_cDNA.iml" /> + </modules> + </component> +</project> \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..94a25f7f4cb416c083d265558da75d457237d671 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="$PROJECT_DIR$" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/Generate_cDNA.py b/Generate_cDNA.py deleted file mode 100644 index c06fddb82546d7cc2ef25002b7828e10fab78f42..0000000000000000000000000000000000000000 --- a/Generate_cDNA.py +++ /dev/null @@ -1,237 +0,0 @@ -#!/usr/bin/env python3 -#Author: Suvarnan Selliah - -class GeneratecDNA: - - def generatecDNA(fasta, gtf, cp_nr, my_output_fasta="cDNA.fasta", my_output_csv="cDNA.csv", placeholder = "ph-file.csv"): - - cDNA_transcript_id = 1 - #READING INPUT FILES / PART I - #open files - with open(fasta, 'r') as fa, open(gtf, 'r') as gt, open(cp_nr, 'r') as cp: - - #read fasta-file - for myFastaline in fa: - - #search for transcript id and transcript sequence in fasta-file - fasta_id = "" - fasta_seq = "" - fasta_id_found = False - fasta_seq_found = False - currentFastaString = myFastaline - if fasta_id_found == False: - position_of_start = currentFastaString.find('>') - if position_of_start != 0: - continue - elif position_of_start == 0: - fasta_id = myFastaline - fasta_id = fasta_id.replace(">", "") - fasta_id_found = True - continue - else: - print("FASTA: Start position in fasta file not found") - break - if fasta_id_found == 
True and fasta_seq_found == False: - while fasta_seq_found == False: - currentFastaString = fa.readline() - zero_position = currentFastaString[0] - if zero_position == ";": - continue - elif zero_position == ">": - print("FASTA: No Sequence after headline") - break - else: - fasta_seq = currentFastaString - fasta_seq_found = True - - #starting to work with gtf-file - #defining variables for gtf-file - gtf_seqname = '' - gtf_start = 0 - gtf_end = 0 - gtf_score = 0.0 - gtf_info_found = False - gtf_entries = 0 - gtf_list_of_lines = [] - gtf_prob_list = [] - - #defining variables for csv-file - csv_trans_id = "" - csv_gene_id = "" - csv_count = 0 - csv_info_found = False - - if fasta_id_found == True and fasta_seq_found == True: - # search for transcript id from fasta-file in gtf-file - for myGTFline in gt: - currentGTFString = myGTFline - gtf_list = currentGTFString.split('\t') - gtf_seqname = gtf_list[0] - if gtf_seqname == fasta_id: - gtf_entries += 1 - gtf_start = gtf_list[3] - gtf_end = gtf_list[4] - gtf_score = gtf_list[5] - gtf_temp_list = [] - gtf_temp_list.append(gtf_start) - gtf_temp_list.append(gtf_end) - gtf_temp_list.append(gtf_score) - gtf_list_of_lines.append(gtf_temp_list) - gtf_prob_list.append(gtf_score) - else: - continue - if gtf_entries != 0: - gtf_info_found = True - fasta_id_found = False - fasta_seq_found = False - assert gtf_info_found, "Sequence ID from fasta-file not found in gtf-file" - - #search copy number of transcript in 3. 
file/csv-file - for myCSVline in cp: - currentCSVString = myCSVline - csv_list = currentCSVString.split(',') - csv_trans_id = csv_list[0] - if csv_trans_id == fasta_id: - csv_gene_id = csv_list[1] - csv_count = csv_list[2] - csv_info_found = True - gtf_info_found = False - break - else: - continue - assert csv_info_found, "Data (TranscriptID,GeneID,Count) from csv-file/3.file not found" - - #COMPUTATION & OUTPUT / PART II - #set score to 0 for primimg sites close to the end of sequence - seq_len = len(fasta_seq) - seq_len -= 22 - gtf_list_of_lines_len = len(gtf_list_of_lines) - for i in range(gtf_list_of_lines_len): - if gtf_list_of_lines[i][1] > seq_len: - gtf_list_of_lines[i][2] = 0.0 - gtf_prob_list[i] = 0.0 - - #assign priming sites (according to score) to copy number - sum_of_score = 0.0 - for i in gtf_prob_list: - sum_of_score += i - one_score = 100/sum_of_score - norm_list = [] - gtf_prob_list_len = len(gtf_prob_list) - for i in range(gtf_prob_list_len): - norm_list.append((one_score * gtf_prob_list[i])) - one_norm = csv_count / 100 - distr_RNA_to_prim_sites = [] - norm_list_len = len(norm_list) - for i in range(norm_list_len): - distr_RNA_to_prim_sites.append((one_norm * norm_list[i])) - total_RNA_number = 0 - distr_RNA_to_prim_sites_len = len(distr_RNA_to_prim_sites) - for i in range(distr_RNA_to_prim_sites_len): - total_RNA_number += distr_RNA_to_prim_sites[i] - new_distr_RNA_to_prim_sites = [] - if total_RNA_number != csv_count: - for i in range(distr_RNA_to_prim_sites_len): - new_distr_RNA_to_prim_sites.append(int(distr_RNA_to_prim_sites[i])) - new_distr_RNA_to_prim_sites_len = len(new_distr_RNA_to_prim_sites) - counter = 0 - while total_RNA_number != csv_count: - new_distr_RNA_to_prim_sites[counter] = round(distr_RNA_to_prim_sites[counter]) - counter += 1 - assert counter <= new_distr_RNA_to_prim_sites_len, "Calculated RNA transcripts (assigned to priming sites) are more than initial count" - - #order the priming sites - prim_sites_ordered = [] - for i 
in range(gtf_list_of_lines_len): - prim_sites_ordered.append(gtf_list_of_lines[i][0]) - prim_sites_ordered.sort() - - #searching for 2 priming sites - prim_sites_ordered_len = len(prim_sites_ordered) - ph_1 = 0 - ph_2 = 0 - for i in range(0, (prim_sites_ordered_len-1), 2): - if prim_sites_ordered[i] == 0.0: - continue - else: - search_for_1 = prim_sites_ordered[i] - search_for_2 = prim_sites_ordered[i+1] - for j in range(gtf_list_of_lines_len): - if gtf_list_of_lines[j][0] == search_for_1: - ph_1 = j - if gtf_list_of_lines[j][0] == search_for_2: - ph_2 = j - #making cDNA and comparing in library - start_1 = gtf_list_of_lines[ph_1][0] - start_2 = gtf_list_of_lines[ph_2][0] - start_1 -= 1 - start_2 -= 1 - trans_between_prim_sites = fasta_seq[start_1:start_2] - cDNA = '' - for element in range(0, len(trans_between_prim_sites)): - if trans_between_prim_sites[element] == 'A': - cDNA[element] = 'T' - elif trans_between_prim_sites[element] == 'U': - cDNA[element] = 'A' - elif trans_between_prim_sites[element] == 'G': - cDNA[element] = 'C' - elif trans_between_prim_sites[element] == 'C': - cDNA[element] = 'G' - else: - assert False, "cDNA synthesis failed, position is not A,U,G or C in transcript" - # open output files - if i == 0: - with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv: - myfasta.write(">" + string(cDNA_transcript_id)) - myfasta.write("\n") - myfasta.write(cDNA) - mycsv.write(",".join([cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]])) - else: - found_cDNA_id = '' - found_cDNA_id_bool = False - with open(my_output_fasta, 'r') as myfasta, open(my_output_csv, 'r') as mycsv, open(placeholder, 'w') as phf: - for myline in myfasta: - pos = myline.find('>') - if pos != 0: - continue - if pos == 0: - fid = myline - fid = fid.replace(">", "") - fbool = False - while fbool == False: - myline = myfasta.readline() - zer = myline[0] - if zer == ";": - continue - elif zer == ">": - assert False, "Error in searching cDNA in 
output fasta-file" - else: - fseq = myline - fbool = True - if fseq ==cDNA: - found_cDNA_id = fid - found_cDNA_id_bool = True - break - if found_cDNA_id_bool == True: - for myline_csv_out in mycsv: - csvlist = myline_csv_out.split(',') - csvcDNAid = csvlist[0] - if csvcDNAid == found_cDNA_id: - csvgeneid = csvlist[1] - csvcDNAcount = csvlist[2] - csvcDNAcount += new_distr_RNA_to_prim_sites[ph_1] - phf.write(",".join([csvcDNAid, csvgeneid, csvcDNAcount])) - else: - phf.write(myline_csv_out) - if found_cDNA_id_bool == True: - with open(my_output_csv, 'w') as mycsv, open(placeholder, 'r') as phf: - for myplaceholder in phf: - mycsv.write(myplaceholder) - else: - cDNA_transcript_id += 1 - with open(my_output_fasta, 'a') as myfasta, open(my_output_csv, 'a') as mycsv: - myfasta.write(">" + string(cDNA_transcript_id)) - myfasta.write("\n") - myfasta.write(cDNA) - mycsv.write(",".join( - [cDNA_transcript_id, csv_gene_id, new_distr_RNA_to_prim_sites[ph_1]])) \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..d5499e763f555733f046b040fdaa0e7b30afd2c7 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2021 Suvarnan Selliah and Ruth Eneida Montano Crespo + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6354e709f6f3684d9fc0a62279323b5244ea0399 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ + +## cDNA generator + +This script generates cDNA copies of transcripts, allowing for the priming of DNA synthesis at transcript-internal sites. + +Input: + +* fasta-formatted file of transcript sequences +* gtf-formatted file with potential priming sites for individual transcripts, with associated probabilities +* file with the copy number of each unique transcript subjected to the cDNA synthesis + +Output: + +* fasta-formatted file with DNA copies of the transcripts, ending at one of the possible priming sites for each transcript. Priming sites are sampled in proportion to their probability. Each copy of a unique transcript is independently sampled, but only unique DNA sequences are saved to the output file. +* csv-formatted file with the copy number of each unique DNA copy. 
\ No newline at end of file diff --git a/generatecDNA-nf.nf b/generatecDNA-nf.nf new file mode 100644 index 0000000000000000000000000000000000000000..100a08a0acc3c4b1e0ce454a4ec674fd48e05658 --- /dev/null +++ b/generatecDNA-nf.nf @@ -0,0 +1,25 @@ +#!/usr/bin/env nextflow + +/* Path to transcript sequences sampled (fasta formatted) */ +params.sampledTranscriptSeq = "$baseDir/tests/*.fasta" +/* Path to potential priming sites for individual transcripts (gtf formatted) */ +params.internalPrimingSites = "$baseDir/tests/*.gtf" +/* Path to copy number of each unique transcript subjected to the cDNA synthesis (csv: transcriptID,count) */ +params.transcriptCounts = "$baseDir/tests/*.csv" + +process generatecDNA { + + input: + path sampledTranscriptSeq from params.sampledTranscriptSeq + path internalPrimingSites from params.internalPrimingSites + path transcriptCounts from params.transcriptCounts + + output: +/* O5. Path to unique cDNA sequences */ + path "cDNA_Seq" into cDNAseq_ch +/* O6. Path to cDNA count table */ + path "cDNA_Count" into cDNA_Count_ch + + """ + python $baseDir/generatecDNA/generatecDNA.py -rna ${sampledTranscriptSeq} -gtf ${internalPrimingSites} -cnr ${transcriptCounts} + """ \ No newline at end of file diff --git a/generatecDNA.egg-info/PKG-INFO b/generatecDNA.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..ccabcbf3f3825340e7447fb6861d5f584ba273ce --- /dev/null +++ b/generatecDNA.egg-info/PKG-INFO @@ -0,0 +1,13 @@ +Metadata-Version: 2.1 +Name: generatecDNA +Version: 0.1.0 +Summary: Generates cDNA copies of RNA transcript from internal priming sites +Home-page: https://git.scicore.unibas.ch/zavolan_group/pipelines/scrna-seq-simulation.git +Author: Suvarnan Selliah and Ruth Eneida Montano Crespo +Author-email: r.montanocrespo@unibas.ch +License: MIT +Platform: UNKNOWN +License-File: LICENSE.md + +UNKNOWN + diff --git a/generatecDNA.egg-info/SOURCES.txt b/generatecDNA.egg-info/SOURCES.txt new file mode 100644 index 
0000000000000000000000000000000000000000..94fdf5c7097bd4461c3adc36d54f2506c4401897 --- /dev/null +++ b/generatecDNA.egg-info/SOURCES.txt @@ -0,0 +1,11 @@ +LICENSE.md +README.md +setup.py +generatecDNA/__init__.py +generatecDNA/generatecDNA-cli.py +generatecDNA/generatecDNA.py +generatecDNA.egg-info/PKG-INFO +generatecDNA.egg-info/SOURCES.txt +generatecDNA.egg-info/dependency_links.txt +generatecDNA.egg-info/entry_points.txt +generatecDNA.egg-info/top_level.txt \ No newline at end of file diff --git a/generatecDNA.egg-info/dependency_links.txt b/generatecDNA.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/generatecDNA.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/generatecDNA.egg-info/entry_points.txt b/generatecDNA.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f34db363c7308d02bd1d1edacc7a0fb77c52283 --- /dev/null +++ b/generatecDNA.egg-info/entry_points.txt @@ -0,0 +1,3 @@ +[console_scripts] +generatecDNA = generatecDNA.__main__:main + diff --git a/generatecDNA.egg-info/top_level.txt b/generatecDNA.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..06a294c32bfe9755537b48a4f1f835a1052d80ad --- /dev/null +++ b/generatecDNA.egg-info/top_level.txt @@ -0,0 +1 @@ +generatecDNA diff --git a/generatecDNA/.DS_Store b/generatecDNA/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..d26fe4d4827233bf0a8017f5f274df73be301a99 Binary files /dev/null and b/generatecDNA/.DS_Store differ diff --git a/generatecDNA/__init__.py b/generatecDNA/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dde8fb9c9bece6a973113aaa11f2aad7aef1925e --- /dev/null +++ b/generatecDNA/__init__.py @@ -0,0 +1,3 @@ +"""This is the __init__ function.""" + +__version__ = "0.1.0" diff --git a/generatecDNA/__pycache__/__init__.cpython-310.pyc 
"""Command-line interface client."""

import argparse


def main() -> None:
    """Entry point for the cDNA generator command-line executable.

    Parses the three input paths from the command line, runs the cDNA
    generation on them and prints ``Done`` when it has finished.

    Options:
        -rna: path to the fasta file with the RNA sequences
        -gtf: path to the gtf file with the priming sites
        -cnr: path to the csv file with the transcript copy numbers

    Raises:
        SystemExit: raised by argparse (exit status 2) when one of the
            three options is missing or malformed.
    """
    parser = argparse.ArgumentParser(description="cDNA generator")

    # All three inputs are mandatory. Without ``required=True`` an omitted
    # option is passed on as ``None`` and only fails later, inside the
    # generator, with an unhelpful ``TypeError`` from ``open(None)``.
    parser.add_argument(
        "-rna",
        type=str,
        metavar="",
        required=True,
        help="Path file to fasta file with RNA sequence")
    parser.add_argument(
        "-gtf",
        type=str,
        metavar="",
        required=True,
        help="Path file to gtf file")
    parser.add_argument(
        "-cnr",
        type=str,
        metavar="",
        required=True,
        help="Path file to copy number file")

    args = parser.parse_args()

    # Imported after argument parsing so that ``--help`` and usage errors
    # are reported even when the generator module cannot be imported.
    import generatecDNA as gn

    generator = gn.GeneratecDNA(
        fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
    generator.generatecDNA(
        fastaFile=args.rna, gtf=args.gtf, cp_nr=args.cnr)
    print("Done")


if __name__ == '__main__':
    main()
"""Utilities to generate cDNA as part of the scRNA-seq simulation workflow.

Class:
    GeneratecDNA: contains one public method
        generatecDNA: takes as input a fasta-formatted file,
        a gtf-formatted file and a csv-formatted file and writes
        a fasta-formatted file with cDNA ID and unique cDNA sequence and
        a csv-formatted file with cDNA ID and copy number
"""

import random


class GeneratecDNA:
    """Generate cDNA copies of RNA transcripts from internal priming sites.

    Args:
        fastaFile: path to the fasta file (RNA_ID & RNA_Seq)
        gtf: path to the gtf file (RNA_ID & priming sites & probability)
        cp_nr: path to the csv file (RNA_ID & copy number)

    Attributes:
        fastaFile: path given for the fasta file
        gtf: path given for the gtf file
        cp_nr: path given for the csv file
    """

    # Base pairing used to copy an RNA base into its complementary DNA base.
    _COMPLEMENT = {"A": "T", "U": "A", "G": "C", "C": "G"}

    # Priming sites whose start is at or before this position are discarded.
    _MIN_PRIMING_START = 40

    def __init__(self, fastaFile, gtf, cp_nr) -> None:
        """Store the three input paths on the instance."""
        self.fastaFile = fastaFile
        self.gtf = gtf
        self.cp_nr = cp_nr

    @staticmethod
    def _read_gtf(gtf):
        """Return ``{transcript ID: [[start, end, score], ...]}`` from *gtf*.

        Columns 1, 4, 5 and 6 of every tab-separated line are used.
        Blank lines and lines starting with ``#`` are skipped.
        """
        sites_by_transcript = {}
        with open(gtf, 'r') as gt:
            for line in gt:
                if not line.strip() or line.startswith("#"):
                    continue
                fields = line.rstrip("\r\n").split('\t')
                site = [int(fields[3]), int(fields[4]), float(fields[5])]
                sites_by_transcript.setdefault(fields[0], []).append(site)
        return sites_by_transcript

    @staticmethod
    def _read_counts(cp_nr):
        """Return ``{transcript ID: copy number}`` read from *cp_nr*.

        The ID is taken from the first and the count from the third
        comma-separated column. A repeated ID overwrites the earlier
        count. Blank lines are skipped.
        """
        counts = {}
        with open(cp_nr, 'r') as cp:
            for line in cp:
                if not line.strip():
                    continue
                fields = line.split(',')
                # int() tolerates the trailing line ending of the last field.
                counts[fields[0]] = int(fields[2])
        return counts

    @staticmethod
    def _read_fasta(fastaFile):
        """Return ``{sequence ID: sequence}`` read from *fastaFile*.

        A record starts at a line beginning with ``>``; the ID is that
        line without ``>``. Lines beginning with ``;`` and blank lines are
        skipped, and all remaining lines up to the next header are joined,
        so sequences wrapped over several lines are read completely.

        Raises:
            AssertionError: if a header is not followed by any sequence.
        """
        chunks_by_id = {}
        current_id = None
        with open(fastaFile, 'r') as fa:
            for raw_line in fa:
                line = raw_line.rstrip("\r\n")
                if not line.strip() or line.startswith(";"):
                    continue
                if line.startswith(">"):
                    current_id = line.replace(">", "")
                    chunks_by_id[current_id] = []
                elif current_id is not None:
                    chunks_by_id[current_id].append(line)
                # Anything before the first header is ignored.
        sequences = {}
        for fasta_id, chunks in chunks_by_id.items():
            if not chunks:
                raise AssertionError("FASTA: No Sequence after headline")
            sequences[fasta_id] = "".join(chunks)
        return sequences

    def generatecDNA(self, fastaFile, gtf, cp_nr,
                     my_output_fasta="cDNA.fasta", my_output_csv="cDNA.csv"):
        """Generate cDNA.

        For every transcript in the fasta file, the priming sites from the
        gtf file are filtered and sorted by start, the transcript's copy
        number is distributed over the remaining sites by weighted random
        sampling (weights = gtf scores), and one cDNA is built per site.
        Identical cDNA sequences are merged and their counts are added up.

        Args:
            fastaFile (str): path to fasta file, RNA_ID & RNA_Seq
            gtf (str): path to gtf file, RNA_ID & priming sites & probability
            cp_nr (str): path to csv file, RNA_ID & copy number
            my_output_fasta (str): path of the fasta file to write
                (cDNA_ID & cDNA sequence)
            my_output_csv (str): path of the csv file to write
                (cDNA_ID & copy number)

        Returns:
            The two file objects used for writing the fasta and the csv
            output; both are already closed when they are returned.

        Raises:
            AssertionError: if a fasta ID is missing from the gtf or csv
                file, if a fasta header has no sequence, or if a
                transcript contains a base other than A, U, G or C.
        """
        # READING INPUT FILES / PART I
        gtfFileInputDict = GeneratecDNA._read_gtf(gtf)
        print(gtfFileInputDict)
        csvFileInputDict = GeneratecDNA._read_counts(cp_nr)
        print(csvFileInputDict)
        fastaInputDict = GeneratecDNA._read_fasta(fastaFile)
        print(fastaInputDict)

        # COMPUTATION OF INPUT FILES / PART II
        complement = GeneratecDNA._COMPLEMENT
        min_start = GeneratecDNA._MIN_PRIMING_START
        outputFastaDict = {}  # cDNA sequence -> cDNA ID
        outputCSVDict = {}    # cDNA sequence -> copy number
        for k, rna_seq in fastaInputDict.items():
            if k not in gtfFileInputDict:
                raise AssertionError(
                    "Fasta-ID from fasta-file not found in gtf-file")
            # Drop priming sites too close to the transcript start and order
            # the rest by start. A new list is built because removing items
            # from a list while iterating over it skips the item after each
            # removal and would let some early sites through.
            gtfList = sorted(
                (site for site in gtfFileInputDict[k] if site[0] > min_start),
                key=lambda site: site[0])
            if k not in csvFileInputDict:
                raise AssertionError(
                    "Fasta-ID from fasta-file not found in csv-file")
            actual_count = csvFileInputDict[k]

            # Distribute the copies over the sites in proportion to the
            # scores. Sampling site indices makes counting linear and keeps
            # sites with identical coordinates and score apart.
            scores = [site[2] for site in gtfList]
            print("gtfList: ", gtfList)
            print("scores: ", scores)
            chosen_sites = random.choices(
                range(len(gtfList)), weights=scores, k=actual_count)
            counts_per_priming_site = [0] * len(gtfList)
            for site_index in chosen_sites:
                counts_per_priming_site[site_index] += 1
            print("counts: ", counts_per_priming_site)

            # One cDNA per priming site, numbered in order of site start.
            end = None
            for counter_cDNA, site in enumerate(gtfList, start=1):
                cDNA_ID = "-".join([k, "cDNA", str(counter_cDNA)])
                if counter_cDNA == 1:
                    end = site[1]
                    first, last = 0, int(end)
                else:
                    # NOTE(review): ``end`` still holds the end of the FIRST
                    # site here, so every later cDNA starts right after the
                    # first site. Kept unchanged; confirm whether it should
                    # track the previous site instead.
                    first, last = int(end + 1), int(site[1])
                # create 3' to 5' cDNA
                try:
                    cDNA_3_5 = "".join(
                        complement[rna_seq[j]] for j in range(first, last))
                except KeyError:
                    print(
                        k, rna_seq, gtfList, site,
                        cDNA_ID, counts_per_priming_site)
                    raise AssertionError(
                        "cDNA synthesis failed, position "
                        "is not A,U,G or C in transcript") from None
                # reverse sequence to 5' to 3'
                cDNA_5_3 = cDNA_3_5[::-1]
                site_count = counts_per_priming_site[counter_cDNA - 1]
                if site_count == 0:
                    continue
                if cDNA_5_3 in outputCSVDict:
                    outputCSVDict[cDNA_5_3] += site_count
                else:
                    outputFastaDict[cDNA_5_3] = cDNA_ID
                    outputCSVDict[cDNA_5_3] = site_count

        # WRITING OUTPUT FILES / PART III
        # Records are separated by newlines; no newline after the last one.
        with open(my_output_fasta, 'w') as myFa, \
                open(my_output_csv, 'w') as myCO:
            myFa.write("\n".join(
                "".join([">", cDNA_ID, "\n", cDNA_seq])
                for cDNA_seq, cDNA_ID in outputFastaDict.items()))
            myCO.write("\n".join(
                ",".join([cDNA_ID, str(outputCSVDict[cDNA_seq])])
                for cDNA_seq, cDNA_ID in outputFastaDict.items()))
        return myFa, myCO
+ +@author: Suvarnan Selliah & Ruth Montano +""" + +from setuptools import setup, find_packages + +with open("README.md", "r") as f: + long_description = f.read() + +setup( + name='generatecDNA', + url=('https://git.scicore.unibas.ch/' + 'zavolan_group/pipelines/scrna-seq-simulation.git'), + author='Suvarnan Selliah and Ruth Eneida Montano Crespo', + author_email='s.selliah@unibas.ch,r.montanocrespo@unibas.ch', + description=('Generates cDNA copies of RNA transcript' + 'from internal priming sites'), + license='MIT', + version='0.1.0', + packages=find_packages(), + install_requires=[], + + entry_points={ + 'console_scripts': [ + 'generatecDNA = generatecDNA.__main__:main' + ] + }) diff --git a/tests/.DS_Store b/tests/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..9f52ee6408fed3d2e48fd284a905747184f46c18 Binary files /dev/null and b/tests/.DS_Store differ diff --git a/tests/__pycache__/test.cpython-310-pytest-6.2.5.pyc b/tests/__pycache__/test.cpython-310-pytest-6.2.5.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eb8b9d2fd21bd6ecbf4ff82a5f60b0e6ef2056f4 Binary files /dev/null and b/tests/__pycache__/test.cpython-310-pytest-6.2.5.pyc differ diff --git a/tests/fasta_example.fasta b/tests/fasta_example.fasta new file mode 100644 index 0000000000000000000000000000000000000000..1a25b150101d1ca5424abce4593aa908d9469ebd --- /dev/null +++ b/tests/fasta_example.fasta @@ -0,0 +1,21 @@ +>RNA_1 +;some comment from RNA_1 
+CCCGGGGAUAACAACCCCGUGGUCUUUGAAGCCCUAGGCAAUGUGUGACAUUCCACCCGAACACCAUGCAUCCUUUGAUCGCAUUGGGCAGGGGACGUCGCUCCACUGGCGUAACACGAAGAGCAUUGGGUGUAACUUCCGAGAGGAGAUUGAGGGUGUGGAGCGACCUGGACCUCACUCCCCCUCAUGCACAGCGUGGACGGAAUCUGAUUUUCGUAUCGAGAAUAUAAAGUACUGUAGCGGCUCCCAUCGGUUGUGGUGUGGUCCAGCCGUGGGUGGACAUCUACCGUGAUUGCAGGGACUUUUCCUGAGAAAUGGCUACUGCAUUUGCACACGUCGCACUGUGACCCCAGGAUGUAUGACCAACCGCCUAGGCGCGGAUCAGGUCUUGACCCUGAUAUGAUUGGGUGUUGGCCCUGCCUGCCUCGAUUAGGGCACUACCACCACGCUCUCACGUCUCUGUUACCCUGGCAAUAAUGCUGGUGAUCUGGGGUGGCUAAUCGAGGGUAUUUACUAUGUGGGCUCAUCGAUGACGGGGCAGUACUCGUUUAAAGCAACCGCGCAGAAGUAGGACGUCCAAGAAUCCUCGCAAGAUAUAACAGUGUUCAUUAGAGUGUUGCUAUUUGCAAUUUUGAAGGGUGUGUUACGUACGUCUCAGGCGUCCACGGCUCCUUCCUACACUACACGCAGUCCCUAGCAACACAGGGUCUUAUCCACUCAACUAAUAGAACGCGUAUAAGACAUAGACCUUCUAGAACUACGAGUAAAUGGCCCAUGUUAUGCAUCAAUCGACUUCAUCCUCGAUGAGAUUGUAGUGCUCAAGCUUCUUAGGCACUAGCACGAUUGACCUGCCCAGAAUGGUGAAUUGCUGCGCUACAAACAAGGGUGACUCCGGCACAAGCUUGCUCCUGUAGAGUCACGGGUUACGCAAAGCGGUACGGUCAUGGACUGUCAGGCUGGUUUUUUGGCACGCUCCGGGACCCGAUACUGGCAUCGGGGA +>RNA_2 
+GGCUAGACUCACCAGGGAGCUUGGGUUGUACAUCGUACGGCUGGCCAUGGGGUCAGCAAUAUUAGUUAGGCUCAGCUCGUUUUUGGUCCCCAUGUUCGUCCCCCCUGGUUACAGUAUCCACGGUUUCUUCUGCGCAAACAAGGUAGCCGGACAAACAAGCUGGUGGCCUGGGGGUCGACAUGUUACACUCUGAAGGGUUAAGUCACUUCCCAUCGACUGCGUGGGCAUCUUUAUACAGCAGCGCUUGAAACCCCAAGAACGUAGGCUGCGCCCCUCCCUGCUCAAGGCCGUGGCUCUGUCAUCUGCUAAGUGAGCGAGGAGUGUGAUACGUUGCCAUUAAGCGUCUUAAGUUUUUCAUAGAUAGUGUAACGUCGCUGCUGAACUAUAAUAAGGAACCGCUAGGAAUCCACCGAUAACGUAGAUCUCCCUCGGAUGAUUUCCGGCUUAACGCGCACUAGCUGAUUCAUAUCAUGAACAAUAAGAUACAGACGUACUCUGCAGCGUGGACCUCACGGAACUUGUGAGUGCUGGUAGUUGCAGCAUGCGGCCGCUAUACCCGCCUGUUCUUAUUGAUCAUGAUCGUUCUUUCUGAGCGCUACACUGCUGGUGGUCCUGAAGCGGCGUAAUAAUUCAGGCAGUUUACAUUGCUUGGGAACGUAGCAACUAUGAUCGAAUUCGUACCCCCAGCACUAAGUACGGAUUAGCGCCAGUCGCGAAUUCUAUGAUCGUGCGAUGAUACCACUAUACGCGACAAUUAAGGUAAGCGUGGAACGCUAGGAAAGAGUGAUAUCAAAUGCGCACUACUGAGUCCCAGGUGUACAGUUACCGAAUGAUUUCAGAGUACGUAUUGCUUGUGGUGGAUCUCCUUGCAGAGAAGCUCACGAGUCCCGGACUGCCCAUGUGCACUGACUUGUUAGAAUAGAGAAUAGUAGAGGCUACUGUCCCGCACAGCUACCGUCCAUAGCAGAUUCUGUUCGCGUUUUAGAGAGGCAACUAGCAC +>RNA_3 
+CGGCUCCAAGACGUCGUAUAGUUACAGACACUAGGCGGUCUAGGGUCGUACUUUGAGCAACAUUGAAUCUGUUCAGACAUUUUACUGCUGAGCAUUUAGACCGGAUCGGGUGCGAGUGAGGGAUGGACGUGCCCGAUCACUGCGGUAACGGUCAUCACCACUUUGGGGAGGCCCAUUUAUUAGUCAGAGUAGGUUGCGAAGGUAAACACCGGCCUUAACAAGACUCACGAGGUCCCGAUAGGCAUGGACUCAGUACCAUUGGCUGGCGUGCAUAAACCAUAAGCUAGCCUGCUAGCUUCUUGCAGAAACGUAAACAAUAAAGUUUAGUAAGUAAGCCGCCGGAAAUUAUGUGGUUACCAACGUUAGCGUCUAUGAGUAACUCGCCAUCGGAUAAAAUUUCUCCCUACUUUACUCCCGAACGCCUUGGGGCAUGACUUGCAUAUACUCUUACACGCCUUCAAAAGCGGAGGGGAGAAUGACCAUCAUCAUGGUGCCAGCCUGGCUAAACUGCUGCCGGUGCGAAUUUUUCCCAGUACACCACAAAUACUGGCUCACAAAGUGUAGUGGGAUUACAUGUGAAGCAUGAUACGGAUAGGCGGGUCACGACAGCUUGGUGCUACUUGUUGGGGAAUAUAAAAUCGACUAAAGUGACCCCCACGGCUAAGUCUGUCAGCGAUGUAUUCUGUUAACCGGUCGUCUUUGACGGCGAGUGUCAUAUUCCUCUUAUAAUUCAAAGUCAGUGGGGCCUGGUAUUAUGCACAGCGCGGCCGCAAGCAUAGCGGAUACGUAUACUCAGAAGUAUAAUGUUUUCGUACCCUGACGCCAGAAGCAACUAGAUAUCGUCUUCGUGCAUCACGGAAUAUACGGCUACUGGCGGUAACCGUUGCUAGUGGUUAGGGGUUUAUAGGCGGCAGGUCUAGGUUCAUUGGCAAACGCACACGCGAUCACUCUCGUUUAUCUAGUCACGAUGGCUAGCCCGCGGGCACGAGCGCCUCAGGUC +>RNA_4 
+GAUGCGCGAAUCAACACCUAGGUCAUUAUACUGGGGUGGCAAAGCUUAUCGUGACUCGACGGGCUUUUUCCACCACCCCUACUCGGACCAUUGCUGGUCGAUCUACUAUAAGCGGCCACAAUGUGAAGUCCGGCUCCCGACGUGAGUAUCUAGAUAAUUAUGAGCAAGACACCAGUUAUGGCAUCUAGAAAUCAGCCACGGCCGCGCGCGGGGGGUACUGUACACUGAAACAGCAAACUUCUAUGACAGCCCGAAGUUGUUCCGGCGUGCGGUACAGAAAAGACUAAACAGGGCCUCAUUCCGCAUGUCACGUCACCGUCACCUUCGGACCCCACCUAGUUUUAGGAAACUCGCUCGUUUGAUAAGCAUAAAAACUUGUAUUGCAGCAAAGGGGUUCGCACGACUAAAAAGAUCCGCGUGUUGUGUAACAAGAGAUAACGUGGGGUCACACAAAAUGCCGACAGCCUACCGUUGUAAUGGCGAGCGAGUUCUGCCCGGGACGACAGCUCUAACAUUUUUGGCGCGUAGAAUAUGAAACUUCAUUAAAACAGGACGCACAACAUCUAUGGUGGUGAUUCAACCUUAGGAUGACCACAUCGAUUACUUGGAUUCUAAUGGUAGAAGCUAAGGGCUCCAUGAGAGACCACUGCGAGCGACUUGACAUGGCCUUUGCGCCAUUGUAUUGCCAUGCAAGAACAUCUAUGAUGGUUAUGGUCUCGUGUUGGUCCCCCACAGAGCAUCGUUAGGGUCGUCCAGGUUCAGGAGACCCUCUAAUUACGAGUCCUCUCAUACGGAGGAACUAGCGCACCCAGCCGCCGAUUCGUAGGGAUAUUAUAAUCGUACCGAGGAGCGCAGAUGAACUCGUGGUGUCUCUGUCAGUCCUCACCGCUAGUCCCGACCAUCGUGCGUACUGUCUUCUAACAUCAACGCCAUCUUCGCCUCGGCCACGUCUCAUAAUUCUUUUCUAAUGGCACAUUCUAGCGACUCAUAAUUUUGUCAG +>RNA_5 
+CGCGGAAGCCUCCGGCGAAGCGAUGGUGAGCCCACUGACCCUCACAGUUAACCCCAUCCGCAUCAUCAUUAAGAUCGCACCCGAAGUACGGUCAUCCGAGGGAAACUGCAGCAGCCUAUUGGGAACAGCGUUCACGACCUCCGCCGUCCGCCGUUACUCCCAUCUUGCGGGUGCCGACAGUAACGCCCUCCCGUGCUCCCCGCCGGUAGUCUGACAAUUUAUCCUAUAGGGACUAGCGAUCAGAUGGGAACCCGCCUCUCCCAUUGCUACCGCUCCGCCGGCCGCUAGCAACGGCCCAGCACAUUCGAAGAUUACUUUUCGCUGGCCCAUACUAGAACCAAUCCAUUCUACGGACAUACGGAUUGGCGCAAGUCCCUCAAAUCCCCUGCCUAAGCACGUUCUGCAGCGGGAGACACUUCAAGAGGGUAGGGGGAUUUAGCAAUGCGAUUGUGGUGUCACGAGAGUACGGUCCAUAAUUUAAAGUGGAGCUAUCCCGCUUAGUGUCUCCUCGUAUGGGAGAGCGAUUUAUCGGAGCCUGAACCACGCAACCAAUGCAAGGAUUGGACUACACGGAUACAAGGUGUGCAUGGCGCGAAUCCCGUGCUUCAAAGAGGCGCCCACUACAUCGACGCAUAGGUAGUAACUUGCUUUCUACAAGUAACCUUUCAGAUACUCGUUAACAUUCCCAUGGUUUCGGCACUUCCGUAACUCGAAACUACAUGAGCAGUAUUUGCGGGUCCGGUGCGCUGAUUUCGAACCUACUAGAGCUCUAGGAGCAACUGUGCAGCGGGGGUGGAGCCUUUGCCACCUAUCCGGUUAAGCUACAAGACACUAUUGUGGCCUCGCUCGCUAACGAUGUCAGUCUUAAUAAGUGGUCAGUGCUCCUCGUAUGCUAUGGGGUGCUUCAACGCCCGGGAAGUGAGACAAUGGGUACGAACAACGCCCAUCAUUAUGGAAAUAACGAAUCUGCCGACCUGUCCGACGACUGUUUCCAAUGUCA +>RNA_6 
+CGCGGAAGCCUCCGGCGAAGCGAUGGUGAGCCCACUGACCCUCACAGUUAACCCCAUCCGCAUCAUCAUUAAGAUCGCACCCGAAGUACGGUCAUCCGAGGGAAACUGCAGCAGCCUAUUGGGAACAGCGUUCACGACCUCCGCCGUCCGCCGUUACUCCCAUCUUGCGGGUGCCGACAGUAACGCCCUCCCGUGCUCCCCGCCGGUAGUCUGACAAUUUAUCCUAUAGGGACUAGCGAUCAGAUGGGAACCCGCCUCUCCCAUUGCUACCGCUCCGCCGGCCGCUAGCAACGGCCCAGCACAUUCGAAGAUUACUUUUCGCUGGCCCAUACUAGAACCAAUCCAUUCUACGGACAUACGGAUUGGCGCAAGUCCCUCAAAUCCCCUGCCUAAGCACGUUCUGCAGCGGGAGACACUUCAAGAGGGUAGGGGGAUUUAGCAAUGCGAUUGUGGUGUCACGAGAGUACGGUCCAUAAUUUAAAGUGGAGCUAUCCCGCUUAGUGUCUCCUCGUAUGGGAGAGCGAUUUAUCGGAGCCUGAACCACGCAACCAAUGCAAGGAUUGGACUACACGGAUACAAGGUGUGCAUGGCGCGAAUCCCGUGCUUCAAAGAGGCGCCCACUACAUCGACGCAUAGGUAGUAACUUGCUUUCUACAAGUAACCUUUCAGAUACUCGUUAACAUUCCCAUGGUUUCGGCACUUCCGUAACUCGAAACUACAUGAGCAGUAUUUGCGGGUCCGGUGCGCUGAUUUCGAACCUACUAGAGCUCUAGGAGCAACUGUGCAGCGGGGGUGGAGCCUUUGCCACCUAUCCGGUUAAGCUACAAGACACUAUUGUGGCCUCGCUCGCUAACGAUGUCAGUCUUAAUAAGUGGUCAGUGCUCCUCGUAUGCUAUGGGGUGCUUCAACGCCCGGGAAGUGAGACAAUGGGUACGAACAACGCCCAUCAUUAUGGAAAUAACGAAUCUGCCGACCUGUCCGACGACUGUUUCCAAUGUCA +>RNA_7 +UUGUCUCCACGAACACCUAGUCUCACUCAUGUCCCAGCCAGGCAGUCCUUGCUGAGUGGCAACACAUCACCUCAGCUAUGGUGAGGUCCCUACGACGCACCGUUCCGCCGACCCUUUUCGGUUUAGAACUGCUUUGUCAUCGAACGUGAGGGCACCGGAGGUAGGCUUCGAAGCGCGAUCCUGAUAAACUCGACUCAGACGCUCCCAAGCCAUGUUCAACAUCGACCCGUAGUUGAUGGUAUCGGAAACGCGACAAGAGCCGUCCCGCAGACAGAUAUUCCGCCGCCCCAGACGAUACGUGGGGCCGCAGGCCUGCCGAAUGACACAGGGUCUUUAUCAGUCCUCCCGUGCGCUUGUUCCACAUCUGCACUUAAAUGUGGCCUUCUGGGAUAUGCCCGUCCAGCCUGGUUCUAGGGUAGCCCUGCAUCUUCAAGGUUGUACCGGCAAACUGGCAAGGGACAAUGGGAAUAGAGGUCAGCUCACGUCCGGGCCGCUAGCAU +>RNA_8 
+AAUCGAGACAGGCGAGACUCUUCGGACCGGGCCGGCCGCAUUGAGGAAAGGGGUCAGAGCGACCGGUGCCCCAUUCCGGAAUCGCCCAGGCAUGAUGUAAACACACCAACCGUCCUUGCCUGAUGAACUCGCACGUUUCGUGCCUCAUGAAGAGUAGACUCCCGGUUGUAUUUGUCCCCCGUGUGUACUGAUAGAACCUAGGAACGGCAAUAAUACCACUGAGGAACGGGUGGCCUAUCCAAGGGGGGCUACGGAUAGCGGGUAGCCAUACGCCGUAGGGAGUCUUAUUCGGUAAGUUUGAGCUAUAGAACGUAAAAUUAGCUGCUCCGUGUUCCUCUAUAUGCGGCUGCAUUUCCACGUACCUACUCCCGUGAGUUAGCUCGACACUUUAUAACCAUCUCAGUUGCUACGGUGAUAAAGAAUUACGUGCCGCCGACGGGUAGUCCCGCGCAAAGAUGUGGUCGUUUAGUGGGUUCUCCUACUCUAGAGACUUGCUCGUAGCAUGCGCUUAUUACAAUAUCAGUCUGGUGGUGCCGAUUUCUAGCUCAUGACCGUAAGCAACCCAAAGCAUCGUAGAAUCGGAUACAUACAGCUGAAUGAUUACAAUAACUUUGUAGAGCCCACAGUAAAGCCUAAUGAGAGCCGUUCGUUGCCUGUAAUAUCUCACUUGGGUGAUGCAAGCCCCGCUUGAUUGGAAAUCCGGGGCUCGGGUGUCCCGUGUCGUGGGAUGGUACUUGCUCACUGCCAGGAUAAAUAAAUCCCACCUCACGCGGGCAAUUACAGUGAUGACAACAAACGCC +>RNA_9 +CCCUGCCUGAGUCAUCCGGGAGUUCCCUGAGUUAUGCAGUGUAAAAAGAAUAUAUGAGAGCUCUUUUAGGGACUCAGUGAAGCUAUAACGAAGCCUUCCUCCAUCCCUCAUUUUAUAUAAAGUUGUCGGUAGUCGGGGCUACAGUGCAGUGUCCUGAUGAGCCCGCUGCUGCGAACCUUAGGGAUCUCUUAAGCUCACCGAAGCCAUUAUGUGAGAUCCCCCGCCUUGCUGGCGAUUAAGUGGGUCUAAAGCGUGUAAUUCGCCUCCUCGUAUAACUUUCUUCGGUAAUUAUGGCCGUUCAAUACUUGGAUAGACUUCGUUAUAUACGCGGUGAUUCUAUUACCACGGAGCGUUUAAAAAAAUUACCAGGAUCUCAAGGGCACCCCGCCUGAUGUGUUGAGUCUCUCGAUUGUAGGUCAGUUAGCUAGCUGACUCACAAUUGCUCUCAGCCCAGGCGUCCUAUGGCUUACCCUACAAGUAUGUUGCUCUCUGUACGGACA +>RNA_10 +CCCUGCCUGAGUCAUCCGGGAGUUCCCUGAGUUAUGCAGUGUAAAAAGAAUAUAUGAGAGCUCUUUUAGGGACUCAGUGAAGCUAUAACGAAGCCUUCCUCCAUCCCUCAUUUUAUAUAAAGUUGUCGGUAGUCGGGGCUACAGUGCAGUGUCCUGAUGAGCCCGCUGCUGCGAACCUUAGGGAUCUCUUAAGCUCACCGAAGCCAUUAUGUGAGAUCCCCCGCCUUGCUGGCGAUUAAGUGGGUCUAAAGCGUGUAAUUCGCCUCCUCGUAUAACUUUCUUCGGUAAUUAUGGCCGUUCAAUACUUGGAUAGACUUCGUUAUAUACGCGGUGAUUCUAUUACCACGGAGCGUUUAAAAAAAUUACCAGGAUCUCAAGGGCACCCCGCCUGAUGUGUUGAGUCUCUCGAUUGUAGGUCAGUUAGCUAGCUGACUCACAAUUGCUCUCAGCCCAGGCGUCCUAUGGCUUACCCUACAAGUAUGUUGCUCUCUGUACGGACA \ No newline at end of file diff --git a/tests/gtf_example.gtf b/tests/gtf_example.gtf new file mode 100644 index 
0000000000000000000000000000000000000000..cc1d19daae0e06cbe1588eb0edfc1f9748d61262 --- /dev/null +++ b/tests/gtf_example.gtf @@ -0,0 +1,21 @@ +RNA_1 . priming_site 50 68 89 . . . +RNA_2 . priming_site 500 518 50 . . . +RNA_2 . priming_site 980 1000 90 . . . +RNA_3 . priming_site 500 522 34 . . . +RNA_3 . priming_site 255 270 89 . . . +RNA_3 . priming_site 678 698 34 . . . +RNA_3 . priming_site 990 1000 34 . . . +RNA_4 . priming_site 400 422 23 . . . +RNA_4 . priming_site 20 40 60 . . . +RNA_5 . priming_site 90 112 12 . . . +RNA_5 . priming_site 40 58 34.9 . . . +RNA_5 . priming_site 800 812 23.9 . . . +RNA_5 . priming_site 456 474 56.9 . . . +RNA_6 . priming_site 978 1000 100 . . . +RNA_7 . priming_site 100 122 0.1 . . . +RNA_7 . priming_site 50 70 40 . . . +RNA_8 . priming_site 20 42 20 . . . +RNA_8 . priming_site 80 96 98.9 . . . +RNA_9 . priming_site 400 418 40 . . . +RNA_9 . priming_site 10 32 60 . . . +RNA_10 . priming_site 200 218 10 . . . \ No newline at end of file diff --git a/tests/nrcopies_example.csv b/tests/nrcopies_example.csv new file mode 100644 index 0000000000000000000000000000000000000000..debd32888f00d3f82e8bfa7418ef8a3d7234ab59 --- /dev/null +++ b/tests/nrcopies_example.csv @@ -0,0 +1,10 @@ +RNA_1,Gene_1,2 +RNA_2,Gene_2,90 +RNA_3,Gene_3,6 +RNA_4,Gene_4,8 +RNA_5,Gene_5,25 +RNA_6,Gene_6,23 +RNA_7,Gene_7,5 +RNA_8,Gene_8,5 +RNA_9,Gene_9,1 +RNA_10,Gene_10,7 \ No newline at end of file