diff --git a/.gitignore b/.gitignore index 5cbcf089038bd06f821a4b73ce759865a540d5bb..767b8027a746e5a425b110f06756efdd492f8338 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,7 @@ .ipynb* +Untitled.ipynb + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 09a567743e7adf10b6145e15feb7de0d343e5031..397b934540596614faec566083079aa6e9f17cd9 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,8 @@ default: image: python:3.10-slim-buster my_tests: - # Good to put a small description here of what this job does + # run tests script: - - echo "Command 1" - - echo "Command 2" + - pip install . + - pip install -r requirements-dev.txt + - pytest \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..1e8f2cb74c9fa1b47400f268a13cb53ebf244fda --- /dev/null +++ b/Dockerfile @@ -0,0 +1,46 @@ +FROM python:3.11.1-slim-bullseye + +# set maintainer +MAINTAINER zavolab-biozentrum@unibas.ch + +# set names for user, group and user home +ARG USER="bioc" +ARG GROUP="bioc" +ARG WORKDIR="/home/${USER}" + +# create user, group, user home & makes the user and group +# the owner of the user home +RUN mkdir -p $WORKDIR \ + && groupadd -r $GROUP \ + && useradd --no-log-init -r -g $GROUP $USER \ + && chown -R ${USER}:${GROUP} $WORKDIR \ + && chmod 700 $WORKDIR + +# set the user, make sure the location where pip +# installs binaries/executables is part of the $PATH +# and set the working directory to the user's home +USER $USER +ENV PATH="${WORKDIR}/.local/bin:${PATH}" +WORKDIR $WORKDIR + +# copy entire content of the current directory to the +# working directory; ensure that this does not contain any +# files that you don't want to have end up in the Docker +# image, especially not any secrets! +# use `.dockerignore` to set files that you do NOT want +# Docker to copy +COPY --chown=${USER}:${GROUP} . 
$WORKDIR + +# install app and development dependencies +# assumes that dependencies in `requirements.txt` are +# automatically installed when installing the app; if +# that is not the case, they need to be installed +# _before_ installing the app +RUN pip install -e . \ + && pip install --no-cache-dir -r requirements-dev.txt + +# set default command; optional +# exec form needs one array element per argument, otherwise Docker looks +# for an executable literally named "bash exampleInput.sh" +CMD ["bash", "exampleInput.sh"] + + + diff --git a/README.md b/README.md index 1a489873685e74c3d4a69594b442075c2f4e8086..19582f0bd06cde7466ca4802ddf6740d798816ac 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,33 @@ # cDNA Generator module +Generate cDNA based on mRNA transcript sequences and the corresponding priming probabilities. -Description of the module: -The function of this module is to generate cdDNA based on mRNA transcript seuqences and the coresponding priming probabilities. +## Example usage +A simple example can be run from the test_files directory: + python ../cdna/cli.py -ifa yeast_example.fa -icpn copy_number_input.csv -igtf Example_GTF_Input.GTF -ofa cDNA.fasta -ocsv cDNA.csv +## Installation + + pip install . 
+ +## Docker +A Docker image is available. To fetch this image: + + docker pull ericdb/my-image + +To run a simple example using this image: + + docker run ericdb/my-image python cdna/cli.py -ifa test_files/yeast_example.fa -icpn test_files/copy_number_input.csv -igtf test_files/Example_GTF_Input.GTF -ofa test_files/cDNA.fasta -ocsv test_files/cDNA.csv + +## License + +[MIT](https://choosealicense.com/licenses/mit/) license, Copyright (c) 2022 Zavolan Lab, Biozentrum, University of Basel + + +## Contributors +Eric Boittier, Bastian Wagner, Quentin Badolle + +## More info: **Input files** @@ -17,7 +41,7 @@ transcript_copies (csv-formatted) containing: transcript_sequences (fasta-formatted) containing: - ID of transcript - +- transcript-sequence priming_sites (gtf-formatted) containing: diff --git a/cdna/cdna.py b/cdna/cdna.py index c4b1ae3fd7eaa756e3b3b2cff40e5407f17081f3..d8acd5f05c684854327b61db125edc9b168c3d9a 100644 --- a/cdna/cdna.py +++ b/cdna/cdna.py @@ -1,94 +1,235 @@ -import sys +import warnings +import pandas as pd +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from gtfparse import read_gtf -def translate(res): - translate_dict = {"A": "T", "U": "A", "G": "C", "C": "G"} +# ignore warnings from read_gtf +warnings.filterwarnings(action="ignore", category=FutureWarning) + + +def complement(res: str) -> str: + """ + Returns the cDNA complement of a given base pair + Args: + res: residue code. + + Returns: corresponding cDNA residue. 
from typing import Optional

# Complementary DNA residue for every accepted input residue. "U" is accepted
# so that RNA input maps straight onto DNA.
_COMPLEMENT = {"A": "T", "T": "A", "U": "A", "G": "C", "C": "G"}
# Built once so a whole sequence is complemented in a single pass instead of
# one Python-level call per residue.
_COMPLEMENT_TABLE = str.maketrans(_COMPLEMENT)


def complement(res: str) -> str:
    """Return the cDNA complement of a single residue.

    Args:
        res: residue code; one of "A", "T", "U", "G" or "C" (upper case).

    Returns:
        The complementary DNA residue.

    Raises:
        ValueError: if ``res`` is not one of the accepted residue codes.
    """
    if res not in _COMPLEMENT:
        # Carry the message in the exception so callers and logs see it,
        # instead of printing it and raising an empty error.
        raise ValueError(f"Unknown character, {res}")
    return _COMPLEMENT[res]


def seq_complement(sequence: Optional[str]) -> Optional[str]:
    """Return the reverse complement of a sequence as a cDNA string.

    Args:
        sequence: sequence to convert; anything ``str()`` turns into a string
            of residue codes. ``None`` is passed through unchanged so that
            rows without a priming-site sequence can be skipped later.

    Returns:
        The complemented and reversed sequence, or ``None`` if ``sequence``
        is ``None``.

    Raises:
        ValueError: if the sequence contains a character that is not an
            accepted residue code.
    """
    if sequence is None:
        return None
    residues = str(sequence)
    unknown = set(residues).difference(_COMPLEMENT)
    if unknown:
        raise ValueError(f"Unknown character(s), {', '.join(sorted(unknown))}")
    # complement every residue, then reverse the string
    return residues.translate(_COMPLEMENT_TABLE)[::-1]


class CDNAGen:
    """Module that performs the cDNA synthesis.

    Constructing an instance immediately runs the whole workflow (see
    ``run``): the inputs are read, one cDNA sequence is derived per priming
    site, and the fasta and csv outputs are written.
    """

    def __init__(self, ifasta: str, igtf: str, icpn: str, ofasta: str, ocsv: str):
        """Store the file paths and run the workflow.

        Args:
            ifasta: path of the input transcript fasta file.
            igtf: path of the input priming-site GTF file.
            icpn: path of the input copy number csv file.
            ofasta: path the cDNA fasta file is written to.
            ocsv: path the cDNA copy number csv file is written to.
        """
        # inputs
        self.fasta = ifasta
        self.gtf = igtf
        self.cpn = icpn
        self.output_fasta = ofasta
        self.output_csv = ocsv
        # variables
        self.csv_df = None  # copy number table, set by read_csv
        self.fasta_dict = None  # record name -> fasta record, set by read_fasta
        self.fasta_records = None  # output records, set by add_records
        self.gtf_df = None  # priming-site table, set by read_gtf

        self.run()

    def run(self) -> None:
        """Execute the cDNA workflow.

        The copy number table is read first because ``read_gtf`` looks the
        copy numbers up while it processes the priming sites.

        Returns: None

        """
        self.read_csv()
        self.read_fasta()
        self.read_gtf()
        self.add_sequences()
        self.add_complement()
        self.add_records()
        self.write_fasta()
        self.write_csv()

    def add_records(self) -> None:
        """Build the output fasta records.

        Every priming site that has a cDNA sequence contributes as many
        identical records as its (integer) transcript copy number.

        Returns: None

        """
        self.fasta_records = []
        for _index, row in self.gtf_df.iterrows():
            if row["complement"] is None:
                # transcript was not found in the fasta file
                continue
            copy_number = row["Transcript_Copy_Number"]
            for _copy in range(int(copy_number)):
                record = SeqRecord(
                    Seq(row["complement"]),
                    id=row["cdna_ID"],
                    name=f"Transcript copy number: {copy_number}",
                    description="",
                )
                self.fasta_records.append(record)

    def add_sequences(self) -> None:
        """Add the transcript sequence preceding each priming site.

        Fills the ``priming_site`` column with the transcript sequence cut
        at the site's GTF ``start`` value (see ``read_primingsite``).

        Returns: None

        """
        self.gtf_df["priming_site"] = self.gtf_df.apply(
            lambda row: self.read_primingsite(row["seqname"], row["start"]),
            axis=1,
        )

    def add_complement(self) -> None:
        """Add the complementary cDNA sequence.

        Fills the ``complement`` column with the reverse complement of the
        ``priming_site`` column.

        Returns: None

        """
        self.gtf_df["complement"] = self.gtf_df["priming_site"].apply(seq_complement)

    def read_primingsite(self, sequence: str, end: int) -> "Optional[Seq]":
        """Return the leading part of a transcript sequence.

        Args:
            sequence: ID of the fasta record to read from.
            end: index at which the sequence is cut (exclusive).

        Returns:
            The first ``end`` residues of the record's sequence, or ``None``
            if no record with that ID was loaded.

        """
        record = self.fasta_dict.get(sequence)
        if record is None:
            return None
        return record.seq[:end]

    def read_fasta(self) -> None:
        """Read the input fasta file.

        Wrapper for SeqIO.parse; the records are stored in ``fasta_dict``
        keyed by record name.

        Returns: None

        """
        self.fasta_dict = {
            record.name: record for record in SeqIO.parse(self.fasta, "fasta")
        }

    def read_csv(self) -> None:
        """Read the copy number csv file.

        Wrapper for Pandas read_csv.

        Returns: None

        """
        df_csv = pd.read_csv(self.cpn, index_col=False)
        df_csv = df_csv.reset_index()  # make sure indexes pair with number of rows
        self.csv_df = df_csv

    def read_gtf(self) -> None:
        """Read and process the GTF file.

        Reads the GTF file and, for every priming site, adds:

        * ``Normalized_Binding_Probability``: the site's binding probability
          divided by the summed probability of all sites on its transcript;
        * ``Transcript_Copy_Number``: the transcript's copy number from the
          csv table scaled by the normalized probability and rounded;
        * ``cdna_ID``: ``<seqname>_<n>`` where ``n`` counts consecutive rows
          with the same seqname, starting at 0.

        The numeric transcript ID used for the csv lookup is the second
        ``_``-separated field of the seqname (e.g. ``Transcript_1`` -> 1).

        Returns: None

        Raises:
            IndexError: if a transcript has no row in the copy number table
                (or its seqname contains no ``_``).

        """
        # returns GTF with essential columns such as "feature", "seqname", "start", "end"
        # alongside the names of any optional keys which appeared in the attribute column
        gtf_df = read_gtf(self.gtf)
        gtf_df["Binding_Probability"] = pd.to_numeric(
            gtf_df["Binding_Probability"]
        )  # convert to numeric
        # summed binding probability per transcript, used for normalization
        total_probability = gtf_df.groupby("seqname")["Binding_Probability"].sum()
        count = 0
        prev_id = None
        for index, row in gtf_df.iterrows():
            # GTF transcript ID
            id_ = str(row["seqname"])
            # number the priming sites of each transcript: 0, 1, 2, ...
            count = count + 1 if id_ == prev_id else 0
            # CSV transcript ID
            id_csv = id_.split("_")[1]
            # Calculate Normalized_Binding_Probability and add to GTF dataframe
            normalized = row["Binding_Probability"] / total_probability[id_]
            gtf_df.loc[index, "Normalized_Binding_Probability"] = normalized
            # Calculate Transcript_Copy_Number and add to GTF dataframe
            csv_transcript_copy_number = self.csv_df.loc[
                self.csv_df["ID of transcript"] == int(id_csv),
                "Transcript copy number",
            ].iloc[0]  # first matching row of the copy number table
            gtf_df.loc[index, "Transcript_Copy_Number"] = round(
                csv_transcript_copy_number * normalized
            )
            gtf_df.loc[index, "cdna_ID"] = f"{id_}_{count}"
            prev_id = id_

        self.gtf_df = gtf_df

    def write_fasta(self) -> None:
        """Write the cDNA fasta records to file.

        Wrapper for SeqIO.write.

        Returns: None

        """
        SeqIO.write(self.fasta_records, self.output_fasta, "fasta")
        print(f"Fasta file successfully written to: {self.output_fasta}")

    def write_csv(self) -> None:
        """Write the copy number information to a csv file.

        Wrapper for Pandas to_csv; only the ``cdna_ID`` and
        ``Transcript_Copy_Number`` columns are written.

        Returns: None

        """
        self.gtf_df[["cdna_ID", "Transcript_Copy_Number"]].to_csv(
            self.output_csv, index=False
        )
        print(f"Copy number csv file successfully written to: {self.output_csv}")
def _build_parser() -> argparse.ArgumentParser:
    """Create the command line parser of the cDNA generator."""
    parser = argparse.ArgumentParser(
        prog="cDNA generator",
        description="Generate cDNA sequences based on primer probabilities.",
    )
    parser.add_argument(
        "-ifa", "--input_fasta", help="genome fasta file", required=True
    )
    parser.add_argument("-igtf", "--input_gtf", help="gtf file", required=True)
    parser.add_argument(
        "-ofa", "--output_fasta", help="output fasta file", required=True
    )
    parser.add_argument(
        "-icpn",
        "--input_copy_number",
        help="input copy number (csv) file",
        required=True,
    )
    parser.add_argument(
        "-ocsv", "--output_csv", help="output csv file", required=True
    )
    return parser


def cdna_parser(argv: "list[str] | None" = None) -> "CDNAGen":
    """Parse the command line and run the cDNA generation.

    Args:
        argv: argument strings to parse; ``None`` (the default) makes
            argparse read them from ``sys.argv``.

    Returns:
        The ``CDNAGen`` instance created from the parsed arguments; creating
        it runs the whole workflow.

    """
    args = _build_parser().parse_args(argv)
    # Print parser arguments
    print(" \n".join(f"{k}={v}" for k, v in vars(args).items()))
    print()
    cdna_inst = CDNAGen(
        ifasta=args.input_fasta,
        igtf=args.input_gtf,
        icpn=args.input_copy_number,
        ocsv=args.output_csv,
        ofasta=args.output_fasta,
    )
    return cdna_inst


if __name__ == "__main__":
    logging.basicConfig(
        format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")',
        level=logging.INFO,
    )
    LOG = logging.getLogger(__name__)
    print("**********************")
    print("Running cDNA generator")
    print("**********************")
    cdna_parser()
/dev/null +++ b/exampleInput.sh @@ -0,0 +1,4 @@ +python cdna/cli.py -ifa test_files/yeast_example.fa \ + -icpn test_files/copy_number_input.csv \ + -igt test_files/Example_GTF_Input.GTF \ + -ofa test_files/cDNA.fasta -ocsv test_files/cDNA.csv \ No newline at end of file diff --git a/requirements_dev.txt b/requirements-dev.txt similarity index 62% rename from requirements_dev.txt rename to requirements-dev.txt index 074eda1a7a8500264266efe852248e12a7161c8a..a511aad9d7b7d9ab2e6d885ba48915c49d907b28 100644 --- a/requirements_dev.txt +++ b/requirements-dev.txt @@ -1,3 +1,6 @@ +gtfparse +biopython +pytest black flake8 flake8-docstrings diff --git a/requirements.txt b/requirements.txt index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d9abb6b59de6846eaa50eaefc737f65f3b674c31 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1,3 @@ +gtfparse +biopython +pandas diff --git a/setup.py b/setup.py index 96ecc09a8223aebbcef30db4096998562b1f0976..73e2bfd1cb029c83a18f4d43ff8309ff8d1ff6eb 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,8 @@ from setuptools import setup, find_packages +with open('requirements.txt') as f: + required = f.read().splitlines() + setup( name='cdna', url='https://gitlab.com/my_user_name/my_package.git', @@ -9,5 +12,5 @@ setup( license='MIT', version='1.0.0', packages=find_packages(), # this will autodetect Python packages from the directory tree, e.g., in `code/` - install_requires=[], # add here packages that are required for your package to run, including version or range of versions + install_requires=required, # add here packages that are required for your package to run, including version or range of versions ) diff --git a/test_files/priming_sites.gtf b/test_files/Example_GTF_Input.GTF similarity index 50% rename from test_files/priming_sites.gtf rename to test_files/Example_GTF_Input.GTF index df3aafd0f3266cd3f912c116a10821d36a371e8e..3b7a10689330b4b92b02f423c329a79c83b06c37 100644 --- a/test_files/priming_sites.gtf +++ 
b/test_files/Example_GTF_Input.GTF @@ -1,3 +1,6 @@ Transcript_1 RIBlast Priming_site 10 25 . + . Accessibility_Energy "1.49"; Hybridization_Energy "-9.76"; Interaction_Energy "-8.74"; Number_of_binding_sites "2"; Binding_Probability "0.12" Transcript_1 RIBlast Priming_site 640 655 . + . Accessibility_Energy "1.71"; Hybridization_Energy "-9.12"; Interaction_Energy "-8.34"; Number_of_binding_sites "2"; Binding_Probability "0.05" Transcript_2 RIBlast Priming_site 3 18 . + . Accessibility_Energy "1.21"; Hybridization_Energy "-5.12"; Interaction_Energy "-2.34"; Number_of_binding_sites "1"; Binding_Probability "0.15" +Transcript_3 RIBlast Priming_site 5 35 . + . Accessibility_Energy "1.21"; Hybridization_Energy "-5.12"; Interaction_Energy "-2.34"; Number_of_binding_sites "1"; Binding_Probability "0.25" +Transcript_4 RIBlast Priming_site 5 35 . + . Accessibility_Energy "1.21"; Hybridization_Energy "-5.12"; Interaction_Energy "-2.34"; Number_of_binding_sites "1"; Binding_Probability "0.15" +Transcript_5 RIBlast Priming_site 5 35 . + . 
Accessibility_Energy "1.21"; Hybridization_Energy "-5.12"; Interaction_Energy "-2.34"; Number_of_binding_sites "1"; Binding_Probability "0.15" \ No newline at end of file diff --git a/test_files/cDNA.csv b/test_files/cDNA.csv new file mode 100644 index 0000000000000000000000000000000000000000..d88da423c611025b828e53eb4de0409f8f7d4cbe --- /dev/null +++ b/test_files/cDNA.csv @@ -0,0 +1,7 @@ +cdna_ID,Transcript_Copy_Number +Transcript_1_0,8.0 +Transcript_1_1,4.0 +Transcript_2_0,11.0 +Transcript_3_0,33.0 +Transcript_4_0,11.0 +Transcript_5_0,55.0 diff --git a/test_files/cDNA.fasta b/test_files/cDNA.fasta new file mode 100644 index 0000000000000000000000000000000000000000..2a8d9b5f817d8e06453d71fa464237be6c77d1bf --- /dev/null +++ b/test_files/cDNA.fasta @@ -0,0 +1,86 @@ +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_0 +AAAAAAAAAA +>Transcript_1_1 +ACATATGTGTGGATGATATAAGATTTGTCCCAAGGACGCTTCTCGAATAGTTCTTTTGTC +CCTAGATATTTCCTTCTCTAATTTCTTCTGTCTCTCCAAAACATCTTGTTCATTTTTGGG +TGGTGGCAGCATTTGTTCTTTATGTAGGAAAGCCTTTGTAGCACGGTTTACAATTTTTAC +TGCACAAACCTCATTAGTGTAACGATGTTTTGCCAGCTTCACTTTACCCATAGAACCTGC +ACCAACTGTTTCAACAAACTCCCAATCTCCTAACGATTTTCGATGAAACTGCTTAGGCAT +GCCCTGAGAAGACGAAACTCGACTCTGGCTGGTAGTATTTGGCTTTGGTGCATTCTCTCT +CGATTTACCTTCAAGTTCTACTTGTCTCTCCTTTTGCTCTGCTTGATGTGAGTTACCATT +ATTGGCGTATTTGATATCCGCCGGGGGCATTAGCGGTGTGTTCTGCTGCTGCTGCTGCTG +TGGACTTTTTCCCATCATTCTCAGCGTAGCGGGCGCCATAGTGCTTGGTTGTGTATGCAT +GCTGTTGCTTTCACTATTGCCATCATCCTGCTGGTTACCTCTGCCCATTGAGAAGGCAGT +ATTTACGTGATAATCATCCATAAAAAAAAAAAAAAAAAAA +>Transcript_1_1 +ACATATGTGTGGATGATATAAGATTTGTCCCAAGGACGCTTCTCGAATAGTTCTTTTGTC +CCTAGATATTTCCTTCTCTAATTTCTTCTGTCTCTCCAAAACATCTTGTTCATTTTTGGG +TGGTGGCAGCATTTGTTCTTTATGTAGGAAAGCCTTTGTAGCACGGTTTACAATTTTTAC +TGCACAAACCTCATTAGTGTAACGATGTTTTGCCAGCTTCACTTTACCCATAGAACCTGC 
+ACCAACTGTTTCAACAAACTCCCAATCTCCTAACGATTTTCGATGAAACTGCTTAGGCAT +GCCCTGAGAAGACGAAACTCGACTCTGGCTGGTAGTATTTGGCTTTGGTGCATTCTCTCT +CGATTTACCTTCAAGTTCTACTTGTCTCTCCTTTTGCTCTGCTTGATGTGAGTTACCATT +ATTGGCGTATTTGATATCCGCCGGGGGCATTAGCGGTGTGTTCTGCTGCTGCTGCTGCTG +TGGACTTTTTCCCATCATTCTCAGCGTAGCGGGCGCCATAGTGCTTGGTTGTGTATGCAT +GCTGTTGCTTTCACTATTGCCATCATCCTGCTGGTTACCTCTGCCCATTGAGAAGGCAGT +ATTTACGTGATAATCATCCATAAAAAAAAAAAAAAAAAAA +>Transcript_1_1 +ACATATGTGTGGATGATATAAGATTTGTCCCAAGGACGCTTCTCGAATAGTTCTTTTGTC +CCTAGATATTTCCTTCTCTAATTTCTTCTGTCTCTCCAAAACATCTTGTTCATTTTTGGG +TGGTGGCAGCATTTGTTCTTTATGTAGGAAAGCCTTTGTAGCACGGTTTACAATTTTTAC +TGCACAAACCTCATTAGTGTAACGATGTTTTGCCAGCTTCACTTTACCCATAGAACCTGC +ACCAACTGTTTCAACAAACTCCCAATCTCCTAACGATTTTCGATGAAACTGCTTAGGCAT +GCCCTGAGAAGACGAAACTCGACTCTGGCTGGTAGTATTTGGCTTTGGTGCATTCTCTCT +CGATTTACCTTCAAGTTCTACTTGTCTCTCCTTTTGCTCTGCTTGATGTGAGTTACCATT +ATTGGCGTATTTGATATCCGCCGGGGGCATTAGCGGTGTGTTCTGCTGCTGCTGCTGCTG +TGGACTTTTTCCCATCATTCTCAGCGTAGCGGGCGCCATAGTGCTTGGTTGTGTATGCAT +GCTGTTGCTTTCACTATTGCCATCATCCTGCTGGTTACCTCTGCCCATTGAGAAGGCAGT +ATTTACGTGATAATCATCCATAAAAAAAAAAAAAAAAAAA +>Transcript_1_1 +ACATATGTGTGGATGATATAAGATTTGTCCCAAGGACGCTTCTCGAATAGTTCTTTTGTC +CCTAGATATTTCCTTCTCTAATTTCTTCTGTCTCTCCAAAACATCTTGTTCATTTTTGGG +TGGTGGCAGCATTTGTTCTTTATGTAGGAAAGCCTTTGTAGCACGGTTTACAATTTTTAC +TGCACAAACCTCATTAGTGTAACGATGTTTTGCCAGCTTCACTTTACCCATAGAACCTGC +ACCAACTGTTTCAACAAACTCCCAATCTCCTAACGATTTTCGATGAAACTGCTTAGGCAT +GCCCTGAGAAGACGAAACTCGACTCTGGCTGGTAGTATTTGGCTTTGGTGCATTCTCTCT +CGATTTACCTTCAAGTTCTACTTGTCTCTCCTTTTGCTCTGCTTGATGTGAGTTACCATT +ATTGGCGTATTTGATATCCGCCGGGGGCATTAGCGGTGTGTTCTGCTGCTGCTGCTGCTG +TGGACTTTTTCCCATCATTCTCAGCGTAGCGGGCGCCATAGTGCTTGGTTGTGTATGCAT +GCTGTTGCTTTCACTATTGCCATCATCCTGCTGGTTACCTCTGCCCATTGAGAAGGCAGT +ATTTACGTGATAATCATCCATAAAAAAAAAAAAAAAAAAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA +>Transcript_2_0 +AAA 
+>Transcript_2_0 +AAA diff --git a/test_files/copy_number_input.csv b/test_files/copy_number_input.csv new file mode 100644 index 0000000000000000000000000000000000000000..d3bfaaa6e44d0ae7b52ee8f60f6ec61060c4b636 --- /dev/null +++ b/test_files/copy_number_input.csv @@ -0,0 +1,6 @@ +ID of transcript,ID of parent transcript,Transcript copy number +1,1,12 +2,1,11 +3,2,33 +4,3,11 +5,4,55 \ No newline at end of file diff --git a/test_files/yeast_example.fa b/test_files/yeast_example.fa new file mode 100644 index 0000000000000000000000000000000000000000..8af0a80debe4d1c24cd8a5ca73900891450e6a93 --- /dev/null +++ b/test_files/yeast_example.fa @@ -0,0 +1,76 @@ +>Transcript_1 +TTTTTTTTTTTTTTTTTTTATGGATGATTATCACGTAAATACTGCCTTCTCAATGGGCAGAGGTAACCAGCAGGATGAT +GGCAATAGTGAAAGCAACAGCATGCATACACAACCAAGCACTATGGCGCCCGCTACGCTG +AGAATGATGGGAAAAAGTCCACAGCAGCAGCAGCAGCAGAACACACCGCTAATGCCCCCG +GCGGATATCAAATACGCCAATAATGGTAACTCACATCAAGCAGAGCAAAAGGAGAGACAA +GTAGAACTTGAAGGTAAATCGAGAGAGAATGCACCAAAGCCAAATACTACCAGCCAGAGT +CGAGTTTCGTCTTCTCAGGGCATGCCTAAGCAGTTTCATCGAAAATCGTTAGGAGATTGG +GAGTTTGTTGAAACAGTTGGTGCAGGTTCTATGGGTAAAGTGAAGCTGGCAAAACATCGT +TACACTAATGAGGTTTGTGCAGTAAAAATTGTAAACCGTGCTACAAAGGCTTTCCTACAT +AAAGAACAAATGCTGCCACCACCCAAAAATGAACAAGATGTTTTGGAGAGACAGAAGAAA +TTAGAGAAGGAAATATCTAGGGACAAAAGAACTATTCGAGAAGCGTCCTTGGGACAAATC +TTATATCATCCACACATATGTAGACTTTTTGAGATGTGCACGTTGTCAAATCATTTCTAT +ATGTTGTTTGAATATGTTTCAGGTGGTCAGCTGTTAGACTATATCATCCAACATGGGTCA +ATACGAGAACACCAAGCGAGAAAGTTTGCTAGGGGTATCGCGAGCGCCTTAATATATTTG +CATGCTAACAACATCGTCCATAGAGATTTGAAGATAGAAAATATAATGATTTCAGATTCC +AGTGAAATCAAGATAATTGATTTTGGACTTTCAAATATTTATGATTCTAGGAAGCAGCTT +CATACATTCTGTGGCTCTCTGTATTTTGCCGCTCCCGAGCTGTTAAAAGCGAATCCTTAT +ACAGGACCTGAAGTAGATGTCTGGTCATTTGGTGTAGTTTTATTTGTTTTGGTATGCGGT +AAAGTGCCATTTGACGACGAAAATTCGAGCGTTTTACATGAAAAGATCAAGCAAGGTAAG +GTTGAATATCCCCAACATTTATCTATCGAAGTAATATCACTGTTATCCAAAATGTTGGTA +GTAGATCCGAAAAGAAGAGCCACACTTAAACAGGTTGTGGAGCACCACTGGATGGTAAGA +GGGTTCAATGGTCCCCCTCCTTCTTACTTACCGAAAAGAGTTCCCCTAACTATCGAAATG 
+CTTGATATAAATGTCTTAAAAGAAATGTACCGTTTAGAATTTATTGACGATGTAGAGGAA +ACAAGAAGTGTTTTGGTCAGTATAATCACAGATCCTACTTACGTTCTTCTCTCTAGACAA +TACTGGACTTTAGCGGCCAAAATGAACGCAGAATCCAGTGATAACGGAAACGCGCCAAAC +ATAACAGAGAGTTTTGAAGACCCAACTCGGGCATATCATCCAATGATTTCCATATATTAC +TTGACTTCTGAGATGCTTGATAGGAAACATGCGAAAATTCGGAATCAACAACAGCGACAG +AGCCACGAAAATATAGAAAAGCTTTCTGAAATACCGGAAAGTGTGAAGCAAAGGGACGTA +GAGGTCAATACAACCGCTATGAAATCAGAGCCAGAAGCCACGTTGGCGACAAAAGATACC +TCTGTGCCATTCACTCCAAAGAATAGCGATGGCACAGAGCCCCCCTTACATGTCTTAATT +CCACCGAGGTTAGCAATGCCAGAACAAGCGCATACGTCACCAACGAGTAGAAAAAGTTCT +GATAATCAACGCCGTGAAATGGAATATGCTCTCTCTCCAACTCCTCAGGGAAATGACTAT +CAACAATTTAGGGTACCTTCAACTACTGGCGATCCCTCAGAAAAGGCGAAGTTTGGGAAT +ATATTTAGAAAATTATCACAGCGCCGTAAAAAGACCATTGAACAGACATCTGTTAATAGT +AATAATAGTATCAATAAACCTGTGCAAAAGACGCATTCTCGCGCTGTTTCAGACTTTGTC +CCTGGTTTTGCTAAACCGAGTTATGATTCAAATTATACCATGAATGAGCCTGTCAAGACA +AACGATAGCAGAGGTGGCAATAAAGGTGACTTTCCAGCATTGCCTGCGGATGCAGAAAAT +ATGGTAGAGAAGCAAAGGGAGAAGCAAATTGAAGAAGATATAATGAAATTGCATGATATT +AATAAACAGAATAATGAAGTTGCAAAAGGAAGCGGGCGGGAAGCTTACGCTGCACAGAAG +TTTGAAGGAAGCGACGACGACGAGAATCATCCCTTACCACCTCTCAATGTTGCAAAAGGT +CGAAAACTACATCCAAGCGCAAGAGCTAAATCAGTTGGTCATGCTCGTCGTGAATCACTC +AAATATATGAGACCCCCAATGCCTTCATCTGCCTATCCTCAGCAAGAGCTTATAGATACT +GGGTTCTTAGAATCAAGTGACGATAACAAATCCGATAGTTTGGGAAATGTTACTTCACAG +ACGAATGATAGCGTCAGCGTGCATTCTGTGAACGCACACATAAACTCGCCATCTGTGGAG +AAGGAATTAACAGATGAAGAGATATTGCAGGAAGCTTCTAGAGCTCCAGCCGGTTCTATG +CCATCCATCGATTTCCCCCGTTCTTTATTTTTGAAGGGTTTCTTTTCTGTTCAGACAACT +TCATCGAAGCCTTTGCCAATTGTTAGATACAAGATTATGTTTGTTCTGAGGAAAATGAAT +ATAGAGTTCAAGGAGGTTAAGGGTGGTTTTGTTTGTATGCAAAGGTTCTCTTCCAATAAT +GTGGCAGCGAAGAGAGAAGGGACTCCAAGATCGATCATGCCACTTTCGCACCACGAATCC +ATTAGACGTCAAGGCTCTAATAAATACTCACCTTCTTCTCCTTTGACAACTAATTCCATT +CACCAGAGAAAAACATTTTTTTTTTTTTTTTTTTTCTATTACCGAAACCTATGGAGATGATAAGCATTCGGGAACATCT +TTGGAGAACATCCACCAACAAGGTGACGGTAGCGAAGGCATGACTACAACAGAAAAAGAG +CCCATCAAATTCGAAATTCATATCGTCAAGGTTCGTATTGTTGGTTTAGCCGGTGTGCAT 
+TTTAAGAAAATTTCTGGGAACACTTGGTTGTATAAAGAGCTGGCTTCTAGTATATTAAAA +GAACTAAAGTTGTAA +>Transcript_2 +TTTTTTTTTTTTTTTTTTTTTATGTATGTTGATCCGATGAACAACAATGAAATCAGGAAATTAAGCATTACTGCCAAGACA +GAAACAACTCCAGATAACGTTGGACAAGACATTCCTGTAAACGCACATTCGGTGCATGAG +GAATGTTCTTCCAACACACCCGTGGAGATAAATGGAAGAAACAGCGGAAAGTTGAAAGAA +GAAGCGTCTGCAGGTATTTGTTTGGTTAAAAAACCAATGCTACAATATAGAGATACCTCA +GGAAAGTATTCCCTAAGTGACTTTCAGATTTTAAGAACTTTGGGAACTGGCTCATTTGGG +AGAGTTCACCTAATTCGTTCCAATCACAATGGGAGGTTTTACGCTTTGAAGACATTGAAA +AAGCACACTATAGTGAAGCTGAAGCAGGTTGAACACACCAATGACGAACGCCGAATGCTT +TCAATTGTTTCACATCCATTCATCATTCGAATGTGGGGAACGTTCCAAGATTCTCAGCAA +GTTTTCATGGTAATGGACTACATTGAAGGTGGTGAATTATTTTCTTTACTACGTAAATCT +CAAAGATTTCCCAACCCAGTAGCCAAATTTTATGCCGCAGAGGTATGCTTAGCGTTGGAA +TATTTGCACAGTAAGGATATAATATATATTTTTTTTTTTTTTTTTTTTTTTTTTGAGACTTGAAACCAGAAAATATCCTTCTAGAT +AAAAACGGCCATATCAAGATAACCGACTTTGGCTTCGCAAAATACGTTCCCGATGTCACA +TACACATTATGTGGCACACCAGATTACATAGCGCCGGAAGTGGTCAGTACAAAACCGTAT +AATAAATCAGTGGATTGGTGGAGTTTTGGTGTGCTAATCTATGAAATGCTTGCCGGATAC +ACTCCATTTTACAATTCGAACACCATGAAAACTTACGAAAATATACTGAACGCCGAATTG +AAGTTCCCACCATTTTTCCATCCAGACGCGCAGGACTTATTGAAGAAGCTAATTACCAGA +GACTTAAGTGAAAGGTTGGGTAACTTACAAAATGGAAGTGAAGATGTCAAGAACCATCCG +TGGTTTAACGAAGTGATATGGGAGAAATTGTTAGCAAGATACATAGAAACGCCGTACGAA +CCACCAATCCAACAGGGCCAAGGTGACACTTCTCAATTTGACAGATACCCTGAAGAGGAA +TTCAACTATGGAATTCAAGGGGAGGATCCATATATGGATTTAATGAAAGAATTTTAATTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT diff --git a/tests/test_cdna.py b/tests/test_cdna.py index ae9fcb17877f4666f9d76fcf3e866520358d24c4..5ad8caa2b55d5159d1e2a90591a8d07f2ab5cdb1 100644 --- a/tests/test_cdna.py +++ b/tests/test_cdna.py @@ -1,2 +1,35 @@ # imports import pytest +from cdna.cdna import complement, seq_complement + +@pytest.mark.parametrize( + "test_input,expected", + [("A", "T")] +) +def test_complement_param(test_input, expected): # we need to pass the lists to the test function... 
@pytest.mark.parametrize(
    "test_input,expected",
    [("A", "T"), ("T", "A"), ("U", "A"), ("G", "C"), ("C", "G")],
)
def test_complement_param(test_input, expected):
    """Every accepted residue maps to its complementary DNA residue."""
    assert complement(test_input) == expected


@pytest.mark.parametrize(
    "test_input,expected",
    # "AAC" -> "GTT" also checks the reversal, which a palindromic input
    # such as "AA" cannot detect
    [("AA", "TT"), ("AAC", "GTT"), (None, None)],
)
def test_seq_complement_param(test_input, expected):
    """Sequences are complemented and reversed; None is passed through."""
    assert seq_complement(test_input) == expected


# we can do the same for the tests that raise an error:
@pytest.mark.parametrize(
    "test_input,expected",
    [(1, ValueError)],
)
def test_complement_param_failing(test_input, expected):
    """An unknown residue is rejected by complement."""
    with pytest.raises(expected):
        complement(test_input)


# This test needs its own name: a second function called
# test_complement_param_failing would replace the one above, and the
# complement() failure case would never be collected.
@pytest.mark.parametrize(
    "test_input,expected",
    [("11", ValueError)],
)
def test_seq_complement_param_failing(test_input, expected):
    """A sequence containing unknown residues is rejected."""
    with pytest.raises(expected):
        seq_complement(test_input)