diff --git a/.gitignore b/.gitignore index 510b9ee493d9c985939497b3b784347cbfb27bc0..89a387026bfd0d5cc8d3eb33f46742badc9d489d 100644 --- a/.gitignore +++ b/.gitignore @@ -56,4 +56,7 @@ Temporary Items # End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode -__pycache__/ \ No newline at end of file +__pycache__/ +*_cache +*egg-info/ +.coverage \ No newline at end of file diff --git a/build/lib/term_frag_sel/__init__.py b/build/lib/term_frag_sel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb7d5f3186cb2e516714c06c9a5e8d2b57696586 --- /dev/null +++ b/build/lib/term_frag_sel/__init__.py @@ -0,0 +1 @@ +"""Initialise package.""" diff --git a/build/lib/term_frag_sel/cli.py b/build/lib/term_frag_sel/cli.py new file mode 100644 index 0000000000000000000000000000000000000000..a9b8e957a60f3a64c33e2e15cd10de951b887850 --- /dev/null +++ b/build/lib/term_frag_sel/cli.py @@ -0,0 +1,137 @@ +"""Receive command line arguments, fragment sequences, and output fragments.""" +import argparse +import logging +from pathlib import Path +from Bio import SeqIO # type: ignore +import numpy as np +import pandas as pd # type: ignore + + +from term_frag_sel.fragmentation import fragmentation +from term_frag_sel.utils import check_positive, check_prob + + +def main(args: argparse.Namespace): + """Use CLI arguments to fragment sequences and output text file \ + with selected terminal fragments. + + Args: + args (parser): list of arguments from CLI. + """ + # Create or wipe output file + with open(args.output, "w", encoding="utf-8") as _: + pass + + logger.info("Checking validity of files...") + fasta, seq_counts = file_validation(args.fasta, args.counts, args.sep) + + logger.info("Fragmentation of %s...", args.fasta) + fasta_parse = {} + for record in fasta: + fasta_parse[record.id] = record.seq + splits = np.arange(0, len(list(fasta_parse))+args.size, args.size) + + for i, split in enumerate(splits): + fasta_dict = fasta_parse[split:splits[i+1]] + term_frags = fragmentation(fasta_dict, seq_counts, + args.mean, args.std, + args.A_prob, args.T_prob, + args.G_prob, args.C_prob) + + logger.info("Writing batch %s sequences to %s...", i, args.output) + with open(args.output, 'a', encoding="utf-8") as out_file: + for line in term_frags: + out_file.write(f"{line}\n") + + +def file_validation(fasta_file: str, + counts_file: str, + sep: str) -> tuple[dict, pd.DataFrame]: + """Validate input files exist and are the correct format. + + Args: + fasta_file (str): Input FASTA file path + counts_file (str): CSV or TSV counts file path + sep (str): Separator for counts file. + + Returns: + tuple: fasta and sequence counts variables + """ + with open(fasta_file, "r", encoding="utf-8") as handle: + fasta = SeqIO.parse(handle, "fasta") + if not any(fasta): + raise ValueError("Input FASTA file is either empty or \ + incorrect file type.") + + count_path = Path(counts_file) + if not count_path.is_file(): + logger.exception("Input counts file does not exist or isn't a file.") + else: + if sep == ",": + seq_counts = pd.read_csv(counts_file, names=["seqID", "count"]) + else: + seq_counts = pd.read_table(counts_file, names=["seqID", "count"]) + + return fasta, seq_counts + + +def parse_arguments() -> argparse.Namespace: + """Request parameters from user on CLI. + + Returns: + argparse.Namespace: object of arguments from CLI. + """ + parser = argparse.ArgumentParser(description="""Takes as input FASTA file + of cDNA sequences, a CSV/TSV with sequence + counts, and mean and std. dev. of fragment + lengths and 4 nucleotide probabilities + for the cuts. Outputs most terminal + fragment (within desired length range) + for each sequence.""") + + parser.add_argument('--fasta', required=True, + help="Path to FASTA file with cDNA sequences") + parser.add_argument('--counts', required=True, + help="Path to CSV/TSV file with sequence counts") + parser.add_argument('-o', '--output', required=True, + help="output file path") + parser.add_argument('--mean', required=False, default=300, + type=check_positive, + help="Mean fragment length (default: 10)") + parser.add_argument('--std', required=False, default=60, + type=check_positive, + help="Standard deviation fragment length \ + (defafult: 1)") + parser.add_argument('-a', '--A_prob', required=False, default=0.22, + type=check_prob, + help="Probability cut happens after nucleotide A") + parser.add_argument('-t', '--T_prob', required=False, default=0.25, + type=check_prob, + help="Probability cut happens after nucleotide T") + parser.add_argument('-g', '--G_prob', required=False, default=0.25, + type=check_prob, + help="Probability cut happens after nucleotide G") + parser.add_argument('-c', '--C_prob', required=False, default=0.28, + type=check_prob, + help="Probability cut happens after nucleotide C") + parser.add_argument('-s', '--size', required=False, default=10000, + type=check_positive, + help="Chunk size for batch processing") + parser.add_argument('--sep', required=False, default=",", + type=check_positive, + help="Sequence counts file separator.") + args = parser.parse_args() + + return args + + +if __name__ == '__main__': + logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s \ + (module "%(module)s")', + level=logging.INFO, + ) + logger = logging.getLogger(__name__) + + arguments = parse_arguments() + main(arguments) diff --git a/build/lib/term_frag_sel/fragmentation.py b/build/lib/term_frag_sel/fragmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..302fe6f41a8602688af4abcf8681ef830e91ba48 --- /dev/null +++ b/build/lib/term_frag_sel/fragmentation.py @@ -0,0 +1,63 @@ +"""Fragment sequences.""" +import re + +import numpy as np +import pandas as pd # type: ignore + + +def fragmentation(fasta: dict, seq_counts: pd.DataFrame, + mean_length: int = 100, std: int = 10, + a_prob: float = 0.22, t_prob: float = 0.25, + g_prob: float = 0.25, c_prob: float = 0.28 + ) -> list: + """Fragment cDNA sequences and select terminal fragment. + + Args: + fasta_file (dict): dictionary of {transcript IDs: sequences} + counts_file (pd.DataFrame): dataframe with sequence counts and IDs + mean_length (int): mean length of desired fragments + std (int): standard deviation of desired fragment lengths + a_prob (float): probability of nucleotide A + t_prob (float): probability of nucleotide T + g_prob (float): probability of nucleotide G + c_prob (float): probability of nucleotide C + + Returns: + list: list of selected terminal fragments + """ + # calculated using https://www.nature.com/articles/srep04532#MOESM1 + nuc_probs = {'A': a_prob, 'T': t_prob, 'G': g_prob, 'C': c_prob} + + term_frags = [] + for seq_id, seq in fasta.items(): + counts = seq_counts[seq_counts["seqID"] == seq_id]["count"] + for _ in range(counts): + n_cuts = int(len(seq)/mean_length) + + # non-uniformly random DNA fragmentation implementation based on + # https://www.nature.com/articles/srep04532#Sec1 + # assume fragmentation by sonication for NGS workflow + cuts = [] + cut_nucs = np.random.choice(list(nuc_probs.keys()), + n_cuts, p=list(nuc_probs.values())) + for nuc in cut_nucs: + nuc_pos = [x.start() for x in re.finditer(nuc, seq)] + pos = np.random.choice(nuc_pos) + while pos in cuts: + pos = np.random.choice(nuc_pos) + cuts.append(pos) + + cuts.sort() + cuts.insert(0, 0) + term_frag = "" + for i, val in enumerate(cuts): + if i == len(cuts)-1: + fragment = seq[val+1:cuts[-1]] + else: + fragment = seq[val:cuts[i+1]] + if mean_length-std <= len(fragment) <= mean_length+std: + term_frag = fragment + if term_frag != "": + term_frags.append(term_frag) + + return term_frags diff --git a/build/lib/term_frag_sel/utils.py b/build/lib/term_frag_sel/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d77cc1497952b11b89f9754aad33e93dd5d78965 --- /dev/null +++ b/build/lib/term_frag_sel/utils.py @@ -0,0 +1,46 @@ +"""Utility functions for CLI arguments.""" +import argparse + + +def check_positive(value: str) -> int: + """Check input value is a positive integer. + + Args: + value (str): command line parameter + + Raises: + argparse.ArgumentTypeError: received a negative integer + argparse.ArgumentTypeError: received a non-integer value + + Returns: + integer: integer version of input value + """ + try: + ivalue = round(float(value)) + if ivalue <= 0: + raise argparse.ArgumentTypeError(f"""Expected positive integer, + got negative integer: {value}""") + except ValueError as exc: + raise argparse.ArgumentTypeError(f"""Expected positive integer, + got: {value}""") from exc + else: + return ivalue + + +def check_prob(value: str) -> float: + """Check probability value is within ]0,1] range. + + Args: + value (str): command line parameter + + Raises: + argparse.ArgumentTypeError: received a value outside valid range + + Returns: + float: float version of input value + """ + pvalue = float(value) + if pvalue <= 0 or pvalue > 1: + raise argparse.ArgumentTypeError("""Expected a positive float between + 0 and 1, but got {value}""") + return pvalue diff --git a/images/.gitkeep b/images/.gitkeep deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/images/PushPull_Hugo.png b/images/PushPull_Hugo.png deleted file mode 100644 index 063021cba0bf6adf5bbd1afcbf6de594d014c536..0000000000000000000000000000000000000000 Binary files a/images/PushPull_Hugo.png and /dev/null differ diff --git a/images/gitIntro_Hugo.png b/images/gitIntro_Hugo.png deleted file mode 100644 index e1f812aaa5725d960ad2f7b69047b761801fe4dd..0000000000000000000000000000000000000000 Binary files a/images/gitIntro_Hugo.png and /dev/null differ diff --git a/images/git_sunho.png b/images/git_sunho.png deleted file mode 100644 index fd10c600ac93b038db96236a6dc1b9bb62ba9ec6..0000000000000000000000000000000000000000 Binary files a/images/git_sunho.png and /dev/null differ diff --git a/images/git_sunho2.png b/images/git_sunho2.png deleted file mode 100644 index 8b70717c0ef12171dbcabf31a034775e65ca4a37..0000000000000000000000000000000000000000 Binary files a/images/git_sunho2.png and /dev/null differ diff --git a/images/gittutorial_Tanya.png b/images/gittutorial_Tanya.png deleted file mode 100644 index 23dff08d437f4dbefa04cc135a4d82222ce62035..0000000000000000000000000000000000000000 Binary files a/images/gittutorial_Tanya.png and /dev/null differ diff --git a/images/gittutorial_Tanya2.png b/images/gittutorial_Tanya2.png deleted file mode 100644 index aae38f8a62d15c890816e05e7d706d2813941b0a..0000000000000000000000000000000000000000 Binary files a/images/gittutorial_Tanya2.png and /dev/null differ diff --git a/images/markdownTutorial_Hugo.png b/images/markdownTutorial_Hugo.png deleted file mode 100644 index 3a2573fd278582d9cd9bd6402e15dd65eaa4e948..0000000000000000000000000000000000000000 Binary files a/images/markdownTutorial_Hugo.png and /dev/null differ diff --git a/images/markdown_Tanya.png b/images/markdown_Tanya.png deleted file mode 100644 index fcecf48dcb06f93fa6ec116901fd53f7b2f10aca..0000000000000000000000000000000000000000 Binary files a/images/markdown_Tanya.png and /dev/null differ diff --git a/images/markdown_sunho.png b/images/markdown_sunho.png deleted file mode 100644 index a9a594b9e2c69cd7d1fc6ab229a2b141d0c466d7..0000000000000000000000000000000000000000 Binary files a/images/markdown_sunho.png and /dev/null differ diff --git a/images/terminal-fragment-selector b/images/terminal-fragment-selector deleted file mode 160000 index 2bb15db26adb68c6560620fc36bb03417d17c803..0000000000000000000000000000000000000000 --- a/images/terminal-fragment-selector +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2bb15db26adb68c6560620fc36bb03417d17c803 diff --git a/term_frag_sel/cli.py b/term_frag_sel/cli.py index c9e3efd2ff7f79d0082cc04d0cb951be18a55fcf..a9b8e957a60f3a64c33e2e15cd10de951b887850 100644 --- a/term_frag_sel/cli.py +++ b/term_frag_sel/cli.py @@ -1,55 +1,53 @@ """Receive command line arguments, fragment sequences, and output fragments.""" import argparse import logging -from Bio import SeqIO -import numpy as np -import pandas as pd from pathlib import Path +from Bio import SeqIO # type: ignore +import numpy as np +import pandas as pd # type: ignore + from term_frag_sel.fragmentation import fragmentation from term_frag_sel.utils import check_positive, check_prob def main(args: argparse.Namespace): - """ - Use CLI arguments to fragment sequences and output text file \ + """Use CLI arguments to fragment sequences and output text file \ with selected terminal fragments. Args: args (parser): list of arguments from CLI. """ - fasta_file, counts_file, output, mean_length, std = args[0:5] - a_prob, t_prob, g_prob, c_prob, chunk_size, sep = args[5:] - # Create or wipe output file - open(output, "w").close() + with open(args.output, "w", encoding="utf-8") as _: + pass logger.info("Checking validity of files...") - fasta, seq_counts = file_validation(fasta_file, counts_file, sep) + fasta, seq_counts = file_validation(args.fasta, args.counts, args.sep) - logger.info(f"Fragmentation of {fasta_file}...") + logger.info("Fragmentation of %s...", args.fasta) fasta_parse = {} for record in fasta: fasta_parse[record.id] = record.seq - splits = np.arange(0, len(list(fasta_parse))+chunk_size, chunk_size) + splits = np.arange(0, len(list(fasta_parse))+args.size, args.size) for i, split in enumerate(splits): fasta_dict = fasta_parse[split:splits[i+1]] term_frags = fragmentation(fasta_dict, seq_counts, - mean_length, std, - a_prob, t_prob, g_prob, c_prob) + args.mean, args.std, + args.A_prob, args.T_prob, + args.G_prob, args.C_prob) - logger.info(f"Writing batch {i} sequences to {output}...") - with open(output, 'a') as f: + logger.info("Writing batch %s sequences to %s...", i, args.output) + with open(args.output, 'a', encoding="utf-8") as out_file: for line in term_frags: - f.write(f"{line}\n") + out_file.write(f"{line}\n") def file_validation(fasta_file: str, counts_file: str, sep: str) -> tuple[dict, pd.DataFrame]: - """ - Validate input files exist and are the correct format. + """Validate input files exist and are the correct format. Args: fasta_file (str): Input FASTA file path @@ -59,7 +57,7 @@ def file_validation(fasta_file: str, Returns: tuple: fasta and sequence counts variables """ - with open(fasta_file, "r") as handle: + with open(fasta_file, "r", encoding="utf-8") as handle: fasta = SeqIO.parse(handle, "fasta") if not any(fasta): raise ValueError("Input FASTA file is either empty or \ @@ -78,8 +76,7 @@ def file_validation(fasta_file: str, def parse_arguments() -> argparse.Namespace: - """ - Request parameters from user on CLI. + """Request parameters from user on CLI. Returns: argparse.Namespace: object of arguments from CLI. diff --git a/term_frag_sel/fragmentation.py b/term_frag_sel/fragmentation.py index f85b1e82c68b80e7417278c87477f0ee24248ae3..302fe6f41a8602688af4abcf8681ef830e91ba48 100644 --- a/term_frag_sel/fragmentation.py +++ b/term_frag_sel/fragmentation.py @@ -2,15 +2,15 @@ import re import numpy as np -import pandas as pd +import pandas as pd # type: ignore def fragmentation(fasta: dict, seq_counts: pd.DataFrame, - mean_length: int, std: int, - a_prob: float, t_prob: float, g_prob: float, c_prob: float + mean_length: int = 100, std: int = 10, + a_prob: float = 0.22, t_prob: float = 0.25, + g_prob: float = 0.25, c_prob: float = 0.28 ) -> list: - """ - Fragment cDNA sequences and select terminal fragment. + """Fragment cDNA sequences and select terminal fragment. Args: fasta_file (dict): dictionary of {transcript IDs: sequences} @@ -57,8 +57,7 @@ def fragmentation(fasta: dict, seq_counts: pd.DataFrame, fragment = seq[val:cuts[i+1]] if mean_length-std <= len(fragment) <= mean_length+std: term_frag = fragment - if term_frag == "": - continue - else: + if term_frag != "": term_frags.append(term_frag) + return term_frags diff --git a/term_frag_sel/utils.py b/term_frag_sel/utils.py index 7d2fb88c4e7a95c3f200d07d170760f78318cc31..d77cc1497952b11b89f9754aad33e93dd5d78965 100644 --- a/term_frag_sel/utils.py +++ b/term_frag_sel/utils.py @@ -28,8 +28,7 @@ def check_positive(value: str) -> int: def check_prob(value: str) -> float: - """ - Check probability value is within ]0,1] range. + """Check probability value is within ]0,1] range. Args: value (str): command line parameter diff --git a/tests/test_cli.py b/tests/test_cli.py index d584354617a4e70322212429f6ee496a1a9fd7cb..53a94c3f5011ee77266ceb8ee8c96951d3d1ad82 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,15 +1,13 @@ """Test cli.py functions.""" import pytest -from Bio import SeqIO +# from Bio import SeqIO -from term_frag_sel.cli import file_validation +from term_frag_sel.cli import file_validation # type: ignore -with open("./test_files/test.fasta", "r") as handle: - fasta = SeqIO.parse(handle, "fasta") +FASTA_FILE = "tests/test_files/test.fasta" def test_file(): """Test check_positive function.""" - assert file_validation(fasta, "", ",") == 1 with pytest.raises(FileNotFoundError): file_validation("", "", ",") diff --git a/tests/test_frag.py b/tests/test_frag.py index 1cf44e72fb04a60827ff48df839a129c55729bb2..d97b176054f801a93796832ffc54c324b4076046 100644 --- a/tests/test_frag.py +++ b/tests/test_frag.py @@ -2,19 +2,19 @@ import pytest from Bio import SeqIO -from term_frag_sel.fragmentation import fragmentation +from term_frag_sel.fragmentation import fragmentation # type: ignore -with open("./test_files/test.fasta", "r") as handle: +with open("tests/test_files/test.fasta", "r", encoding="utf-8") as handle: fasta = SeqIO.parse(handle, "fasta") # have to create the counts file -mu = 100 -std = 10 -a_prob = 0.22 -c_prob = 0.28 -t_prob = 0.25 -g_prob = 0.25 +MU = 100 +STD = 10 +A_PROB = 0.22 +C_PROB = 0.28 +T_PROB = 0.25 +G_PROB = 0.25 def test_frag(): diff --git a/tests/test_utils.py b/tests/test_utils.py index a93414276bd2cda74d147aed9ff98423d1bdd5b0..c7d7be2d3fb43f5511322d1bdfe2d4107ecc38ae 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,17 +1,18 @@ """Test utils.py functions.""" -import pytest import argparse +import pytest -from term_frag_sel.utils import check_positive, check_prob +from term_frag_sel.utils import check_positive, check_prob # type: ignore def test_positive(): """Test check_positive function.""" assert check_positive("1") == 1 assert check_positive("100") == 100 - assert check_positive("0") == 100 assert check_positive("2.2") == 2 assert check_positive("2.6") == 3 + with pytest.raises(argparse.ArgumentTypeError): + check_positive("0") with pytest.raises(argparse.ArgumentTypeError): check_positive("-1") with pytest.raises(argparse.ArgumentTypeError): @@ -30,7 +31,7 @@ def test_prob(): check_prob("10") with pytest.raises(argparse.ArgumentTypeError): check_prob("-1") - with pytest.raises(argparse.ArgumentTypeError): + with pytest.raises(ValueError): check_prob("string") - with pytest.raises(argparse.ArgumentTypeError): + with pytest.raises(ValueError): check_prob("")