Skip to content
Snippets Groups Projects
Commit bd397e57 authored by sunhollyjolly's avatar sunhollyjolly
Browse files

add README

parent dfdafafe
No related branches found
No related tags found
3 merge requests!52Last,!51Sunho final fix,!50Sunho fixed
Showing
with 211 additions and 92 deletions
...@@ -52,5 +52,12 @@ Temporary Items ...@@ -52,5 +52,12 @@ Temporary Items
# Ignore all local history of files # Ignore all local history of files
.history .history
.ionide .ionide
.vscode
# End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode # End of https://www.toptal.com/developers/gitignore/api/macos,visualstudiocode
\ No newline at end of file
__pycache__/
*_cache
*egg-info/
.coverage
build/
\ No newline at end of file
default: # Set default
tags:
- docker
image: python:3.10-slim-buster
stages: # List of stages for jobs, and their order of execution
- build
- test
build-job: # This job runs in the build stage, which runs first.
stage: build
script:
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- pip install -e .
unit-test-job: # This job runs in the test stage.
stage: test # It only starts when the job in the build stage completes successfully.
script:
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- pip install -e .
- coverage run --source term_frag_sel -m pytest
- coverage report -m
lint-test-job: # This job also runs in the test stage.
stage: test # It can run at the same time as unit-test-job (in parallel).
script:
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- pip install -e .
#- flake8 --docstring-convention google readsequencer/ tests/
#- pylint readsequencer/ tests/
Images/git_sunho.png

126 KiB

Images/git_sunho2.png

149 KiB

Images/markdown_sunho.png

416 KiB

import random
# Prototype terminal-fragment selector.
#
# Every copy of every sequence is cut at random interior positions; among the
# resulting pieces, the right-most one whose length lies within
# mean_length +/- std is kept. Copies with no piece in range are skipped.
dna_seq = {
    "ATAACATGTGGATGGCCAGTGGTCGGTTGTTACACGCCTACCGCGATGCTGAATGACCCGGACTAGAGTGGCGAAATTTATGGCGTGTGACCCGTTATGC": 100,
    "TCCATTTCGGTCAGTGGGTCATTGCTAGTAGTCGATTGCATTGCCATTCTCCGAGTGATTTAGCGTGACAGCCGCAGGGAACCCATAAAATGCAATCGTA": 100,
}
mean_length = 12
std = 1
term_frags = []
for seq, counts in dna_seq.items():
    # Loop-invariant per sequence: enough cuts to give ~mean_length pieces.
    n_cuts = len(seq) // mean_length
    for _ in range(counts):
        # Distinct interior cut positions, ordered left to right.
        cuts = sorted(random.sample(range(1, len(seq) - 1), n_cuts))
        # Bracket the cuts with both sequence ends so that the piece AFTER
        # the last cut is a candidate too (it used to be sliced as
        # seq[last:last], i.e. always empty, and therefore never selected).
        bounds = [0, *cuts, len(seq)]
        term_frag = ""
        for start, end in zip(bounds, bounds[1:]):
            fragment = seq[start:end]
            if mean_length - std <= len(fragment) <= mean_length + std:
                # Later matches overwrite earlier ones: right-most wins.
                term_frag = fragment
        if term_frag:
            term_frags.append(term_frag)
# One selected fragment per line.
with open('terminal_frags.txt', 'w') as f:
    f.writelines(f"{line}\n" for line in term_frags)
import argparse
from fragmentation_v2 import fragmentation
from utils import check_positive, extant_file
def main(args):
    """Run the fragmentation and dump the selected fragments to disk.

    Args:
        args: four-item sequence ``(fasta, seq_counts, mean_length, std)``;
            the items are forwarded unchanged to ``fragmentation``.

    The result of ``fragmentation`` is written to ``terminal_frags.txt`` in
    the current working directory, one entry per line.
    """
    fasta, seq_counts, mean_length, std = args
    selected = fragmentation(fasta, seq_counts, mean_length, std)
    with open('terminal_frags.txt', 'w') as out_file:
        for fragment in selected:
            out_file.write(fragment)
            out_file.write('\n')
# Parse command-line arguments
def parse_arguments():
    """Read the command-line options.

    Returns:
        tuple: ``(fasta, counts, mean, std)`` in exactly that order, which is
        the order ``main`` unpacks them in.
    """
    parser = argparse.ArgumentParser(description="Takes as input FASTA file of cDNA sequences, a CSV with sequence counts, and mean and std. dev. of fragment lengths. Outputs most terminal fragment (within desired length range) for each sequence.")
    parser.add_argument('--fasta', required=True, type=extant_file, help="FASTA file with cDNA sequences")
    parser.add_argument('--counts', required=True, type=extant_file, help="CSV file with sequence counts")
    parser.add_argument('--mean', required=False, default=10, type=check_positive, help="Mean fragment length (default: 10)")
    # Help text previously read "defafult"; corrected spelling only.
    parser.add_argument('--std', required=False, default=1, type=check_positive, help="Standard deviation fragment length (default: 1)")
    args = parser.parse_args()
    return args.fasta, args.counts, args.mean, args.std
# Script entry point: parse the CLI options and hand them straight to main().
if __name__ == '__main__':
    main(parse_arguments())
\ No newline at end of file
import argparse
import os.path
# found on https://stackoverflow.com/questions/11540854/file-as-command-line-argument-for-argparse-error-message-if-argument-is-not-va
def extant_file(x):
    """Validate a path for use as an argparse ``type``.

    The path must exist and end in ``.fasta``, ``.fa`` or ``.csv``. The file
    is never opened; on success the path is returned unchanged.

    Raises:
        argparse.ArgumentTypeError: if the path does not exist or has an
            unsupported suffix. argparse turns this into a message such as
            ``error: argument input: x does not exist``.
    """
    accepted_suffixes = (".fasta", ".fa", ".csv")
    if not os.path.exists(x):
        raise argparse.ArgumentTypeError("{0} does not exist".format(x))
    if not x.endswith(accepted_suffixes):
        raise argparse.ArgumentTypeError("{0} is not the correct file format".format(x))
    return x
# found on https://stackoverflow.com/questions/14117415/in-python-using-argparse-allow-only-positive-integers
def check_positive(value):
    """Convert *value* to ``int`` and require it to be strictly positive.

    Intended as an argparse ``type`` callable.

    Returns:
        int: the converted value.

    Raises:
        argparse.ArgumentTypeError: if the converted value is zero or
            negative.
        ValueError: propagated from ``int()`` when *value* is not an integer
            literal.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("%s is an invalid positive int value" % value)
images/PushPull_Hugo.png

25.1 KiB

images/gitIntro_Hugo.png

77.6 KiB

images/gittutorial_Tanya.png

148 KiB

images/gittutorial_Tanya2.png

96.7 KiB

images/markdownTutorial_Hugo.png

115 KiB

images/markdown_Tanya.png

255 KiB

argparse
biopython >= 1.78
numpy >= 1.23.3
pandas >= 1.4.4
\ No newline at end of file
pytest
coverage
flake8
flake8-docstrings
mypy
pylint
setup.py 0 → 100644
"""Set up project."""
from setuptools import setup, find_packages
from pathlib import Path
# Directory that contains this setup script.
project_root_dir = Path(__file__).parent.resolve()

# Runtime dependencies are taken verbatim from requirements.txt, one per line.
with (project_root_dir / "requirements.txt").open(
        "r", encoding="utf-8") as f:
    INSTALL_REQUIRES = f.read().splitlines()

url = 'https://git.scicore.unibas.ch/zavolan_group/tools/terminal-fragment-selector'

# Package metadata, collected in one mapping and handed to setuptools.
package_metadata = {
    "name": 'terminal-fragment-selector',
    "version": '0.1.1',
    "url": url,
    "license": 'MIT',
    "author": 'Hugo Madge Leon, Sunho Kim, Tanya Nandan',
    "author_email": 'hmadge@ethz.ch',
    "description": 'Terminal fragment selector',
    "packages": find_packages(),
    "install_requires": INSTALL_REQUIRES,
}
setup(**package_metadata)
"""Initialise package."""
"""Receive command line arguments, fragment sequences, and output fragments."""
import argparse
import logging
from pathlib import Path
from Bio import SeqIO # type: ignore
import numpy as np
import pandas as pd # type: ignore
from term_frag_sel.fragmentation import fragmentation
from term_frag_sel.utils import check_positive, check_prob
def main(args: argparse.Namespace):
    """Fragment the input sequences batch by batch and write the results.

    Args:
        args (argparse.Namespace): parsed CLI arguments. The attributes read
            here are ``output``, ``fasta``, ``counts``, ``sep``, ``size``,
            ``mean``, ``std``, ``A_prob``, ``T_prob``, ``G_prob`` and
            ``C_prob``.

    The output file is truncated first, then each batch of selected
    fragments is appended to it, one fragment per line.
    """
    # Look the logger up here rather than relying on a module global that
    # only exists when the file is executed as a script.
    log = logging.getLogger(__name__)

    # Create or wipe output file
    with open(args.output, "w", encoding="utf-8") as _:
        pass

    log.info("Checking validity of files...")
    fasta, seq_counts = file_validation(args.fasta, args.counts, args.sep)

    log.info("Fragmentation of %s...", args.fasta)
    fasta_parse = {record.id: record.seq for record in fasta}

    # A dict cannot be sliced, so batch over the ordered list of its keys.
    # range() stops at the last real batch, which also removes the
    # out-of-range ``splits[i+1]`` lookup of the previous implementation.
    seq_ids = list(fasta_parse)
    for batch, start in enumerate(range(0, len(seq_ids), args.size)):
        fasta_dict = {seq_id: fasta_parse[seq_id]
                      for seq_id in seq_ids[start:start + args.size]}
        # NOTE(review): assumes fragmentation() accepts an {id: sequence}
        # mapping as its first argument - confirm against its definition.
        term_frags = fragmentation(fasta_dict, seq_counts,
                                   args.mean, args.std,
                                   args.A_prob, args.T_prob,
                                   args.G_prob, args.C_prob)

        log.info("Writing batch %s sequences to %s...", batch, args.output)
        with open(args.output, 'a', encoding="utf-8") as out_file:
            out_file.writelines(f"{line}\n" for line in term_frags)
def file_validation(fasta_file: str,
                    counts_file: str,
                    sep: str) -> tuple[list, pd.DataFrame]:
    """Load and validate the FASTA and sequence-counts input files.

    Args:
        fasta_file (str): Input FASTA file path
        counts_file (str): CSV or TSV counts file path
        sep (str): Separator for counts file. ``","`` selects the CSV
            reader; any other value selects the tab-separated reader.

    Returns:
        tuple: list of parsed FASTA records, and a DataFrame of sequence
        counts with columns ``seqID`` and ``count``.

    Raises:
        ValueError: if no FASTA record could be parsed from ``fasta_file``.
        FileNotFoundError: if ``counts_file`` does not exist or is not a
            regular file.
    """
    with open(fasta_file, "r", encoding="utf-8") as handle:
        # SeqIO.parse is lazy: read every record while the handle is still
        # open, otherwise the caller would iterate over a closed file (and
        # an emptiness probe such as any() would swallow the first record).
        fasta = list(SeqIO.parse(handle, "fasta"))

    if not fasta:
        raise ValueError("Input FASTA file is either empty or "
                         "incorrect file type.")

    if not Path(counts_file).is_file():
        # Fail loudly instead of logging and then hitting an unbound
        # ``seq_counts`` at the return statement.
        raise FileNotFoundError(
            "Input counts file does not exist or isn't a file.")

    reader = pd.read_csv if sep == "," else pd.read_table
    seq_counts = reader(counts_file, names=["seqID", "count"])

    return fasta, seq_counts
def parse_arguments() -> argparse.Namespace:
    """Request parameters from user on CLI.

    Returns:
        argparse.Namespace: object of arguments from CLI.
    """
    parser = argparse.ArgumentParser(
        description="Takes as input FASTA file of cDNA sequences, a CSV/TSV "
                    "with sequence counts, and mean and std. dev. of "
                    "fragment lengths and 4 nucleotide probabilities for "
                    "the cuts. Outputs most terminal fragment (within "
                    "desired length range) for each sequence.")

    parser.add_argument('--fasta', required=True,
                        help="Path to FASTA file with cDNA sequences")
    parser.add_argument('--counts', required=True,
                        help="Path to CSV/TSV file with sequence counts")
    parser.add_argument('-o', '--output', required=True,
                        help="output file path")
    # %(default)s lets argparse print the real default, so the help text
    # cannot drift from the value again (it used to claim 10 and 1).
    parser.add_argument('--mean', required=False, default=300,
                        type=check_positive,
                        help="Mean fragment length (default: %(default)s)")
    parser.add_argument('--std', required=False, default=60,
                        type=check_positive,
                        help="Standard deviation fragment length "
                             "(default: %(default)s)")
    parser.add_argument('-a', '--A_prob', required=False, default=0.22,
                        type=check_prob,
                        help="Probability cut happens after nucleotide A")
    parser.add_argument('-t', '--T_prob', required=False, default=0.25,
                        type=check_prob,
                        help="Probability cut happens after nucleotide T")
    parser.add_argument('-g', '--G_prob', required=False, default=0.25,
                        type=check_prob,
                        help="Probability cut happens after nucleotide G")
    parser.add_argument('-c', '--C_prob', required=False, default=0.28,
                        type=check_prob,
                        help="Probability cut happens after nucleotide C")
    parser.add_argument('-s', '--size', required=False, default=10000,
                        type=check_positive,
                        help="Chunk size for batch processing")
    # The separator is text, not a positive integer. argparse also runs
    # string defaults through ``type``, so the former type=check_positive
    # rejected the default "," and aborted every invocation.
    parser.add_argument('--sep', required=False, default=",",
                        type=str,
                        help="Sequence counts file separator.")
    args = parser.parse_args()

    return args
# Module-level logger. It must exist on import as well as when run as a
# script, because functions in this module reference ``logger`` as a global.
logger = logging.getLogger(__name__)

if __name__ == '__main__':
    # Adjacent literals are concatenated; a backslash continuation inside
    # the string would leak the source indentation into every log line.
    logging.basicConfig(
        format='[%(asctime)s: %(levelname)s] %(message)s '
               '(module "%(module)s")',
        level=logging.INFO,
    )
    arguments = parse_arguments()
    main(arguments)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment