diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2880e2058461273ae97557f8783cc428c35cb4f8..ae8d3a2bee8c2423c4d708131fb22ada77170194 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -6,7 +6,6 @@ default: # Set default stages: # List of stages for jobs, and their order of execution - build - test - - deploy build-job: # This job runs in the build stage, which runs first. stage: build @@ -30,12 +29,6 @@ lint-test-job: # This job also runs in the test stage. - pip install -r requirements.txt - pip install -r requirements-dev.txt - pip install -e . - - flake8 --docstring-convention google --max-line-length 120 readsequencer/ --ignore=D212,D103,D104,D107,D100,D017,D415 - #- pylint readsequencer/ tests/ - -deploy-job: # This job runs in the deploy stage. - stage: deploy # It only runs when *both* jobs in the test stage complete successfully. - environment: production - script: - - echo "Deploying application..." - - echo "Application successfully deployed." + - flake8 --docstring-convention google readsequencer/ tests/ + - pylint readsequencer/ tests/ + - mypy readsequencer/ tests/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index f2b74d79ed80385868c001801b96b62c67601d50..0000000000000000000000000000000000000000 --- a/Dockerfile +++ /dev/null @@ -1,42 +0,0 @@ -FROM python:3.11.1-slim-bullseye - -# MAINTAINER add_maintainer -MAINTAINER "Christoph Harmel" -# set names for user, group and user home -ARG USER="bioc" -ARG GROUP="bioc" -ARG WORKDIR="/home/${USER}" - -# create user, group, user home & makes the user and group -# the owner of the user home -RUN mkdir -p $WORKDIR \ - && groupadd -r $GROUP \ - && useradd --no-log-init -r -g $GROUP $USER \ - && chown -R ${USER}:${GROUP} $WORKDIR \ - && chmod 700 $WORKDIR - -# set the user, make sure the location where pip -# installs binaries/executables is part of the $PATH -# and set the working directory to the user's home -USER $USER -ENV PATH="${WORKDIR}/.local/bin:${PATH}" 
-WORKDIR $WORKDIR - -# copy entire content of the current directory to the -# working directory; ensure that this does not contain any -# files that you don't want to have end up in the Docker -# image, especially not any secrets! -# use `.dockerignore` to set files that you do NOT want -# Docker to copy -COPY --chown=${USER}:${GROUP} . $WORKDIR - -# install app and development dependencies -# assumes that dependencies in `requirements.txt` are -# automatically installed when installing the app; if -# that is not the case, they need to be installed -# _before_ installing the app -RUN pip install . \ - && pip install --no-cache-dir -r requirements-dev.txt - -# set default command; optional -CMD ["readsequencer"] diff --git a/readsequencer/__init__.py b/readsequencer/__init__.py index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..bcc1665e55c64d2ec9dc148a5666b910b7e44c36 100644 --- a/readsequencer/__init__.py +++ b/readsequencer/__init__.py @@ -0,0 +1 @@ +"""Initialise read-sequencer.""" diff --git a/readsequencer/cli.py b/readsequencer/cli.py index cc74c44cef025035b96a8963e79b3c89b280a4c2..89653d1aa144b1ccd25495546cc8c3aca15455ac 100644 --- a/readsequencer/cli.py +++ b/readsequencer/cli.py @@ -1,27 +1,49 @@ +"""Receive command line arguments.""" import argparse import logging from readsequencer.read_sequencer import ReadSequencer +logging.basicConfig( + format='[%(asctime)s: %(levelname)s] %(message)s ' + '(module "%(module)s")', + level=logging.INFO, +) +logger = logging.getLogger(__name__) + LOG = logging.getLogger(__name__) def main(): + """Use CLI arguments to simulate sequencing.""" parser = argparse.ArgumentParser( prog="readsequencer", - description="Simulates sequencing of DNA sequences specified by an FASTA file.", + description="Simulates sequencing of DNA sequences specified \ + by a FASTA file.", + ) + parser.add_argument( + "output", + help="path to FASTA file" + ) + parser.add_argument( + "-i", + "--input", + default=None, + help="path to FASTA file" ) - - 
- parser.add_argument("output", help="path to FASTA file") - parser.add_argument("-i", "--input", default=None, help="path to FASTA file") parser.add_argument( - "-r", "--read-length", default=100, help="read length for sequencing", type=int + "-r", + "--read-length", + type=int, + default=100, + help="read length for sequencing", ) parser.add_argument( "-n", "--n_random", default=100, type=int, - help="n random sequences. Just used if input fasta file is not specified.", + help="n random sequences. Just used if input " + "fasta file is not specified.", ) parser.add_argument( "-s", @@ -30,7 +52,6 @@ def main(): type=int, help="chunk_size for batch processing", ) - args = parser.parse_args() LOG.info("Read sequencer started.") if args.input is not None: @@ -55,10 +76,5 @@ def main(): LOG.info("Read sequencer finished.") -if __name__ == "__main__": - logging.basicConfig( - format='[%(asctime)s: %(levelname)s] %(message)s (module "%(module)s")', - level=logging.INFO, - ) - LOG = logging.getLogger(__name__) +if __name__ == '__main__': main() diff --git a/readsequencer/read_sequencer.py b/readsequencer/read_sequencer.py index 6cd7565a114b012a468b75bf1dcefb0ee8ad316c..a1437266416e11f89155dcce3553d7801727022a 100644 --- a/readsequencer/read_sequencer.py +++ b/readsequencer/read_sequencer.py @@ -1,12 +1,13 @@ +"""Main module for read sequencer.""" from random import choices from collections.abc import Generator, Iterator -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.SeqRecord import SeqRecord +from Bio import SeqIO # type: ignore +from Bio.Seq import Seq # type: ignore +from Bio.SeqRecord import SeqRecord # type: ignore class ReadSequencer: - """ReadSequencer class + """ReadSequencer class. 
Args: fasta: path fasta file @@ -22,23 +23,22 @@ class ReadSequencer: def __init__( self, - fasta: str = None, - output: str = None, + fasta=None, + output=None, read_length: int = 150, chunk_size: int = 10000, ) -> None: - + """Initialise class.""" self.fasta = fasta self.output = output self.read_length = read_length self.chunk_size = chunk_size self.random = False self.bases = ("A", "T", "C", "G") - self.n_sequences = None + self.n_sequences: int def get_n_sequences(self) -> None: - """ - Helper function to detect number of sequences present in set fasta file. + """Detect number of sequences present in set fasta file. Returns: None @@ -46,8 +46,7 @@ class ReadSequencer: self.n_sequences = len(list(SeqIO.parse(self.fasta, "fasta"))) def define_random_sequences(self, n_seq: int) -> None: - """ - Defines random sequences. + """Define random sequences. Args: n_seq: number of random sequences to be generated @@ -59,8 +58,7 @@ class ReadSequencer: self.n_sequences = n_seq def generate_random_sequence(self, length: int) -> Seq: - """ - Generates random sequence. + """Generate random sequence. Args: length: length of sequence @@ -73,7 +71,7 @@ class ReadSequencer: return seq def resize_sequence(self, record: SeqRecord) -> SeqRecord: - """Resizes sequence + """Resize sequence. Resizes sequence according to set read length. If sequence is shorter than read length, fills up with random nucleotides. @@ -93,7 +91,7 @@ class ReadSequencer: return record.seq def batch_iterator(self, iterator: Iterator, batch_size: int) -> Generator: - """Generates batch iterator. + """Generate batch iterator. This is a generator function, and it returns lists of the entries from the supplied iterator. Each list will have @@ -114,7 +112,7 @@ class ReadSequencer: batch = [] def run_sequencing(self) -> None: - """Runs sequencing. + """Run sequencing. Runs read sequencing of specified sequences from input fasta file or generates random sequences for a given read length. 
If number of @@ -125,7 +123,7 @@ class ReadSequencer: """ if self.random: if self.n_sequences <= self.chunk_size: - with open(self.output, "w") as output_handle: + with open(self.output, "w", encoding="utf-8") as output_handle: for i in range(self.n_sequences): record = SeqRecord( self.generate_random_sequence(self.read_length), @@ -134,35 +132,44 @@ class ReadSequencer: SeqIO.write(record, output_handle, "fasta") else: batch_generator = self.batch_iterator( - range(self.n_sequences), self.chunk_size + iter(range(self.n_sequences)), self.chunk_size ) for i, batch in enumerate(batch_generator): - filename = self.output.replace(".fasta", "") + "_chunk_%i.fasta" % ( - i + 1 + filename = ( + self.output.replace(".fasta", "") + + f"_chunk_{i + 1}.fasta" ) - with open(filename, "w") as output_handle: - for j, k in enumerate(batch): + with open( + filename, "w", encoding="utf-8" + ) as output_handle: + for j, _ in enumerate(batch): record = SeqRecord( - self.generate_random_sequence(self.read_length), + self.generate_random_sequence( + self.read_length + ), id="random_seq: " + str(j + 1), ) SeqIO.write(record, output_handle, "fasta") else: if self.n_sequences <= self.chunk_size: - with open(self.fasta) as input_handle, open( - self.output, "w" + with open(self.fasta, encoding="utf-8") as input_handle, open( + self.output, "w", encoding="utf-8" ) as output_handle: for record in SeqIO.parse(input_handle, "fasta"): record.seq = self.resize_sequence(record) SeqIO.write(record, output_handle, "fasta") else: - record_iter = SeqIO.parse(open(self.fasta), "fasta") - for i, batch in enumerate( - self.batch_iterator(record_iter, self.chunk_size) - ): - filename = self.output.replace(".fasta", "") + "_chunk_%i.fasta" % (i + 1) - for j, record in enumerate(batch): - batch[j].seq = self.resize_sequence(record) - with open(filename, "w") as handle: - SeqIO.write(batch, handle, "fasta") + with open(self.fasta, encoding="utf-8") as file: + record_iter = SeqIO.parse(file, "fasta") + for 
i, batch in enumerate( + self.batch_iterator(record_iter, self.chunk_size) + ): + filename = ( + self.output.replace(".fasta", "") + + f"_chunk_{i + 1}.fasta" + ) + for j, record in enumerate(batch): + record.seq = self.resize_sequence(record) + with open(filename, "w", encoding="utf-8") as handle: + SeqIO.write(batch, handle, "fasta") diff --git a/setup.py b/setup.py index 8a94d0d97f65a2f1a6dd9f9ce80635f5454f7fca..ceb391d11e7af487e384abd5192dc1c2f33b0e59 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,30 @@ -from setuptools import setup, find_packages +"""Setup tool.""" from pathlib import Path +from setuptools import setup, find_packages # type: ignore + project_root_dir = Path(__file__).parent.resolve() -with open(project_root_dir / "requirements.txt", "r", encoding="utf-8") as _file: +with open( + project_root_dir / "requirements.txt", "r", encoding="utf-8" +) as _file: INSTALL_REQUIRES = _file.read().splitlines() +URL = ('https://git.scicore.unibas.ch/zavolan_group/' + 'tools/read-sequencer') + setup( name='readsequencer', version='0.1.1', - url='https://git.scicore.unibas.ch/zavolan_group/tools/read-sequencer', + url=URL, license='MIT', author='Clara Serger, Michael Sandholzer and Christoph Harmel', author_email='christoph.harmel@unibas.ch', - description='Simulates sequencing with a specified read length from sequences specified by a FASTA file.', + description='Simulates sequencing with a specified read length from ' + 'sequences specified by a FASTA file.', packages=find_packages(), install_requires=INSTALL_REQUIRES, - entry_points={'console_scripts': ['readsequencer=readsequencer.cli:main']} + entry_points={ + 'console_scripts': [ + 'readsequencer=readsequencer.cli:main' + ] + } ) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0cf13d23e76271f02a81a28e53018db6ed382f49 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Initialise testing.""" diff --git a/tests/test_cli.py 
b/tests/test_cli.py index 6cb9ae1d6146a9045255f46c32e764b2ce18c516..b2b302cccec65dc23011552e0b3697099eb38a10 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,19 +1,17 @@ -import readsequencer.cli +"""Test cli.py.""" import pytest -from cli_test_helpers import ArgvContext, shell -import os -import glob +from cli_test_helpers import ArgvContext, shell # type: ignore +import readsequencer.cli + + def test_entrypoint(): - """ - Is entrypoint script installed? (setup.py) - """ + """Test if entrypoint script is installed (setup.py).""" result = shell('readsequencer --help') assert result.exit_code == 0 + def test_usage_no_args(): - """ - Does CLI abort w/o arguments, displaying usage instructions? - """ + """Test if CLI aborts w/o arguments, displaying usage instructions.""" with ArgvContext('readsequencer'), pytest.raises(SystemExit): readsequencer.cli.main() diff --git a/tests/test_read_sequencer.py b/tests/test_read_sequencer.py index 7157ccd670226bfe55479379fc8fea87b90ff861..86a589dfb41546f44a5413d3306cb20db63c4cde 100644 --- a/tests/test_read_sequencer.py +++ b/tests/test_read_sequencer.py @@ -1,9 +1,11 @@ -import pytest +"""Test read_sequencer.py.""" import os import glob from readsequencer.read_sequencer import ReadSequencer + def test_init_default(): + """Test default initialisation.""" sequencer = ReadSequencer() assert sequencer.fasta is None assert sequencer.read_length == 150 @@ -13,6 +15,7 @@ def test_init_default(): def test_run_random(): + """Test random run.""" sequencer = ReadSequencer( output="./tests/fasta_testfile/results.fasta") sequencer.define_random_sequences(n_seq=100) @@ -23,7 +26,9 @@ def test_run_random(): sequencer.run_sequencing() os.remove("./tests/fasta_testfile/results.fasta") + def test_run_random_chunks(): + """Test random run chunks.""" # setup class sequencer = ReadSequencer( output="./tests/fasta_testfile/results.fasta", @@ -44,6 +49,7 @@ def test_run_random_chunks(): def test_run_sequencing(): + """Test sequencing run.""" 
sequencer = ReadSequencer( fasta="./tests/fasta_testfile/50_seqs_50_1000_bp.fasta", output="./tests/fasta_testfile/results.fasta", @@ -59,7 +65,9 @@ def test_run_sequencing(): for file in result_file: os.remove(file) + def test_run_sequencing_chunks(): + """Test run sequencing chunks.""" # setup class sequencer = ReadSequencer( fasta="./tests/fasta_testfile/50_seqs_50_1000_bp.fasta", @@ -78,6 +86,3 @@ def test_run_sequencing_chunks(): assert len(result_files) == 5 for file in result_files: os.remove(file) - - -