diff --git a/.gitignore b/.gitignore index 479190f97f1a8c16a3319dceee47caf8be1a0178..155279f82b41df9f3e30436c29c74f70191ce43f 100755 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ assembly/resources/bakta_db facienda.md variantcalling/container/pggb_latest.sif variantcalling/.snakemake +assembly/logs +assembly/.fontconfig diff --git a/assembly/README.md b/assembly/README.md index 8c339e5721c86fa4741a2b0c236bea8d961d7190..177b319bc29cad12ba7db7fdd5089842a3f0986d 100755 --- a/assembly/README.md +++ b/assembly/README.md @@ -13,14 +13,18 @@ The user needs to provide two things to run the workflow on her samples: - a config file with some global options for the analysis - a tab separate table, without header, that contains the sample names and the corresponding paths to the HiFi consensus reads. + +## Clone the directory + + ## Create conda environment containing snakemake and singularity ``` -conda env create -f environment.yml +conda env create -f config/environment.yml ``` -## config.yml +## Run the pipeline In the file config/config.yaml some global parameters can be set: ```yaml diff --git a/assembly/config/cluster_config.yaml b/assembly/cluster/config.yaml similarity index 86% rename from assembly/config/cluster_config.yaml rename to assembly/cluster/config.yaml index a8eb21698259a77bfd2a075f95285bb9c955f5ad..9deb95e863bcd5a6811ba91847f13cd99097dabe 100644 --- a/assembly/config/cluster_config.yaml +++ b/assembly/cluster/config.yaml @@ -9,16 +9,16 @@ cluster: --output=logs/{rule}/{rule}-{wildcards}-%j.stdout --error=logs/{rule}/{rule}-{wildcards}-%j.stderr default-resources: - - partition=scicore - - qos=1day - - time=12:00:00 - - mem_mb=20000 + - "partition=scicore" + - "qos='1day'" + - "time='12:00:00'" + - "mem_mb=20000" restart-times: 3 max-jobs-per-second: 10 max-status-checks-per-second: 1 local-cores: 1 latency-wait: 60 -jobs: 500 +jobs: 10 keep-going: True rerun-incomplete: True printshellcmds: True diff --git a/assembly/config/config.yaml b/assembly/config/config.yaml index eb11d3bea3047d9be05641541f294eb53ad152b0..ee2e9b6623ce042b85cf734465653384738b886a 100755 --- a/assembly/config/config.yaml +++ b/assembly/config/config.yaml @@ -2,15 +2,17 @@ # ############################## -samples: config/samples.tsv -outdir: ./results +samples: "config/samples.tsv" +outdir: "./results" ref: - genome_size: 4.4m - gbf: resources/H37Rv.gbf + genome_size: "4.4m" + gbf: "resources/H37Rv.gbf" -bakta_db: /scicore/home/gagneux/GROUP/PacbioSnake_resources/databases/bakta_db -container: /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/assemblySC.sif +bakta_db: "/scicore/home/gagneux/GROUP/PacbioSnake_resources/databases/bakta_db" +container: "/scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/assemblySC.sif" + +annotate: "No" threads_per_job: 4 diff --git a/assembly/config/environment.yml b/assembly/config/environment.yml index f5e4dfb68a298668efd08fbd932411e9cf7f2d77..1946e195954f0eb9e6493dfdc05d9f64059563eb 100644 --- a/assembly/config/environment.yml +++ b/assembly/config/environment.yml @@ -7,3 +7,4 @@ channels: dependencies: - snakemake=7.32.4 - singularity=3.8.6 + - biopython diff --git a/assembly/run_assembly_pipeline.py b/assembly/run_assembly_pipeline.py index 388c61689988a8ac2eeeb7f139005c6a9f74435e..490b900ad352bcf6504fdb32d25e6665f0cc24c9 100755 --- a/assembly/run_assembly_pipeline.py +++ b/assembly/run_assembly_pipeline.py @@ -2,7 +2,6 @@ import argparse import os -import yaml import sys def get_args(): @@ -12,15 +11,17 @@ def get_args(): # Parameter groups parser_io = parser.add_argument_group('INPUT/OUTPUT') - parser_cluster = parser.add_argument_group('CLUSTER CONFIGURATION') + parser_cluster = parser.add_argument_group('CLUSTER CONFIGURATION (not implemented yet)') # INPUT/OUTPUT - parser_io.add_argument('-s', '--samples', required=True, help='Path to tab-separeted table, no header, with sample name and path to fastq with HiFi reads.') + parser_io.add_argument('-s', '--samples', required=True, help='Absolute path to tab-separated table, no header, with sample name and path to fastq with HiFi reads.') - parser_io.add_argument('-o', '--outdir', required=True, help='Output directory for the results.') + parser_io.add_argument('-o', '--outdir', required=True, help='Absolute path to output directory.') + parser_io.add_argument('-n', '--dry_run', action='store_true', help='Do snakemake dry run.') - # CLUSTER CONFIG + + # CLUSTER CONFIG (not implemented, would have to temper with the cluster config file) parser_cluster.add_argument('-j', '--njobs', default='4', help='Number of jobs to run in parallel. [4]') parser_cluster.add_argument('-t', '--threads', default='10', help='Threads per job. [10]' ) @@ -36,8 +37,7 @@ def main(): # Infer pipeline location from path of run_assembly_pipeline.py pl_path = os.path.dirname(os.path.abspath(sys.argv[0])) - print(pl_path) - + # Directories for which singularity needs to be given access bind_dirs = [ "/scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux", @@ -47,24 +47,43 @@ def main(): pl_path ] + # Infer folders with samples, to add them to bind_dirs + sample_dirs = set() + with open(args.samples) as f: + for line in f: + fields = line.strip().split() + fastq_path = fields[1] + fastq_dir = os.path.dirname(os.path.realpath(fastq_path)) + sample_dirs.add(fastq_dir) + + bind_dirs = bind_dirs + list(sample_dirs) + singularity_args = "--bind " + " --bind ".join(bind_dirs) - cmd = [ - "snakemake", - "--snakefile", pl_path + "/workflow/Snakefile", - "--directory", pl_path, - "--configfile", pl_path + "/config/config.yaml", - "--profile", pl_path + "/config/cluster_config.yaml", - # Overwrite samples and outdir parameters - "--config", "samples=" + args.samples, - "--config", "outdir=" + args.outdir, - "--jobs", args.njobs, - "--cleanup-shadow", - "--use-singularity", - "--singularity-args" + " \"" + singularity_args + "\"" - ] - - #print(" ".join(cmd)) + if args.dry_run: + + cmd = [ + "snakemake -n", + "--snakefile", pl_path + "/workflow/Snakefile", + "--directory", pl_path, + "--configfile", pl_path + "/config/config.yaml", + "--config", "samples=\"" + args.samples + "\"" + " outdir=\"" + args.outdir + "\"" + ] + + else: + cmd = [ + "snakemake", + "--snakefile", pl_path + "/workflow/Snakefile", + "--directory", pl_path, + "--configfile", pl_path + "/config/config.yaml", + "--profile", pl_path + "/cluster", + "--use-singularity", + "--singularity-args" + " \"" + singularity_args + "\"", + # Overwrite samples and outdir parameters in configfile + "--config", "samples=\"" + args.samples + "\"" + " outdir=\"" + args.outdir + "\"" + ] + + print("\n" + " ".join(cmd) + "\n") os.system(" ".join(cmd)) if __name__ == '__main__':