Skip to content
Snippets Groups Projects
Commit 752f06bb authored by Christoph Stritt's avatar Christoph Stritt
Browse files

First working version

parent 2d44d68a
No related branches found
No related tags found
No related merge requests found
......@@ -7,3 +7,5 @@ assembly/resources/bakta_db
facienda.md
variantcalling/container/pggb_latest.sif
variantcalling/.snakemake
assembly/logs
assembly/.fontconfig
......@@ -13,14 +13,18 @@ The user needs to provide two things to run the workflow on her samples:
- a config file with some global options for the analysis
- a tab-separated table, without header, that contains the sample names and the corresponding paths to the HiFi consensus reads.
## Clone the directory
## Create conda environment containing snakemake and singularity
```
conda env create -f environment.yml
conda env create -f config/environment.yml
```
## config.yml
## Run the pipeline
In the file config/config.yaml some global parameters can be set:
```yaml
......
......@@ -9,16 +9,16 @@ cluster:
--output=logs/{rule}/{rule}-{wildcards}-%j.stdout
--error=logs/{rule}/{rule}-{wildcards}-%j.stderr
default-resources:
- partition=scicore
- qos=1day
- time=12:00:00
- mem_mb=20000
- "partition=scicore"
- "qos='1day'"
- "time='12:00:00'"
- "mem_mb=20000"
restart-times: 3
max-jobs-per-second: 10
max-status-checks-per-second: 1
local-cores: 1
latency-wait: 60
jobs: 500
jobs: 10
keep-going: True
rerun-incomplete: True
printshellcmds: True
......
......@@ -2,15 +2,17 @@
#
##############################
samples: config/samples.tsv
outdir: ./results
samples: "config/samples.tsv"
outdir: "./results"
ref:
genome_size: 4.4m
gbf: resources/H37Rv.gbf
genome_size: "4.4m"
gbf: "resources/H37Rv.gbf"
bakta_db: /scicore/home/gagneux/GROUP/PacbioSnake_resources/databases/bakta_db
container: /scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/assemblySC.sif
bakta_db: "/scicore/home/gagneux/GROUP/PacbioSnake_resources/databases/bakta_db"
container: "/scicore/home/gagneux/GROUP/PacbioSnake_resources/containers/assemblySC.sif"
annotate: "No"
threads_per_job: 4
......
......@@ -7,3 +7,4 @@ channels:
dependencies:
- snakemake=7.32.4
- singularity=3.8.6
- biopython
......@@ -2,7 +2,6 @@
import argparse
import os
import yaml
import sys
def get_args():
......@@ -12,15 +11,17 @@ def get_args():
# Parameter groups
parser_io = parser.add_argument_group('INPUT/OUTPUT')
parser_cluster = parser.add_argument_group('CLUSTER CONFIGURATION')
parser_cluster = parser.add_argument_group('CLUSTER CONFIGURATION (not implemented yet)')
# INPUT/OUTPUT
parser_io.add_argument('-s', '--samples', required=True, help='Path to tab-separeted table, no header, with sample name and path to fastq with HiFi reads.')
parser_io.add_argument('-s', '--samples', required=True, help='Absolute path to tab-separated table, no header, with sample name and path to fastq with HiFi reads.')
parser_io.add_argument('-o', '--outdir', required=True, help='Output directory for the results.')
parser_io.add_argument('-o', '--outdir', required=True, help='Absolute path to output directory.')
parser_io.add_argument('-n', '--dry_run', action='store_true', help='Do snakemake dry run.')
# CLUSTER CONFIG
    # CLUSTER CONFIG (not implemented, would have to tamper with the cluster config file)
parser_cluster.add_argument('-j', '--njobs', default='4', help='Number of jobs to run in parallel. [4]')
parser_cluster.add_argument('-t', '--threads', default='10', help='Threads per job. [10]' )
......@@ -36,8 +37,7 @@ def main():
# Infer pipeline location from path of run_assembly_pipeline.py
pl_path = os.path.dirname(os.path.abspath(sys.argv[0]))
print(pl_path)
# Directories for which singularity needs to be given access
bind_dirs = [
"/scicore/home/gagneux/GROUP/tbresearch/genomes/IN_PROGRESS/PacBio_genomes/Gagneux",
......@@ -47,24 +47,43 @@ def main():
pl_path
]
# Infer folders with samples, to add them to bind_dirs
sample_dirs = set()
with open(args.samples) as f:
for line in f:
fields = line.strip().split()
fastq_path = fields[1]
fastq_dir = os.path.dirname(os.path.realpath(fastq_path))
sample_dirs.add(fastq_dir)
bind_dirs = bind_dirs + list(sample_dirs)
singularity_args = "--bind " + " --bind ".join(bind_dirs)
cmd = [
"snakemake",
"--snakefile", pl_path + "/workflow/Snakefile",
"--directory", pl_path,
"--configfile", pl_path + "/config/config.yaml",
"--profile", pl_path + "/config/cluster_config.yaml",
# Overwrite samples and outdir parameters
"--config", "samples=" + args.samples,
"--config", "outdir=" + args.outdir,
"--jobs", args.njobs,
"--cleanup-shadow",
"--use-singularity",
"--singularity-args" + " \"" + singularity_args + "\""
]
#print(" ".join(cmd))
if args.dry_run:
cmd = [
"snakemake -n",
"--snakefile", pl_path + "/workflow/Snakefile",
"--directory", pl_path,
"--configfile", pl_path + "/config/config.yaml",
"--config", "samples=\"" + args.samples + "\"" + " outdir=\"" + args.outdir + "\""
]
else:
cmd = [
"snakemake",
"--snakefile", pl_path + "/workflow/Snakefile",
"--directory", pl_path,
"--configfile", pl_path + "/config/config.yaml",
"--profile", pl_path + "/cluster",
"--use-singularity",
"--singularity-args" + " \"" + singularity_args + "\"",
# Overwrite samples and outdir parameters in configfile
"--config", "samples=\"" + args.samples + "\"" + " outdir=\"" + args.outdir + "\""
]
print("\n" + " ".join(cmd) + "\n")
os.system(" ".join(cmd))
if __name__ == '__main__':
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment