From cd541afe2fb05f815d28fdcad47326c2763bca7d Mon Sep 17 00:00:00 2001
From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch>
Date: Mon, 3 Feb 2020 18:18:03 +0100
Subject: [PATCH] generate Snakemake inputs from LabKey data table

Adds script `scripts/labkey_to_snakemake.py` which
- maps LabKey table fields to Snakemake parameters
- assembles required parameters from the table data
- infers required parameters from the input data
- produces files `config.yaml` and `samples.tsv` required by the Snakemake pipeline

A self-contained integration test for the script is located at `tests/test_scripts_labkey_to_snakemake` (execute script `test.sh`) and was added to the CI/CD pipeline.

Note that interim changes to the `master` branch were merged into this branch to avoid conflicts during merging.

Closes #39
---
 .gitlab-ci.yml                                |  11 +-
 scripts/labkey_to_snakemake.py                | 248 ++++++++++++++++++
 tests/RNA_Seq_data_template_test.tsv          |   3 -
 .../expected_output.md5                       |   2 +
 .../input_dict.tsv                            |  51 ++++
 .../input_lib_1.mate_1.fastq.gz               | Bin 0 -> 1862 bytes
 .../input_lib_1.mate_2.fastq.gz               | Bin 0 -> 1835 bytes
 .../input_lib_2.mate_1.fastq.gz               | Bin 0 -> 1669 bytes
 .../input_lib_2.mate_2.fastq.gz               | Bin 0 -> 1758 bytes
 .../input_table.tsv                           |   3 +
 .../test_scripts_labkey_to_snakemake/test.sh  |  22 ++
 11 files changed, 334 insertions(+), 6 deletions(-)
 create mode 100755 scripts/labkey_to_snakemake.py
 delete mode 100644 tests/RNA_Seq_data_template_test.tsv
 create mode 100644 tests/test_scripts_labkey_to_snakemake/expected_output.md5
 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_dict.tsv
 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz
 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz
 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz
 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz
 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_table.tsv
 create mode 100755 tests/test_scripts_labkey_to_snakemake/test.sh

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6863c72..4d4f4c3 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,9 +1,14 @@
 image: snakemake/snakemake:v5.9.1
 
+before_script:
+  - pip install biopython==1.76
+
 test:
   script:
-    - cd snakemake
-    - snakemake -n
-    - bash run_test.sh
+    - bash tests/test_scripts_labkey_to_snakemake/test.sh
+    - cd snakemake      # fix this in future version; all tests
+    - snakemake -n      # should be called from home directory
+    - bash run_test.sh  #
+    - cd ..             #
     # add additional tests here
 
diff --git a/scripts/labkey_to_snakemake.py b/scripts/labkey_to_snakemake.py
new file mode 100755
index 0000000..f8f7527
--- /dev/null
+++ b/scripts/labkey_to_snakemake.py
@@ -0,0 +1,248 @@
+## -----------------------------------------------------------------------------
+# Author : Katsantoni Maria, Christina Herrmann
+# Company: Mihaela Zavolan, Biozentrum, Basel
+# -----------------------------------------------------------------------------
+
+# -----------------------------------------------------------------------------
+# This script is part of the Zavolan lab quantification pipeline, which is used
+# for analysing RNA-seq data. The table is provided by labkey and is a tsv file.
+# If the user provides their own table, it should contain the columns that the
+# input dictionary maps to the Snakemake names used in this script.
+# -----------------------------------------------------------------------------
+
+import sys
+import gzip
+from argparse import ArgumentParser, RawTextHelpFormatter
+import os
+import numpy as np
+import pandas as pd
+from Bio import SeqIO
+from io import StringIO
+from csv import writer
+from pathlib import Path
+
+# ----------------------------------------------------------------------------------------------------------------------
+def main():
+    """Convert a LabKey sample table into Snakemake input files.
+
+    Reads the tab-separated LabKey table and the LabKey-to-Snakemake field
+    dictionary, builds one row of pipeline parameters per sample, and writes
+    the samples table and a static YAML config file. The length of the first
+    read of each mate 1 FASTQ file sets the 'index_size' and 'kmer' columns.
+
+    Raises:
+        ValueError: if a mate 1 FASTQ file contains no reads.
+    """
+    description = "Preprocess of the table and create config file."
+
+    parser = ArgumentParser(
+        description=description,
+        formatter_class=RawTextHelpFormatter)
+
+    parser.add_argument(
+        "--input_table",
+        dest="input_table",
+        help="input table containing the sample information",
+        required=True,
+        metavar="FILE")
+
+    parser.add_argument(
+        "--input_dict",
+        dest="input_dict",
+        help="input dictionary containing the feature name \
+              conversion from labkey to snakemake allowed names",
+        required=True,
+        metavar="FILE")
+
+    parser.add_argument(
+        "--genomes_path",
+        dest="genomes_path",
+        help="path containing the fasta and gtf files for all organisms",
+        required=True)
+
+    parser.add_argument(
+        "--multimappers",
+        dest="multimappers",
+        help="number of mulitmappers allowed",
+        required=False,
+        type=int,
+        metavar='value',
+        default=1)
+
+    parser.add_argument(
+        "--soft_clip",
+        dest="soft_clip",
+        help="soft-clipping option of STAR",
+        required=False,
+        choices=['EndToEnd','Local'],
+        default='EndToEnd')
+
+    parser.add_argument(
+        "--pass_mode",
+        dest="pass_mode",
+        help="STAR option pass_mode",
+        required=False,
+        choices=['None','Basic'],
+        default='None')
+
+    parser.add_argument(
+        "--libtype",
+        dest="libtype",
+        help="Library type for salmon",
+        required=False,
+        default='A')
+
+    # The config file is always written, so its path must be given.
+    parser.add_argument(
+        "--config_file",
+        dest="config_file",
+        help="Configuration file to be used by Snakemake",
+        required=True)
+
+    parser.add_argument(
+        "--samples_table",
+        dest="samples_table",
+        help="Table with samples",
+        required=True)
+
+    # Show the full help when called without arguments; this must happen
+    # before parsing, because argparse exits on missing required options.
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+
+    # argparse reports usage errors itself and exits via SystemExit, so no
+    # exception handler is needed here.
+    options = parser.parse_args()
+
+    sys.stdout.write('Reading input file...\n')
+    input_table = pd.read_csv(
+        options.input_table,
+        header=0,
+        sep='\t',
+        index_col=None,
+        comment='#',
+        engine='python')
+
+    input_dict = pd.read_csv(
+        options.input_dict,
+        header=0,
+        sep='\t',
+        index_col=None,
+        comment='#',
+        engine='python')
+    input_dict.set_index('snakemake', inplace=True, drop=True)
+    sys.stdout.write('Create snakemake table...\n')
+    snakemake_table = pd.DataFrame()
+    for index, row in input_table.iterrows():
+        if row[input_dict.loc['seqmode', 'labkey']] == 'PAIRED':
+            snakemake_table.loc[index, 'seqmode'] = 'paired_end'
+        elif row[input_dict.loc['seqmode', 'labkey']] == 'SINGLE':
+            snakemake_table.loc[index, 'seqmode'] = 'single_end'
+
+        fastq_dir = row[input_dict.loc['fastq_path', 'labkey']]
+        fq1 = os.path.join(fastq_dir, row[input_dict.loc['fq1', 'labkey']])
+        snakemake_table.loc[index, 'fq1'] = fq1
+
+        # Infer the read length from the first record only. An empty file must
+        # fail loudly; otherwise read_length would be undefined or silently
+        # carried over from the previous sample.
+        with gzip.open(fq1, "rt") as handle:
+            first_record = next(SeqIO.parse(handle, "fastq"), None)
+        if first_record is None:
+            raise ValueError('No reads found in FASTQ file: ' + fq1)
+        read_length = len(first_record.seq)
+        snakemake_table.loc[index, 'index_size'] = read_length
+        snakemake_table.loc[index, 'kmer'] = 21 if read_length <= 50 else 31
+
+        # Single-end samples may lack a mate 2 file name; os.path.join() would
+        # raise a TypeError on the missing (NaN) value, so it is kept as is.
+        fq2 = row[input_dict.loc['fq2', 'labkey']]
+        if pd.notna(fq2):
+            fq2 = os.path.join(fastq_dir, fq2)
+        snakemake_table.loc[index, 'fq2'] = fq2
+
+        snakemake_table.loc[index, 'fq1_3p'] = row[input_dict.loc['fq1_3p', 'labkey']]
+        snakemake_table.loc[index, 'fq1_5p'] = row[input_dict.loc['fq1_5p', 'labkey']]
+        snakemake_table.loc[index, 'fq2_3p'] = row[input_dict.loc['fq2_3p', 'labkey']]
+        snakemake_table.loc[index, 'fq2_5p'] = row[input_dict.loc['fq2_5p', 'labkey']]
+
+        organism = row[input_dict.loc['organism', 'labkey']].replace(' ', '_').lower()
+        snakemake_table.loc[index, 'organism'] = organism
+        snakemake_table.loc[index, 'gtf'] = os.path.join(
+            options.genomes_path,
+            organism,
+            'annotation.gtf')
+        snakemake_table.loc[index, 'gtf_filtered'] = os.path.join(
+            options.genomes_path,
+            organism,
+            'annotation.gtf')
+        snakemake_table.loc[index, 'genome'] = os.path.join(
+            options.genomes_path,
+            organism,
+            'genome.fa')
+        snakemake_table.loc[index, 'tr_fasta_filtered'] = os.path.join(
+            options.genomes_path,
+            organism,
+            'transcriptome.fa')
+
+        snakemake_table.loc[index, 'sd'] = row[input_dict.loc['sd', 'labkey']]
+        snakemake_table.loc[index, 'mean'] = row[input_dict.loc['mean', 'labkey']]
+        snakemake_table.loc[index, 'multimappers'] = options.multimappers
+        snakemake_table.loc[index, 'soft_clip'] = options.soft_clip
+        snakemake_table.loc[index, 'pass_mode'] = options.pass_mode
+        snakemake_table.loc[index, 'libtype'] = options.libtype
+
+        if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE':
+            snakemake_table.loc[index, 'kallisto_directionality'] = '--fr-stranded'
+        elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE':
+            snakemake_table.loc[index, 'kallisto_directionality'] = '--rf-stranded'
+        else:
+            snakemake_table.loc[index, 'kallisto_directionality'] = ''
+
+        if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE':
+            snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA'
+        elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE':
+            snakemake_table.loc[index, 'fq1_polya'] = 'TTTTTTTTTTTTTTTTT'
+        elif row[input_dict.loc['mate1_direction', 'labkey']] == 'RANDOM':
+            snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA'
+
+        if row[input_dict.loc['mate2_direction', 'labkey']] == 'SENSE':
+            snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA'
+        elif row[input_dict.loc['mate2_direction', 'labkey']] == 'ANTISENSE':
+            snakemake_table.loc[index, 'fq2_polya'] = 'TTTTTTTTTTTTTTTTT'
+        elif row[input_dict.loc['mate2_direction', 'labkey']] == 'RANDOM':
+            snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA'
+
+    snakemake_table.to_csv(
+        options.samples_table,
+        sep='\t',
+        header=True,
+        index=False)
+    sys.stdout.write('Create snakemake table finished successfully...\n')
+
+    # The config content is static and does not depend on the samples.
+    sys.stdout.write('Create config file...\n')
+    with open(options.config_file, 'w') as config_file:
+        config_file.write('''---
+  output_dir: "results"
+  local_log: "local_log"
+  star_indexes: "star_indexes"
+  kallisto_indexes: "kallisto_indexes"
+...''')
+    sys.stdout.write('Create config file finished successfully...\n')
+
+
+# _____________________________________________________________________________
+# -----------------------------------------------------------------------------
+# Call the main function and catch keyboard interrupts
+# -----------------------------------------------------------------------------
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        sys.stderr.write("User interrupt!" + os.linesep)
+        sys.exit(0)
+
+
diff --git a/tests/RNA_Seq_data_template_test.tsv b/tests/RNA_Seq_data_template_test.tsv
deleted file mode 100644
index 34c7bb1..0000000
--- a/tests/RNA_Seq_data_template_test.tsv
+++ /dev/null
@@ -1,3 +0,0 @@
-Entry_Date	Path_Fastq_Files	Condition_Name	Replicate_Name	Single_Paired	Mate1_File	Mate2_File	Mate1_Direction	Mate2_Direction	Mate1_5p_Adapter	Mate1_3p_Adapter	Mate2_5p_Adapter	Mate2_3p_Adapter	Fragment_Length_Mean	Fragment_Length_SD	Quality_Control_Flag	Checksum_Raw_FASTQ_Mate1	Checksum_Raw_FASTQ_Mate2	File_Name_Metadata_File	Name_Quality_Control_File_Mate1	Name_Quality_Control_File_Mate2	Organism	TaxonID	Strain_Isolate_Breed_Ecotype	Strain_Isolate_Breed_Ecotype_ID	Biomaterial_Provider	Source_Tissue_Name	Tissue_Code	Additional_Tissue_Description	Genotype_Short_Name	Genotype_Description	Disease_Short_Name	Disease_Description	Treatment_Short_Name	Treatment_Description	Gender	Age	Developmental_Stage	Passage_Number	Sample_Preparation_Date	Prepared_By	Documentation	Protocol_File	Sequencing_Date	Sequencing_Instrument	Library_preparation_kit	Cycles	Molecule	Contaminant_Sequences	BioAnalyzer_File
-Fri Dec 20 00:00:00 CET 2019	/scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031355465-60677668	LN18C	LN18C_rep1	PAIRED	BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz	BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz	ANTISENSE	ANTISENSE	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	300.0	100.0	xxx	xxx	xxx	xxx	xxx	xxx	Homo sapiens	9606	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx
-Fri Dec 20 00:00:00 CET 2019	/scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031410069-60677669	LN18C	LN18C_rep2	PAIRED	BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz	BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz	ANTISENSE	ANTISENSE	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	300.0	100.0	xxx	xxx	xxx	xxx	xxx	xxx	Homo sapiens	9606	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx
diff --git a/tests/test_scripts_labkey_to_snakemake/expected_output.md5 b/tests/test_scripts_labkey_to_snakemake/expected_output.md5
new file mode 100644
index 0000000..622cddb
--- /dev/null
+++ b/tests/test_scripts_labkey_to_snakemake/expected_output.md5
@@ -0,0 +1,2 @@
+de940b0dd38a67a7433536a5b3aee0ac  config.yaml
+d9c9ea4cd6108d39a2521dd87cd0c7e1  samples.tsv
diff --git a/tests/test_scripts_labkey_to_snakemake/input_dict.tsv b/tests/test_scripts_labkey_to_snakemake/input_dict.tsv
new file mode 100644
index 0000000..a309c29
--- /dev/null
+++ b/tests/test_scripts_labkey_to_snakemake/input_dict.tsv
@@ -0,0 +1,51 @@
+labkey	snakemake
+Entry date	entry_date
+Path to FASTQ file(s)	fastq_path
+Condition name	condition
+Replicate name	replicate_name
+End type (PAIRED or SINGLE)	seqmode
+Name of Mate1 FASTQ file	fq1
+Name of Mate2 FASTQ file	fq2
+Direction of Mate1 (SENSE, ANTISENSE or RANDOM)	mate1_direction
+Direction of Mate2 (SENSE, ANTISENSE or RANDOM)	mate2_direction
+5' adapter of Mate1	fq1_5p
+3' adapter of Mate1	fq1_3p
+5' adapter of Mate2	fq2_5p
+3' adapter of Mate2	fq2_3p
+Fragment length mean	mean
+Fragment length SD	sd
+Quality control flag (PASSED or FAILED)	quality_control_flag
+Checksum of raw Mate1 FASTQ file	mate1_checksum
+Checksum of raw Mate2 FASTQ file	mate2_checksum
+Name of metadata file	metadata
+Name of quality control file for Mate1	mate1_quality
+Name of quality control file for Mate2	mate2_quality
+Organism	organism
+Taxon ID	taxon_id
+Name of Strain / Isolate / Breed / Ecotype	strain_name
+Strain / Isolate / Breed / Ecotype ID	strain_id
+Biomaterial provider	biomaterial_provider
+Source / tissue name	source_name
+Tissue code	tissue_code
+Additional tissue description	tissue_description
+Genotype short name	genotype_name
+Genotype description	genotype_description
+Disease short name	disease_name
+Disease description	disease_description
+Abbreviation for treatment	treatment
+Treatment description	treatment_description
+Gender	gender
+Age	age
+Developmental stage	development_stage
+Passage number	passage_number
+Sample preparation date (YYYY-MM-DD)	sample_prep_date
+Prepared by	prepared_by
+Documentation	documentation
+Name of protocol file	protocol_file
+Sequencing date (YYYY-MM-DD)	seq_date
+Sequencing instrument	seq_instrument
+Library preparation kit	library_kit
+Cycles	cycles
+Molecule	molecule
+Contaminant sequences	contaminant_seqs
+Name of BioAnalyzer file	bioanalyser_file
\ No newline at end of file
diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..ddd5858c0318c865dd3c1e4a20ca03c07144daac
GIT binary patch
literal 1862
zcmb2|=3tQau!v(|o*Hqw=&_l=p5NL<pB1OJOmT54`nZ8%TgQi8j-lWF?^k*8j?p#X
z%dCZ}6CbSZar5^-U-SF<@&3Ak>$e|&OaFiS{(rstn)kbIKmBCLBVTV{^W)+E|9^g7
ztdG&F{eSWM-y5|&$>uTtUfmD+WcgEMufOH9DIdOC9*fv#G1r)V=HnR#2U2Z<^K{NX
z>oETi$g@$K|Isen7ng6p&sS{uxl=cFvn>DguN&sg+-}a&doB0(s<Yp|<uV_Bnk}*G
zH><%NdDb=~>G)@jkt+NK1_yFZU9sSN&*{YEWm(;Gn)%MdX){XM<+R!>*e5*hJs=z*
zDzQtm<SBnK$3fOJDmSKB%wQB&KYjVK$g)q;r;h90x_jr;$(1=ftUflceJv%iPW{G%
zV@%GCiw_*Q+CL{xjd#bD8$VUN5;Aw*+QBa&a)71rc<7u{70%yOc<MV{)M8loF)=UO
zETkl%{A|}F<DVBcE$y3rbgjmViwPMLhn<ctYkXM7)OL=s?e*RR2mXs+K5|I)@9#kF
z`38o&+2(9{HlxObhk;%1Nc{d*0};D3#p*0fcEti>rMDI~GAb_$$<x(-yX<6XE2Ex(
zu-~1D;(w-XFAruuT%e>;wJA|VTI5hfnE$uW&ezm0$S5{2yLFyl#BjKEzESmq1GX88
zd-k|?J-_|!)`mCRd$}Ymo^&O$ERxXbb$Qdes4?-6RNLv=2D6gA2?-JFcYl$&Wmg%@
zJBg_==$v{RFT>&htrCr`JJeP0a_#MjytSb@aohV%H?vo7Z*$|$PFd5iPJHz_6N4S=
z6L;9{;^h&$^?>PelL^ym&Z_Kr3#uPhix{|Y`8&ET=)S^d;JGPdRhE;p#J8LUa{D$t
z5}uUP<{rYqp0i_qL*fqMHo=*PiYLvRw4#3AOQ)4h+Osyk@=>&lmioG5L5{<lO(Kl~
zp<1@v)p=f@S6VbZe)~j@MC0qpu7{iq8~v=jg)5GWNTn36YJB+7lxMrN%BI)rKkzO~
z>ohv=>Gf{MmP}hW4TJCt850D1nEhgs8f*_Kct)=7sawW7O=a5O_opA;T-&(t{kab2
z-e&2B=GHQwNzUu?7O`A%(D2RJYWO|p;^e)5jZ}EnO!4vWoZ#=)A1?Fi{Hfn-y|!K4
z(BF7crRm9xDVlu+77vy4ZG)FNpJKWG*I2wNtvbokM}l#lk*BjlawEsV%2lchCwLjP
zS4nyLe|EHzU3pmV#do6(iDxUrwph$Dm~>sR=EeJ-?juu^FL<+_;JfAE!u0a&0!F<#
zGfbxbF%+Nr<>BHl^PMIq3T3Lq7tF|JikUd$yW35fLwh_0UOzMozQV1`r#|P2#T|{w
z{81W9xqZU=3}?wt$o}AT;uB|rXHbY<*`yAy*qJ(-Dr>ZK?(n_pnyyp0fUEkPOzy(g
zbw?kzOqFq4)-Cs|A>_oBV=4-&PgW_Jxb$4*ag=vqa&3NNDn9q5@oAk2{GOTO8)^l`
ze{=3<`uv6cv!1KX+_Ie)9u*w&-Zigg#|iIM2})vLj&TWCoQY7LKU3*cbcdPd>PJ&N
z+_X>XtW?vVuef%BosYiStEt>)w@TTxZ?==$ZZXGT@>9P@4OP}rQ#Na)+<TI<=M-D7
zXMwZp&gP2~d`uQwcDu*~ue0!qKlLxz?u6_Og)QQIO%H=54l(kn-+5Az9<$+ylf;Hu
z2^-YC=Z2iu`R?HDAU@Z#M)dY7gGo}nI_islINS{3oE6ek(|0&dv@hu7^G_S@N6Glz
zI9a%dsr_S~>9nl0z(_XB%+8%>;wqPLwa#%qD_g{IO1EyUod=h4aOHLH|E)70ywqL$
ze8H@mwQJur9=)SpcKMiV`}DO(AGXYwk^O046t8l+a0~A#?#R~0^?awI-yW!FEd6A<
zf7PBZLV{cxsnZ@g#ZAn5W#gmWKcDA@lX2^+_P0k<zs_}9!poI<EnG%V{m+vR6XtJ8
zy8K!`eWS;v)Bj!8GdZ=2-&IsU_wRPr`juiQU-i!NSt@a@y>mC;nPpKSmlJ&JQ<u%j
z;&`9oT=@6?#WD?Z%je5>nn~SgFPgmbWWrIm?!%n+mNAj>J64x2n!!GK*;28|qUTR$
zdFpSw@ufJZXo|rq{*^(8R;$IY<_=xjzVDYo`Wunt_nP&pB8m5>+dNdi+gtZ3OV;x1
z<ts^ND!1|<Xoxh}tUfhs`lU5VXBCz-SZvCDQ>pECaC;ZC_R$NaTW7ZXd;I2F+~F4a
z>w7m})?lsFcWrrnp?TA`%OWw7j}KUG)4e&p<W1;Si#Z=B%YIo{&%kcASc@afPQy@x
zbJdD-7k^#7c<u^Y^BNUh=d@J|-maT#v+e1l)qdxSa}VBF_-UEeN3B`euVQBTef{Tp
z9un^CU!LggO<o*TI!(nu%hF3@Qj}+4kc0Jstzqh3`(K@___i@Lcg0%it9vIg|Fp?J
zcRI00#?4o`Ki5a*NBe0``%3v=y>>@#IV9Hy%{`Ul*Kl!B#fOute2v&TFGX+GnrW62
px*~1UC9Ze>KVH?9nx=gySUt+~fc5nqv+n<67SBF(Zoec00|14JhDQJZ

literal 0
HcmV?d00001

diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..0e97fa50b8d9f6457a2d73f6f0aa324a5949a1e0
GIT binary patch
literal 1835
zcmb2|=3tQau!v(|o*Q|3(ql88J?F!hXtB!&d8*tL%AL;VUF01TtG?#czyH%&Ua)=r
z79qXPeUpc0s!4-e^Y6Fw&!68_dj0m}Z{dG$_tzh{-yeQ;ZRWh(-+w-SczC(~|Ig2n
z_Oq9*n_n7Res%NPRoi~<{^?=)ti{>S@c5I({YBsIoi<F5?7n%rq{~RX-st7#GcJ3-
z#Z60flDj0WBsWp~#OghEvnJnrVgK(3ha%5qiQ|P6>K`Y#%k5aw{dmif^``~fnh&1r
z&*zqy_1z%kyIbN8dlt5hSJXGEYm3}GY&PFu_7#c0ToX#X!bG}`SPKPwU{T>_`^9%a
zm0!ckbDzr2ZiAMSEK<i;U-+K2DUtbbYAWYxgEQql(zlt9H%wojuwz~a+jUMJruG94
z{oSSpJ;z+<ec_&_B9*e@YoP4f#>L%Toh*zALC={zXZjR%aGs8_?r1C2*(WC+u4=Yy
zrSkQ|-+o&7u1nl8w_#xs*Ws`0E-pyxx_#u%lnnVA-qpe}`i6%bE=X>*cZ)vElgDP2
zS>>H{=tcCC&CQIBiRa#mm%R^4;QJ{M!x4J*``$TKKkv2QH&14Du2#~_a&caMal(Wj
zS#4_Dp0&<mmil1eZ|Qvb?%|CRsvob`EooqDJ=Xd(SglWT<}-B(HHB$v{)z@?jwB@X
zH0TLzKl%KVdVq$MvwXpY^+hkj4NBf8X1rTl<Q%H3DRgI9B%{)mV7Ay*A1yD{xOz^X
zsIcd%wDgmV8$VvkJkPe>>(SN8GkvGw^3xWZpA;759Oq*DF4y*Y@vH=oFBx0neJ43Y
zG)o>`swyt=c%tGW<C!%l10Ja)t7U7c)_nNjEz$PVLdS7NLPWd!$q5Ph+^tf4T1Wq~
z@Vu2ZC}}uY9rYwp@KvMLjFy+-%4+*%3(Y5=Hn`*=#XJ3!=bJdaNe`!=Ub;VH8hiP~
z2k%l_55Kg3US?TewkDxsPy55N;I1{VRv$PJug8{Faplh05_ivr^NK7=hZ|X$6q{LB
zFXT}Sa9wFID?-YAz4lQxXTDF5y+8d?(tP!b$I`)KN7zZnv&Z$@aumd~4y8|Kaq9{2
zeO2qFz18;76vf#y!aDOG{}q$2i#J?r(f9f4$&cC=b1qEWE~R?;ZlLx9+Zif;VO%U?
zk+199nws1`bchAmpKj653%^s*yW)fE)U+#sdtYZys>%JfY1Y2pHeEjTmM2>;SFCtH
zOFT;9s@(^r*9$s1uX%8!{jmAWoVwt<>kIA;8Q;FKIjm9+?D>2#ywWh#yY5-(;{y|y
zC(fA0Iy+#?kw<r&{niy)EL7S3jcr!Vh77f56-(k3=S`J<(j}!4<hIJHWl7JqT9;Wo
zi&t(@oWHrS^%=*n-u2ySh1=%2h=rUBow|F+UZ-ECv)1@~oZRNTbH!A7lln-dmno;%
zZ#jnYHVZ`@p2W0f;hkAG!c!;Zd=b;sIJ9}mt<bZVoNG@qCB9m!F}FQ5U-aADYie?S
z8J^+Fa<jO;GVwY%EsXghyoO0bM$ToS<biLEWo|(Y>E#Fe4nI0_g-zA9N_SCPVz5$M
z;!=%MOKhs!)UR305tv-0UT~pK`sw}-g%rncCC0bfV;(fPN-+zDIw~`;P2cmWM=9S!
z!hgZEeF1^R_InQ><+&%dc9rLjhUXjNZayq<@+~WW!WyN+vin~`(VX_@7sD@9_)p}`
zz8EbzHOo{XVCHWzSNjF$G8!7?3O-AlS~eHV(LR2UfA?xrKJ_Ol*JDN7`ARQ0DQvu$
z63@HHy;)Y*Lt*mgMeHB%gmvT`@G;q_`g~va{Hh5%^S)g(U6en|Oz7C8w|ft=tl%oB
z;cNVm6cQ8pJ>n~;?M=%g&fgjA7g)9=cApiC-<AJvd&ME<niaDRPySO5Dy&+qA-JY7
z+;zo-y?s(*f(Bxtl_%ycd=q$f;l>pI-^ccUwY$P;d(&=(`o)~0{hcT0Zk(TTR&a|#
zw#pYl4TdA0SsfYMeDj^NHD*_sWq2)XIA*NBMAfbC`-eEqUy~)xWcGXM3#nB(XkXo}
z;M;N8FGXHr$GvN3pS8@(S^q-+&QyyVY}{KmyID<1V6snGUX-P_l1*W;=kzNx5Af?Q
z3v^5RJT)>a?$cp$ucFG+)>#ii_FoJ<%W3;E@51EpE555z-yGP~edwD~kjyR%<zK~`
z(iy6gOvPtTvT?GVx@<|w%M;2RxxMU}s;WXu=3nW%{4#FRY9~>>+San-qMZ7|U8@R}
zr@#9DfBE8B##Nhw>Q>0#nCTLhZq~Q$ded6Anx3CVvHejq9p+k0`TfE4w%+!)#~D@5
zOWZu#T+6p8fn)XCmoFD;wwJxi+?UvUnA7&=)?Mn?udMrW_v9(L@AHJWr{*%q=1VoD
z=RGYxe`;1xvdgDN^W2xd^YaTiR4w@)Rj0qGDs@$twz+08=fdRg+vfdHocd4m*1IJi
z<;xXr+8vMCmQgG?_lG~%vj$JT6&!Cuw0$}yH{D!x_eziR(E<?-b}uEK?!Q@nHGkOs
MejPixUy^|V0L8O$fB*mh

literal 0
HcmV?d00001

diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..af6dc8e311511b121c45266063b97f7882519334
GIT binary patch
literal 1669
zcmb2|=3tQau!v(|UK@FO(rq`MJ@3Px%=2;zRMO0SnINFo_x+Vm=%4@bk6z|Y5OTFm
zdmFI+VZn*0FATkG-%jtpzyJNN#CnUb_3!`v_;&wzeEjTZ7q-2R{r>N5%H#6izaAg|
z{cdmW?90E`*-v>{W^eM>ez*Den7ET4XK&g$)zNF(U7=&|G?&i#smVQQs_9dfd}E!{
zEAO=B>uGhLR$*_8p7S}seqZlni`YJXnajs*x*ppUdAVC2OxO7S&@}e`y7Z~<joqqB
ze(tH=*|e~h_i){HrnYY(2_8qIBD`LomhUy*J|}b9!EZ^rj0andqK~ny7QQ|;^SBZZ
zm(k8Wa|#~j>FIOlCI0yF#qh4}v9B+=+Wuxs?5Yh;cp-IQL9fm<iS74!1S*c%wR@+$
z)8+Lt=(^u#yoS}}0N1{Smdev?&j0#bD-ox(sqx_lskViU<|T(09&qqJ#eOXD@Wa#d
zHhh+>Tf}`QBYld*^VkIkk`pS-eu>n$tqXKum18;n?r=c&vE<G34_yeYIQ))5!q2x+
zQJd`n_e?*ZbOy68+%4YM)=U(NSukTxhj~Q{izU<T_nsFn=5nR@?|qn-6t(Z2kDvbp
z$>X9twXNn`Y!fA#luj}pX>{Q@#KJFS<IpE`g!6Ed4+B$Yi-mz06L*m2o<kK2ZEBn+
zN*ixCzJ6>H)A5f!>sWLr$(}#hYjN~yBJ1IRhrI3wE=*SPZEUcgALzJVJ$+V_O1ITa
z#W{kG-<V4daCbJHcydB?SG-DBhGuwOyY#E%Bw3N;ToSh!4P>N(RCwG&QdApw{+&P5
z*u}les*Ue{LP0_AiPE~`x)rkzsT^8*=D!lp`RjWd9)6K(tK+|WLB9Nyh-T~Y+h@9u
z)hM_(7AUAsV0&n>OCip5vYSDmYVE-%d@1MWynE4hd|7N*?0R`s{%uT7pOUz~ZfHup
z9<t*w<H|+$r9yl&e=;g`Fndhus`6`5SkRpw<;Krhb!L(z56dd;yLp~Bh5L#RCe9I?
zY<Fl;N6o*wIn!rKZYX+N!|3w9P=ZT5;bbM3rfFWmiVG8Cy&12{G&VnBT(c;x<m3_;
zK^L1CuI^b%+PXnX7KL6@WI1M@*cBA8;P{#~uU=2loD^*~?T&f0&~cM_rvumMK1+(=
zX@6P5aUs#BDgOCFNw#%64=!}___kZQa6%K4c}g48wy+X+qY|yOVom0kE@@n<-bGxx
zt<oQ!d^+6X6>;i}zE^uL%Uw^S{uhbi3su@}|2CR<3NthF8;WpUvRuz&(9N2_mMna?
z?as^1n;sl=xU@1P!_>F0eEo%d4$qi**RI9ORX^qz4cc`3eP{ELW^cw#N4h*VM0&5B
z5U@cf@bIS7Gu>A5&N<}S6mxW1>$McoNAtt3L_3RD?M(h}6!$n_p{oD#zNxG~8kaJi
zF-v9k%z30Fe!Opz^W+kvR#tV_C6+5yyk~VBI-`7TM^5))W1VHQ@Bf~@{O!hL3!QxC
zMTJG|p7!%l(^pQXPuYS`dg|Yl3GuJGG9ho$xh9cP#Rr?0g%uvMYU!_i!GAvJ#p~p?
z{`rp=Ir+}xTcdj`%x2}1`7`*oOns%pBIzdAGpjjf*7HR@PnS%}QW6*Fcq0`YCgLG~
zob}MZ&?$E+4kZU%vj{nHGWqb1jQ)_T#qS+T7C20wEzBJ3^29POC(rvr=(DWZy4-Am
zCWST?LJPmzo5-2nD`KBu$|qV?^x`G|@8&FD)*X={qLEKqAGywz^yE3lV`}loA@7I|
zQ++~f;wq1;6@j0*9<#PTQ|h0#U`ANmx>M(7-H6zi_d7te>eH3YH{L8+JcadUZ}Msd
zt(?5-MMpw}+cI42Jc~JnET?(h4hdcHeUh;<n_b{Teo?(oQj;MTFI;SOP28*ek&9@=
zn$6Aui`IyIUz%aI%gS(1*2Sy~wUM=2+%q4ZQvB!PZrk4|6<^Wk+&a%@qv6f%2NbOW
zZly9U+U4+7n00Z$h7}Ue<i!$qCA_*IC9L{7^rzdE#pg?}h)v5B^C?_reSWso>!P0W
zf)y7gFF(v@QBu9_$rMcy?gr(O#V$!FRz~K{(tZ4M?)qc<?0gMZd}{aKkUrHu<i*M4
z!+JjV_5S{n5#4?{B9!gfX1z&OVNJbabF^Do;wN~1S@PoLuS;`3terRKVT4(Qqf;h>
zmlapQ#H=lk^Gu&^ZQ5~mc5drDo3fs6$6N_7xv0hV_g1o8TDtj>cdhS{D@V?MuGgpt
n{M#;aP#|>`*Ttfr!GAWS^u2xL&L}AO`J3?d+{B6dB^ek1gL4+;

literal 0
HcmV?d00001

diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz
new file mode 100644
index 0000000000000000000000000000000000000000..dbc4e5096650863aeaa4bcea12b21a9c4b4d2509
GIT binary patch
literal 1758
zcmb2|=3tQau!v(|o*Qv+(`*ZYJ^R<+xGlG>Ogx$SnP_{@lueJa!$ME~uRpnJx4T46
z&SU}h<_Rm!*Up<KSNrq*@9)m$GXBZ`&p+(Xw_jIZpF1n~c3s)s=i4tHD*yfK^YiBq
zH|Ng&`1!*8Qw7yBv;N)xru<Z5vg+m;MP1V-9gWa?rCzuv=4i)98Os@8IQWBqR_^qg
zk@mECe&VvH8DVaxE$jY#6?EEh=J?mDqxJR%4I7VM`17EMv+je`_B}IRop`%fv8E>S
zz=eJ74=XYlovoSJj>P?YS9ETVS&YIRF4vvUW=JYO-&wcjoKAUa(z&P0S&x5W<fzG?
zxq*?nX`c;?o{vmJ_FtX;4+VwRVFohy4_}a-&E$NYqis`DggHAy`r)XwmZL{q_S}?6
zJ;O1_U%-6d<VDZ77~MX8LgAx^?STUl>#y@N?XeJBaQFE8_i+acs`w9IXyCnDpk<Ju
zxv`?n{-%;7-=ktj<weZ5O$AJoBJ$3#{O8?}5ar6C|LDky&AA&N=3bEInbFKAwWoSR
z)56;a+YWOY^z<floJr_#nI;#Y(zCPS$#Rh$5)B$Ne@*PIToA45q*gRDq~n<J<X4<L
z3)2qB><URJu$(a~b5_a4Kb_r~S#60820RrKvo82JAO0LLZ%x9B@(l~$avo-oI8mWD
zyS;fT`~9Vj2^=$>)^?mKUuSz=CgJIs%mdv&S<R+&%PduxCZAAoj8i^_>yR%KW6;xs
zKHXXif*I3U4{vDZO+RoUSm|g<(cGm9fr_5ILbDtj)y~Xmxyqh%ruv9ipoT@D?kQ^p
z`AFlRpP8--o4q;H_xxnhd<$)fbJhtN><0>hf(k-I?-i9O%?@fkz%X&fBo_YuNsFhM
zT%N<H(7SBcy)N#jUqYm}6;_D7N`7-tw5cUxb7Bp@bA3zxi}USF%&(&_ok~~V{bK&&
z>n&$gO~pNUW)z(AQ%sI?o$6)SDXMlkH)r!h!N&4C72T(}+MG8x^Qlkis(aJIENr>@
zl=^nZHGDdqJ)DvUViwd)*}Cs%$BW3uMO{57_82&NRjw@In4VSf#WCe_5BGe(S*BC#
zqh1C~|9wcSdS~RaMN@)9+ZWAomAz*1=EKBTZ|19WxcO2nF5B#CpJdpxZfc1~ij9)b
z8sQTX(*u5LGdQk3BFw!ZZlRTr%&ZLp7PZq0n3=L9ynbpq_%sUo31>MOZ{RUmAt&rK
zcTd-YpDrq2=07*(v+hZ$w-9Ex(r{_=I_~kb+sLS{FhE6fM#UrzmN}~4iEEZf&D^gb
zUSXbQ`|N1yVOPCX2Ux!OEKCv;yV2TeQ1O$)F1`0KpSq>YN%m7quDiG;SFV$sVxCp!
zD^kik$Hz7$PJsUfYr}%6J}sIr#r?L#ioHsGdn2IPGbZTllh@ZR-fWn3_oIhQhTI+|
z1?~`SC06@Y$wu$xeL5E(JH)GY!r}8<Gnq`rz^$hQ@8-;CFRhTaJ!I6#cFa=up1?D<
z>}gyvYYT5YR6V{iIi0aKK<WbX#<>A33!IjDFAH&tFQ{UXe4$hDLB@QdX!B3i$$9q*
z73bdj^l;thJH<H(+m9u-&y<NSi)f#gHp#GR?g<{1&(~bnm+?JY-S0a8`Dxb_`_0c&
z!X0bpuD#MWbI-Z3EbU|2rc?8k>RS)<nOn}g5xr$?&J@>t14+>rvC`)?_#I*uluxiN
zvvun|HgoQ>Bh#h^8zp}}VQl`hd-dOc5sx<<_M0dD%(Xi0_}R8j7rziwQJXD2uS^$<
zG@TWFEK+~Rbb{0xnWXlr;(hK?lj_;q<5}5Tzw))q))l=t+5WiGB1KE;ZBzWrW<gn&
zb^6Lnx2t5VV_e2`QnrgLVoJ}6SqH-e+cW<yKFr(0bxuh#@8h|fg*y)~ytRDy+ZkW&
zuJGAge#<ele%>51aoMcdnx1|Eds{UXuJK9ftzzzVTxI6$`CwJkn~TBQ1AnIOt=RbI
zyr;l5SGAlcuDcKW^V!yfEdQsJ-F#rThFDFh2v^I!13xFG?#<?2w^!=j(GzYPHl=<3
zwXo*kU1OO#UFUgfu3w|GWy*e>oG3lHmRlx7=xV5!*3OfsloqbtdNU#Y{rT5yOS?GU
zZj(seS2_7=!bZ{4>wj$M_*muL^7`<1yN5B4HyrlwTh3hYqVI+NmXv#rKi+N8$ldoj
zMn5fj>z=da^6!GqiG|GiU2XL<!aDr=`+^$})z9;mCstkaHO`;$qx065{(a|_+*9`j
z2cLGiRHwSy>6>obi_&eoa&Pk5?RkFK>FzV#m)<)<{9>Mce)_7f@?QIwbzO6h2AF?3
zeWM|Emnon1pC>z$<9Keh70O&m%G#}Xl6m9mCG`QX9S&V6%bx8j9U4}uXk5GI-A6CS
jxSYc}Esr{7?!3>xFaKuKp1JzJKG>gA=)baGl7RsL$-_?L

literal 0
HcmV?d00001

diff --git a/tests/test_scripts_labkey_to_snakemake/input_table.tsv b/tests/test_scripts_labkey_to_snakemake/input_table.tsv
new file mode 100644
index 0000000..4b24cbf
--- /dev/null
+++ b/tests/test_scripts_labkey_to_snakemake/input_table.tsv
@@ -0,0 +1,3 @@
+Entry date	Path to FASTQ file(s)	Condition name	Replicate name	End type (PAIRED or SINGLE)	Name of Mate1 FASTQ file	Name of Mate2 FASTQ file	Direction of Mate1 (SENSE, ANTISENSE or RANDOM)	Direction of Mate2 (SENSE, ANTISENSE or RANDOM)	5' adapter of Mate1	3' adapter of Mate1	5' adapter of Mate2	3' adapter of Mate2	Fragment length mean	Fragment length SD	Quality control flag (PASSED or FAILED)	Checksum of raw Mate1 FASTQ file	Checksum of raw Mate2 FASTQ file	Name of metadata file	Name of quality control file for Mate1	Name of quality control file for Mate2	Organism	Taxon ID	Name of Strain / Isolate / Breed / Ecotype	Strain / Isolate / Breed / Ecotype ID	Biomaterial provider	Source / tissue name	Tissue code	Additional tissue description	Genotype short name	Genotype description	Disease short name	Disease description	Abbreviation for treatment	Treatment description	Gender	Age	Developmental stage	Passage number	Sample preparation date (YYYY-MM-DD)	Prepared by	Documentation	Name of protocol file	Sequencing date (YYYY-MM-DD)	Sequencing instrument	Library preparation kit	Cycles	Molecule	Contaminant sequences	Name of BioAnalyzer file
+Fri Dec 20 00:00:00 CET 2019	.	LN18C	LN18C_rep1	PAIRED	input_lib_1.mate_1.fastq.gz	input_lib_1.mate_2.fastq.gz	ANTISENSE	SENSE	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	300.0	100.0	xxx	xxx	xxx	xxx	xxx	xxx	Homo sapiens	9606	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx
+Fri Dec 20 00:00:00 CET 2019	.	LN18C	LN18C_rep2	PAIRED	input_lib_2.mate_2.fastq.gz	input_lib_2.mate_2.fastq.gz	ANTISENSE	SENSE	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	AAAAAAAAAA	300.0	100.0	xxx	xxx	xxx	xxx	xxx	xxx	Homo sapiens	9606	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx	xxx
diff --git a/tests/test_scripts_labkey_to_snakemake/test.sh b/tests/test_scripts_labkey_to_snakemake/test.sh
new file mode 100755
index 0000000..06c50cc
--- /dev/null
+++ b/tests/test_scripts_labkey_to_snakemake/test.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+# Set up test environment
+set -eo pipefail  # ensures that script exits at first command that exits with non-zero status
+set -u  # ensures that script exits when unset variables are used
+set -x  # facilitates debugging by printing out executed commands
+user_dir=$PWD
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
+cd "$script_dir"
+
+# Tear down test environment; installed only after the `cd` above succeeded, so that the cleanup never runs in the caller's directory
+trap 'rm -f config.yaml samples.tsv; cd "$user_dir"' EXIT  # quoted command is executed when the script exits, regardless of exit status
+
+# Run tests
+python "../../scripts/labkey_to_snakemake.py" \
+    --input_table="input_table.tsv" \
+    --input_dict="input_dict.tsv" \
+    --config_file="config.yaml" \
+    --samples_table="samples.tsv" \
+    --genomes_path="."
+md5sum --check "expected_output.md5"
+
-- 
GitLab