From cd541afe2fb05f815d28fdcad47326c2763bca7d Mon Sep 17 00:00:00 2001 From: BIOPZ-Katsantoni Maria <maria.katsantoni@unibas.ch> Date: Mon, 3 Feb 2020 18:18:03 +0100 Subject: [PATCH] generate Snakemake inputs from LabKey data table Adds script `scripts/labkey_to_snakemake.py` which - maps LabKey table fields to Snakemake parameters - assembles required parameters from the table data - infers required parameters from the input data - produces files `config.yaml` and `samples.tsv` required by the Snakemake pipeline A self-contained integration test for the script is located at `tests/test_scripts_labkey_to_snakemake` (execute script `test.sh`) and was added to the CI/CD pipeline. Note that intermittent changes to the `master` branch were merged into this branch to forego conflicts during merging. Closes #39 --- .gitlab-ci.yml | 11 +- scripts/labkey_to_snakemake.py | 248 ++++++++++++++++++ tests/RNA_Seq_data_template_test.tsv | 3 - .../expected_output.md5 | 2 + .../input_dict.tsv | 51 ++++ .../input_lib_1.mate_1.fastq.gz | Bin 0 -> 1862 bytes .../input_lib_1.mate_2.fastq.gz | Bin 0 -> 1835 bytes .../input_lib_2.mate_1.fastq.gz | Bin 0 -> 1669 bytes .../input_lib_2.mate_2.fastq.gz | Bin 0 -> 1758 bytes .../input_table.tsv | 3 + .../test_scripts_labkey_to_snakemake/test.sh | 22 ++ 11 files changed, 334 insertions(+), 6 deletions(-) create mode 100755 scripts/labkey_to_snakemake.py delete mode 100644 tests/RNA_Seq_data_template_test.tsv create mode 100644 tests/test_scripts_labkey_to_snakemake/expected_output.md5 create mode 100644 tests/test_scripts_labkey_to_snakemake/input_dict.tsv create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz create mode 100644 tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz create mode 100644 tests/test_scripts_labkey_to_snakemake/input_table.tsv create mode 100755 tests/test_scripts_labkey_to_snakemake/test.sh diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6863c72..4d4f4c3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,14 @@ image: snakemake/snakemake:v5.9.1 +before_script: + - pip install biopython==1.76 + test: script: - - cd snakemake - - snakemake -n - - bash run_test.sh + - bash tests/test_scripts_labkey_to_snakemake/test.sh + - cd snakemake # fix this in future version; all tests + - snakemake -n # should be called from home directory + - bash run_test.sh # + - cd .. # # add additional tests here diff --git a/scripts/labkey_to_snakemake.py b/scripts/labkey_to_snakemake.py new file mode 100755 index 0000000..f8f7527 --- /dev/null +++ b/scripts/labkey_to_snakemake.py @@ -0,0 +1,248 @@ +## ----------------------------------------------------------------------------- +# Author : Katsantoni Maria, Christina Herrmann +# Company: Mihaela Zavolan, Biozentrum, Basel +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# This script is part of the Zavolan lab quantification pipeline, which is used +# for analysing RNA-seq data. The table is provided by labkey and is a csv file. +# If the user provides their own table the table should contain the following +# columns: +# ----------------------------------------------------------------------------- + +import sys +import gzip +from argparse import ArgumentParser, RawTextHelpFormatter +import os +import numpy as np +import pandas as pd +from Bio import SeqIO +from io import StringIO +from csv import writer +from pathlib import Path + +# ---------------------------------------------------------------------------------------------------------------------- +def main(): + """ Preprocess sample folder and create config file for snakemake""" + + __doc__ = "Preprocess of the table and create config file." + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter) + + parser.add_argument( + "--input_table", + dest="input_table", + help="input table containing the sample information", + required=True, + metavar="FILE") + + parser.add_argument( + "--input_dict", + dest="input_dict", + help="input dictionary containing the feature name \ + conversion from labkey to snakemake allowed names", + required=True, + metavar="FILE") + + parser.add_argument( + "--genomes_path", + dest="genomes_path", + help="path containing the fasta and gtf files for all organisms", + required=True) + + parser.add_argument( + "--multimappers", + dest="multimappers", + help="number of mulitmappers allowed", + required=False, + type=int, + metavar='value', + default=1) + + parser.add_argument( + "--soft_clip", + dest="soft_clip", + help="soft-clipping option of STAR", + required=False, + choices=['EndToEnd','Local'], + default='EndToEnd') + + parser.add_argument( + "--pass_mode", + dest="pass_mode", + help="STAR option pass_mode", + required=False, + choices=['None','Basic'], + default='None') + + parser.add_argument( + "--libtype", + dest="libtype", + help="Library type for salmon", + required=False, + default='A') + + parser.add_argument( + "--config_file", + dest="config_file", + help="Configuration file to be used by Snakemake", + required=False) + + parser.add_argument( + "--samples_table", + dest="samples_table", + help="Table with samples", + required=True) + + + # __________________________________________________________________________________________________________________ + # ------------------------------------------------------------------------------------------------------------------ + # get the arguments + # ------------------------------------------------------------------------------------------------------------------ + try: + options = parser.parse_args() + except(Exception): + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + sys.stdout.write('Reading input file...\n') + input_table = pd.read_csv( + options.input_table, + header=0, + sep='\t', + index_col=None, + comment='#', + engine='python') + + input_dict = pd.read_csv( + options.input_dict, + header=0, + sep='\t', + index_col=None, + comment='#', + engine='python') + input_dict.set_index('snakemake', inplace=True, drop=True) + sys.stdout.write('Create snakemake table...\n') + snakemake_table = pd.DataFrame() + for index, row in input_table.iterrows(): + if row[input_dict.loc['seqmode', 'labkey']] == 'PAIRED': + snakemake_table.loc[index, 'seqmode'] = 'paired_end' + elif row[input_dict.loc['seqmode', 'labkey']] == 'SINGLE': + snakemake_table.loc[index, 'seqmode'] = 'single_end' + + fq1 = os.path.join( + row[input_dict.loc['fastq_path', 'labkey']], + row[input_dict.loc['fq1', 'labkey']]) + snakemake_table.loc[index, 'fq1'] = fq1 + + with gzip.open(fq1, "rt") as handle: + for record in SeqIO.parse(handle, "fastq"): + read_length = len(record.seq) + break + snakemake_table.loc[index, 'index_size'] = read_length + if read_length <= 50: + snakemake_table.loc[index, 'kmer'] = 21 + elif read_length > 50: + snakemake_table.loc[index, 'kmer'] = 31 + + + snakemake_table.loc[index, 'fq2'] = os.path.join( + row[input_dict.loc['fastq_path', 'labkey']], + row[input_dict.loc['fq2', 'labkey']]) + + snakemake_table.loc[index, 'fq1_3p'] = row[input_dict.loc['fq1_3p', 'labkey']] + snakemake_table.loc[index, 'fq1_5p'] = row[input_dict.loc['fq1_5p', 'labkey']] + snakemake_table.loc[index, 'fq2_3p'] = row[input_dict.loc['fq2_3p', 'labkey']] + snakemake_table.loc[index, 'fq2_5p'] = row[input_dict.loc['fq2_5p', 'labkey']] + + organism = row[input_dict.loc['organism', 'labkey']].replace(' ', '_').lower() + snakemake_table.loc[index, 'organism'] = organism + snakemake_table.loc[index, 'gtf'] = os.path.join( + options.genomes_path, + organism, + 'annotation.gtf') + snakemake_table.loc[index, 'gtf_filtered'] = os.path.join( + options.genomes_path, + organism, + 'annotation.gtf') + snakemake_table.loc[index, 'genome'] = os.path.join( + options.genomes_path, + organism, + 'genome.fa') + snakemake_table.loc[index, 'tr_fasta_filtered'] = os.path.join( + options.genomes_path, + organism, + 'transcriptome.fa') + + snakemake_table.loc[index, 'sd'] = row[input_dict.loc['sd', 'labkey']] + snakemake_table.loc[index, 'mean'] = row[input_dict.loc['mean', 'labkey']] + snakemake_table.loc[index, 'multimappers'] = options.multimappers + snakemake_table.loc[index, 'soft_clip'] = options.soft_clip + snakemake_table.loc[index, 'pass_mode'] = options.pass_mode + snakemake_table.loc[index, 'libtype'] = options.libtype + + if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE': + snakemake_table.loc[index, 'kallisto_directionality'] = '--fr-stranded' + elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE': + snakemake_table.loc[index, 'kallisto_directionality'] = '--rf-stranded' + else: + snakemake_table.loc[index, 'kallisto_directionality'] = '' + + if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE': + snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA' + elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE': + snakemake_table.loc[index, 'fq1_polya'] = 'TTTTTTTTTTTTTTTTT' + elif row[input_dict.loc['mate1_direction', 'labkey']] == 'RANDOM': + snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA' + else: + pass + + if row[input_dict.loc['mate2_direction', 'labkey']] == 'SENSE': + snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA' + elif row[input_dict.loc['mate2_direction', 'labkey']] == 'ANTISENSE': + snakemake_table.loc[index, 'fq2_polya'] = 'TTTTTTTTTTTTTTTTT' + elif row[input_dict.loc['mate2_direction', 'labkey']] == 'RANDOM': + snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA' + else: + pass + + snakemake_table.to_csv( + options.samples_table, + sep='\t', + header=True, + index=False) + + # Read file and infer read size for sjdbovwerhang + with open(options.config_file, 'w') as config_file: + config_file.write('''--- + output_dir: "results" + local_log: "local_log" + star_indexes: "star_indexes" + kallisto_indexes: "kallisto_indexes" +...''') + + + sys.stdout.write('Create snakemake table finished successfully...\n') + sys.stdout.write('Create config file...\n') + sys.stdout.write('Create config file finished successfully...\n') + return + + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) + + diff --git a/tests/RNA_Seq_data_template_test.tsv b/tests/RNA_Seq_data_template_test.tsv deleted file mode 100644 index 34c7bb1..0000000 --- a/tests/RNA_Seq_data_template_test.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Entry_Date Path_Fastq_Files Condition_Name Replicate_Name Single_Paired Mate1_File Mate2_File Mate1_Direction Mate2_Direction Mate1_5p_Adapter Mate1_3p_Adapter Mate2_5p_Adapter Mate2_3p_Adapter Fragment_Length_Mean Fragment_Length_SD Quality_Control_Flag Checksum_Raw_FASTQ_Mate1 Checksum_Raw_FASTQ_Mate2 File_Name_Metadata_File Name_Quality_Control_File_Mate1 Name_Quality_Control_File_Mate2 Organism TaxonID Strain_Isolate_Breed_Ecotype Strain_Isolate_Breed_Ecotype_ID Biomaterial_Provider Source_Tissue_Name Tissue_Code Additional_Tissue_Description Genotype_Short_Name Genotype_Description Disease_Short_Name Disease_Description Treatment_Short_Name Treatment_Description Gender Age Developmental_Stage Passage_Number Sample_Preparation_Date Prepared_By Documentation Protocol_File Sequencing_Date Sequencing_Instrument Library_preparation_kit Cycles Molecule Contaminant_Sequences BioAnalyzer_File -Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031355465-60677668 LN18C LN18C_rep1 PAIRED BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx -Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031410069-60677669 LN18C LN18C_rep2 PAIRED BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx diff --git a/tests/test_scripts_labkey_to_snakemake/expected_output.md5 b/tests/test_scripts_labkey_to_snakemake/expected_output.md5 new file mode 100644 index 0000000..622cddb --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/expected_output.md5 @@ -0,0 +1,2 @@ +de940b0dd38a67a7433536a5b3aee0ac config.yaml +d9c9ea4cd6108d39a2521dd87cd0c7e1 samples.tsv diff --git a/tests/test_scripts_labkey_to_snakemake/input_dict.tsv b/tests/test_scripts_labkey_to_snakemake/input_dict.tsv new file mode 100644 index 0000000..a309c29 --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/input_dict.tsv @@ -0,0 +1,51 @@ +labkey snakemake +Entry date entry_date +Path to FASTQ file(s) fastq_path +Condition name condition +Replicate name replicate_name +End type (PAIRED or SINGLE) seqmode +Name of Mate1 FASTQ file fq1 +Name of Mate2 FASTQ file fq2 +Direction of Mate1 (SENSE, ANTISENSE or RANDOM) mate1_direction +Direction of Mate2 (SENSE, ANTISENSE or RANDOM) mate2_direction +5' adapter of Mate1 fq1_5p +3' adapter of Mate1 fq1_3p +5' adapter of Mate2 fq2_5p +3' adapter of Mate2 fq2_3p +Fragment length mean mean +Fragment length SD sd +Quality control flag (PASSED or FAILED) quality_control_flag +Checksum of raw Mate1 FASTQ file mate1_checksum +Checksum of raw Mate2 FASTQ file mate2_checksum +Name of metadata file metadata +Name of quality control file for Mate1 mate1_quality +Name of quality control file for Mate2 mate2_quality +Organism organism +Taxon ID taxon_id +Name of Strain / Isolate / Breed / Ecotype strain_name +Strain / Isolate / Breed / Ecotype ID strain_id +Biomaterial provider biomaterial_provider +Source / tissue name source_name +Tissue code tissue_code +Additional tissue description tissue_description +Genotype short name genotype_name +Genotype description genotype_description +Disease short name disease_name +Disease description disease_description +Abbreviation for treatment treatment +Treatment description treatment_description +Gender gender +Age age +Developmental stage development_stage +Passage number passage_number +Sample preparation date (YYYY-MM-DD) sample_prep_date +Prepared by prepared_by +Documentation documentation +Name of protocol file protocol_file +Sequencing date (YYYY-MM-DD) seq_date +Sequencing instrument seq_instrument +Library preparation kit library_kit +Cycles cycles +Molecule molecule +Contaminant sequences contaminant_seqs +Name of BioAnalyzer file bioanalyser_file \ No newline at end of file diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..ddd5858c0318c865dd3c1e4a20ca03c07144daac GIT binary patch literal 1862 zcmb2|=3tQau!v(|o*Hqw=&_l=p5NL<pB1OJOmT54`nZ8%TgQi8j-lWF?^k*8j?p#X z%dCZ}6CbSZar5^-U-SF<@&3Ak>$e|&OaFiS{(rstn)kbIKmBCLBVTV{^W)+E|9^g7 ztdG&F{eSWM-y5|&$>uTtUfmD+WcgEMufOH9DIdOC9*fv#G1r)V=HnR#2U2Z<^K{NX z>oETi$g@$K|Isen7ng6p&sS{uxl=cFvn>DguN&sg+-}a&doB0(s<Yp|<uV_Bnk}*G zH><%NdDb=~>G)@jkt+NK1_yFZU9sSN&*{YEWm(;Gn)%MdX){XM<+R!>*e5*hJs=z* zDzQtm<SBnK$3fOJDmSKB%wQB&KYjVK$g)q;r;h90x_jr;$(1=ftUflceJv%iPW{G% zV@%GCiw_*Q+CL{xjd#bD8$VUN5;Aw*+QBa&a)71rc<7u{70%yOc<MV{)M8loF)=UO zETkl%{A|}F<DVBcE$y3rbgjmViwPMLhn<ctYkXM7)OL=s?e*RR2mXs+K5|I)@9#kF z`38o&+2(9{HlxObhk;%1Nc{d*0};D3#p*0fcEti>rMDI~GAb_$$<x(-yX<6XE2Ex( zu-~1D;(w-XFAruuT%e>;wJA|VTI5hfnE$uW&ezm0$S5{2yLFyl#BjKEzESmq1GX88 zd-k|?J-_|!)`mCRd$}Ymo^&O$ERxXbb$Qdes4?-6RNLv=2D6gA2?-JFcYl$&Wmg%@ zJBg_==$v{RFT>&htrCr`JJeP0a_#MjytSb@aohV%H?vo7Z*$|$PFd5iPJHz_6N4S= z6L;9{;^h&$^?>PelL^ym&Z_Kr3#uPhix{|Y`8&ET=)S^d;JGPdRhE;p#J8LUa{D$t z5}uUP<{rYqp0i_qL*fqMHo=*PiYLvRw4#3AOQ)4h+Osyk@=>&lmioG5L5{<lO(Kl~ zp<1@v)p=f@S6VbZe)~j@MC0qpu7{iq8~v=jg)5GWNTn36YJB+7lxMrN%BI)rKkzO~ z>ohv=>Gf{MmP}hW4TJCt850D1nEhgs8f*_Kct)=7sawW7O=a5O_opA;T-&(t{kab2 z-e&2B=GHQwNzUu?7O`A%(D2RJYWO|p;^e)5jZ}EnO!4vWoZ#=)A1?Fi{Hfn-y|!K4 z(BF7crRm9xDVlu+77vy4ZG)FNpJKWG*I2wNtvbokM}l#lk*BjlawEsV%2lchCwLjP zS4nyLe|EHzU3pmV#do6(iDxUrwph$Dm~>sR=EeJ-?juu^FL<+_;JfAE!u0a&0!F<# zGfbxbF%+Nr<>BHl^PMIq3T3Lq7tF|JikUd$yW35fLwh_0UOzMozQV1`r#|P2#T|{w z{81W9xqZU=3}?wt$o}AT;uB|rXHbY<*`yAy*qJ(-Dr>ZK?(n_pnyyp0fUEkPOzy(g zbw?kzOqFq4)-Cs|A>_oBV=4-&PgW_Jxb$4*ag=vqa&3NNDn9q5@oAk2{GOTO8)^l` ze{=3<`uv6cv!1KX+_Ie)9u*w&-Zigg#|iIM2})vLj&TWCoQY7LKU3*cbcdPd>PJ&N z+_X>XtW?vVuef%BosYiStEt>)w@TTxZ?==$ZZXGT@>9P@4OP}rQ#Na)+<TI<=M-D7 zXMwZp&gP2~d`uQwcDu*~ue0!qKlLxz?u6_Og)QQIO%H=54l(kn-+5Az9<$+ylf;Hu z2^-YC=Z2iu`R?HDAU@Z#M)dY7gGo}nI_islINS{3oE6ek(|0&dv@hu7^G_S@N6Glz zI9a%dsr_S~>9nl0z(_XB%+8%>;wqPLwa#%qD_g{IO1EyUod=h4aOHLH|E)70ywqL$ ze8H@mwQJur9=)SpcKMiV`}DO(AGXYwk^O046t8l+a0~A#?#R~0^?awI-yW!FEd6A< zf7PBZLV{cxsnZ@g#ZAn5W#gmWKcDA@lX2^+_P0k<zs_}9!poI<EnG%V{m+vR6XtJ8 zy8K!`eWS;v)Bj!8GdZ=2-&IsU_wRPr`juiQU-i!NSt@a@y>mC;nPpKSmlJ&JQ<u%j z;&`9oT=@6?#WD?Z%je5>nn~SgFPgmbWWrIm?!%n+mNAj>J64x2n!!GK*;28|qUTR$ zdFpSw@ufJZXo|rq{*^(8R;$IY<_=xjzVDYo`Wunt_nP&pB8m5>+dNdi+gtZ3OV;x1 z<ts^ND!1|<Xoxh}tUfhs`lU5VXBCz-SZvCDQ>pECaC;ZC_R$NaTW7ZXd;I2F+~F4a z>w7m})?lsFcWrrnp?TA`%OWw7j}KUG)4e&p<W1;Si#Z=B%YIo{&%kcASc@afPQy@x zbJdD-7k^#7c<u^Y^BNUh=d@J|-maT#v+e1l)qdxSa}VBF_-UEeN3B`euVQBTef{Tp z9un^CU!LggO<o*TI!(nu%hF3@Qj}+4kc0Jstzqh3`(K@___i@Lcg0%it9vIg|Fp?J zcRI00#?4o`Ki5a*NBe0``%3v=y>>@#IV9Hy%{`Ul*Kl!B#fOute2v&TFGX+GnrW62 px*~1UC9Ze>KVH?9nx=gySUt+~fc5nqv+n<67SBF(Zoec00|14JhDQJZ literal 0 HcmV?d00001 diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..0e97fa50b8d9f6457a2d73f6f0aa324a5949a1e0 GIT binary patch literal 1835 zcmb2|=3tQau!v(|o*Q|3(ql88J?F!hXtB!&d8*tL%AL;VUF01TtG?#czyH%&Ua)=r z79qXPeUpc0s!4-e^Y6Fw&!68_dj0m}Z{dG$_tzh{-yeQ;ZRWh(-+w-SczC(~|Ig2n z_Oq9*n_n7Res%NPRoi~<{^?=)ti{>S@c5I({YBsIoi<F5?7n%rq{~RX-st7#GcJ3- z#Z60flDj0WBsWp~#OghEvnJnrVgK(3ha%5qiQ|P6>K`Y#%k5aw{dmif^``~fnh&1r z&*zqy_1z%kyIbN8dlt5hSJXGEYm3}GY&PFu_7#c0ToX#X!bG}`SPKPwU{T>_`^9%a zm0!ckbDzr2ZiAMSEK<i;U-+K2DUtbbYAWYxgEQql(zlt9H%wojuwz~a+jUMJruG94 z{oSSpJ;z+<ec_&_B9*e@YoP4f#>L%Toh*zALC={zXZjR%aGs8_?r1C2*(WC+u4=Yy zrSkQ|-+o&7u1nl8w_#xs*Ws`0E-pyxx_#u%lnnVA-qpe}`i6%bE=X>*cZ)vElgDP2 zS>>H{=tcCC&CQIBiRa#mm%R^4;QJ{M!x4J*``$TKKkv2QH&14Du2#~_a&caMal(Wj zS#4_Dp0&<mmil1eZ|Qvb?%|CRsvob`EooqDJ=Xd(SglWT<}-B(HHB$v{)z@?jwB@X zH0TLzKl%KVdVq$MvwXpY^+hkj4NBf8X1rTl<Q%H3DRgI9B%{)mV7Ay*A1yD{xOz^X zsIcd%wDgmV8$VvkJkPe>>(SN8GkvGw^3xWZpA;759Oq*DF4y*Y@vH=oFBx0neJ43Y zG)o>`swyt=c%tGW<C!%l10Ja)t7U7c)_nNjEz$PVLdS7NLPWd!$q5Ph+^tf4T1Wq~ z@Vu2ZC}}uY9rYwp@KvMLjFy+-%4+*%3(Y5=Hn`*=#XJ3!=bJdaNe`!=Ub;VH8hiP~ z2k%l_55Kg3US?TewkDxsPy55N;I1{VRv$PJug8{Faplh05_ivr^NK7=hZ|X$6q{LB zFXT}Sa9wFID?-YAz4lQxXTDF5y+8d?(tP!b$I`)KN7zZnv&Z$@aumd~4y8|Kaq9{2 zeO2qFz18;76vf#y!aDOG{}q$2i#J?r(f9f4$&cC=b1qEWE~R?;ZlLx9+Zif;VO%U? zk+199nws1`bchAmpKj653%^s*yW)fE)U+#sdtYZys>%JfY1Y2pHeEjTmM2>;SFCtH zOFT;9s@(^r*9$s1uX%8!{jmAWoVwt<>kIA;8Q;FKIjm9+?D>2#ywWh#yY5-(;{y|y zC(fA0Iy+#?kw<r&{niy)EL7S3jcr!Vh77f56-(k3=S`J<(j}!4<hIJHWl7JqT9;Wo zi&t(@oWHrS^%=*n-u2ySh1=%2h=rUBow|F+UZ-ECv)1@~oZRNTbH!A7lln-dmno;% zZ#jnYHVZ`@p2W0f;hkAG!c!;Zd=b;sIJ9}mt<bZVoNG@qCB9m!F}FQ5U-aADYie?S z8J^+Fa<jO;GVwY%EsXghyoO0bM$ToS<biLEWo|(Y>E#Fe4nI0_g-zA9N_SCPVz5$M z;!=%MOKhs!)UR305tv-0UT~pK`sw}-g%rncCC0bfV;(fPN-+zDIw~`;P2cmWM=9S! z!hgZEeF1^R_InQ><+&%dc9rLjhUXjNZayq<@+~WW!WyN+vin~`(VX_@7sD@9_)p}` zz8EbzHOo{XVCHWzSNjF$G8!7?3O-AlS~eHV(LR2UfA?xrKJ_Ol*JDN7`ARQ0DQvu$ z63@HHy;)Y*Lt*mgMeHB%gmvT`@G;q_`g~va{Hh5%^S)g(U6en|Oz7C8w|ft=tl%oB z;cNVm6cQ8pJ>n~;?M=%g&fgjA7g)9=cApiC-<AJvd&ME<niaDRPySO5Dy&+qA-JY7 z+;zo-y?s(*f(Bxtl_%ycd=q$f;l>pI-^ccUwY$P;d(&=(`o)~0{hcT0Zk(TTR&a|# zw#pYl4TdA0SsfYMeDj^NHD*_sWq2)XIA*NBMAfbC`-eEqUy~)xWcGXM3#nB(XkXo} z;M;N8FGXHr$GvN3pS8@(S^q-+&QyyVY}{KmyID<1V6snGUX-P_l1*W;=kzNx5Af?Q z3v^5RJT)>a?$cp$ucFG+)>#ii_FoJ<%W3;E@51EpE555z-yGP~edwD~kjyR%<zK~` z(iy6gOvPtTvT?GVx@<|w%M;2RxxMU}s;WXu=3nW%{4#FRY9~>>+San-qMZ7|U8@R} zr@#9DfBE8B##Nhw>Q>0#nCTLhZq~Q$ded6Anx3CVvHejq9p+k0`TfE4w%+!)#~D@5 zOWZu#T+6p8fn)XCmoFD;wwJxi+?UvUnA7&=)?Mn?udMrW_v9(L@AHJWr{*%q=1VoD z=RGYxe`;1xvdgDN^W2xd^YaTiR4w@)Rj0qGDs@$twz+08=fdRg+vfdHocd4m*1IJi z<;xXr+8vMCmQgG?_lG~%vj$JT6&!Cuw0$}yH{D!x_eziR(E<?-b}uEK?!Q@nHGkOs MejPixUy^|V0L8O$fB*mh literal 0 HcmV?d00001 diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..af6dc8e311511b121c45266063b97f7882519334 GIT binary patch literal 1669 zcmb2|=3tQau!v(|UK@FO(rq`MJ@3Px%=2;zRMO0SnINFo_x+Vm=%4@bk6z|Y5OTFm zdmFI+VZn*0FATkG-%jtpzyJNN#CnUb_3!`v_;&wzeEjTZ7q-2R{r>N5%H#6izaAg| z{cdmW?90E`*-v>{W^eM>ez*Den7ET4XK&g$)zNF(U7=&|G?&i#smVQQs_9dfd}E!{ zEAO=B>uGhLR$*_8p7S}seqZlni`YJXnajs*x*ppUdAVC2OxO7S&@}e`y7Z~<joqqB ze(tH=*|e~h_i){HrnYY(2_8qIBD`LomhUy*J|}b9!EZ^rj0andqK~ny7QQ|;^SBZZ zm(k8Wa|#~j>FIOlCI0yF#qh4}v9B+=+Wuxs?5Yh;cp-IQL9fm<iS74!1S*c%wR@+$ z)8+Lt=(^u#yoS}}0N1{Smdev?&j0#bD-ox(sqx_lskViU<|T(09&qqJ#eOXD@Wa#d zHhh+>Tf}`QBYld*^VkIkk`pS-eu>n$tqXKum18;n?r=c&vE<G34_yeYIQ))5!q2x+ zQJd`n_e?*ZbOy68+%4YM)=U(NSukTxhj~Q{izU<T_nsFn=5nR@?|qn-6t(Z2kDvbp z$>X9twXNn`Y!fA#luj}pX>{Q@#KJFS<IpE`g!6Ed4+B$Yi-mz06L*m2o<kK2ZEBn+ zN*ixCzJ6>H)A5f!>sWLr$(}#hYjN~yBJ1IRhrI3wE=*SPZEUcgALzJVJ$+V_O1ITa z#W{kG-<V4daCbJHcydB?SG-DBhGuwOyY#E%Bw3N;ToSh!4P>N(RCwG&QdApw{+&P5 z*u}les*Ue{LP0_AiPE~`x)rkzsT^8*=D!lp`RjWd9)6K(tK+|WLB9Nyh-T~Y+h@9u z)hM_(7AUAsV0&n>OCip5vYSDmYVE-%d@1MWynE4hd|7N*?0R`s{%uT7pOUz~ZfHup z9<t*w<H|+$r9yl&e=;g`Fndhus`6`5SkRpw<;Krhb!L(z56dd;yLp~Bh5L#RCe9I? zY<Fl;N6o*wIn!rKZYX+N!|3w9P=ZT5;bbM3rfFWmiVG8Cy&12{G&VnBT(c;x<m3_; zK^L1CuI^b%+PXnX7KL6@WI1M@*cBA8;P{#~uU=2loD^*~?T&f0&~cM_rvumMK1+(= zX@6P5aUs#BDgOCFNw#%64=!}___kZQa6%K4c}g48wy+X+qY|yOVom0kE@@n<-bGxx zt<oQ!d^+6X6>;i}zE^uL%Uw^S{uhbi3su@}|2CR<3NthF8;WpUvRuz&(9N2_mMna? z?as^1n;sl=xU@1P!_>F0eEo%d4$qi**RI9ORX^qz4cc`3eP{ELW^cw#N4h*VM0&5B z5U@cf@bIS7Gu>A5&N<}S6mxW1>$McoNAtt3L_3RD?M(h}6!$n_p{oD#zNxG~8kaJi zF-v9k%z30Fe!Opz^W+kvR#tV_C6+5yyk~VBI-`7TM^5))W1VHQ@Bf~@{O!hL3!QxC zMTJG|p7!%l(^pQXPuYS`dg|Yl3GuJGG9ho$xh9cP#Rr?0g%uvMYU!_i!GAvJ#p~p? z{`rp=Ir+}xTcdj`%x2}1`7`*oOns%pBIzdAGpjjf*7HR@PnS%}QW6*Fcq0`YCgLG~ zob}MZ&?$E+4kZU%vj{nHGWqb1jQ)_T#qS+T7C20wEzBJ3^29POC(rvr=(DWZy4-Am zCWST?LJPmzo5-2nD`KBu$|qV?^x`G|@8&FD)*X={qLEKqAGywz^yE3lV`}loA@7I| zQ++~f;wq1;6@j0*9<#PTQ|h0#U`ANmx>M(7-H6zi_d7te>eH3YH{L8+JcadUZ}Msd zt(?5-MMpw}+cI42Jc~JnET?(h4hdcHeUh;<n_b{Teo?(oQj;MTFI;SOP28*ek&9@= zn$6Aui`IyIUz%aI%gS(1*2Sy~wUM=2+%q4ZQvB!PZrk4|6<^Wk+&a%@qv6f%2NbOW zZly9U+U4+7n00Z$h7}Ue<i!$qCA_*IC9L{7^rzdE#pg?}h)v5B^C?_reSWso>!P0W zf)y7gFF(v@QBu9_$rMcy?gr(O#V$!FRz~K{(tZ4M?)qc<?0gMZd}{aKkUrHu<i*M4 z!+JjV_5S{n5#4?{B9!gfX1z&OVNJbabF^Do;wN~1S@PoLuS;`3terRKVT4(Qqf;h> zmlapQ#H=lk^Gu&^ZQ5~mc5drDo3fs6$6N_7xv0hV_g1o8TDtj>cdhS{D@V?MuGgpt n{M#;aP#|>`*Ttfr!GAWS^u2xL&L}AO`J3?d+{B6dB^ek1gL4+; literal 0 HcmV?d00001 diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..dbc4e5096650863aeaa4bcea12b21a9c4b4d2509 GIT binary patch literal 1758 zcmb2|=3tQau!v(|o*Qv+(`*ZYJ^R<+xGlG>Ogx$SnP_{@lueJa!$ME~uRpnJx4T46 z&SU}h<_Rm!*Up<KSNrq*@9)m$GXBZ`&p+(Xw_jIZpF1n~c3s)s=i4tHD*yfK^YiBq zH|Ng&`1!*8Qw7yBv;N)xru<Z5vg+m;MP1V-9gWa?rCzuv=4i)98Os@8IQWBqR_^qg zk@mECe&VvH8DVaxE$jY#6?EEh=J?mDqxJR%4I7VM`17EMv+je`_B}IRop`%fv8E>S zz=eJ74=XYlovoSJj>P?YS9ETVS&YIRF4vvUW=JYO-&wcjoKAUa(z&P0S&x5W<fzG? zxq*?nX`c;?o{vmJ_FtX;4+VwRVFohy4_}a-&E$NYqis`DggHAy`r)XwmZL{q_S}?6 zJ;O1_U%-6d<VDZ77~MX8LgAx^?STUl>#y@N?XeJBaQFE8_i+acs`w9IXyCnDpk<Ju zxv`?n{-%;7-=ktj<weZ5O$AJoBJ$3#{O8?}5ar6C|LDky&AA&N=3bEInbFKAwWoSR z)56;a+YWOY^z<floJr_#nI;#Y(zCPS$#Rh$5)B$Ne@*PIToA45q*gRDq~n<J<X4<L z3)2qB><URJu$(a~b5_a4Kb_r~S#60820RrKvo82JAO0LLZ%x9B@(l~$avo-oI8mWD zyS;fT`~9Vj2^=$>)^?mKUuSz=CgJIs%mdv&S<R+&%PduxCZAAoj8i^_>yR%KW6;xs zKHXXif*I3U4{vDZO+RoUSm|g<(cGm9fr_5ILbDtj)y~Xmxyqh%ruv9ipoT@D?kQ^p z`AFlRpP8--o4q;H_xxnhd<$)fbJhtN><0>hf(k-I?-i9O%?@fkz%X&fBo_YuNsFhM zT%N<H(7SBcy)N#jUqYm}6;_D7N`7-tw5cUxb7Bp@bA3zxi}USF%&(&_ok~~V{bK&& z>n&$gO~pNUW)z(AQ%sI?o$6)SDXMlkH)r!h!N&4C72T(}+MG8x^Qlkis(aJIENr>@ zl=^nZHGDdqJ)DvUViwd)*}Cs%$BW3uMO{57_82&NRjw@In4VSf#WCe_5BGe(S*BC# zqh1C~|9wcSdS~RaMN@)9+ZWAomAz*1=EKBTZ|19WxcO2nF5B#CpJdpxZfc1~ij9)b z8sQTX(*u5LGdQk3BFw!ZZlRTr%&ZLp7PZq0n3=L9ynbpq_%sUo31>MOZ{RUmAt&rK zcTd-YpDrq2=07*(v+hZ$w-9Ex(r{_=I_~kb+sLS{FhE6fM#UrzmN}~4iEEZf&D^gb zUSXbQ`|N1yVOPCX2Ux!OEKCv;yV2TeQ1O$)F1`0KpSq>YN%m7quDiG;SFV$sVxCp! zD^kik$Hz7$PJsUfYr}%6J}sIr#r?L#ioHsGdn2IPGbZTllh@ZR-fWn3_oIhQhTI+| z1?~`SC06@Y$wu$xeL5E(JH)GY!r}8<Gnq`rz^$hQ@8-;CFRhTaJ!I6#cFa=up1?D< z>}gyvYYT5YR6V{iIi0aKK<WbX#<>A33!IjDFAH&tFQ{UXe4$hDLB@QdX!B3i$$9q* z73bdj^l;thJH<H(+m9u-&y<NSi)f#gHp#GR?g<{1&(~bnm+?JY-S0a8`Dxb_`_0c& z!X0bpuD#MWbI-Z3EbU|2rc?8k>RS)<nOn}g5xr$?&J@>t14+>rvC`)?_#I*uluxiN zvvun|HgoQ>Bh#h^8zp}}VQl`hd-dOc5sx<<_M0dD%(Xi0_}R8j7rziwQJXD2uS^$< zG@TWFEK+~Rbb{0xnWXlr;(hK?lj_;q<5}5Tzw))q))l=t+5WiGB1KE;ZBzWrW<gn& zb^6Lnx2t5VV_e2`QnrgLVoJ}6SqH-e+cW<yKFr(0bxuh#@8h|fg*y)~ytRDy+ZkW& zuJGAge#<ele%>51aoMcdnx1|Eds{UXuJK9ftzzzVTxI6$`CwJkn~TBQ1AnIOt=RbI zyr;l5SGAlcuDcKW^V!yfEdQsJ-F#rThFDFh2v^I!13xFG?#<?2w^!=j(GzYPHl=<3 zwXo*kU1OO#UFUgfu3w|GWy*e>oG3lHmRlx7=xV5!*3OfsloqbtdNU#Y{rT5yOS?GU zZj(seS2_7=!bZ{4>wj$M_*muL^7`<1yN5B4HyrlwTh3hYqVI+NmXv#rKi+N8$ldoj zMn5fj>z=da^6!GqiG|GiU2XL<!aDr=`+^$})z9;mCstkaHO`;$qx065{(a|_+*9`j z2cLGiRHwSy>6>obi_&eoa&Pk5?RkFK>FzV#m)<)<{9>Mce)_7f@?QIwbzO6h2AF?3 zeWM|Emnon1pC>z$<9Keh70O&m%G#}Xl6m9mCG`QX9S&V6%bx8j9U4}uXk5GI-A6CS jxSYc}Esr{7?!3>xFaKuKp1JzJKG>gA=)baGl7RsL$-_?L literal 0 HcmV?d00001 diff --git a/tests/test_scripts_labkey_to_snakemake/input_table.tsv b/tests/test_scripts_labkey_to_snakemake/input_table.tsv new file mode 100644 index 0000000..4b24cbf --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/input_table.tsv @@ -0,0 +1,3 @@ +Entry date Path to FASTQ file(s) Condition name Replicate name End type (PAIRED or SINGLE) Name of Mate1 FASTQ file Name of Mate2 FASTQ file Direction of Mate1 (SENSE, ANTISENSE or RANDOM) Direction of Mate2 (SENSE, ANTISENSE or RANDOM) 5' adapter of Mate1 3' adapter of Mate1 5' adapter of Mate2 3' adapter of Mate2 Fragment length mean Fragment length SD Quality control flag (PASSED or FAILED) Checksum of raw Mate1 FASTQ file Checksum of raw Mate2 FASTQ file Name of metadata file Name of quality control file for Mate1 Name of quality control file for Mate2 Organism Taxon ID Name of Strain / Isolate / Breed / Ecotype Strain / Isolate / Breed / Ecotype ID Biomaterial provider Source / tissue name Tissue code Additional tissue description Genotype short name Genotype description Disease short name Disease description Abbreviation for treatment Treatment description Gender Age Developmental stage Passage number Sample preparation date (YYYY-MM-DD) Prepared by Documentation Name of protocol file Sequencing date (YYYY-MM-DD) Sequencing instrument Library preparation kit Cycles Molecule Contaminant sequences Name of BioAnalyzer file +Fri Dec 20 00:00:00 CET 2019 . LN18C LN18C_rep1 PAIRED input_lib_1.mate_1.fastq.gz input_lib_1.mate_2.fastq.gz ANTISENSE SENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx +Fri Dec 20 00:00:00 CET 2019 . LN18C LN18C_rep2 PAIRED input_lib_2.mate_2.fastq.gz input_lib_2.mate_2.fastq.gz ANTISENSE SENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx diff --git a/tests/test_scripts_labkey_to_snakemake/test.sh b/tests/test_scripts_labkey_to_snakemake/test.sh new file mode 100755 index 0000000..06c50cc --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/test.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Tear down test environment +trap 'rm config.yaml samples.tsv && cd $user_dir' EXIT # quotes command is exected after script exits, regardless of exit status + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run tests +python "../../scripts/labkey_to_snakemake.py" \ + --input_table="input_table.tsv" \ + --input_dict="input_dict.tsv" \ + --config_file="config.yaml" \ + --samples_table="samples.tsv" \ + --genomes_path="." +md5sum --check "expected_output.md5" + -- GitLab