diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 6863c72edd258f8847e385f51c5a39f4999bf1cf..4d4f4c3abd6b184555bf3609481240cfa86c1c22 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,9 +1,14 @@ image: snakemake/snakemake:v5.9.1 +before_script: + - pip install biopython==1.76 + test: script: - - cd snakemake - - snakemake -n - - bash run_test.sh + - bash tests/test_scripts_labkey_to_snakemake/test.sh + - cd snakemake # fix this in future version; all tests + - snakemake -n # should be called from home directory + - bash run_test.sh # + - cd .. # # add additional tests here diff --git a/scripts/labkey_to_snakemake.py b/scripts/labkey_to_snakemake.py new file mode 100755 index 0000000000000000000000000000000000000000..f8f752752f91e84ddfa66a110a3016069fe01ede --- /dev/null +++ b/scripts/labkey_to_snakemake.py @@ -0,0 +1,248 @@ +## ----------------------------------------------------------------------------- +# Author : Katsantoni Maria, Christina Herrmann +# Company: Mihaela Zavolan, Biozentrum, Basel +# ----------------------------------------------------------------------------- + +# ----------------------------------------------------------------------------- +# This script is part of the Zavolan lab quantification pipeline, which is used +# for analysing RNA-seq data. The table is provided by labkey and is a csv file. +# If the user provides their own table the table should contain the following +# columns: +# ----------------------------------------------------------------------------- + +import sys +import gzip +from argparse import ArgumentParser, RawTextHelpFormatter +import os +import numpy as np +import pandas as pd +from Bio import SeqIO +from io import StringIO +from csv import writer +from pathlib import Path + +# ---------------------------------------------------------------------------------------------------------------------- +def main(): + """ Preprocess sample folder and create config file for snakemake""" + + __doc__ = "Preprocess of the table and create config file." + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter) + + parser.add_argument( + "--input_table", + dest="input_table", + help="input table containing the sample information", + required=True, + metavar="FILE") + + parser.add_argument( + "--input_dict", + dest="input_dict", + help="input dictionary containing the feature name \ + conversion from labkey to snakemake allowed names", + required=True, + metavar="FILE") + + parser.add_argument( + "--genomes_path", + dest="genomes_path", + help="path containing the fasta and gtf files for all organisms", + required=True) + + parser.add_argument( + "--multimappers", + dest="multimappers", + help="number of mulitmappers allowed", + required=False, + type=int, + metavar='value', + default=1) + + parser.add_argument( + "--soft_clip", + dest="soft_clip", + help="soft-clipping option of STAR", + required=False, + choices=['EndToEnd','Local'], + default='EndToEnd') + + parser.add_argument( + "--pass_mode", + dest="pass_mode", + help="STAR option pass_mode", + required=False, + choices=['None','Basic'], + default='None') + + parser.add_argument( + "--libtype", + dest="libtype", + help="Library type for salmon", + required=False, + default='A') + + parser.add_argument( + "--config_file", + dest="config_file", + help="Configuration file to be used by Snakemake", + required=False) + + parser.add_argument( + "--samples_table", + dest="samples_table", + help="Table with samples", + required=True) + + + # __________________________________________________________________________________________________________________ + # ------------------------------------------------------------------------------------------------------------------ + # get the arguments + # ------------------------------------------------------------------------------------------------------------------ + try: + options = parser.parse_args() + except(Exception): + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + sys.stdout.write('Reading input file...\n') + input_table = pd.read_csv( + options.input_table, + header=0, + sep='\t', + index_col=None, + comment='#', + engine='python') + + input_dict = pd.read_csv( + options.input_dict, + header=0, + sep='\t', + index_col=None, + comment='#', + engine='python') + input_dict.set_index('snakemake', inplace=True, drop=True) + sys.stdout.write('Create snakemake table...\n') + snakemake_table = pd.DataFrame() + for index, row in input_table.iterrows(): + if row[input_dict.loc['seqmode', 'labkey']] == 'PAIRED': + snakemake_table.loc[index, 'seqmode'] = 'paired_end' + elif row[input_dict.loc['seqmode', 'labkey']] == 'SINGLE': + snakemake_table.loc[index, 'seqmode'] = 'single_end' + + fq1 = os.path.join( + row[input_dict.loc['fastq_path', 'labkey']], + row[input_dict.loc['fq1', 'labkey']]) + snakemake_table.loc[index, 'fq1'] = fq1 + + with gzip.open(fq1, "rt") as handle: + for record in SeqIO.parse(handle, "fastq"): + read_length = len(record.seq) + break + snakemake_table.loc[index, 'index_size'] = read_length + if read_length <= 50: + snakemake_table.loc[index, 'kmer'] = 21 + elif read_length > 50: + snakemake_table.loc[index, 'kmer'] = 31 + + + snakemake_table.loc[index, 'fq2'] = os.path.join( + row[input_dict.loc['fastq_path', 'labkey']], + row[input_dict.loc['fq2', 'labkey']]) + + snakemake_table.loc[index, 'fq1_3p'] = row[input_dict.loc['fq1_3p', 'labkey']] + snakemake_table.loc[index, 'fq1_5p'] = row[input_dict.loc['fq1_5p', 'labkey']] + snakemake_table.loc[index, 'fq2_3p'] = row[input_dict.loc['fq2_3p', 'labkey']] + snakemake_table.loc[index, 'fq2_5p'] = row[input_dict.loc['fq2_5p', 'labkey']] + + organism = row[input_dict.loc['organism', 'labkey']].replace(' ', '_').lower() + snakemake_table.loc[index, 'organism'] = organism + snakemake_table.loc[index, 'gtf'] = os.path.join( + options.genomes_path, + organism, + 'annotation.gtf') + snakemake_table.loc[index, 'gtf_filtered'] = os.path.join( + options.genomes_path, + organism, + 'annotation.gtf') + snakemake_table.loc[index, 'genome'] = os.path.join( + options.genomes_path, + organism, + 'genome.fa') + snakemake_table.loc[index, 'tr_fasta_filtered'] = os.path.join( + options.genomes_path, + organism, + 'transcriptome.fa') + + snakemake_table.loc[index, 'sd'] = row[input_dict.loc['sd', 'labkey']] + snakemake_table.loc[index, 'mean'] = row[input_dict.loc['mean', 'labkey']] + snakemake_table.loc[index, 'multimappers'] = options.multimappers + snakemake_table.loc[index, 'soft_clip'] = options.soft_clip + snakemake_table.loc[index, 'pass_mode'] = options.pass_mode + snakemake_table.loc[index, 'libtype'] = options.libtype + + if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE': + snakemake_table.loc[index, 'kallisto_directionality'] = '--fr-stranded' + elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE': + snakemake_table.loc[index, 'kallisto_directionality'] = '--rf-stranded' + else: + snakemake_table.loc[index, 'kallisto_directionality'] = '' + + if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE': + snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA' + elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE': + snakemake_table.loc[index, 'fq1_polya'] = 'TTTTTTTTTTTTTTTTT' + elif row[input_dict.loc['mate1_direction', 'labkey']] == 'RANDOM': + snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA' + else: + pass + + if row[input_dict.loc['mate2_direction', 'labkey']] == 'SENSE': + snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA' + elif row[input_dict.loc['mate2_direction', 'labkey']] == 'ANTISENSE': + snakemake_table.loc[index, 'fq2_polya'] = 'TTTTTTTTTTTTTTTTT' + elif row[input_dict.loc['mate2_direction', 'labkey']] == 'RANDOM': + snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA' + else: + pass + + snakemake_table.to_csv( + options.samples_table, + sep='\t', + header=True, + index=False) + + # Read file and infer read size for sjdbovwerhang + with open(options.config_file, 'w') as config_file: + config_file.write('''--- + output_dir: "results" + local_log: "local_log" + star_indexes: "star_indexes" + kallisto_indexes: "kallisto_indexes" +...''') + + + sys.stdout.write('Create snakemake table finished successfully...\n') + sys.stdout.write('Create config file...\n') + sys.stdout.write('Create config file finished successfully...\n') + return + + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) + + diff --git a/tests/RNA_Seq_data_template_test.tsv b/tests/RNA_Seq_data_template_test.tsv deleted file mode 100644 index 34c7bb10db59c7ca9d693ded26e5cf1b5ab79b1d..0000000000000000000000000000000000000000 --- a/tests/RNA_Seq_data_template_test.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Entry_Date Path_Fastq_Files Condition_Name Replicate_Name Single_Paired Mate1_File Mate2_File Mate1_Direction Mate2_Direction Mate1_5p_Adapter Mate1_3p_Adapter Mate2_5p_Adapter Mate2_3p_Adapter Fragment_Length_Mean Fragment_Length_SD Quality_Control_Flag Checksum_Raw_FASTQ_Mate1 Checksum_Raw_FASTQ_Mate2 File_Name_Metadata_File Name_Quality_Control_File_Mate1 Name_Quality_Control_File_Mate2 Organism TaxonID Strain_Isolate_Breed_Ecotype Strain_Isolate_Breed_Ecotype_ID Biomaterial_Provider Source_Tissue_Name Tissue_Code Additional_Tissue_Description Genotype_Short_Name Genotype_Description Disease_Short_Name Disease_Description Treatment_Short_Name Treatment_Description Gender Age Developmental_Stage Passage_Number Sample_Preparation_Date Prepared_By Documentation Protocol_File Sequencing_Date Sequencing_Instrument Library_preparation_kit Cycles Molecule Contaminant_Sequences BioAnalyzer_File -Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031355465-60677668 LN18C LN18C_rep1 PAIRED BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx -Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031410069-60677669 LN18C LN18C_rep2 PAIRED BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx diff --git a/tests/test_scripts_labkey_to_snakemake/expected_output.md5 b/tests/test_scripts_labkey_to_snakemake/expected_output.md5 new file mode 100644 index 0000000000000000000000000000000000000000..622cddb24f18000cdf60d7d6a1547d2048a549b0 --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/expected_output.md5 @@ -0,0 +1,2 @@ +de940b0dd38a67a7433536a5b3aee0ac config.yaml +d9c9ea4cd6108d39a2521dd87cd0c7e1 samples.tsv diff --git a/tests/test_scripts_labkey_to_snakemake/input_dict.tsv b/tests/test_scripts_labkey_to_snakemake/input_dict.tsv new file mode 100644 index 0000000000000000000000000000000000000000..a309c29bbdc8dc59f21ebd3314b6d78b009fa7b5 --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/input_dict.tsv @@ -0,0 +1,51 @@ +labkey snakemake +Entry date entry_date +Path to FASTQ file(s) fastq_path +Condition name condition +Replicate name replicate_name +End type (PAIRED or SINGLE) seqmode +Name of Mate1 FASTQ file fq1 +Name of Mate2 FASTQ file fq2 +Direction of Mate1 (SENSE, ANTISENSE or RANDOM) mate1_direction +Direction of Mate2 (SENSE, ANTISENSE or RANDOM) mate2_direction +5' adapter of Mate1 fq1_5p +3' adapter of Mate1 fq1_3p +5' adapter of Mate2 fq2_5p +3' adapter of Mate2 fq2_3p +Fragment length mean mean +Fragment length SD sd +Quality control flag (PASSED or FAILED) quality_control_flag +Checksum of raw Mate1 FASTQ file mate1_checksum +Checksum of raw Mate2 FASTQ file mate2_checksum +Name of metadata file metadata +Name of quality control file for Mate1 mate1_quality +Name of quality control file for Mate2 mate2_quality +Organism organism +Taxon ID taxon_id +Name of Strain / Isolate / Breed / Ecotype strain_name +Strain / Isolate / Breed / Ecotype ID strain_id +Biomaterial provider biomaterial_provider +Source / tissue name source_name +Tissue code tissue_code +Additional tissue description tissue_description +Genotype short name genotype_name +Genotype description genotype_description +Disease short name disease_name +Disease description disease_description +Abbreviation for treatment treatment +Treatment description treatment_description +Gender gender +Age age +Developmental stage development_stage +Passage number passage_number +Sample preparation date (YYYY-MM-DD) sample_prep_date +Prepared by prepared_by +Documentation documentation +Name of protocol file protocol_file +Sequencing date (YYYY-MM-DD) seq_date +Sequencing instrument seq_instrument +Library preparation kit library_kit +Cycles cycles +Molecule molecule +Contaminant sequences contaminant_seqs +Name of BioAnalyzer file bioanalyser_file \ No newline at end of file diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..ddd5858c0318c865dd3c1e4a20ca03c07144daac Binary files /dev/null and b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_1.fastq.gz differ diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..0e97fa50b8d9f6457a2d73f6f0aa324a5949a1e0 Binary files /dev/null and b/tests/test_scripts_labkey_to_snakemake/input_lib_1.mate_2.fastq.gz differ diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..af6dc8e311511b121c45266063b97f7882519334 Binary files /dev/null and b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_1.fastq.gz differ diff --git a/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..dbc4e5096650863aeaa4bcea12b21a9c4b4d2509 Binary files /dev/null and b/tests/test_scripts_labkey_to_snakemake/input_lib_2.mate_2.fastq.gz differ diff --git a/tests/test_scripts_labkey_to_snakemake/input_table.tsv b/tests/test_scripts_labkey_to_snakemake/input_table.tsv new file mode 100644 index 0000000000000000000000000000000000000000..4b24cbfeb3085aab0c537fb239b082f132b3cbb5 --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/input_table.tsv @@ -0,0 +1,3 @@ +Entry date Path to FASTQ file(s) Condition name Replicate name End type (PAIRED or SINGLE) Name of Mate1 FASTQ file Name of Mate2 FASTQ file Direction of Mate1 (SENSE, ANTISENSE or RANDOM) Direction of Mate2 (SENSE, ANTISENSE or RANDOM) 5' adapter of Mate1 3' adapter of Mate1 5' adapter of Mate2 3' adapter of Mate2 Fragment length mean Fragment length SD Quality control flag (PASSED or FAILED) Checksum of raw Mate1 FASTQ file Checksum of raw Mate2 FASTQ file Name of metadata file Name of quality control file for Mate1 Name of quality control file for Mate2 Organism Taxon ID Name of Strain / Isolate / Breed / Ecotype Strain / Isolate / Breed / Ecotype ID Biomaterial provider Source / tissue name Tissue code Additional tissue description Genotype short name Genotype description Disease short name Disease description Abbreviation for treatment Treatment description Gender Age Developmental stage Passage number Sample preparation date (YYYY-MM-DD) Prepared by Documentation Name of protocol file Sequencing date (YYYY-MM-DD) Sequencing instrument Library preparation kit Cycles Molecule Contaminant sequences Name of BioAnalyzer file +Fri Dec 20 00:00:00 CET 2019 . LN18C LN18C_rep1 PAIRED input_lib_1.mate_1.fastq.gz input_lib_1.mate_2.fastq.gz ANTISENSE SENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx +Fri Dec 20 00:00:00 CET 2019 . LN18C LN18C_rep2 PAIRED input_lib_2.mate_2.fastq.gz input_lib_2.mate_2.fastq.gz ANTISENSE SENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx diff --git a/tests/test_scripts_labkey_to_snakemake/test.sh b/tests/test_scripts_labkey_to_snakemake/test.sh new file mode 100755 index 0000000000000000000000000000000000000000..06c50ccc165d2c3ab3402ca1e00645cf7499bd5f --- /dev/null +++ b/tests/test_scripts_labkey_to_snakemake/test.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# Tear down test environment +trap 'rm config.yaml samples.tsv && cd $user_dir' EXIT # quotes command is exected after script exits, regardless of exit status + +# Set up test environment +set -eo pipefail # ensures that script exits at first command that exits with non-zero status +set -u # ensures that script exits when unset variables are used +set -x # facilitates debugging by printing out executed commands +user_dir=$PWD +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)" +cd $script_dir + +# Run tests +python "../../scripts/labkey_to_snakemake.py" \ + --input_table="input_table.tsv" \ + --input_dict="input_dict.tsv" \ + --config_file="config.yaml" \ + --samples_table="samples.tsv" \ + --genomes_path="." +md5sum --check "expected_output.md5" +