Skip to content
Snippets Groups Projects
Commit cd541afe authored by BIOPZ-Katsantoni Maria's avatar BIOPZ-Katsantoni Maria Committed by Alex Kanitz
Browse files

generate Snakemake inputs from LabKey data table

Adds script `scripts/labkey_to_snakemake.py` which
- maps LabKey table fields to Snakemake parameters
- assembles required parameters from the table data
- infers required parameters from the input data
- produces files `config.yaml` and `samples.tsv` required by the Snakemake pipeline

A self-contained integration test for the script is located at `tests/test_scripts_labkey_to_snakemake` (execute script `test.sh`) and was added to the CI/CD pipeline.

Note that intermittent changes to the `master` branch were merged into this branch to forego conflicts during merging.

Closes #39
parent e2a49244
Branches
Tags
No related merge requests found
Showing with 334 additions and 6 deletions
image: snakemake/snakemake:v5.9.1
before_script:
- pip install biopython==1.76
test:
script:
- cd snakemake
- snakemake -n
- bash run_test.sh
- bash tests/test_scripts_labkey_to_snakemake/test.sh
- cd snakemake # fix this in future version; all tests
- snakemake -n # should be called from home directory
- bash run_test.sh #
- cd .. #
# add additional tests here
## -----------------------------------------------------------------------------
# Author : Katsantoni Maria, Christina Herrmann
# Company: Mihaela Zavolan, Biozentrum, Basel
# -----------------------------------------------------------------------------
# -----------------------------------------------------------------------------
# This script is part of the Zavolan lab quantification pipeline, which is used
# for analysing RNA-seq data. The table is provided by labkey and is a csv file.
# If the user provides their own table the table should contain the following
# columns:
# -----------------------------------------------------------------------------
import sys
import gzip
from argparse import ArgumentParser, RawTextHelpFormatter
import os
import numpy as np
import pandas as pd
from Bio import SeqIO
from io import StringIO
from csv import writer
from pathlib import Path
# ----------------------------------------------------------------------------------------------------------------------
def main():
""" Preprocess sample folder and create config file for snakemake"""
__doc__ = "Preprocess of the table and create config file."
parser = ArgumentParser(
description=__doc__,
formatter_class=RawTextHelpFormatter)
parser.add_argument(
"--input_table",
dest="input_table",
help="input table containing the sample information",
required=True,
metavar="FILE")
parser.add_argument(
"--input_dict",
dest="input_dict",
help="input dictionary containing the feature name \
conversion from labkey to snakemake allowed names",
required=True,
metavar="FILE")
parser.add_argument(
"--genomes_path",
dest="genomes_path",
help="path containing the fasta and gtf files for all organisms",
required=True)
parser.add_argument(
"--multimappers",
dest="multimappers",
help="number of mulitmappers allowed",
required=False,
type=int,
metavar='value',
default=1)
parser.add_argument(
"--soft_clip",
dest="soft_clip",
help="soft-clipping option of STAR",
required=False,
choices=['EndToEnd','Local'],
default='EndToEnd')
parser.add_argument(
"--pass_mode",
dest="pass_mode",
help="STAR option pass_mode",
required=False,
choices=['None','Basic'],
default='None')
parser.add_argument(
"--libtype",
dest="libtype",
help="Library type for salmon",
required=False,
default='A')
parser.add_argument(
"--config_file",
dest="config_file",
help="Configuration file to be used by Snakemake",
required=False)
parser.add_argument(
"--samples_table",
dest="samples_table",
help="Table with samples",
required=True)
# __________________________________________________________________________________________________________________
# ------------------------------------------------------------------------------------------------------------------
# get the arguments
# ------------------------------------------------------------------------------------------------------------------
try:
options = parser.parse_args()
except(Exception):
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
sys.stdout.write('Reading input file...\n')
input_table = pd.read_csv(
options.input_table,
header=0,
sep='\t',
index_col=None,
comment='#',
engine='python')
input_dict = pd.read_csv(
options.input_dict,
header=0,
sep='\t',
index_col=None,
comment='#',
engine='python')
input_dict.set_index('snakemake', inplace=True, drop=True)
sys.stdout.write('Create snakemake table...\n')
snakemake_table = pd.DataFrame()
for index, row in input_table.iterrows():
if row[input_dict.loc['seqmode', 'labkey']] == 'PAIRED':
snakemake_table.loc[index, 'seqmode'] = 'paired_end'
elif row[input_dict.loc['seqmode', 'labkey']] == 'SINGLE':
snakemake_table.loc[index, 'seqmode'] = 'single_end'
fq1 = os.path.join(
row[input_dict.loc['fastq_path', 'labkey']],
row[input_dict.loc['fq1', 'labkey']])
snakemake_table.loc[index, 'fq1'] = fq1
with gzip.open(fq1, "rt") as handle:
for record in SeqIO.parse(handle, "fastq"):
read_length = len(record.seq)
break
snakemake_table.loc[index, 'index_size'] = read_length
if read_length <= 50:
snakemake_table.loc[index, 'kmer'] = 21
elif read_length > 50:
snakemake_table.loc[index, 'kmer'] = 31
snakemake_table.loc[index, 'fq2'] = os.path.join(
row[input_dict.loc['fastq_path', 'labkey']],
row[input_dict.loc['fq2', 'labkey']])
snakemake_table.loc[index, 'fq1_3p'] = row[input_dict.loc['fq1_3p', 'labkey']]
snakemake_table.loc[index, 'fq1_5p'] = row[input_dict.loc['fq1_5p', 'labkey']]
snakemake_table.loc[index, 'fq2_3p'] = row[input_dict.loc['fq2_3p', 'labkey']]
snakemake_table.loc[index, 'fq2_5p'] = row[input_dict.loc['fq2_5p', 'labkey']]
organism = row[input_dict.loc['organism', 'labkey']].replace(' ', '_').lower()
snakemake_table.loc[index, 'organism'] = organism
snakemake_table.loc[index, 'gtf'] = os.path.join(
options.genomes_path,
organism,
'annotation.gtf')
snakemake_table.loc[index, 'gtf_filtered'] = os.path.join(
options.genomes_path,
organism,
'annotation.gtf')
snakemake_table.loc[index, 'genome'] = os.path.join(
options.genomes_path,
organism,
'genome.fa')
snakemake_table.loc[index, 'tr_fasta_filtered'] = os.path.join(
options.genomes_path,
organism,
'transcriptome.fa')
snakemake_table.loc[index, 'sd'] = row[input_dict.loc['sd', 'labkey']]
snakemake_table.loc[index, 'mean'] = row[input_dict.loc['mean', 'labkey']]
snakemake_table.loc[index, 'multimappers'] = options.multimappers
snakemake_table.loc[index, 'soft_clip'] = options.soft_clip
snakemake_table.loc[index, 'pass_mode'] = options.pass_mode
snakemake_table.loc[index, 'libtype'] = options.libtype
if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE':
snakemake_table.loc[index, 'kallisto_directionality'] = '--fr-stranded'
elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE':
snakemake_table.loc[index, 'kallisto_directionality'] = '--rf-stranded'
else:
snakemake_table.loc[index, 'kallisto_directionality'] = ''
if row[input_dict.loc['mate1_direction', 'labkey']] == 'SENSE':
snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA'
elif row[input_dict.loc['mate1_direction', 'labkey']] == 'ANTISENSE':
snakemake_table.loc[index, 'fq1_polya'] = 'TTTTTTTTTTTTTTTTT'
elif row[input_dict.loc['mate1_direction', 'labkey']] == 'RANDOM':
snakemake_table.loc[index, 'fq1_polya'] = 'AAAAAAAAAAAAAAAAA'
else:
pass
if row[input_dict.loc['mate2_direction', 'labkey']] == 'SENSE':
snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA'
elif row[input_dict.loc['mate2_direction', 'labkey']] == 'ANTISENSE':
snakemake_table.loc[index, 'fq2_polya'] = 'TTTTTTTTTTTTTTTTT'
elif row[input_dict.loc['mate2_direction', 'labkey']] == 'RANDOM':
snakemake_table.loc[index, 'fq2_polya'] = 'AAAAAAAAAAAAAAAAA'
else:
pass
snakemake_table.to_csv(
options.samples_table,
sep='\t',
header=True,
index=False)
# Read file and infer read size for sjdbovwerhang
with open(options.config_file, 'w') as config_file:
config_file.write('''---
output_dir: "results"
local_log: "local_log"
star_indexes: "star_indexes"
kallisto_indexes: "kallisto_indexes"
...''')
sys.stdout.write('Create snakemake table finished successfully...\n')
sys.stdout.write('Create config file...\n')
sys.stdout.write('Create config file finished successfully...\n')
return
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!" + os.linesep)
sys.exit(0)
Entry_Date Path_Fastq_Files Condition_Name Replicate_Name Single_Paired Mate1_File Mate2_File Mate1_Direction Mate2_Direction Mate1_5p_Adapter Mate1_3p_Adapter Mate2_5p_Adapter Mate2_3p_Adapter Fragment_Length_Mean Fragment_Length_SD Quality_Control_Flag Checksum_Raw_FASTQ_Mate1 Checksum_Raw_FASTQ_Mate2 File_Name_Metadata_File Name_Quality_Control_File_Mate1 Name_Quality_Control_File_Mate2 Organism TaxonID Strain_Isolate_Breed_Ecotype Strain_Isolate_Breed_Ecotype_ID Biomaterial_Provider Source_Tissue_Name Tissue_Code Additional_Tissue_Description Genotype_Short_Name Genotype_Description Disease_Short_Name Disease_Description Treatment_Short_Name Treatment_Description Gender Age Developmental_Stage Passage_Number Sample_Preparation_Date Prepared_By Documentation Protocol_File Sequencing_Date Sequencing_Instrument Library_preparation_kit Cycles Molecule Contaminant_Sequences BioAnalyzer_File
Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031355465-60677668 LN18C LN18C_rep1 PAIRED BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx
Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031410069-60677669 LN18C LN18C_rep2 PAIRED BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx
de940b0dd38a67a7433536a5b3aee0ac config.yaml
d9c9ea4cd6108d39a2521dd87cd0c7e1 samples.tsv
labkey snakemake
Entry date entry_date
Path to FASTQ file(s) fastq_path
Condition name condition
Replicate name replicate_name
End type (PAIRED or SINGLE) seqmode
Name of Mate1 FASTQ file fq1
Name of Mate2 FASTQ file fq2
Direction of Mate1 (SENSE, ANTISENSE or RANDOM) mate1_direction
Direction of Mate2 (SENSE, ANTISENSE or RANDOM) mate2_direction
5' adapter of Mate1 fq1_5p
3' adapter of Mate1 fq1_3p
5' adapter of Mate2 fq2_5p
3' adapter of Mate2 fq2_3p
Fragment length mean mean
Fragment length SD sd
Quality control flag (PASSED or FAILED) quality_control_flag
Checksum of raw Mate1 FASTQ file mate1_checksum
Checksum of raw Mate2 FASTQ file mate2_checksum
Name of metadata file metadata
Name of quality control file for Mate1 mate1_quality
Name of quality control file for Mate2 mate2_quality
Organism organism
Taxon ID taxon_id
Name of Strain / Isolate / Breed / Ecotype strain_name
Strain / Isolate / Breed / Ecotype ID strain_id
Biomaterial provider biomaterial_provider
Source / tissue name source_name
Tissue code tissue_code
Additional tissue description tissue_description
Genotype short name genotype_name
Genotype description genotype_description
Disease short name disease_name
Disease description disease_description
Abbreviation for treatment treatment
Treatment description treatment_description
Gender gender
Age age
Developmental stage development_stage
Passage number passage_number
Sample preparation date (YYYY-MM-DD) sample_prep_date
Prepared by prepared_by
Documentation documentation
Name of protocol file protocol_file
Sequencing date (YYYY-MM-DD) seq_date
Sequencing instrument seq_instrument
Library preparation kit library_kit
Cycles cycles
Molecule molecule
Contaminant sequences contaminant_seqs
Name of BioAnalyzer file bioanalyser_file
\ No newline at end of file
File added
File added
File added
File added
Entry date Path to FASTQ file(s) Condition name Replicate name End type (PAIRED or SINGLE) Name of Mate1 FASTQ file Name of Mate2 FASTQ file Direction of Mate1 (SENSE, ANTISENSE or RANDOM) Direction of Mate2 (SENSE, ANTISENSE or RANDOM) 5' adapter of Mate1 3' adapter of Mate1 5' adapter of Mate2 3' adapter of Mate2 Fragment length mean Fragment length SD Quality control flag (PASSED or FAILED) Checksum of raw Mate1 FASTQ file Checksum of raw Mate2 FASTQ file Name of metadata file Name of quality control file for Mate1 Name of quality control file for Mate2 Organism Taxon ID Name of Strain / Isolate / Breed / Ecotype Strain / Isolate / Breed / Ecotype ID Biomaterial provider Source / tissue name Tissue code Additional tissue description Genotype short name Genotype description Disease short name Disease description Abbreviation for treatment Treatment description Gender Age Developmental stage Passage number Sample preparation date (YYYY-MM-DD) Prepared by Documentation Name of protocol file Sequencing date (YYYY-MM-DD) Sequencing instrument Library preparation kit Cycles Molecule Contaminant sequences Name of BioAnalyzer file
Fri Dec 20 00:00:00 CET 2019 . LN18C LN18C_rep1 PAIRED input_lib_1.mate_1.fastq.gz input_lib_1.mate_2.fastq.gz ANTISENSE SENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx
Fri Dec 20 00:00:00 CET 2019 . LN18C LN18C_rep2 PAIRED input_lib_2.mate_2.fastq.gz input_lib_2.mate_2.fastq.gz ANTISENSE SENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx
#!/bin/bash
# Tear down test environment
trap 'rm config.yaml samples.tsv && cd $user_dir' EXIT # quotes command is exected after script exits, regardless of exit status
# Set up test environment
set -eo pipefail # ensures that script exits at first command that exits with non-zero status
set -u # ensures that script exits when unset variables are used
set -x # facilitates debugging by printing out executed commands
user_dir=$PWD
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"
cd $script_dir
# Run tests
python "../../scripts/labkey_to_snakemake.py" \
--input_table="input_table.tsv" \
--input_dict="input_dict.tsv" \
--config_file="config.yaml" \
--samples_table="samples.tsv" \
--genomes_path="."
md5sum --check "expected_output.md5"
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment