diff --git a/.gitignore b/.gitignore index 8b137891791fe96927ad78e64b0aad7bded08bdc..e43b0f988953ae3a84b00331d0ccf5f7d51cb3cf 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1 @@ - +.DS_Store diff --git a/prepare_annotation/.gitignore b/prepare_annotation/.gitignore deleted file mode 100644 index 06cd4097a1ab378199d07c82a5014afbff8d30d0..0000000000000000000000000000000000000000 --- a/prepare_annotation/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -.* -Log.out -results -logs -dag.png -nohup.out diff --git a/prepare_annotation/Snakefile b/prepare_annotation/Snakefile deleted file mode 100644 index efd220e2be5a71ef5963af80034fda9cce6af1ca..0000000000000000000000000000000000000000 --- a/prepare_annotation/Snakefile +++ /dev/null @@ -1,199 +0,0 @@ -configfile: "config.yaml" - -localrules: download_genome, assemble_genome, download_annotation, assemble_annotation, finish - -################################################################################# -### Final rule -################################################################################# - -rule finish: - input: - STAR_index = os.path.join(config["output_dir"], "STAR_index"), - other_RNA_idx = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"), - salmon_index = os.path.join(config["output_dir"], "filtered_transcripts_salmon.idx") - -################################################################################# -### Download genome -################################################################################# - -rule download_genome: - params: - sequences = expand(config["genome"]), - output: - genome_dir = os.path.join(config["output_dir"], "genome") - singularity: - "docker://zavolab/ubuntu:18.04" - log: - os.path.join(config["local_log"], "download_genome.log") - shell: - "(wget --directory-prefix {output.genome_dir} {params.sequences}) &> {log}" - -################################################################################# -### Assemble genome 
-################################################################################# - -rule assemble_genome: - input: - genome_dir = os.path.join(config["output_dir"], "genome") - output: - genome = os.path.join(config["output_dir"], "genome.fa") - params: - genome = os.path.join(config["output_dir"], "genome.fa.gz") - singularity: - "docker://zavolab/ubuntu:18.04" - log: - os.path.join(config["local_log"], "assemble_genome.log") - shell: - "(cat {input.genome_dir}/* > {params.genome}; \ - zcat {params.genome} | sed \'s/\s.*//\' > {output.genome}; \ - rm {params.genome}; \ - ) &>{log}" - -################################################################################# -### Download annotation -################################################################################# - -rule download_annotation: - params: - annotation = expand(config["annotation"]), - output: - annotation_dir = os.path.join(config["output_dir"], "annotation") - singularity: - "docker://zavolab/ubuntu:18.04" - log: - os.path.join(config["local_log"], "download_annotation.log") - shell: - "(wget --directory-prefix {output.annotation_dir} {params.annotation}) &> {log}" - -################################################################################# -### Assemble annotation -################################################################################# - -rule assemble_annotation: - input: - annotation_dir = os.path.join(config["output_dir"], "annotation") - output: - annotation = os.path.join(config["output_dir"], "annotation.gtf") - params: - annotation = os.path.join(config["output_dir"], "annotation.gtf.gz") - singularity: - "docker://zavolab/ubuntu:18.04" - log: - os.path.join(config["local_log"], "assemble_annotation.log") - shell: - "(cat {input.annotation_dir}/* > {params.annotation}; \ - zcat {params.annotation} > {output.annotation}; \ - rm {params.annotation}; \ - ) &>{log}" - -################################################################################# -### ToDo: Download 
other RNA -################################################################################# - -################################################################################# -### Generate segemehl index for other RNAs -################################################################################# - -rule generate_segemehl_index_other_RNAs: - input: - sequence = config["other_RNA"] - output: - other_RNA_idx = os.path.join(config["output_dir"], "other_RNAs_sequence.idx") - log: - os.path.join(config["local_log"], "generate_segemehl_index_other_RNAs.log") - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x -x {output.other_RNA_idx} -d {input.sequence}) &> {log}" - -################################################################################# -### Index genome STAR -################################################################################# - -rule index_genome_STAR: - input: - genome = os.path.join(config["output_dir"], "genome.fa"), - annotation = os.path.join(config["output_dir"], "annotation.gtf") - output: - output = os.path.join(config["output_dir"], "STAR_index") - params: - outputdir = os.path.join(config["output_dir"],"STAR_index"), - sjdbOverhang = config["sjdbOverhang"] - threads: 8 - singularity: - "docker://zavolab/star:2.6.0a" - log: - os.path.join(config["local_log"],"index_genome_STAR.log") - shell: - "mkdir -p {output.output}; \ - chmod -R 777 {output.output}; \ - (STAR --runMode genomeGenerate \ - --sjdbOverhang {params.sjdbOverhang} \ - --genomeDir {params.outputdir} \ - --genomeFastaFiles {input.genome} \ - --runThreadN {threads} \ - --sjdbGTFfile {input.annotation}) &> {log}" - -################################################################################## -### Filter protein coding and lncRNA transcripts -################################################################################## - -rule filter_transcripts: - input: - script = "scripts/fg_extract_transcripts.py", - annotation = 
os.path.join(config["output_dir"], "annotation.gtf") - output: - annotation = os.path.join(config["output_dir"], "filtered_transcripts.gtf") - params: - transcript_biotype = "\"protein_coding,lincRNA,antisense_RNA,retained_intron,sense_intronic\"" - singularity: - "docker://zavolab/python_htseq:3.6.5_0.10.0" - log: - os.path.join(config["local_log"], "filter_transcripts.log") - shell: - "(python {input.script} \ - --gtf {input.annotation} \ - --out {output.annotation} \ - --transcript_biotype {params.transcript_biotype}) &> {log}" - -################################################################################## -### Extract transcript sequences -################################################################################## - -rule extract_sequences: - input: - annotation = os.path.join(config["output_dir"], "filtered_transcripts.gtf"), - genome = os.path.join(config["output_dir"], "genome.fa") - output: - transcripts = os.path.join(config["output_dir"], "filtered_transcripts.fa") - singularity: - "docker://zavolab/cufflinks:2.2.1" - log: - os.path.join(config["local_log"],"extract_sequences.log") - shell: - "(gffread {input.annotation} \ - -g {input.genome} \ - -w {output.transcripts}) &> {log}" - -################################################################################## -### Index salmon -################################################################################## - -rule index_salmon: - input: - transcripts = os.path.join(config["output_dir"], "filtered_transcripts.fa") - output: - index = os.path.join(config["output_dir"], "filtered_transcripts_salmon.idx") - params: - kmerLen = config["kmerLen"], - singularity: - "docker://zavolab/salmon:0.11.0" - log: - os.path.join(config["local_log"],"index_salmon.log") - threads: 8 - shell: - "(salmon index \ - --transcripts {input.transcripts} \ - --index {output.index} \ - --kmerLen {params.kmerLen} \ - --threads {threads}) &> {log}" diff --git a/prepare_annotation/cluster.json 
b/prepare_annotation/cluster.json deleted file mode 100644 index a6bfdd5fa4aea8da21bdaff8c6f0d0ad96c1c6d8..0000000000000000000000000000000000000000 --- a/prepare_annotation/cluster.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "__default__" : - { - "queue": "6hours", - "time": "05:00:00", - "threads": "1", - "mem": "4G", - "name": "{rule}.{wildcards}", - "out": "$PWD/logs/cluster_log/{rule}.{wildcards}-%j-%N.out" - }, - "generate_segemehl_index_other_RNAs": - { - "time": "06:00:00", - "threads":"8", - "mem":"50G" - }, - "index_genome_STAR": - { - "time": "06:00:00", - "threads":"8", - "mem":"75G" - } -} diff --git a/prepare_annotation/config.yaml b/prepare_annotation/config.yaml deleted file mode 100644 index 92249c924ed5882411b3eb5c031bc74c733b6261..0000000000000000000000000000000000000000 --- a/prepare_annotation/config.yaml +++ /dev/null @@ -1,43 +0,0 @@ ---- - ############################################################################## - ### Annotation - ############################################################################## - genome: ["ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.1.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.2.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.3.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.4.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.5.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.6.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.8.fa.gz", - 
"ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.9.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.10.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.11.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.12.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.13.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.14.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.15.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.16.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.17.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.18.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.19.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.20.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.21.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.22.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.X.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.Y.fa.gz", - "ftp://ftp.ensembl.org/pub/release-89/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna_sm.chromosome.MT.fa.gz"] - annotation: 
["ftp://ftp.ensembl.org/pub/release-89/gtf/homo_sapiens/Homo_sapiens.GRCh38.89.chr.gtf.gz"] - other_RNA: "other.fa" - ############################################################################## - ### Output and log directories - ############################################################################## - output_dir: "results" - local_log: "logs/local_log" - cluster_log: "logs/cluster_log" - ############################################################################## - ### Options - ############################################################################## - sjdbOverhang: 100 - kmerLen: 31 -... diff --git a/prepare_annotation/other.fa b/prepare_annotation/other.fa deleted file mode 100644 index 819e24651f2dda9c8f600f98cbd4b62971895822..0000000000000000000000000000000000000000 --- a/prepare_annotation/other.fa +++ /dev/null @@ -1,195 +0,0 @@ ->RNA45SN1 -GCTGACACGCTGTCCTCTGGCGACCTGTCGCTGGAGAGGTTGGGCCTCCGGATGCGCGCGGGGCTCTGGC -CTACCGGTGACCCGGCTAGCCGGCCGCGCTCCTGCTTGAGCCGCCTGCCGGGGCCCGCGGGCCTGCTGTT -CTCTCGCGCGTCCGAGCGTCCCGACTCCCGGTGCCGGCCCGGGTCCGGGTCTCTGACCCACCCGGGGGCG -GCGGGGAAGGCGGCGAGGGCCACCGTGCCCCCGTGCGCTCTCCGCTGCGGGCGCCCGGGGCGGCCGCGAC -AACCCCACCCCGCTGGCTCCGTGCCGTGCGTGTCAGGCGTTCTCGTCTCCGCGGGGTTGTCCGCCGCCCC -TTCCCCGGAGTGGGGGGTTGGCCGGAGCCGATCGGCTCGCTGGCCGGCCGGCCGGCCTCCGCTCCCGGGG -GGCTCTTCGTGATCGATGTGGTGACGTCGTGCTCTCCCGGGCCGGGTCCGAGCCGCGACGGGCGAGGGGC -GGACGTTCGTGGCGAACGGGACCGTCCTTCTCGCTCCGCCCCGCGGGGGTCCCCTCGTCTCTCCTCTCCC -CGCCCGCCGGCGGTGCGTGTGGGAAGGCGTGGGGTGCGGACCCCGGCCCGACCTCGCCGTCCCGCCCGCC -GCCTTCTGCGTCGCGGGGCGGGCCGGCGGGGTCCTCTGACGCGGCAGACAGCCCTCGCTGTCGCCTCCAG -TGGTTGTCGACTTGCGGGCGGCCCCCCTCCGCGGCGGTGGGGGTGCCGTCCCGCCGGCCCGTCGTGCTGC -CCTCTCGGGGGGTTTGCGCGAGCGTCGGCTCCGCCTGGGCCCTTGCGGTGCTCCTGGAGCGCTCCGGGTT -GTCCCTCAGGTGCCCGAGGCCGAACGGTGGTGTGTCGTTCCCGCCCCCGGCGCCCCCTCCTCCGGTCGCC -GCCGCGGTGTCCGCGCGTGGGTCCTGAGGGAGCTCGTCGGTGTGGGGTTCGAGGCGGTTTGAGTGAGACG -AGACGAGACGCGCCCCTCCCACGCGGGGAAGGGCGCCCGCCTGCTCTCGGTGAGCGCACGTCCCGTGCTC 
-CCCTCTGGCGGGTGCGCGCGGGCCGTGTGAGCGATCGCGGTGGGTTCGGGCCGGTGTGACGCGTGCGCCG -GCCGGCCGCCGAGGGGCTGCCGTTCTGCCTCCGACCGGTCGTGTGTGGGTTGACTTCGGAGGCGCTCTGC -CTCGGAAGGAAGGAGGTGGGTGGACGGGGGGGCCTGGTGGGGTTGCGCGCACGCGCGCACCGGCCGGGCC -CCCGCCCTGAACGCGAACGCTCGAGGTGGCCGCGCGCAGGTGTTTCCTCGTACCGCAGGGCCCCCTCCCT -TCCCCAGGCGTCCCTCGGCGCCTCTGCGGGCCCGAGGAGGAGCGGCTGGCGGGTGGGGGGAGTGTGACCC -ACCCTCGGTGAGAAAAGCCTTCTCTAGCGATCTGAGAGGCGTGCCTTGGGGGTACCGGATCCCCCGGGCC -GCCGCCTCTGTCTCTGCCTCCGTTATGGTAGCGCTGCCGTAGCGACCCGCTCGCAGAGGACCCTCCTCCG -CTTCCCCCTCGACGGGGTTGGGGGGGAGAAGCGAGGGTTCCGCCGGCCACCGCGGTGGTGGCCGAGTGCG -GCTCGTCGCCTACTGTGGCCCGCGCCTCCCCCTTCCGAGTCGGGGGAGGATCCCGCCGGGCCGGGCCCGG -CGTTCCCAGCGGGTTGGGACGCGGCGGCCGGCGGGCGGTGGGTGTGCGCGCCCGGCGCTCTGTCCGGCGC -GTGACCCCCTCCGCCGCGAGTCGGCTCTCCGCCCGCTCCCGTGCCGAGTCGTGACCGGTGCCGACGACCG -CGTTTGCGTGGCACGGGGTCGGGCCCGCCTGGCCCTGGGAAAGCGTCCCACGGTGGGGGCGCGCCGGTCT -CCCGGAGCGGGACCGGGTCGGAGGATGGACGAGAATCACGAGCGACGGTGGTGCGGGCGTGTCGGGTTCG -TGGCTGCGGTCGCTCCGGGGCCCCCGGTGGCGGGGCCCCGGGGCTCGCGAGGCGGTTCTCGGTGGGGGCC -GAGGGCCGTCCGGCGTCCCAGGCGGGGCGCCGCGGGACCGCCCTCGTGTCTGTGGCGGTGGGATCCCGCG -GCCGTGTTTTCCTGGTGGCCCGGCCGTGCCTGAGGTTTCTCCCCGAGCCGCCGCCTCTGCGGGCTCCCGG -GTGCCCTTGCCCTCGCGGTCCCCGGCCCTCGCCCGTCTGTGCCCTCTTCCCCGCCCGCCGCCCGCCGATC -CTCTTCTTCCCCCCGAGCGGCTCACCGGCTTCACGTCCGTTGGTGGCCCCGCCTGGGACCGAACCCGGCA -CCGCCTCGTGGGGCGCCGCCGCCGGCCACTGATCGGCCCGGCGTCCGCGTCCCCCGGCGCGCGCCTTGGG -GACCGGGTCGGTGGCGCCCCGCGTGGGGCCCGGTGGGCTTCCCGGAGGGTTCCGGGGGTCGGCCTGCGGC -GCGTGCGGGGGAGGAGACGGTTCCGGGGGACCGGCCGCGACTGCGGCGGCGGTGGTGGGGGCAGCCGCGG -GGATCGCCGAGGGCCGGTCGGCCGCCCCGGGTGCCGCGCGGTGCCGCCGGCGGCGGTGAGGCCCCGCGCG -TGTGTCCCGGCCGCGGTCGGCCGCGCTCGAGGGGTCCCCGTGGCGTCCCCTTCCCCGCCGGCCGCCTTTC -TCGCGCCTTCCCCGTCGCCCCGGCCTCGCCCGTGGTCTCTCGTCTTCTCCCGGCCCGCTCTTCCGAACCG -GGTCGGCGCGTCCCCCGGGTGCGCCTCGCTTCCCGGGCCTGCCGCGGCCCTTCCCCGAGGCGTCCGTCCC -GGGCGTCGGCGTCGGGGAGAGCCCGTCCTCCCCGCGTGGCGTCGCCCCGTTCGGCGCGCGCGTGCGCCCG -AGCGCGGCCCGGTGGTCCCTGCCGGACAGGCGTTCGTGCGACGTGTGGCGTGGGTCGACCTCCGCCTTGC 
-CGGTCGCTCGCCCTTTCCCCGGGTCGGGGGGTGGGGCCCGGGCCGGGGCCTCGGCCCCGGTCGCGGTCCC -CCGTCCCGGGCGGGGGCGGGCGCGCCGGCCGGCCTCGGTCGGCCCTCCCTTGGCCGTCGTGTGGCGTGTG -CCACCCCTGCGCCCGCGCCCGCCGGCGGGGCTCGGAGCCGGGCTTCGGCCGGGCCCCGGGCCCTCGACCG -GACCGGTGCGCGGGCGCTGCGGCCGCACGGCGCGACTGTCCCCGGGCCGGGCACCGCGGTCCGCCTCTCG -CTCGCCGCCCGGACGTCGGGGCCGCCCCGCGGGGCGGGCGGAGCGCCGTCCCCGCCTCGCCGCCGCCCGC -GGGCGCCGGCCGCGCGCGCGCGCGCGTGGCCGCCGGTCCCTCCCGGCCGCCGGGCGCGGGTCGGGCCGTC -CGCCTCCTCGCGGGCGGGCGCGACGAAGAAGCGTCGCGGGTCTGTGGCGCGGGGCCCCGGTGGTCGTGTC -GCGTGGGGGGCGGGTGGTTGGGGCGTCCGGTTCGCCGCGCCCCGCCCCGGCCCCACCGGTCCCGGCCGCC -GCCCCCGCGCCCGCTCGCTCCCTCCCGTCCGCCCGTCCGCGGCCCGTCCGTCCGTCCGTCGTCCTCCTCG -CTTGCGGGGCGCCGGGCCCGTCCTCGCGAGGCCCCCCGGCCGGCCGTCCGGCCGCGTCGGGGCCTCGCCG -CGCTCTACCTTACCTACCTGGTTGATCCTGCCAGTAGCATATGCTTGTCTCAAAGATTAAGCCATGCATG -TCTGAGTACGCACGGCCGGTACAGTGAAACTGCGAATGGCTCATTAAATCAGTTATGGTTCCTTTGGTCG -CTCGCTCCTCTCCTACTTGGATAACTGTGGTAATTCTAGAGCTAATACATGCCGACGGGCGCTGACCCCC -TTCGCGGGGGGGATGCGTGCATTTATCAGATCAAAACCAACCCGGTCAGCCCCTCTCCGGCCCCGGCCGG -GGGGCGGGCGCCGGCGGCTTTGGTGACTCTAGATAACCTCGGGCCGATCGCACGCCCCCCGTGGCGGCGA -CGACCCATTCGAACGTCTGCCCTATCAACTTTCGATGGTAGTCGCCGTGCCTACCATGGTGACCACGGGT -GACGGGGAATCAGGGTTCGATTCCGGAGAGGGAGCCTGAGAAACGGCTACCACATCCAAGGAAGGCAGCA -GGCGCGCAAATTACCCACTCCCGACCCGGGGAGGTAGTGACGAAAAATAACAATACAGGACTCTTTCGAG -GCCCTGTAATTGGAATGAGTCCACTTTAAATCCTTTAACGAGGATCCATTGGAGGGCAAGTCTGGTGCCA -GCAGCCGCGGTAATTCCAGCTCCAATAGCGTATATTAAAGTTGCTGCAGTTAAAAAGCTCGTAGTTGGAT -CTTGGGAGCGGGCGGGCGGTCCGCCGCGAGGCGAGCCACCGCCCGTCCCCGCCCCTTGCCTCTCGGCGCC -CCCTCGATGCTCTTAGCTGAGTGTCCCGCGGGGCCCGAAGCGTTTACTTTGAAAAAATTAGAGTGTTCAA -AGCAGGCCCGAGCCGCCTGGATACCGCAGCTAGGAATAATGGAATAGGACCGCGGTTCTATTTTGTTGGT -TTTCGGAACTGAGGCCATGATTAAGAGGGACGGCCGGGGGCATTCGTATTGCGCCGCTAGAGGTGAAATT -CTTGGACCGGCGCAAGACGGACCAGAGCGAAAGCATTTGCCAAGAATGTTTTCATTAATCAAGAACGAAA -GTCGGAGGTTCGAAGACGATCAGATACCGTCGTAGTTCCGACCATAAACGATGCCGACCGGCGATGCGGC -GGCGTTATTCCCATGACCCGCCGGGCAGCTTCCGGGAAACCAAAGTCTTTGGGTTCCGGGGGGAGTATGG 
-TTGCAAAGCTGAAACTTAAAGGAATTGACGGAAGGGCACCACCAGGAGTGGAGCCTGCGGCTTAATTTGA -CTCAACACGGGAAACCTCACCCGGCCCGGACACGGACAGGATTGACAGATTGATAGCTCTTTCTCGATTC -CGTGGGTGGTGGTGCATGGCCGTTCTTAGTTGGTGGAGCGATTTGTCTGGTTAATTCCGATAACGAACGA -GACTCTGGCATGCTAACTAGTTACGCGACCCCCGAGCGGTCGGCGTCCCCCAACTTCTTAGAGGGACAAG -TGGCGTTCAGCCACCCGAGATTGAGCAATAACAGGTCTGTGATGCCCTTAGATGTCCGGGGCTGCACGCG -CGCTACACTGACTGGCTCAGCGTGTGCCTACCCTACGCCGGCAGGCGCGGGTAACCCGTTGAACCCCATT -CGTGATGGGGATCGGGGATTGCAATTATTCCCCATGAACGAGGAATTCCCAGTAAGTGCGGGTCATAAGC -TTGCGTTGATTAAGTCCCTGCCCTTTGTACACACCGCCCGTCGCTACTACCGATTGGATGGTTTAGTGAG -GCCCTCGGATCGGCCCCGCCGGGGTCGGCCCACGGCCCTGGCGGAGCGCTGAGAAGACGGTCGAACTTGA -CTATCTAGAGGAAGTAAAAGTCGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTAACGGAGC -CCGGAGGGCGAGGCCCGCGGCGGCGCCGCCGCCGCCGCGCGCTTCCCTCCGCACACCCACCCCCCCACCG -CGACGCGGCGCGTGCGCGGGCGGGGCCCGCGTGCCCGTTCGTTCGCTCGCTCGTTCGTTCGCCGCCCGGC -CCCGCCGGCCGCGAGAGCCGGAGAACTCGGGAGGGAGACGGGGGAGAGAGAGAGAGAGAGAGAAAGAGAA -AGAAGGGCGTGTCGTTGGTGTGCGCGTGTCGTGGGGCCGGCGGGCGGCGGGGAGCGGTCCCCGGCCGCGG -CCCCGACGACGTGGGTGTCGGCGGGCGCGGGGGCGGTTCTCGGCGGCGTCGCGGCGGGTCTGGGGGGGTC -TCGGTGCCCTCCTCCCCGCCGGGGCCCGTCGTCCGGCCCCGCCGCGCCGGCTCCCCGTCTTCGGGGCCGG -CCGGATTCCCGTCGCCTCCGCCGCGCCGCTCCGCGCCGCCGGGCACGGCCCCGCTCGCTCTCCCCGGCCT -TCCCGCTAGGGCGTCTCGAGGGTCGGGGGCCGGACGCCGGTCCCCTCCCCCGCCTCCTCGTCCGCCCCCC -CGCCGTCCAGGTACCTAGCGCGTTCCGGCGCGGAGGTTTAAAGACCCCTTGGGGGGATCGCCCGTCCGCC -CGTGGGTCGGGGGCGGTGGTGGGCCCGCGGGGGAGTCCCGTCGGGAGGGGCCCGGCCCCTCCCGCGCCTC -CACCGCGGACTCCGCTCCCCGGCCGGGGCCGCGCCGCCGCCGCCGCCGCGGCGGCCGTCGGGTGGGGGCT -TTACCCGGCGGCCGTCGCGCGCCTGCCGCGCGTGTGGCGTGCGCCCCGCGCCGTGGGGGCGGGAACCCCC -GGGCGCCTGTGGGGTGGTGTCCGCGCTCGCCCCCGCGTGGGCGGCGCGCGCCTCCCCGTGGTGTGAAACC -TTCCGACCCCTCTCCGGAGTCCGGTCCCGTTTGCTGTCTCGTCTGGCCGGCCTGAGGCAACCCCCTCTCC -TCTTGGGCGGGGGGGGGGGGGACGTGCCGCGCCAGGAAGGGCCTCCTCCCGGTGCGTCGTCGGGAGCGCC -CTCGCCAAATCGACCTCGTACGACTCTTAGCGGTGGATCACTCGGCTCGTGCGTCGATGAAGAACGCAGC -TAGCTGCGAGAATTAATGTGAATTGCAGGACACATTGATCATCGACACTTCGAACGCACTTGCGGCCCCG 
-GGTTCCTCCCGGGGCTACGCCTGTCTGAGCGTCGCTTGCCGATCAATCGCCCCCGGGGGTGCCTCCGGGC -TCCTCGGGGTGCGCGGCTGGGGGTTCCCTCGCAGGGCCCGCCGGGGGCCCTCCGTCCCCCTAAGCGCAGA -CCCGGCGGCGTCCGCCCTCCTCTTGCCGCCGCGCCCGCCCCTTCCCCCTCCCCCCGCGGGCCCTGCGTGG -TCACGCGTCGGGTGGCGGGGGGGAGAGGGGGGCGCGCCCGGCTGAGAGAGACGGGGAGGGCGGCGCCGCC -GCCGCCCGCGAAGACGGAGAGGGAAAGAGAGAGCCGGCTCGGGCCGAGTTCCCGTGGCCGCCGCCTGCGG -TCCGGGTTCCTCCCTCGGGGGGCTCCCTCGCGCCGCGCGCGGCTCGGGGTTCGGGGTTCGTCGGCCCCGG -CCGGGTGGAAGGTCCCGTGCCCGTCGTCGTCGTCGTCGTCGCGCGTCGTCGGCGGTGGGGGCGTGTTGCG -TGCGGTGTGGTGGTGGGGGAGGAGGAAGGCGGGTCCGGAAGGGGAAGGGTGCCGGCGGGGAGAGAGGGTC -GGGGGAGCGCGTCCCGGTCGCCGCGGTTCGCCGCCCGCCCCCGGTGGCGGCCCGGCGTCCGGCCGACCGC -CGCTCCCGCGCCCCTCCTCCTCCCCGCCGCCCCTCCTCCGAGGCCCCGCCCGTCCTCCTCGCCCTCCCCG -CGCGTACGCGCGCCCGCCCGCCCGGCTCGCCTCGCGGCGCGTCGGCCGGGGCCGGGAGCCCGCCCCGCGG -CCCGCCCGGCCGCGCCCGTGGCCGCGGCGCCGGGGTTCGCGTGTCCCCGGCGGCGACCCGCGGGACGCCG -CGGTGTCGTCCGCCGTCGCGCGCCCGCCTCCGGCTCGCGGCCGCGCCGCGCCGCGCCGGGGCCCCGTCCC -GAGCTTCCGCGTCGGGGCGGGGCGGCTCCGCCGCCGCGTCCTCGGACCCGTCCCCCCGACCTCCGCGGGG -GAGACGGGTCGGGGCGTGCGGCGCCCGTCCCGCCCCCGGCCCGTGCCCCTCCCTCCGGTCGTCCCGCTCC -GGCGGGGCGGCGCGGGGGTGCCGCCGGCCGCGCGCTCTCTCTCCCGTCGCCTCTCCCCCTCGCCGGGCCC -GTCTCCCGACGGAGCGTCGGGCGGGCGGTCGGGCCGGCGCGATTCCGTCCGTCCGTCCGCCGAGCGGCCC -GTCCCCCTCCGAGACGCGACCTCAGATCAGACGTGGCGACCCGCTGAATTTAAGCATATTAGTCAGCGGA -GGAGAAGAAACTAACCAGGATTCCCTCAGTAACGGCGAGTGAACAGGGAAGAGCCCAGCGCCGAATCCCC -GCCCCGCGGCGGGGCGCGGGACATGTGGCGTACGGAAGACCCGCTCCCCGGCGCCGCTCGTGGGGGGCCC -AAGTCCTTCTGATCGAGGCCCAGCCCGTGGACGGTGTGAGGCCGGTAGCGGCCCCCGGCGCGCCGGGCCC -GGGTCTTCCCGGAGTCGGGTTGCTTGGGAATGCAGCCCAAAGCGGGTGGTAAACTCCATCTAAGGCTAAA -TACCGGCACGAGACCGATAGTCAACAAGTACCGTAAGGGAAAGTTGAAAAGAACTTTGAAGAGAGAGTTC -AAGAGGGCGTGAAACCGTTAAGAGGTAAACGGGTGGGGTCCGCGCAGTCCGCCCGGAGGATTCAACCCGG -CGGCGGGTCCGGCCGTGTCGGCGGCCCGGCGGATCTTTCCCGCCCCCCGTTCCTCCCGACCCCTCCACCC -GCCCTCCCTTCCCCCGCCGCCCCTCCTCCTCCTCCCCGGAGGGGGCGGGCTCCGGCGGGTGCGGGGGTGG -GCGGGCGGGGCCGGGGGTGGGGTCGGCGGGGGACCGTCCCCCGACCGGCGACCGGCCGCCGCCGGGCGCA 
-TTTCCACCGCGGCGGTGCGCCGCGACCGGCTCCGGGACGGCTGGGAAGGCCCGGCGGGGAAGGTGGCTCG -GGGGGCCCCGTCCGTCCGTCCGTCCGTCCTCCTCCTCCCCCGTCTCCGCCCCCCGGCCCCGCGTCCTCCC -TCGGGAGGGCGCGCGGGTCGGGGCGGCGGCGGCGGCGGCGGTGGCGGCGGCGGCGGCGGCGGCGGGACCG -AAACCCCCCCCGAGTGTTACAGCCCCCCCGGCAGCAGCACTCGCCGAATCCCGGGGCCGAGGGAGCGAGA -CCCGTCGCCGCGCTCTCCCCCCTCCCGGCGCCCACCCCCGCGGGGAATCCCCCGCGAGGGGGGTCTCCCC -CGCGGGGGCGCGCCGGCGTCTCCTCGTGGGGGGGCCGGGCCACCCCTCCCACGGCGCGACCGCTCTCCCA -CCCCTCCTCCCCGCGCCCCCGCCCCGGCGACGGGGGGGGTGCCGCGCGCGGGTCGGGGGGCGGGGCGGAC -TGTCCCCAGTGCGCCCCGGGCGGGTCGCGCCGTCGGGCCCGGGGGAGGTTCTCTCGGGGCCACGCGCGCG -TCCCCCGAAGAGGGGGACGGCGGAGCGAGCGCACGGGGTCGGCGGCGACGTCGGCTACCCACCCGACCCG -TCTTGAAACACGGACCAAGGAGTCTAACACGTGCGCGAGTCGGGGGCTCGCACGAAAGCCGCCGTGGCGC -AATGAAGGTGAAGGCCGGCGCGCTCGCCGGCCGAGGTGGGATCCCGAGGCCTCTCCAGTCCGCCGAGGGC -GCACCACCGGCCCGTCTCGCCCGCCGCGCCGGGGAGGTGGAGCACGAGCGCACGTGTTAGGACCCGAAAG -ATGGTGAACTATGCCTGGGCAGGGCGAAGCCAGAGGAAACTCTGGTGGAGGTCCGTAGCGGTCCTGACGT -GCAAATCGGTCGTCCGACCTGGGTATAGGGGCGAAAGACTAATCGAACCATCTAGTAGCTGGTTCCCTCC -GAAGTTTCCCTCAGGATAGCTGGCGCTCTCGCAGACCCGACGCACCCCCGCCACGCAGTTTTATCCGGTA -AAGCGAATGATTAGAGGTCTTGGGGCCGAAACGATCTCAACCTATTCTCAAACTTTAAATGGGTAAGAAG -CCCGGCTCGCTGGCGTGGAGCCGGGCGTGGAATGCGAGTGCCTAGTGGGCCACTTTTGGTAAGCAGAACT -GGCGCTGCGGGATGAACCGAACGCCGGGTTAAGGCGCCCGATGCCGACGCTCATCAGACCCCAGAAAAGG -TGTTGGTTGATATAGACAGCAGGACGGTGGCCATGGAAGTCGGAATCCGCTAAGGAGTGTGTAACAACTC -ACCTGCCGAATCAACTAGCCCTGAAAATGGATGGCGCTGGAGCGTCGGGCCCATACCCGGCCGTCGCCGG -CAGTCGAGAGTGGACGGGAGCGGCGGGGGCGGCGCGCGCGCGCGCGCGTGTGGTGTGCGTCGGAGGGCGG -CGGCGGCGGCGGCGGCGGGGGTGTGGGGTCCTTCCCCCGCCCCCCCCCCCACGCCTCCTCCCCTCCTCCC -GCCCACGCCCCGCTCCCCGCCCCCGGAGCCCCGCGGACGCTACGCCGCGACGAGTAGGAGGGCCGCTGCG -GTGAGCCTTGAAGCCTAGGGCGCGGGCCCGGGTGGAGCCGCCGCAGGTGCAGATCTTGGTGGTAGTAGCA -AATATTCAAACGAGAACTTTGAAGGCCGAAGTGGAGAAGGGTTCCATGTGAACAGCAGTTGAACATGGGT -CAGTCGGTCCTGAGAGATGGGCGAGCGCCGTTCCGAAGGGACGGGCGATGGCCTCCGTTGCCCTCGGCCG -ATCGAAAGGGAGTCGGGTTCAGATCCCCGAATCCGGAGTGGCGGAGATGGGCGCCGCGAGGCGTCCAGTG 
-CGGTAACGCGACCGATCCCGGAGAAGCCGGCGGGAGCCCCGGGGAGAGTTCTCTTTTCTTTGTGAAGGGC -AGGGCGCCCTGGAATGGGTTCGCCCCGAGAGAGGGGCCCGTGCCTTGGAAAGCGTCGCGGTTCCGGCGGC -GTCCGGTGAGCTCTCGCTGGCCCTTGAAAATCCGGGGGAGAGGGTGTAAATCTCGCGCCGGGCCGTACCC -ATATCCGCAGCAGGTCTCCAAGGTGAACAGCCTCTGGCATGTTGGAACAATGTAGGTAAGGGAAGTCGGC -AAGCCGGATCCGTAACTTCGGGATAAGGATTGGCTCTAAGGGCTGGGTCGGTCGGGCTGGGGCGCGAAGC -GGGGCTGGGCGCGCGCCGCGGCTGGACGAGGCGCCGCCGCCCCCCCCACGCCCGGGGCACCCCCCTCGCG -GCCCTCCCCCGCCCCACCCCGCGCGCGCCGCTCGCTCCCTCCCCGCCCCGCGCCCTCTCTCTCTCTCTCT -CCCCCGCTCCCCGTCCTCCCCCCTCCCCGGGGGAGCGCCGCGTGGGGGCGGCGGCGGGGGGAGAAGGGTC -GGGGCGGCAGGGGCCGGCGGCGGCCCGCCGCGGGGCCCCGGCGGCGGGGGCACGGTCCCCCGCGAGGGGG -GCCCGGGCACCCGGGGGGCCGGCGGCGGCGGCGACTCTGGACGCGAGCCGGGCCCTTCCCGTGGATCGCC -CCAGCTGCGGCGGGCGTCGCGGCCGCCCCCGGGGAGCCCGGCGGGCGCCGGCGCGCCCCCCCCCCCACCC -CACGTCTCGTCGCGCGCGCGTCCGCTGGGGGCGGGGAGCGGTCGGGCGGCGGCGGTCGGCGGGCGGCGGG -GCGGGGCGGTTCGTCCCCCCGCCCTACCCCCCCGGCCCCGTCCGCCCCCCGTTCCCCCCTCCTCCTCGGC -GCGCGGCGGCGGCGGCGGGCGGCGGAGGGGCCGCGGGCCGGTCCCCCCCGCCGGGTCCGCCCCCGGGGCC -GCGGTTCCGCGCGGCGCCTCGCCTCGGCCGGCGCCTAGCAGCCGACTTAGAACTGGTGCGGACCAGGGGA -ATCCGACTGTTTAATTAAAACAAAGCATCGCGAAGGCCCGCGGCGGGTGTTGACGCGATGTGATTTCTGC -CCAGTGCTCTGAATGTCAAAGTGAAGAAATTCAATGAAGCGCGGGTAAACGGCGGGAGTAACTATGACTC -TCTTAAGGTAGCCAAATGCCTCGTCATCTAATTAGTGACGCGCATGAATGGATGAACGAGATTCCCACTG -TCCCTACCTACTATCCAGCGAAACCACAGCCAAGGGAACGGGCTTGGCGGAATCAGCGGGGAAAGAAGAC -CCTGTTGAGCTTGACTCTAGTCTGGCACGGTGAAGAGACATGAGAGGTGTAGAATAAGTGGGAGGCCCCC -GGCGCCCCCCCGGTGTCCCCGCGAGGGGCCCGGGGCGGGGTCCGCCGGCCCTGCGGGCCGCCGGTGAAAT -ACCACTACTCTGATCGTTTTTTCACTGACCCGGTGAGGCGGGGGGGCGAGCCCCGAGGGGCTCTCGCTTC -TGGCGCCAAGCGCCCGGCCGCGCGCCGGCCGGGCGCGACCCGCTCCGGGGACAGTGCCAGGTGGGGAGTT -TGACTGGGGCGGTACACCTGTCAAACGGTAACGCAGGTGTCCTAAGGCGAGCTCAGGGAGGACAGAAACC -TCCCGTGGAGCAGAAGGGCAAAAGCTCGCTTGATCTTGATTTTCAGTACGAATACAGACCGTGAAAGCGG -GGCCTCACGATCCTTCTGACCTTTTGGGTTTTAAGCAGGAGGTGTCAGAAAAGTTACCACAGGGATAACT -GGCTTGTGGCGGCCAAGCGTTCATAGCGACGTCGCTTTTTGATCCTTCGATGTCGGCTCTTCCTATCATT 
-GTGAAGCAGAATTCACCAAGCGTTGGATTGTTCACCCACTAATAGGGAACGTGAGCTGGGTTTAGACCGT -CGTGAGACAGGTTAGTTTTACCCTACTGATGATGTGTTGTTGCCATGGTAATCCTGCTCAGTACGAGAGG -AACCGCAGGTTCAGACATTTGGTGTATGTGCTTGGCTGAGGAGCCAATGGGGCGAAGCTACCATCTGTGG -GATTATGACTGAACGCCTCTAAGTCAGAATCCCGCCCAGGCGGAACGATACGGCAGCGCCGCGGAGCCTC -GGTTGGCCTCGGATAGCCGGTCCCCCGCCTGTCCCCGCCGGCGGGCCGCCCCCCCCTCCACGCGCCCCGC -GCGCGCGGGAGGGCGCGTGCCCCGCCGCGCGCCGGGACCGGGGTCCGGTGCGGAGTGCCCTTCGTCCTGG -GAAACGGGGCGCGGCTGGAAAGGCGGCCGCCCCCTCGCCCGTCACGCACCGCACGTTCGTGGGGAACCTG -GCGCTAAACCATTCGTAGACGACCTGCTTCTGGGTCGGGGTTTCGTACGTAGCAGAGCAGCTCCCTCGCT -GCGATCTATTGAAAGTCAGCCCTCGACACAAGGGTTTGTCCGCGCGCGCGCGCGCGCGTGCGTGCGGGGG -GCCCGGCGGGGCGTGCGCGTCCGGCGCCGTCCGTCCTTCCGTTCGTCTTCCTCCCTCCCGGCCTCTCCCG -CCGACCGCGGGCGTGGTGGTGGGGGTGGGGGGGAGGGCGCGCGACCCCGGTCGGCGCGCCCCGCTTCTTC -GGTTCCCGCCTCCTCCCCGTTCACCGCCGGGGCGGCTCGTCCGCTCCGGGCCGGGACGGGGTCCGGGGAG -CGTGGTTTGGGAGCCGCGGAGGCGGCCGCGCCGAGCCGGGCCCGTGGCCCGCCGGTCCCCGTCCCGGGGG -TTGGCCGCGCGGGCCCCGGTGGGGCGGCCACCCGGGGTCCCGGCCCTCGCG ->RNA5S1 -GTCTACGGCCATACCACCCTGAACGCGCCCGATCTCGTCTGATCTCGGAAGCTAAGCAGGGTCGGGCCTG -GTTAGTACTTGGATGGGAGACCGCCTGGGAATACCGGGTGCTGTAGGCTTT diff --git a/process_data/.gitignore b/process_data/.gitignore deleted file mode 100644 index 58c3d7f9d20089761693a491f14ae0626205a9b9..0000000000000000000000000000000000000000 --- a/process_data/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -.* -Log.out -results -logs -dag.png -nohup.out -samples diff --git a/process_data/Snakefile b/process_data/Snakefile deleted file mode 100644 index edd9baa064b837396dbf255ffd2a961c497f2f4f..0000000000000000000000000000000000000000 --- a/process_data/Snakefile +++ /dev/null @@ -1,182 +0,0 @@ -configfile: "config.yaml" - -localrules: finish - -################################################################################# -### Final rule -################################################################################# - -rule finish: - input: - fastqc = expand(os.path.join(config["output_dir"], "{sample}", "fastqc"), 
sample=config["sample"]), - htseq_qa = expand(os.path.join(config["output_dir"], "{sample}", "htseq_qa", "htseq_quality.pdf"), sample=config["sample"]), - gn_estimates = expand(os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.genes.sf"), sample=config["sample"]), - bam = expand(os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam"), sample=config["sample"]) - -################################################################################## -### Fastqc -################################################################################## - -rule fastqc: - input: - reads = os.path.join(config["input_dir"], "{sample}.fastq.gz") - output: - outdir = os.path.join(config["output_dir"], "{sample}", "fastqc") - singularity: - "docker://zavolab/fastqc:0.11.8" - log: - os.path.join(config["local_log"], "fastqc_{sample}.log") - shell: - "(mkdir -p {output.outdir}; \ - fastqc \ - --outdir {output.outdir} \ - {input.reads}) &> {log}" - -################################################################################## -### HTSeq quality assessment of the fastq file -################################################################################## - -rule htseq_qa: - input: - reads = os.path.join(config["input_dir"], "{sample}.fastq.gz") - output: - qual_pdf = os.path.join(config["output_dir"], "{sample}", "htseq_qa", "htseq_quality.pdf") - singularity: - "docker://zavolab/python_htseq:3.6.5_0.10.0" - log: - os.path.join(config["local_log"], "htseq_qa_{sample}.log") - shell: - "(htseq-qa \ - -t fastq \ - -o {output.qual_pdf} \ - {input.reads} ) &> {log}" - -################################################################################## -### Map to other RNAs with Segemehl -################################################################################## - -rule map_to_other_RNAs: - input: - reads = os.path.join(config["input_dir"], "{sample}.fastq.gz"), - index = config["other_RNAs_index"], - sequence = config["other_RNAs_sequence"] - 
output: - sam = os.path.join(config["output_dir"], "{sample}", "other_genes.mapped.sam"), - reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz") - params: - reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq"), - silent = "--silent", - accuracy = "90" - log: - os.path.join(config["local_log"], "map_to_other_genes_{sample}.log") - threads: 8 - singularity: - "docker://zavolab/segemehl:0.2.0" - shell: - "(segemehl.x \ - {params.silent} \ - -i {input.index} \ - -d {input.sequence} \ - -q {input.reads} \ - --accuracy {params.accuracy} \ - --threads {threads} \ - -o {output.sam} \ - -u {params.reads}; \ - gzip -c {params.reads} > {output.reads}; \ - rm {params.reads}) &> {log}" - -################################################################################## -### salmon quant -################################################################################## - -rule salmon_quant: - input: - reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz"), - gtf = config["annotation_filtered"], - index = config["salmon_index"] - output: - output_dir = os.path.join(config["output_dir"], "{sample}", "salmon_quant"), - gn_estimates = os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.genes.sf"), - tr_estimates = os.path.join(config["output_dir"], "{sample}", "salmon_quant", "quant.sf") - params: - libType = lambda wildcards: config[wildcards.sample]['libType'], - fldMean = lambda wildcards: config[wildcards.sample]['fldMean'], - fldSD = lambda wildcards: config[wildcards.sample]['fldSD'], - log: - os.path.join(config["local_log"], "salmon_quant_{sample}.log") - threads: 6 - singularity: - "docker://zavolab/salmon:0.11.0" - shell: - "(salmon quant \ - --index {input.index} \ - --libType {params.libType} \ - --unmatedReads <(zcat {input.reads}) \ - --seqBias \ - --geneMap {input.gtf} \ - --fldMean {params.fldMean} \ - --fldSD {params.fldSD} \ - --threads {threads} 
\ - --output {output.output_dir}) &> {log}" - -################################################################################# -### Align reads STAR -################################################################################# - -rule align_reads_STAR: - input: - index = config["STAR_index"], - reads = os.path.join(config["output_dir"], "{sample}", "other_genes.unmapped.fastq.gz"), - gtf = config["annotation"] - output: - outputfile = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam") - params: - outFileNamePrefix = os.path.join(config["output_dir"], "{sample}", "STAR_") - log: - os.path.join(config["local_log"],"align_reads_STAR_{sample}.log") - threads: 8 - singularity: - "docker://zavolab/star:2.6.0a" - shell: - "(STAR --runMode alignReads \ - --twopassMode Basic \ - --runThreadN {threads} \ - --genomeDir {input.index} \ - --sjdbGTFfile {input.gtf} \ - --readFilesIn {input.reads} \ - --readFilesCommand zcat \ - --outFileNamePrefix {params.outFileNamePrefix} \ - --outSAMtype BAM Unsorted) &> {log}" - -################################################################################ -### Sort alignment file -################################################################################ - -rule sort_bam: - input: - bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.bam") - output: - bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam") - threads: 8 - log: - os.path.join(config["local_log"],"sort_bam_{sample}.log") - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools sort -@ {threads} {input.bam} > {output.bam}) &> {log}" - -################################################################################ -### Index alignment file -################################################################################ - -rule samtools_index: - input: - bam = os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam") - output: - bai = 
os.path.join(config["output_dir"], "{sample}", "STAR_Aligned.out.sorted.bam.bai") - log: - os.path.join(config["local_log"],"samtools_index_{sample}.log") - singularity: - "docker://zavolab/samtools:1.8" - shell: - "(samtools index {input.bam} > {output.bai}) &> {log}" diff --git a/process_data/create_snakemake_flowchart.sh b/process_data/create_snakemake_flowchart.sh deleted file mode 100755 index ce71a9a3dcfcd940c55d9568bff4e10bfa670f45..0000000000000000000000000000000000000000 --- a/process_data/create_snakemake_flowchart.sh +++ /dev/null @@ -1 +0,0 @@ -snakemake --dag -np | dot -Tpng > dag.png diff --git a/process_data/run_snakefile.sh b/process_data/run_snakefile.sh deleted file mode 100755 index ceec8a8493b661eab1c279a7e44a6f82d2d77ba6..0000000000000000000000000000000000000000 --- a/process_data/run_snakefile.sh +++ /dev/null @@ -1,13 +0,0 @@ -# set -e - -mkdir -p logs/cluster_log -mkdir -p logs/local_log - -snakemake \ ---cluster-config cluster.json \ ---cluster "sbatch --cpus-per-task={cluster.threads} --mem={cluster.mem} --qos={cluster.queue} --time={cluster.time} --job-name={cluster.name} -o {cluster.out} -p scicore" \ ---cores 256 \ --p \ ---rerun-incomplete \ ---use-singularity \ ---singularity-args "--bind ${PWD}" diff --git a/prepare_annotation/scripts/fg_extract_transcripts.py b/scripts/fg_extract_transcripts.py similarity index 100% rename from prepare_annotation/scripts/fg_extract_transcripts.py rename to scripts/fg_extract_transcripts.py diff --git a/snakemake/Snakefile b/snakemake/Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..b541c1e992f2c1a127eeff1956832ef93ae03eed --- /dev/null +++ b/snakemake/Snakefile @@ -0,0 +1,132 @@ +configfile: "config.yaml" + +################################################################################ +### python modules +################################################################################ + +import os +import sys +import pandas as pd + +############################ + 
+samples_table = pd.read_csv(config["samples"], header=0, index_col=0, comment='#', engine='python', sep="\t") + +localrules: finish + +################################################################################## +# Execution dependent on sequencing mode +################################################################################## + +include: 'paired_end.snakefile' +include: 'single_end.snakefile' + +################################################################################# +### Final rule +################################################################################# + +rule finish: + input: + outdir1 = expand(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate1_fastqc"), sample=samples_table.index.values), + outdir2 = expand(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate2_fastqc"), sample=samples_table.index.values) + + +rule create_index_star: + ''' Create index using STAR''' + input: + genome = lambda wildcards: samples_table.loc[samples_table['organism'] == wildcards.organism, 'genome'].iloc[0], + gtf = lambda wildcards: samples_table.loc[samples_table['organism'] == wildcards.organism, 'gtf'].iloc[0] + output: + chromosome_info = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrNameLength.txt"), + chromosomes_names = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index", + "chrName.txt") + params: + output_dir = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index"), + outFileNamePrefix = os.path.join( + config["star_indexes"], + "{organism}", + "{index_size}", + "STAR_index/STAR_"), + sjdbOverhang = lambda wildcards: + wildcards.index_size, + singularity: + "docker://zavolab/star:2.6.0a" + threads: 12 + log: + os.path.join( config["local_log"], "{organism}_{index_size}_create_index_star.log") + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + STAR \ + --runMode genomeGenerate \ + --sjdbOverhang
{params.sjdbOverhang} \ + --genomeDir {params.output_dir} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --sjdbGTFfile {input.gtf}) &> {log}" + + +rule create_index_salmon: + '''Create index for salmon quantification''' + input: + transcriptome = lambda wildcards: + samples_table.loc[samples_table['organism'] == wildcards.organism, 'tr_fasta_filtered'].iloc[0] + output: + index = os.path.join( + config["salmon_indexes"], + "{organism}", + "salmon.idx") + params: + kmerLen = lambda wildcards: + samples_table.loc[samples_table['organism'] == wildcards.organism, 'kmer'].iloc[0] + singularity: + "docker://zavolab/salmon:0.11.0" + log: + os.path.join(config["local_log"], "{organism}_create_index_salmon.log") + threads: 8 + shell: + "(salmon index \ + --transcripts {input.transcriptome} \ + --index {output.index} \ + --kmerLen {params.kmerLen} \ + --threads {threads}) &> {log}" + + +rule create_index_kallisto: + '''Create index for running Kallisto''' + input: + transcriptome = lambda wildcards: + samples_table.loc[samples_table['organism'] == wildcards.organism, 'tr_fasta_filtered'].iloc[0] + output: + index = os.path.join( + config["kallisto_indexes"], + "{organism}", + "kallisto.idx") + params: + output_dir = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + wildcards.organism) + singularity: + "docker://zavolab/kallisto:0.9" + log: + os.path.join(config["local_log"], "{organism}_create_index_kallisto.log") + shell: + "(mkdir -p {params.output_dir}; \ + chmod -R 777 {params.output_dir}; \ + kallisto index -i {output.index} {input.transcriptome}) &> {log}" + diff --git a/process_data/cluster.json b/snakemake/cluster.json similarity index 100% rename from process_data/cluster.json rename to snakemake/cluster.json diff --git a/process_data/config.yaml b/snakemake/config.yaml similarity index 81% rename from process_data/config.yaml rename to snakemake/config.yaml index d2c34bae0bd619d420ce33a2a2edb2d095259c86..80641a90cd98c6df1421965406defd27a1d84a81 100644 --- a/process_data/config.yaml +++
b/snakemake/config.yaml @@ -2,6 +2,7 @@ ############################################################################## ### Annotation ############################################################################## + organism: "Homo_sapiens" annotation: "../prepare_annotation/results/annotation.gtf" genome: "../prepare_annotation/results/genome.fa" annotation_filtered: "../prepare_annotation/results/filtered_transcripts.gtf" @@ -12,13 +13,17 @@ ############################################################################## ### Output and log directories ############################################################################## + database_path: "/scicore/home/zavolan/GROUP/Rna_Seq_pipeline/Blabla" + STAR_idx_folder: "STAR_indices" output_dir: "results" + star_indexes: "results" + salmon_indexes: "results" + kallisto_indexes: "results" local_log: "logs/local_log" cluster_log: "logs/cluster_log" + ############################################################################## ### Sample info ############################################################################## - input_dir: "samples" - sample: ["test"] - test: {libType: A, fldMean: 300, fldSD: 100} + samples: "../tests/samples.tsv" ... 
diff --git a/prepare_annotation/create_snakemake_flowchart.sh b/snakemake/create_snakemake_flowchart.sh similarity index 100% rename from prepare_annotation/create_snakemake_flowchart.sh rename to snakemake/create_snakemake_flowchart.sh diff --git a/snakemake/paired_end.snakefile b/snakemake/paired_end.snakefile new file mode 100644 index 0000000000000000000000000000000000000000..ebc283a950b1b3411f0a973f8e22183b04254a8a --- /dev/null +++ b/snakemake/paired_end.snakefile @@ -0,0 +1,364 @@ + +rule pe_fastqc: + '''A quality control tool for high throughput sequence data''' + + input: + reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] + output: + outdir1 = directory(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate1_fastqc")), + outdir2 = directory(os.path.join(config["output_dir"], "paired_end", "{sample}", "mate2_fastqc")) + threads: + 2 + singularity: + "docker://zavolab/fastqc:0.11.8" + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "fastqc.log") + shell: + "(mkdir -p {output.outdir1}; \ + mkdir -p {output.outdir2}; \ + fastqc --outdir {output.outdir1} {input.reads1}; \ + fastqc --outdir {output.outdir2} {input.reads2}) &> {log}" + +rule pe_htseq_qa: + '''Assess the technical quality of a run''' + input: + reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] + output: + qual_pdf_mate1 = os.path.join(config["output_dir"], "paired_end", "{sample}", "htseq_quality_mate1.pdf"), + qual_pdf_mate2 = os.path.join(config["output_dir"], "paired_end", "{sample}", "htseq_quality_mate2.pdf") + threads: + 2 + singularity: + "docker://zavolab/python_htseq:3.6.5_0.10.0" + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "htseq_qa_.log") + shell: + "(htseq-qa -t fastq -o {output.qual_pdf_mate1} {input.reads1}; \ + htseq-qa -t fastq -o
{output.qual_pdf_mate2} {input.reads2}; ) &> {log}" + + +rule pe_remove_adapters_cutadapt: + '''Remove adapters''' + input: + reads1 = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + reads2 = lambda wildcards: samples_table.loc[wildcards.sample, "fq2"] + output: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate2.fastq.gz") + params: + adapter_3_mate1 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq1_3p'], + adapter_5_mate1 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq1_5p'], + adapter_3_mate2 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq2_3p'], + adapter_5_mate2 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq2_5p'] + singularity: + "docker://zavolab/cutadapt:1.16" + threads: 8 + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "remove_adapters_cutadapt.log") + shell: + "(cutadapt \ + -e 0.1 \ + -j {threads} \ + --pair-filter=both \ + -m 10 \ + -n 3 \ + -a {params.adapter_3_mate1} \ + -g {params.adapter_5_mate1} \ + -A {params.adapter_3_mate2} \ + -G {params.adapter_5_mate2} \ + -o {output.reads1} \ + -p {output.reads2} \ + {input.reads1} \ + {input.reads2}) &> {log}" + + +rule pe_remove_polya_cutadapt: + '''Remove polyA tails''' + input: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_adapters_mate2.fastq.gz") + output: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz") + params: + polya_3_mate1 = lambda wildcards: +
samples_table.loc[wildcards.sample, 'fq1_polya'], + polya_3_mate2 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq2_polya'], + singularity: + "docker://zavolab/cutadapt:1.16" + threads: 8 + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "remove_polya_cutadapt.log") + shell: + '(cutadapt \ + --match-read-wildcards \ + -j {threads} \ + --pair-filter=both \ + -m 10 \ + -n 2 \ + -e 0.1 \ + -q 6 \ + -a {params.polya_3_mate1} \ + -A {params.polya_3_mate2} \ + -o {output.reads1} \ + -p {output.reads2} \ + {input.reads1} \ + {input.reads2}) &> {log}' + + +rule pe_map_genome_star: + '''Map to genome using STAR''' + input: + index = lambda wildcards: + os.path.join( + config["star_indexes"], + samples_table.loc[wildcards.sample, "organism"], + str(samples_table.loc[wildcards.sample, "index_size"]), + "STAR_index", + "chrNameLength.txt"), + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz") + output: + bam = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam"), + logfile = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Log.final.out") + params: + sample_id = "{sample}", + index = lambda wildcards: + os.path.join( + config["star_indexes"], + samples_table.loc[wildcards.sample, "organism"], + str(samples_table.loc[wildcards.sample, "index_size"]), + "STAR_index"), + outFileNamePrefix = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_"), + multimappers = lambda wildcards: + samples_table.loc[wildcards.sample, "multimappers"], + soft_clip = lambda wildcards: + samples_table.loc[wildcards.sample, "soft_clip"], + pass_mode = lambda wildcards: + samples_table.loc[wildcards.sample, "pass_mode"] + + singularity: + "docker://zavolab/star:2.6.0a" +
+ threads: 12 + + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "map_genome_star.log") + + shell: + "(STAR \ + --runMode alignReads \ + --twopassMode {params.pass_mode} \ + --runThreadN {threads} \ + --genomeDir {params.index} \ + --readFilesIn {input.reads1} {input.reads2} \ + --readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_SortedByCoordinate \ + --outSAMtype BAM SortedByCoordinate \ + --outFilterMismatchNoverLmax 0.04 \ + --outFilterScoreMinOverLread 0.3 \ + --outFilterMatchNminOverLread 0.3 \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rnaseq_pipeline SM:{params.sample_id} \ + --alignEndsType {params.soft_clip} > {output.bam};) &> {log}" + + +rule pe_index_genomic_alignment_samtools: + '''Index the genomic alignment''' + input: + bam = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam"), + output: + bai = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam.bai"), + singularity: + "docker://zavolab/samtools:1.8" + log: + os.path.join( config["local_log"], "paired_end", "{sample}", "index_genomic_alignment_samtools.log") + + shell: + "(samtools index {input.bam} {output.bai};) &> {log}" + + +rule pe_quantification_salmon: + '''Quantification at transcript and gene level using Salmon''' + input: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz"), + gtf = lambda wildcards: + samples_table.loc[wildcards.sample, 'gtf_filtered'], + index = lambda wildcards: + os.path.join( +
config["salmon_indexes"], + samples_table.loc[wildcards.sample, 'organism'], + "salmon.idx") + output: + gn_estimates = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "salmon_quant", + "quant.genes.sf"), + tr_estimates = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "salmon_quant", + "quant.sf") + params: + output_dir = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "salmon_quant"), + libType = lambda wildcards: + samples_table.loc[wildcards.sample, 'libtype'] + log: + os.path.join(config["local_log"], "paired_end", "{sample}", "genome_quantification_salmon.log") + threads: 6 + singularity: + "docker://zavolab/salmon:0.11.0" + shell: + "(salmon quant \ + --libType {params.libType} \ + --seqBias \ + --validateMappings \ + --threads {threads} \ + --writeUnmappedNames \ + --index {input.index} \ + --geneMap {input.gtf} \ + -1 {input.reads1} \ + -2 {input.reads2} \ + -o {params.output_dir}) &> {log}" + + +rule pe_genome_quantification_kallisto: + '''Quantification at transcript and gene level using Kallisto''' + input: + reads1 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate1.fastq.gz"), + reads2 = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "{sample}.remove_polya_mate2.fastq.gz"), + index = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + samples_table.loc[wildcards.sample, 'organism'], + "kallisto.idx") + output: + pseudoalignment = os.path.join( + config["output_dir"], + "paired_end", + "{sample}", + "quant_kallisto", + "{sample}.kallisto.pseudo.sam") + params: + output_dir = lambda wildcards: + os.path.join( + config["output_dir"], + "paired_end", + wildcards.sample, + "quant_kallisto"), + directionality = lambda wildcards: + samples_table.loc[wildcards.sample, "kallisto_directionality"] + singularity: + "docker://zavolab/kallisto:0.9" + threads: 8 + log: + os.path.join(config["local_log"], 
"paired_end", "{sample}", "genome_quantification_kallisto.log") + shell: + "(kallisto quant \ + -i {input.index} \ + -o {params.output_dir} \ + --pseudobam \ + --{params.directionality}-stranded \ + {input.reads1} {input.reads2} > {output.pseudoalignment}) &> {log}" + + + + + + + + diff --git a/snakemake/preprocessing.snakefile b/snakemake/preprocessing.snakefile new file mode 100644 index 0000000000000000000000000000000000000000..54d1d32ec3a633985aa2c9468580cdec8dc138ad --- /dev/null +++ b/snakemake/preprocessing.snakefile @@ -0,0 +1,28 @@ + + +rule index_genome_STAR: + ''' + Create Star index + ''' + input: + genome = os.path.join(config["output_dir"], "genome.fa"), + annotation = os.path.join(config["output_dir"], "annotation.gtf") + output: + output = directory(os.path.join(config["database_path"], config['organism'], config['STAR_idx_folder'], "STAR_index{sjdb}")) + params: + # The splice-junction overhang is taken from the {sjdb} wildcard of the requested index directory + sjdbOverhang = "{sjdb}" + threads: 8 + singularity: + "docker://zavolab/star:2.6.0a" + log: + os.path.join(config["local_log"],"index_genome_STAR_{sjdb}.log") + shell: + "mkdir -p {output.output}; \ + chmod -R 777 {output.output}; \ + (STAR --runMode genomeGenerate \ + --sjdbOverhang {params.sjdbOverhang} \ + --genomeDir {output.output} \ + --genomeFastaFiles {input.genome} \ + --runThreadN {threads} \ + --sjdbGTFfile {input.annotation}) &> {log}" \ No newline at end of file diff --git a/prepare_annotation/run_snakefile.sh b/snakemake/run_snakefile.sh similarity index 100% rename from prepare_annotation/run_snakefile.sh rename to snakemake/run_snakefile.sh diff --git a/snakemake/single_end.snakefile b/snakemake/single_end.snakefile new file mode 100644 index 0000000000000000000000000000000000000000..970dec471c18a8ce7f1b41e84ad02e221c12a4c1 --- /dev/null +++ b/snakemake/single_end.snakefile @@ -0,0 +1,282 @@ +import os +rule fastqc: + ''' A quality control tool for high throughput sequence data.
''' + input: + reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"], + output: + outdir = directory(os.path.join(config["output_dir"], "single_end", "{sample}", "fastqc")) + singularity: + "docker://zavolab/fastqc:0.11.8" + log: + os.path.join(config["local_log"], "single_end", "{sample}", "fastqc.log") + shell: + "(mkdir -p {output.outdir}; \ + fastqc \ + --outdir {output.outdir} \ + {input.reads}) &> {log}" + + +rule htseq_qa: + ''' Assess the technical quality of a run. ''' + input: + reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"] + output: + qual_pdf = os.path.join(config["output_dir"], "single_end", "{sample}", "htseq_quality.pdf") + singularity: + "docker://zavolab/python_htseq:3.6.5_0.10.0" + log: + os.path.join(config["local_log"], "single_end", "{sample}", "htseq_qa.log") + shell: + "(htseq-qa \ + -t fastq \ + -o {output.qual_pdf} \ + {input.reads} ) &> {log}" + + +rule remove_adapters_cutadapt: + ''' Remove adapters ''' + input: + reads = lambda wildcards: samples_table.loc[wildcards.sample, "fq1"] + output: + reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_adapters.fastq.gz") + params: + adapters_3 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq1_3p'], + adapters_5 = lambda wildcards: + samples_table.loc[wildcards.sample, 'fq1_5p'] + + singularity: + "docker://zavolab/cutadapt:1.16" + threads: 8 + log: + os.path.join(config["local_log"], "single_end", "{sample}", "remove_adapters_cutadapt.log") + shell: + "(cutadapt \ + -e 0.1 \ + -O 1 \ + -j {threads} \ + -m 10 \ + -n 3 \ + -a {params.adapters_3} \ + -g {params.adapters_5} \ + -o {output.reads} \ + {input.reads}) &> {log}" + + +rule remove_polya_cutadapt: + ''' Remove polyA tails''' + input: + reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_adapters.fastq.gz") + output: + reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz") + params: + polya_3 = lambda wildcards: +
samples_table.loc[wildcards.sample, "fq1_polya"] + singularity: + "docker://zavolab/cutadapt:1.16" + threads: 8 + log: + os.path.join(config["local_log"], "single_end", "{sample}", "remove_polya_cutadapt.log") + shell: + "(cutadapt \ + --match-read-wildcards \ + -j {threads} \ + -n 2 \ + -e 0.1 \ + -O 1 \ + -q 6 \ + -m 10 \ + -a {params.polya_3} \ + -o {output.reads} \ + {input.reads}) &> {log}" + + +rule map_genome_star: + ''' Map to genome using STAR. ''' + input: + index = lambda wildcards: + os.path.join( + config["star_indexes"], + samples_table.loc[wildcards.sample, "organism"], + str(samples_table.loc[wildcards.sample, "index_size"]), + "STAR_index","chrNameLength.txt"), + reads = os.path.join(config["output_dir"], "single_end", "{sample}", "{sample}.remove_polya.fastq.gz") + output: + bam = os.path.join(config["output_dir"], "single_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam"), + logfile = os.path.join(config["output_dir"], "single_end", + "{sample}", + "map_genome", + "{sample}_Log.final.out") + params: + sample_id = "{sample}", + index = lambda wildcards: + os.path.join( + config["star_indexes"], + samples_table.loc[wildcards.sample, "organism"], + str(samples_table.loc[wildcards.sample, "index_size"]), + "STAR_index"), + outFileNamePrefix = lambda wildcards: + os.path.join( + config["output_dir"], + "single_end", + wildcards.sample, "map_genome", wildcards.sample + "_"), + multimappers = lambda wildcards: + samples_table.loc[wildcards.sample, "multimappers"], + soft_clip = lambda wildcards: + samples_table.loc[wildcards.sample, "soft_clip"], + pass_mode = lambda wildcards: + samples_table.loc[wildcards.sample, "pass_mode"], + singularity: + "docker://zavolab/star:2.6.0a" + threads: 12 + log: + os.path.join(config["local_log"], "single_end", "{sample}", "map_genome_star.log") + shell: + "(STAR \ + --runMode alignReads \ + --twopassMode {params.pass_mode} \ + --runThreadN {threads} \ + --genomeDir {params.index} \ + --readFilesIn {input.reads} \ +
--readFilesCommand zcat \ + --outSAMunmapped None \ + --outFilterMultimapNmax {params.multimappers} \ + --outFilterMultimapScoreRange 1 \ + --outFileNamePrefix {params.outFileNamePrefix} \ + --outSAMattributes All \ + --outStd BAM_SortedByCoordinate \ + --outSAMtype BAM SortedByCoordinate \ + --outFilterMismatchNoverLmax 0.04 \ + --outFilterScoreMinOverLread 0.3 \ + --outFilterMatchNminOverLread 0.3 \ + --outFilterType BySJout \ + --outReadsUnmapped None \ + --outSAMattrRGline ID:rcrunch SM:{params.sample_id} \ + --alignEndsType {params.soft_clip} > {output.bam};) &> {log}" + + +rule index_genomic_alignment_samtools: + '''Index genome bamfile using samtools.''' + input: + bam = os.path.join(config["output_dir"], + "single_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam") + output: + bai = os.path.join(config["output_dir"], + "single_end", + "{sample}", + "map_genome", + "{sample}_Aligned.sortedByCoord.out.bam.bai") + singularity: + "docker://zavolab/samtools:1.8" + threads: 1 + log: + os.path.join(config["local_log"], "single_end", "{sample}", "index_genomic_alignment_samtools.log") + shell: + "(samtools index {input.bam} {output.bai};) &> {log}" + + +rule quantification_salmon: + ''' Quantification at transcript and gene level using Salmon.
''' + input: + reads = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.remove_polya.fastq.gz"), + index = lambda wildcards: + os.path.join( + config["salmon_indexes"], + samples_table.loc[wildcards.sample, 'organism'], + "salmon.idx"), + gtf = lambda wildcards: samples_table.loc[wildcards.sample, "gtf_filtered"] + output: + gn_estimates = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "salmon_quant", + "quant.genes.sf"), + tr_estimates = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "salmon_quant", + "quant.sf") + params: + output_dir = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "salmon_quant"), + libType = lambda wildcards: + samples_table.loc[wildcards.sample, "libtype"] + log: + os.path.join(config["local_log"], "single_end", "{sample}", "quantification_salmon.log") + threads: 12 + singularity: + "docker://zavolab/salmon:0.11.0" + shell: + "(salmon quant \ + --libType {params.libType} \ + --seqBias \ + --validateMappings \ + --threads {threads} \ + --writeUnmappedNames \ + --index {input.index} \ + --geneMap {input.gtf} \ + --unmatedReads {input.reads} \ + -o {params.output_dir}) &> {log}" + + +rule genome_quantification_kallisto: + ''' Quantification at transcript and gene level using Kallisto.
''' + input: + reads = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.remove_polya.fastq.gz"), + index = lambda wildcards: + os.path.join( + config["kallisto_indexes"], + samples_table.loc[wildcards.sample, "organism"], + "kallisto.idx") + output: + pseudoalignment = os.path.join( + config["output_dir"], + "single_end", + "{sample}", + "{sample}.kallisto.pseudo.sam") + params: + output_dir = lambda wildcards: + os.path.join( + config["output_dir"], + "single_end", + wildcards.sample, + "quant_kallisto"), + fraglen = lambda wildcards: samples_table.loc[wildcards.sample, 'mean'], + fragsd = lambda wildcards: samples_table.loc[wildcards.sample, 'sd'], + directionality = lambda wildcards: samples_table.loc[wildcards.sample, 'kallisto_directionality'] + threads: 8 + log: + os.path.join(config["local_log"],"kallisto_align_{sample}.log") + singularity: + "docker://zavolab/kallisto:0.9" + shell: + "(kallisto quant \ + -i {input.index} \ + -o {params.output_dir} \ + --single \ + -l {params.fraglen} \ + -s {params.fragsd} \ + --pseudobam \ + --{params.directionality}-stranded \ + {input.reads} > {output.pseudoalignment}) &> {log}" + + \ No newline at end of file diff --git a/tests/RNA_Seq_data_template_test.tsv b/tests/RNA_Seq_data_template_test.tsv new file mode 100644 index 0000000000000000000000000000000000000000..34c7bb10db59c7ca9d693ded26e5cf1b5ab79b1d --- /dev/null +++ b/tests/RNA_Seq_data_template_test.tsv @@ -0,0 +1,3 @@ +Entry_Date Path_Fastq_Files Condition_Name Replicate_Name Single_Paired Mate1_File Mate2_File Mate1_Direction Mate2_Direction Mate1_5p_Adapter Mate1_3p_Adapter Mate2_5p_Adapter Mate2_3p_Adapter Fragment_Length_Mean Fragment_Length_SD Quality_Control_Flag Checksum_Raw_FASTQ_Mate1 Checksum_Raw_FASTQ_Mate2 File_Name_Metadata_File Name_Quality_Control_File_Mate1 Name_Quality_Control_File_Mate2 Organism TaxonID Strain_Isolate_Breed_Ecotype Strain_Isolate_Breed_Ecotype_ID Biomaterial_Provider Source_Tissue_Name Tissue_Code
Additional_Tissue_Description Genotype_Short_Name Genotype_Description Disease_Short_Name Disease_Description Treatment_Short_Name Treatment_Description Gender Age Developmental_Stage Passage_Number Sample_Preparation_Date Prepared_By Documentation Protocol_File Sequencing_Date Sequencing_Instrument Library_preparation_kit Cycles Molecule Contaminant_Sequences BioAnalyzer_File +Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031355465-60677668 LN18C LN18C_rep1 PAIRED BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx +Fri Dec 20 00:00:00 CET 2019 /scicore/projects/openbis/userstore/biozentrum_zavolan/20191119031410069-60677669 LN18C LN18C_rep2 PAIRED BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz ANTISENSE ANTISENSE AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA AAAAAAAAAA 300.0 100.0 xxx xxx xxx xxx xxx xxx Homo sapiens 9606 xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx xxx diff --git a/tests/samples.tsv b/tests/samples.tsv new file mode 100644 index 0000000000000000000000000000000000000000..fd5216ca83b96371bb6237a8a282eaa76291b465 --- /dev/null +++ b/tests/samples.tsv @@ -0,0 +1,3 @@ +sample fq1 fq2 +LN18C_rep1 /Users/foivosgypas/Desktop/samples/BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R1_001_MM_1.fastq.gz /Users/foivosgypas/Desktop/samples/BSSE_QGF_131557_HHK5FDRXX_1_7_1_LN18C_1_GAATGAGA_GAGGCATT_S1_L001_R2_001_MM_1.fastq.gz +LN18C_rep2 
/Users/foivosgypas/Desktop/samples/BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R1_001_MM_1.fastq.gz /Users/foivosgypas/Desktop/samples/BSSE_QGF_131558_HHK5FDRXX_1_7_2_LN18C_2_AGGCAGAG_AGAATGCC_S2_L001_R2_001_MM_1.fastq.gz