diff --git a/snakemake/prepare_annotation/config.yaml b/snakemake/prepare_annotation/config.yaml index 854ad3776c8b7d47c0ea1ddb54c1e8ca3ccd9861..02a28109626d949d21b679a0093a66ad1b9d5889 100644 --- a/snakemake/prepare_annotation/config.yaml +++ b/snakemake/prepare_annotation/config.yaml @@ -2,9 +2,9 @@ ############################################################################## ### Annotation ############################################################################## - genome: "../annotation/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa" - gtf: "../annotation/Mus_musculus.GRCm38.90.chr.gtf" - other_RNAs_sequence: "../annotation/mm10_rrnas.fa" + genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa" + gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf" + other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" ############################################################################## ### Output and log directory ############################################################################## diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile index b660645f93276bd43f55263a1cf3c8b3c731fd12..bfc0f0cc82f8d7930213d2ad9819e801bcfed9df 100644 --- a/snakemake/process_data/Snakefile +++ b/snakemake/process_data/Snakefile @@ -1,7 +1,7 @@ configfile: "config.yaml" #from snakemake.utils import listfiles -localrules: finish +localrules: finish, count_oligos ################################################################################ ### Finish rule @@ -9,11 +9,13 @@ localrules: finish rule finish: input: - pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]), - counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]), - bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]), - oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]), - overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"]) + oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]) + + + #pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]), + #counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]), + #bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]), + #overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"]) ################################################################################ ### Count oligos @@ -22,13 +24,18 @@ rule count_oligos: input: reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]), oligos = config["oligos"], - script = "scripts/scanAdapter.sh" + script = "scripts/count_oligos.py" output: oligos_counts = os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt") log: os.path.join(config["local_log"], "count_oligos_{sample}.log") + singularity: + "docker://zavolab/python_htseq_biopython:3.6.5_0.10.0_1.71" shell: - "({input.script} {input.reads} {input.oligos} > {output.oligos_counts}) &> {log}" + "(python {input.script} \ + --fastq <(zcat {input.reads}) \ + --oligos {input.oligos} \ + --out {output.oligos_counts}) &> {log}" ################################################################################# ### Trim first bases (For Smarter-Kit ONLY) diff --git a/snakemake/process_data/config.yaml b/snakemake/process_data/config.yaml index 4635bd2b9c0acbf39001a64ebafc3f29eeea7457..3be9717aa1135df698a0c06331a5ed0a30f8a855 100644 --- a/snakemake/process_data/config.yaml +++ b/snakemake/process_data/config.yaml @@ -2,7 +2,7 @@ ############################################################################## ### Annotation ############################################################################## - other_RNAs_sequence: "../annotation/mm10_rrnas.fa" + other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa" other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx" transcripts_sequence: "../prepare_annotation/results/longest_pc_transcript_per_gene.fa" transcripts_index: "../prepare_annotation/results/longest_pc_transcript_per_gene.idx" @@ -19,9 +19,6 @@ ############################################################################## input_dir: "samples" input_reads_pattern: ".fastq.gz" - sample: ["BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1", "BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1", "BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1", "BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1"] - BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} - BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} - BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} - BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} + sample: ["example"] + example: {adapter: "GATCGGAAGAGCACA", minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5} ... diff --git a/snakemake/process_data/run_snakefile.sh b/snakemake/process_data/run_snakefile.sh index 536a8d02fea353f99177d5aeddeeea00b2a3766b..0d5707ec50f1c71ccb13848186a8e303a57b2fe6 100755 --- a/snakemake/process_data/run_snakefile.sh +++ b/snakemake/process_data/run_snakefile.sh @@ -10,4 +10,5 @@ snakemake \ -p \ --rerun-incomplete \ --use-singularity \ ---singularity-args "--bind ${PWD},${PWD}/../" +--singularity-args "--no-home --bind ${PWD},${PWD}/../,${PWD}/../annotation,${PWD}/scripts" \ + diff --git a/snakemake/process_data/scripts/count_oligos.py b/snakemake/process_data/scripts/count_oligos.py new file mode 100755 index 0000000000000000000000000000000000000000..0a8d330dbf70fd82d3ce978c16b536acf04bef73 --- /dev/null +++ b/snakemake/process_data/scripts/count_oligos.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# import needed (external) modules +# ----------------------------------------------------------------------------- + +from argparse import ArgumentParser, RawTextHelpFormatter +import sys +import os +from Bio import SeqIO + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Main function +# ----------------------------------------------------------------------------- + +def main(): + """ Main function """ + + __doc__ = "Find overepresented sequences from a SAM file." + __version__ = "0.1" + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter + ) + + parser.add_argument( + "--fastq", + dest="fastq", + help="Input fastq file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--oligos", + dest="oligos", + help="Input oligos file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--out", + dest="out", + help="Output file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--verbose", + action="store_true", + dest="verbose", + default=False, + required=False, + help="Be verbose" + ) + + parser.add_argument( + '--version', + action='version', + version=__version__ + ) + + # _________________________________________________________________________ + # ------------------------------------------------------------------------- + # get the arguments + # ------------------------------------------------------------------------- + try: + options = parser.parse_args() + except Exception: + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + if options.verbose: + sys.stdout.write("arsing {} {}".format(options.oligos, os.linesep)) + + oligos = {} + with open(options.oligos) as fp: + for line in fp: + oligo = line.strip() + if oligo not in oligos: + oligos[oligo] = 0 + + if options.verbose: + sys.stdout.write("Parsing {} {}".format(options.fastq, os.linesep)) + + for line in SeqIO.parse(options.fastq, "fastq"): + sequence = str(line.seq) + for oligo, number_of_oligos in oligos.items(): + if oligo in sequence: + oligos[oligo] += 1 + if options.verbose: + sys.stdout.write("Writing {} {}".format(options.out, os.linesep)) + + fh = open(options.out, "w") + for w in sorted(oligos, key=oligos.get, reverse=True): + fh.write(str(w) + "\t" + str(oligos[w]) + "\n") + fh.close() + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) diff --git a/snakemake/process_data/scripts/scanAdapter.sh b/snakemake/process_data/scripts/scanAdapter.sh deleted file mode 100755 index bd2e7c076547d8427af024d1d9a1f2437ccafdfa..0000000000000000000000000000000000000000 --- a/snakemake/process_data/scripts/scanAdapter.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Alexander Kanitz -# 04-JAN-2015 - -# Update: 24-JAN-2010 (Foivos Gypas) - -# [USAGE] -# scanAdapter.sh <FASTQ> <ADAPTER_FILE> (<NUMBER_OF_SEQUENCES>) - -# [DESCRIPTION] -# The script scans a part of a specified FASTQ file (gzipped or uncompressed) for the presence of -# adapter sequences present in a separate file and the location of the adapter file. Optionally, -# the number of sequences to scan (default: 10,000). - -seqFile=$1 -adapterFile=${2} -seqNumber=$((${3:-10000}*4)) - - -while read line; do - echo -n "${line}: " - grep -c "$line" <(zcat -f -- "$seqFile") -done < "$adapterFile" diff --git a/snakemake/process_data/scripts/xp-count-reads-ribseq.pl b/snakemake/process_data/scripts/xp-count-reads-ribseq.pl old mode 100644 new mode 100755