Skip to content
Snippets Groups Projects
Commit 7cce44cb authored by BIOPZ-Gypas Foivos's avatar BIOPZ-Gypas Foivos
Browse files

Change script that counts oligos. Fix wrongly replaced annotation in config files.

parent 5dc673b5
No related branches found
No related tags found
No related merge requests found
......@@ -2,9 +2,9 @@
##############################################################################
### Annotation
##############################################################################
genome: "../annotation/Mus_musculus.GRCm38.dna_sm.primary_assembly.fa"
gtf: "../annotation/Mus_musculus.GRCm38.90.chr.gtf"
other_RNAs_sequence: "../annotation/mm10_rrnas.fa"
genome: "../annotation/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
gtf: "../annotation/Homo_sapiens.GRCh38.90.chr.gtf"
other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
##############################################################################
### Output and log directory
##############################################################################
......
configfile: "config.yaml"
#from snakemake.utils import listfiles
localrules: finish
localrules: finish, count_oligos
################################################################################
### Finish rule
......@@ -9,11 +9,13 @@ localrules: finish
rule finish:
input:
pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]),
bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]),
oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]),
overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"])
oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"])
#pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
#counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]),
#bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]),
#overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"])
################################################################################
### Count oligos
......@@ -22,13 +24,18 @@ rule count_oligos:
input:
reads = os.path.join(config["input_dir"], "{sample}" + config["input_reads_pattern"]),
oligos = config["oligos"],
script = "scripts/scanAdapter.sh"
script = "scripts/count_oligos.py"
output:
oligos_counts = os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt")
log:
os.path.join(config["local_log"], "count_oligos_{sample}.log")
singularity:
"docker://zavolab/python_htseq_biopython:3.6.5_0.10.0_1.71"
shell:
"({input.script} {input.reads} {input.oligos} > {output.oligos_counts}) &> {log}"
"(python {input.script} \
--fastq <(zcat {input.reads}) \
--oligos {input.oligos} \
--out {output.oligos_counts}) &> {log}"
#################################################################################
### Trim first bases (For Smarter-Kit ONLY)
......
......@@ -2,7 +2,7 @@
##############################################################################
### Annotation
##############################################################################
other_RNAs_sequence: "../annotation/mm10_rrnas.fa"
other_RNAs_sequence: "../annotation/txome_rRNAs_joao.fa"
other_RNAs_index: "../prepare_annotation/results/other_RNAs_sequence.idx"
transcripts_sequence: "../prepare_annotation/results/longest_pc_transcript_per_gene.fa"
transcripts_index: "../prepare_annotation/results/longest_pc_transcript_per_gene.idx"
......@@ -19,9 +19,6 @@
##############################################################################
input_dir: "samples"
input_reads_pattern: ".fastq.gz"
sample: ["BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1", "BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1", "BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1", "BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1"]
BSSE_QGF_93569_HNFTYBGX5_1_Sample6_6F2R6_R555W_GAATTCG_S4_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
BSSE_QGF_93566_HNFTYBGX5_1_Sample3_3F2R3_R555W_CGCTCAT_S3_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
BSSE_QGF_93564_HNFTYBGX5_1_Sample1_1F2R1_wt_ATTACTC_S1_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
BSSE_QGF_93565_HNFTYBGX5_1_Sample2_2F2R2_wt_TCCGGAG_S2_R1_001_MM_1: {adapter: AAAAAAAAAA, minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
sample: ["example"]
example: {adapter: "GATCGGAAGAGCACA", minimum_quality: 20, quality_type: 33, trim_bases_5_prime: 5}
...
......@@ -10,4 +10,5 @@ snakemake \
-p \
--rerun-incomplete \
--use-singularity \
--singularity-args "--bind ${PWD},${PWD}/../"
--singularity-args "--no-home --bind ${PWD},${PWD}/../,${PWD}/../annotation,${PWD}/scripts" \
#!/usr/bin/env python
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
from argparse import ArgumentParser, RawTextHelpFormatter
import sys
import os
from Bio import SeqIO
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
""" Main function """
__doc__ = "Find overepresented sequences from a SAM file."
__version__ = "0.1"
parser = ArgumentParser(
description=__doc__,
formatter_class=RawTextHelpFormatter
)
parser.add_argument(
"--fastq",
dest="fastq",
help="Input fastq file",
required=True,
metavar="FILE"
)
parser.add_argument(
"--oligos",
dest="oligos",
help="Input oligos file",
required=True,
metavar="FILE"
)
parser.add_argument(
"--out",
dest="out",
help="Output file",
required=True,
metavar="FILE"
)
parser.add_argument(
"--verbose",
action="store_true",
dest="verbose",
default=False,
required=False,
help="Be verbose"
)
parser.add_argument(
'--version',
action='version',
version=__version__
)
# _________________________________________________________________________
# -------------------------------------------------------------------------
# get the arguments
# -------------------------------------------------------------------------
try:
options = parser.parse_args()
except Exception:
parser.print_help()
if len(sys.argv) == 1:
parser.print_help()
sys.exit(1)
if options.verbose:
sys.stdout.write("arsing {} {}".format(options.oligos, os.linesep))
oligos = {}
with open(options.oligos) as fp:
for line in fp:
oligo = line.strip()
if oligo not in oligos:
oligos[oligo] = 0
if options.verbose:
sys.stdout.write("Parsing {} {}".format(options.fastq, os.linesep))
for line in SeqIO.parse(options.fastq, "fastq"):
sequence = str(line.seq)
for oligo, number_of_oligos in oligos.items():
if oligo in sequence:
oligos[oligo] += 1
if options.verbose:
sys.stdout.write("Writing {} {}".format(options.out, os.linesep))
fh = open(options.out, "w")
for w in sorted(oligos, key=oligos.get, reverse=True):
fh.write(str(w) + "\t" + str(oligos[w]) + "\n")
fh.close()
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrups
# -----------------------------------------------------------------------------
if __name__ == '__main__':
try:
main()
except KeyboardInterrupt:
sys.stderr.write("User interrupt!" + os.linesep)
sys.exit(0)
#!/bin/bash
# Alexander Kanitz
# 04-JAN-2015
# Update: 24-JAN-2010 (Foivos Gypas)
# [USAGE]
# scanAdapter.sh <FASTQ> <ADAPTER_FILE> (<NUMBER_OF_SEQUENCES>)
# [DESCRIPTION]
# The script scans a part of a specified FASTQ file (gzipped or uncompressed) for the presence of
# adapter sequences present in a separate file and the location of the adapter file. Optionally,
# the number of sequences to scan (default: 10,000).
seqFile=$1
adapterFile=${2}
seqNumber=$((${3:-10000}*4))
while read line; do
echo -n "${line}: "
grep -c "$line" <(zcat -f -- "$seqFile")
done < "$adapterFile"
File mode changed from 100644 to 100755
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment