diff --git a/snakemake/process_data/Snakefile b/snakemake/process_data/Snakefile index 7df1d42256bc6961c963c193c680bf2b66d9eaa8..a7e8ae0d5ed44ebfa9a2ae7b397773202e4acecf 100644 --- a/snakemake/process_data/Snakefile +++ b/snakemake/process_data/Snakefile @@ -1,4 +1,4 @@ -configfile: "config.yaml" +configfile: "config_hepb2.yaml" #from snakemake.utils import listfiles localrules: finish @@ -11,7 +11,7 @@ rule finish: input: oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]), pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]), - counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]), + counts = expand(os.path.join(config["output_dir"], "{sample}/counts/counts.tsv"), sample=config["sample"]), bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]), overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"]) @@ -317,20 +317,28 @@ rule read_length_histogram: rule count_reads: input: - sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sam"), - transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"] + bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"), + transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"], + p_site_offsets = os.path.join( + config["output_dir"], + "{sample}/p_site_offsets/alignment_offset.json") output: - counts = os.path.join(config["output_dir"], "{sample}/counts.tsv") + counts = os.path.join(config["output_dir"], "{sample}/counts/counts.tsv"), + count = os.path.join(config["output_dir"], "{sample}/counts") + params: + outdir = os.path.join(config["output_dir"], "{sample}/counts") + log: os.path.join(config["local_log"], "count_reads_{sample}.log") threads: 1 singularity: - "docker://perl:5.24-slim" + "docker://fgypas/python_pysam:3.6.5_0.15.1" shell: - "(perl scripts/xp-count-reads-ribseq.pl \ - {input.sam} \ - {input.transcript_id_gene_id_CDS} \ - > {output.counts})" + "(python scripts/count_reads.py \ + --bam {input.bam} \ + --tsv {input.transcript_id_gene_id_CDS} \ + --json {input.p_site_offsets} \ + --outdir {params.outdir})" ################################################################################ ### Determine P-site offset diff --git a/snakemake/process_data/scripts/count_reads.py b/snakemake/process_data/scripts/count_reads.py new file mode 100644 index 0000000000000000000000000000000000000000..4fc2fcc96645e93699d21f1c5d54356bcc170065 --- /dev/null +++ b/snakemake/process_data/scripts/count_reads.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# import needed (external) modules +# ----------------------------------------------------------------------------- + +from argparse import ArgumentParser, RawTextHelpFormatter +import sys +import os +import pysam +import json + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Main function +# ----------------------------------------------------------------------------- + +def main(): + """ Main function """ + + __doc__ = "Count reads mapped to transcripts using BAM file and offsets from json file." + __version__ = "0.1" + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter + ) + + parser.add_argument( + "--bam", + dest="bam", + help="Input bam file (sorted and indexed)", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--json", + dest="json", + help="Input json file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--tsv", + dest="tsv", + help="Input tsv file", + required=True, + metavar="FILE" + ) + parser.add_argument( + "--outdir", + dest="outdir", + help="Output directory", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--verbose", + action="store_true", + dest="verbose", + default=False, + required=False, + help="Be verbose" + ) + + parser.add_argument( + '--version', + action='version', + version=__version__ + ) + + # _________________________________________________________________________ + # ------------------------------------------------------------------------- + # get the arguments + # ------------------------------------------------------------------------- + try: + options = parser.parse_args() + except Exception: + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + if options.verbose: + sys.stdout.write("and parsing {} {}".format(options.bam, os.linesep)) + + with open(options.json, 'r') as json_file: + data = json_file.read() + + cds_start_ind = {} + cds_end_ind = {} + + with open(options.tsv) as cds_coordinates: + for line in cds_coordinates: + sp_line = line.strip().split("\t") + transcript_id = sp_line[0] + cds_start_ind[transcript_id] = int(sp_line[2]) + cds_end_ind[transcript_id] = int(sp_line[3]) + + count_discarded = 0 + all_transcripts = {} + offset_obj = json.loads(data) + bam = pysam.AlignmentFile(options.bam, "rb") + for read in bam.fetch(): + if read.is_reverse: + continue + transcript = read.reference_name + if transcript not in all_transcripts: + all_transcripts[transcript] = [0, 0, 0] + read_length = len(read.seq) + if str(read_length) in offset_obj: + read_start = read.reference_start + psite_pos = (read_start + offset_obj[str(read_length)]) + if psite_pos >= cds_start_ind[transcript] and psite_pos <= cds_end_ind[transcript]: + all_transcripts[transcript][1] = all_transcripts[transcript][1] + 1 + elif psite_pos < cds_start_ind[transcript]: + all_transcripts[transcript][0] = all_transcripts[transcript][0] + 1 + else: + all_transcripts[transcript][2] = all_transcripts[transcript][2] + 1 + else: + count_discarded = count_discarded + 1 + + w = open(os.path.join(options.outdir, "counts.tsv"), 'w') + w.write("Transcript" + "\t" + "5' UTR" + "\t" + "CDS" + "\t" + "3' UTR" + "\t" + "CDS length" + "\n") + for trcpt_key in all_transcripts: + gene_length = cds_end_ind[trcpt_key]-cds_start_ind[trcpt_key]+1 + w.write(trcpt_key + "\t" + str(all_transcripts[trcpt_key][0]) + "\t" + str(all_transcripts[trcpt_key][1]) + "\t" + str(all_transcripts[trcpt_key][2]) + "\t" + str(gene_length) + "\n") + bam.close() + w.close() + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) diff --git a/snakemake/process_data/scripts/xp-count-reads-ribseq.pl b/snakemake/process_data/scripts/xp-count-reads-ribseq.pl deleted file mode 100755 index ca4a7c9b77adf9d8ee554417d8144092b6c324b3..0000000000000000000000000000000000000000 --- a/snakemake/process_data/scripts/xp-count-reads-ribseq.pl +++ /dev/null @@ -1,52 +0,0 @@ -use Data::Dumper; - -my %hash = (); -my %anno = (); -my $readsSum = 0; - -open(SAM,$ARGV[0]); - -#Annotation file included with beginning and end of every CDS -my $annoFile = (defined($ARGV[1])) ? 1 : 0; - -if($annoFile){ - open(AN,$ARGV[1]); - while( $line = <AN> ){ - chomp $line; - my @F = split /\t/ , $line; #annotation line - my @a = ($F[1],$F[2]); - $anno{$F[0]} = \@a; - } -} else { - print STDERR "Annotation file not specified, using len(5UTR)=200\n"; -} - -#print Dumper(%anno); - -#Parse SAM file and count reads -while( $line = <SAM> ){ - - chomp $line; - next if $line =~ /^@/; #discard headers/comments - - my @F = split /\t/ , $line; #sam line - - my $gene_name = $F[2]; - if($F[1] == 0) { #only consider hits in plus strand - if($annoFile){ - if(($F[3] >= $anno{$gene_name}->[0]-15) & ($F[3] <= $anno{$gene_name}->[1]-15)) { #close to start and stop - $hash{$gene_name}++; - } - } { - if($F[3] >= 186) { #consider that len(5UTR)==200 - $hash{$gene_name}++; - } - } - } -} - -#print Dumper(\%hash); -print "gene\tcounts\n"; -for $key (keys %hash){ - print $key,"\t",$hash{$key},"\n"; -}