Skip to content
Snippets Groups Projects
Commit 0db81c30 authored by BIOPZ-Ataman Meric's avatar BIOPZ-Ataman Meric
Browse files

added a new script for counting reads

parent c915fcbc
No related branches found
No related tags found
1 merge request!2added a new script for counting reads
configfile: "config.yaml"
configfile: "config_hepb2.yaml"
#from snakemake.utils import listfiles
localrules: finish
......@@ -11,7 +11,7 @@ rule finish:
input:
oligos_counts = expand(os.path.join(config["output_dir"], "{sample}", "oligos_counts.txt"), sample=config["sample"]),
pdf = expand(os.path.join(config["output_dir"], "{sample}/read_length/read_length_histogram.pdf"), sample=config["sample"]),
counts = expand(os.path.join(config["output_dir"], "{sample}/counts.tsv"), sample=config["sample"]),
counts = expand(os.path.join(config["output_dir"], "{sample}/counts/counts.tsv"), sample=config["sample"]),
bai = expand(os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.a_site_profile.sorted.bam.bai"), sample=config["sample"]),
overrepresented_sequences_counts = expand(os.path.join(config["output_dir"], "{sample}/overepressented_sequences_other.txt"), sample=config["sample"])
......@@ -317,20 +317,28 @@ rule read_length_histogram:
rule count_reads:
input:
sam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sam"),
transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"]
bam = os.path.join(config["output_dir"], "{sample}/transcripts.mapped.unique.sorted.bam"),
transcript_id_gene_id_CDS = config["transcript_id_gene_id_CDS"],
p_site_offsets = os.path.join(
config["output_dir"],
"{sample}/p_site_offsets/alignment_offset.json")
output:
counts = os.path.join(config["output_dir"], "{sample}/counts.tsv")
counts = os.path.join(config["output_dir"], "{sample}/counts/counts.tsv"),
count = os.path.join(config["output_dir"], "{sample}/counts")
params:
outdir = os.path.join(config["output_dir"], "{sample}/counts")
log:
os.path.join(config["local_log"], "count_reads_{sample}.log")
threads: 1
singularity:
"docker://perl:5.24-slim"
"docker://fgypas/python_pysam:3.6.5_0.15.1"
shell:
"(perl scripts/xp-count-reads-ribseq.pl \
{input.sam} \
{input.transcript_id_gene_id_CDS} \
> {output.counts})"
"(python scripts/count_reads.py \
--bam {input.bam} \
--tsv {input.transcript_id_gene_id_CDS} \
--json {input.p_site_offsets} \
--outdir {params.outdir})"
################################################################################
### Determine P-site offset
......
#!/usr/bin/env python
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
from argparse import ArgumentParser, RawTextHelpFormatter
import sys
import os
import pysam
import json
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
    """Count reads mapped to transcripts and classify them by P-site region.

    Command-line tool.  Reads:
      * a coordinate-sorted, indexed BAM of reads aligned to transcripts,
      * a TSV whose columns are transcript_id, gene_id, cds_start, cds_end,
      * a JSON file mapping read length (as a string key) to P-site offset.

    For every plus-strand read whose length has an offset, the P-site
    position (reference_start + offset) is compared against the CDS
    coordinates and the read is assigned to the 5' UTR, CDS, or 3' UTR of
    its transcript.  Per-transcript counts plus the CDS length are written
    to <outdir>/counts.tsv.  Reads whose length has no offset are discarded.
    """
    description = ("Count reads mapped to transcripts using BAM file "
                   "and offsets from json file.")
    version = "0.1"

    parser = ArgumentParser(
        description=description,
        formatter_class=RawTextHelpFormatter
    )
    parser.add_argument(
        "--bam",
        dest="bam",
        help="Input bam file (sorted and indexed)",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--json",
        dest="json",
        help="Input json file",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--tsv",
        dest="tsv",
        help="Input tsv file",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--outdir",
        dest="outdir",
        help="Output directory",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        required=False,
        help="Be verbose"
    )
    parser.add_argument(
        '--version',
        action='version',
        version=version
    )

    # BUG FIX: the help-on-no-args check must run BEFORE parse_args();
    # with required options, parse_args() exits first and the check was
    # unreachable.  The old `try/except Exception` around parse_args() was
    # also dead code: argparse raises SystemExit, which `except Exception`
    # does not catch, and the handler fell through to an undefined
    # `options` (NameError).
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.verbose:
        sys.stdout.write("and parsing {} {}".format(options.bam, os.linesep))

    # Per-read-length P-site offsets; JSON object keys are strings, so the
    # table is keyed by str(read_length) below.
    with open(options.json, 'r') as json_file:
        offset_obj = json.load(json_file)

    # CDS coordinates per transcript (columns: id, gene_id, start, end).
    cds_start_ind = {}
    cds_end_ind = {}
    with open(options.tsv) as cds_coordinates:
        for line in cds_coordinates:
            sp_line = line.strip().split("\t")
            transcript_id = sp_line[0]
            cds_start_ind[transcript_id] = int(sp_line[2])
            cds_end_ind[transcript_id] = int(sp_line[3])

    # all_transcripts[transcript] = [5' UTR count, CDS count, 3' UTR count]
    all_transcripts = {}
    count_discarded = 0

    # Make sure the output directory exists before writing counts.tsv.
    os.makedirs(options.outdir, exist_ok=True)

    with pysam.AlignmentFile(options.bam, "rb") as bam:
        for read in bam.fetch():
            if read.is_reverse:
                # Reads mapped to the minus strand of a transcript are
                # antisense artifacts; ignore them.
                continue
            transcript = read.reference_name
            if transcript not in cds_start_ind:
                # BUG FIX: transcripts absent from the annotation TSV used
                # to raise a KeyError; count their reads as discarded.
                count_discarded += 1
                continue
            region_counts = all_transcripts.setdefault(transcript, [0, 0, 0])
            read_length = read.query_length  # modern alias of len(read.seq)
            offset = offset_obj.get(str(read_length))
            if offset is None:
                # No P-site offset calibrated for this read length.
                count_discarded += 1
                continue
            psite_pos = read.reference_start + offset
            if psite_pos < cds_start_ind[transcript]:
                region_counts[0] += 1    # upstream of CDS -> 5' UTR
            elif psite_pos <= cds_end_ind[transcript]:
                region_counts[1] += 1    # within CDS
            else:
                region_counts[2] += 1    # downstream of CDS -> 3' UTR

    if options.verbose:
        # Previously count_discarded was computed but never reported.
        sys.stdout.write(
            "discarded {} reads {}".format(count_discarded, os.linesep))

    with open(os.path.join(options.outdir, "counts.tsv"), 'w') as out:
        out.write("Transcript\t5' UTR\tCDS\t3' UTR\tCDS length\n")
        for trcpt_key, (utr5, cds, utr3) in all_transcripts.items():
            # Inclusive coordinate span of the CDS.
            gene_length = cds_end_ind[trcpt_key] - cds_start_ind[trcpt_key] + 1
            out.write("{}\t{}\t{}\t{}\t{}\n".format(
                trcpt_key, utr5, cds, utr3, gene_length))
# Script entry point: run main() and, if the user aborts with Ctrl-C,
# exit cleanly with status 0 instead of dumping a traceback.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt!" + os.linesep)
        sys.exit(0)
#!/usr/bin/env perl
# Count plus-strand reads per gene from a SAM file.
#
# Usage: count-reads.pl <alignments.sam> [annotation.tsv]
#
# The optional annotation file has one gene per line:
#   <gene_id> TAB <cds_start> TAB <cds_end>
# With an annotation, a read is counted when its leftmost position lies
# in [cds_start - 15, cds_end - 15].  Without one, a fixed 5' UTR length
# of 200 is assumed and reads starting at position >= 186 are counted.
#
# Output (stdout): "gene<TAB>counts" header followed by one line per gene.
use strict;
use warnings;

my %counts;    # gene_id -> read count
my %anno;      # gene_id -> [cds_start, cds_end]

open(my $sam, '<', $ARGV[0])
    or die "Cannot open SAM file '$ARGV[0]': $!\n";

# Load CDS start/end per gene when an annotation file is supplied.
my $have_anno = defined($ARGV[1]) ? 1 : 0;
if ($have_anno) {
    open(my $an, '<', $ARGV[1])
        or die "Cannot open annotation file '$ARGV[1]': $!\n";
    while (my $line = <$an>) {
        chomp $line;
        my @fields = split /\t/, $line;    # gene_id, cds_start, cds_end
        $anno{$fields[0]} = [$fields[1], $fields[2]];
    }
    close($an);
} else {
    print STDERR "Annotation file not specified, using len(5UTR)=200\n";
}

# Parse the SAM file and count reads near each gene's CDS.
while (my $line = <$sam>) {
    chomp $line;
    next if $line =~ /^@/;    # skip SAM header lines
    my @fields = split /\t/, $line;
    my ($flag, $gene_name, $pos) = @fields[1, 2, 3];
    next unless $flag == 0;   # only consider hits on the plus strand

    if ($have_anno) {
        # Guard against genes missing from the annotation: the original
        # dereferenced $anno{$gene_name} unconditionally.
        next unless exists $anno{$gene_name};
        # BUG FIX: the original had a bare block `} {` instead of `else`
        # after this branch, so annotated runs ALSO applied the fixed-UTR
        # rule below and double-counted reads with position >= 186.
        # (Also: logical && replaces the bitwise & used previously.)
        if (   $pos >= $anno{$gene_name}->[0] - 15
            && $pos <= $anno{$gene_name}->[1] - 15) {
            $counts{$gene_name}++;
        }
    } elsif ($pos >= 186) {   # consider that len(5UTR) == 200
        $counts{$gene_name}++;
    }
}
close($sam);

print "gene\tcounts\n";
for my $key (keys %counts) {
    print $key, "\t", $counts{$key}, "\n";
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment