Commit 097957da authored by BIOPZ-Gypas Foivos


Addition of create_tab_delimited_CDS_file.py to the prepare annotation pipeline. Added a Perl script for counting Ribo-seq reads; it still needs to be added to the process data pipeline.
parent 23eace92
configfile: "config.yaml"

-localrules: create_output_and_log_directories, finish
+localrules: create_output_and_log_directories, create_tab_delimited_CDS_file, finish

#################################################################################
### Finish rule

@@ -9,7 +9,8 @@ localrules: create_output_and_log_directories, finish
rule finish:
    input:
        idx_other = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"),
-        idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx")
+        idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx"),
+        tsv = os.path.join(config["output_dir"], "transcript_id_gene_id_CDS.tsv")

#################################################################################
### Create output and log directories

@@ -91,6 +92,29 @@ rule extract_transcript_sequences:
            -g {input.genome} \
            -w {output.transcripts}) &> {log}"

+#################################################################################
+### Generate transcript id, gene id, CDS table
+#################################################################################
+rule create_tab_delimited_CDS_file:
+    input:
+        gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf"),
+        transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa"),
+        script = os.path.join(config["scripts"], "create_tab_delimited_CDS_file.py")
+    output:
+        tsv = os.path.join(config["output_dir"], "transcript_id_gene_id_CDS.tsv")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "create_tab_delimited_CDS_file.log")
+    log:
+        os.path.join(config["local_log"], "create_tab_delimited_CDS_file.log")
+    singularity:
+        "docker://zavolab/python_htseq_biopython:3.6.5_0.10.0_1.71"
+    shell:
+        "({input.script} \
+        --gtf {input.gtf} \
+        --fasta {input.transcripts} \
+        --out {output.tsv}) &> {log}"

#################################################################################
### Generate segemehl index for transcripts
#################################################################################
...
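The new rule runs create_tab_delimited_CDS_file.py (listed below) on the selected GTF and the transcript FASTA written by gffread, and produces a tab-separated table with one row per selected transcript: transcript id, gene id, CDS start and CDS end as reported in the gffread FASTA headers. No header row is written. A hypothetical example row (IDs and coordinates invented for illustration), with columns separated by tabs:

ENST00000641515    ENSG00000186092    61    2618

The new script follows.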
#!/usr/bin/env python
__version__ = "0.1"
__author__ = "Foivos Gypas"
__contact__ = "foivos.gypas@unibas.ch"
__doc__ = "Parse a GTF file and a transcript sequence file (as generated " + \
          "by gffread) and create a tsv file with the following columns: " + \
          "transcript id, gene id, CDS start, CDS end."
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
import sys
import os
import HTSeq
from Bio import SeqIO
from argparse import ArgumentParser, RawTextHelpFormatter
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Custom functions
# -----------------------------------------------------------------------------
def split_gff_fasta_header(header):
    """Extract transcript id, CDS start and CDS end from a gffread FASTA description."""
    fields = header.strip().split(" ")
    transcript_id = fields[0].strip("'")
    # The third field is expected to look like 'CDS=<start>-<end>'.
    CDS_start, CDS_stop = fields[2].replace("CDS=", "", 1).split("-")
    return transcript_id, CDS_start, CDS_stop
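# Example (hypothetical IDs; the exact gffread header layout is an assumption
# based on the field indices used above): for a FASTA description such as
#     ENST00000641515 gene=ENSG00000186092 CDS=61-2618
# this function returns ("ENST00000641515", "61", "2618").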
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
    """ Main function """
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=RawTextHelpFormatter
    )
    parser.add_argument(
        "--gtf",
        dest="gtf",
        help="Input GTF file in ENSEMBL format",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--fasta",
        dest="fasta",
        help="Selected transcripts fasta file",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--out",
        dest="out",
        help="Output file",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        required=False,
        help="Verbose"
    )
    parser.add_argument(
        '--version',
        action='version',
        version=__version__
    )
    # _________________________________________________________________________
    # -------------------------------------------------------------------------
    # get the arguments
    # -------------------------------------------------------------------------
    try:
        options = parser.parse_args()
    except Exception:
        parser.print_help()
        sys.exit(1)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    if options.verbose:
        sys.stdout.write(
            "Parsing gtf file: {} {}".format(
                options.gtf,
                os.linesep
            )
        )
    # dictionary with keys: transcript_id and values: gene_id
    transcripts = {}
    # parse the gtf file and map each transcript id to its gene id
    gtf_file = HTSeq.GFF_Reader(options.gtf)
    for gtf_line in gtf_file:
        if gtf_line.type == 'transcript':
            transcript_id = gtf_line.attr['transcript_id']
            gene_id = gtf_line.attr['gene_id']
            if transcript_id not in transcripts:
                transcripts[transcript_id] = gene_id
    # parse the fasta file and write one line per selected transcript:
    # transcript id, gene id, CDS start, CDS end
    w = open(options.out, 'w')
    for entry in SeqIO.parse(options.fasta, 'fasta'):
        transcript_id, CDS_start, CDS_stop = \
            split_gff_fasta_header(entry.description)
        if transcript_id in transcripts:
            w.write("\t".join([transcript_id,
                               transcripts[transcript_id],
                               CDS_start,
                               CDS_stop]) + os.linesep)
    w.close()
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrupts
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt!" + os.linesep)
        sys.exit(0)
#!/usr/bin/env perl
# Count Ribo-seq reads per transcript/gene from a SAM file (first argument).
# Optional second argument: a tab-delimited annotation file giving the beginning
# and end of every CDS (columns: id, CDS start, CDS end).
use strict;
use warnings;
use Data::Dumper;

my %hash = ();
my %anno = ();
my $readsSum = 0;

open(SAM, '<', $ARGV[0]) or die "Cannot open SAM file '$ARGV[0]': $!\n";

# Annotation file included with beginning and end of every CDS
my $annoFile = (defined($ARGV[1])) ? 1 : 0;
if ($annoFile) {
    open(AN, '<', $ARGV[1]) or die "Cannot open annotation file '$ARGV[1]': $!\n";
    while (my $line = <AN>) {
        chomp $line;
        my @F = split /\t/, $line;   # annotation line: id, CDS start, CDS end
        my @a = ($F[1], $F[2]);
        $anno{$F[0]} = \@a;
    }
    close(AN);
} else {
    print STDERR "Annotation file not specified, using len(5UTR)=200\n";
}
#print Dumper(\%anno);

# Parse SAM file and count reads
while (my $line = <SAM>) {
    chomp $line;
    next if $line =~ /^@/;           # discard headers/comments
    my @F = split /\t/, $line;       # SAM line
    my $gene_name = $F[2];
    if ($F[1] == 0) {                # only consider hits on the plus strand
        if ($annoFile) {
            # close to the annotated CDS start and stop
            if (exists $anno{$gene_name}
                && $F[3] >= $anno{$gene_name}->[0] - 15
                && $F[3] <= $anno{$gene_name}->[1] - 15) {
                $hash{$gene_name}++;
            }
        } else {
            if ($F[3] >= 186) {      # consider that len(5UTR) == 200
                $hash{$gene_name}++;
            }
        }
    }
}
close(SAM);

#print Dumper(\%hash);
print "gene\tcounts\n";
foreach my $key (keys %hash) {
    print $key, "\t", $hash{$key}, "\n";
}
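Since the commit message notes that the read-counting step still has to be wired into the Python-based process data pipeline, the following is a minimal Python sketch of the same counting criterion as the Perl script above. It is not part of the commit; count_reads and its parameter names are placeholders, and it assumes an uncompressed SAM file plus the optional tab-delimited annotation described above (id, CDS start, CDS end).

#!/usr/bin/env python
"""Sketch only: re-expresses the counting rule of the Perl script above."""

import sys
from collections import defaultdict


def count_reads(sam_path, anno_path=None, offset=15, default_start=186):
    """Count plus-strand reads whose leftmost position lies within
    [CDS_start - offset, CDS_end - offset]; without an annotation file,
    count reads starting at or after position default_start
    (i.e. assuming a 200 nt 5' UTR, as in the Perl script)."""
    anno = {}
    if anno_path is not None:
        with open(anno_path) as handle:
            for line in handle:
                fields = line.rstrip("\n").split("\t")
                anno[fields[0]] = (int(fields[1]), int(fields[2]))

    counts = defaultdict(int)
    with open(sam_path) as handle:
        for line in handle:
            if line.startswith("@"):
                continue  # skip SAM header lines
            fields = line.rstrip("\n").split("\t")
            flag, ref, pos = int(fields[1]), fields[2], int(fields[3])
            if flag != 0:
                continue  # only consider hits on the plus strand
            if anno:
                start, end = anno.get(ref, (None, None))
                if start is not None and start - offset <= pos <= end - offset:
                    counts[ref] += 1
            elif pos >= default_start:
                counts[ref] += 1
    return counts


if __name__ == "__main__":
    counts = count_reads(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)
    print("gene\tcounts")
    for gene, n in counts.items():
        print("{}\t{}".format(gene, n))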