Commit 097957da authored by BIOPZ-Gypas Foivos


Addition of create_tab_delimited_CDS_file.py to the prepare annotation pipeline. Added a Perl script for counting Ribo-seq reads; it still needs to be added to the process data pipeline.
parent 23eace92
configfile: "config.yaml"

-localrules: create_output_and_log_directories, finish
+localrules: create_output_and_log_directories, create_tab_delimited_CDS_file, finish

#################################################################################
### Finish rule

@@ -9,7 +9,8 @@ localrules: create_output_and_log_directories, finish
rule finish:
    input:
        idx_other = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"),
-        idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx")
+        idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx"),
+        tsv = os.path.join(config["output_dir"], "transcript_id_gene_id_CDS.tsv")

#################################################################################
### Create output and log directories

@@ -91,6 +92,29 @@ rule extract_transcript_sequences:
            -g {input.genome} \
            -w {output.transcripts}) &> {log}"

+#################################################################################
+### Generate transcript id, gene id, CDS table
+#################################################################################
+rule create_tab_delimited_CDS_file:
+    input:
+        gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf"),
+        transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa"),
+        script = os.path.join(config["scripts"], "create_tab_delimited_CDS_file.py")
+    output:
+        tsv = os.path.join(config["output_dir"], "transcript_id_gene_id_CDS.tsv")
+    params:
+        cluster_log = os.path.join(config["cluster_log"], "create_tab_delimited_CDS_file.log")
+    log:
+        os.path.join(config["local_log"], "create_tab_delimited_CDS_file.log")
+    singularity:
+        "docker://zavolab/python_htseq_biopython:3.6.5_0.10.0_1.71"
+    shell:
+        "({input.script} \
+        --gtf {input.gtf} \
+        --fasta {input.transcripts} \
+        --out {output.tsv}) &> {log}"

#################################################################################
### Generate segemehl index for transcripts
#################################################################################
...
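The new rule runs create_tab_delimited_CDS_file.py (listed below) on the selected GTF and the transcript FASTA written by gffread, and produces a tab-separated table with one row per selected transcript: transcript id, gene id, CDS start and CDS end as reported in the gffread FASTA headers. No header row is written. A hypothetical example row (IDs and coordinates invented for illustration), with columns separated by tabs:

ENST00000641515    ENSG00000186092    61    2618

The new script follows.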
#!/usr/bin/env python
__version__ = "0.1"
__author__ = "Foivos Gypas"
__contact__ = "foivos.gypas@unibas.ch"
__doc__ = "Parse a GTF file and a transcript sequence file (as generated " + \
          "by gffread) and create a tsv file with the following columns: " + \
          "transcript id, gene id, CDS start, CDS end."
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# import needed (external) modules
# -----------------------------------------------------------------------------
import sys
import os
import HTSeq
from Bio import SeqIO
from argparse import ArgumentParser, RawTextHelpFormatter
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Custom functions
# -----------------------------------------------------------------------------
def split_gff_fasta_header(header):
    """Extract transcript id, CDS start and CDS end from a gffread FASTA description."""
    fields = header.strip().split(" ")
    transcript_id = fields[0].strip("'")
    # The third field is expected to look like 'CDS=<start>-<end>'.
    CDS_start, CDS_stop = fields[2].replace("CDS=", "", 1).split("-")
    return transcript_id, CDS_start, CDS_stop
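# Example (hypothetical IDs; the exact gffread header layout is an assumption
# based on the field indices used above): for a FASTA description such as
#     ENST00000641515 gene=ENSG00000186092 CDS=61-2618
# this function returns ("ENST00000641515", "61", "2618").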
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Main function
# -----------------------------------------------------------------------------
def main():
    """ Main function """
    parser = ArgumentParser(
        description=__doc__,
        formatter_class=RawTextHelpFormatter
    )
    parser.add_argument(
        "--gtf",
        dest="gtf",
        help="Input GTF file in ENSEMBL format",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--fasta",
        dest="fasta",
        help="Selected transcripts fasta file",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--out",
        dest="out",
        help="Output file",
        required=True,
        metavar="FILE"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        dest="verbose",
        default=False,
        required=False,
        help="Verbose"
    )
    parser.add_argument(
        '--version',
        action='version',
        version=__version__
    )
    # _________________________________________________________________________
    # -------------------------------------------------------------------------
    # get the arguments
    # -------------------------------------------------------------------------
    try:
        options = parser.parse_args()
    except Exception:
        parser.print_help()
        sys.exit(1)
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)
    if options.verbose:
        sys.stdout.write(
            "Parsing gtf file: {} {}".format(
                options.gtf,
                os.linesep
            )
        )
    # dictionary with keys: transcript_id and values: gene_id
    transcripts = {}
    # parse the gtf file and map each transcript id to its gene id
    gtf_file = HTSeq.GFF_Reader(options.gtf)
    for gtf_line in gtf_file:
        if gtf_line.type == 'transcript':
            transcript_id = gtf_line.attr['transcript_id']
            gene_id = gtf_line.attr['gene_id']
            if transcript_id not in transcripts:
                transcripts[transcript_id] = gene_id
    # parse the fasta file and write one line per selected transcript:
    # transcript id, gene id, CDS start, CDS end
    w = open(options.out, 'w')
    for entry in SeqIO.parse(options.fasta, 'fasta'):
        transcript_id, CDS_start, CDS_stop = \
            split_gff_fasta_header(entry.description)
        if transcript_id in transcripts:
            w.write("\t".join([transcript_id,
                               transcripts[transcript_id],
                               CDS_start,
                               CDS_stop]) + os.linesep)
    w.close()
# _____________________________________________________________________________
# -----------------------------------------------------------------------------
# Call the Main function and catch Keyboard interrupts
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        sys.stderr.write("User interrupt!" + os.linesep)
        sys.exit(0)
#!/usr/bin/env perl
# Count Ribo-seq reads per transcript/gene from a SAM file (first argument).
# Optional second argument: a tab-delimited annotation file giving the beginning
# and end of every CDS (columns: id, CDS start, CDS end).
use strict;
use warnings;
use Data::Dumper;

my %hash = ();
my %anno = ();
my $readsSum = 0;

open(SAM, '<', $ARGV[0]) or die "Cannot open SAM file '$ARGV[0]': $!\n";

# Annotation file included with beginning and end of every CDS
my $annoFile = (defined($ARGV[1])) ? 1 : 0;
if ($annoFile) {
    open(AN, '<', $ARGV[1]) or die "Cannot open annotation file '$ARGV[1]': $!\n";
    while (my $line = <AN>) {
        chomp $line;
        my @F = split /\t/, $line;   # annotation line: id, CDS start, CDS end
        my @a = ($F[1], $F[2]);
        $anno{$F[0]} = \@a;
    }
    close(AN);
} else {
    print STDERR "Annotation file not specified, using len(5UTR)=200\n";
}
#print Dumper(\%anno);

# Parse SAM file and count reads
while (my $line = <SAM>) {
    chomp $line;
    next if $line =~ /^@/;           # discard headers/comments
    my @F = split /\t/, $line;       # SAM line
    my $gene_name = $F[2];
    if ($F[1] == 0) {                # only consider hits on the plus strand
        if ($annoFile) {
            # close to the annotated CDS start and stop
            if (exists $anno{$gene_name}
                && $F[3] >= $anno{$gene_name}->[0] - 15
                && $F[3] <= $anno{$gene_name}->[1] - 15) {
                $hash{$gene_name}++;
            }
        } else {
            if ($F[3] >= 186) {      # consider that len(5UTR) == 200
                $hash{$gene_name}++;
            }
        }
    }
}
close(SAM);

#print Dumper(\%hash);
print "gene\tcounts\n";
foreach my $key (keys %hash) {
    print $key, "\t", $hash{$key}, "\n";
}
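Since the commit message notes that the read-counting step still has to be wired into the Python-based process data pipeline, the following is a minimal Python sketch of the same counting criterion as the Perl script above. It is not part of the commit; count_reads and its parameter names are placeholders, and it assumes an uncompressed SAM file plus the optional tab-delimited annotation described above (id, CDS start, CDS end).

#!/usr/bin/env python
"""Sketch only: re-expresses the counting rule of the Perl script above."""

import sys
from collections import defaultdict


def count_reads(sam_path, anno_path=None, offset=15, default_start=186):
    """Count plus-strand reads whose leftmost position lies within
    [CDS_start - offset, CDS_end - offset]; without an annotation file,
    count reads starting at or after position default_start
    (i.e. assuming a 200 nt 5' UTR, as in the Perl script)."""
    anno = {}
    if anno_path is not None:
        with open(anno_path) as handle:
            for line in handle:
                fields = line.rstrip("\n").split("\t")
                anno[fields[0]] = (int(fields[1]), int(fields[2]))

    counts = defaultdict(int)
    with open(sam_path) as handle:
        for line in handle:
            if line.startswith("@"):
                continue  # skip SAM header lines
            fields = line.rstrip("\n").split("\t")
            flag, ref, pos = int(fields[1]), fields[2], int(fields[3])
            if flag != 0:
                continue  # only consider hits on the plus strand
            if anno:
                start, end = anno.get(ref, (None, None))
                if start is not None and start - offset <= pos <= end - offset:
                    counts[ref] += 1
            elif pos >= default_start:
                counts[ref] += 1
    return counts


if __name__ == "__main__":
    counts = count_reads(sys.argv[1], sys.argv[2] if len(sys.argv) > 2 else None)
    print("gene\tcounts")
    for gene, n in counts.items():
        print("{}\t{}".format(gene, n))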