diff --git a/snakemake/prepare_annotation/Snakefile b/snakemake/prepare_annotation/Snakefile index 23deb5da77820088139d5c2647e7fab8bac23551..7240fe26b0a4d849d2cf87b9d1858b2cc10ab9eb 100644 --- a/snakemake/prepare_annotation/Snakefile +++ b/snakemake/prepare_annotation/Snakefile @@ -1,6 +1,6 @@ configfile: "config.yaml" -localrules: create_output_and_log_directories, finish +localrules: create_output_and_log_directories, create_tab_delimited_CDS_file, finish ################################################################################# ### Finish rule @@ -9,7 +9,8 @@ localrules: create_output_and_log_directories, finish rule finish: input: idx_other = os.path.join(config["output_dir"], "other_RNAs_sequence.idx"), - idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx") + idx_transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.idx"), + tsv = os.path.join(config["output_dir"], "transcript_id_gene_id_CDS.tsv") ################################################################################# ### Create output and log directories @@ -91,6 +92,29 @@ rule extract_transcript_sequences: -g {input.genome} \ -w {output.transcripts}) &> {log}" +################################################################################# +### Generate transcript id, gene id, CDS table +################################################################################# + +rule create_tab_delimited_CDS_file: + input: + gtf = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.gtf"), + transcripts = os.path.join(config["output_dir"], "longest_pc_transcript_per_gene.fa"), + script = os.path.join(config["scripts"], "create_tab_delimited_CDS_file.py") + output: + tsv = os.path.join(config["output_dir"], "transcript_id_gene_id_CDS.tsv") + params: + cluster_log = os.path.join(config["cluster_log"], "create_tab_delimited_CDS_file.log") + log: + os.path.join(config["local_log"], "create_tab_delimited_CDS_file.log") + singularity: + "docker://zavolab/python_htseq_biopython:3.6.5_0.10.0_1.71" + shell: + "({input.script} \ + --gtf {input.gtf} \ + --fasta {input.transcripts} \ + --out {output.tsv}) &> {log}" + ################################################################################# ### Generate segemehl index for transcripts ################################################################################# diff --git a/snakemake/prepare_annotation/scripts/create_tab_delimited_CDS_file.py b/snakemake/prepare_annotation/scripts/create_tab_delimited_CDS_file.py new file mode 100755 index 0000000000000000000000000000000000000000..1801da9c91d99c65dac1a2d1cacd324d10dfc099 --- /dev/null +++ b/snakemake/prepare_annotation/scripts/create_tab_delimited_CDS_file.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python + +__version__ = "0.1" +__author__ = "Foivos Gypas" +__contact__ = "foivos.gypas@unibas.ch" +__doc__ = "Parse a gtf file and a transcripts sequence file (as generated " + \ + "from gffread and create a tsv file containing the following: " + \ + "transcript id, gene id, start codon, stop codon)" + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# import needed (external) modules +# ----------------------------------------------------------------------------- + +import sys +import os +import HTSeq +from Bio import SeqIO +from argparse import ArgumentParser, RawTextHelpFormatter + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Custom functions +# ----------------------------------------------------------------------------- +def split_gff_fasta_header(header): + + header = header.strip().split(" ") + + transcript_id = header[0].strip("\'") + CDS = header[2].strip("CDS=").split("-") + + CDS_start = CDS[0] + CDS_stop = CDS[1] + + return transcript_id, CDS_start, CDS_stop + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Main function +# ----------------------------------------------------------------------------- + + +def main(): + """ Main function """ + + parser = ArgumentParser( + description=__doc__, + formatter_class=RawTextHelpFormatter + ) + + parser.add_argument( + "--gtf", + dest="gtf", + help="Input GTF file in ENSEMBL format", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--fasta", + dest="fasta", + help="Selected transcripts fasta file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--out", + dest="out", + help="Output file", + required=True, + metavar="FILE" + ) + + parser.add_argument( + "--verbose", + action="store_true", + dest="verbose", + default=False, + required=False, + help="Verbose" + ) + + parser.add_argument( + '--version', + action='version', + version=__version__ + ) + + # _________________________________________________________________________ + # ------------------------------------------------------------------------- + # get the arguments + # ------------------------------------------------------------------------- + try: + options = parser.parse_args() + except Exception: + parser.print_help() + + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + + if options.verbose: + sys.stdout.write( + "Parsing gtf file: {} {}".format( + options.gtf, + os.linesep + ) + ) + + + # dictionary of keys: transcript_id values: gene_id + transcripts = {} + + # parse gtf file + gtf_file = HTSeq.GFF_Reader(options.gtf) + + for gtf_line in gtf_file: + + if gtf_line.type == 'transcript': + transcript_id = gtf_line.attr['transcript_id'] + gene_id = gtf_line.attr['gene_id'] + + if transcript_id not in transcripts.keys(): + transcripts[transcript_id] = gene_id + + # parse fasta file + w = open(options.out, 'w') + for entry in SeqIO.parse(options.fasta, 'fasta'): + transcript_id, CDS_start, CDS_stop = split_gff_fasta_header(entry.description) + if transcript_id in transcripts.keys(): + w.write("\t".join([transcript_id, + transcripts[transcript_id], + CDS_start, + CDS_stop + os.linesep])) + w.close() + + + +# _____________________________________________________________________________ +# ----------------------------------------------------------------------------- +# Call the Main function and catch Keyboard interrups +# ----------------------------------------------------------------------------- + + +if __name__ == '__main__': + try: + main() + except KeyboardInterrupt: + sys.stderr.write("User interrupt!" + os.linesep) + sys.exit(0) diff --git a/snakemake/process_data/scripts/xp-count-reads-ribseq.pl b/snakemake/process_data/scripts/xp-count-reads-ribseq.pl new file mode 100644 index 0000000000000000000000000000000000000000..ca4a7c9b77adf9d8ee554417d8144092b6c324b3 --- /dev/null +++ b/snakemake/process_data/scripts/xp-count-reads-ribseq.pl @@ -0,0 +1,52 @@ +use Data::Dumper; + +my %hash = (); +my %anno = (); +my $readsSum = 0; + +open(SAM,$ARGV[0]); + +#Annotation file included with beginning and end of every CDS +my $annoFile = (defined($ARGV[1])) ? 1 : 0; + +if($annoFile){ + open(AN,$ARGV[1]); + while( $line = <AN> ){ + chomp $line; + my @F = split /\t/ , $line; #annotation line + my @a = ($F[1],$F[2]); + $anno{$F[0]} = \@a; + } +} else { + print STDERR "Annotation file not specified, using len(5UTR)=200\n"; +} + +#print Dumper(%anno); + +#Parse SAM file and count reads +while( $line = <SAM> ){ + + chomp $line; + next if $line =~ /^@/; #discard headers/comments + + my @F = split /\t/ , $line; #sam line + + my $gene_name = $F[2]; + if($F[1] == 0) { #only consider hits in plus strand + if($annoFile){ + if(($F[3] >= $anno{$gene_name}->[0]-15) & ($F[3] <= $anno{$gene_name}->[1]-15)) { #close to start and stop + $hash{$gene_name}++; + } + } { + if($F[3] >= 186) { #consider that len(5UTR)==200 + $hash{$gene_name}++; + } + } + } +} + +#print Dumper(\%hash); +print "gene\tcounts\n"; +for $key (keys %hash){ + print $key,"\t",$hash{$key},"\n"; +}