From 061f425fd02125e479399dd7878e428bf0ec509a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Boris=20Juri=C4=8D?= <499542@mail.muni.cz> Date: Fri, 1 Apr 2022 11:56:30 +0000 Subject: [PATCH] Make src/poly_a.py runnable, added fasta processing. Initial transcript generation workflow. --- src/generate_transcripts.nf | 51 +++++++++++++++++++++++++++++++++++++ src/poly_a.py | 26 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 src/generate_transcripts.nf diff --git a/src/generate_transcripts.nf b/src/generate_transcripts.nf new file mode 100644 index 0000000..36b1f67 --- /dev/null +++ b/src/generate_transcripts.nf @@ -0,0 +1,51 @@ +#!/usr/bin/env nextflow + +params.fasta = "/inputs/ref_genome.fa" +params.gtf = "/inputs/ref_annotation.gtf" +params.outdir = "/tests" +params.tail_length = "100" +params.weights = "1.0 0.0 0.0 0.0" + +project_dir = projectDir + +process getTranscriptGTF { + + input: + path gtf from params.gtf + + output: + file "transcripts.gtf" into transcripts_gtf + + """ + awk '\$3 == "transcript"' ${gtf} > transcripts.gtf + """ +} + +process getTranscripts { + + input: + path fasta from params.fasta + file gtf from transcripts_gtf + + output: + file "transcripts.fa" into transcripts_fa + + """ + bedtools getfasta -fi ${fasta} -bed ${gtf} -fo transcripts.fa + """ +} + +process addPolyA { +publishDir params.outdir, mode: 'copy', pattern: 'transcripts_with_polyA.fa' + + input: + file transcripts from transcripts_fa + + output: + file "transcripts_with_polyA.fa" into transcripts_polyA + + script: + """ + python3 $project_dir/poly_a.py -i ${transcripts} -o transcripts_with_polyA.fa --length ${params.tail_length} --weights ${params.weights} + """ +} diff --git a/src/poly_a.py b/src/poly_a.py index 2c3ee30..72dacb1 100644 --- a/src/poly_a.py +++ b/src/poly_a.py @@ -1,7 +1,11 @@ """Generate a poly(A) tail.""" +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq from random import choices from typing import (List, Tuple) +import argparse def generate_poly_a( @@ -65,3 +69,25 @@ def generate_poly_a( norm_weights: List[float] = [freq/s for freq in weights] tail_bases: List[str] = choices(bases, weights=norm_weights, k=length) return "".join(tail_bases) + +def main(): + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument('-i', '--input', required=True, + help='Fasta file to add tails to') + parser.add_argument('-o', '--output', default="out.fa", + help='Name of the output Fasta file (default = out.fa)') + parser.add_argument('--length', default=100, type=int, + help='Length of the desired tail (max value = 200)') + parser.add_argument('--weights', default=(0.914, 0.028, 0.025, 0.033), type=float, nargs='+', + help="Tuple of relative A, C, G and U frequencies in the tail") + args = parser.parse_args() + + new_records = [] + for fasta in SeqIO.parse(open(args.input),'fasta'): + new_seq = fasta.seq + Seq(generate_poly_a(args.length, args.weights)) + new_records.append(SeqRecord(new_seq, id=fasta.id, description=fasta.description)) + print(new_records[0].seq) + SeqIO.write(new_records, args.output, "fasta") + +if __name__ == '__main__': + main() -- GitLab