diff --git a/src/generate_transcripts.nf b/src/generate_transcripts.nf new file mode 100644 index 0000000000000000000000000000000000000000..36b1f6792a3d1131cf668649d7f8347f03e66a5a --- /dev/null +++ b/src/generate_transcripts.nf @@ -0,0 +1,51 @@ +#!/usr/bin/env nextflow + +params.fasta = "/inputs/ref_genome.fa" +params.gtf = "/inputs/ref_annotation.gtf" +params.outdir = "/tests" +params.tail_length = "100" +params.weights = "1.0 0.0 0.0 0.0" + +project_dir = projectDir + +process getTranscriptGTF { + + input: + path gtf from params.gtf + + output: + file "transcripts.gtf" into transcripts_gtf + + """ + awk '\$3 == "transcript"' ${gtf} > transcripts.gtf + """ +} + +process getTranscripts { + + input: + path fasta from params.fasta + file gtf from transcripts_gtf + + output: + file "transcripts.fa" into transcripts_fa + + """ + bedtools getfasta -fi ${fasta} -bed ${gtf} -fo transcripts.fa + """ +} + +process addPolyA { +publishDir params.outdir, mode: 'copy', pattern: 'transcripts_with_polyA.fa' + + input: + file transcripts from transcripts_fa + + output: + file "transcripts_with_polyA.fa" into transcripts_polyA + + script: + """ + python3 $project_dir/poly_a.py -i ${transcripts} -o transcripts_with_polyA.fa --length ${params.tail_length} --weights ${params.weights} + """ +} diff --git a/src/poly_a.py b/src/poly_a.py index 2c3ee302c3b1c89fc1d83cee400a460cd407fc09..72dacb1b20234e8cc2741037155d35f1cad7f1db 100644 --- a/src/poly_a.py +++ b/src/poly_a.py @@ -1,7 +1,11 @@ """Generate a poly(A) tail.""" +from Bio import SeqIO +from Bio.SeqRecord import SeqRecord +from Bio.Seq import Seq from random import choices from typing import (List, Tuple) +import argparse def generate_poly_a( @@ -65,3 +69,25 @@ def generate_poly_a( norm_weights: List[float] = [freq/s for freq in weights] tail_bases: List[str] = choices(bases, weights=norm_weights, k=length) return "".join(tail_bases) + +def main(): + parser = argparse.ArgumentParser(description='Process some integers.') + parser.add_argument('-i', '--input', required=True, + help='Fasta file to add tails to') + parser.add_argument('-o', '--output', default="out.fa", + help='Name of the output Fasta file (default = out.fa)') + parser.add_argument('--length', default=100, type=int, + help='Length of the desired tail (max value = 200)') + parser.add_argument('--weights', default=(0.914, 0.028, 0.025, 0.033), type=float, nargs='+', + help="Tuple of relative A, C, G and U frequencies in the tail") + args = parser.parse_args() + + new_records = [] + for fasta in SeqIO.parse(open(args.input),'fasta'): + new_seq = fasta.seq + Seq(generate_poly_a(args.length, args.weights)) + new_records.append(SeqRecord(new_seq, id=fasta.id, description=fasta.description)) + print(new_records[0].seq) + SeqIO.write(new_records, args.output, "fasta") + +if __name__ == '__main__': + main()