Skip to content
Snippets Groups Projects
Commit 061f425f authored by Boris Jurič's avatar Boris Jurič
Browse files

Make src/poly_a.py runnable, added fasta processing. Initial transcript generation workflow.

parent 557be05b
No related branches found
No related tags found
No related merge requests found
Pipeline #14073 failed
#!/usr/bin/env nextflow
params.fasta = "/inputs/ref_genome.fa"
params.gtf = "/inputs/ref_annotation.gtf"
params.outdir = "/tests"
params.tail_length = "100"
params.weights = "1.0 0.0 0.0 0.0"
project_dir = projectDir
process getTranscriptGTF {
input:
path gtf from params.gtf
output:
file "transcripts.gtf" into transcripts_gtf
"""
awk '\$3 == "transcript"' ${gtf} > transcripts.gtf
"""
}
process getTranscripts {
input:
path fasta from params.fasta
file gtf from transcripts_gtf
output:
file "transcripts.fa" into transcripts_fa
"""
bedtools getfasta -fi ${fasta} -bed ${gtf} -fo transcripts.fa
"""
}
process addPolyA {
publishDir params.outdir, mode: 'copy', pattern: 'transcripts_with_polyA.fa'
input:
file transcripts from transcripts_fa
output:
file "transcripts_with_polyA.fa" into transcripts_polyA
script:
"""
python3 $project_dir/poly_a.py -i ${transcripts} -o transcripts_with_polyA.fa --length ${params.tail_length} --weights ${params.weights}
"""
}
"""Generate a poly(A) tail.""" """Generate a poly(A) tail."""
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from random import choices from random import choices
from typing import (List, Tuple) from typing import (List, Tuple)
import argparse
def generate_poly_a( def generate_poly_a(
...@@ -65,3 +69,25 @@ def generate_poly_a( ...@@ -65,3 +69,25 @@ def generate_poly_a(
norm_weights: List[float] = [freq/s for freq in weights] norm_weights: List[float] = [freq/s for freq in weights]
tail_bases: List[str] = choices(bases, weights=norm_weights, k=length) tail_bases: List[str] = choices(bases, weights=norm_weights, k=length)
return "".join(tail_bases) return "".join(tail_bases)
def main():
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('-i', '--input', required=True,
help='Fasta file to add tails to')
parser.add_argument('-o', '--output', default="out.fa",
help='Name of the output Fasta file (default = out.fa)')
parser.add_argument('--length', default=100, type=int,
help='Length of the desired tail (max value = 200)')
parser.add_argument('--weights', default=(0.914, 0.028, 0.025, 0.033), type=float, nargs='+',
help="Tuple of relative A, C, G and U frequencies in the tail")
args = parser.parse_args()
new_records = []
for fasta in SeqIO.parse(open(args.input),'fasta'):
new_seq = fasta.seq + Seq(generate_poly_a(args.length, args.weights))
new_records.append(SeqRecord(new_seq, id=fasta.id, description=fasta.description))
print(new_records[0].seq)
SeqIO.write(new_records, args.output, "fasta")
if __name__ == '__main__':
main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment