Skip to content
Snippets Groups Projects
Commit 529eec46 authored by Laura Urbanska's avatar Laura Urbanska
Browse files

updated and fixed output gtf script and poisson sampler

parent 32743a09
No related branches found
No related tags found
No related merge requests found
import pandas as pd
import numpy as np
import argparse
'''
Sample transcript
......@@ -13,25 +15,32 @@ output: csv file with gene id and count
gtf file with transcript samples
'''
def transcript_sampling(total_transcript, transcripts):
#read file containing representative transcript levels and id
transcript = pd.read_csv(transcripts)
def transcript_sampling(total_transcript_number, csv_file, output_csv):
df = pd.read_csv(csv_file)
levels = []
#poisson sampling for each gene, proportional to expression levels
for expression_level in transcript['level']:
for expression_level in df['level']:
poisson_sampled = np.random.poisson(total_transcript/expression_level)
poisson_sampled = np.random.poisson(total_transcript_number/expression_level)
levels.append(poisson_sampled)
# note: if levels from input are decimals, total trancript should be multiplied by expr level, not divided
# poisson_sampled = np.random.poisson(total_transcript*expression_level)
#write output csv file containing transcript id and count (representative transcript numbers)
transcript_numbers = pd.DataFrame({'id': df['id'],'count': levels})
pd.DataFrame.to_csv(transcript_numbers, output_csv)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="Transcript Poisson sampler, csv output",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--expression_level", required=True, help="csv file with expression level")
parser.add_argument("--output_csv", required=True, help="output csv file")
parser.add_argument("--input_csv", required=True, help="input csv file")
parser.add_argument("--transcript_number", required=True, help="total number of transcripts to sample")
args = parser.parse_args()
transcript_numbers = pd.DataFrame({'id': transcript['id'],'count': levels})
pd.DataFrame.to_csv(transcript_numbers, "representative_transcript_numbers.csv")
transcript_sampling(args.transcript_number, args.input_csv, args.output_csv, args.transcript_number)
'''
This function writes keeps the representative transcripts from the original input file (gtf)
and writes them to an output.
(The representative transcripts being listed in a csv file)
'''
import pandas as pd
import numpy as np
import argparse
def gtf_representative_transcripts(original_gtf, transcripts):
transcript = pd.read_csv(transcripts)
representative_transcripts = []
def transcript_ID_finder (entry):
index_transcript_id = entry.find("transcript_id")
find_transcript_id_name = re.compile("\"\S{1,25}\"")
sub_entry = entry[index_transcript_id:]
try_find_transcript_id_name = find_transcript_id_name.search(sub_entry)
try:
transcript_ID = try_find_transcript_id_name[0].replace("\"","")
except:
transcript_ID = ""
return (transcript_ID)
with open (original_gtf, 'r') as file:
for id in transcript['id']:
for line in file:
if id in line:
representative_transcripts.append(line)
'''gtf_file_writer takes as input the original gtf file and the csv file containing relevant transcripts.
with open ('output.gtf', 'w') as outputfile:
outputfile.write(representative_transcripts)
It produces a gtf file containing only the transcript entries of those contained in the csv file
based on id'''
def gtf_file_writer (original_file, csv_file, output_file):
output = []
df = pd.read_csv(csv_file)
listoftranscripts = df['id'].tolist()
if df['id'] == False:
print('Error. \'id\' column needed in input csv file.')
with open(original_file, 'r') as f:
for entry in f:
if "\ttranscript\t" in entry:
transcript_id = transcript_ID_finder(entry)
if transcript_id in listoftranscripts:
output.append(entry)
with open(output_file, 'w') as last_file:
last_file.write(output)
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description="gtf output file writer",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument("--annotation", required=True, help="gtf file with genome annotation")
parser.add_argument("--output_gtf", required=True, help="output gtf file")
parser.add_argument("--input_csv", required=True, help="input csv file")
args = parser.parse_args()
gtf_file_writer(args.annotation, args.input_csv, args.output_gtf)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment