diff --git a/scripts/poisson_sampling.py b/scripts/poisson_sampling.py index 5ca481ee62dfb2217ed407e0ded60f417634e942..737cc828c4c3b4751e086897600c0cbe997101c1 100644 --- a/scripts/poisson_sampling.py +++ b/scripts/poisson_sampling.py @@ -1,5 +1,7 @@ import pandas as pd import numpy as np +import argparse + ''' Sample transcript @@ -13,25 +15,32 @@ output: csv file with gene id and count gtf file with transcript samples ''' -def transcript_sampling(total_transcript, transcripts): - - #read file containing representative transcript levels and id - transcript = pd.read_csv(transcripts) +def transcript_sampling(total_transcript_number, csv_file, output_csv): + df = pd.read_csv(csv_file) levels = [] - #poisson sampling for each gene, proportional to expression levels - - for expression_level in transcript['level']: + for expression_level in df['level']: - poisson_sampled = np.random.poisson(total_transcript/expression_level) + poisson_sampled = np.random.poisson(total_transcript_number/expression_level) levels.append(poisson_sampled) - # note: if levels from input are decimals, total trancript should be multiplied by expr level, not divided - # poisson_sampled = np.random.poisson(total_transcript*expression_level) - #write output csv file containing transcript id and count (representative transcript numbers) + transcript_numbers = pd.DataFrame({'id': df['id'],'count': levels}) + pd.DataFrame.to_csv(transcript_numbers, output_csv) + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="Transcript Poisson sampler, csv output", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + + parser.add_argument("--expression_level", required=True, help="csv file with expression level") + parser.add_argument("--output_csv", required=True, help="output csv file") + parser.add_argument("--input_csv", required=True, help="input csv file") + parser.add_argument("--transcript_number", required=True, help="total number of transcripts to sample") + args = parser.parse_args() + - transcript_numbers = pd.DataFrame({'id': transcript['id'],'count': levels}) - pd.DataFrame.to_csv(transcript_numbers, "representative_transcript_numbers.csv") + transcript_sampling(args.transcript_number, args.input_csv, args.output_csv, args.transcript_number) diff --git a/scripts/writegtf.py b/scripts/writegtf.py index 426f3f492676ff36bdcd461e0851b351070c7360..d5136a79940666ac4f7e9368b687c83bb011a3e9 100644 --- a/scripts/writegtf.py +++ b/scripts/writegtf.py @@ -1,21 +1,53 @@ -''' -This function writes keeps the representative transcripts from the original input file (gtf) -and writes them to an output. -(The representative transcripts being listed in a csv file) -''' +import pandas as pd +import numpy as np +import argparse -def gtf_representative_transcripts(original_gtf, transcripts): - transcript = pd.read_csv(transcripts) - representative_transcripts = [] +def transcript_ID_finder (entry): + index_transcript_id = entry.find("transcript_id") + find_transcript_id_name = re.compile("\"\S{1,25}\"") + sub_entry = entry[index_transcript_id:] + try_find_transcript_id_name = find_transcript_id_name.search(sub_entry) + + try: + transcript_ID = try_find_transcript_id_name[0].replace("\"","") + except: + transcript_ID = "" + return (transcript_ID) - with open (original_gtf, 'r') as file: - for id in transcript['id']: - for line in file: - if id in line: - representative_transcripts.append(line) +'''gtf_file_writer takes as input the original gtf file and the csv file containing relevant transcripts. - with open ('output.gtf', 'w') as outputfile: - outputfile.write(representative_transcripts) + It produces a gtf file containing only the transcript entries of those contained in the csv file + + based on id''' +def gtf_file_writer (original_file, csv_file, output_file): + output = [] + + df = pd.read_csv(csv_file) + listoftranscripts = df['id'].tolist() + if df['id'] == False: + print('Error. \'id\' column needed in input csv file.') + + with open(original_file, 'r') as f: + for entry in f: + if "\ttranscript\t" in entry: + transcript_id = transcript_ID_finder(entry) + if transcript_id in listoftranscripts: + output.append(entry) + with open(output_file, 'w') as last_file: + last_file.write(output) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + description="gtf output file writer", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument("--annotation", required=True, help="gtf file with genome annotation") + parser.add_argument("--output_gtf", required=True, help="output gtf file") + parser.add_argument("--input_csv", required=True, help="input csv file") + args = parser.parse_args() + + gtf_file_writer(args.annotation, args.input_csv, args.output_gtf)