diff --git a/scripts_phylogeny/make_variable_fasta.py b/scripts_phylogeny/make_variable_fasta.py deleted file mode 100755 index bedaa55fd61dfb79195773f8b0a237ef50cf65db..0000000000000000000000000000000000000000 --- a/scripts_phylogeny/make_variable_fasta.py +++ /dev/null @@ -1,125 +0,0 @@ - -from Bio import SeqIO -from joblib import Parallel, delayed -import argparse -import re - - -############################################################ define arg type float 0 < X > 1 ############################################################### - -def restricted_float(x): - x = float(x) - if x < 0.0 or x > 1.0: - raise argparse.ArgumentTypeError("%r not in range [0.0, 1.0]"%(x,)) - return x - -########################################## find if a position is variable ####################### -def find_variable(n): - out={} - #print "nnnnnnnnnnnnnn" - print n - pos="" - for name, seq in fasta.items(): - pos += seq[n] - #print pos - seq_length=len(pos) - good_seq_length=len(re.findall('[ATCGatcg]', pos)) - ratio= float(good_seq_length)/float(seq_length) - #print ratio - if (ratio >= arguments.missing): - A=len(re.findall('[Aa]', pos)) - T=len(re.findall('[Tt]', pos)) - G=len(re.findall('[Gg]', pos)) - C=len(re.findall('[Cc]', pos)) - if (A > 0): - A=1 - if (T > 0): - T=1 - if (C > 0): - C=1 - if (G > 0): - G=1 - if ((A+C+G+T) > 1): - #print pos - for name_out, seq_out in fasta.items(): - OUT[name_out] += seq_out[n] - - #print OUT - - - #print out - return OUT - - -########################################## parallel loop ####################### -def parallel_loop(i): - n=i - print "iiiiii" - print i - while n < length: - print "n=" + str(n) - print "i=" + str(i) - - OUT=find_variable(n) - #print temp_out - n=n+arguments.cpu #n of threads - #if temp_out: - #OUT.update(temp_out) - #for name_out, seq_out in temp_out.items(): - #OUT[name_out] += temp_out[name_out] - - #tot.update(OUT) - #print OUT - return OUT - - - - - -parser = argparse.ArgumentParser() - -parser.add_argument('INFILE',type=str,help='path to the multi fasta alignment') -parser.add_argument('-m','--missing', metavar='0-1', default='0.5', help='minimum percentage of NOT missing data', type =restricted_float,nargs='?') -parser.add_argument('-c','--cpu', metavar='INT', default=1,help='number of cpu to use (default: 1)',type =int, nargs='?') - - - - - -arguments = parser.parse_args() -tot={} -fasta={} -OUT={} - -for record in SeqIO.parse(arguments.INFILE, "fasta"): - length = len(record.seq) - fasta.update({str(record.id):str(record.seq)}) - OUT.update({str(record.id):""}) - -print "fasta reas" -OUT = Parallel(n_jobs=arguments.cpu)(delayed(find_variable)(i) for i in range(0,length)) - - -#print OUT - - -result={} -for d in OUT: #when run - result.update(d) -#print OUT - - -#print result - -for name_out, seq_out in result.items(): - print ">" + name_out - print seq_out - #scp print len(seq_out) - - - - - - - -