From e749773a3661d801f766209e819904210a876531 Mon Sep 17 00:00:00 2001 From: JoanaMPereira <pereira.joanam@gmail.com> Date: Tue, 26 Mar 2024 09:34:07 +0100 Subject: [PATCH] it now saves the data on the go so that it does not analyse what was previously analysed --- barrOs_library.py | 295 ++++++++++++++++++++++++---------------------- 1 file changed, 153 insertions(+), 142 deletions(-) diff --git a/barrOs_library.py b/barrOs_library.py index b92733e..0f27ce2 100644 --- a/barrOs_library.py +++ b/barrOs_library.py @@ -10,6 +10,7 @@ import threading import itertools import os import sys +import json import warnings warnings.filterwarnings("ignore") @@ -2310,26 +2311,32 @@ def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, max_ outmultimeric = "multimeric_barrel_sequences_matched_pdbs_job{}.fasta".format(job_number) outmt = open(outmultimeric, 'w') - data = {'PDBs': [], - 'TMsegm': [], - 'BB_diameter': [], - 'BB_diameter_mad': [], - 'Shear': [], - 'Membrane_thickness': [], - # 'a': [], - # 'a_mad': [], - # 'b': [], - # 'b_mad': [], - # 'h': [], - # 'CAi-CAi+1': [], - # 'CAi-CAi+1_mad': [], - 'CAi-CAi+1-CAi+2': [], - 'CAi-CAi+1-CAi+2_mad': [], - 'CAi+2-CAi-CAi+1': [], - 'CAi+2-CAi-CAi+1_mad': [], - 'N_residues': [], - 'N_chains': [], - 'Multimeric': []} + outdata = "barrels_matched_pdbs_data_job{}.fasta".format(job_number) + + if os.path.ifile(outdata): + data = json.load(open(outdata, 'r')) + + else: + data = {'PDBs': [], + 'TMsegm': [], + 'BB_diameter': [], + 'BB_diameter_mad': [], + 'Shear': [], + 'Membrane_thickness': [], + # 'a': [], + # 'a_mad': [], + # 'b': [], + # 'b_mad': [], + # 'h': [], + # 'CAi-CAi+1': [], + # 'CAi-CAi+1_mad': [], + 'CAi-CAi+1-CAi+2': [], + 'CAi-CAi+1-CAi+2_mad': [], + 'CAi+2-CAi-CAi+1': [], + 'CAi+2-CAi-CAi+1_mad': [], + 'N_residues': [], + 'N_chains': [], + 'Multimeric': []} for i, pdbID in enumerate(in_queue): deleted_it = False @@ -2400,140 +2407,144 @@ def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, max_ for pdbID, chainID, pdb_file, protein_type, membrane_thickness in target_chains: - try: - pdb_sequence, barrel_topology, chains, num_bb_regions, pdb_file = process_barrel(pdb_file, ID = pdbID, chainID = chainID, offset = offset, step = step, local_angle_threshold = local_angle_threshold, distance_threshold = distance_threshold, multimer_only=multimer_only) - except: - num_bb_regions = 0 + if '{}_{}'.format(pdbID, chainID) not in data['PDBs']: + + try: + pdb_sequence, barrel_topology, chains, num_bb_regions, pdb_file = process_barrel(pdb_file, ID = pdbID, chainID = chainID, offset = offset, step = step, local_angle_threshold = local_angle_threshold, distance_threshold = distance_threshold, multimer_only=multimer_only) + except: + num_bb_regions = 0 - # check if it has a barrel. If so, continue - if num_bb_regions > 0 and len(pdb_sequence) < 10000: - # get the sequence of the barrel region only - barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 10) + # check if it has a barrel. If so, continue + if num_bb_regions > 0 and len(pdb_sequence) < 10000: + # get the sequence of the barrel region only + barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 10) - print('\n\t>{}TM_BARREL_{}_{}_{}'.format(num_bb_regions, protein_type, pdbID, chainID)) - print('\t',pdb_sequence) - print('\t',barrel_topology,'\n') + print('\n\t>{}TM_BARREL_{}_{}_{}'.format(num_bb_regions, protein_type, pdbID, chainID)) + print('\t',pdb_sequence) + print('\t',barrel_topology,'\n') - outpt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) - outpt.write('{}\n'.format(pdb_sequence)) - outpt.write('{}\n'.format(barrel_topology)) + outpt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) + outpt.write('{}\n'.format(pdb_sequence)) + outpt.write('{}\n'.format(barrel_topology)) - # if it is a membrane protein, check whether the barrel is inserted into the membrane. Otherwise consider it does not - if protein_type == 'membrane': - barrel_crosses_membrane, barrel_struct = find_if_barrel_crosses_membrane(pdb_sequence, pdb_file, barrel_topology, membrane_thickness, max_loop_size = max_loop_size) - else: - barrel_crosses_membrane = False - print(" ... ... Barrel in {} crosses membrane: {}".format(pdbID, barrel_crosses_membrane)) - - if protein_type != 'membrane' or (protein_type == 'membrane' and barrel_crosses_membrane): - # if the barrel does not cross the membrane, move and rotate it so that it is in a reference position (centered in (0,0,0) with the axis of the pore in z) - #if not barrel_crosses_membrane: - print(" ... ... Trying to move the barrel to the reference position (centered in (0,0,0) with the axis of the pore in z)") - pdb_file, curr_angle = move_barrel_to_reference_position(pdb_file, barrel_topology, center = [0,0,0]) - - # calculate the diameter of the barrel ONLY (no loops at all) - print(" ... ... Computing barrel diameter (excluding all loops)") - barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 0) - diameter, mad_diameter = get_barrel_diameter(barrel_struct, chainID, min_residue_numb = int(num_bb_regions/5.0), step_z = 1, membrane_thickness = membrane_thickness) - - if diameter is not None: - print(" ... ... ... Barrel diameter: {} (+/- {}) Ang".format(round(diameter,2), round(mad_diameter,2))) - - # calculate the shear number - print(" ... ... Computing the shear number") - try: - shear = shear_number(barrel_struct, show_path = False) - if shear is not None: - print(' ... ... ... Shear: {}'.format(shear)) - else: - print(' ... ... ... Error: Not possible to compute the shear number') - except: - shear = None - print(' ... ... ... Error: Not possible to compute the shear number') - - # calculate a (the distance between two adjacent CA assuming they are in the same line defined by the strand) - # print(' ... ... Computing a and b') - # a, mad_a = calculate_a(barrel_struct) - # print(' ... ... ... a: {} (+/- {}) Ang'.format(round(a, 2), round(mad_a, 2))) - - # b, mad_b = calculate_b(barrel_struct) - # print(' ... ... ... b: {} (+/- {}) Ang'.format(round(b, 2), round(mad_b, 2))) - - # calculate the median distance between two adjacent c-alphas - # print(' ... ... Computing local CA geometric features (distance to next CA and and the bond angles to CAi-1 and CAi+2)') - # c, mad_c = calculate_c(barrel_struct) - # print(' ... ... ... CAi-CAi+1 distance: {} (+/- {}) Ang'.format(round(c, 2), round(mad_c, 2))) - theta1, theta1_mad, kappa, kappa_mad = calculate_CA_pseudoangles(barrel_struct) - print(' ... ... ... CAi-CAi+1-CAi+2 angle: {} (+/- {}) degrees'.format(round(theta1, 2), round(theta1_mad, 2))) - print(' ... ... ... CAi+2-CAi-CAi+1 angle: {} (+/- {}) degrees'.format(round(kappa, 2), round(kappa_mad, 2))) - # h = sqrt(c**2-a**2) - # print(' ... ... ... h: {} Ang'.format(round(h, 2))) - - # save all results into the dictionary - data['PDBs'].append('{}_{}'.format(pdbID, chainID)) - data['TMsegm'].append(num_bb_regions) - data['BB_diameter'].append(diameter) - data['BB_diameter_mad'].append(mad_diameter) - data['Shear'].append(shear) - data['Membrane_thickness'].append(membrane_thickness) - # data['a'].append(a) - # data['a_mad'].append(mad_a) - # data['b'].append(b) - # data['b_mad'].append(mad_b) - # data['CAi-CAi+1'].append(c) - # data['CAi-CAi+1_mad'].append(mad_c) - data['CAi-CAi+1-CAi+2'].append(theta1) - data['CAi-CAi+1-CAi+2_mad'].append(theta1_mad) - data['CAi+2-CAi-CAi+1'].append(kappa) - data['CAi+2-CAi-CAi+1_mad'].append(kappa_mad) - # data['h'].append(h) - data['N_residues'].append(len(pdb_sequence)) - - # extract the sequence of the barrel domain only - print(" ... Getting final barrel structure without N- and C-termini") - print(' ... ... Masking out internal loops from the topology string') - masked_topology = mask_loops(barrel_topology, chains) - barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, masked_topology, max_loop_size = 10, chains=chains) - print(" ... ... Saved as: {}".format(barrel_struct)) - - # check if the barrel is multimeric and save the sequences accordingly - chains_in_barrel = get_chains_in_pdb(barrel_struct, source_pdb=False)[0] - n_chains = len(chains_in_barrel) - data['N_chains'].append(n_chains) - data['Multimeric'].append(n_chains > 1) - - if n_chains > 1: - chains_seqs = get_chains_sequence(pdb_sequence,chains) - for chain in chains_in_barrel: - outmt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chain)) - outmt.write('{}\n'.format(chains_seqs[chain])) - + # if it is a membrane protein, check whether the barrel is inserted into the membrane. Otherwise consider it does not + if protein_type == 'membrane': + barrel_crosses_membrane, barrel_struct = find_if_barrel_crosses_membrane(pdb_sequence, pdb_file, barrel_topology, membrane_thickness, max_loop_size = max_loop_size) else: - outbb.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) - outbb.write('{}\n'.format(barrel_seq)) - else: - print('... ... There is a barrel in {}_{} but it does not cross the membrane\n'.format(pdbID, chainID)) - if delete: - os.system("rm {}*".format(pdb_file[:-4])) + barrel_crosses_membrane = False + print(" ... ... Barrel in {} crosses membrane: {}".format(pdbID, barrel_crosses_membrane)) + + if protein_type != 'membrane' or (protein_type == 'membrane' and barrel_crosses_membrane): + # if the barrel does not cross the membrane, move and rotate it so that it is in a reference position (centered in (0,0,0) with the axis of the pore in z) + #if not barrel_crosses_membrane: + print(" ... ... Trying to move the barrel to the reference position (centered in (0,0,0) with the axis of the pore in z)") + pdb_file, curr_angle = move_barrel_to_reference_position(pdb_file, barrel_topology, center = [0,0,0]) + + # calculate the diameter of the barrel ONLY (no loops at all) + print(" ... ... Computing barrel diameter (excluding all loops)") + barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 0) + diameter, mad_diameter = get_barrel_diameter(barrel_struct, chainID, min_residue_numb = int(num_bb_regions/5.0), step_z = 1, membrane_thickness = membrane_thickness) + + if diameter is not None: + print(" ... ... ... Barrel diameter: {} (+/- {}) Ang".format(round(diameter,2), round(mad_diameter,2))) + + # calculate the shear number + print(" ... ... Computing the shear number") + try: + shear = shear_number(barrel_struct, show_path = False) + if shear is not None: + print(' ... ... ... Shear: {}'.format(shear)) + else: + print(' ... ... ... Error: Not possible to compute the shear number') + except: + shear = None + print(' ... ... ... Error: Not possible to compute the shear number') - if extract_hairpins and os.path.isfile(barrel_struct): - save_hairpins(pdb_sequence, pdb_file, barrel_topology, chains=chains, outfolder='hairpins') + # calculate a (the distance between two adjacent CA assuming they are in the same line defined by the strand) + # print(' ... ... Computing a and b') + # a, mad_a = calculate_a(barrel_struct) + # print(' ... ... ... a: {} (+/- {}) Ang'.format(round(a, 2), round(mad_a, 2))) + + # b, mad_b = calculate_b(barrel_struct) + # print(' ... ... ... b: {} (+/- {}) Ang'.format(round(b, 2), round(mad_b, 2))) + + # calculate the median distance between two adjacent c-alphas + # print(' ... ... Computing local CA geometric features (distance to next CA and and the bond angles to CAi-1 and CAi+2)') + # c, mad_c = calculate_c(barrel_struct) + # print(' ... ... ... CAi-CAi+1 distance: {} (+/- {}) Ang'.format(round(c, 2), round(mad_c, 2))) + theta1, theta1_mad, kappa, kappa_mad = calculate_CA_pseudoangles(barrel_struct) + print(' ... ... ... CAi-CAi+1-CAi+2 angle: {} (+/- {}) degrees'.format(round(theta1, 2), round(theta1_mad, 2))) + print(' ... ... ... CAi+2-CAi-CAi+1 angle: {} (+/- {}) degrees'.format(round(kappa, 2), round(kappa_mad, 2))) + # h = sqrt(c**2-a**2) + # print(' ... ... ... h: {} Ang'.format(round(h, 2))) + + # save all results into the dictionary + data['PDBs'].append('{}_{}'.format(pdbID, chainID)) + data['TMsegm'].append(num_bb_regions) + data['BB_diameter'].append(diameter) + data['BB_diameter_mad'].append(mad_diameter) + data['Shear'].append(shear) + data['Membrane_thickness'].append(membrane_thickness) + # data['a'].append(a) + # data['a_mad'].append(mad_a) + # data['b'].append(b) + # data['b_mad'].append(mad_b) + # data['CAi-CAi+1'].append(c) + # data['CAi-CAi+1_mad'].append(mad_c) + data['CAi-CAi+1-CAi+2'].append(theta1) + data['CAi-CAi+1-CAi+2_mad'].append(theta1_mad) + data['CAi+2-CAi-CAi+1'].append(kappa) + data['CAi+2-CAi-CAi+1_mad'].append(kappa_mad) + # data['h'].append(h) + data['N_residues'].append(len(pdb_sequence)) + + # extract the sequence of the barrel domain only + print(" ... Getting final barrel structure without N- and C-termini") + print(' ... ... Masking out internal loops from the topology string') + masked_topology = mask_loops(barrel_topology, chains) + barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, masked_topology, max_loop_size = 10, chains=chains) + print(" ... ... Saved as: {}".format(barrel_struct)) + + # check if the barrel is multimeric and save the sequences accordingly + chains_in_barrel = get_chains_in_pdb(barrel_struct, source_pdb=False)[0] + n_chains = len(chains_in_barrel) + data['N_chains'].append(n_chains) + data['Multimeric'].append(n_chains > 1) + + if n_chains > 1: + chains_seqs = get_chains_sequence(pdb_sequence,chains) + for chain in chains_in_barrel: + outmt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chain)) + outmt.write('{}\n'.format(chains_seqs[chain])) + + else: + outbb.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) + outbb.write('{}\n'.format(barrel_seq)) + else: + print('... ... There is a barrel in {}_{} but it does not cross the membrane\n'.format(pdbID, chainID)) + if delete: + os.system("rm {}*".format(pdb_file[:-4])) + if extract_hairpins and os.path.isfile(barrel_struct): + save_hairpins(pdb_sequence, pdb_file, barrel_topology, chains=chains, outfolder='hairpins') + # save intermediate collected data + json.dump(data, outdata, indent=4) - else: - print('... ... Not able to detect barrel topology for {}_{}\n'.format(pdbID, chainID)) - if delete: - os.system("rm {}*".format(pdb_file[:-4])) else: - seqstruct, pdb_sequence, chains = get_secondary_structure(pdb_file) - - outbb.write('>NaN_TM_BARREL_{}_{}_{}\n'.format(protein_type, pdbID, chainID)) - outbb.write('{}\n'.format(pdb_sequence)) + print('... ... Not able to detect barrel topology for {}_{}\n'.format(pdbID, chainID)) + if delete: + os.system("rm {}*".format(pdb_file[:-4])) + + else: + seqstruct, pdb_sequence, chains = get_secondary_structure(pdb_file) + + outbb.write('>NaN_TM_BARREL_{}_{}_{}\n'.format(protein_type, pdbID, chainID)) + outbb.write('{}\n'.format(pdb_sequence)) - if extract_hairpins: - save_hairpins(pdb_sequence, pdb_file, seqstruct, marker='E', chains=chains, outfolder='hairpins') + if extract_hairpins: + save_hairpins(pdb_sequence, pdb_file, seqstruct, marker='E', chains=chains, outfolder='hairpins') else: -- GitLab