diff --git a/barrOs.py b/barrOs.py index 95977659172f983c063a2ad4e1d04f32a36c1fca..e27ab7df375a0e4827214695e959ce6ab581bbf4 100644 --- a/barrOs.py +++ b/barrOs.py @@ -9,7 +9,7 @@ number_of_jobs = 1 barros.print_hello() # Get inputs -input_files, input_types, input_mode, outf = barros.get_inputs(sys.argv) +input_files, input_types, input_mode, outf, complexes_only, multimer_only = barros.get_inputs(sys.argv) # Check if inputs are correct in case we are not dealing with a 'help' check @@ -36,7 +36,7 @@ if __name__ == '__main__': # Prepare all parallel jobs and run the main barrOs method for each pdbid separated_jobs = barros.chunk_list(pdbIDs, number_of_jobs) - list_arguments = [i for i in zip(range(number_of_jobs), [input_mode for job in separated_jobs], separated_jobs)] + list_arguments = [i for i in zip(range(number_of_jobs), [input_mode for job in separated_jobs],[complexes_only for job in separated_jobs],[multimer_only for job in separated_jobs], separated_jobs)] pool = mp.Pool(number_of_jobs) data = pool.map(barros.run_barros, list_arguments) diff --git a/barrOs_library.py b/barrOs_library.py index 7f41808546a2f37817a1fafb187261610e7f01fa..75323bd94bf60c326e3e2108c006fdcb574b9bab 100644 --- a/barrOs_library.py +++ b/barrOs_library.py @@ -82,6 +82,8 @@ def get_inputs(argv): found_input = False found_mode = False + complexes_only = False + multimer_only = False input_files = [] input_types = [] input_type = 'nan' @@ -105,6 +107,10 @@ def get_inputs(argv): tmp_mode = arg.split(':')[1] elif '-outputf' in arg: output_file = arg.split(':')[1] + elif '-complexes' in arg: + complexes_only = True + elif '-multimer' in arg: + multimer_only = True if found_input and not found_mode: if input_type in accepted_input_types: @@ -116,7 +122,7 @@ def get_inputs(argv): elif not found_input and found_mode: input_mode = tmp_mode - return input_files, input_types, input_mode, output_file + return input_files, input_types, input_mode, output_file, complexes_only, multimer_only ## 1.3. Functions to check if the inputs are correct @@ -1032,7 +1038,7 @@ def find_additional_cycle(cycl1, cycl2): ## 4.2. Main functions -def process_barrel(pdb_file, ID, chainID, simplify = False, offset = 1, step = 2, local_angle_threshold = 25, distance_threshold = 5, round=0): +def process_barrel(pdb_file, ID, chainID, simplify = False, offset = 1, step = 2, local_angle_threshold = 25, distance_threshold = 5, round=0, multimer_only=False): # get barrel topology from combined Regular Regions and DSSP seqstruct, pdb_sequence, barrel_topology, chains = get_barrel_topology(pdb_file, ID = ID, chainID = chainID, mode = 'combined', offset = offset, step = step, local_angle_threshold = local_angle_threshold, distance_threshold = distance_threshold) @@ -1048,8 +1054,8 @@ def process_barrel(pdb_file, ID, chainID, simplify = False, offset = 1, step = 2 seqstruct, pdb_sequence, barrel_topology, chains = get_barrel_topology(pdb_file, ID = ID, chainID = chainID, mode = 'from DSSP', offset = offset, step = step, distance_threshold = distance_threshold) num_bb_regions = find_number_of_regions_in_barrel(barrel_topology) - # if no barrel was found, check if it could be formed by multiple chains - if num_bb_regions == 0 and round == 0: + # if no barrel was found, check if it could be formed by multiple chains if multimer only is turned on + if num_bb_regions == 0 and round == 0 and multimer_only: chains_in_pdb, pdb_file = get_chains_in_pdb(pdb_file) pdb_sequence, barrel_topology, chains, num_bb_regions, _ = process_barrel(pdb_file, ID, chains_in_pdb, simplify = simplify, offset = offset, step = step, local_angle_threshold = local_angle_threshold, distance_threshold = distance_threshold, round=1) @@ -1160,7 +1166,10 @@ def get_chains_topology(topology, chains): for i, ch in enumerate(chains): if ch not in chains_topo: chains_topo[ch] = '' - chains_topo[ch] += topology[i] + try: + chains_topo[ch] += topology[i] + except: + pass return chains_topo def get_chains_sequence(pdb_sequence, chains): @@ -1169,7 +1178,10 @@ def get_chains_sequence(pdb_sequence, chains): for i, ch in enumerate(chains): if ch not in chains_seq: chains_seq[ch] = '' - chains_seq[ch] += pdb_sequence[i] + try: + chains_seq[ch] += pdb_sequence[i] + except: + pass return chains_seq def mask_loops(topology, chains): @@ -2221,7 +2233,7 @@ def plot_parameter(x_col, y_col, df, saveto, fit_line = False): def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, distance_threshold = 5, max_loop_size = 0): - job_number, input_mode, in_queue = arguments + job_number, input_mode, complexes_only, multimer_only, in_queue = arguments # create output files to save the sequences outfasta = "full_sequences_matched_pdbs_job{}.fasta".format(job_number) @@ -2282,7 +2294,19 @@ def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, dist pdb_file, protein_type, membrane_thickness = download_pdb(pdbID) pdbID, chainID = pdbID.split('_') - if pdb_file != 'not available' and protein_type != 'not available': + if complexes_only and pdb_file != 'not available': + chains_inpdb, pdb_file = get_chains_in_pdb(pdb_file) + target_chains = [] + if len(chains_inpdb) > 1: + for chainID in chains_inpdb: + chain_pdb, protein_type, membrane_thickness = download_pdb('{}_{}'.format(pdbID.split('_')[0], chainID)) + target_chains.append([pdbID.split('_')[0], chainID, chain_pdb, protein_type, membrane_thickness]) + else: + target_chains = None + else: + target_chains = [[pdbID.split('_')[0], chainID]] + + if pdb_file != 'not available' and protein_type != 'not available' and target_chains is not None: if input_mode != 'all': if protein_type != input_mode: os.system("rm {}".format(pdb_file)) @@ -2295,119 +2319,121 @@ def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, dist #else: #print(" ... ... Protein {} is NOT inserted into a membrane. Membrane thickness set to '{}'".format(pdbID, membrane_thickness)) - try: - pdb_sequence, barrel_topology, chains, num_bb_regions, pdb_file = process_barrel(pdb_file, ID = pdbID, chainID = chainID, offset = offset, step = step, local_angle_threshold = local_angle_threshold, distance_threshold = distance_threshold) - except: - num_bb_regions = 0 + for pdbID, chainID, pdb_file, protein_type, membrane_thickness in target_chains: - # check if it has a barrel. If so, continue - if num_bb_regions > 0 and len(pdb_sequence) < 10000: - # get the sequence of the barrel region only - barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 10) + try: + pdb_sequence, barrel_topology, chains, num_bb_regions, pdb_file = process_barrel(pdb_file, ID = pdbID, chainID = chainID, offset = offset, step = step, local_angle_threshold = local_angle_threshold, distance_threshold = distance_threshold, multimer_only=multimer_only) + except: + num_bb_regions = 0 - print('\n\t>{}TM_BARREL_{}_{}_{}'.format(num_bb_regions, protein_type, pdbID, chainID)) - print('\t',pdb_sequence) - print('\t',barrel_topology,'\n') + # check if it has a barrel. If so, continue + if num_bb_regions > 0 and len(pdb_sequence) < 10000: + # get the sequence of the barrel region only + barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 10) - outpt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) - outpt.write('{}\n'.format(pdb_sequence)) - outpt.write('{}\n'.format(barrel_topology)) + print('\n\t>{}TM_BARREL_{}_{}_{}'.format(num_bb_regions, protein_type, pdbID, chainID)) + print('\t',pdb_sequence) + print('\t',barrel_topology,'\n') - # if it is a membrane protein, check whether the barrel is inserted into the membrane. Otherwise consider it does not - if protein_type == 'membrane': - barrel_crosses_membrane, barrel_struct = find_if_barrel_crosses_membrane(pdb_sequence, pdb_file, barrel_topology, membrane_thickness, max_loop_size = max_loop_size) - else: - barrel_crosses_membrane = False - print(" ... ... Barrel in {} crosses membrane: {}".format(pdbID, barrel_crosses_membrane)) - - if protein_type != 'membrane' or (protein_type == 'membrane' and barrel_crosses_membrane): - # if the barrel does not cross the membrane, move and rotate it so that it is in a reference position (centered in (0,0,0) with the axis of the pore in z) - #if not barrel_crosses_membrane: - print(" ... ... Trying to move the barrel to the reference position (centered in (0,0,0) with the axis of the pore in z)") - pdb_file, curr_angle = move_barrel_to_reference_position(pdb_file, barrel_topology, center = [0,0,0]) - - # calculate the diameter of the barrel ONLY (no loops at all) - print(" ... ... Computing barrel diameter (excluding all loops)") - barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 0) - diameter, mad_diameter = get_barrel_diameter(barrel_struct, chainID, min_residue_numb = int(num_bb_regions/5.0), step_z = 1, membrane_thickness = membrane_thickness) - - if diameter is not None: - print(" ... ... ... Barrel diameter: {} (+/- {}) Ang".format(round(diameter,2), round(mad_diameter,2))) - - # calculate the shear number - print(" ... ... Computing the shear number") - shear = shear_number(barrel_struct, show_path = False) - if shear is not None: - print(' ... ... ... Shear: {}'.format(shear)) + outpt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) + outpt.write('{}\n'.format(pdb_sequence)) + outpt.write('{}\n'.format(barrel_topology)) + + # if it is a membrane protein, check whether the barrel is inserted into the membrane. Otherwise consider it does not + if protein_type == 'membrane': + barrel_crosses_membrane, barrel_struct = find_if_barrel_crosses_membrane(pdb_sequence, pdb_file, barrel_topology, membrane_thickness, max_loop_size = max_loop_size) else: - print(' ... ... ... Error: Not possible to compute the shear number') - - # calculate a (the distance between two adjacent CA assuming they are in the same line defined by the strand) - # print(' ... ... Computing a and b') - # a, mad_a = calculate_a(barrel_struct) - # print(' ... ... ... a: {} (+/- {}) Ang'.format(round(a, 2), round(mad_a, 2))) - - # b, mad_b = calculate_b(barrel_struct) - # print(' ... ... ... b: {} (+/- {}) Ang'.format(round(b, 2), round(mad_b, 2))) - - # calculate the median distance between two adjacent c-alphas - # print(' ... ... Computing local CA geometric features (distance to next CA and and the bond angles to CAi-1 and CAi+2)') - # c, mad_c = calculate_c(barrel_struct) - # print(' ... ... ... CAi-CAi+1 distance: {} (+/- {}) Ang'.format(round(c, 2), round(mad_c, 2))) - theta1, theta1_mad, kappa, kappa_mad = calculate_CA_pseudoangles(barrel_struct) - print(' ... ... ... CAi-CAi+1-CAi+2 angle: {} (+/- {}) degrees'.format(round(theta1, 2), round(theta1_mad, 2))) - print(' ... ... ... CAi+2-CAi-CAi+1 angle: {} (+/- {}) degrees'.format(round(kappa, 2), round(kappa_mad, 2))) - # h = sqrt(c**2-a**2) - # print(' ... ... ... h: {} Ang'.format(round(h, 2))) - - # save all results into the dictionary - data['PDBs'].append('{}_{}'.format(pdbID, chainID)) - data['TMsegm'].append(num_bb_regions) - data['BB_diameter'].append(diameter) - data['BB_diameter_mad'].append(mad_diameter) - data['Shear'].append(shear) - data['Membrane_thickness'].append(membrane_thickness) - # data['a'].append(a) - # data['a_mad'].append(mad_a) - # data['b'].append(b) - # data['b_mad'].append(mad_b) - # data['CAi-CAi+1'].append(c) - # data['CAi-CAi+1_mad'].append(mad_c) - data['CAi-CAi+1-CAi+2'].append(theta1) - data['CAi-CAi+1-CAi+2_mad'].append(theta1_mad) - data['CAi+2-CAi-CAi+1'].append(kappa) - data['CAi+2-CAi-CAi+1_mad'].append(kappa_mad) - # data['h'].append(h) - data['N_residues'].append(len(pdb_sequence)) - - # extract the sequence of the barrel domain only - print(" ... Getting final barrel structure without N- and C-termini") - print(' ... ... Masking out internal loops from the topology string') - masked_topology = mask_loops(barrel_topology, chains) - barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, masked_topology, max_loop_size = 10, chains=chains) - print(" ... ... Saved as: {}".format(barrel_struct)) - - # check if the barrel is multimeric and save the sequences accordingly - chains_in_barrel = get_chains_in_pdb(barrel_struct, source_pdb=False)[0] - n_chains = len(chains_in_barrel) - data['N_chains'].append(n_chains) - data['Multimeric'].append(n_chains > 1) - - if n_chains > 1: - chains_seqs = get_chains_sequence(pdb_sequence,chains) - for chain in chains_in_barrel: - outmt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chain)) - outmt.write('{}\n'.format(chains_seqs[chain])) - + barrel_crosses_membrane = False + print(" ... ... Barrel in {} crosses membrane: {}".format(pdbID, barrel_crosses_membrane)) + + if protein_type != 'membrane' or (protein_type == 'membrane' and barrel_crosses_membrane): + # if the barrel does not cross the membrane, move and rotate it so that it is in a reference position (centered in (0,0,0) with the axis of the pore in z) + #if not barrel_crosses_membrane: + print(" ... ... Trying to move the barrel to the reference position (centered in (0,0,0) with the axis of the pore in z)") + pdb_file, curr_angle = move_barrel_to_reference_position(pdb_file, barrel_topology, center = [0,0,0]) + + # calculate the diameter of the barrel ONLY (no loops at all) + print(" ... ... Computing barrel diameter (excluding all loops)") + barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, barrel_topology, max_loop_size = 0) + diameter, mad_diameter = get_barrel_diameter(barrel_struct, chainID, min_residue_numb = int(num_bb_regions/5.0), step_z = 1, membrane_thickness = membrane_thickness) + + if diameter is not None: + print(" ... ... ... Barrel diameter: {} (+/- {}) Ang".format(round(diameter,2), round(mad_diameter,2))) + + # calculate the shear number + print(" ... ... Computing the shear number") + shear = shear_number(barrel_struct, show_path = False) + if shear is not None: + print(' ... ... ... Shear: {}'.format(shear)) + else: + print(' ... ... ... Error: Not possible to compute the shear number') + + # calculate a (the distance between two adjacent CA assuming they are in the same line defined by the strand) + # print(' ... ... Computing a and b') + # a, mad_a = calculate_a(barrel_struct) + # print(' ... ... ... a: {} (+/- {}) Ang'.format(round(a, 2), round(mad_a, 2))) + + # b, mad_b = calculate_b(barrel_struct) + # print(' ... ... ... b: {} (+/- {}) Ang'.format(round(b, 2), round(mad_b, 2))) + + # calculate the median distance between two adjacent c-alphas + # print(' ... ... Computing local CA geometric features (distance to next CA and and the bond angles to CAi-1 and CAi+2)') + # c, mad_c = calculate_c(barrel_struct) + # print(' ... ... ... CAi-CAi+1 distance: {} (+/- {}) Ang'.format(round(c, 2), round(mad_c, 2))) + theta1, theta1_mad, kappa, kappa_mad = calculate_CA_pseudoangles(barrel_struct) + print(' ... ... ... CAi-CAi+1-CAi+2 angle: {} (+/- {}) degrees'.format(round(theta1, 2), round(theta1_mad, 2))) + print(' ... ... ... CAi+2-CAi-CAi+1 angle: {} (+/- {}) degrees'.format(round(kappa, 2), round(kappa_mad, 2))) + # h = sqrt(c**2-a**2) + # print(' ... ... ... h: {} Ang'.format(round(h, 2))) + + # save all results into the dictionary + data['PDBs'].append('{}_{}'.format(pdbID, chainID)) + data['TMsegm'].append(num_bb_regions) + data['BB_diameter'].append(diameter) + data['BB_diameter_mad'].append(mad_diameter) + data['Shear'].append(shear) + data['Membrane_thickness'].append(membrane_thickness) + # data['a'].append(a) + # data['a_mad'].append(mad_a) + # data['b'].append(b) + # data['b_mad'].append(mad_b) + # data['CAi-CAi+1'].append(c) + # data['CAi-CAi+1_mad'].append(mad_c) + data['CAi-CAi+1-CAi+2'].append(theta1) + data['CAi-CAi+1-CAi+2_mad'].append(theta1_mad) + data['CAi+2-CAi-CAi+1'].append(kappa) + data['CAi+2-CAi-CAi+1_mad'].append(kappa_mad) + # data['h'].append(h) + data['N_residues'].append(len(pdb_sequence)) + + # extract the sequence of the barrel domain only + print(" ... Getting final barrel structure without N- and C-termini") + print(' ... ... Masking out internal loops from the topology string') + masked_topology = mask_loops(barrel_topology, chains) + barrel_seq, barrel_struct = remove_long_loops_from_barrel(pdb_sequence, pdb_file, masked_topology, max_loop_size = 10, chains=chains) + print(" ... ... Saved as: {}".format(barrel_struct)) + + # check if the barrel is multimeric and save the sequences accordingly + chains_in_barrel = get_chains_in_pdb(barrel_struct, source_pdb=False)[0] + n_chains = len(chains_in_barrel) + data['N_chains'].append(n_chains) + data['Multimeric'].append(n_chains > 1) + + if n_chains > 1: + chains_seqs = get_chains_sequence(pdb_sequence,chains) + for chain in chains_in_barrel: + outmt.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chain)) + outmt.write('{}\n'.format(chains_seqs[chain])) + + else: + outbb.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) + outbb.write('{}\n'.format(barrel_seq)) else: - outbb.write('>{}TM_BARREL_{}_{}_{}\n'.format(num_bb_regions, protein_type, pdbID, chainID)) - outbb.write('{}\n'.format(barrel_seq)) + print('... ... There is a barrel in {}_{} but it does not cross the membrane\n'.format(pdbID, chainID)) + os.system("rm {}*".format(pdb_file[:-4])) else: - print('... ... There is a barrel in {}_{} but it does not cross the membrane\n'.format(pdbID, chainID)) + print('... ... Not able to detect barrel topology for {}_{}\n'.format(pdbID, chainID)) os.system("rm {}*".format(pdb_file[:-4])) - else: - print('... ... Not able to detect barrel topology for {}_{}\n'.format(pdbID, chainID)) - os.system("rm {}*".format(pdb_file[:-4])) else: print(" ... ... pdbID '{}_{}' impossible to get".format(pdbID, chainID))