diff --git a/barrOs.py b/barrOs.py index f9e28751be4441e7139f1e93637f44b3bbb8dc81..bc26114d0b68e61dc104e86d4cb8fbd211135101 100644 --- a/barrOs.py +++ b/barrOs.py @@ -9,7 +9,7 @@ number_of_jobs = 1 barros.print_hello() # Get inputs -input_files, input_types, input_mode, outf, complexes_only, multimer_only, delete, distance_threshold = barros.get_inputs(sys.argv) +input_files, input_types, input_mode, outf, complexes_only, multimer_only, delete, distance_threshold, extract_hairpins = barros.get_inputs(sys.argv) # Check if inputs are correct in case we are not dealing with a 'help' check @@ -36,7 +36,7 @@ if __name__ == '__main__': # Prepare all parallel jobs and run the main barrOs method for each pdbid separated_jobs = barros.chunk_list(pdbIDs, number_of_jobs) - list_arguments = [i for i in zip(range(number_of_jobs), [input_mode for job in separated_jobs],[complexes_only for job in separated_jobs],[multimer_only for job in separated_jobs],[delete for job in separated_jobs],[distance_threshold for job in separated_jobs], separated_jobs)] + list_arguments = [i for i in zip(range(number_of_jobs), [input_mode for job in separated_jobs],[complexes_only for job in separated_jobs],[multimer_only for job in separated_jobs],[delete for job in separated_jobs],[distance_threshold for job in separated_jobs],[extract_hairpins for job in separated_jobs], separated_jobs)] pool = mp.Pool(number_of_jobs) data = pool.map(barros.run_barros, list_arguments) diff --git a/barrOs_library.py b/barrOs_library.py index cb1066c2af3810781943bcf45d320d0a3687384d..fa967fa9ae407bb5df1454ab58ece1e76c9aeac4 100644 --- a/barrOs_library.py +++ b/barrOs_library.py @@ -53,7 +53,7 @@ def print_hello(): print("\n WELCOME TO BARRoS v0 ") print(" Let me find all barrels in your PDBs ") print(" ") - print(" Last change: 06.08.2023 Joana Pereira \n") + print(" Last change: 28.02.2023 Joana Pereira \n") print(" Created at: MPI for Developmental Biology (Protein Evolution) ") print(" Maintained at: Biozentrum 
(Protein Structural Bioinformatics) \n") print("-------------------------------------------------------------------\n") @@ -62,17 +62,18 @@ def print_help(): print("\nUsage: python3 barrOs.py -in:<type>:<input_file> -out:<mode>\n") print("Parameters:") - print(" -in:<type> \tan input string (or list of strings) with one of the following types: {}".format(accepted_input_types)) - print(" \tfor example: -in:pdbID:1LML_A,3XML_A or -in:pdb_file:1LML_A.pdb,3XML_A.pdb") - print(" \tATENTION: barrOs only works with monomeric proteins, so always provide the target chain") - print(" -mode: \tdefines the mode of running, i.e. either we want to deal only with membrane proteins, all, or non-membrane proteins") - print(" \tvalues accepted: {}".format(accepted_modes)) - print(" -outputf: \tdefines the name of the output file") - print(" \tit is not mandatory. Default: BARRoS_results.csv") - print(" -nodelete: \tflag to define if files downloaded but without ") - print(" \tdetected barrels should not be deleted. Default: False") - print(" -strandist: \tmaximum distance between strands for a contact") - print(" \tto be considered. Default: 5 Ang") + print(" -in:<type> \tan input string (or list of strings) with one of the following types: {}".format(accepted_input_types)) + print(" \tfor example: -in:pdbID:1LML_A,3XML_A or -in:pdb_file:1LML_A.pdb,3XML_A.pdb") + print(" \tATENTION: barrOs only works with monomeric proteins, so always provide the target chain") + print(" -mode: \tdefines the mode of running, i.e. either we want to deal only with membrane proteins, all, or non-membrane proteins") + print(" \tvalues accepted: {}".format(accepted_modes)) + print(" -outputf: \tdefines the name of the output file") + print(" \tit is not mandatory. Default: BARRoS_results.csv") + print(" -nodelete: \tflag to define if files downloaded but without ") + print(" \tdetected barrels should not be deleted. 
Default: False") + print(" -strandist: \tmaximum distance between strands for a contact") + print(" \tto be considered. Default: 5 Ang") + print(" -extract_hairpins: \tflag to extract beta-hairpins, always breaking on the shortest loop. Default: False") def print_summary(input_files, input_types, input_mode): @@ -90,6 +91,7 @@ def get_inputs(argv): complexes_only = False multimer_only = False delete = True + extract_hairpins = False input_files = [] input_types = [] input_type = 'nan' @@ -120,6 +122,8 @@ def get_inputs(argv): multimer_only = True elif '-nodelete' in arg: delete = False + elif '-extract_hairpins' in arg: + extract_hairpins = True elif '-strandist' in arg: distance_threshold = int(arg.split(':')[1]) @@ -133,7 +137,7 @@ def get_inputs(argv): elif not found_input and found_mode: input_mode = tmp_mode - return input_files, input_types, input_mode, output_file, complexes_only, multimer_only, delete, distance_threshold + return input_files, input_types, input_mode, output_file, complexes_only, multimer_only, delete, distance_threshold, extract_hairpins ## 1.3. 
# Functions to check if the inputs are correct
# (diff context kept from this line) @@ -1483,6 +1487,50 @@ def get_barrel_diameter(pdb_file, chainID, min_residue_numb = 3, mode = 'CA', st else: return None, None

def save_hairpins(pdb_sequence, pdb_file, barrel_topology, chains, marker='M', n=2, outfolder = 'fragments'):
    """Write each n-region unit (hairpin for n=2) of a structure to its own PDB file.

    The regular regions are extracted from ``barrel_topology`` with
    ``extract_regular_regions`` and grouped into consecutive units of ``n``
    regions, starting at the register chosen by ``find_motif_register``.
    For every complete unit, all ATOM records (and HETATM records mentioning
    MSE) whose residue number lies between the first residue of the unit
    minus one and the last residue of the unit plus one are appended to
    ``<dir of pdb_file>/<outfolder>/<pdb name>_fragment<i>.pdb``.

    Parameters:
        pdb_sequence:    currently unused; kept so existing callers still work.
        pdb_file:        path to the input PDB file.
        barrel_topology: per-residue annotation handed to extract_regular_regions.
        chains:          currently unused; kept so existing callers still work.
        marker:          annotation symbol handed to extract_regular_regions.
        n:               number of regular regions per saved unit.
        outfolder:       name of the folder created next to ``pdb_file``.

    Returns None. Nothing is written when no register can be determined.
    """

    # only the residue numbers are needed here
    _, res_nums = get_CA_coordinates(pdb_file)
    regular_regions = extract_regular_regions(barrel_topology, pdb_file, marker = marker, add_to_pdb = False, size_threshold = 0)

    register = find_motif_register(regular_regions, n)
    if register is None:
        # fewer than two regular regions: there is nothing to cut
        return

    hairpins = [regular_regions[i:i+n] for i in range(register, len(regular_regions), n)]

    # os.path keeps the folder relative when pdb_file has no directory part
    # (joining with '/' by hand would have pointed at the filesystem root)
    outfolder = os.path.join(os.path.dirname(pdb_file), outfolder)
    # exist_ok avoids a crash when parallel jobs create the folder at the same time
    os.makedirs(outfolder, exist_ok = True)

    # remove the '.pdb' suffix only; str.strip('.pdb') would also eat any
    # leading/trailing '.', 'p', 'd' or 'b' characters of the name itself
    basename = os.path.basename(pdb_file)
    if basename.endswith('.pdb'):
        basename = basename[:-len('.pdb')]

    # residue window, output file and collected lines of every complete unit
    windows = []
    for i, hairpin in enumerate(hairpins):
        if len(hairpin) != n:
            # trailing, incomplete unit
            continue

        start = res_nums[hairpin[0][0]]
        # last region of the unit, so that n != 2 is handled as well
        end = res_nums[hairpin[-1][1]]
        outfile = os.path.join(outfolder, '{}_fragment{}.pdb'.format(basename, i))
        windows.append((start-1, end+1, outfile, []))

    # NOTE(review): records are selected by residue number only, the chain
    # column is not compared with 'chains' -- confirm this is intended for
    # files holding more than one chain
    with open(pdb_file, 'r') as inpdb:
        for line in inpdb:
            if line.startswith('ATOM') or (line.startswith('HETATM') and 'MSE' in line):
                resnum = int(line[22:26].strip())

                for first, last, _, fragment_lines in windows:
                    if first <= resnum <= last:
                        fragment_lines.append(line)

    # each file is opened once instead of once per atom line; append mode is
    # kept from the original, so re-running on the same input adds to existing files
    for _, _, outfile, fragment_lines in windows:
        if fragment_lines:
            with open(outfile, 'a+') as outpdb:
                outpdb.writelines(fragment_lines)


def find_motif_register(regular_regions, n):
    """Return the index of the regular region at which the units should start.

    It finds the register that allows for the shorter linker between units:
    the linkers (start of the next region minus end of the current one)
    between the first n+1 regions are computed and the position of the
    longest one is returned (the first one in case of a tie).

    Returns None when there are fewer than two regions, i.e. no linker.
    """

    starting_motifs = regular_regions[:n+1]
    linkers = [following[0]-current[1] for current, following in zip(starting_motifs, starting_motifs[1:])]

    if not linkers:
        return None

    return linkers.index(max(linkers))

## 7. BASIC MATH-RELATED FUNCTIONS
## 7.1.
Graphs @@ -2250,7 +2298,7 @@ def plot_parameter(x_col, y_col, df, saveto, fit_line = False): def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, max_loop_size = 0): - job_number, input_mode, complexes_only, multimer_only, delete, distance_threshold, in_queue = arguments + job_number, input_mode, complexes_only, multimer_only, delete, distance_threshold, extract_hairpins, in_queue = arguments # create output files to save the sequences outfasta = "full_sequences_matched_pdbs_job{}.fasta".format(job_number) @@ -2466,6 +2514,12 @@ def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, max_ print('... ... There is a barrel in {}_{} but it does not cross the membrane\n'.format(pdbID, chainID)) if delete: os.system("rm {}*".format(pdb_file[:-4])) + + if extract_hairpins and os.path.isfile(barrel_struct): + save_hairpins(pdb_sequence, pdb_file, barrel_topology, chains=chains, outfolder='hairpins') + + + else: print('... ... Not able to detect barrel topology for {}_{}\n'.format(pdbID, chainID)) if delete: @@ -2477,6 +2531,10 @@ def run_barros(arguments, offset = 1, step = 2, local_angle_threshold = 25, max_ outbb.write('>NaN_TM_BARREL_{}_{}_{}\n'.format(protein_type, pdbID, chainID)) outbb.write('{}\n'.format(pdb_sequence)) + if extract_hairpins: + save_hairpins(pdb_sequence, pdb_file, seqstruct, marker='E', chains=chains, outfolder='hairpins') + + else: print(" ... ... pdbID '{}_{}' impossible to get".format(pdbID, chainID))