diff --git a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py index df86631e37023797bcfb18f79d4e0cc1a49c979e..4f5dbb70c4e25a5d7c4264c2e27a8a97b25833ab 100755 --- a/projects/human-heterodimers-w-crosslinks/translate2modelcif.py +++ b/projects/human-heterodimers-w-crosslinks/translate2modelcif.py @@ -228,6 +228,11 @@ def _abort_msg(msg, exit_code=1): sys.exit(exit_code) +def _warn_msg(msg): + """Write a warning message to stdout.""" + print(f"WARNING: {msg}") + + def _check_file(file_path): """Make sure a file exists and is actually a file.""" if not os.path.exists(file_path): @@ -292,9 +297,8 @@ def _parse_colabfold_config(cnfg_file): use_mmseqs = False use_msa = False elif cf_config["msa_mode"] == "custom": - print( - "WARNING: Custom MSA mode used. Not clear from config what to do " - + "here!" + _warn_msg( + "Custom MSA mode used. Not clear from config what to do here!" ) seq_dbs = [] use_mmseqs = False @@ -336,9 +340,9 @@ def _parse_colabfold_config(cnfg_file): else: mdl_description += ", without model relaxation" if cf_config["use_templates"]: - print( - "WARNING: ColabFold may use PDB70 or custom templates. " - "Not clear from config!" + _warn_msg( + "ColabFold may use PDB70 or custom templates. Not clear " + + "from config!" ) mdl_description += ", using templates" else: @@ -553,12 +557,22 @@ def _get_sequence(chn): def _check_sequence(up_ac, sequence): """Verify sequence to only contain standard olc.""" - for res in sequence: + ns_aa_pos = [] # positions of non-standard amino acids + for i, res in enumerate(sequence): if res not in "ACDEFGHIKLMNPQRSTVWY": + if res == "U": + _warn_msg( + f"Selenocysteine found at position {i+1} of entry " + + f"'{up_ac}', this residue may be missing in the " + + "model." + ) + ns_aa_pos.append(i) + continue raise RuntimeError( "Non-standard aa found in UniProtKB sequence " - + f"for entry '{up_ac}': {res}" + + f"for entry '{up_ac}': {res}, position {i+1}" ) + return ns_aa_pos def _get_n_parse_up_entry(up_ac, up_url): @@ -626,7 +640,11 @@ def _get_n_parse_up_entry(up_ac, up_url): data["up_isoform"] = None if "up_gn" not in data: - _abort_msg(f"No gene name found for UniProtKB entry '{up_ac}'.") + _warn_msg( + f"No gene name found for UniProtKB entry '{up_ac}', using " + + "UniProtKB AC instead." + ) + data["up_gn"] = up_ac if "up_last_mod" not in data: _abort_msg(f"No sequence version found for UniProtKB entry '{up_ac}'.") if "up_crc64" not in data: @@ -640,7 +658,7 @@ def _get_n_parse_up_entry(up_ac, up_url): + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != " + f"{len(data['up_sequence'])}" ) - _check_sequence(data["up_ac"], data["up_sequence"]) + data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"]) if "up_id" not in data: _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.") @@ -668,10 +686,24 @@ def _fetch_unisave_entry(up_ac, version): ) +def _cmp_sequences(mdl, upkb, ns_aa_pos): + """Compare sequence while paying attention on non-standard amino acids.""" + # We add a U to the sequence when necessary. AF2 does not model it. The PDB + # has selenocysteine as canonical aa, see PDB entry 7Z0T. + for pos in ns_aa_pos: + if mdl[pos] != "-": + _abort_msg( + f"Position {pos+1} of non-canonical amino acid should be " + "a gap!" + ) + mdl = mdl[0:pos] + "U" + mdl[pos + 1 :] + return mdl == upkb + + def _get_upkb_for_sequence(sqe, up_ac): """Get UniProtKB entry data for given sequence.""" up_data = _fetch_upkb_entry(up_ac) - while sqe != up_data["up_sequence"]: + while not _cmp_sequences(sqe, up_data["up_sequence"], up_data["up_ns_aa"]): if up_data["up_entry_version"] > 1: up_data = _fetch_unisave_entry( up_ac, up_data["up_entry_version"] - 1 @@ -732,7 +764,10 @@ def _get_modelcif_entities(target_ents, source, asym_units, system): """Create ModelCIF entities and asymmetric units.""" for cif_ent in target_ents: mdlcif_ent = modelcif.Entity( - cif_ent["pdb_sequence"], + # Use up_sequence above pdb_sequence as it contains no gaps but + # otherwise is equal to the pdb_sequence. Gaps in pdb_sequence + # originate from missing residues in the model. + cif_ent["up_sequence"], description=cif_ent["description"], source=source, references=[ @@ -1111,3 +1146,5 @@ def _main(): if __name__ == "__main__": _main() + +# LocalWords: aa pdb