diff --git a/translate2modelcif.py b/translate2modelcif.py index df86631e37023797bcfb18f79d4e0cc1a49c979e..e76380b47f69cd8c3f9d662cda40d75b2a9ffeef 100644 --- a/translate2modelcif.py +++ b/translate2modelcif.py @@ -553,12 +553,22 @@ def _get_sequence(chn): def _check_sequence(up_ac, sequence): """Verify sequence to only contain standard olc.""" - for res in sequence: + ns_aa_pos = [] # positions of non-standard amino acids + for i, res in enumerate(sequence): if res not in "ACDEFGHIKLMNPQRSTVWY": + if res == "U": + print( + f"WARNING: Selenocysteine found at position {i+1} of " + f"entry '{up_ac}', this residue may be missing in the " + "model." + ) + ns_aa_pos.append(i) + continue raise RuntimeError( "Non-standard aa found in UniProtKB sequence " - + f"for entry '{up_ac}': {res}" + + f"for entry '{up_ac}': {res}, position {i+1}" ) + return ns_aa_pos def _get_n_parse_up_entry(up_ac, up_url): @@ -640,7 +650,7 @@ def _get_n_parse_up_entry(up_ac, up_url): + f"UniProtKB entry '{up_ac}': {data['up_seqlen']} != " + f"{len(data['up_sequence'])}" ) - _check_sequence(data["up_ac"], data["up_sequence"]) + data["up_ns_aa"] = _check_sequence(data["up_ac"], data["up_sequence"]) if "up_id" not in data: _abort_msg(f"No ID found for UniProtKB entry '{up_ac}'.") @@ -668,10 +678,24 @@ def _fetch_unisave_entry(up_ac, version): ) +def _cmp_sequences(mdl, upkb, ns_aa_pos): + """Compare sequence while paying attention on non-standard amino acids.""" + # We add a U to the sequence when necessary. AF2 does not model it. The PDB + # has selenocysteine as canonical aa, see PDB entry 7Z0T. + for pos in ns_aa_pos: + if mdl[pos] != "-": + _abort_msg( + f"Position {pos+1} of non-canonical amino acid should be " + "a gap!" + ) + mdl = mdl[0:pos] + "U" + mdl[pos+1:] + return mdl == upkb + + def _get_upkb_for_sequence(sqe, up_ac): """Get UniProtKB entry data for given sequence.""" up_data = _fetch_upkb_entry(up_ac) - while sqe != up_data["up_sequence"]: + while not _cmp_sequences(sqe, up_data["up_sequence"], up_data["up_ns_aa"]): if up_data["up_entry_version"] > 1: up_data = _fetch_unisave_entry( up_ac, up_data["up_entry_version"] - 1 @@ -732,7 +756,10 @@ def _get_modelcif_entities(target_ents, source, asym_units, system): """Create ModelCIF entities and asymmetric units.""" for cif_ent in target_ents: mdlcif_ent = modelcif.Entity( - cif_ent["pdb_sequence"], + # Use up_sequence above pdb_sequence as it contains no gaps but + # otherwise is equal to the pdb_sequence. Gaps in pdb_sequence + # originate from missing residues in the model. + cif_ent["up_sequence"], description=cif_ent["description"], source=source, references=[ @@ -1111,3 +1138,5 @@ def _main(): if __name__ == "__main__": _main() + +# LocalWords: aa pdb