diff --git a/projects/2024-12-ma-denv/example_index.csv.gz b/projects/2024-12-ma-denv/example_index.csv.gz new file mode 100755 index 0000000000000000000000000000000000000000..23d5092b7f3babf8a77f1c6d7fa886c7adaed01c Binary files /dev/null and b/projects/2024-12-ma-denv/example_index.csv.gz differ diff --git a/projects/2024-12-ma-denv/example_result.cif.gz b/projects/2024-12-ma-denv/example_result.cif.gz new file mode 100644 index 0000000000000000000000000000000000000000..1d078534efb68ebe6516ac4f3f61fe0e7a050c6a Binary files /dev/null and b/projects/2024-12-ma-denv/example_result.cif.gz differ diff --git a/projects/2024-12-ma-denv/example_swissmodelcif.cif.gz b/projects/2024-12-ma-denv/example_swissmodelcif.cif.gz new file mode 100755 index 0000000000000000000000000000000000000000..f76354699f4fe37b517a5ffe546a363b1071f9de Binary files /dev/null and b/projects/2024-12-ma-denv/example_swissmodelcif.cif.gz differ diff --git a/projects/2024-12-ma-denv/example_unp_data.json.gz b/projects/2024-12-ma-denv/example_unp_data.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..35af190837c95979968b3e726939782ce26d4d9b Binary files /dev/null and b/projects/2024-12-ma-denv/example_unp_data.json.gz differ diff --git a/projects/2024-12-ma-denv/macif.py b/projects/2024-12-ma-denv/macif.py new file mode 100644 index 0000000000000000000000000000000000000000..da5b9c46cd2e8a773d717bb46fc38a23c6571e14 --- /dev/null +++ b/projects/2024-12-ma-denv/macif.py @@ -0,0 +1,186 @@ +"""mmCIF editing for ModelArchive depositions. + +Supposed to look like gemmi.cif but with some convenience on top. Not meant to +show high-performance but help preparing ModelCIF files for a MA deposition. 
+Use for 'one time' jobs, not as frequently run tool in a service/ pipeline.""" + +import gzip + +# For whatever reason, 'no-name-in-module' can not be silenced by config atm +# pylint: disable=no-name-in-module +from gemmi import cif + +# pylint: enable=no-name-in-module +import gemmi + + +def _gemmi_quote(value): + """Quote string values when necessary.""" + if ( + isinstance(value, str) + and " " in value + and not (value.startswith("'") and value.endswith("'")) + and not (value.startswith('"') and value.endswith('"')) + ): + return cif.quote(value) + return value + + +class MABlock: + """gemmi.cif wrapper that skips reading/ documents and jumps right into + gemmi.cif.Block objects. You have all the gemmi.cif.Block functionality + available plus our own convenience functions on top.""" + + def __init__(self, model_data): + """Initialise a single Block from a file.""" + self.source = model_data + self.doc = cif.read(model_data) + self.block = self.doc.sole_block() + + self._targets = None + self._polymer_targets = None + + def __getattr__(self, name): + """If an attribute is not found, try self.block before exception.""" + # The catch here is: when asking for self.foo, + # self.__getattribute__(self, "foo") is executed first. If "foo" is + # not found in self, self.__getattr__(self, "foo") is called. So here + # we already know, that "foo" is not there and we can check the + # original block. + if hasattr(self.block, name): + return getattr(self.block, name) + raise AttributeError( + f"'{type(self).__name__}' object has no attribute '{name}'" + ) + + @property + def targets(self): + """Info about targets.""" + if self._targets is not None: + return self._targets + self._targets = {} + table = self.find("_ma_target_entity.", ["entity_id"]) + for row in table: + if row["entity_id"] in self._targets: + raise RuntimeError( + f"Target with entity_id '{row['entity_id']}' is duplicated." 
+ ) + self._targets[row["entity_id"]] = { + "entity_id": row["entity_id"], # makes life easier for singles + "sequence": self.get_sequence(row["entity_id"]), + } + table = self.find("_entity.", ["id", "type"]) + for row in table: + self._targets[row["id"]]["type"] = row["type"] + + return self._targets + + @property + def polymer_targets(self): + """Only targets of entity type 'polymer'.""" + if self._polymer_targets is not None: + return self._polymer_targets + self._polymer_targets = [] + for target in self.targets.values(): + if target["type"] == "polymer": + self._polymer_targets.append(target) + + return self._polymer_targets + + def find(self, name, columns): + """Get a table with defined columns. Throws an exception if table is not + found.""" + table = self.block.find(name, columns) + if len(table) == 0: + raise RuntimeError( + f"""Table '{name}' with columns '{"', '".join(columns)}' """ + + "not found." + ) + + return table + + def get_sequence(self, entity): + """Get the sequence of a 'polymer' entity. `entity` is the numeric ID + of the entity. + Reading sequences is inefficient atm, it reads the whole table for + every sequence w/o any caching.""" + table = self.find("_entity_poly_seq.", ["entity_id", "num", "mon_id"]) + sequence = "" + num = 0 + for row in table: + if row["entity_id"] != entity: + continue + num += 1 + assert int(row["num"]) == num + sequence += gemmi.find_tabulated_residue( + row["mon_id"] + ).one_letter_code + + return sequence + + def write_file(self, filename, compress=False, style=cif.Style.Simple): + """Write ModelCIF file to disk, compress upon request. 
+ Will compress anyways if file ends with '.gz'.""" + if compress or filename.endswith(".gz"): + if not filename.endswith(".gz"): + filename += ".gz" + with gzip.open(filename, mode="wt", compresslevel=9) as gfh: + gfh.write(self.doc.as_string(style)) + else: + self.doc.write_file(filename, style) + + def add_to_category(self, category, match=None, **kwargs): + """Add item values to a category. + Keyword arguments are reserved for item names.""" + if category[0] != "_": + category = "_" + category + if category[-1] != ".": + category += "." + items = list(kwargs.keys()) + row = None + if match is not None: + table = self.find(category, items + [match[0]]) + for row in table: + if row[match[0]] == match[1]: + break + if row is None: + raise RuntimeError( + f"No item {match[0]}=={match[1]} found in category " + + f"{category}." + ) + else: + table = self.find(category, items) + assert len(table) == 1 + row = table[0] + for itm, val in kwargs.items(): + if row[itm] not in [".", "?"]: + print( + f" replacing '{cif.as_string(row[itm])}' with " + + f"'{val}' ({itm})" + ) + row[itm] = _gemmi_quote(val) + + def add_category(self, category, after=None, **kwargs): + """Add a new category to the block with only 1 set of values, 1 row + thinking of categories as tables. kwargs are reserved for item/ value + pairs. after is a special keyword parameter to locate the new category + inside the block.""" + if not category.startswith("_"): + category = "_" + category + # handle quoting + for values in kwargs.values(): + for i, val in enumerate(values): + values[i] = _gemmi_quote(val) + self.block.set_mmcif_category(category, kwargs, raw=True) + + if after is None: + return + if not after.startswith("_"): + after = "_" + after + if not after.endswith("."): + after += "." + table = self.block.find_mmcif_category(after) + idx = self.block.get_index(table.tags[-1]) + # be careful with move_item: loops are 1 item and move as a whole, + # single line values move per item/ value pair. 
+ self.block.move_item(-1, idx + 1) diff --git a/projects/2024-12-ma-denv/translate2modelcif.py b/projects/2024-12-ma-denv/translate2modelcif.py index aa3a49baf05b6b4f5bccdde84ea9563b86eebad6..9fca2a7d8c4ee2ee5c5adfc088f3511170d7b0a2 100644 --- a/projects/2024-12-ma-denv/translate2modelcif.py +++ b/projects/2024-12-ma-denv/translate2modelcif.py @@ -3,30 +3,26 @@ """Check & extend SWISS-MODEL models for ma-denv Example for running: -ToDo: Example call - -ToDo: Update expected output once script is done -Expected output in ./modelcif for example above: -- ma-taas-0272.cif as ModelCIF file -- ma-taas-0272.zip with accompanying data -- ma-taas-0272-image.png as image to use in ModelArchive -- ma-taas-0272-issues.json listing issues for that conversion (if any) +python translate2modelcif.py --unp-json example_unp_data.json \ + example_swissmodelcif.cif.gz \ + example_index.csv \ + done + +This call uses the example data from this directory. You may need to decompress +some of the files. The resulting './done/example_swissmodelcif.cif.gz' should +be identical to './example_result.cif.gz'. + +Expected output in ./done for example above: +- example_swissmodelcif.cif.gz as ModelCIF file with extended annotation """ from datetime import date import argparse import csv -import gzip import os import sys -# For whatever reason, 'no-name-in-module' can not be silenced by config atm -# pylint: disable=no-name-in-module -from gemmi import cif - -# pylint: enable=no-name-in-module -import gemmi - +from macif import MABlock from uniprotkb import UniProtKBEntryCache @@ -84,7 +80,6 @@ def _parse_args(): metavar="<INPUT MODELCIF FILE>", help="Path to a SWISS-MODEL ModelCIF file provided by depositors.", ) - # ToDo: fill '?' 
parser.add_argument( "model_info_csv", type=str, @@ -183,7 +178,7 @@ def _update_modelcif(mdl_file, mdl_info, unp_json_file, out_dir, compress): Caution: This is for updates BEFORE deposition, this update does not do a CIF-style revision history for you!""" - block = _MABlock(mdl_file) + block = MABlock(mdl_file) assert len(block.polymer_targets) == 1 target = block.polymer_targets[0] unp = UniProtKBEntryCache(unp_json_file) @@ -253,178 +248,6 @@ def _update_modelcif(mdl_file, mdl_info, unp_json_file, out_dir, compress): ) -################################################################################ -# FUNCTIONALITY THAT MAY MOVE INTO ITS OWN MODULE -################################################################################ - - -class _MABlock: - """gemmi.cif wrapper that skips reading/ documents and jumps right into - gemmi.cif.Block objects. You have all the gemmi.cif.Block functionality - available plus our own convenience functions on top.""" - - def __init__(self, model_data): - """Initialise a single Block from a file.""" - self.source = model_data - self.doc = cif.read(model_data) - self.block = self.doc.sole_block() - - self._targets = None - self._polymer_targets = None - - def __getattr__(self, name): - """If an attribute is not found, try self.block before exception.""" - # The catch here is: when asking for self.foo, - # self.__getattribute__(self, "foo") is executed first. If "foo" is - # not found in self, self.__getattr__(self, "foo") is called. So here - # we already know, that "foo" is not there and we can check the - # original block. 
- if hasattr(self.block, name): - return getattr(self.block, name) - raise AttributeError( - f"'{type(self).__name__}' object has no attribute '{name}'" - ) - - @property - def targets(self): - """Info about targets.""" - if self._targets is not None: - return self._targets - self._targets = {} - table = self.find("_ma_target_entity.", ["entity_id"]) - for row in table: - if row["entity_id"] in self._targets: - raise RuntimeError( - f"Target with entity_id '{row['entity_id']}' is duplicated." - ) - self._targets[row["entity_id"]] = { - "entity_id": row["entity_id"], # makes live easier for singles - "sequence": self.get_sequence(row["entity_id"]), - } - table = self.find("_entity.", ["id", "type"]) - for row in table: - self._targets[row["id"]]["type"] = row["type"] - - return self._targets - - @property - def polymer_targets(self): - """Only targets of entity type 'polymer'.""" - if self._polymer_targets is not None: - return self._polymer_targets - self._polymer_targets = [] - for target in self.targets.values(): - if target["type"] == "polymer": - self._polymer_targets.append(target) - - return self._polymer_targets - - def find(self, name, columns): - """Get a table with defined colums. Throws an exception if table is not - found.""" - table = self.block.find(name, columns) - if len(table) == 0: - raise RuntimeError( - f"""Table '{name}' with columns '{"', '".join(columns)}' """ - + "not found." - ) - - return table - - def get_sequence(self, entity): - """Get the sequence of a 'polymer' entity. `entity` is the numeric ID - of the entity. 
- Reading sequences is inefficient atm, it reads the whole table for - every sequence w/o any caching.""" - table = self.find("_entity_poly_seq.", ["entity_id", "num", "mon_id"]) - sequence = "" - num = 0 - for row in table: - if row["entity_id"] != entity: - continue - num += 1 - assert int(row["num"]) == num - sequence += gemmi.find_tabulated_residue( - row["mon_id"] - ).one_letter_code - - return sequence - - def write_file(self, filename, compress=False, style=cif.Style.Simple): - """Write ModelCIF file to disk, compress upon request. - Will compress anyways if file ends with '.gz'.""" - if compress or filename.endswith(".gz"): - if not filename.endswith(".gz"): - filename += ".gz" - with gzip.open(filename, mode="wt", compresslevel=9) as gfh: - gfh.write(self.doc.as_string(style)) - else: - self.doc.write_file(filename, style) - - def add_to_category(self, category, match=None, **kwargs): - """Add item values to a category. - Keyword arguments are reserved for item names.""" - if category[0] != "_": - category = "_" + category - if category[-1] != ".": - category += "." - items = list(kwargs.keys()) - row = None - if match is not None: - table = self.find(category, items + [match[0]]) - for row in table: - if row[match[0]] == match[1]: - break - if row is None: - raise RuntimeError( - f"No item {match[0]}=={match[1]} found in category " - + f"{category}." - ) - else: - table = self.find(category, items) - assert len(table) == 1 - row = table[0] - for itm, val in kwargs.items(): - if row[itm] not in [".", "?"]: - print( - f" replacing '{cif.as_string(row[itm])}' with " - + f"'{val}' ({itm})" - ) - # ToDo: handle quoting the same way as for add_category - row[itm] = cif.quote(val) if " " in val else val - - def add_category(self, category, after=None, **kwargs): - """Add a new category to the block with only 1 set of values, 1 row - thinking of categories as tables. kwargs are reserved for item/ value - pairs. 
after is a special keyword parameter to locate the new category - inside the block.""" - if not category.startswith("_"): - category = "_" + category - # handle quoting - for values in kwargs.values(): - for i, val in enumerate(values): - if ( - isinstance(val, str) - and " " in val - and not (val.startswith("'") and val.endswith("'")) - and not (val.startswith('"') and val.endswith('"')) - ): - values[i] = cif.quote(val) - self.block.set_mmcif_category(category, kwargs, raw=True) - - if after is None: - return - if not after.startswith("_"): - after = "_" + after - if not after.endswith("."): - after += "." - table = self.block.find_mmcif_category(after) - idx = self.block.get_index(table.tags[-1]) - # be careful with move_item: loops are 1 item and move as a whole, - # single line values move per item/ value par. - self.block.move_item(-1, idx + 1) - - ################################################################################