In [63]:
# fetch all functions from the script for testing (__name__ hack to avoid executing main)
__name__ = "test"
execfile("./scripts/translate2modelcif.py")
_main()
PROTEINS ONLY in fasta_file: ['c104827_g1_i1', 'c104894_g1_i2']
Working on ./PKG Cut...
  translating c103531_g3_i1_unrelaxed_rank_1_model_3...
    preparing data... (0.02s)
    generating ModelCIF objects... (0.00s)
    processing QA scores... (0.20s)
    write to disk... (1.15s)
  ... done with c103531_g3_i1_unrelaxed_rank_1_model_3 (1.37s).
  translating c103545_g3_i2_unrelaxed_rank_1_model_3...
    preparing data... (0.08s)
    generating ModelCIF objects... (0.00s)
    processing QA scores... (5.47s)
    write to disk... (19.48s)
  ... done with c103545_g3_i2_unrelaxed_rank_1_model_3 (25.13s).
... done with ./PKG Cut.
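
Note: the __name__ hack works because the script presumably only calls _main() behind the standard entry-point guard, roughly as sketched below (the actual guard lives in translate2modelcif.py):

In [ ]:
# assumed entry-point guard in translate2modelcif.py; with __name__ set to
# "test" before execfile, it does not fire and only the definitions are loaded
if __name__ == "__main__":
    _main()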
In [3]:
    # hardcoded file paths
    cnfg_file = "./PKG/config.json"
    metadata_file = "./PKG/model_archive_metadata.csv"
    fasta_file = "./PKG/Spongilla_lacustris_translated_proteome_fixed.fasta"
    model_dir = "./PKG Cut"
    # model_dir = "./PKG-all"
    out_dir = "./modelcif"
    compress = False
    #

    # parse/fetch global data
    config_data = _parse_colabfold_config(cnfg_file)
    metadata = _get_metadata(metadata_file)
    seq_dict = _get_sequences(fasta_file, metadata)
    if compress:
        cifext = "cif.gz"
    else:
        cifext = "cif"
    # fetch a single randomly chosen UniProt entry for tax info
    upkb_data = _fetch_upkb_entry("P42690")
    tax_info = {k: v for k, v in upkb_data.items() \
                if k in ["up_organism", "up_ncbi_taxid"]}
PROTEINS ONLY in fasta_file: ['c104827_g1_i1', 'c104894_g1_i2']
In [4]:
fle = "c103531_g3_i1_unrelaxed_rank_1_model_3.pdb"
In [5]:
        file_prfx, mdl_name = _check_model_extra_files_present(model_dir,
                                                               fle)
        fle = os.path.join(model_dir, fle)
        mdlcf_json = _create_json(config_data)
        mdlcf_json["tax_info"] = tax_info
        # mdl_name = [TITLE]_unrelaxed_rank_X_model_Y.pdb
        mdl_name_parts = mdl_name.split('_')
        assert len(mdl_name_parts) == 8
        assert int(mdl_name_parts[5]) == 1 # rank 1 only
        mdlcf_json["mdl_num"] = int(mdl_name_parts[7])
        mdlcf_json["mdl_title"] = '_'.join(mdl_name_parts[:3])
In [6]:
        ost_ent = _create_model_json(mdlcf_json, fle, seq_dict, metadata)

        # read quality scores from JSON file
        _get_scores(mdlcf_json, file_prfx)
In [7]:
        _store_as_modelcif(mdlcf_json, ost_ent, out_dir, mdl_name, compress)
    generating ModelCIF objects... (0.00s)
    processing QA scores... (0.34s)
    write to disk... (1.04s)
In [8]:
scores = json.load(open("./PKG Cut/c103545_g3_i2_unrelaxed_rank_1_model_3_scores.json"))
print(scores.keys())
dict_keys(['max_pae', 'pae', 'plddt', 'ptm'])
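
As a quick sanity check of the score layout (a sketch assuming the usual ColabFold JSON conventions: plddt per residue, pae per residue pair):

In [ ]:
# inspect score shapes; the keys are confirmed above, the shapes are assumed
import numpy as np
pae = np.array(scores["pae"])      # expected (L, L) matrix
plddt = np.array(scores["plddt"])  # expected length-L vector
print(pae.shape, plddt.shape, scores["max_pae"], scores["ptm"])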
In [9]:
# check files
pdb_files = open("./PKG/files_PDB.txt").readlines()
json_files = open("./PKG/files_JSON.txt").readlines()
assert len(pdb_files) == len(json_files)

mdl_titles = []
for pdb_file, json_file in zip(sorted(pdb_files), sorted(json_files)):
    splitfile = os.path.split(pdb_file.strip())
    assert len(splitfile) == 2
    if splitfile[1]:
        mdl_name, mdl_ext = os.path.splitext(splitfile[1])
        assert mdl_ext == ".pdb"
        splitfile_j = os.path.split(json_file.strip())
        assert len(splitfile_j) == 2
        assert splitfile_j[1] == f"{mdl_name}_scores.json"
        mdl_name_parts = mdl_name.split('_')
        if len(mdl_name_parts) != 8 or mdl_name_parts[5] != '1':
            print("WEIRD NAME", mdl_name)
        mdl_title = '_'.join(mdl_name_parts[:3])
        mdl_titles.append(mdl_title)
        
assert len(set(mdl_titles)) == len(mdl_titles)
WEIRD NAME c100000_g1_i2_unrelaxed_rank_c100000_g1_i2_model_3
WEIRD NAME c100001_g1_i1_unrelaxed_rank_1_model_c100001_g1_i1
In [10]:
# all in metadata?
only_in_metadata = set(metadata.index) - set(mdl_titles)
if only_in_metadata:
    print("PROTEINS ONLY in metadata:", sorted(only_in_metadata))
only_in_seqs = set(mdl_titles) - set(metadata.index)
assert len(only_in_seqs) == 0
len(mdl_titles), len(metadata), len(only_in_metadata)
PROTEINS ONLY in metadata: ['c104658_g3_i5', 'c104704_g1_i1', 'c104839_g1_i1', 'c104839_g1_i3', 'c104872_g1_i1', 'c104894_g1_i3', 'c104922_g1_i2', 'c104954_g1_i2', 'c104956_g1_i1', 'c104958_g1_i1', 'c104973_g1_i2']
Out[10]:
(41932, 41943, 11)
In [11]:
# check files that were converted (input is the run's stdout, cut down to only the "translating..." blocks)
log_lines = open("./FromWork/docker_out_clean.txt").readlines()
In [12]:
# the assertions in here should also catch any deviations from the expected log format...
idx = 0
timings = dict()
while idx < len(log_lines):
    l = log_lines[idx].strip()
    if "already done..." in l:
        idx += 1
        continue
    assert l.startswith("translating")
    mdl_title = "_".join(l.split()[1].split('_')[:3])
    l = log_lines[idx + 1].strip()
    assert l.startswith("preparing data")
    assert l.endswith("s)")
    t_prep = float(l.split()[-1][1:-2])
    l = log_lines[idx + 2].strip()
    assert l.startswith("generating ModelCIF objects")
    assert l.endswith("s)")
    t_cif = float(l.split()[-1][1:-2])
    l = log_lines[idx + 3].strip()
    assert l.startswith("processing QA scores")
    assert l.endswith("s)")
    t_qa = float(l.split()[-1][1:-2])
    l = log_lines[idx + 4].strip()
    assert l.startswith("write to disk")
    assert l.endswith("s)")
    t_write = float(l.split()[-1][1:-2])
    l = log_lines[idx + 5].strip()
    assert l.startswith("... done with")
    assert l.endswith("s).")
    t_all = float(l.split()[-1][1:-3])
    timings[mdl_title] = {
        "seq_len": len(seq_dict[mdl_title]),
        "t_prep": t_prep,
        "t_cif": t_cif,
        "t_qa": t_qa,
        "t_write": t_write,
        "t_all": t_all
    }
    idx += 6
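
For reference, the parser above consumes 6-line blocks exactly like the ones printed by the translation run; a minimal self-test on a fabricated block (model title and timings are made up):

In [ ]:
# hypothetical 6-line log block matching the expected format
demo_log = [
    "  translating c000000_g0_i0_unrelaxed_rank_1_model_1...\n",
    "    preparing data... (0.02s)\n",
    "    generating ModelCIF objects... (0.00s)\n",
    "    processing QA scores... (0.20s)\n",
    "    write to disk... (1.15s)\n",
    "  ... done with c000000_g0_i0_unrelaxed_rank_1_model_1 (1.37s).\n",
]
# the same index arithmetic as above extracts title and total time
print("_".join(demo_log[0].strip().split()[1].split('_')[:3]))  # c000000_g0_i0
print(float(demo_log[5].strip().split()[-1][1:-3]))             # 1.37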
In [13]:
print(f"DONE {len(timings)} models")
assert len(set(timings) - set(mdl_titles)) == 0
missing_ones = set(mdl_titles) - set(timings)
print(f"MISSING {len(missing_ones)} models")
DONE 37738 models
MISSING 4194 models
In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
x = []
y = []
y2 = []
for v in timings.values():
    x.append(v["seq_len"])
    y.append(v["t_all"])
    y2.append(v["t_write"])
# fit quadratic function (for fun)
model = np.poly1d(np.polyfit(x, y, 2))
print(model)
# plot it
plt.scatter(x, y)
plt.scatter(x, y2)
polyline = np.linspace(1, max(x), 50)
plt.plot(polyline, model(polyline))
plt.xlabel("seq. length")
plt.ylabel("conversion time")
plt.legend(["quadratic fit", "total time", "write time"])
plt.gcf().patch.set_facecolor('white')
print("Average time per model (s):", sum(y) / len(y))
           2
2.106e-05 x - 4.487e-05 x + 0.05244
Average time per model (s): 2.1000026498489452
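
With the fitted coefficients printed above, the quadratic term dominates quickly; a quick sketch evaluating the fit at a few sequence lengths (values follow from the printed fit, not from new measurements):

In [ ]:
# predicted conversion time from the quadratic fit above
for seq_len in (100, 500, 1000):
    print(seq_len, f"{model(seq_len):.2f}s")
# with the printed coefficients this gives roughly 0.26s, 5.30s and 21.07s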
In [16]:
# check results (self-contained version)
from ost import io
fasta_file = "./PKG/Spongilla_lacustris_translated_proteome_fixed.fasta"
out_dir = "./modelcif"
cifext = "cif"
seqs = io.LoadSequenceList(fasta_file)
seq_dict = {'_'.join(s.name.split('_')[:3]): s.string for s in seqs}
for out_fle in sorted(os.listdir(out_dir)):
    if not out_fle.endswith(cifext) or "_local_pairwise_qa" in out_fle:
        continue
    mdl_name = os.path.splitext(out_fle)[0]
    mdl_title = '_'.join(mdl_name.split('_')[:3])
    # check if result can be read and has expected seq.
    ent = io.LoadMMCIF(os.path.join(out_dir, out_fle))
    assert ent.chain_count == 1, f"Bad chain count {mdl_title}"
    ent_seq = "".join(res.one_letter_code for res in ent.residues)
    assert ent_seq == seq_dict[mdl_title], f"Bad seq. {mdl_title}"
    # NOTE: can use only this bit in main loop with os.path.join(out_dir, f"{mdl_name}.{cifext}")
In [17]:
# check ,-separation in metadata
for title, row in metadata.iterrows():
    desc = row.description
    for line in desc.splitlines():
        if "," in line:
            assert line.startswith("CoFFE orthogroups") or line.startswith("CoFFE GO terms"), title
---------------------------------------------------------------------------
AssertionError                            Traceback (most recent call last)
Input In [17], in <cell line: 2>()
      4 for line in desc.splitlines():
      5     if "," in line:
----> 6         assert line.startswith("CoFFE orthogroups") or line.startswith("CoFFE GO terms"), title

AssertionError: c103545_g3_i2
In [18]:
line
Out[18]:
'CoFFE UniProt function: FUNCTION: Involved in hearing and vision as member of the USH2 complex. Necessary for elongation and maintenance of inner and outer hair cell stereocilia in the organ of Corti in the inner ear. Involved in the maintenance of the hair bundle ankle region, which connects stereocilia in cochlear hair cells of the inner ear. In retina photoreceptors, required for the maintenance of periciliary membrane complex that seems to play a role in regulating intracellular protein transport. {ECO:0000250|UniProtKB:Q80VW5}.'
In [19]:
# ok so we cannot blindly assume that commas are bad...

# check longest tokens in metadata
new_desc = dict()
for title, row in metadata.iterrows():
    desc = row.description
    new_lines = []
    for line in desc.splitlines():
        is_loi = line.startswith("CoFFE orthogroups") \
              or line.startswith("CoFFE GO terms") \
              or line.startswith("Emapper orthogroups") \
              or line.startswith("Emapper GO terms") \
              or line.startswith("CoFFE PFAM domains") \
              or line.startswith("Emapper PFAM domains")
        if "," in line and is_loi:
            new_line = line.replace(",", ", ")
        else:
            new_line = line
        token_lengths = [len(token) for token in new_line.split()]
        if token_lengths and max(token_lengths) > 80:
            print(title, new_line)
        new_lines.append(new_line)
    new_desc[title] = "\n".join(new_lines)
c2495_g1_i1 CoFFE UniProt function: FUNCTION: Protein O-mannose kinase that specifically mediates phosphorylation at the 6-position of an O-mannose of the trisaccharide (N-acetylgalactosamine (GalNAc)-beta-1,3-N-acetylglucosamine (GlcNAc)-beta-1,4-mannose) to generate phosphorylated O-mannosyl trisaccharide (N-acetylgalactosamine-beta-1,3-N-acetylglucosamine-beta-1,4-(phosphate-6-)mannose). Phosphorylated O-mannosyl trisaccharide is a carbohydrate structure present in alpha-dystroglycan (DAG1), which is required for binding laminin G-like domain-containing extracellular proteins with high affinity. Only shows kinase activity when the GalNAc-beta-3-GlcNAc-beta-terminus is linked to the 4-position of O-mannose, suggesting that this disaccharide serves as the substrate recognition motif (By similarity). {ECO:0000250}.
In [20]:
# c2495_g1_i1 tested in MA and ok...let's check largest ones
sorted([len(desc), title] for title, desc in new_desc.items())[-5:]
# looks ok too...
Out[20]:
[[19286, 'c90048_g1_i1'],
 [20561, 'c104513_g1_i1'],
 [22685, 'c91485_g1_i1'],
 [27674, 'c95514_g1_i1'],
 [27687, 'c95514_g1_i3']]
In [35]:
# checking multiline details and markdown
from ost import io
import modelcif.reader
file_path = "./modelcif/c103531_g3_i1_unrelaxed_rank_1_model_3.cif"
ent, info = io.LoadMMCIF(file_path, info=True)
with open(file_path) as fh:
    s, = modelcif.reader.read(fh)
print(info.GetStructDetails().model_details[:300])
print('-' * 80)
print(s.model_details[:300])
# NOTE: there may be a bug in OST's multiline parsing in terms of keeping line breaks intact...
# ...and lines are stripped so that we lose the trailing 2 spaces...
Model generated using ColabFold v1.2.0 with AlphaFold producing 5 models with 3 recycles each without model relaxation without templates ranked by pLDDT starting from an MSA from MMseqs2 (UniRef+Environmental)
Emapper preferred name [EggNOG]: -
Emapper description [EggNOG]: -
CoFFE preferred name [E
--------------------------------------------------------------------------------
Model generated using ColabFold v1.2.0 with AlphaFold producing 5 models with 3 recycles each without model relaxation without templates ranked by pLDDT starting from an MSA from MMseqs2 (UniRef+Environmental)

Emapper preferred name [EggNOG]: -
Emapper description [EggNOG]: -
CoFFE preferred name [
In [36]:
import markdown
markdown_test = """isoform ID: c103531_g3_i1

And **fat** and *emphasized* markdown with lists:

- item1
- item2

and [links](https://github.com).

Note that markdown ignores linebreaks
unless two spaces at end  
<- as in line above"""
print(markdown_test)
print('-' * 80)
print(markdown.markdown(markdown_test))
isoform ID: c103531_g3_i1

And **fat** and *emphasized* markdown with lists:

- item1
- item2

and [links](https://github.com).

Note that markdown ignores linebreaks
unless two spaces at end  
<- as in line above
--------------------------------------------------------------------------------
<p>isoform ID: c103531_g3_i1</p>
<p>And <strong>fat</strong> and <em>emphasized</em> markdown with lists:</p>
<ul>
<li>item1</li>
<li>item2</li>
</ul>
<p>and <a href="https://github.com">links</a>.</p>
<p>Note that markdown ignores linebreaks
unless two spaces at end<br />
&lt;- as in line above</p>
In [2]:
# test zip file speed and size
from timeit import default_timer as timer
import os, zipfile, gzip, shutil

def zip_test(compression, level=None):
    zip_file = "test.zip"
    files = ["modelcif/c103531_g3_i1_unrelaxed_rank_1_model_3_local_pairwise_qa.cif"]
    files.append("modelcif/c103545_g3_i2_unrelaxed_rank_1_model_3_local_pairwise_qa.cif")
    pstart = timer()
    if compression == "gzip":
        file_size = 0
        for file in files:
            with open(file, 'rb') as f_in:
                with gzip.open(zip_file, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            file_size += os.path.getsize(zip_file)
    else:
        with zipfile.ZipFile(zip_file, 'w', compression, compresslevel=level) as myzip:
            for file in files:
                myzip.write(file)
        file_size = os.path.getsize(zip_file)
    # for comparison: the full translation time for these files (i.e. from running _main)
    if len(files) == 1:
        ref_time = "1.38s"
    else:
        ref_time = f"{1.38+24.21}s"
    if compression == zipfile.ZIP_STORED:
        label = "none "
    elif compression == zipfile.ZIP_DEFLATED:
        if level:
            label = f"gz({level})"
        else:
            label = "gz   "
    elif compression == zipfile.ZIP_BZIP2:
        if level:
            label = f"bz({level})"
        else:
            label = "bz   "
    elif compression == zipfile.ZIP_LZMA:
        label = "lzma "
    else:
        label = "gzip "
    print(f"- {label}: {timer()-pstart:.2f}s vs {ref_time}, {file_size}")
    os.remove(zip_file)

zip_test(zipfile.ZIP_STORED)
zip_test("gzip")
zip_test(zipfile.ZIP_DEFLATED)
zip_test(zipfile.ZIP_DEFLATED, 9)
zip_test(zipfile.ZIP_BZIP2)
zip_test(zipfile.ZIP_BZIP2, 9)
zip_test(zipfile.ZIP_LZMA)
- none : 3.84s vs 25.59s, 27824253
- gzip : 2.70s vs 25.59s, 5417406
- gz   : 0.87s vs 25.59s, 5389143
- gz(9): 2.74s vs 25.59s, 5417802
- bz   : 1.61s vs 25.59s, 3802066
- bz(9): 1.56s vs 25.59s, 3802066
- lzma : 22.27s vs 25.59s, 2083062

CONCLUSION: lzma is too slow, but bzip2 offers a good compromise between size and speed

In [1]:
# check out pdbx
import os
from timeit import default_timer as timer
from pdbx.reader.PdbxReader import PdbxReader
from pdbx.writer.PdbxWriter import PdbxWriter
test_file = "modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif"
# test_file = "modelcif_MA_test/AF-P38129-F1-model_v3.cif"
# test_file = "modelcif_MA_test/ma-bak-cepc-0001.cif"
# test_file = "modelcif_MA_test/1crn.cif"
test_file_out = "modelcif_MA_test/test.cif"
pstart = timer()
pdbx_data = []
with open(test_file, 'r') as f_in:
    pdbxR = PdbxReader(f_in)
    pdbxR.read(pdbx_data)
with open(test_file_out, 'w') as f_out:
    pdbxW = PdbxWriter(f_out)
    pdbxW.write(pdbx_data)
print(f"{os.path.getsize(test_file)} vs {os.path.getsize(test_file_out)}" \
      f" in {timer()-pstart:.2f}s")
137179 vs 184836 in 0.25s

CONCLUSION: pdbx adds lots of whitespace but doesn't remove anything
ISSUE: quotes around "stop_..." values are removed...

In [2]:
# check out gemmi
import gemmi
pstart = timer()
doc = gemmi.cif.read(test_file)
doc.write_file(test_file_out)
print(f"{os.path.getsize(test_file)} vs {os.path.getsize(test_file_out)}" \
      f" in {timer()-pstart:.2f}s")
137179 vs 137099 in 0.03s

CONCLUSION: much faster than pdbx; space handling is similar to python-modelcif, nothing is removed, and "stop_..." is handled properly

Also note that neither of the two messes up the order of tables and so files are easily comparable!

CONCLUSION: gemmi is good!

In [14]:
# quick test changing data_ block name
doc = gemmi.cif.read(test_file)
doc.sole_block().name = 'test'
doc.write_file(test_file_out)
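
A quick round-trip check that the rename stuck (sketch):

In [ ]:
# re-read the written file and confirm the block name changed
assert gemmi.cif.read(test_file_out).sole_block().name == 'test'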
In [3]:
# parsing pdbx
# see https://mmcif.wwpdb.org/docs/sw-examples/python/html/
# -> all very low level
block = pdbx_data[0]
if block.exists("ma_qa_metric"):
    dc = block.getObj("ma_qa_metric")
    categoryName, attributeNameList, rowList = dc.get()
    print(categoryName)
    print(attributeNameList)
    print(rowList)
ma_qa_metric
['id', 'name', 'description', 'type', 'mode', 'type_other_details', 'software_group_id']
[['1', 'pLDDT', 'Predicted accuracy according to the CA-only lDDT in [0,100]', 'pLDDT', 'global', '.', '.'], ['2', 'pTM', 'Predicted accuracy according to the TM-score score in [0,1]', 'pTM', 'global', '.', '.'], ['3', 'pLDDT', 'Predicted accuracy according to the CA-only lDDT in [0,100]', 'pLDDT', 'local', '.', '.'], ['4', 'PAE', 'Predicted aligned error (in Angstroms)', 'PAE', 'local-pairwise', '.', '.']]
In [4]:
# parsing gemmi
# see https://gemmi.readthedocs.io/en/latest/cif.html#block
block = doc.sole_block()
table = block.find_mmcif_category("_ma_qa_metric.")
if table:
    print(table.get_prefix())
    print([tag for tag in table.tags])
    print([[col for col in row] for row in table])
_ma_qa_metric.
['_ma_qa_metric.id', '_ma_qa_metric.name', '_ma_qa_metric.description', '_ma_qa_metric.type', '_ma_qa_metric.mode', '_ma_qa_metric.type_other_details', '_ma_qa_metric.software_group_id']
[['1', 'pLDDT', "'Predicted accuracy according to the CA-only lDDT in [0,100]'", 'pLDDT', 'global', '.', '.'], ['2', 'pTM', "'Predicted accuracy according to the TM-score score in [0,1]'", 'pTM', 'global', '.', '.'], ['3', 'pLDDT', "'Predicted accuracy according to the CA-only lDDT in [0,100]'", 'pLDDT', 'local', '.', '.'], ['4', 'PAE', "'Predicted aligned error (in Angstroms)'", 'PAE', 'local-pairwise', '.', '.']]
In [15]:
# note: row.str(#) or gemmi.cif.as_string map "." or "?" to "" and strip quotes from strings
if table:
    print([[gemmi.cif.as_string(col) for col in row] for row in table])
print(f'"{gemmi.cif.as_string("?")}"')
[['1', 'pLDDT', 'Predicted accuracy according to the CA-only lDDT in [0,100]', 'pLDDT', 'global', '', ''], ['2', 'pTM', 'Predicted accuracy according to the TM-score score in [0,1]', 'pTM', 'global', '', ''], ['3', 'pLDDT', 'Predicted accuracy according to the CA-only lDDT in [0,100]', 'pLDDT', 'local', '', ''], ['4', 'PAE', 'Predicted aligned error (in Angstroms)', 'PAE', 'local-pairwise', '', '']]
""
In [8]:
# use find with desired columns (incl. optional ones)
table = block.find("_ma_qa_metric.", ["id", "mode", "name", "type", "?meh"])
print(table.has_column(4))
# and look for rows according to first column
print(table.find_row("3"))
# and get dictionary (as is)
print(block.get_mmcif_category("_ma_qa_metric.", raw=True))
# or nicely processed (? and . becoming None)
block.get_mmcif_category("_ma_qa_metric.")
False
<gemmi.cif.Table.Row: 3 local pLDDT pLDDT None>
{'id': ['1', '2', '3', '4'], 'name': ['pLDDT', 'pTM', 'pLDDT', 'PAE'], 'description': ["'Predicted accuracy according to the CA-only lDDT in [0,100]'", "'Predicted accuracy according to the TM-score score in [0,1]'", "'Predicted accuracy according to the CA-only lDDT in [0,100]'", "'Predicted aligned error (in Angstroms)'"], 'type': ['pLDDT', 'pTM', 'pLDDT', 'PAE'], 'mode': ['global', 'global', 'local', 'local-pairwise'], 'type_other_details': ['.', '.', '.', '.'], 'software_group_id': ['.', '.', '.', '.']}
Out[8]:
{'id': ['1', '2', '3', '4'],
 'name': ['pLDDT', 'pTM', 'pLDDT', 'PAE'],
 'description': ['Predicted accuracy according to the CA-only lDDT in [0,100]',
  'Predicted accuracy according to the TM-score score in [0,1]',
  'Predicted accuracy according to the CA-only lDDT in [0,100]',
  'Predicted aligned error (in Angstroms)'],
 'type': ['pLDDT', 'pTM', 'pLDDT', 'PAE'],
 'mode': ['global', 'global', 'local', 'local-pairwise'],
 'type_other_details': [False, False, False, False],
 'software_group_id': [False, False, False, False]}
In [206]:
# some ways to deal with optional and missing (? or .) data
table = block.find("_software.", ["name", "?location", "?version", "?citation_id", "?meh"])
display([[(row.has(idx) and bool(row.str(idx))) for idx in range(len(row))] for row in table])
display([[(row.str(idx) if row.has(idx) else '') for idx in range(len(row))] for row in table])
[[True, True, True, True, False],
 [True, True, False, True, False],
 [True, True, False, True, False]]
[['ColabFold', 'https://github.com/sokrypton/ColabFold', '1.2.0', '1', ''],
 ['MMseqs2', 'https://github.com/soedinglab/mmseqs2', '', '2', ''],
 ['AlphaFold', 'https://github.com/deepmind/alphafold', '', '3', '']]
In [17]:
# note that the above also works for completely missing tables
table = block.find("_meh.", ["name", "?location", "?version", "?citation_id", "?meh"])
display([[(row.has(idx) and bool(row.str(idx))) for idx in range(len(row))] for row in table])
[]
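
The has/str pattern above recurs in all the parsers below; a small convenience wrapper (a sketch only, not used in the production code) that maps optional or missing cells to None:

In [ ]:
# hypothetical helper: rows of a (possibly missing) table as plain dicts
def _rows_as_dicts(block, prefix, cols):
    names = [c.lstrip('?') for c in cols]
    return [
        {name: (row.str(i) if row.has(i) else None)
         for i, name in enumerate(names)}
        for row in block.find(prefix, cols)
    ]

_rows_as_dicts(block, "_software.", ["name", "?location", "?version"])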

ModelCIF -> MA

In [2]:
import gemmi

# define converters for printing or for html (here markdown used for display)
def _make_url(link_text, link_url):
    return f"[{link_text}]({link_url})"
def _make_list(list_top, list_items):
    list_texts = [f"- {item}" for item in list_items]
    list_text = '\n'.join(list_texts)
    if list_texts:
        return(f"{list_top}\n{list_text}")
def _make_multiline_text(lines):
    return '\n'.join(lines)
def _make_paragraph(text):
    return f"{text}\n\n"
In [24]:
# define test files for translations below
test_files = [
    "modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif",
    "modelcif_MA_test/AF-P38129-F1-model_v3.cif",
    "modelcif_MA_test/ma-bak-cepc-0001.cif",
    "modelcif_MA_test/1crn.cif",
    "modelcif_MA_test/Tara_A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1.cif",
    "modelcif_MA_test/Var3D_I6XD65_1046.cif",
    "modelcif_MA_test/Var3D_L7N665_1046.cif",
    "modelcif_MA_test/Var3D_P9WIE5_3767.cif",
    "modelcif_MA_test/ORNL_F0JBU9_9DELT.cif",
]

# open one for quick testing
doc = gemmi.cif.read(test_files[-4])
block = doc.sole_block()
In [18]:
# easy: look for unique model details (same for title; already in ParseCIF)
abstract = block.find_value("_struct.pdbx_model_details")
if abstract:
    print(gemmi.cif.as_string(abstract)[:400])
Model generated using ColabFold v1.2.0 with AlphaFold producing 5 models with 3 recycles each without model relaxation without templates ranked by pLDDT starting from an MSA from MMseqs2 (UniRef+Environmental)

Emapper preferred name [EggNOG]: -
Emapper description [EggNOG]: -
CoFFE preferred name [EggNOG]: PLA2G15
CoFFE description [EggNOG]: Lecithin:cholesterol acyltransferase
CoFFE UniProt func
In [10]:
# check out entity parsing from ParseCIF
# -> CHANGES: asym_ids as list of chains (wasn't used anyway), make align and ...
def _get_chain_entity(block):
    chain_entity = {}

    ma_target_entity = block.find(
        "_ma_target_entity_instance.", ["asym_id", "entity_id"]
    )
    if ma_target_entity:
        for row in ma_target_entity:
            if row["entity_id"] in chain_entity:
                chain_entity[row["entity_id"]]["asym_ids"].append(
                    gemmi.cif.as_string(row["asym_id"])
                )
            else:
                chain_entity[row["entity_id"]] = {
                    "asym_ids": [gemmi.cif.as_string(row["asym_id"])],
                    "ma_target_ref_db_details": [],
                }

        for row in block.find("_entity.", ["id", "pdbx_description"]):
            chain_entity[row["id"]].update(
                {"pdbx_description": gemmi.cif.as_string(row["pdbx_description"])}
            )

        cols = [
            "target_entity_id",
            "db_accession",
            "db_name",
            "?db_name_other_details",
            "?organism_scientific",
            "?seq_db_align_begin",
            "?seq_db_align_end",
        ]
        # NOTE: row[col] doesn't work with '?' in find! Bad crashes happen if you try...
        for row in block.find("_ma_target_ref_db_details.", cols):
            json_obj = {}
            target_entity_id = row[0]  # make sure this stays at idx 0 in cols!
            for idx, cq in enumerate(cols):
                if cq.startswith('?'):
                    col = cq[1:]
                    if not row.has(idx):
                        json_obj[col] = None
                        continue
                else:
                    col = cq
                if col != "target_entity_id":
                    if col.find("seq_db_align") > -1:
                        json_obj[col] = gemmi.cif.as_int(row[idx])
                    else:
                        json_obj[col] = gemmi.cif.as_string(row[idx])
            chain_entity[target_entity_id][
                "ma_target_ref_db_details"
            ].append(json_obj)

        for row in block.find(
            "_entity_poly.",
            ["entity_id", "pdbx_strand_id", "pdbx_seq_one_letter_code"],
        ):
            chain_entity[row["entity_id"]]["olc"] = gemmi.cif.as_string(
                row["pdbx_seq_one_letter_code"]
            ).replace("\n", "")

    return chain_entity
In [14]:
# do entity parsing
def _test_entity_parsing(test_file):
    # get file for testing
    doc = gemmi.cif.read(test_file)
    block = doc.sole_block()
    # reuse (updated!) code from ParseCIF
    chain_entity = _get_chain_entity(block)
    # do it
    item_strings = []
    for ent in chain_entity:
        item_title = f"{chain_entity[ent]['pdbx_description']} " \
                     f"(chains: {', '.join(chain_entity[ent]['asym_ids'])})"
        db_links = [item_title]
        for i, ref in enumerate(chain_entity[ent]["ma_target_ref_db_details"]):
            if ref["db_name"] == "UNP":
                link_text = ref['db_accession']
                link_url = f"https://www.uniprot.org/uniprot/{ref['db_accession']}"
                db_link = f"UniProt: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "OrthoDB":
                link_text = ref['db_accession']
                link_url = f"https://www.orthodb.org/?query={ref['db_accession']}"
                db_link = f"OrthoDB: {_make_url(link_text, link_url)}"
            elif ref["db_name"] == "Other" and ref["db_name_other_details"]:
                db_link = f"{ref['db_name_other_details']}: {ref['db_accession']}"
            else:
                db_link = f"{ref['db_name']}: {ref['db_accession']}"
            if ref['seq_db_align_begin'] and ref['seq_db_align_end']:
                db_link += f" {ref['seq_db_align_begin']}-{ref['seq_db_align_end']}"
            if ref['organism_scientific']:
                db_link += f"; {ref['organism_scientific']}"
            db_links.append(db_link)
        item_strings.append(_make_multiline_text(db_links))
    # and show me...
    if item_strings:
        list_top = f"The following molecular entities are in the model: ({test_file})"
        print(_make_list(list_top, item_strings))
    else:
        print(f"No molecular entities listed in {test_file}")
    print()
    
for test_file in test_files:
    _test_entity_parsing(test_file)
The following molecular entities are in the model: (modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif)
- Spongilla lacustris c103531_g3_i1 protein (chains: A)

The following molecular entities are in the model: (modelcif_MA_test/AF-P38129-F1-model_v3.cif)
- Transcription initiation factor TFIID subunit 5 (chains: A)
UniProt: [P38129](https://www.uniprot.org/uniprot/P38129) 1-798; Saccharomyces cerevisiae (strain ATCC 204508 / S288c)

The following molecular entities are in the model: (modelcif_MA_test/ma-bak-cepc-0001.cif)
- TAF4 (chains: A)
OrthoDB: [1294385_1:000f0d](https://www.orthodb.org/?query=1294385_1:000f0d) 1-388; Saccharomyces cerevisiae YJM1573
UniProt: [A6ZM67](https://www.uniprot.org/uniprot/A6ZM67) 1-388; Saccharomyces cerevisiae (strain YJM789) (Baker's yeast)
- TAF5 (chains: B)
OrthoDB: [1294385_1:000051](https://www.orthodb.org/?query=1294385_1:000051) 1-798; Saccharomyces cerevisiae YJM1573
UniProt: [P38129](https://www.uniprot.org/uniprot/P38129) 1-798; Saccharomyces cerevisiae (strain ATCC 204508 / S288c) (Baker's yeast)

No molecular entities listed in modelcif_MA_test/1crn.cif

The following molecular entities are in the model: (modelcif_MA_test/Tara_A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1.cif)
- Model of ZC3H11B (A0A1B0GTU1) (chains: A)
UniProt: [A0A1B0GTU1](https://www.uniprot.org/uniprot/A0A1B0GTU1) 1-805; Homo sapiens (Human)
- Model of ZC3H11A (O75152) (chains: B)
UniProt: [O75152](https://www.uniprot.org/uniprot/O75152) 1-810; Homo sapiens (Human)

The following molecular entities are in the model: (modelcif_MA_test/Var3D_I6XD65_1046.cif)
- Model subunit of I6XD65 (chains: A)
UniProt: [I6XD65](https://www.uniprot.org/uniprot/I6XD65) 1-185; Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)
- FE (II) ION (chains: B)
- Pyrazinamide (chains: C)

The following molecular entities are in the model: (modelcif_MA_test/Var3D_L7N665_1046.cif)
- Model subunit of L7N665 (chains: A, B)
UniProt: [L7N665](https://www.uniprot.org/uniprot/L7N665) 1-385; Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)

The following molecular entities are in the model: (modelcif_MA_test/Var3D_P9WIE5_3767.cif)
- Model subunit of P9WIE5 (chains: A, B)
UniProt: [P9WIE5](https://www.uniprot.org/uniprot/P9WIE5) 24-740; Mycobacterium tuberculosis (strain ATCC 25618 / H37Rv)
- Isoniazid (chains: C, D)
- SODIUM ION (chains: E, F)
- CHLORIDE ION (chains: G, H)
- PROTOPORPHYRIN IX CONTAINING FE (chains: I, J)
- GLYCEROL (chains: K, L, M, N, O, P)

The following molecular entities are in the model: (modelcif_MA_test/ORNL_F0JBU9_9DELT.cif)
- DND132RS05775 (chains: A)
UniProt: [F0JBU9](https://www.uniprot.org/uniprot/F0JBU9) 1-123
RefSeq: NC_016803 1-123

In [ ]:
# do SW parsing
def _test_sw_parsing(test_file):
    # get file for testing
    doc = gemmi.cif.read(test_file)
    block = doc.sole_block()
    # get author names for each citation
    tmp = dict()
    for row in block.find("_citation_author.", ["citation_id", "name"]):
        cid = row.str(0)
        name = row.str(1)
        if cid not in tmp:
            tmp[cid] = {"name": name.split()[0].split(",")[0], "etal": ""}
        else:
            tmp[cid]["etal"] = " et al."
    cit_names = {cid: (d["name"] + d["etal"]) for cid, d in tmp.items()}
    # add year if available
    table = block.find("_citation.", ["id", "?year"])
    if table.has_column(1):
        for row in table:
            cid = row.str(0)
            year = row.str(1)
            if cid in cit_names and year:
                cit_names[cid] += " " + year
    # add URL if available
    cit_urls = {}
    table = block.find("_citation.", ["id", "?pdbx_database_id_DOI", "?pdbx_database_id_PubMed"])
    formatters = ["https://doi.org/%s",
                  "https://www.ncbi.nlm.nih.gov/pubmed/%s"]
    for row in table:
        cid = row.str(0)
        # add whichever URL we find first
        for i in range(1, table.width()):
            if row.has(i) and row.str(i):
                cit_urls[cid] = formatters[i - 1] % row.str(i)
                break
    # now map this to software
    item_strings = []
    table = block.find("_software.", ["name", "?location", "?version", "?citation_id"])
    for row in table:
        sw_name = row.str(0)
        if row.has(1) and row.str(1):
            item = _make_url(sw_name, row.str(1))
        else:
            item = sw_name
        if row.has(2) and row.str(2):
            item += f" ({row.str(2)})"
        if row.has(3) and row.str(3) in cit_names:
            cid = row.str(3)
            if cid in cit_urls:
                item += f" ({_make_url(cit_names[cid], cit_urls[cid])})"
            else:
                item += f" ({cit_names[cid]})"
        item_strings.append(item)
    # and show me...
    if item_strings:
        list_top = f"The following software was used: ({test_file})"
        print(_make_list(list_top, item_strings))
    else:
        print(f"No software was listed for {test_file}")
    print()

for test_file in test_files:
    _test_sw_parsing(test_file)
In [182]:
# do ref. DB parsing
def _test_ref_db_parsing(test_file):
    # get file for testing
    doc = gemmi.cif.read(test_file)
    block = doc.sole_block()
    # look for DBs with version or release date
    item_strings = []
    table = block.find("_ma_data_ref_db.", ["name", "?version", "?release_date"])
    for row in table:
        item = f"{row.str(0)}"
        # add whichever version we find first
        for i in range(1, 3):
            if row.has(i) and row.str(i):
                item += f" ({row.str(i)})"
                break
        item_strings.append(item)
    # and show me...
    if item_strings:
        list_top = f"The following reference databases were used: ({test_file})"
        print(_make_list(list_top, item_strings))
    else:
        print(f"No reference databases were listed for {test_file}")
    print()

for test_file in test_files:
    _test_ref_db_parsing(test_file)
The following reference databases were used: (modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif)
- UniRef30 (2021_03)
- ColabFold DB (2021_08)

No reference databases were listed for modelcif_MA_test/AF-P38129-F1-model_v3.cif

No reference databases were listed for modelcif_MA_test/ma-bak-cepc-0001.cif

No reference databases were listed for modelcif_MA_test/1crn.cif

No reference databases were listed for modelcif_MA_test/Tara_A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1.cif

No reference databases were listed for modelcif_MA_test/Var3D_I6XD65_1046.cif

The following reference databases were used: (modelcif_MA_test/Var3D_L7N665_1046.cif)
- BFD (6a634dc6eb105c2e9b4cba7bbae93412)
- MGnify (2018_12)
- Uniclust30 (2018_08)
- Swiss-Prot (2021_03)
- TrEMBL (2021_03)
- UniRef90 (2021_03)

No reference databases were listed for modelcif_MA_test/Var3D_P9WIE5_3767.cif

No reference databases were listed for modelcif_MA_test/ORNL_F0JBU9_9DELT.cif

In [47]:
# do template parsing
def _test_tpl_parsing(test_file):
    # get file for testing
    doc = gemmi.cif.read(test_file)
    block = doc.sole_block()
    # collect info per tpl-id
    tpl_dict = {}  # keyed on template_id
    # fetch main info
    cols = ["template_id", "target_asym_id", "template_auth_asym_id",
            "?template_label_asym_id"]
    for row in block.find("_ma_template_details.", cols):
        tid = row.str(0)
        tpl_dict[tid] = {
            "trg_asym_id": row.str(1),
            "tpl_auth_asym_id": row.str(2)
        }
        if row.has(3) and row.str(3):
            tpl_dict[tid]["tpl_label_asym_id"] = row.str(3)
    # add ref DBs
    cols = ["template_id", "db_accession_code", "db_name",
            "?db_name_other_details"]
    for row in block.find("_ma_template_ref_db_details.", cols):
        tid = row.str(0)
        if tid in tpl_dict:
            tpl_dict[tid]["db_acc"] = row.str(1)
            if row.str(2) == "Other" and row.has(3) and row.str(3):
                tpl_dict[tid]["db_name"] = row.str(3)
            else:
                tpl_dict[tid]["db_name"] = row.str(2)
    # add info for small molecules
    cols = ["template_id", "?comp_id", "?details"]
    for row in block.find("_ma_template_non_poly.", cols):
        tid = row.str(0)
        if tid in tpl_dict:
            if row.has(1) and row.str(1):
                tpl_dict[tid]["non_poly_comp_id"] = row.str(1)
            if row.has(2) and row.str(2):
                tpl_dict[tid]["non_poly_details"] = row.str(2)
    # aggregate per template for displaying
    tpl_td = dict()
    for tpl in tpl_dict.values():
        did = f"{tpl['db_name']}-{tpl['db_acc']}"
        if did not in tpl_td:
            if tpl['db_name'] == "PDB":
                link_url = f"http://dx.doi.org/10.2210/pdb{tpl['db_acc']}/pdb"
            elif tpl['db_name'] == "PubChem":
                link_url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{tpl['db_acc']}"
            else:
                link_url = None
                print(f"URLs for {tpl['db_name']} NOT SUPPORTED YET")
            if link_url:
                tpl_text = f"{tpl['db_name']}: {_make_url(tpl['db_acc'], link_url)}"
            else:
                tpl_text = f"{tpl['db_name']}: {tpl['db_acc']}"
            tpl_td[did] = {
                "tpl_text": tpl_text,
                "tpl_chains_label": [],
                "tpl_chains_auth": [],
                "tpl_chains_all_label": True,
                "tpl_non_poly_ids": []
            }
        # collect chain names
        if "tpl_label_asym_id" in tpl:
            # if here it is guaranteed to be non-empty
            tpl_td[did]["tpl_chains_label"].append(tpl["tpl_label_asym_id"])
        else:
            # if any missing, we set all to False and fall back to auth
            tpl_td[did]["tpl_chains_all_label"] = False
        if tpl["tpl_auth_asym_id"]:
            # only add non empty ones
            tpl_td[did]["tpl_chains_auth"].append(tpl["tpl_auth_asym_id"])
        # collect info on non poly if available (prefer short comp. ID)
        if "non_poly_comp_id" in tpl:
            tpl_td[did]["tpl_non_poly_ids"].append(tpl["non_poly_comp_id"])
        elif "non_poly_details" in tpl:
            tpl_td[did]["tpl_non_poly_ids"].append(tpl["non_poly_details"])
    # turn into text
    item_strings = []
    for tpl in tpl_td.values():
        item = tpl["tpl_text"]
        if tpl["tpl_chains_all_label"] and tpl["tpl_chains_label"]:
            chain_ids = sorted(set(tpl['tpl_chains_label']))
            item += f"; chains (label_asym_id): {', '.join(chain_ids)}"
        elif tpl["tpl_chains_auth"]:
            chain_ids = sorted(set(tpl['tpl_chains_auth']))
            item += f"; chains (auth_asym_id): {', '.join(chain_ids)}"
        if tpl["tpl_non_poly_ids"]:
            np_ids = sorted(set(tpl['tpl_non_poly_ids']))
            item += f"; non-polymers: {', '.join(np_ids)}"
        item_strings.append(item)
    # and show me...
    if item_strings:
        list_top = f"The following templates were used: ({test_file})"
        print(_make_list(list_top, item_strings))
    else:
        print(f"No templates were listed for {test_file}")
    print()

for test_file in test_files:
    _test_tpl_parsing(test_file)
No templates were listed for modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif

The following templates were used: (modelcif_MA_test/AF-P38129-F1-model_v3.cif)
- PDB: [6TBM](http://dx.doi.org/10.2210/pdb6TBM/pdb); chains (auth_asym_id): G
- PDB: [6TB4](http://dx.doi.org/10.2210/pdb6TB4/pdb); chains (auth_asym_id): G
- PDB: [6F3T](http://dx.doi.org/10.2210/pdb6F3T/pdb); chains (auth_asym_id): A
- PDB: [6MZC](http://dx.doi.org/10.2210/pdb6MZC/pdb); chains (auth_asym_id): G

No templates were listed for modelcif_MA_test/ma-bak-cepc-0001.cif

No templates were listed for modelcif_MA_test/1crn.cif

No templates were listed for modelcif_MA_test/Tara_A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1.cif

The following templates were used: (modelcif_MA_test/Var3D_I6XD65_1046.cif)
- PDB: [3PL1](http://dx.doi.org/10.2210/pdb3PL1/pdb); chains (label_asym_id): A, B; non-polymers: FE2
- PubChem: [1046](https://pubchem.ncbi.nlm.nih.gov/compound/1046); non-polymers: PZA

No templates were listed for modelcif_MA_test/Var3D_L7N665_1046.cif

The following templates were used: (modelcif_MA_test/Var3D_P9WIE5_3767.cif)
- PDB: [1SJ2](http://dx.doi.org/10.2210/pdb1SJ2/pdb); chains (label_asym_id): A, B, C, D, E, F, G, H, I, J; non-polymers: GOL, HEM
- PDB: [6CDQ](http://dx.doi.org/10.2210/pdb6CDQ/pdb); chains (label_asym_id): D, E, K, M, N, S; non-polymers: CL, NA, NIZ

No templates were listed for modelcif_MA_test/ORNL_F0JBU9_9DELT.cif

In [183]:
# do protocol steps parsing
def _test_protocol_steps_parsing(test_file):
    # get file for testing
    doc = gemmi.cif.read(test_file)
    block = doc.sole_block()
    # do it
    item_strings = []
    table = block.find("_ma_protocol_step.", ["step_id", "method_type", "?details"])
    for row in table:
        item = f"Step {row.str(0)} - {row.str(1)}"
        if row.has(2) and row.str(2):
            item += f" : {row.str(2)}"
        item_strings.append(item)
    # and show me...
    if item_strings:
        print(f"({test_file})")
        print(_make_multiline_text(item_strings))
    else:
        print(f"No protocol steps listed in {test_file}")
    print()
    
for test_file in test_files:
    _test_protocol_steps_parsing(test_file)

# NOTE: compared with the code in ParseCIF (8.8.22): block.find is used without the optional '?' there
# -> i.e. the code above also works if the details column is missing
(modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif)
Step 1 - modeling : Model generated using ColabFold v1.2.0 with AlphaFold producing 5 models with 3 recycles each without model relaxation without templates ranked by pLDDT starting from an MSA from MMseqs2 (UniRef+Environmental)

(modelcif_MA_test/AF-P38129-F1-model_v3.cif)
Step 1 - coevolution MSA
Step 2 - template search
Step 3 - modeling

(modelcif_MA_test/ma-bak-cepc-0001.cif)
Step 1 - coevolution MSA : Create paired MSAs for the dimers
Step 2 - modeling : Model using AlphaFold with a 200 residue gap between the two chains

No protocol steps listed in modelcif_MA_test/1crn.cif

(modelcif_MA_test/Tara_A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1.cif)
Step 1 - modeling : Model using AlphaFold-Multimer (AlphaFold v2.2.0), without amber relaxation and producing 5 models with up to 3 recycles each, starting from paired and unparied MSAs for the dimers using MMseqs2.
Step 2 - model selection : Select best model, which is either the top-ranked model as determined by the ColabFold pipeline (iptmscore*0.8+ptmscore*0.2), or else the model with best congruence with crosslinks reported in the related study.

(modelcif_MA_test/Var3D_I6XD65_1046.cif)
Step 1 - modeling : Docking of ligand PZA in structure

(modelcif_MA_test/Var3D_L7N665_1046.cif)
Step 1 - modeling : Modelled the sequence of L7N665 with AlphaFold-Multimer as a homo-2-mer

(modelcif_MA_test/Var3D_P9WIE5_3767.cif)
Step 1 - modeling : transferred ligand NIZ from experimental structure 6cdq into protein structure

(modelcif_MA_test/ORNL_F0JBU9_9DELT.cif)
Step 1 - modeling
Step 2 - model selection

In [184]:
def _fetch_qa_data(block):
    # fetch main info
    table = block.find("_ma_qa_metric.", ["id", "name", "mode", "type"])
    qa_dict = dict() # for easy access: keyed on "id" and rest as dict
    for row in table:
        d = {key: row.str(idx + 1) for idx, key in enumerate(["name", "mode", "type"])}
        qa_dict[row.str(0)] = d
    # fetch global scores
    qa_global = []
    table = block.find("_ma_qa_metric_global.", ["metric_id", "metric_value"])
    for row in table:
        metric_id = row.str(0)
        metric_value = gemmi.cif.as_number(row.get(1))
        if metric_id in qa_dict:
            assert qa_dict[metric_id]["mode"] == "global"
            qa_dict[metric_id]["value"] = metric_value
            qa_global.append(qa_dict[metric_id])
    # fetch local scores
    qa_local = [d for d in qa_dict.values() if d["mode"] == "local"]
    qa_local_pairwise = [d for d in qa_dict.values() if d["mode"] == "local-pairwise"]
    return qa_global, qa_local, qa_local_pairwise

_fetch_qa_data(block)
Out[184]:
([{'name': 'pLDDT', 'mode': 'global', 'type': 'pLDDT', 'value': 88.312},
  {'name': 'pTM', 'mode': 'global', 'type': 'pTM', 'value': 0.78}],
 [{'name': 'pLDDT', 'mode': 'local', 'type': 'pLDDT'}],
 [{'name': 'PAE', 'mode': 'local-pairwise', 'type': 'PAE'}])
In [185]:
# do parsing of QA part and acc. data
def _test_qa_acc_data_parsing(test_file):
    # get file for testing
    doc = gemmi.cif.read(test_file)
    block = doc.sole_block()
    # get QA part (can reuse if already used elsewhere)
    qa_global, qa_local, qa_local_pairwise = _fetch_qa_data(block)
    # parse accompanying data
    file_contents = block.find_values("_ma_entry_associated_files.file_content")
    has_single_zip_file = len(file_contents) == 1 and \
                          file_contents.str(0) == "archive with multiple files"
    if has_single_zip_file:
        # override with data from other block
        file_contents = block.find_values("_ma_associated_archive_file_details.file_content")
    has_loc_pw_in_acc = any(gemmi.cif.as_string(v) == "local pairwise QA scores"
                            for v in file_contents)
    # put together text
    text = ""
    # text for QA
    item = ""
    if len(qa_global) > 1:
        score_strings = [f"{v['name']} of {v['value']}" for v in qa_global]
        item = f"The model has the following global model confidence scores:" \
               f" {', '.join(score_strings)}."
    elif len(qa_global) == 1:
        item = f"The model has a global model confidence score " \
               f"({qa_global[0]['name']}) of {qa_global[0]['value']}."
    if item:
        text += _make_paragraph(item)
    # lots of options for local QA string
    item = ""
    qa_local_names = ", ".join([v["name"] for v in qa_local])
    qa_loc_pw_names = ", ".join([v["name"] for v in qa_local_pairwise])
    if qa_local_names and qa_loc_pw_names and has_loc_pw_in_acc:
        item = f"Local per-residue model confidence scores ({qa_local_names}) " \
               f"are available in the model mmCIF file " \
               f"and local per-residue-pair scores ({qa_loc_pw_names}) " \
               f"in the accompanying data download."
    elif qa_local_names and qa_loc_pw_names and not has_loc_pw_in_acc:
        item = f"Local per-residue model confidence scores ({qa_local_names}) " \
               f"and local per-residue-pair scores ({qa_loc_pw_names}) " \
               f"are available in the model mmCIF file."
    elif qa_local_names and not qa_loc_pw_names:
        item = f"Local per-residue model confidence scores ({qa_local_names}) " \
               f"are available in the model mmCIF file."
    elif not qa_local_names and qa_loc_pw_names and has_loc_pw_in_acc:
        item = f"Local per-residue-pair model confidence scores ({qa_loc_pw_names}) " \
               f"are available in the accompanying data download."
    elif not qa_local_names and qa_loc_pw_names and not has_loc_pw_in_acc:
        item = f"Local per-residue-pair model confidence scores ({qa_loc_pw_names}) " \
               f"are available in the model mmCIF file."
    if item:
        text += _make_paragraph(item)
    # list files in accompanying data (if any)
    if has_single_zip_file:
        table = block.find("_ma_associated_archive_file_details.",
                           ["file_path", "?file_content", "?description"])
    else:
        # NOTE: aimed to work legacy-style for Baker models but should become obsolete
        # -> can be replaced below with "table = None" in the future
        table = block.find("_ma_entry_associated_files.",
                           ["file_url", "?file_content", "?details"])
    if table:
        list_top = "Files in accompanying data:"
        list_items = []
        for row in table:
            item = f"{row.str(0)}"
            if row.has(1) and row.str(1):
                item += f" ({row.str(1)})"
            if row.has(2) and row.str(2):
                item += f": {row.str(2)}"
            list_items.append(item)
        text += _make_paragraph(_make_list(list_top, list_items))
    # conclude with standard pointer to ModelCIF file
    model_cif_link = _make_url(
        "ModelCIF format",
        "https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/"
    )
    text += _make_paragraph(
        f"Full details are available in {model_cif_link} "
        f"in the model mmCIF file."
    )
    print(f"({test_file})")
    print(text)

for test_file in test_files:
    _test_qa_acc_data_parsing(test_file)
(modelcif_MA_test/c103531_g3_i1_unrelaxed_rank_1_model_3.cif)
The model has the following global model confidence scores: pLDDT of 88.312, pTM of 0.78.

Local per-residue model confidence scores (pLDDT) are available in the model mmCIF file and local per-residue-pair scores (PAE) in the accompanying data download.

Files in accompanying data:
- c103531_g3_i1_unrelaxed_rank_1_model_3_local_pairwise_qa.cif (local pairwise QA scores): Predicted aligned error

Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/AF-P38129-F1-model_v3.cif)
The model has a global model confidence score (pLDDT) of 73.91.

Local per-residue model confidence scores (pLDDT) are available in the model mmCIF file.

Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/ma-bak-cepc-0001.cif)
The model has a global model confidence score (pLDDT) of 67.43.

Local per-residue model confidence scores (pLDDT) are available in the model mmCIF file and local per-residue-pair scores (PAE, contact probability) in the accompanying data download.

Files in accompanying data:
- ma-bak-cepc-0001_local_pairwise_qa.cif (local pairwise QA scores)
- ma-bak-cepc-0001_i95.a3m (multiple sequence alignments)

Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/1crn.cif)
Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/Tara_A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1.cif)
The model has the following global model confidence scores: pLDDT of 32.232, pTM of 0.24.

Local per-residue model confidence scores (pLDDT) are available in the model mmCIF file and local per-residue-pair scores (PAE) in the accompanying data download.

Files in accompanying data:
- A0A1B0GTU1-O75152_unrelaxed_rank_1_model_1_local_pairwise_qa.cif (local pairwise QA scores): Predicted aligned error.

Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/Var3D_I6XD65_1046.cif)
Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/Var3D_L7N665_1046.cif)
The model has the following global model confidence scores: pLDDT of 88.204, pTM of 0.884, ipTM of 0.874.

Local per-residue model confidence scores (pLDDT) are available in the model mmCIF file and local per-residue-pair scores (PAE) in the accompanying data download.

Files in accompanying data:
- tmp/L7N665_1046_local_pairwise_qa.cif (local pairwise QA scores): Predicted aligned error.

Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/Var3D_P9WIE5_3767.cif)
Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


(modelcif_MA_test/ORNL_F0JBU9_9DELT.cif)
The model has a global model confidence score (pLDDT) of 92.31.

Local per-residue model confidence scores (pLDDT) and local per-residue-pair scores (PAE) are available in the model mmCIF file.

Full details are available in [ModelCIF format](https://mmcif.wwpdb.org/dictionaries/mmcif_ma.dic/Index/) in the model mmCIF file.


In [ ]: