{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ac0e1325",
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import datetime\n",
    "import os\n",
    "import pandas as pd\n",
    "import xml.dom.minidom\n",
    "from ost import io\n",
    "\n",
    "# from Tara code\n",
    "def _get_sequence(chn):\n",
    "    \"\"\"Get the gapped one-letter-code sequence out of an OST chain.\n",
    "\n",
    "    Residue numbers are taken as 1-based sequence positions, every position\n",
    "    without a residue (in front of the first residue or inside the chain) is\n",
    "    written as '-'.\n",
    "\n",
    "    Args:\n",
    "        chn: OST chain, only `chn.residues` and per residue `number.num` and\n",
    "            `one_letter_code` are used.\n",
    "\n",
    "    Returns:\n",
    "        str: One character per position, from position 1 up to the number of\n",
    "        the last residue. Empty string for a chain without residues.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If a residue number is smaller than 1 or not larger\n",
    "            than the number of the residue before.\n",
    "    \"\"\"\n",
    "    sqe = []\n",
    "    nxt_rn = 1  # next sequence position to be written\n",
    "    for res in chn.residues:\n",
    "        num = res.number.num\n",
    "        if num < nxt_rn:\n",
    "            # positions can only be filled from left to right, numbers going\n",
    "            # backwards (or starting below 1) cannot be mapped to a sequence\n",
    "            raise RuntimeError(\n",
    "                f\"Residue number {num} found where {nxt_rn} or higher was \"\n",
    "                + \"expected.\"\n",
    "            )\n",
    "        # fill the gap (if any) in front of this residue\n",
    "        sqe.append(\"-\" * (num - nxt_rn))\n",
    "        sqe.append(res.one_letter_code)\n",
    "        nxt_rn = num + 1\n",
    "\n",
    "    return \"\".join(sqe)\n",
    "\n",
    "def _check_sequence(up_ac, sequence):\n",
    "    \"\"\"Make sure a sequence consists of standard one-letter codes only.\n",
    "\n",
    "    Args:\n",
    "        up_ac: UniProtKB accession, only used in the error message.\n",
    "        sequence: Sequence to check, iterated element by element.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: Naming the first element that is not one of the 20\n",
    "            standard one-letter codes.\n",
    "    \"\"\"\n",
    "    std_olc = \"ACDEFGHIKLMNPQRSTVWY\"\n",
    "    offender = next((olc for olc in sequence if olc not in std_olc), None)\n",
    "    if offender is None:\n",
    "        return\n",
    "\n",
    "    raise RuntimeError(\n",
    "        f\"Non-standard aa found in UniProtKB sequence for entry '{up_ac}': \"\n",
    "        + f\"{offender}\"\n",
    "    )\n",
    "\n",
    "def _fetch_upkb_entry(up_ac):\n",
    "    \"\"\"Fetch data for an UniProtKB entry.\n",
    "\n",
    "    Downloads the entry in UniProtKB txt format and parses the ID, OX, OS,\n",
    "    SQ (incl. sequence data), RP, DT and GN lines.\n",
    "\n",
    "    Args:\n",
    "        up_ac: UniProtKB accession of the entry to fetch.\n",
    "\n",
    "    Returns:\n",
    "        dict: On success, keys \"up_ac\", \"up_id\", \"up_gn\", \"up_organism\",\n",
    "        \"up_ncbi_taxid\" (str), \"up_sequence\", \"up_seqlen\" (int),\n",
    "        \"up_crc64\", \"up_last_mod\" (datetime.datetime) and \"up_isoform\"\n",
    "        (always None).\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the web service answers with an error status.\n",
    "        RuntimeError: If an RP line mentions an isoform or the sequence\n",
    "            contains non-standard one-letter codes.\n",
    "\n",
    "    NOTE(review): malformed or incomplete entries are reported through\n",
    "    `_abort_msg()`, which is not defined in this cell - confirm it is\n",
    "    available before an error path is hit.\n",
    "    \"\"\"\n",
    "    # This is a simple parser for UniProtKB txt format, instead of breaking it up\n",
    "    # into multiple functions, we just allow many many branches & statements,\n",
    "    # here.\n",
    "    # pylint: disable=too-many-branches,too-many-statements\n",
    "    data = {}\n",
    "    data[\"up_organism\"] = \"\"\n",
    "    data[\"up_sequence\"] = \"\"\n",
    "    data[\"up_ac\"] = up_ac\n",
    "    # timeout: do not hang forever on an unresponsive server\n",
    "    rspns = requests.get(\n",
    "        f\"https://www.uniprot.org/uniprot/{up_ac}.txt\", timeout=60\n",
    "    )\n",
    "    # fail on HTTP errors rather than parsing an error page\n",
    "    rspns.raise_for_status()\n",
    "    for line in rspns.iter_lines(decode_unicode=True):\n",
    "        if line.startswith(\"ID   \"):\n",
    "            sline = line.split()\n",
    "            if len(sline) != 5:\n",
    "                _abort_msg(f\"Unusual UniProtKB ID line found:\\n'{line}'\")\n",
    "            data[\"up_id\"] = sline[1]\n",
    "        elif line.startswith(\"OX   NCBI_TaxID=\"):\n",
    "            # Following strictly the UniProtKB format: 'OX   NCBI_TaxID=<ID>;'\n",
    "            data[\"up_ncbi_taxid\"] = line[len(\"OX   NCBI_TaxID=\") : -1]\n",
    "            # drop evidence tags ('{...}') following the ID\n",
    "            data[\"up_ncbi_taxid\"] = data[\"up_ncbi_taxid\"].split(\"{\")[0].strip()\n",
    "        elif line.startswith(\"OS   \"):\n",
    "            # The organism may span several OS lines: a line ending with '.'\n",
    "            # loses that '.', any other line is kept completely and gets a\n",
    "            # blank appended to join it with the next OS line.\n",
    "            if line[-1] == \".\":\n",
    "                data[\"up_organism\"] += line[len(\"OS   \") : -1]\n",
    "            else:\n",
    "                data[\"up_organism\"] += line[len(\"OS   \") :] + \" \"\n",
    "        elif line.startswith(\"SQ   \"):\n",
    "            sline = line.split()\n",
    "            if len(sline) != 8:\n",
    "                _abort_msg(f\"Unusual UniProtKB SQ line found:\\n'{line}'\")\n",
    "            data[\"up_seqlen\"] = int(sline[2])\n",
    "            data[\"up_crc64\"] = sline[6]\n",
    "        elif line.startswith(\"     \"):\n",
    "            # sequence data: blank separated blocks of residues\n",
    "            sline = line.split()\n",
    "            if len(sline) > 6:\n",
    "                _abort_msg(\n",
    "                    \"Unusual UniProtKB sequence data line \"\n",
    "                    + f\"found:\\n'{line}'\"\n",
    "                )\n",
    "            data[\"up_sequence\"] += \"\".join(sline)\n",
    "        elif line.startswith(\"RP   \"):\n",
    "            if \"ISOFORM\" in line.upper():\n",
    "                # isoforms are not handled, so stop instead of going on\n",
    "                raise RuntimeError(\n",
    "                    f\"First ISOFORM found for '{up_ac}', needs \" + \"handling.\"\n",
    "                )\n",
    "        elif line.startswith(\"DT   \"):\n",
    "            # '<date>, <event>', date parsed as day-month abbreviation-year\n",
    "            dt_flds = line[len(\"DT   \") :].split(\", \")\n",
    "            if len(dt_flds) > 1 and dt_flds[1].upper().startswith(\n",
    "                \"SEQUENCE VERSION \"\n",
    "            ):\n",
    "                data[\"up_last_mod\"] = datetime.datetime.strptime(\n",
    "                    dt_flds[0], \"%d-%b-%Y\"\n",
    "                )\n",
    "        elif line.startswith(\"GN   Name=\"):\n",
    "            data[\"up_gn\"] = line[len(\"GN   Name=\") :].split(\";\")[0]\n",
    "            # drop evidence tags ('{...}') following the name\n",
    "            data[\"up_gn\"] = data[\"up_gn\"].split(\"{\")[0].strip()\n",
    "\n",
    "    # we have not seen isoforms in the data set, yet, so we just set them to\n",
    "    # None\n",
    "    data[\"up_isoform\"] = None\n",
    "\n",
    "    if \"up_gn\" not in data:\n",
    "        _abort_msg(f\"No gene name found for UniProtKB entry '{up_ac}'.\")\n",
    "    if \"up_last_mod\" not in data:\n",
    "        _abort_msg(f\"No sequence version found for UniProtKB entry '{up_ac}'.\")\n",
    "    if \"up_crc64\" not in data:\n",
    "        _abort_msg(f\"No CRC64 value found for UniProtKB entry '{up_ac}'.\")\n",
    "    if len(data[\"up_sequence\"]) == 0:\n",
    "        _abort_msg(f\"No sequence found for UniProtKB entry '{up_ac}'.\")\n",
    "    # check that the sequence length of the SQ line matches the sequence data\n",
    "    if data[\"up_seqlen\"] != len(data[\"up_sequence\"]):\n",
    "        _abort_msg(\n",
    "            \"Sequence length of SQ line and sequence data differ for \"\n",
    "            + f\"UniProtKB entry '{up_ac}': {data['up_seqlen']} != \"\n",
    "            + f\"{len(data['up_sequence'])}\"\n",
    "        )\n",
    "    _check_sequence(data[\"up_ac\"], data[\"up_sequence\"])\n",
    "\n",
    "    if \"up_id\" not in data:\n",
    "        _abort_msg(f\"No ID found for UniProtKB entry '{up_ac}'.\")\n",
    "    if \"up_ncbi_taxid\" not in data:\n",
    "        _abort_msg(f\"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.\")\n",
    "    if len(data[\"up_organism\"]) == 0:\n",
    "        _abort_msg(f\"No organism species found for UniProtKB entry '{up_ac}'.\")\n",
    "\n",
    "    return data\n",
    "\n",
    "def _get_upkb_for_sequence(sqe, up_ac):\n",
    "    \"\"\"Fetch the UniProtKB entry `up_ac` and make sure it has sequence `sqe`.\n",
    "\n",
    "    Args:\n",
    "        sqe: Sequence the entry is expected to have.\n",
    "        up_ac: UniProtKB accession handed to `_fetch_upkb_entry()`.\n",
    "\n",
    "    Returns:\n",
    "        The data returned by `_fetch_upkb_entry()`.\n",
    "\n",
    "    Raises:\n",
    "        RuntimeError: If `sqe` differs from the sequence of the entry.\n",
    "    \"\"\"\n",
    "    upkb_entry = _fetch_upkb_entry(up_ac)\n",
    "    upkb_sqe = upkb_entry[\"up_sequence\"]\n",
    "    sequences_differ = sqe != upkb_sqe\n",
    "    if sequences_differ:\n",
    "        raise RuntimeError(\n",
    "            f\"Sequences not equal from file: {sqe}, from UniProtKB: {upkb_sqe}\"\n",
    "        )\n",
    "\n",
    "    return upkb_entry"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "37bece22",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _get_ncbi_sequence(ncbi_ac):\n",
    "    \"\"\"Fetch OST sequence object from NCBI web service.\n",
    "\n",
    "    Args:\n",
    "        ncbi_ac: NCBI protein accession (e.g. \"CAD2068351.1\").\n",
    "\n",
    "    Returns:\n",
    "        Sequence as returned by `io.SequenceFromString()` for the downloaded\n",
    "        FASTA record.\n",
    "\n",
    "    Raises:\n",
    "        requests.HTTPError: If the web service answers with an error status.\n",
    "    \"\"\"\n",
    "    # src: https://www.ncbi.nlm.nih.gov/books/NBK25500/#_chapter1_Downloading_Full_Records_\n",
    "    rspns = requests.get(\n",
    "        \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi\",\n",
    "        # let requests build (and escape) the query string\n",
    "        params={\n",
    "            \"db\": \"protein\",\n",
    "            \"id\": ncbi_ac,\n",
    "            \"rettype\": \"fasta\",\n",
    "            \"retmode\": \"text\",\n",
    "        },\n",
    "        # do not hang forever on an unresponsive server\n",
    "        timeout=60,\n",
    "    )\n",
    "    # an error page must not be handed to the FASTA parser\n",
    "    rspns.raise_for_status()\n",
    "    return io.SequenceFromString(rspns.text, \"fasta\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "3f83b12d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s = _get_ncbi_sequence(\"CAD2068351.1\")\n",
    "# up_data = _fetch_upkb_entry(\"A0A485PQD1\")\n",
    "# print(s.name, s, len(s), up_data[\"up_sequence\"] == str(s))\n",
    "# up_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "ff132158",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check USDA data\n",
    "# CSV table read into `metadata` below (one row per protein/PDB file)\n",
    "metadata_file = \"./InputFiles/ASFV-G_proteome_accessions.csv\"\n",
    "# directory that is searched for '*.pdb' model files below\n",
    "pdb_dir = \"./InputFiles/AlphaFold-RENAME\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "d8ca29cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Protein</th>\n",
       "      <th>Associated PDB</th>\n",
       "      <th>NCBI_Accession</th>\n",
       "      <th>UniProt_ID</th>\n",
       "      <th>_struct.title</th>\n",
       "      <th>_struct.pdbx_model_detail</th>\n",
       "      <th>ranking debugg model ID</th>\n",
       "      <th>notes</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>285L</td>\n",
       "      <td>285L.pdb</td>\n",
       "      <td>CAD2068351.1</td>\n",
       "      <td>A0A485PQD1</td>\n",
       "      <td>ASFV-G 285L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_1_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>A104R</td>\n",
       "      <td>A104R.pdb</td>\n",
       "      <td>CAD2068395.1</td>\n",
       "      <td>A0A0A1E0L7</td>\n",
       "      <td>ASFV-G A104R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_2_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>A118R</td>\n",
       "      <td>A118R.pdb</td>\n",
       "      <td>CAD2068397.1</td>\n",
       "      <td>A0A2X0RVA9</td>\n",
       "      <td>ASFV-G A118R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_1_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>A137R</td>\n",
       "      <td>A137R.pdb</td>\n",
       "      <td>CAD2068404.1</td>\n",
       "      <td>A0A2X0THQ0</td>\n",
       "      <td>ASFV-G A137R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>A151R</td>\n",
       "      <td>A151R.pdb</td>\n",
       "      <td>CAD2068398.1</td>\n",
       "      <td>A0A2X0TC55</td>\n",
       "      <td>ASFV-G A151R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>192</th>\n",
       "      <td>QP509L</td>\n",
       "      <td>QP509L.pdb</td>\n",
       "      <td>CAD2068484.1</td>\n",
       "      <td>A0A2X0THX2</td>\n",
       "      <td>ASFV-G QP509L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>193</th>\n",
       "      <td>R298L</td>\n",
       "      <td>R298L.pdb</td>\n",
       "      <td>CAD2068482.1</td>\n",
       "      <td>A0A2X0SE42</td>\n",
       "      <td>ASFV-G R298L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>194</th>\n",
       "      <td>S183L</td>\n",
       "      <td>S183L.pdb</td>\n",
       "      <td>CAD2068472.1</td>\n",
       "      <td>A0A2X0SE34</td>\n",
       "      <td>ASFV-G S183L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>195</th>\n",
       "      <td>S273R</td>\n",
       "      <td>S273R.pdb</td>\n",
       "      <td>CAD2068473.1</td>\n",
       "      <td>A0A2X0TKM5</td>\n",
       "      <td>ASFV-G S273R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_2_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>196</th>\n",
       "      <td>X69R</td>\n",
       "      <td>X69R.pdb</td>\n",
       "      <td>CAD2068372.1</td>\n",
       "      <td>A0A2X0TKC7</td>\n",
       "      <td>ASFV-G X69R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_2_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>197 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    Protein Associated PDB NCBI_Accession  UniProt_ID _struct.title   \\\n",
       "0      285L       285L.pdb   CAD2068351.1  A0A485PQD1    ASFV-G 285L   \n",
       "1     A104R      A104R.pdb   CAD2068395.1  A0A0A1E0L7   ASFV-G A104R   \n",
       "2     A118R      A118R.pdb   CAD2068397.1  A0A2X0RVA9   ASFV-G A118R   \n",
       "3     A137R      A137R.pdb   CAD2068404.1  A0A2X0THQ0   ASFV-G A137R   \n",
       "4     A151R      A151R.pdb   CAD2068398.1  A0A2X0TC55   ASFV-G A151R   \n",
       "..      ...            ...            ...         ...            ...   \n",
       "192  QP509L     QP509L.pdb   CAD2068484.1  A0A2X0THX2  ASFV-G QP509L   \n",
       "193   R298L      R298L.pdb   CAD2068482.1  A0A2X0SE42   ASFV-G R298L   \n",
       "194   S183L      S183L.pdb   CAD2068472.1  A0A2X0SE34   ASFV-G S183L   \n",
       "195   S273R      S273R.pdb   CAD2068473.1  A0A2X0TKM5   ASFV-G S273R   \n",
       "196    X69R       X69R.pdb   CAD2068372.1  A0A2X0TKC7    ASFV-G X69R   \n",
       "\n",
       "                     _struct.pdbx_model_detail ranking debugg model ID notes  \n",
       "0    This model was predicted using AlphaFold2          model_1_pred_0   NaN  \n",
       "1    This model was predicted using AlphaFold2          model_2_pred_0   NaN  \n",
       "2    This model was predicted using AlphaFold2          model_1_pred_0   NaN  \n",
       "3    This model was predicted using AlphaFold2          model_3_pred_0   NaN  \n",
       "4    This model was predicted using AlphaFold2          model_4_pred_0   NaN  \n",
       "..                                         ...                     ...   ...  \n",
       "192  This model was predicted using AlphaFold2                     NaN   NaN  \n",
       "193  This model was predicted using AlphaFold2          model_3_pred_0   NaN  \n",
       "194  This model was predicted using AlphaFold2          model_4_pred_0   NaN  \n",
       "195  This model was predicted using AlphaFold2          model_2_pred_0   NaN  \n",
       "196  This model was predicted using AlphaFold2          model_2_pred_0   NaN  \n",
       "\n",
       "[197 rows x 8 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata = pd.read_csv(metadata_file)\n",
    "# protein names and PDB file names must both be unique (one row each)\n",
    "assert all(\n",
    "    len(set(metadata[key_col])) == len(metadata)\n",
    "    for key_col in (\"Protein\", \"Associated PDB\")\n",
    ")\n",
    "metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "cc7150d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# names of all '*.pdb' files in the model directory, sorted\n",
    "pdb_files = sorted(\n",
    "    fname for fname in os.listdir(pdb_dir) if fname.endswith(\".pdb\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "c038aa57",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check names: split every file name into (stem, extension)\n",
    "pdb_file_split = list(map(os.path.splitext, pdb_files))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "287a8d61",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ONLY AS PDB: ['QP509L-unrelaxed.pdb']\n",
      "ONLY IN METADATA: []\n"
     ]
    }
   ],
   "source": [
    "# CHECK THAT PDB FILES MATCH EXISTING ONES\n",
    "# -> extra QP509L-unrelaxed expected\n",
    "tstp = {*pdb_files}\n",
    "tstm = {*metadata[\"Associated PDB\"]}\n",
    "# names found on one side only, in both directions\n",
    "print(\"ONLY AS PDB:\", sorted(tstp.difference(tstm)))\n",
    "print(\"ONLY IN METADATA:\", sorted(tstm.difference(tstp)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "cbf8d9ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ONLY AS PDB: ['QP509L-unrelaxed']\n",
      "ONLY IN METADATA: []\n"
     ]
    }
   ],
   "source": [
    "# CHECK THAT FILE NAMES MATCH PROTEIN NAMES\n",
    "# compare the file names without extension to the protein names\n",
    "tstp = {stem for stem, _ in pdb_file_split}\n",
    "tstm = {*metadata.Protein}\n",
    "print(\"ONLY AS PDB:\", sorted(tstp.difference(tstm)))\n",
    "print(\"ONLY IN METADATA:\", sorted(tstm.difference(tstp)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "95c0209e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# can use either Protein or PDB name as index\n",
    "# only re-index once, so running this cell again does not fail with a\n",
    "# KeyError for the already consumed \"Protein\" column\n",
    "if metadata.index.name != \"Protein\":\n",
    "    metadata = metadata.set_index(\"Protein\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "68a98f97",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Associated PDB</th>\n",
       "      <th>NCBI_Accession</th>\n",
       "      <th>UniProt_ID</th>\n",
       "      <th>_struct.title</th>\n",
       "      <th>_struct.pdbx_model_detail</th>\n",
       "      <th>ranking debugg model ID</th>\n",
       "      <th>notes</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Protein</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Associated PDB, NCBI_Accession, UniProt_ID, _struct.title , _struct.pdbx_model_detail, ranking debugg model ID, notes]\n",
       "Index: []"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# NOTE: the column name really ends with a space\n",
    "# rows without a title (none expected)\n",
    "metadata.loc[metadata[\"_struct.title \"].isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "628099f5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Associated PDB</th>\n",
       "      <th>NCBI_Accession</th>\n",
       "      <th>UniProt_ID</th>\n",
       "      <th>_struct.title</th>\n",
       "      <th>_struct.pdbx_model_detail</th>\n",
       "      <th>ranking debugg model ID</th>\n",
       "      <th>notes</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Protein</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [Associated PDB, NCBI_Accession, UniProt_ID, _struct.title , _struct.pdbx_model_detail, ranking debugg model ID, notes]\n",
       "Index: []"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rows without model details\n",
    "metadata.loc[metadata[\"_struct.pdbx_model_detail\"].isna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "79f8dbfa",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Associated PDB</th>\n",
       "      <th>NCBI_Accession</th>\n",
       "      <th>UniProt_ID</th>\n",
       "      <th>_struct.title</th>\n",
       "      <th>_struct.pdbx_model_detail</th>\n",
       "      <th>ranking debugg model ID</th>\n",
       "      <th>notes</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Protein</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>CP2475L_p14</th>\n",
       "      <td>CP2475L_p14.pdb</td>\n",
       "      <td>CAD2068454.1</td>\n",
       "      <td>A0A2X0THU5</td>\n",
       "      <td>ASFV-G CP2475L p14</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>protein p14 from the pp220 polyprotein encoded...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CP2475L_p34</th>\n",
       "      <td>CP2475L_p34.pdb</td>\n",
       "      <td>CAD2068454.1</td>\n",
       "      <td>A0A2X0THU5</td>\n",
       "      <td>ASFV-G CP2475L p34</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>protein p34 from the pp220 polyprotein encoded...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CP2475L_p37</th>\n",
       "      <td>CP2475L_p37.pdb</td>\n",
       "      <td>CAD2068454.1</td>\n",
       "      <td>A0A2X0THU5</td>\n",
       "      <td>ASFV-G CP2475L p37</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>protein p37 from the pp220 polyprotein encoded...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CP2475L_p150</th>\n",
       "      <td>CP2475L_p150.pdb</td>\n",
       "      <td>CAD2068454.1</td>\n",
       "      <td>A0A2X0THU5</td>\n",
       "      <td>ASFV-G CP2475L p150</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_1_pred_0</td>\n",
       "      <td>protein p150 from the pp220 polyprotein encode...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>CP2475L_p5</th>\n",
       "      <td>CP2475L_p5.pdb</td>\n",
       "      <td>CAD2068454.1</td>\n",
       "      <td>A0A2X0THU5</td>\n",
       "      <td>ASFV-G CP2475L p5</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>protein p5 from the pp220 polyprotein encoded ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>D250R</th>\n",
       "      <td>D250R.pdb</td>\n",
       "      <td>CAD2068464.1</td>\n",
       "      <td>A0A2X0THV3</td>\n",
       "      <td>ASFV-G D250R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>Mislabled on NCBI and Uniport as D205R</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>DP79L</th>\n",
       "      <td>DP79L.pdb</td>\n",
       "      <td>CAD2068466.1</td>\n",
       "      <td>A0A0A1E158</td>\n",
       "      <td>ASFV-G DP79L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>Mislabled on Uniprot as D79L</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hypothetical_01</th>\n",
       "      <td>hypothetical_01.pdb</td>\n",
       "      <td>CAD2068367.1</td>\n",
       "      <td>A0A485PU43</td>\n",
       "      <td>ASFV-G hypothetical_01</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_5_pred_0</td>\n",
       "      <td>labeled as hypothetical on NCBI and Uniprot</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hypothetical_02</th>\n",
       "      <td>hypothetical_02.pdb</td>\n",
       "      <td>CAD2068400.1</td>\n",
       "      <td>A0A485PQI3</td>\n",
       "      <td>ASFV-G hypothetical_02</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>labeled as hypthetical on NCBI and Uniprot</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>hypothetical_03</th>\n",
       "      <td>hypothetical_03.pdb</td>\n",
       "      <td>CAD2068512.1</td>\n",
       "      <td>A0A485PZB7</td>\n",
       "      <td>ASFV-G hypothetical_03</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>labeled as hypthetical on NCBI and Uniprot</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                      Associated PDB NCBI_Accession  UniProt_ID  \\\n",
       "Protein                                                           \n",
       "CP2475L_p14          CP2475L_p14.pdb   CAD2068454.1  A0A2X0THU5   \n",
       "CP2475L_p34          CP2475L_p34.pdb   CAD2068454.1  A0A2X0THU5   \n",
       "CP2475L_p37          CP2475L_p37.pdb   CAD2068454.1  A0A2X0THU5   \n",
       "CP2475L_p150        CP2475L_p150.pdb   CAD2068454.1  A0A2X0THU5   \n",
       "CP2475L_p5            CP2475L_p5.pdb   CAD2068454.1  A0A2X0THU5   \n",
       "D250R                      D250R.pdb   CAD2068464.1  A0A2X0THV3   \n",
       "DP79L                      DP79L.pdb   CAD2068466.1  A0A0A1E158   \n",
       "hypothetical_01  hypothetical_01.pdb   CAD2068367.1  A0A485PU43   \n",
       "hypothetical_02  hypothetical_02.pdb   CAD2068400.1  A0A485PQI3   \n",
       "hypothetical_03  hypothetical_03.pdb   CAD2068512.1  A0A485PZB7   \n",
       "\n",
       "                         _struct.title   \\\n",
       "Protein                                   \n",
       "CP2475L_p14          ASFV-G CP2475L p14   \n",
       "CP2475L_p34          ASFV-G CP2475L p34   \n",
       "CP2475L_p37          ASFV-G CP2475L p37   \n",
       "CP2475L_p150        ASFV-G CP2475L p150   \n",
       "CP2475L_p5            ASFV-G CP2475L p5   \n",
       "D250R                      ASFV-G D250R   \n",
       "DP79L                      ASFV-G DP79L   \n",
       "hypothetical_01  ASFV-G hypothetical_01   \n",
       "hypothetical_02  ASFV-G hypothetical_02   \n",
       "hypothetical_03  ASFV-G hypothetical_03   \n",
       "\n",
       "                                 _struct.pdbx_model_detail  \\\n",
       "Protein                                                      \n",
       "CP2475L_p14      This model was predicted using AlphaFold2   \n",
       "CP2475L_p34      This model was predicted using AlphaFold2   \n",
       "CP2475L_p37      This model was predicted using AlphaFold2   \n",
       "CP2475L_p150     This model was predicted using AlphaFold2   \n",
       "CP2475L_p5       This model was predicted using AlphaFold2   \n",
       "D250R            This model was predicted using AlphaFold2   \n",
       "DP79L            This model was predicted using AlphaFold2   \n",
       "hypothetical_01  This model was predicted using AlphaFold2   \n",
       "hypothetical_02  This model was predicted using AlphaFold2   \n",
       "hypothetical_03  This model was predicted using AlphaFold2   \n",
       "\n",
       "                ranking debugg model ID  \\\n",
       "Protein                                   \n",
       "CP2475L_p14              model_4_pred_0   \n",
       "CP2475L_p34              model_4_pred_0   \n",
       "CP2475L_p37              model_4_pred_0   \n",
       "CP2475L_p150             model_1_pred_0   \n",
       "CP2475L_p5               model_3_pred_0   \n",
       "D250R                    model_3_pred_0   \n",
       "DP79L                    model_3_pred_0   \n",
       "hypothetical_01          model_5_pred_0   \n",
       "hypothetical_02          model_3_pred_0   \n",
       "hypothetical_03          model_3_pred_0   \n",
       "\n",
       "                                                             notes  \n",
       "Protein                                                             \n",
       "CP2475L_p14      protein p14 from the pp220 polyprotein encoded...  \n",
       "CP2475L_p34      protein p34 from the pp220 polyprotein encoded...  \n",
       "CP2475L_p37      protein p37 from the pp220 polyprotein encoded...  \n",
       "CP2475L_p150     protein p150 from the pp220 polyprotein encode...  \n",
       "CP2475L_p5       protein p5 from the pp220 polyprotein encoded ...  \n",
       "D250R                       Mislabled on NCBI and Uniport as D205R  \n",
       "DP79L                                 Mislabled on Uniprot as D79L  \n",
       "hypothetical_01        labeled as hypothetical on NCBI and Uniprot  \n",
       "hypothetical_02         labeled as hypthetical on NCBI and Uniprot  \n",
       "hypothetical_03         labeled as hypthetical on NCBI and Uniprot  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# rows that come with notes\n",
    "metadata.loc[metadata.notes.notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "id": "30f0abc2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "protein p14 from the pp220 polyprotein encoded by CP2475L\n",
      "protein p34 from the pp220 polyprotein encoded by CP2475L\n",
      "protein p37 from the pp220 polyprotein encoded by CP2475L\n",
      "protein p150 from the pp220 polyprotein encoded by CP2475L\n",
      "protein p5 from the pp220 polyprotein encoded by CP2475L\n",
      "mislabeled on NCBI and UniProt as D205R\n",
      "mislabeled on UniProt as D79L\n",
      "labeled as hypothetical on NCBI and UniProt\n",
      "labeled as hypothetical on NCBI and UniProt\n",
      "labeled as hypothetical on NCBI and UniProt\n"
     ]
    }
   ],
   "source": [
    "# spelling corrections for the free-text notes, applied in this order\n",
    "note_fixes = (\n",
    "    (\"hypthetical\", \"hypothetical\"),\n",
    "    (\"Uniport\", \"UniProt\"),\n",
    "    (\"Uniprot\", \"UniProt\"),\n",
    "    (\"Mislabled\", \"mislabeled\"),\n",
    ")\n",
    "# show every non-empty note the way it reads after the corrections\n",
    "for mdl_notes in metadata.loc[metadata.notes.notna(), \"notes\"]:\n",
    "    for typo, fixed in note_fixes:\n",
    "        mdl_notes = mdl_notes.replace(typo, fixed)\n",
    "    print(mdl_notes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "id": "9b11b709",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'labeled as hypothetical on NCBI and Uniprot'"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mdl_notes = 'labeled as hypthetical on NCBI and Uniprot'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "4e1a1d06",
   "metadata": {},
   "outputs": [],
   "source": [
    "# tst = metadata.loc[\"hypothetical_03\"]\n",
    "# s = _get_ncbi_sequence(tst.NCBI_Accession)\n",
    "# up_data = _fetch_upkb_entry(tst.UniProt_ID)\n",
    "# print(s.name, s, len(s), up_data[\"up_sequence\"] == str(s))\n",
    "# up_data\n",
    "\n",
    "# checked all the ones above manually; they are OK as stated (best to add to model_detail!)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "id": "5f53cdc7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Associated PDB                                              QP509L.pdb\n",
       "NCBI_Accession                                            CAD2068484.1\n",
       "UniProt_ID                                                  A0A2X0THX2\n",
       "_struct.title                                            ASFV-G QP509L\n",
       "_struct.pdbx_model_detail    This model was predicted using AlphaFold2\n",
       "ranking debugg model ID                                            NaN\n",
       "notes                                                              NaN\n",
       "NCBI_Gi                                                     1886137009\n",
       "NCBI_UpdateDate                                             2020/08/05\n",
       "NCBI_TaxId                                                       10497\n",
       "Name: QP509L, dtype: object"
      ]
     },
     "execution_count": 113,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# special one...\n",
    "metadata.loc[\"QP509L\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "84497409",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'This model was predicted using AlphaFold2'}"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(metadata[\"_struct.pdbx_model_detail\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "3836742d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "197"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(metadata[\"_struct.title \"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "6bb77c3e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "QP509L nan\n"
     ]
    }
   ],
   "source": [
    "# check model numbers: every \"ranking debugg model ID\" is expected to be a\n",
    "# string with 4 '_'-separated parts whose 2nd part is a number in 1..5;\n",
    "# anything deviating from that is printed\n",
    "tst = metadata[\"ranking debugg model ID\"]\n",
    "for idx, mdl_id in tst.items():\n",
    "    mdl_num = None\n",
    "    # non-string values (e.g. NaN for missing entries) cannot be parsed\n",
    "    if isinstance(mdl_id, str):\n",
    "        mdl_id_split = mdl_id.split('_')\n",
    "        if len(mdl_id_split) == 4:\n",
    "            try:\n",
    "                mdl_num = int(mdl_id_split[1])\n",
    "            except ValueError:\n",
    "                # non-numeric model number: reported as unparsable below\n",
    "                # instead of aborting the whole check\n",
    "                mdl_num = None\n",
    "    if mdl_num is None:\n",
    "        # ID missing or not in the expected format\n",
    "        print(idx, mdl_id)\n",
    "    elif mdl_num not in range(1, 6):\n",
    "        # parsed fine but outside of 1..5 (this includes 0)\n",
    "        print(idx, mdl_id, mdl_num)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "7b163025",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _check_subset(s1, s2):\n",
    "    \"\"\"Check if s2 is uniquely contained in s1.\n",
    "\n",
    "    Overlapping occurrences are counted as separate hits, so e.g. \"AA\" is\n",
    "    not considered to be uniquely contained in \"AAA\".\n",
    "\n",
    "    :param s1: String to search in.\n",
    "    :param s2: String to look for.\n",
    "    :returns: Tuple (align_begin, align_end) with the 1-based, inclusive\n",
    "              range of s2 in s1 (values for seq_db_align_begin &\n",
    "              seq_db_align_end) or None if s2 is empty, not found or\n",
    "              found more than once.\n",
    "    \"\"\"\n",
    "    # an empty string has no meaningful position in s1\n",
    "    if not s2:\n",
    "        return None\n",
    "    first = s1.find(s2)\n",
    "    if first < 0:\n",
    "        return None\n",
    "    # str.count only counts non-overlapping hits, so look for a second hit\n",
    "    # starting right after the start of the first one instead\n",
    "    if s1.find(s2, first + 1) >= 0:\n",
    "        return None\n",
    "    align_begin = first + 1\n",
    "    align_end = align_begin + len(s2) - 1\n",
    "    return align_begin, align_end"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "0fe8bd2f",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CP2475L_p14 PDB seq. is subset of UP (369, 522)\n",
      "CP2475L_p14 PDB seq. is subset of NCBI (369, 522)\n",
      "CP2475L_p150 PDB seq. is subset of UP (894, 2476)\n",
      "CP2475L_p150 PDB seq. is subset of NCBI (894, 2476)\n",
      "CP2475L_p34 PDB seq. is subset of UP (45, 368)\n",
      "CP2475L_p34 PDB seq. is subset of NCBI (45, 368)\n",
      "CP2475L_p37 PDB seq. is subset of UP (523, 893)\n",
      "CP2475L_p37 PDB seq. is subset of NCBI (523, 893)\n",
      "CP2475L_p5 PDB seq. is subset of UP (2, 39)\n",
      "CP2475L_p5 PDB seq. is subset of NCBI (2, 39)\n"
     ]
    }
   ],
   "source": [
    "# check shared ones: compare the sequence of each PDB file with the\n",
    "# sequences from UniProtKB (UP) and NCBI, printing only what deviates\n",
    "for protein, pdb_ext in sorted(pdb_file_split):\n",
    "    if protein not in metadata.index:\n",
    "        print(\"SKIPPING\", protein)\n",
    "        continue\n",
    "    row = metadata.loc[protein]\n",
    "    # sequence as found in the model file (single chain expected)\n",
    "    pdb_path = os.path.join(pdb_dir, protein + pdb_ext)\n",
    "    ent = io.LoadPDB(pdb_path)\n",
    "    assert ent.chain_count == 1\n",
    "    sqe = _get_sequence(ent.chains[0])\n",
    "    # reference sequences from both databases\n",
    "    s_ncbi = _get_ncbi_sequence(row.NCBI_Accession)\n",
    "    up_data = _fetch_upkb_entry(row.UniProt_ID)\n",
    "    up_seq = up_data[\"up_sequence\"]\n",
    "    ncbi_seq = str(s_ncbi)\n",
    "    if up_seq != ncbi_seq:\n",
    "        print(protein, \"inconsistent UP/NCBI sequences\", up_seq, ncbi_seq)\n",
    "    # same check against both references: identical is fine, a unique\n",
    "    # sub-sequence is reported with its range, the rest is inconsistent\n",
    "    for ref_seq, msg_subset, msg_differs in (\n",
    "        (up_seq, \"PDB seq. is subset of UP\", \"inconsistent UP/PDB sequences\"),\n",
    "        (ncbi_seq, \"PDB seq. is subset of NCBI\", \"inconsistent NCBI/PDB sequences\"),\n",
    "    ):\n",
    "        if ref_seq == sqe:\n",
    "            continue\n",
    "        tst = _check_subset(ref_seq, sqe)\n",
    "        if tst:\n",
    "            print(protein, msg_subset, tst)\n",
    "        else:\n",
    "            print(protein, msg_differs, ref_seq, sqe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "48165d13",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{1.0} {0.0}\n",
      "IN UNRELAXED: []\n",
      "IN RELAXED: []\n"
     ]
    }
   ],
   "source": [
    "# check QP509L (take pLDDT from unrelaxed)\n",
    "ent_unr = io.LoadPDB(\"./InputFiles/AlphaFold-RENAME/QP509L-unrelaxed.pdb\")\n",
    "ent_rel = io.LoadPDB(\"./InputFiles/AlphaFold-RENAME/QP509L.pdb\")\n",
    "# distinct occupancies and B-factors found in the relaxed model\n",
    "rel_occupancies = {atom.occupancy for atom in ent_rel.atoms}\n",
    "rel_b_factors = {atom.b_factor for atom in ent_rel.atoms}\n",
    "print(rel_occupancies, rel_b_factors)\n",
    "# compare the atoms of both models and list the ones only found in one\n",
    "ev_atoms = {atom.qualified_name for atom in ent_rel.atoms}\n",
    "eu_atoms = {atom.qualified_name for atom in ent_unr.atoms}\n",
    "print(\"IN UNRELAXED:\", sorted(eu_atoms.difference(ev_atoms)))\n",
    "print(\"IN RELAXED:\", sorted(ev_atoms.difference(eu_atoms)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "fadfbdd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _get_ncbi_info(ncbi_ac, timeout=30):\n",
    "    \"\"\"Fetch dict with info from NCBI web service.\n",
    "\n",
    "    Queries the ESummary service for the protein database and collects the\n",
    "    \"Item\" elements of the single \"DocSum\" element expected in the reply.\n",
    "\n",
    "    :param ncbi_ac: Identifier of the NCBI protein entry to look up.\n",
    "    :param timeout: Seconds to wait for the web service before giving up.\n",
    "    :returns: Dict mapping the \"Name\" attribute of each item to its value\n",
    "              (str for items of type \"String\", int for type \"Integer\",\n",
    "              None for items without content).\n",
    "    :raises requests.HTTPError: If the service replies with an error status.\n",
    "    :raises RuntimeError: If an item with content has an unknown type.\n",
    "    \"\"\"\n",
    "    # src: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary\n",
    "    rspns = requests.get(\n",
    "        \"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/\"\n",
    "        f\"esummary.fcgi?db=protein&id={ncbi_ac}\",\n",
    "        timeout=timeout,\n",
    "    )\n",
    "    # fail here with a clear HTTP error instead of an XML parsing error below\n",
    "    rspns.raise_for_status()\n",
    "    dom = xml.dom.minidom.parseString(rspns.text)\n",
    "    docsums = dom.getElementsByTagName(\"DocSum\")\n",
    "    assert len(docsums) == 1, \\\n",
    "        f\"Expected exactly 1 DocSum for {ncbi_ac}, got {len(docsums)}\"\n",
    "    ncbi_dict = {}\n",
    "    for cn in docsums[0].childNodes:\n",
    "        # skip everything which is not an <Item> (e.g. whitespace, <Id>)\n",
    "        if cn.nodeName != \"Item\":\n",
    "            continue\n",
    "        cn_name = cn.getAttribute(\"Name\")\n",
    "        cn_type = cn.getAttribute(\"Type\")\n",
    "        # items without content are kept with None as value\n",
    "        if not cn.childNodes:\n",
    "            ncbi_dict[cn_name] = None\n",
    "            continue\n",
    "        d = cn.childNodes[0].data\n",
    "        if cn_type == \"String\":\n",
    "            ncbi_dict[cn_name] = d\n",
    "        elif cn_type == \"Integer\":\n",
    "            ncbi_dict[cn_name] = int(d)\n",
    "        else:\n",
    "            raise RuntimeError(f\"Unknown type {cn_type} for {ncbi_ac}\")\n",
    "    return ncbi_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "1114ebce",
   "metadata": {},
   "outputs": [],
   "source": [
    "# fetch some extra info from NCBI\n",
    "# (adds columns NCBI_Gi, NCBI_UpdateDate & NCBI_TaxId to metadata in place\n",
    "# and prints a line for every entry which is not as expected)\n",
    "for idx, row in metadata.iterrows():\n",
    "    ncbi_info = _get_ncbi_info(row.NCBI_Accession)\n",
    "    # Gi is some numerical sequence identifier used internally by NCBI\n",
    "    metadata.loc[idx, \"NCBI_Gi\"] = str(ncbi_info[\"Gi\"])\n",
    "    # UpdateDate is to be stored as the version date in ModelCIF\n",
    "    metadata.loc[idx, \"NCBI_UpdateDate\"] = ncbi_info[\"UpdateDate\"]\n",
    "    # TaxId should be same as one from UP\n",
    "    metadata.loc[idx, \"NCBI_TaxId\"] = str(ncbi_info[\"TaxId\"])\n",
    "    # Status expected to be live\n",
    "    if ncbi_info[\"Status\"] != \"live\":\n",
    "        print(idx, row.NCBI_Accession, \"Status\", ncbi_info[\"Status\"])\n",
    "    # ReplacedBy expected to be empty\n",
    "    if ncbi_info[\"ReplacedBy\"]:\n",
    "        print(idx, row.NCBI_Accession, \"ReplacedBy\", ncbi_info[\"ReplacedBy\"])\n",
    "    # AccessionVersion expected to be NCBI_Accession\n",
    "    if ncbi_info[\"AccessionVersion\"] != row.NCBI_Accession:\n",
    "        print(idx, row.NCBI_Accession, \"AccessionVersion\", ncbi_info[\"AccessionVersion\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "e5d1ea29",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Associated PDB</th>\n",
       "      <th>NCBI_Accession</th>\n",
       "      <th>UniProt_ID</th>\n",
       "      <th>_struct.title</th>\n",
       "      <th>_struct.pdbx_model_detail</th>\n",
       "      <th>ranking debugg model ID</th>\n",
       "      <th>notes</th>\n",
       "      <th>NCBI_Gi</th>\n",
       "      <th>NCBI_UpdateDate</th>\n",
       "      <th>NCBI_TaxId</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Protein</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>285L</th>\n",
       "      <td>285L.pdb</td>\n",
       "      <td>CAD2068351.1</td>\n",
       "      <td>A0A485PQD1</td>\n",
       "      <td>ASFV-G 285L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_1_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136876</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A104R</th>\n",
       "      <td>A104R.pdb</td>\n",
       "      <td>CAD2068395.1</td>\n",
       "      <td>A0A0A1E0L7</td>\n",
       "      <td>ASFV-G A104R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_2_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136920</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A118R</th>\n",
       "      <td>A118R.pdb</td>\n",
       "      <td>CAD2068397.1</td>\n",
       "      <td>A0A2X0RVA9</td>\n",
       "      <td>ASFV-G A118R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_1_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136922</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A137R</th>\n",
       "      <td>A137R.pdb</td>\n",
       "      <td>CAD2068404.1</td>\n",
       "      <td>A0A2X0THQ0</td>\n",
       "      <td>ASFV-G A137R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136929</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>A151R</th>\n",
       "      <td>A151R.pdb</td>\n",
       "      <td>CAD2068398.1</td>\n",
       "      <td>A0A2X0TC55</td>\n",
       "      <td>ASFV-G A151R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136923</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>QP509L</th>\n",
       "      <td>QP509L.pdb</td>\n",
       "      <td>CAD2068484.1</td>\n",
       "      <td>A0A2X0THX2</td>\n",
       "      <td>ASFV-G QP509L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886137009</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>R298L</th>\n",
       "      <td>R298L.pdb</td>\n",
       "      <td>CAD2068482.1</td>\n",
       "      <td>A0A2X0SE42</td>\n",
       "      <td>ASFV-G R298L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_3_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886137007</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>S183L</th>\n",
       "      <td>S183L.pdb</td>\n",
       "      <td>CAD2068472.1</td>\n",
       "      <td>A0A2X0SE34</td>\n",
       "      <td>ASFV-G S183L</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_4_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136997</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>S273R</th>\n",
       "      <td>S273R.pdb</td>\n",
       "      <td>CAD2068473.1</td>\n",
       "      <td>A0A2X0TKM5</td>\n",
       "      <td>ASFV-G S273R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_2_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136998</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>X69R</th>\n",
       "      <td>X69R.pdb</td>\n",
       "      <td>CAD2068372.1</td>\n",
       "      <td>A0A2X0TKC7</td>\n",
       "      <td>ASFV-G X69R</td>\n",
       "      <td>This model was predicted using AlphaFold2</td>\n",
       "      <td>model_2_pred_0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1886136897</td>\n",
       "      <td>2020/08/05</td>\n",
       "      <td>10497</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>197 rows × 10 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Associated PDB NCBI_Accession  UniProt_ID _struct.title   \\\n",
       "Protein                                                            \n",
       "285L          285L.pdb   CAD2068351.1  A0A485PQD1    ASFV-G 285L   \n",
       "A104R        A104R.pdb   CAD2068395.1  A0A0A1E0L7   ASFV-G A104R   \n",
       "A118R        A118R.pdb   CAD2068397.1  A0A2X0RVA9   ASFV-G A118R   \n",
       "A137R        A137R.pdb   CAD2068404.1  A0A2X0THQ0   ASFV-G A137R   \n",
       "A151R        A151R.pdb   CAD2068398.1  A0A2X0TC55   ASFV-G A151R   \n",
       "...                ...            ...         ...            ...   \n",
       "QP509L      QP509L.pdb   CAD2068484.1  A0A2X0THX2  ASFV-G QP509L   \n",
       "R298L        R298L.pdb   CAD2068482.1  A0A2X0SE42   ASFV-G R298L   \n",
       "S183L        S183L.pdb   CAD2068472.1  A0A2X0SE34   ASFV-G S183L   \n",
       "S273R        S273R.pdb   CAD2068473.1  A0A2X0TKM5   ASFV-G S273R   \n",
       "X69R          X69R.pdb   CAD2068372.1  A0A2X0TKC7    ASFV-G X69R   \n",
       "\n",
       "                         _struct.pdbx_model_detail ranking debugg model ID  \\\n",
       "Protein                                                                      \n",
       "285L     This model was predicted using AlphaFold2          model_1_pred_0   \n",
       "A104R    This model was predicted using AlphaFold2          model_2_pred_0   \n",
       "A118R    This model was predicted using AlphaFold2          model_1_pred_0   \n",
       "A137R    This model was predicted using AlphaFold2          model_3_pred_0   \n",
       "A151R    This model was predicted using AlphaFold2          model_4_pred_0   \n",
       "...                                            ...                     ...   \n",
       "QP509L   This model was predicted using AlphaFold2                     NaN   \n",
       "R298L    This model was predicted using AlphaFold2          model_3_pred_0   \n",
       "S183L    This model was predicted using AlphaFold2          model_4_pred_0   \n",
       "S273R    This model was predicted using AlphaFold2          model_2_pred_0   \n",
       "X69R     This model was predicted using AlphaFold2          model_2_pred_0   \n",
       "\n",
       "        notes     NCBI_Gi NCBI_UpdateDate NCBI_TaxId  \n",
       "Protein                                               \n",
       "285L      NaN  1886136876      2020/08/05      10497  \n",
       "A104R     NaN  1886136920      2020/08/05      10497  \n",
       "A118R     NaN  1886136922      2020/08/05      10497  \n",
       "A137R     NaN  1886136929      2020/08/05      10497  \n",
       "A151R     NaN  1886136923      2020/08/05      10497  \n",
       "...       ...         ...             ...        ...  \n",
       "QP509L    NaN  1886137009      2020/08/05      10497  \n",
       "R298L     NaN  1886137007      2020/08/05      10497  \n",
       "S183L     NaN  1886136997      2020/08/05      10497  \n",
       "S273R     NaN  1886136998      2020/08/05      10497  \n",
       "X69R      NaN  1886136897      2020/08/05      10497  \n",
       "\n",
       "[197 rows x 10 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "b8d18043",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(193, {'10497'}, 193, {'2020/08/05'})"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(metadata.NCBI_Accession)), set(metadata.NCBI_TaxId), \\\n",
    "len(set(metadata.NCBI_Gi)), set(metadata.NCBI_UpdateDate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "4385bd3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# all matching tax IDs?\n",
    "# (prints every entry whose tax ID in UniProtKB is not \"10497\";\n",
    "# one web request per entry, so this takes a while)\n",
    "for protein, upac in metadata.UniProt_ID.items():\n",
    "    up_info = _fetch_upkb_entry(upac)\n",
    "    if up_info[\"up_ncbi_taxid\"] != \"10497\":\n",
    "        print(protein, up_info)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "0376a9e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check PDB files\n",
    "import ost\n",
    "from ost import testutils, conop\n",
    "# setup conop\n",
    "testutils.SetDefaultCompoundLib()\n",
    "io.profiles['DEFAULT'].processor = conop.RuleBasedProcessor(conop.GetDefaultLib())\n",
    "# check processing\n",
    "ost.PushVerbosityLevel(2)\n",
    "try:\n",
    "    for protein, pdb_ext in sorted(pdb_file_split):\n",
    "        pdb_path = os.path.join(pdb_dir, protein + pdb_ext)\n",
    "        ent = io.LoadPDB(pdb_path)\n",
    "finally:\n",
    "    # always undo the push above, even if loading one of the files fails,\n",
    "    # so that the verbosity of the following cells is not affected\n",
    "    ost.PopVerbosityLevel()\n",
    "# NOTE: lack of output means that all atom names are ok"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "5a857298",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'up_organism': 'African swine fever virus (ASFV)',\n",
       " 'up_sequence': 'MLLYIVIIVACIISKLVPNEYWAIHLFFIIMIFMVYMYEKLDIHQKSQFWNYTMSGLSGHNVQVTCKCY',\n",
       " 'up_ac': 'A0A2X0TKC7',\n",
       " 'up_id': 'A0A2X0TKC7_ASF',\n",
       " 'up_last_mod': datetime.datetime(2018, 9, 12, 0, 0),\n",
       " 'up_gn': 'X69R CDS',\n",
       " 'up_ncbi_taxid': '10497',\n",
       " 'up_seqlen': 69,\n",
       " 'up_crc64': '3B92E4DB323A7A74',\n",
       " 'up_isoform': None}"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "up_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "6c8d2b8a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Caption': 'CAD2068372',\n",
       " 'Title': 'X69R CDS [African swine fever virus]',\n",
       " 'Extra': 'gi|1886136897|emb|CAD2068372.1|[1886136897]',\n",
       " 'Gi': 1886136897,\n",
       " 'CreateDate': '2010/08/18',\n",
       " 'UpdateDate': '2020/08/05',\n",
       " 'Flags': 0,\n",
       " 'TaxId': 10497,\n",
       " 'Length': 69,\n",
       " 'Status': 'live',\n",
       " 'ReplacedBy': None,\n",
       " 'Comment': '  ',\n",
       " 'AccessionVersion': 'CAD2068372.1'}"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ncbi_info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "f7097a09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# check files that were converted (input is std out from run cut to only include \"translating...\")\n",
    "# (context manager makes sure that the file is closed again after reading)\n",
    "with open(\"./script_out.txt\") as log_file:\n",
    "    log_lines = log_file.readlines()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "id": "34ed53d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# plenty of assertions in here should also catch any errors...\n",
    "# Each translated model comes as a block of 6 log lines: \"translating ...\",\n",
    "# one line per step listed below (each ending with the time spent as\n",
    "# \"(<seconds>s)\") and a final \"... done with ...\" line with the total time.\n",
    "step_lines = (\n",
    "    (\"preparing data\", \"t_prep\"),\n",
    "    (\"generating ModelCIF objects\", \"t_cif\"),\n",
    "    (\"processing QA scores\", \"t_qa\"),\n",
    "    (\"write to disk\", \"t_write\"),\n",
    ")\n",
    "idx = 1\n",
    "timings = dict()\n",
    "while idx < len(log_lines) - 1:\n",
    "    line = log_lines[idx].strip()\n",
    "    # models done in an earlier run only get a single line in the log\n",
    "    if \"already done...\" in line:\n",
    "        idx += 1\n",
    "        continue\n",
    "    assert line.startswith(\"translating\")\n",
    "    mdl_title = line[len(\"translating\"):-3].strip()\n",
    "    if mdl_title in timings:\n",
    "        print(\"WEIRD\", line)\n",
    "    # timings of the single steps (last token looks like \"(1.23s)\")\n",
    "    mdl_timings = {}\n",
    "    for offset, (prefix, key) in enumerate(step_lines, start=1):\n",
    "        line = log_lines[idx + offset].strip()\n",
    "        assert line.startswith(prefix)\n",
    "        assert line.endswith(\"s)\")\n",
    "        mdl_timings[key] = float(line.split()[-1][1:-2])\n",
    "    # overall timing (last token looks like \"(1.23s).\")\n",
    "    line = log_lines[idx + len(step_lines) + 1].strip()\n",
    "    assert line.startswith(\"... done with\")\n",
    "    assert line.endswith(\"s).\")\n",
    "    mdl_timings[\"t_all\"] = float(line.split()[-1][1:-3])\n",
    "    timings[mdl_title] = mdl_timings\n",
    "    idx += len(step_lines) + 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "876acf01",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DONE 197 models\n",
      "MISSING 0 models\n"
     ]
    }
   ],
   "source": [
    "# compare models with timings in the log with the ones in the metadata\n",
    "print(f\"DONE {len(timings)} models\")\n",
    "mdl_titles = set(metadata.index)\n",
    "timed_titles = set(timings)\n",
    "# nothing may show up in the log which is unknown to the metadata\n",
    "assert timed_titles.issubset(mdl_titles)\n",
    "missing_ones = mdl_titles.difference(timed_titles)\n",
    "print(f\"MISSING {len(missing_ones)} models\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "id": "612a1791",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TOTAL TIME SPENT: 356.38 s\n",
      "-> t_prep: 80.5%\n",
      "-> t_cif: 1.2%\n",
      "-> t_qa: 0.0%\n",
      "-> t_write: 17.8%\n"
     ]
    }
   ],
   "source": [
    "# sum up the timings of all models, per step and overall\n",
    "k_parts = [\"t_prep\", \"t_cif\", \"t_qa\", \"t_write\"]\n",
    "totals = {}\n",
    "for k in k_parts + [\"t_all\"]:\n",
    "    totals[k] = sum(mdl_timings[k] for mdl_timings in timings.values())\n",
    "t_total = totals[\"t_all\"]\n",
    "print(f\"TOTAL TIME SPENT: {round(t_total, 2)} s\")\n",
    "# share of each step in the overall time\n",
    "for k in k_parts:\n",
    "    t_share = 100 * totals[k] / t_total\n",
    "    print(f\"-> {k}: {round(t_share, 1)}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "283d223f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}