{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "ac0e1325", "metadata": {}, "outputs": [], "source": [ "import requests\n", "import datetime\n", "import os\n", "import pandas as pd\n", "import xml.dom.minidom\n", "from ost import io\n", "\n", "# from Tara code\n", "def _get_sequence(chn):\n", " \"\"\"Get the sequence out of an OST chain.\"\"\"\n", " # initialise\n", " lst_rn = chn.residues[0].number.num\n", " idx = 1\n", " sqe = chn.residues[0].one_letter_code\n", " if lst_rn != 1:\n", " sqe = \"-\"\n", " idx = 0\n", "\n", " for res in chn.residues[idx:]:\n", " lst_rn += 1\n", " while lst_rn != res.number.num:\n", " sqe += \"-\"\n", " lst_rn += 1\n", " sqe += res.one_letter_code\n", "\n", " return sqe\n", "\n", "def _check_sequence(up_ac, sequence):\n", " \"\"\"Verify sequence to only contain standard olc.\"\"\"\n", " for res in sequence:\n", " if res not in \"ACDEFGHIKLMNPQRSTVWY\":\n", " raise RuntimeError(\n", " \"Non-standard aa found in UniProtKB sequence \"\n", " + f\"for entry '{up_ac}': {res}\"\n", " )\n", "\n", "def _fetch_upkb_entry(up_ac):\n", " \"\"\"Fetch data for an UniProtKB entry.\"\"\"\n", " # This is a simple parser for UniProtKB txt format, instead of breaking it up\n", " # into multiple functions, we just allow many many branches & statements,\n", " # here.\n", " # pylint: disable=too-many-branches,too-many-statements\n", " data = {}\n", " data[\"up_organism\"] = \"\"\n", " data[\"up_sequence\"] = \"\"\n", " data[\"up_ac\"] = up_ac\n", " rspns = requests.get(f\"https://www.uniprot.org/uniprot/{up_ac}.txt\")\n", " for line in rspns.iter_lines(decode_unicode=True):\n", " if line.startswith(\"ID \"):\n", " sline = line.split()\n", " if len(sline) != 5:\n", " _abort_msg(f\"Unusual UniProtKB ID line found:\\n'{line}'\")\n", " data[\"up_id\"] = sline[1]\n", " elif line.startswith(\"OX NCBI_TaxID=\"):\n", " # Following strictly the UniProtKB format: 'OX NCBI_TaxID=<ID>;'\n", " data[\"up_ncbi_taxid\"] = line[len(\"OX NCBI_TaxID=\") : 
-1]\n", " data[\"up_ncbi_taxid\"] = data[\"up_ncbi_taxid\"].split(\"{\")[0].strip()\n", " elif line.startswith(\"OS \"):\n", " if line[-1] == \".\":\n", " data[\"up_organism\"] += line[len(\"OS \") : -1]\n", " else:\n", " data[\"up_organism\"] += line[len(\"OS \") : -1] + \" \"\n", " elif line.startswith(\"SQ \"):\n", " sline = line.split()\n", " if len(sline) != 8:\n", " _abort_msg(f\"Unusual UniProtKB SQ line found:\\n'{line}'\")\n", " data[\"up_seqlen\"] = int(sline[2])\n", " data[\"up_crc64\"] = sline[6]\n", " elif line.startswith(\" \"):\n", " sline = line.split()\n", " if len(sline) > 6:\n", " _abort_msg(\n", " \"Unusual UniProtKB sequence data line \"\n", " + f\"found:\\n'{line}'\"\n", " )\n", " data[\"up_sequence\"] += \"\".join(sline)\n", " elif line.startswith(\"RP \"):\n", " if \"ISOFORM\" in line.upper():\n", " RuntimeError(\n", " f\"First ISOFORM found for '{up_ac}', needs \" + \"handling.\"\n", " )\n", " elif line.startswith(\"DT \"):\n", " # 2012-10-03\n", " dt_flds = line[len(\"DT \") :].split(\", \")\n", " if dt_flds[1].upper().startswith(\"SEQUENCE VERSION \"):\n", " data[\"up_last_mod\"] = datetime.datetime.strptime(\n", " dt_flds[0], \"%d-%b-%Y\"\n", " )\n", " elif line.startswith(\"GN Name=\"):\n", " data[\"up_gn\"] = line[len(\"GN Name=\") :].split(\";\")[0]\n", " data[\"up_gn\"] = data[\"up_gn\"].split(\"{\")[0].strip()\n", "\n", " # we have not seen isoforms in the data set, yet, so we just set them to '.'\n", " data[\"up_isoform\"] = None\n", "\n", " if \"up_gn\" not in data:\n", " _abort_msg(f\"No gene name found for UniProtKB entry '{up_ac}'.\")\n", " if \"up_last_mod\" not in data:\n", " _abort_msg(f\"No sequence version found for UniProtKB entry '{up_ac}'.\")\n", " if \"up_crc64\" not in data:\n", " _abort_msg(f\"No CRC64 value found for UniProtKB entry '{up_ac}'.\")\n", " if len(data[\"up_sequence\"]) == 0:\n", " _abort_msg(f\"No sequence found for UniProtKB entry '{up_ac}'.\")\n", " # check that sequence length and CRC64 is 
correct\n", " if data[\"up_seqlen\"] != len(data[\"up_sequence\"]):\n", " _abort_msg(\n", " \"Sequence length of SQ line and sequence data differ for \"\n", " + f\"UniProtKB entry '{up_ac}': {data['up_seqlen']} != \"\n", " + f\"{len(data['up_sequence'])}\"\n", " )\n", " _check_sequence(data[\"up_ac\"], data[\"up_sequence\"])\n", "\n", " if \"up_id\" not in data:\n", " _abort_msg(f\"No ID found for UniProtKB entry '{up_ac}'.\")\n", " if \"up_ncbi_taxid\" not in data:\n", " _abort_msg(f\"No NCBI taxonomy ID found for UniProtKB entry '{up_ac}'.\")\n", " if len(data[\"up_organism\"]) == 0:\n", " _abort_msg(f\"No organism species found for UniProtKB entry '{up_ac}'.\")\n", "\n", " return data\n", "\n", "def _get_upkb_for_sequence(sqe, up_ac):\n", " \"\"\"Get UniProtKB entry data for given sequence.\"\"\"\n", " up_data = _fetch_upkb_entry(up_ac)\n", " if sqe != up_data[\"up_sequence\"]:\n", " raise RuntimeError(\n", " f\"Sequences not equal from file: {sqe}, from UniProtKB: \"\n", " + f\"{up_data['up_sequence']}\"\n", " )\n", "\n", " return up_data" ] }, { "cell_type": "code", "execution_count": 2, "id": "37bece22", "metadata": {}, "outputs": [], "source": [ "def _get_ncbi_sequence(ncbi_ac):\n", " \"\"\"Fetch OST sequence object from NCBI web service.\"\"\"\n", " # src: https://www.ncbi.nlm.nih.gov/books/NBK25500/#_chapter1_Downloading_Full_Records_\n", " rspns = requests.get(f\"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/\" \\\n", " f\"efetch.fcgi?db=protein&id={ncbi_ac}\" \\\n", " f\"&rettype=fasta&retmode=text\")\n", " return io.SequenceFromString(rspns.text, \"fasta\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "3f83b12d", "metadata": {}, "outputs": [], "source": [ "# s = _get_ncbi_sequence(\"CAD2068351.1\")\n", "# up_data = _fetch_upkb_entry(\"A0A485PQD1\")\n", "# print(s.name, s, len(s), up_data[\"up_sequence\"] == str(s))\n", "# up_data" ] }, { "cell_type": "code", "execution_count": 5, "id": "ff132158", "metadata": {}, "outputs": [], "source": [ 
"# check USDA data\n", "metadata_file = \"./InputFiles/ASFV-G_proteome_accessions.csv\"\n", "pdb_dir = \"./InputFiles/AlphaFold-RENAME\"" ] }, { "cell_type": "code", "execution_count": 6, "id": "d8ca29cb", "metadata": {}, "outputs": [ { "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>Protein</th>\n", " <th>Associated PDB</th>\n", " <th>NCBI_Accession</th>\n", " <th>UniProt_ID</th>\n", " <th>_struct.title</th>\n", " <th>_struct.pdbx_model_detail</th>\n", " <th>ranking debugg model ID</th>\n", " <th>notes</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>285L</td>\n", " <td>285L.pdb</td>\n", " <td>CAD2068351.1</td>\n", " <td>A0A485PQD1</td>\n", " <td>ASFV-G 285L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_1_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>A104R</td>\n", " <td>A104R.pdb</td>\n", " <td>CAD2068395.1</td>\n", " <td>A0A0A1E0L7</td>\n", " <td>ASFV-G A104R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_2_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>A118R</td>\n", " <td>A118R.pdb</td>\n", " <td>CAD2068397.1</td>\n", " <td>A0A2X0RVA9</td>\n", " <td>ASFV-G A118R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_1_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>A137R</td>\n", " <td>A137R.pdb</td>\n", " <td>CAD2068404.1</td>\n", " <td>A0A2X0THQ0</td>\n", " <td>ASFV-G A137R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " 
<td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>A151R</td>\n", " <td>A151R.pdb</td>\n", " <td>CAD2068398.1</td>\n", " <td>A0A2X0TC55</td>\n", " <td>ASFV-G A151R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_4_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>192</th>\n", " <td>QP509L</td>\n", " <td>QP509L.pdb</td>\n", " <td>CAD2068484.1</td>\n", " <td>A0A2X0THX2</td>\n", " <td>ASFV-G QP509L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>193</th>\n", " <td>R298L</td>\n", " <td>R298L.pdb</td>\n", " <td>CAD2068482.1</td>\n", " <td>A0A2X0SE42</td>\n", " <td>ASFV-G R298L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>194</th>\n", " <td>S183L</td>\n", " <td>S183L.pdb</td>\n", " <td>CAD2068472.1</td>\n", " <td>A0A2X0SE34</td>\n", " <td>ASFV-G S183L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_4_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>195</th>\n", " <td>S273R</td>\n", " <td>S273R.pdb</td>\n", " <td>CAD2068473.1</td>\n", " <td>A0A2X0TKM5</td>\n", " <td>ASFV-G S273R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_2_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " <tr>\n", " <th>196</th>\n", " <td>X69R</td>\n", " <td>X69R.pdb</td>\n", " <td>CAD2068372.1</td>\n", " <td>A0A2X0TKC7</td>\n", " <td>ASFV-G X69R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_2_pred_0</td>\n", " <td>NaN</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>197 rows × 8 columns</p>\n", "</div>" ], "text/plain": [ " Protein Associated PDB NCBI_Accession 
UniProt_ID _struct.title \\\n", "0 285L 285L.pdb CAD2068351.1 A0A485PQD1 ASFV-G 285L \n", "1 A104R A104R.pdb CAD2068395.1 A0A0A1E0L7 ASFV-G A104R \n", "2 A118R A118R.pdb CAD2068397.1 A0A2X0RVA9 ASFV-G A118R \n", "3 A137R A137R.pdb CAD2068404.1 A0A2X0THQ0 ASFV-G A137R \n", "4 A151R A151R.pdb CAD2068398.1 A0A2X0TC55 ASFV-G A151R \n", ".. ... ... ... ... ... \n", "192 QP509L QP509L.pdb CAD2068484.1 A0A2X0THX2 ASFV-G QP509L \n", "193 R298L R298L.pdb CAD2068482.1 A0A2X0SE42 ASFV-G R298L \n", "194 S183L S183L.pdb CAD2068472.1 A0A2X0SE34 ASFV-G S183L \n", "195 S273R S273R.pdb CAD2068473.1 A0A2X0TKM5 ASFV-G S273R \n", "196 X69R X69R.pdb CAD2068372.1 A0A2X0TKC7 ASFV-G X69R \n", "\n", " _struct.pdbx_model_detail ranking debugg model ID notes \n", "0 This model was predicted using AlphaFold2 model_1_pred_0 NaN \n", "1 This model was predicted using AlphaFold2 model_2_pred_0 NaN \n", "2 This model was predicted using AlphaFold2 model_1_pred_0 NaN \n", "3 This model was predicted using AlphaFold2 model_3_pred_0 NaN \n", "4 This model was predicted using AlphaFold2 model_4_pred_0 NaN \n", ".. ... ... ... 
# %% Load the metadata table
metadata = pd.read_csv(metadata_file)
# both the protein name and the PDB file name must identify a row uniquely
assert len(set(metadata.Protein)) == metadata.shape[0], \
    "Duplicate entries in column 'Protein'"
assert len(set(metadata["Associated PDB"])) == metadata.shape[0], \
    "Duplicate entries in column 'Associated PDB'"
metadata

# %% Collect the PDB files
pdb_files = [f for f in sorted(os.listdir(pdb_dir)) if f.endswith(".pdb")]

# %%
# check names: (file name without extension, extension) per PDB file
pdb_file_split = [os.path.splitext(f) for f in pdb_files]


# %%
def _print_name_diff(pdb_names, metadata_names):
    """Print names found only among the PDB files / only in the metadata."""
    pdb_names = set(pdb_names)
    metadata_names = set(metadata_names)
    print("ONLY AS PDB:", sorted(pdb_names - metadata_names))
    print("ONLY IN METADATA:", sorted(metadata_names - pdb_names))


# %%
# CHECK THAT PDB FILES MATCH EXISTING ONES
# -> extra QP509L-unrelaxed expected
_print_name_diff(pdb_files, metadata["Associated PDB"])

# %%
# CHECK THAT FILE NAMES MATCH PROTEIN NAMES
# (must run before "Protein" is turned into the index below)
_print_name_diff((fs[0] for fs in pdb_file_split), metadata.Protein)

# %%
# can use either Protein or PDB name as index
# Guarded so that re-running this cell does not fail with a KeyError once
# "Protein" is the index already.
if metadata.index.name != "Protein":
    metadata = metadata.set_index("Protein")

# %%
# NOTE: the column name in the CSV has a trailing space
metadata[metadata["_struct.title "].isna()]

# %%
metadata[metadata["_struct.pdbx_model_detail"].isna()]
<td>model_4_pred_0</td>\n", " <td>protein p14 from the pp220 polyprotein encoded...</td>\n", " </tr>\n", " <tr>\n", " <th>CP2475L_p34</th>\n", " <td>CP2475L_p34.pdb</td>\n", " <td>CAD2068454.1</td>\n", " <td>A0A2X0THU5</td>\n", " <td>ASFV-G CP2475L p34</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_4_pred_0</td>\n", " <td>protein p34 from the pp220 polyprotein encoded...</td>\n", " </tr>\n", " <tr>\n", " <th>CP2475L_p37</th>\n", " <td>CP2475L_p37.pdb</td>\n", " <td>CAD2068454.1</td>\n", " <td>A0A2X0THU5</td>\n", " <td>ASFV-G CP2475L p37</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_4_pred_0</td>\n", " <td>protein p37 from the pp220 polyprotein encoded...</td>\n", " </tr>\n", " <tr>\n", " <th>CP2475L_p150</th>\n", " <td>CP2475L_p150.pdb</td>\n", " <td>CAD2068454.1</td>\n", " <td>A0A2X0THU5</td>\n", " <td>ASFV-G CP2475L p150</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_1_pred_0</td>\n", " <td>protein p150 from the pp220 polyprotein encode...</td>\n", " </tr>\n", " <tr>\n", " <th>CP2475L_p5</th>\n", " <td>CP2475L_p5.pdb</td>\n", " <td>CAD2068454.1</td>\n", " <td>A0A2X0THU5</td>\n", " <td>ASFV-G CP2475L p5</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>protein p5 from the pp220 polyprotein encoded ...</td>\n", " </tr>\n", " <tr>\n", " <th>D250R</th>\n", " <td>D250R.pdb</td>\n", " <td>CAD2068464.1</td>\n", " <td>A0A2X0THV3</td>\n", " <td>ASFV-G D250R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>Mislabled on NCBI and Uniport as D205R</td>\n", " </tr>\n", " <tr>\n", " <th>DP79L</th>\n", " <td>DP79L.pdb</td>\n", " <td>CAD2068466.1</td>\n", " <td>A0A0A1E158</td>\n", " <td>ASFV-G DP79L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>Mislabled on Uniprot as D79L</td>\n", " </tr>\n", " <tr>\n", " 
<th>hypothetical_01</th>\n", " <td>hypothetical_01.pdb</td>\n", " <td>CAD2068367.1</td>\n", " <td>A0A485PU43</td>\n", " <td>ASFV-G hypothetical_01</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_5_pred_0</td>\n", " <td>labeled as hypothetical on NCBI and Uniprot</td>\n", " </tr>\n", " <tr>\n", " <th>hypothetical_02</th>\n", " <td>hypothetical_02.pdb</td>\n", " <td>CAD2068400.1</td>\n", " <td>A0A485PQI3</td>\n", " <td>ASFV-G hypothetical_02</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>labeled as hypthetical on NCBI and Uniprot</td>\n", " </tr>\n", " <tr>\n", " <th>hypothetical_03</th>\n", " <td>hypothetical_03.pdb</td>\n", " <td>CAD2068512.1</td>\n", " <td>A0A485PZB7</td>\n", " <td>ASFV-G hypothetical_03</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>labeled as hypthetical on NCBI and Uniprot</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " Associated PDB NCBI_Accession UniProt_ID \\\n", "Protein \n", "CP2475L_p14 CP2475L_p14.pdb CAD2068454.1 A0A2X0THU5 \n", "CP2475L_p34 CP2475L_p34.pdb CAD2068454.1 A0A2X0THU5 \n", "CP2475L_p37 CP2475L_p37.pdb CAD2068454.1 A0A2X0THU5 \n", "CP2475L_p150 CP2475L_p150.pdb CAD2068454.1 A0A2X0THU5 \n", "CP2475L_p5 CP2475L_p5.pdb CAD2068454.1 A0A2X0THU5 \n", "D250R D250R.pdb CAD2068464.1 A0A2X0THV3 \n", "DP79L DP79L.pdb CAD2068466.1 A0A0A1E158 \n", "hypothetical_01 hypothetical_01.pdb CAD2068367.1 A0A485PU43 \n", "hypothetical_02 hypothetical_02.pdb CAD2068400.1 A0A485PQI3 \n", "hypothetical_03 hypothetical_03.pdb CAD2068512.1 A0A485PZB7 \n", "\n", " _struct.title \\\n", "Protein \n", "CP2475L_p14 ASFV-G CP2475L p14 \n", "CP2475L_p34 ASFV-G CP2475L p34 \n", "CP2475L_p37 ASFV-G CP2475L p37 \n", "CP2475L_p150 ASFV-G CP2475L p150 \n", "CP2475L_p5 ASFV-G CP2475L p5 \n", "D250R ASFV-G D250R \n", "DP79L ASFV-G DP79L \n", "hypothetical_01 ASFV-G hypothetical_01 \n", 
"hypothetical_02 ASFV-G hypothetical_02 \n", "hypothetical_03 ASFV-G hypothetical_03 \n", "\n", " _struct.pdbx_model_detail \\\n", "Protein \n", "CP2475L_p14 This model was predicted using AlphaFold2 \n", "CP2475L_p34 This model was predicted using AlphaFold2 \n", "CP2475L_p37 This model was predicted using AlphaFold2 \n", "CP2475L_p150 This model was predicted using AlphaFold2 \n", "CP2475L_p5 This model was predicted using AlphaFold2 \n", "D250R This model was predicted using AlphaFold2 \n", "DP79L This model was predicted using AlphaFold2 \n", "hypothetical_01 This model was predicted using AlphaFold2 \n", "hypothetical_02 This model was predicted using AlphaFold2 \n", "hypothetical_03 This model was predicted using AlphaFold2 \n", "\n", " ranking debugg model ID \\\n", "Protein \n", "CP2475L_p14 model_4_pred_0 \n", "CP2475L_p34 model_4_pred_0 \n", "CP2475L_p37 model_4_pred_0 \n", "CP2475L_p150 model_1_pred_0 \n", "CP2475L_p5 model_3_pred_0 \n", "D250R model_3_pred_0 \n", "DP79L model_3_pred_0 \n", "hypothetical_01 model_5_pred_0 \n", "hypothetical_02 model_3_pred_0 \n", "hypothetical_03 model_3_pred_0 \n", "\n", " notes \n", "Protein \n", "CP2475L_p14 protein p14 from the pp220 polyprotein encoded... \n", "CP2475L_p34 protein p34 from the pp220 polyprotein encoded... \n", "CP2475L_p37 protein p37 from the pp220 polyprotein encoded... \n", "CP2475L_p150 protein p150 from the pp220 polyprotein encode... \n", "CP2475L_p5 protein p5 from the pp220 polyprotein encoded ... 
# %% Rows that come with notes
metadata[~metadata.notes.isna()]


# %%
def _clean_notes(notes):
    """Return `notes` with the known typos of the metadata table fixed."""
    # order matters only in that both misspellings of UniProt are covered
    for typo, fixed in (
        ("hypthetical", "hypothetical"),
        ("Uniport", "UniProt"),
        ("Uniprot", "UniProt"),
        ("Mislabled", "mislabeled"),
    ):
        notes = notes.replace(typo, fixed)
    return notes


# %%
for mdl_notes in metadata[~metadata.notes.isna()].notes:
    mdl_notes = _clean_notes(mdl_notes)
    print(mdl_notes)

# %%
# single example of the clean-up (was a bare scratch assignment before)
mdl_notes = 'labeled as hypthetical on NCBI and Uniprot'
_clean_notes(mdl_notes)

# %%
# tst = metadata.loc["hypothetical_03"]
# s = _get_ncbi_sequence(tst.NCBI_Accession)
# up_data = _fetch_upkb_entry(tst.UniProt_ID)
# print(s.name, s, len(s), up_data["up_sequence"] == str(s))
# up_data

# checked all the one above manually and ok as stated (best to add to model_detail!)

# %%
# special one...
metadata.loc["QP509L"]

# %%
set(metadata["_struct.pdbx_model_detail"])

# %%
len(set(metadata["_struct.title "]))


# %%
def _get_model_num(mdl_id):
    """Extract the model number from an ID like 'model_1_pred_0'.

    :return: The number as int, or None if `mdl_id` is not a string made of
             four '_'-separated fields with an integer in second position.
    """
    if not isinstance(mdl_id, str):
        # e.g. NaN for rows without a model ID
        return None
    mdl_id_split = mdl_id.split('_')
    if len(mdl_id_split) != 4:
        return None
    try:
        return int(mdl_id_split[1])
    except ValueError:
        return None


# %%
# check model numbers: report IDs that cannot be parsed and numbers
# outside of 1..5
for idx, mdl_id in metadata["ranking debugg model ID"].items():
    mdl_num = _get_model_num(mdl_id)
    if mdl_num is None:
        print(idx, mdl_id)
    elif mdl_num not in range(1, 6):
        print(idx, mdl_id, mdl_num)
# %%
def _check_subset(s1, s2):
    """Check if s2 is uniquely contained in s1.

    :return: Tuple (seq_db_align_begin, seq_db_align_end) with the 1-based,
             inclusive position of s2 in s1, or None if s2 is empty, not
             found, or found more than once (overlapping matches count).
    """
    if not s2:
        return None
    first = s1.find(s2)
    if first < 0:
        return None
    # str.count() only counts non-overlapping matches, so look for a second
    # match starting right after the first one instead
    if s1.find(s2, first + 1) >= 0:
        return None
    return first + 1, first + len(s2)


# %%
def _report_pdb_mismatch(protein, ref_label, ref_seq, pdb_seq):
    """Print how the PDB sequence deviates from a reference sequence.

    Nothing is printed if both sequences are identical.
    """
    if ref_seq == pdb_seq:
        return
    align_range = _check_subset(ref_seq, pdb_seq)
    if align_range:
        print(protein, f"PDB seq. is subset of {ref_label}", align_range)
    else:
        print(
            protein, f"inconsistent {ref_label}/PDB sequences", ref_seq, pdb_seq
        )


# %%
# check shared ones
for protein, pdb_ext in sorted(pdb_file_split):
    if protein not in metadata.index:
        print("SKIPPING", protein)
        continue
    row = metadata.loc[protein]
    pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
    ent = io.LoadPDB(pdb_path)
    assert ent.chain_count == 1
    sqe = _get_sequence(ent.chains[0])
    s_ncbi = _get_ncbi_sequence(row.NCBI_Accession)
    up_data = _fetch_upkb_entry(row.UniProt_ID)
    if up_data["up_sequence"] != str(s_ncbi):
        print(
            protein,
            "inconsistent UP/NCBI sequences",
            up_data["up_sequence"],
            str(s_ncbi),
        )
    _report_pdb_mismatch(protein, "UP", up_data["up_sequence"], sqe)
    _report_pdb_mismatch(protein, "NCBI", str(s_ncbi), sqe)

# %%
# check QP509L (take pLDDT from unrelaxed)
ent_unr = io.LoadPDB(os.path.join(pdb_dir, "QP509L-unrelaxed.pdb"))
ent_rel = io.LoadPDB(os.path.join(pdb_dir, "QP509L.pdb"))
print(
    set(a.occupancy for a in ent_rel.atoms),
    set(a.b_factor for a in ent_rel.atoms),
)
ev_atoms = set(a.qualified_name for a in ent_rel.atoms)
eu_atoms = set(a.qualified_name for a in ent_unr.atoms)
print("IN UNRELAXED:", sorted(eu_atoms - ev_atoms))
print("IN RELAXED:", sorted(ev_atoms - eu_atoms))


# %%
def _get_ncbi_info(ncbi_ac):
    """Fetch dict with info from NCBI web service.

    Reads the ESummary document of `ncbi_ac` in the "protein" database and
    returns its top-level "Item" elements keyed by their "Name" attribute.
    Values are str or int according to the "Type" attribute, or None for
    items without content.

    :raises RuntimeError: If the reply does not hold exactly one DocSum or
                          an item has a type other than String/Integer.
    """
    # src: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESummary
    rspns = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
        f"esummary.fcgi?db=protein&id={ncbi_ac}",
        timeout=60,  # seconds; do not hang forever on a stalled connection
    )
    # fail on HTTP errors instead of parsing an error page
    rspns.raise_for_status()
    dom = xml.dom.minidom.parseString(rspns.text)
    docsums = dom.getElementsByTagName("DocSum")
    if len(docsums) != 1:
        raise RuntimeError(
            f"Expected exactly one DocSum for {ncbi_ac}, got {len(docsums)}"
        )
    ncbi_dict = {}
    for cn in docsums[0].childNodes:
        if cn.nodeName != "Item":
            continue
        cn_name = cn.getAttribute("Name")
        cn_type = cn.getAttribute("Type")
        if not cn.childNodes:
            # empty item, e.g. no replacement record
            ncbi_dict[cn_name] = None
            continue
        d = cn.childNodes[0].data
        if cn_type == "String":
            ncbi_dict[cn_name] = d
        elif cn_type == "Integer":
            ncbi_dict[cn_name] = int(d)
        else:
            raise RuntimeError(f"Unknown type {cn_type} for {ncbi_ac}")
    return ncbi_dict


# %%
# fetch some extra info from NCBI
for idx, row in metadata.iterrows():
    ncbi_info = _get_ncbi_info(row.NCBI_Accession)
    # Gi is some numerical sequence identifier used internally by NCBI
    metadata.loc[idx, "NCBI_Gi"] = str(ncbi_info["Gi"])
    # UpdateDate is to be stored as the version date in ModelCIF
    metadata.loc[idx, "NCBI_UpdateDate"] = ncbi_info["UpdateDate"]
    # TaxId should be same as one from UP
    metadata.loc[idx, "NCBI_TaxId"] = str(ncbi_info["TaxId"])
    # Status expected to be live
    if ncbi_info["Status"] != "live":
        print(idx, row.NCBI_Accession, "Status", ncbi_info["Status"])
    # ReplacedBy expected to be empty
    if ncbi_info["ReplacedBy"]:
        print(idx, row.NCBI_Accession, "ReplacedBy", ncbi_info["ReplacedBy"])
    # AccessionVersion expected to be NCBI_Accession
    if ncbi_info["AccessionVersion"] != row.NCBI_Accession:
        print(
            idx,
            row.NCBI_Accession,
            "AccessionVersion",
            ncbi_info["AccessionVersion"],
        )
<th>_struct.pdbx_model_detail</th>\n", " <th>ranking debugg model ID</th>\n", " <th>notes</th>\n", " <th>NCBI_Gi</th>\n", " <th>NCBI_UpdateDate</th>\n", " <th>NCBI_TaxId</th>\n", " </tr>\n", " <tr>\n", " <th>Protein</th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " <th></th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>285L</th>\n", " <td>285L.pdb</td>\n", " <td>CAD2068351.1</td>\n", " <td>A0A485PQD1</td>\n", " <td>ASFV-G 285L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_1_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136876</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>A104R</th>\n", " <td>A104R.pdb</td>\n", " <td>CAD2068395.1</td>\n", " <td>A0A0A1E0L7</td>\n", " <td>ASFV-G A104R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_2_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136920</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>A118R</th>\n", " <td>A118R.pdb</td>\n", " <td>CAD2068397.1</td>\n", " <td>A0A2X0RVA9</td>\n", " <td>ASFV-G A118R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_1_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136922</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>A137R</th>\n", " <td>A137R.pdb</td>\n", " <td>CAD2068404.1</td>\n", " <td>A0A2X0THQ0</td>\n", " <td>ASFV-G A137R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136929</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>A151R</th>\n", " <td>A151R.pdb</td>\n", " <td>CAD2068398.1</td>\n", " <td>A0A2X0TC55</td>\n", " <td>ASFV-G A151R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_4_pred_0</td>\n", " <td>NaN</td>\n", " 
<td>1886136923</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>...</th>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " <td>...</td>\n", " </tr>\n", " <tr>\n", " <th>QP509L</th>\n", " <td>QP509L.pdb</td>\n", " <td>CAD2068484.1</td>\n", " <td>A0A2X0THX2</td>\n", " <td>ASFV-G QP509L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>1886137009</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>R298L</th>\n", " <td>R298L.pdb</td>\n", " <td>CAD2068482.1</td>\n", " <td>A0A2X0SE42</td>\n", " <td>ASFV-G R298L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_3_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886137007</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>S183L</th>\n", " <td>S183L.pdb</td>\n", " <td>CAD2068472.1</td>\n", " <td>A0A2X0SE34</td>\n", " <td>ASFV-G S183L</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_4_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136997</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>S273R</th>\n", " <td>S273R.pdb</td>\n", " <td>CAD2068473.1</td>\n", " <td>A0A2X0TKM5</td>\n", " <td>ASFV-G S273R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_2_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136998</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " <tr>\n", " <th>X69R</th>\n", " <td>X69R.pdb</td>\n", " <td>CAD2068372.1</td>\n", " <td>A0A2X0TKC7</td>\n", " <td>ASFV-G X69R</td>\n", " <td>This model was predicted using AlphaFold2</td>\n", " <td>model_2_pred_0</td>\n", " <td>NaN</td>\n", " <td>1886136897</td>\n", " <td>2020/08/05</td>\n", " <td>10497</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "<p>197 rows × 
10 columns</p>\n", "</div>" ], "text/plain": [ " Associated PDB NCBI_Accession UniProt_ID _struct.title \\\n", "Protein \n", "285L 285L.pdb CAD2068351.1 A0A485PQD1 ASFV-G 285L \n", "A104R A104R.pdb CAD2068395.1 A0A0A1E0L7 ASFV-G A104R \n", "A118R A118R.pdb CAD2068397.1 A0A2X0RVA9 ASFV-G A118R \n", "A137R A137R.pdb CAD2068404.1 A0A2X0THQ0 ASFV-G A137R \n", "A151R A151R.pdb CAD2068398.1 A0A2X0TC55 ASFV-G A151R \n", "... ... ... ... ... \n", "QP509L QP509L.pdb CAD2068484.1 A0A2X0THX2 ASFV-G QP509L \n", "R298L R298L.pdb CAD2068482.1 A0A2X0SE42 ASFV-G R298L \n", "S183L S183L.pdb CAD2068472.1 A0A2X0SE34 ASFV-G S183L \n", "S273R S273R.pdb CAD2068473.1 A0A2X0TKM5 ASFV-G S273R \n", "X69R X69R.pdb CAD2068372.1 A0A2X0TKC7 ASFV-G X69R \n", "\n", " _struct.pdbx_model_detail ranking debugg model ID \\\n", "Protein \n", "285L This model was predicted using AlphaFold2 model_1_pred_0 \n", "A104R This model was predicted using AlphaFold2 model_2_pred_0 \n", "A118R This model was predicted using AlphaFold2 model_1_pred_0 \n", "A137R This model was predicted using AlphaFold2 model_3_pred_0 \n", "A151R This model was predicted using AlphaFold2 model_4_pred_0 \n", "... ... ... \n", "QP509L This model was predicted using AlphaFold2 NaN \n", "R298L This model was predicted using AlphaFold2 model_3_pred_0 \n", "S183L This model was predicted using AlphaFold2 model_4_pred_0 \n", "S273R This model was predicted using AlphaFold2 model_2_pred_0 \n", "X69R This model was predicted using AlphaFold2 model_2_pred_0 \n", "\n", " notes NCBI_Gi NCBI_UpdateDate NCBI_TaxId \n", "Protein \n", "285L NaN 1886136876 2020/08/05 10497 \n", "A104R NaN 1886136920 2020/08/05 10497 \n", "A118R NaN 1886136922 2020/08/05 10497 \n", "A137R NaN 1886136929 2020/08/05 10497 \n", "A151R NaN 1886136923 2020/08/05 10497 \n", "... ... ... ... ... 
# all matching tax IDs?
# NCBI taxonomy ID expected for every entry (the only value found in
# metadata.NCBI_TaxId).
EXPECTED_TAXID = "10497"
for protein, upac in metadata.UniProt_ID.items():
    up_info = _fetch_upkb_entry(upac)
    # "up_ncbi_taxid" is only set if the UniProtKB entry has an OX line; use
    # .get() so an entry without one is reported like a mismatch instead of
    # aborting the whole check with a KeyError
    if up_info.get("up_ncbi_taxid") != EXPECTED_TAXID:
        print(protein, up_info)

# check PDB files
import ost
from ost import testutils, conop
# setup conop
testutils.SetDefaultCompoundLib()
io.profiles['DEFAULT'].processor = conop.RuleBasedProcessor(conop.GetDefaultLib())
# check processing
ost.PushVerbosityLevel(2)
try:
    for protein, pdb_ext in sorted(pdb_file_split):
        pdb_path = os.path.join(pdb_dir, protein + pdb_ext)
        ent = io.LoadPDB(pdb_path)
finally:
    # always restore the previous verbosity, even if loading a file fails
    ost.PopVerbosityLevel()
# NOTE: lack of output means that all atom names are ok
# check files that were converted (input is std out from run cut to only
# include "translating...")
with open("./script_out.txt") as log_file:
    # context manager makes sure the file handle is closed again
    log_lines = log_file.readlines()

# Sub-steps logged for every translated model, in the order they must appear
# after the "translating ..." line:
# (expected line prefix, expected line suffix, key used in `timings`)
TIMED_STEPS = [
    ("preparing data", "s)", "t_prep"),
    ("generating ModelCIF objects", "s)", "t_cif"),
    ("processing QA scores", "s)", "t_qa"),
    ("write to disk", "s)", "t_write"),
    ("... done with", "s).", "t_all"),
]

# plenty of assertions in here should also catch any errors...
# NOTE(review): the first and the last line of the log are never looked at
# (idx starts at 1, loop stops before the last line) - presumably a header and
# a footer line; confirm against the script output.
idx = 1
timings = dict()
while idx < len(log_lines) - 1:
    line = log_lines[idx].strip()
    if "already done..." in line:
        idx += 1
        continue
    assert line.startswith("translating"), f"line {idx}: {line!r}"
    # cut the "translating" prefix and the last 3 characters of the line
    mdl_title = line[len("translating"):-3].strip()
    if mdl_title in timings:
        print("WEIRD", line)
    mdl_timings = {}
    for offset, (prefix, suffix, key) in enumerate(TIMED_STEPS, start=1):
        line = log_lines[idx + offset].strip()
        assert line.startswith(prefix), f"line {idx + offset}: {line!r}"
        assert line.endswith(suffix), f"line {idx + offset}: {line!r}"
        # last token is the time wrapped as "(<seconds>" + suffix; drop the
        # leading "(" and the suffix to get the number
        mdl_timings[key] = float(line.split()[-1][1:-len(suffix)])
    timings[mdl_title] = mdl_timings
    # skip the "translating" line plus all its sub-step lines
    idx += len(TIMED_STEPS) + 1

# every model in the log must be known; report the ones without log entry
print(f"DONE {len(timings)} models")
mdl_titles = set(metadata.index)
unknown_ones = set(timings) - mdl_titles
assert len(unknown_ones) == 0, f"models not in metadata: {sorted(unknown_ones)}"
missing_ones = mdl_titles - set(timings)
print(f"MISSING {len(missing_ones)} models")

# total time and share of each sub-step summed over all models
k_parts = ["t_prep", "t_cif", "t_qa", "t_write"]
totals = {
    k: sum(v[k] for v in timings.values())
    for k in k_parts + ["t_all"]
}
print(f"TOTAL TIME SPENT: {round(totals['t_all'], 2)} s")
# percentages are undefined without any measured time (e.g. empty log)
if totals["t_all"] > 0:
    for k in k_parts:
        print(f"-> {k}: {round(100 * totals[k] / totals['t_all'], 1)}%")
"language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }