chore: started cnn notebook

035f5c9a · TheRiPtide · 93f1e520 · 035f5c9a
Commit 035f5c9a authored 3 years ago by TheRiPtide
--- a/notebooks/internal_priming.ipynb
+++ b/notebooks/internal_priming.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "# Issue 21: Inferring the code of internal priming by deep learning\n",
+    "\n",
+    "In real data sets we would like to distinguish poly(A) sites from internal priming sites. To do this, we want to construct a classifier that uses the sequence flanking the sites. As a deep learning architecture we can use a convolutional neural network (e.g. a NumPy implementation such as [numpycnn](https://pypi.org/project/numpycnn/)).\n",
+    "\n",
+    "Reference: https://www.analyticsvidhya.com/blog/2019/10/building-image-classification-models-cnn-pytorch/\n",
+    "\n",
+    "Input: sequences of bona fide and internally-primed poly(A) sites (#16)\n",
+    "\n",
+    "Output: classifier based on the nucleotide sequence around the sites"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# importing the libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "\n",
+    "# for creating validation set\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# for evaluating the model\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "# PyTorch libraries and modules\n",
+    "import torch\n",
+    "from torch.autograd import Variable\n",
+    "from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout\n",
+    "from torch.optim import Adam, SGD\n",
+    "\n",
+    "\n",
+    "# adding the nn\n",
+    "class Net(Module):\n",
+    "    def __init__(self):\n",
+    "        super(Net, self).__init__()\n",
+    "\n",
+    "        self.cnn_layers = Sequential(\n",
+    "            # Defining a 2D convolution layer\n",
+    "            Conv2d(1, 4, kernel_size=3, stride=1, padding=1),\n",
+    "            BatchNorm2d(4),\n",
+    "            ReLU(inplace=True),\n",
+    "            MaxPool2d(kernel_size=2, stride=2),\n",
+    "            # Defining another 2D convolution layer\n",
+    "            Conv2d(4, 4, kernel_size=3, stride=1, padding=1),\n",
+    "            BatchNorm2d(4),\n",
+    "            ReLU(inplace=True),\n",
+    "            MaxPool2d(kernel_size=2, stride=2),\n",
+    "        )\n",
+    "\n",
+    "        self.linear_layers = Sequential(\n",
+    "            Linear(4 * 7 * 7, 10)\n",
+    "        )\n",
+    "\n",
+    "    # Defining the forward pass\n",
+    "    def forward(self, x):\n",
+    "        x = self.cnn_layers(x)\n",
+    "        x = x.view(x.size(0), -1)\n",
+    "        x = self.linear_layers(x)\n",
+    "        return x"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Load data"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# TODO: Get test data from issues 25 and 26\n",
+    "train_x = []\n",
+    "train_y = []\n",
+    "test_x = []\n",
+    "test_y = []\n",
+    "\n",
+    "train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1)\n",
+    "\n",
+    "# TODO: reshape shape from [n, l] to [n, 1, l]\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "## Model, optimizer and loss function definition"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   }
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "outputs": [],
+   "source": [
+    "# defining the model\n",
+    "model = Net()\n",
+    "# defining the optimizer\n",
+    "optimizer = Adam(model.parameters(), lr=0.07)\n",
+    "# defining the loss function\n",
+    "criterion = CrossEntropyLoss()\n",
+    "# checking if GPU is available\n",
+    "if torch.cuda.is_available():\n",
+    "    model = model.cuda()\n",
+    "    criterion = criterion.cuda()\n",
+    "\n"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   }
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
\ No newline at end of file
+%% Cell type:markdown id: tags:
+# Issue 21: Inferring the code of internal priming by deep learning
+In real data sets we would like to distinguish poly(A) sites from internal priming sites. To do this, we want to construct a classifier that uses the sequence flanking the sites. As a deep learning architecture we can use a convolutional neural network (e.g. a NumPy implementation such as [numpycnn](https://pypi.org/project/numpycnn/)).
+Reference: https://www.analyticsvidhya.com/blog/2019/10/building-image-classification-models-cnn-pytorch/
+Input: sequences of bona fide and internally-primed poly(A) sites (#16)
+Output: classifier based on the nucleotide sequence around the sites
+%% Cell type:code id: tags:
+``` python
+# importing the libraries
+import pandas as pd
+import numpy as np
+# for creating validation set
+from sklearn.model_selection import train_test_split
+# for evaluating the model
+from sklearn.metrics import accuracy_score
+from tqdm import tqdm
+# PyTorch libraries and modules
+import torch
+from torch.autograd import Variable
+from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout
+from torch.optim import Adam, SGD
+# adding the nn
+class Net(Module):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.cnn_layers = Sequential(
+            # Defining a 2D convolution layer
+            Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
+            BatchNorm2d(4),
+            ReLU(inplace=True),
+            MaxPool2d(kernel_size=2, stride=2),
+            # Defining another 2D convolution layer
+            Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
+            BatchNorm2d(4),
+            ReLU(inplace=True),
+            MaxPool2d(kernel_size=2, stride=2),
+        )
+        self.linear_layers = Sequential(
+            Linear(4 * 7 * 7, 10)
+        )
+    # Defining the forward pass
+    def forward(self, x):
+        x = self.cnn_layers(x)
+        x = x.view(x.size(0), -1)
+        x = self.linear_layers(x)
+        return x
+```
+%% Cell type:markdown id: tags:
+## Load data
+%% Cell type:code id: tags:
+``` python
+# TODO: Get test data from issues 25 and 26
+train_x = []
+train_y = []
+test_x = []
+test_y = []
+train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size = 0.1)
+# TODO: reshape shape from [n, l] to [n, 1, l]
+```
+%% Cell type:markdown id: tags:
+## Model, optimizer and loss function definition
+%% Cell type:code id: tags:
+``` python
+# defining the model
+model = Net()
+# defining the optimizer
+optimizer = Adam(model.parameters(), lr=0.07)
+# defining the loss function
+criterion = CrossEntropyLoss()
+# checking if GPU is available
+if torch.cuda.is_available():
+    model = model.cuda()
+    criterion = criterion.cuda()
+```