Commit c1c0b313 authored by TheRiPtide

feat: Better visualizability of the first convolutional layer with one-hot encoding

parent 465fcb01
Merge request !23: feat: deep-learning poly(A) classifier
%% Cell type:markdown id: tags:

# Issue 21: Inferring the code of internal priming by deep learning

In real data sets we would like to distinguish poly(A) sites from internal priming sites. To do this, we construct a classifier that uses the sequence flanking the sites. As a deep learning architecture we can use a convolutional neural network (see e.g. the NumPy implementation at https://pypi.org/project/numpycnn/).

Reference: https://www.analyticsvidhya.com/blog/2019/10/building-image-classification-models-cnn-pytorch/

Input: sequences of bona fide and internally primed poly(A) sites (#16)

Output: classifier based on the nucleotide sequence around the sites
%% Cell type:code id: tags:

``` python
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# for creating the validation set
from sklearn.model_selection import train_test_split

# for evaluating the model
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# PyTorch libraries and modules
import torch
from torch.autograd import Variable  # legacy wrapper; a no-op on modern PyTorch tensors
from torch.nn import Linear, ReLU, CrossEntropyLoss, Sequential, MaxPool1d, Module, Softmax, BatchNorm1d, Dropout, Conv1d
from torch.optim import Adam, SGD
from torchsummary import summary


# defining the CNN
class Net(Module):
    def __init__(self):
        super(Net, self).__init__()

        self.cnn_layers = Sequential(
            # first 1D convolution layer: 4 one-hot input channels -> 4 feature maps
            Conv1d(4, 4, kernel_size=3, stride=1, padding=1),
            BatchNorm1d(4),
            ReLU(inplace=True),
            MaxPool1d(kernel_size=2, stride=2),
            # second 1D convolution layer
            Conv1d(4, 4, kernel_size=3, stride=1, padding=1),
            BatchNorm1d(4),
            ReLU(inplace=True),
            MaxPool1d(kernel_size=2, stride=2),
        )

        self.linear_layers = Sequential(
            # the two pooling layers halve the sequence length twice (200 -> 100 -> 50),
            # so the flattened input is 4 channels * 50 positions
            Linear(4 * 50, 10)
        )

    # defining the forward pass
    def forward(self, x):
        x = self.cnn_layers(x)
        x = x.view(x.size(0), -1)
        x = self.linear_layers(x)
        return x


# defining the training function
def train():
    model.train()

    # getting the training and validation sets
    x_train, y_train = Variable(train_x), Variable(train_y)
    x_val, y_val = Variable(val_x), Variable(val_y)

    # moving the data to the GPU, if available
    if torch.cuda.is_available():
        x_train = x_train.cuda()
        y_train = y_train.cuda()
        x_val = x_val.cuda()
        y_val = y_val.cuda()

    # clearing the gradients of the model parameters
    optimizer.zero_grad()

    # predictions for the training and validation sets
    output_train = model(x_train)
    output_val = model(x_val)

    # computing the training and validation loss
    loss_train = criterion(output_train, y_train)
    loss_val = criterion(output_val, y_val)

    # backpropagating and updating the model parameters
    loss_train.backward()
    optimizer.step()

    return loss_train, loss_val
```
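%% Cell type:markdown id: tags:

As a quick sanity check of the layer dimensions (a minimal sketch; the dummy batch of two random inputs is illustrative only): each convolution preserves the sequence length (`kernel_size=3`, `padding=1`) and each pooling layer halves it, so a 200-nt window shrinks to 50 positions and the flattened feature vector has `4 * 50 = 200` entries, matching the linear layer.

%% Cell type:code id: tags:

``` python
# forward a random dummy batch through the untrained network to confirm shapes
net = Net()
dummy = torch.randn(2, 4, 200)  # [batch, one-hot channels, positions]
print(net(dummy).shape)         # expected: torch.Size([2, 10])
```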
%% Cell type:markdown id: tags:

## Load data

%% Cell type:code id: tags:
``` python
# one-hot encoding of the four nucleotides
enum = {
    'A': [1, 0, 0, 0],
    'U': [0, 1, 0, 0],
    'G': [0, 0, 1, 0],
    'C': [0, 0, 0, 1]
}

# TODO: Get test data from issues 25 and 26
internal_ends = []
real_ends = []

with open('../inputs/mock_internal_ends.txt', 'r') as file:
    while line := file.readline().rstrip():
        internal_ends.append(list(line))

with open('../inputs/mock_real_ends.txt', 'r') as file:
    while line := file.readline().rstrip():
        real_ends.append(list(line))

# label 1 for bona fide poly(A) sites, 0 for internal priming sites
y_real = [1] * len(real_ends)
y_internal = [0] * len(internal_ends)

all_ends = real_ends + internal_ends
train_y = y_real + y_internal
train_x = [None] * len(all_ends)

# encode every sequence as an [l, 4] list of one-hot vectors
for i in tqdm(range(len(all_ends))):
    train_x[i] = [enum[key] for key in all_ends[i]]

train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.1)
```
%% Output

100%|██████████| 20000/20000 [00:00<00:00, 23948.83it/s]
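%% Cell type:markdown id: tags:

As a quick illustration of the encoding (the `'GAUC'` fragment is arbitrary): each nucleotide becomes one row of a one-hot matrix, so a sequence of length l is encoded as an l x 4 array with the channels last.

%% Cell type:code id: tags:

``` python
# illustrative only: one-hot encode a short fragment with the enum above
example = np.array([enum[nt] for nt in 'GAUC'])
print(example)        # rows are positions, columns are the A/U/G/C channels
print(example.shape)  # (4, 4)
```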
%% Cell type:code id: tags:

``` python
train_x = np.array(train_x, dtype=np.float32)
train_y = np.array(train_y, dtype=np.int64)
val_x = np.array(val_x, dtype=np.float32)
val_y = np.array(val_y, dtype=np.int64)

# move the one-hot channel axis in front of the sequence axis, [n, l, 4] -> [n, 4, l],
# as Conv1d expects (transpose, not reshape, so positions and channels stay aligned)
train_x = train_x.transpose(0, 2, 1)
val_x = val_x.transpose(0, 2, 1)

train_x = torch.from_numpy(train_x)
train_y = torch.from_numpy(train_y)
val_x = torch.from_numpy(val_x)
val_y = torch.from_numpy(val_y)
```
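%% Cell type:markdown id: tags:

Why `transpose` rather than `reshape` in the cell above: `reshape` only reinterprets the flat memory order, so reading an `[n, l, 4]` array as `[n, 4, l]` would scramble positions and one-hot bits instead of moving the channel axis. A tiny demo (the 3-nt sequence is illustrative only):

%% Cell type:code id: tags:

``` python
demo = np.array([[enum[nt] for nt in 'GAU']])  # shape [1, 3, 4]: one sequence, channels last
print(demo.reshape(1, 4, 3)[0])    # rows mix positions and one-hot bits
print(demo.transpose(0, 2, 1)[0])  # rows are the A/U/G/C channels, columns the positions
```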
%% Cell type:markdown id: tags:

## Model instantiation and loss function definition

%% Cell type:code id: tags:
``` python
# defining the model
model = Net()
# defining the optimizer
optimizer = Adam(model.parameters(), lr=0.07)
# defining the loss function
criterion = CrossEntropyLoss()

# moving the model to the GPU, if available
if torch.cuda.is_available():
    model = model.cuda()
    criterion = criterion.cuda()

# defining the number of epochs
n_epochs = 25
# lists to store the training and validation losses
train_losses = []
val_losses = []

# training the model
for epoch in tqdm(range(n_epochs)):
    train_loss, val_loss = train()
    train_losses.append(train_loss)
    val_losses.append(val_loss)
```
%% Output

100%|██████████| 25/25 [00:19<00:00, 1.25it/s]
%% Cell type:markdown id: tags:

## Model Performance

%% Cell type:code id: tags:
``` python
train_losses_list = [train_loss.item() for train_loss in train_losses]
val_losses_list = [val_loss.item() for val_loss in val_losses]

# plotting the training and validation loss
plt.plot(train_losses_list, label='Training loss')
plt.plot(val_losses_list, label='Validation loss')
plt.legend()
plt.show()
```
%% Output
%% Cell type:code id: tags:

``` python
# ensure model and data are on the same device for inference
model = model.cpu()

# predictions for the training set
with torch.no_grad():
    output = model(train_x.cpu())

# softmax turns the logits into class probabilities
softmax = torch.softmax(output, dim=1).cpu()
prob = list(softmax.numpy())
predictions = np.argmax(prob, axis=1)

# accuracy on the training set
print(accuracy_score(train_y, predictions))

# predictions for the validation set
with torch.no_grad():
    output = model(val_x.cpu())

softmax = torch.softmax(output, dim=1).cpu()
prob = list(softmax.numpy())
predictions = np.argmax(prob, axis=1)

# accuracy on the validation set
print(accuracy_score(val_y, predictions))
```
%% Output
0.9966111111111111
0.998
%% Cell type:markdown id: tags:

## Save Model

%% Cell type:code id: tags:
``` python
# saving the trained weights, then printing them for inspection
torch.save(model.state_dict(), '../models/internal_priming.pth')

for name, param in model.named_parameters():
    if param.requires_grad:
        print(name, param.data)
```
%% Output
cnn_layers.0.weight tensor([[[-0.0161, -0.1727, -0.0706],
[-0.1442, -0.2347, -0.1731],
[ 0.0937, -0.2319, 0.0297],
[ 0.4707, 1.2928, -0.7642]],
[[ 0.6057, 0.1347, -0.0041],
[ 0.0967, -0.9823, -0.6446],
[ 0.5267, 0.5432, 0.1329],
[-0.0482, -0.0046, -0.0177]],
[[-0.3372, -0.1560, -0.0196],
[ 0.1110, -0.3352, -0.1544],
[-0.4499, -0.2293, 0.0253],
[ 0.9977, -0.9843, -1.0379]],
[[-0.1263, -0.5871, -0.0511],
[-0.1365, -0.1569, 0.0979],
[-0.3475, -0.3901, 0.0927],
[-0.6534, -0.9329, 0.6516]]])
cnn_layers.0.bias tensor([-0.0021, -0.1551, -1.0721, 0.4632])
cnn_layers.1.weight tensor([1.8590, 0.1158, 1.1549, 0.8114])
cnn_layers.1.bias tensor([ 0.7813, -0.7185, 0.1945, 0.2015])
cnn_layers.4.weight tensor([[[-1.0589, 0.3008, -1.0521],
[ 0.2243, 0.5245, 0.1523],
[ 0.0767, 0.6713, -0.4829],
[-0.6312, -0.4684, -0.3525]],
[[ 0.6076, 0.0118, 0.3328],
[-0.6541, 0.2015, 0.1579],
[-0.8182, 0.1377, -0.8822],
[ 0.5961, -0.2152, 0.7089]],
[[ 0.5840, 0.3963, -0.3982],
[ 0.4481, 0.1088, 0.2149],
[ 0.4938, 0.3682, 0.5467],
[-0.1666, 0.2545, 0.4419]],
[[ 0.2782, -0.2773, -0.6268],
[ 0.1686, 0.1611, -0.3611],
[-0.9431, -0.2470, -0.1781],
[-0.2127, 0.1223, -0.0467]]])
cnn_layers.4.bias tensor([ 0.0243, 0.1496, -0.2523, -0.1505])
cnn_layers.5.weight tensor([0.9917, 1.0135, 0.2734, 0.0942])
cnn_layers.5.bias tensor([-0.2346, -0.1730, -0.6458, -0.8736])
linear_layers.0.weight tensor([[ 0.2819, 0.3333, 0.3363, ..., 0.3874, 0.3519, 0.2827],
[ 0.3381, 0.3392, 0.2918, ..., 0.3641, 0.2983, 0.3425],
[-0.3373, -0.3675, -0.4146, ..., -0.3503, -0.4156, -0.3663],
...,
[-0.3867, -0.3346, -0.3592, ..., -0.4135, -0.3362, -0.3592],
[-0.3415, -0.3677, -0.3740, ..., -0.4074, -0.3575, -0.3526],
[-0.4087, -0.3892, -0.3258, ..., -0.3189, -0.4211, -0.3985]])
linear_layers.0.bias tensor([ 0.1986, 0.3250, -0.4212, -0.3442, -0.3814, -0.3203, -0.3380, -0.4000,
-0.3805, -0.4522])
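%% Cell type:markdown id: tags:

With one-hot input channels, the first convolutional layer becomes directly interpretable: each 4 x 3 kernel can be read like a small position-weight matrix over A/U/G/C. Below is a minimal sketch of such a visualization (the heatmap layout and styling are illustrative assumptions, not part of the original notebook):

%% Cell type:code id: tags:

``` python
# visualize the first Conv1d layer's kernels as nucleotide-by-position heatmaps
weights = model.cnn_layers[0].weight.detach().cpu().numpy()  # shape [4 filters, 4 channels, 3 positions]

fig, axes = plt.subplots(1, weights.shape[0], figsize=(10, 3))
for i, ax in enumerate(axes):
    ax.imshow(weights[i], cmap='bwr')
    ax.set_title(f'filter {i}')
    ax.set_yticks(range(4))
    ax.set_yticklabels(['A', 'U', 'G', 'C'])
    ax.set_xticks(range(weights.shape[2]))
plt.tight_layout()
plt.show()
```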
The commit applies the same one-hot changes to the classifier module:

``` diff
@@ -15,7 +15,7 @@ class Net(Module):
         self.cnn_layers = Sequential(
             # Defining a 1D convolution layer
-            Conv1d(1, 4, kernel_size=3, stride=1, padding=1),
+            Conv1d(4, 4, kernel_size=3, stride=1, padding=1),
             BatchNorm1d(4),
             ReLU(inplace=True),
             MaxPool1d(kernel_size=2, stride=2),
@@ -42,11 +42,11 @@ class PolyAClassifier:
     """Classifier object using the state-dict of a pretrained pytorch model."""

     enum = {
-        'A': 0.0,
-        'U': 1 / 3,
-        'T': 1 / 3,
-        'G': 2 / 3,
-        'C': 1.0
+        'A': [1, 0, 0, 0],
+        'U': [0, 1, 0, 0],
+        'T': [0, 1, 0, 0],
+        'G': [0, 0, 1, 0],
+        'C': [0, 0, 0, 1]
     }

     def __init__(self, model=Net, state_dict_path: str = './models/internal_priming.pth'):
@@ -103,7 +103,7 @@ class PolyAClassifier:
             raise ValueError('Not all sequences of length 200')

         test_shape = test.shape
-        test = test.reshape(test_shape[0], 1, test_shape[1])
+        test = test.reshape(test_shape[0], 4, test_shape[1])

         if test_shape[1] != 200:
             raise ValueError('Sequences not of length 200')
```