diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000000000000000000000000000000..94a25f7f4cb416c083d265558da75d457237d671 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project version="4"> + <component name="VcsDirectoryMappings"> + <mapping directory="$PROJECT_DIR$" vcs="Git" /> + </component> +</project> \ No newline at end of file diff --git a/README.rst b/README.rst index 1bb9012bc5a1ea99b6ab5d07203e432871f7f57f..b8df6a1a89fff9f9a9705e41ba3a458f70f28a20 100644 --- a/README.rst +++ b/README.rst @@ -18,7 +18,12 @@ pathways :alt: Updates -Pathways allows you to compute importance scores for a a list of metabolic pathways. +Pathways computes importance scores for a a list of metabolic pathways. + +Pathways is an adaptation of `Verbeke et al.'s`_ approach that ranks pathways by integrating multiple types of +data (gene expression, mutation, methylation and copy number data) together with a gene interaction network. Pathways is +focused on uniquely using mutation data to rank the importance of a set of genes (i.e pathways) for a group of +samples(i.e genomes). * Free software: MIT license @@ -27,14 +32,26 @@ Pathways allows you to compute importance scores for a a list of metabolic pathw Features -------- +* TODO + +Installation +------------ + +To install Pathways, + +* TODO + +Documentation +------------- * TODO Credits ---------- +------- This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template. +.. _`Verbeke et al.'s`: http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0133503 .. _Cookiecutter: https://github.com/audreyr/cookiecutter .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage diff --git a/docs/conf.py b/docs/conf.py index bd51940a8fe4c355d65d1dbcb364cf3e56eacf5a..b7b9d6080ce602866e960017018cecefff08b20b 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,7 +40,19 @@ import pathways # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode'] +extensions = ['sphinx.ext.autodoc', 'sphinx.ext.viewcode','sphinx.ext.napoleon', + 'sphinx.ext.intersphinx'] + +# Add Intersphinx links and their targets. +# Locations and names of other projects that should be linked to in this documentation. +# A dictionary mapping unique identifiers to a tuple (target, inventory). Each target is the +# base URI of a foreign Sphinx documentation set and can be a local path or an HTTP URI. +# The inventory indicates where the inventory file can be found: it can be None (at the same +# location as the base URI) or another local or HTTP URI. +# The unique identifier can be used to prefix cross-reference targets, so that it is clear which +# intersphinx set the target belongs to. A link like :ref:`comparison manual <python:comparisons>` +# will link to the label “comparisons” in the doc set “python”, if it exists. +# intersphinx_mapping = {'igraph': ('http://igraph.org/python/doc/igraph-module', None)} # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/pathways/__init__.py b/pathways/__init__.py index c78865fb8d875fc49c0abc1964e17b37827472d2..f53e4e5bc08b1b22c1a04f86ea0d206d63c2419c 100644 --- a/pathways/__init__.py +++ b/pathways/__init__.py @@ -5,3 +5,4 @@ __author__ = """Monica R. Ticlla Ccenhua""" __email__ = 'monica.ticlla@sib.swiss' __version__ = '0.1.0' + diff --git a/pathways/graph.py b/pathways/graph.py new file mode 100644 index 0000000000000000000000000000000000000000..433024ffe77c81a18a548d90519b301046d4808c --- /dev/null +++ b/pathways/graph.py @@ -0,0 +1,262 @@ +""" + pathways.graph + + + This module implements the :class:`GlobalGraph` object. + + + :copyright: (c) 2018 by Monica Ticlla. + :license: MIT, see LICENSE for more details. + +""" + + +from igraph import Graph + + +__author__ = "Monica Ticlla" +__email__ = "monica.ticlla@sib.swiss" +__copyright__ = "Copyright (C) 2018 Monica Ticlla" +__license__ = "MIT see LICENSE for more details" + + +class GlobalGraph(Graph): + """ + The :class:`GlobalGraph` object is a subclass of :class:`igraph.Graph` and provides an interface for + handling the `igraph.Graph <http://igraph.org/python/doc/igraph.Graph-class.html>`_ object created with + :py:meth:`~pathways.graph.create_global_graph`. A :class:`GlobalGraph` object has three types of nodes or + vertices ('snp', 'sample', 'gene'). Edges in the :class:`GlobalGraph` are undirected and connect a node of type 'snp' + with a node of type sample or with a node of type 'gene'. Edges are also connecting vertices of type 'gene' to represent + gene-to-gene interactions. + + A :class:`GlobalGraph` object is created only by :py:meth:`~pathways.graph.create_global_graph` function. + + """ + + @property + def samples_names(self): + """Finds all nodes of type 'sample' in the graph and returns their names. + + Returns: + A list with the names of 'sample' nodes. + + """ + samples = self.vs.select(type_eq = 'sample')['name'] + return samples + + @property + def snps_names(self): + """Finds all nodes of type 'snp' in the graph and returns their names. + + Returns: + A list with the names of SNP nodes in the graph. + + """ + snps = self.vs.select(type_eq = 'snp')['name'] + return snps + + @property + def genes_names(self): + """Finds all nodes of type 'gene' in the graph and returns their names. + + Returns: + List of names of gene nodes in the graph. + + """ + genes = self.vs.select(type_eq = 'gene')['name'] + return genes + + @property + def samples_indices(self): + """Finds all nodes of type 'sample' in the graph and returns their numerical indices. + + Returns: + List of the indices of sample nodes in the graph. + + """ + + samples_ixs = [v.index for v in self.vs.select(type_eq = 'sample')] + return samples_ixs + + @property + def snps_indices(self): + """Finds all nodes of type 'snp' in the graph and returns their numerical indices. + + Returns: + A List of indices of SNP nodes in the graph. + + """ + snps_ixs = [v.index for v in self.vs.select(type_eq = 'snp')] + return snps_ixs + + @property + def genes_indices(self): + """Finds all nodes of type 'gene' in the graph and returns their numerical indices. + + Returns: + A list of indices of genes nodes in the graph. + + """ + genes_ixs = [v.index for v in self.vs.select(type_eq = 'gene')] + return genes_ixs + + def indices(self, names): + """Given the list of node names, finds these nodes and returns their indices in the :class:`Graph` object. + + Args: + names (list): list of vertices names + + Returns: + A list of indices for the given nodes names + + """ + # TODO(monica.ticlla@sib.swiss): Raise warning when no nodes with a name in names is found in the :class:`Graph` object. + + indices = [v.index for v in self.vs.select(name_in = names)] + return indices + + +def create_global_graph(snp_sample, snp_gene, gene_gene): + """Creates an instance of :class:`GlobalGraph`. + + Args: + snp_sample (:class:`pandas.DataFrame` object): A pandas :class:`DataFrame` object with 'SNPs' as rows and 'samples' + as columns. Labels of rows should be the names of the 'SNPs' and labels of columns should be the names of + the 'samples'. Duplicates are not allowed. For a given row and column index, the value of the entry is '1' + if the 'SNP' is present in that 'sample' or '0' otherwise. + snp_gene (:class:`pandas.DataFrame` object): A pandas :class:`DataFrame` object with 'SNPs' as rows and 'genes' + as columns. Labels of rows should be the names of the 'SNPs' and labels of columns should be the names of + the 'genes'. The number, labels and order of rows should be identical to those of snp_sample. + For a given row and column index, the value of the entry is '1' if the 'SNP' is present in that 'gene' or + '0' otherwise. + gene_gene (:class:`pandas.DataFrame` object): A pandas :class:`DataFrame` object with two columns. Each row represents + the interaction of two genes, and the two columns contain the name of the interacting genes. + + Example: :: + + from pathways import graph + import numpy as np + import pandas as pd + + + sample_ids = ['sample1','sample2','sample3','sample4','sample5'] + snp_ids = ['snp1','snp2','snp3','snp4'] + gene_ids = ['gene1','gene2','gene3','gene4','gene5'] + + snp_sample_edges = [('snp1','sample1'), + ('snp2','sample2'), + ('snp3','sample3'), + ('snp4','sample4'), + ('snp4','sample5')] + snp_gene_edges = [('snp1','gene1'), + ('snp2','gene1'), + ('snp3','gene2'), + ('snp4','gene3')] + + gene_gene_edges = [('gene1','gene5'), + ('gene2','gene4'), + ('gene3','gene4')] + + data_zeros = np.zeros((len(snp_ids), len(sample_ids))) + + snp_sample_binary = pd.DataFrame(data_zeros, index=snp_ids, columns=sample_ids) + for snp_sample_edge in snp_sample_edges: + snp_sample_binary.loc[snp_sample_edge[0],snp_sample_edge[1]] = 1 + + data_zeros = np.zeros((len(snp_ids), len(gene_ids))) + + snp_gene_binary = pd.DataFrame(data_zeros, index=snp_ids, columns=gene_ids) + for snp_gene_edge in snp_gene_edges: + snp_gene_binary.loc[snp_gene_edge[0],snp_gene_edge[1]] = 1 + + gene_gene_df = pd.DataFrame(gene_gene_edges, columns=list('AB'), index=np.arange(0,len(gene_gene_edges))) + + my_global_graph = graph.create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + """ + + snp_sample_adj = snp_sample + snp_gene_adj = snp_gene + + ################################################################################## + # Clean datasets + ################################################################################## + # Remove samples with no SNPs + samples_noSNPs = snp_sample_adj.columns.values[snp_sample_adj.sum(axis=0) == 0] + snp_sample_adj.drop(samples_noSNPs, axis=1, inplace=True) + + snps_todrop = list() + # Remove SNPs with zero frequency + snps_noSamples = list(snp_sample_adj.index.values[snp_sample_adj.sum(axis=1) == 0]) + if len(snps_noSamples)>0: + snps_todrop.extend(snps_noSamples) + # Remove SNPs not assigned to a gene + snps_noGene = list(snp_gene_adj.index.values[snp_gene_adj.sum(axis=1) == 0]) + if len(snps_noGene)>0: + snps_todrop.extend(snps_noGene) + # put together SNPs to drop + if len(snps_todrop) > 0: + snps_todrop = set(snps_todrop) + snp_sample_adj.drop(snps_todrop, axis=0, inplace=True) + snp_gene_adj.drop(snps_todrop, axis=0, inplace=True) + + ################################################################################## + # List of vertices for graph + ################################################################################## + samples_names = snp_sample_adj.columns.values + snps_names= snp_sample_adj.index.values + + genes_names = list() + genes_names.extend(snp_gene_adj.columns.values) + genes_names.extend(gene_gene.iloc[:,0].values) + genes_names.extend(gene_gene.iloc[:,1].values) + genes_names = list(set(genes_names)) + + + ################################################################################## + # Create graph + ################################################################################## + + # Add vertices + #--------------- + + global_graph = GlobalGraph() + # First add vertices of type 'sample' + global_graph.add_vertices(samples_names) + global_graph.vs['type'] = 'sample' + # Then, add nodes of type 'snp' + global_graph.add_vertices(snps_names) + global_graph.vs.select(name_in = snps_names)['type'] = 'snp' + # Finally add nodes of type 'gene' + global_graph.add_vertices(genes_names) + global_graph.vs.select(name_in = genes_names)['type'] = 'gene' + + # Add edges + #--------------- + snp_sample_edges = [] + snp_gene_edges = [] + gene_gene_edges = zip(gene_gene.iloc[:,0].values,gene_gene.iloc[:,1].values) + + for snp in snps_names: + sample_ids = snp_sample_adj.columns.values[snp_sample_adj.loc[snp,:] == 1] + gene_ids = snp_gene_adj.columns.values[snp_gene_adj.loc[snp,:] == 1] + + snp_ids_for_samples = [snp]*len(sample_ids) + snp_ids_for_genes = [snp]*len(gene_ids) + + snp_sample_edges.extend(zip(snp_ids_for_samples,sample_ids)) + snp_gene_edges.extend(zip(snp_ids_for_genes,gene_ids)) + + global_graph.add_edges(snp_sample_edges) + global_graph.add_edges(snp_gene_edges) + global_graph.add_edges(gene_gene_edges) + + # Check graph is connected + # if not global_graph.is_connected(): + return global_graph + + + + diff --git a/pathways/kernel.py b/pathways/kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..b763fa84ba79cad4ff7d370a351bc720f5b3cd10 --- /dev/null +++ b/pathways/kernel.py @@ -0,0 +1,412 @@ +""" + pathways.kernel + + + This module implements the :class:`GraphKernel` object. + + + :copyright: (c) 2018 by Monica Ticlla. + :license: MIT, see LICENSE for more details. + +""" + +import numpy as np +from numpy.matlib import repmat +import subprocess +from igraph.datatypes import Matrix +from scipy.sparse import csc_matrix,lil_matrix,issparse +from scipy.sparse import linalg +from scipy.linalg import inv,eigh,solve + + +__author__ = "Monica Ticlla" +__email__ = "monica.ticlla@sib.swiss" +__copyright__ = "Copyright (C) 2018 Monica Ticlla" +__license__ = "MIT see LICENSE for more details" + + +class GraphKernel(object): + """Generic Kernel of a graph. + + This class is built to provide an interface for handling the Kernel matrix of :class:`GlobalGraph` object. + + """ + import warnings + warnings.filterwarnings("always") + + def __init__(self, *args, **kwds): + """__init__(data=None, vertices=None, method=None, alpha=None, + centered=False, normalized=False, as_distance=False) + + Constructs the kernel of a graph. + + Args: + + matrix: the elements of the matrix as a list of lists or C{None} to create a 0x0 matrix. + vertices: the vertices of the graph as a C{igraph.VertexSeq} object. + method: + alpha: + centered: + normalized: + as_distance: + + """ + # Set up default values for the parameters. This should match the order in *args + kwd_order = ["matrix", "vertices", "method", "alpha", "centered", 'normalized','as_distance'] + params = [[],None,None,None,False,False,False] + + args = list(args) + + # Override default parameters from args + params[:len(args)] = args + # Override default parameters from keywords + for idx, k in enumerate(kwd_order): + if k in kwds: + params[idx] = kwds[k] + # Now, translate the params list to argument names + matrix, vertices, method, alpha, centered, normalized, as_distance = params + + self.matrix = matrix + self.vertices = vertices + self.__attributes = dict() + self.__attributes['method'] = method + self.__attributes['alpha'] = alpha + self.__attributes['centered'] = centered + self.__attributes['normalized'] = normalized + self.__attributes['as_distance'] = as_distance + + @property + def matrix(self): + """Returns the kernel matrix as a sparse matrix""" + + return self.__matrix + + @matrix.setter + def matrix(self, data=None): + """Sets the data stored in the matrix as a sparse matrix""" + if data is not None: + if not issparse(data): + self.__matrix = csc_matrix(data) + elif issparse(data): + self.__matrix = data + + @property + def shape(self): + return self.__matrix.shape + + @property + def alpha(self): + return self.__attributes['alpha'] + + @property + def method(self): + return self.__attributes['method'] + @property + def normalized(self): + return self.__attributes['normalized'] + + @property + def attributes(self): + return self.__attributes + + def center(self, inplace=True): + if inplace: + self.matrix = center_kernel(self.matrix) + self.__attributes['centered'] = True + elif not inplace: + return GraphKernel(center_kernel(self.matrix), + vertices=self.vertices, + centered=True, alpha=self.alpha, + method=self.method) + + def normalize(self, inplace=True): + if inplace: + self.matrix=normalize_kernel(self.matrix) + self.__attributes['normalized'] = True + elif not inplace: + return GraphKernel(normalize_kernel(self.matrix), + vertices=self.vertices, + normalized=True, alpha=self.alpha, + method=self.method) + + def as_distance(self, inplace=True): + # EE contains the squared distances + if not self.normalized: + EE = to_distance(self.matrix, is_normalized=False) + elif self.normalized: + EE = to_distance(self.matrix, is_normalized=True) + + if inplace: + self.matrix = EE + self.__attributes['as_distance'] = True + elif not inplace: + return GraphKernel(EE, vertices=self.vertices, + as_distance=True, alpha=self.alpha, + method=self.method) + + def head(self, n=5, m=5): + """Get the first n rows and m columns from self. + + Args: + n (int): optional + The number of rows to get. This number must be + greater than 0. If not specified, 5 rows will be retrieved. + m (int): optional + The number of columns (samples) to get. This number must be + greater than 0. If not specified, 5 columns will be retrieved. + + Returns: + A subset of the kernel matrix in dense format + + """ + if self.__matrix.shape[0]>=5: + return self.__matrix[:n,:m].todense() + elif self.__matrix.shape[0]<5: + return self.matrix.todense() + + def subset(self, rows, cols): + """Subsets the kernel matrix according to given indices for rows and columns. + + Args: + rows: list with numerical indices of the Kernel matrix + cols: list with numerical indices of the kernel matrix + + Returns: + A subset of the kernel matrix in sparse format + + """ + return subset_csc(self.matrix,rows_indices=rows,cols_indices=cols) + + +def expm_eig(A): + """Computes the matrix exponential for a square, symmetric matrix, by eigenvalue decomposition. + + Args: + A: a square and symmetric matrix + + """ + D, V = eigh(A) + #np.exp(D) * V is equivalent to V @ np.diag(D) + return np.dot(np.exp(D) * V, inv(V)) + + +def expm_solveq(A): + """Computes the matrix exponential for a square, symmetric matrix, by eigenvalue decomposition. + + """ + D, V = eigh(A) + return solve(V.T, (np.exp(D) * V).T).T + + +def ed(A,alpha=0.1): + """Computes the exponential diffusion kernel (ed). + + Args: + A: Adjacency matrix of a graph in sparse format. + alpha: the diffusion time from node i to node j. Values are between 0 and 1 + + Returns: The exponential diffusion kernel matrix in sparse format. + + """ + + return csc_matrix(expm_solveq(alpha * A.todense())) + + +def led(L,alpha=0.1): + """Compute the Laplacian exponential diffusion kernel. + + Args: + L : The Laplacian matrix of a graph in sparse format. + alpha: The diffusion time from node i to node j. Values are between 0 and 1 + Returns: + The Laplacian exponential diffusion kernel in sparse format. + + """ + return csc_matrix(expm_solveq(-alpha * L.todense())) + + +def normalize_kernel(K): + """Normalizes the kernel matrix of a graph. + + 'K' is normalized if and only if Kii = 1 para todo i en {1, ..., n}. + + Args: + K: The kernel matrix of a graph in sparse format. + Returns: + The normalized (i.e cosine similarity) kernel matrix in sparse format + + """ + n = K.shape[0] + # Row vector + D = np.diag(K.todense()) + # + Kii = repmat(D, n, 1) + Kjj = repmat(np.reshape(D, (n, -1)), 1, n) + + return csc_matrix(K.todense() / (np.sqrt(Kii * Kjj))) + + +def center_kernel(K): + """Centers the kernel matrix of a graph. + + A centered kernel matrix corresponds to inner products of centered node vectors if and only if Ke =0. + e is a column vector, all of whose elements are 1 (e = [1,1,1, ..., 1].T). + + Args: + K: The kernel matrix of a graph in sparse format. + Returns: + The centered kernel matrix in sparse format + + """ + n = K.shape[0] + K_row_means = K.mean(axis=0) + ones_column_vector = np.ones((n, 1)) + + # This is a product of a column vector by row vector + # creates a matrix with identical rows, each being the vector of row means of K + K_row_means_n_times = ones_column_vector * K_row_means + + # mean of row means + mean_of_K_row_means = K_row_means.mean() + + # the centered kernel + C = K.todense() - K_row_means_n_times.T - K_row_means_n_times + mean_of_K_row_means * np.ones((n, n)) + + return csc_matrix(C) + + +def to_distance(K, is_normalized=False): + """Derives the distance measure associated to the kernel matrix K. + + Args: + K: The kernel matrix of a graph in sparse format. + Returns: + The squared distances in a sparse format matrix + + """ + if not is_normalized: + n = K.shape[0] + # Row vector + D = np.diag(K.todense()) + # + Kii = repmat(D, n, 1) + Kjj = repmat(np.reshape(D, (n, -1)), 1, n) + EE = Kii + Kjj - 2 * np.array(K.todense()) + elif is_normalized: + EE = 2 * (1 - np.array(K.todense())) + + EE[EE < 0.0] = 0.0 + + return csc_matrix(EE) + + +def is_sequence(v): + """Check if array C{v} is a sequence of numbers. + + Args: + v: Array of numbers + Returns: bool + """ + v_start = v[0] + v_end = v[-1] + return np.array_equal(v, np.arange(v[0],v[-1] + 1)) + + +def subset_csc(M, rows_indices,cols_indices): + """Subset a CSC sparse matrix 'M' given the specified indices for rows (rows_indices) and columns (cols_indices). + + Args: + M: The CSC matrix + rows_indices: + cols_indices: + Returns: A subset of M in CSC matrix format + + """ + if is_sequence(rows_indices) & is_sequence(cols_indices): + row_start_ix = rows_indices[0] + row_end_ix = rows_indices[-1] + 1 + col_start_ix = cols_indices[0] + col_end_ix = cols_indices[-1] + 1 + + new_M = M[row_start_ix:row_end_ix,col_start_ix:col_end_ix] + return new_M + + elif is_sequence(rows_indices) & (not is_sequence(cols_indices)): + row_start_ix = rows_indices[0] + row_end_ix = rows_indices[-1] + 1 + try: + # May raise IndexError + new_M = M[row_start_ix:row_end_ix,cols_indices] + except IndexError as e: + print(e) + raise e + else: + return new_M + + elif (not is_sequence(rows_indices)) & is_sequence(cols_indices): + col_start_ix = cols_indices[0] + col_end_ix = cols_indices[-1] + 1 + try: + # May raise IndexError + new_M = M[rows_indices, col_start_ix:col_end_ix] + except IndexError as e: + print(e) + raise e + else: + return new_M + + elif (not is_sequence(rows_indices)) & (not is_sequence(cols_indices)): + try: + # May raise IndexError + new_M = M[rows_indices,:][:,cols_indices] + except IndexError as e: + print(e) + raise e + else: + return new_M + + +def get_kernel(graph, method='ed', alpha=0.01): + """Compute the kernel matrix from an adjacency matrix. + + Args: + graph: :class:`Graph` or :class:`GlobalGraph` object + method: any element from this list ['ed', 'led','vnd'] + - ed: The Exponential Diffusion kernel + - led: The Laplacian Exponential Diffusion kernel + - vnd: The Von Neumann Diffusion kernel + alpha: diffusion time from node i to node j + + Returns: + A :class:`GraphKernel` object + + """ + + # The exponential diffusion kernel + if method == 'ed': + # 1. Get the adjacency matrix of the graph + A = csc_matrix(graph.get_adjacency().data) + # 2. Compute the matrix exponential using eigenvalue decomposition + K = ed(A, alpha) + + # The Laplacian exponential diffusion kernel + elif method == 'led': + # 1. Get the Laplacian matrix of the graph + L = csc_matrix(graph.laplacian()) + # 2. Compute matrix exponential + K = led(L, alpha) + + # The von Neumann diffusion kernel + elif method == 'vnd': + # 1. Get the adjacency matrix of the graph + A = lil_matrix(graph.get_adjacency().data) + # 2. + A= -alpha * A + # 3. This is equivalent to adding the Identity matrix + A.setdiag(1) + #4 + K = inv(A.tocsc) + + K[K<0.0] = 0.0 + + return GraphKernel(K, vertices=graph.vs, alpha=alpha, method=method) diff --git a/pathways/pathways.py b/pathways/pathways.py deleted file mode 100644 index 7fbbae4f9c58882c3754a89675312f3c1430ffd8..0000000000000000000000000000000000000000 --- a/pathways/pathways.py +++ /dev/null @@ -1,3 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Main module.""" diff --git a/pathways/pathways_relevance.py b/pathways/pathways_relevance.py new file mode 100644 index 0000000000000000000000000000000000000000..581f7655d22c27521dd29fb09de9923b36f688bc --- /dev/null +++ b/pathways/pathways_relevance.py @@ -0,0 +1,298 @@ +""" + pathways.pathways_relevance + + + This module implements the :class:`PathwaysScores` object. + + + :copyright: (c) 2018 by Monica Ticlla. + :license: MIT, see LICENSE for more details. + +""" + +from igraph import Graph +import numpy as np +import pandas as pd +import random +from itertools import chain +import matplotlib.pyplot as plt +from pathways import util + +__author__ = "Monica Ticlla" +__email__ = "monica.ticlla@sib.swiss" +__copyright__ = "Copyright (C) 2018 Monica Ticlla" +__license__ = "MIT see LICENSE for more details" + + +class PathwaysScores(object): + """This class is built to provide an interface for handling the list of pathways and their scores computed by + :py:meth:`score_pathways`. + + """ + def __init__(self, *args, **kwds): + """__init__(pathways, names, scores, p_scores, gene_scores) + + Constructs the PathwaysScores object. + + Args: + pathways : dictionary of pathways with list of genes as values for each pathway key + names : dictionary of pathways with the long names of each pathway as values + scores : dictionary of pathways with raw relevance score as value for each pathway key + p_scores : dictionary of pathways with the probability of its raw relevance score as value + genes_scores: dictionary of pathways with dictionary of raw scores for each gene + + + """ + + # Set up default values for the parameters. This should match the order in *args + kwd_order = ['pathways','names','scores','p_scores','genes_scores'] + params = [{},{},{},{},{}] + + args = list(args) + + # Override default parameters from args + params[:len(args)] = args + # Override default parameters from keywords + for idx, k in enumerate(kwd_order): + if k in kwds: + params[idx] = kwds[k] + # Now, translate the params list to argument names + pathways, names, scores, p_scores, genes_scores = params + + self.pathways = pathways + self.names = names + self.scores = scores + self.p_scores = p_scores + self.genes_scores = genes_scores + + def __get_sorted_pscores(self, threshold=1.0): + """Sort self.p_scores + + Args: + threshold: maximum value for the pscore of a pathway. + Pathways with pscores above this threshold ar not considered. + + Returns: + A sorted list of tuples with each element having this form '(p_score, pathway_id)' + + """ + sorted_pscores = sorted((pscore, pathway) for (pathway, pscore) in self.p_scores.items() if pscore <= threshold) + return sorted_pscores + + def significant_pathways(self, threshold=0.05): + """Retrieves IDs of pathways with pscore bellow threshold. + + Args: + threshold: maximum value for the pscore of a pathway + Returns: + List of IDs for pathways with pscores bellow threshold + + """ + sorted_pscores = self.__get_sorted_pscores(threshold=threshold) + pathways_ids = [pathway for pscore, pathway in sorted_pscores] + return pathways_ids + + def genes_in_significant_pathways(self, threshold=0.05): + """Retrieves a sorted list with all the genes in pathways with pscores bellow threshold. + + Args: + threshold: maximum value for the pscore of a pathway + Returns: + List of two-element tuples, where the first element corresponds to the gene score and the second element + corresponds to the gene id + + """ + pathways_significant = self.significant_pathways(threshold=threshold) + genes_in_pathways = list(set(chain.from_iterable(self.pathways[pathway] for pathway in pathways_significant))) + genes_in_pathways_and_scores = sorted((self.genes_scores.get(gene,0), gene) for gene in genes_in_pathways) + genes_in_pathways_and_scores.sort(reverse=True) + + return genes_in_pathways_and_scores + + def plot_pathways(self, threshold=0.05, ax=None): + """Plot ranked list of pathways as an horizontal bar chart. + + X-axis corresponds to the -log10 of the pathways p-scores. + + Args: + threshold: maximum value for the pscore of a pathway + ax: + + Returns: instance of :class:`Figure` + + """ + + sorted_pscores = self.__get_sorted_pscores() + pathways_names_pscores_sorted = [(self.names[pathway],np.abs(np.log10(pscore)*-1)) + for pscore, pathway in sorted_pscores if pscore <= threshold] + + if not ax: + ax = plt.gca() + + ax.barh(np.arange(len(pathways_names_pscores_sorted)), + [value for name,value in pathways_names_pscores_sorted], + align='center') + + ax.set_yticks(np.arange(len(pathways_names_pscores_sorted))) + ax.set_yticklabels([name for name, value in pathways_names_pscores_sorted]) + ax.invert_yaxis() + ax.set_xlabel('-log10(pscore)') + ax.set_title('Ranking of pathways') + + ax.tick_params(axis='x', which='both', labelbottom=True, labeltop=False) + + return ax + + def plot_genes_in_significant_pathways(self, threshold=0.05, ax=None): + """Plots a heatmap of scores of genes from pathways with p-scores bellow threshold. + + Args: + threshold: maximum value for the pscore of a pathway + ax: + Returns: + instance of :class:`Figure` + """ + + pathways_significant = self.significant_pathways(threshold=threshold) + genes_in_pathways_and_scores = self.genes_in_significant_pathways(threshold=threshold) + genes = [item[1] for item in genes_in_pathways_and_scores] + + data_matrix = np.empty((len(genes), len(pathways_significant))) + data_matrix[:] = np.nan + + # Create dataframe filled with zeroes + genes_in_pathways_and_scores_df = pd.DataFrame(data_matrix, index=genes, columns=pathways_significant) + + # Fill dataframe + for pathway in pathways_significant: + pathway_genes = list(self.pathways[pathway].flat) + genes_in_pathways_and_scores_df.loc[pathway_genes, pathway] = list(self.genes_scores.get(gene, np.nan) + for gene in pathway_genes) + # Remove genes with nan values for all pathways + genes_in_pathways_and_scores_df = genes_in_pathways_and_scores_df.loc[~genes_in_pathways_and_scores_df.isnull().all(1),:] + + pathways_significant_names = [self.names[pathway_id] for pathway_id in pathways_significant] + + if not ax: + ax = plt.gca() + + + im, cbar = util.heatmap(data = genes_in_pathways_and_scores_df.as_matrix(), + row_labels = genes, col_labels=pathways_significant_names, + ax=ax, cbar_kw = {'shrink':0.15, 'orientation':'vertical'}, + cmap="YlGn", cbarlabel="Gene score") + + return ax + + + +def score_pathways(global_graph, graph_kernel, pathways, names, samples=None, threshold=1, permutations=100): + """Computes relevance scores for pathways given a :class:`GlobalGraph` object and its :class:`GraphKernel` object. + Args: + global_graph: a graph of class GlobalGraph + graph_kernel: the corresponding kernel of global_graph. It's an object of class GraphKernel. + pathways: a dictionary of pathways + names: + samples: list of samples names for which to compute the pathway scores + threshold: minimum number of genes in pathway to compute score + permutations: number of permutations to compute the pathway scores + Returns: + Dictionary of pathways and their corresponding scores + """ + + gene_indices = global_graph.genes_indices + gene_names = global_graph.genes_names + + if samples is not None: + sample_indices = global_graph.indices(names=samples) + else: + sample_indices = global_graph.samples_indices + + # Extract similarities between a sample and a gene from kernel matrix + per_gene_mean_similarities = graph_kernel.subset(rows=sample_indices,cols=gene_indices).mean(0) + # indices of gene in the sample x gene similarity matrix + gene_names_ix = {gene: ix for ix, gene in enumerate(gene_names)} + + my_pathways = pathways + scores_for_pathways = dict() + p_scores_for_pathways = dict() + scores_per_gene = dict(zip(gene_names, list(per_gene_mean_similarities.flat))) + + progress_counter = 0 + for pathway in my_pathways: + progress_counter += 1 + + if progress_counter % 10 == 0: + print('_') + + similarities_permutations = np.zeros(permutations + 1) + genes_from_pathway = my_pathways[pathway] + + if len(genes_from_pathway) >= threshold: + genes_in_path_indices = sorted(gene_names_ix[gene] for gene in genes_from_pathway if gene in gene_names_ix) + gene_indices_to_shuffle = np.arange(len(gene_names_ix)) + + # Compute score only if at least one gene in the pathway + # is present in the kernel matrix + if len(genes_in_path_indices) >= 1: + for permutation in range(permutations + 1): + if permutation == 0: + scores_for_pathways[pathway] = per_gene_mean_similarities[:, genes_in_path_indices].mean() + similarities = per_gene_mean_similarities[:, genes_in_path_indices] + elif permutation > 0: + random.shuffle(gene_indices_to_shuffle) + similarities = per_gene_mean_similarities[:,gene_indices_to_shuffle[0:len(genes_in_path_indices)]] + + similarities_permutations[permutation] = similarities.mean() + + p_scores_for_pathways[pathway] = ((similarities_permutations[1:] >= similarities_permutations[0]).sum() + 1) / \ + (permutations + 1) + + else: + print('{} : no genes found in kernel matrix'.format(names[pathway])) + scores_for_pathways[pathway] = 0 + p_scores_for_pathways[pathway] = 1 + else: + scores_for_pathways[pathway] = np.nan + p_scores_for_pathways[pathway] = np.nan + + return PathwaysScores(pathways=my_pathways, names=names, scores=scores_for_pathways, + p_scores=p_scores_for_pathways, genes_scores=scores_per_gene) + + +def random_means_for_subset(similarity_matrix, subset_size, permutations, nr_cores=None): + """ + Args: + similarity_matrix: + subset_size: + permutations: + nr_cores: + Returns: + + """ + + # TODO: incorporate this function in score_pathways() + + nr_of_genes_in_similarity_matrix = similarity_matrix.shape[1] + random_genes_indices = [list(np.random.choice(nr_of_genes_in_similarity_matrix, subset_size, + replace=False)) for _ in range(permutations)] + + similarities_permutations = [similarity_matrix[:, permuted_indices].mean() for permuted_indices in random_genes_indices] + + return similarities_permutations + + + + + + + + + + + + + + + + diff --git a/pathways/util.py b/pathways/util.py new file mode 100644 index 0000000000000000000000000000000000000000..c7d261a9c2322d15c818935c5620a437bb075ff6 --- /dev/null +++ b/pathways/util.py @@ -0,0 +1,142 @@ +""" + pathways.util + + + This module contains various functions. + + + :copyright: (c) 2018 by Monica Ticlla. + :license: MIT, see LICENSE for more details. + +""" + +import numpy as np +import matplotlib +import matplotlib.pyplot as plt +from copy import copy + +__author__ = "Monica Ticlla" +__email__ = "monica.ticlla@sib.swiss" +__copyright__ = "Copyright (C) 2018 Monica Ticlla" +__license__ = "MIT see LICENSE for more details" + + +def heatmap(data, row_labels, col_labels, ax=None, + cbar_kw={}, cbarlabel="", **kwargs): + """Creates a heatmap from a numpy array and two lists of labels. + + Args: + data : A 2D numpy array of shape (N,M) + row_labels : A list or array of length N with the labels + for the rows + col_labels : A list or array of length M with the labels + for the columns + ax : A matplotlib.axes.Axes instance to which the heatmap + is plotted. If not provided, use current axes or + create a new one. + cbar_kw : A dictionary with arguments to + :meth:`matplotlib.Figure.colorbar`. + cbarlabel : The label for the colorbar + All other arguments are directly passed on to the imshow call. + + """ + + if not ax: + ax = plt.gca() + + masked_data = np.ma.array(data, mask=np.isnan(data)) + + if 'cmap' in kwargs: + cmap = copy(plt.get_cmap(kwargs['cmap'])) + cmap.set_bad('grey', 1.) + kwargs['cmap'] = cmap + im = ax.imshow(masked_data, **kwargs) + else: + cmap = copy(matplotlib.cm.jet) + cmap.set_bad('grey', 1.) + im = ax.imshow(masked_data, cmap=cmap,**kwargs) + + # Plot the heatmap + #im = ax.imshow(data, **kwargs) + #im = ax.imshow(masked_data,**kwargs) + + # Create colorbar + cbar = ax.figure.colorbar(im, ax=ax, **cbar_kw) + cbar.ax.set_ylabel(cbarlabel, rotation=-90, va="bottom") + + # We want to show all ticks... + ax.set_xticks(np.arange(data.shape[1])) + ax.set_yticks(np.arange(data.shape[0])) + # ... and label them with the respective list entries. + ax.set_xticklabels(col_labels) + ax.set_yticklabels(row_labels) + + # Let the horizontal axes labeling appear on top. + ax.tick_params(top=True, bottom=False, + labeltop=True, labelbottom=False) + + # Rotate the tick labels and set their alignment. + plt.setp(ax.get_xticklabels(), rotation=-30, ha="right", + rotation_mode="anchor") + + # Turn spines off and create white grid. + for edge, spine in ax.spines.items(): + spine.set_visible(False) + + ax.set_xticks(np.arange(data.shape[1]+1)-.5, minor=True) + ax.set_yticks(np.arange(data.shape[0]+1)-.5, minor=True) + ax.grid(which="minor", color="w", linestyle='-', linewidth=3) + ax.tick_params(which="minor", bottom=False, left=False) + + return im, cbar + + +def annotate_heatmap(im, data=None, valfmt="{x:.2f}", textcolors=["black", "white"], threshold=None, **textkw): + """A function to annotate a heatmap. + + Args: + im : The AxesImage to be labeled. + data : Data used to annotate. If None, the image's data is used. + valfmt : The format of the annotations inside the heatmap. + This should either use the string format method, e.g. + "$ {x:.2f}", or be a :class:`matplotlib.ticker.Formatter`. + textcolors : A list or array of two color specifications. The first is + used for values below a threshold, the second for those + above. + threshold : Value in data units according to which the colors from + textcolors are applied. If None (the default) uses the + middle of the colormap as separation. + + Further arguments are passed on to the created text labels. + + """ + + if not isinstance(data, (list, np.ndarray)): + data = im.get_array() + + # Normalize the threshold to the images color range. + if threshold is not None: + threshold = im.norm(threshold) + else: + threshold = im.norm(data.max())/2. + + # Set default alignment to center, but allow it to be + # overwritten by textkw. + kw = dict(horizontalalignment="center", + verticalalignment="center") + kw.update(textkw) + + # Get the formatter in case a string is supplied + if isinstance(valfmt, str): + valfmt = matplotlib.ticker.StrMethodFormatter(valfmt) + + # Loop over the data and create a `Text` for each "pixel". + # Change the text's color depending on the data. + texts = [] + for i in range(data.shape[0]): + for j in range(data.shape[1]): + kw.update(color=textcolors[im.norm(data[i, j]) > threshold]) + text = im.axes.text(j, i, valfmt(data[i, j], None), **kw) + texts.append(text) + + return texts diff --git a/requirements_dev.txt b/requirements_dev.txt index 5c9dcb6277102ae8d8bafdf2276317adb85131a1..6a253e4d9cfd89464be5ef847d66e9ae087d238a 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -1,11 +1,91 @@ -pip==8.1.2 +alabaster==0.7.10 +argh==0.26.2 +Babel==2.5.1 +backcall==0.1.0 +bcbio-gff==0.6.4 +biopython==1.72 +bleach==2.1.3 bumpversion==0.5.3 -wheel==0.29.0 -watchdog==0.8.3 -flake8==2.6.0 -tox==2.3.1 +certifi==2018.4.16 +cffi==1.11.2 +chardet==3.0.4 +click==6.7 coverage==4.1 -Sphinx==1.4.8 cryptography==1.7 +cycler==0.10.0 +decorator==4.1.2 +docutils==0.14 +entrypoints==0.2.3 +flake8==2.6.0 +html5lib==1.0.1 +idna==2.6 +igraph==0.1.11 +imagesize==0.7.1 +ipykernel==4.8.2 +ipython==6.4.0 +ipython-genutils==0.2.0 +ipywidgets==7.2.1 +jedi==0.12.0 +Jinja2==2.9.6 +jsonschema==2.6.0 +jupyter==1.0.0 +jupyter-client==5.2.3 +jupyter-console==5.2.0 +jupyter-core==4.4.0 +kiwisolver==1.0.1 +llvmlite==0.23.2 +MarkupSafe==1.0 +matplotlib==2.2.2 +mccabe==0.5.3 +mistune==0.8.3 +nbconvert==5.3.1 +nbformat==4.4.0 +networkx==2.0 +nose==1.3.7 +notebook==5.5.0 +numba==0.38.1 +numpy==1.13.3 +packaging==17.1 +pandas==0.21.0 +pandocfilters==1.4.2 +parso==0.2.1 +pathtools==0.1.2 +pexpect==4.6.0 +pickleshare==0.7.4 +plotly==2.7.0 +pluggy==0.3.1 +prompt-toolkit==1.0.15 +ptyprocess==0.5.2 +py==1.4.34 +pyasn1==0.3.7 +pycodestyle==2.0.0 +pycparser==2.18 +pyflakes==1.2.3 +Pygments==2.2.0 +pyparsing==2.2.0 +pytest==3.2.3 +python-dateutil==2.6.1 +python-igraph==0.7.1.post6 +pytz==2017.3 PyYAML==3.11 - +pyzmq==17.0.0 +qtconsole==4.3.1 +requests==2.19.1 +scipy==1.0.0 +Send2Trash==1.5.0 +simplegeneric==0.8.1 +six==1.11.0 +snowballstemmer==1.2.1 +Sphinx==1.7.6 +sphinxcontrib-websupport==1.1.0 +terminado==0.8.1 +testpath==0.3.1 +tornado==5.0.2 +tox==2.3.1 +traitlets==4.3.2 +urllib3==1.23 +virtualenv==15.1.0 +watchdog==0.8.3 +wcwidth==0.1.7 +webencodings==0.5.1 +widgetsnbextension==3.2.1 diff --git a/setup.py b/setup.py index 66c9a3436dcaf33c6989b083dd83df6cda05eb5d..2c8f61d8932d7e04c0cef769cdf50d9db18a8008 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,11 @@ with open('HISTORY.rst') as history_file: requirements = [ 'Click>=6.0', + 'scipy>=1.0.0', + 'python-igraph==0.7.1.post6', + 'numpy>=1.13.3', + 'pandas>=0.21.0', + 'matplotlib>=2.2.2', # TODO: put package requirements here ] @@ -31,7 +36,7 @@ setup( long_description=readme + '\n\n' + history, author="Monica R. Ticlla Ccenhua", author_email='monica.ticlla@sib.swiss', - url='https://github.com/mticlla/pathways', + url='https://git.scicore.unibas.ch/TBRU/pathways.git', packages=find_packages(include=['pathways']), entry_points={ 'console_scripts': [ diff --git a/tests/test_graph.py b/tests/test_graph.py new file mode 100644 index 0000000000000000000000000000000000000000..6489d4288fac0724621da1e78f8185e2852e23be --- /dev/null +++ b/tests/test_graph.py @@ -0,0 +1,182 @@ +import unittest +import warnings +from pathways.graph import GlobalGraph, create_global_graph +from igraph import Graph +import numpy as np +import pandas as pd + + +def check_equal_lists(L1, L2): + """Check if two lists are equal""" + if sorted(L1) == sorted(L2): + return True + else: + return False + + +class TestGraphCase(unittest.TestCase): + """pathways.graph test cases""" + + def setUp(self): + """Create simple graph object""" + global sample_ids, snp_ids, gene_ids, \ + snp_sample_edges, snp_gene_edges, gene_gene_edges, \ + snp_sample_binary, snp_gene_binary, gene_gene_df + + sample_ids = ['sample1','sample2','sample3','sample4','sample5'] + snp_ids = ['snp1','snp2','snp3','snp4'] + gene_ids = ['gene1','gene2','gene3','gene4','gene5'] + + snp_sample_edges = [('snp1','sample1'), + ('snp2','sample2'), + ('snp3','sample3'), + ('snp4','sample4'), + ('snp4','sample5')] + snp_gene_edges = [('snp1','gene1'), + ('snp2','gene1'), + ('snp3','gene2'), + ('snp4','gene3')] + + gene_gene_edges = [('gene1','gene5'), + ('gene2','gene4'), + ('gene3','gene4')] + + data_zeros = np.zeros((len(snp_ids), len(sample_ids))) + + snp_sample_binary = pd.DataFrame(data_zeros, index=snp_ids, columns=sample_ids) + for snp_sample_edge in snp_sample_edges: + snp_sample_binary.loc[snp_sample_edge[0],snp_sample_edge[1]] = 1 + + data_zeros = np.zeros((len(snp_ids), len(gene_ids))) + + snp_gene_binary = pd.DataFrame(data_zeros, index=snp_ids, columns=gene_ids) + for snp_gene_edge in snp_gene_edges: + snp_gene_binary.loc[snp_gene_edge[0],snp_gene_edge[1]] = 1 + + gene_gene_df = pd.DataFrame(gene_gene_edges, columns=list('AB'), index=np.arange(0,len(gene_gene_edges))) + + def ignore_warnings(test_func): + def do_test(self, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + test_func(self, *args, **kwargs) + + return do_test + + def tearDown(self): + """Teardown.""" + pass + + def test_GlobalGraph_isInstanceOf_igraphGraph(self): + """ + Test class GlobalGraph creates objects of class Graph from igraph + + """ + g = GlobalGraph() + assert isinstance(g,Graph) + + def test_create_global_graph(self): + """ + Test function create_global_graph creates object of class GlobalGraph. + + """ + + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + assert isinstance(my_graph,GlobalGraph) + + def test_samples_names(self): + """ + Test method samples_names from class GlobalGraph returns list of names for edges of type 'sample'. + + """ + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + assert check_equal_lists(my_graph.samples_names, sample_ids) + + def test_snps_names(self): + """ + Test method 'snps_names' from class GlobalGraph returns list of names for edges of type 'snp'. + + """ + + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + assert check_equal_lists(my_graph.snps_names, snp_ids) + + def test_genes_names(self): + """ + Test method 'genes_names' from class GlobalGraph returns list of names for edges of type 'gene'. + + """ + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + assert check_equal_lists(my_graph.genes_names, gene_ids) + + def test_samples_indices(self): + """ + Test method 'samples_indices' from class GlobalGraph returns list of indices for edges of type 'sample' + + """ + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + samples_indices = np.arange(0,len(sample_ids)) + + assert check_equal_lists(my_graph.samples_indices,samples_indices) + + def test_snps_indices(self): + """ + Test method 'snps_indices', from class GlobalGraph, returns list of indices for edges of type 'snp' + + """ + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + snps_indices = np.arange(len(sample_ids),len(sample_ids)+len(snp_ids)) + + assert check_equal_lists(my_graph.snps_indices,snps_indices) + + def test_genes_indices(self): + """ + Test method 'genes_indices', from class GlobalGraph, returns list of indices for edges of type 'gene'. + + """ + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + genes_indices = np.arange(len(sample_ids)+len(snp_ids), + len(sample_ids)+len(snp_ids)+len(gene_ids)) + + assert check_equal_lists(my_graph.genes_indices,genes_indices) + + def test_indices(self): + """ + Test method 'samples_indices' from class GlobalGraph returns list of indices for a list of edge names. + + """ + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + samples_indices = np.arange(0,len(sample_ids)) + + assert check_equal_lists(my_graph.indices(names=sample_ids), samples_indices) + + + + + + + diff --git a/tests/test_kernel.py b/tests/test_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..16850d84a45f28f92e3311c3539f8c20a8769f30 --- /dev/null +++ b/tests/test_kernel.py @@ -0,0 +1,209 @@ +import unittest +import warnings +from pathways.kernel import get_kernel, expm_eig, expm_solveq, ed, led, \ + normalize_kernel, center_kernel, to_distance, is_sequence, \ + subset_csc +from igraph import Graph +import numpy as np +from numpy import allclose +from scipy.sparse import linalg +from scipy.sparse import csc_matrix +import time + + +class TestKernelCase(unittest.TestCase): + """pathways.kernel test cases""" + + def setUp(self): + """Create simple graph object""" + global my_graph, my_graph_500 + + my_graph = Graph([(0,1), (0,2), (2,3), (3,4), (4,2), (2,5), (5,0), (6,3), (5,6)]) + my_graph_500 = Graph.Erdos_Renyi(n=500,p=0.3,directed=False) + + def ignore_warnings(test_func): + def do_test(self, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + test_func(self, *args, **kwargs) + + return do_test + + def tearDown(self): + """Teardown.""" + pass + + def test_expm_eig_vs_expm_solveq_in_speed_1000nodes(self): + my_graph_1000 = Graph.Erdos_Renyi(n=1000, p=0.3, directed=False) + print('Graph nr edges: {}'.format(my_graph_1000.ecount())) + print('Graph nr edges: {}'.format(my_graph_1000.vcount())) + + my_graph_1000_adj = csc_matrix(my_graph_1000.get_adjacency().data) + + time_start = time.clock() + my_kernel_expm_solveq = expm_solveq(my_graph_1000_adj.todense()*0.01) + time_end = time.clock() + print('expm_solveq time:{}'.format(time_end - time_start)) + + time_start_a = time.clock() + my_kernel_expm_eig = expm_eig(my_graph_1000_adj.todense() * 0.01) + time_end_a = time.clock() + print('expm_eig time:{}'.format(time_end_a - time_start_a)) + + assert allclose(my_kernel_expm_solveq, my_kernel_expm_eig) + + + def test_expm_eig_vs_expm_pade_in_sparse_500nodes(self): + + print('Graph nr edges: {}'.format(my_graph_500.ecount())) + print('Graph nr edges: {}'.format(my_graph_500.vcount())) + my_graph_500_adj = csc_matrix(my_graph_500.get_adjacency().data) + + time_start = time.clock() + my_kernel_expm_eig = expm_eig(my_graph_500_adj.todense()*0.01) + time_end = time.clock() + print('expm_eig time:{}'.format(time_end - time_start)) + + time_start_a = time.clock() + my_kernel_expm_pade = linalg.expm(my_graph_500_adj*0.01) + time_end_a = time.clock() + print('expm_pade time:{}'.format(time_end_a - time_start_a)) + + assert allclose(my_kernel_expm_eig, my_kernel_expm_pade.A) + + def test_ed_vs_get_kernel_with_sparse(self): + my_graph_500_adj = csc_matrix(my_graph_500.get_adjacency().data) + + time_start = time.clock() + my_kernel_ed = ed(my_graph_500_adj, alpha=0.01) + print('kernel with function "ed":{}'.format(time.clock() - time_start)) + time_start_a = time.clock() + + my_kernel_ed_using_get_kernel = get_kernel(my_graph_500,method='ed', alpha=0.01).matrix + print('kernel with function "get_kernel":{}'.format(time.clock() - time_start_a)) + + assert allclose(my_kernel_ed.A, my_kernel_ed_using_get_kernel.A) + + def test_led_vs_get_kernel_with_sparse(self): + my_graph_500_lap = csc_matrix(my_graph_500.laplacian()) + time_start = time.clock() + my_kernel_led = led(my_graph_500_lap, alpha=0.01) + print('kernel with function "led":{}'.format(time.clock() - time_start)) + time_start_a = time.clock() + my_kernel_led_using_get_kernel = get_kernel(my_graph_500,method='led', alpha=0.01).matrix + print('kernel with function "get_kernel":{}'.format(time.clock() - time_start_a)) + + assert allclose(my_kernel_led.A, my_kernel_led_using_get_kernel.A) + + def test_normalize_kernel_inplace(self): + my_graph_kernel = get_kernel(my_graph,method='ed',alpha=0.01) + my_graph_kernel_normalized_matrix = normalize_kernel(my_graph_kernel.matrix) + my_graph_kernel.normalize(inplace=True) + assert allclose(my_graph_kernel.matrix.A, my_graph_kernel_normalized_matrix.A) + + def test_normalize_kernel_property(self): + """ K is normalized if and only if Kii = 1 para todo i en {1, ..., n}. + """ + my_graph_kernel = get_kernel(my_graph, method='ed', alpha=0.01) + my_graph_kernel_n = my_graph_kernel.normalize(inplace=False) + ones_vector = np.ones(my_graph_kernel_n.shape[0]) + + assert allclose(np.diag(my_graph_kernel_n.matrix.todense()), ones_vector) + + + def test_center_kernel_property(self): + """ A symmetric kernel matrix K is centered if and only if Ke = 0. + Where 'e' is a column vector all of whose elements are 1 (e = [1,1,...,1].T) and + '0' is a column vector all of whose elements are zero. + """ + my_graph_kernel = get_kernel(my_graph, method='ed', alpha=0.01) + my_graph_kernel_c = my_graph_kernel.center(inplace=False) + e_vector = np.ones((my_graph_kernel.shape[0], 1)) + zero_vector = np.zeros((my_graph_kernel.shape[0], 1)) + + #print(my_graph_kernel_c.matrix.dot(e_vector)) + #print(zero_vector) + + assert allclose(my_graph_kernel_c.matrix.dot(e_vector), zero_vector) + + def test_center_kernel_inplace(self): + my_graph_kernel = get_kernel(my_graph,method='ed',alpha=0.01) + my_graph_kernel_centered_matrix = center_kernel(my_graph_kernel.matrix) + my_graph_kernel.center(inplace=True) + assert allclose(my_graph_kernel.matrix.A, my_graph_kernel_centered_matrix.A) + + def test_as_distance_kernel_inplace(self): + my_graph_kernel = get_kernel(my_graph,method='ed',alpha=0.01) + my_graph_kernel_distances = to_distance(my_graph_kernel.matrix,is_normalized=False) + my_graph_kernel.as_distance(inplace=True) + assert allclose(my_graph_kernel.matrix.A, my_graph_kernel_distances.A) + + def test_GraphKernel_head(self): + my_graph_kernel = get_kernel(my_graph, method='ed', alpha=0.01) + assert my_graph_kernel.head(n=5).shape == (5,5) + + def test_is_sequence_when_is_true(self): + some_array = [5,6,7,8] + assert is_sequence(some_array) + + def test_is_sequence_when_is_false(self): + some_array = [5,8,3,2] + assert not is_sequence(some_array) + + def test_subset_csc_with_rowIndices_as_sequence(self): + some_csc_matrix = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5], [1,2,3]]) + row_indices = [1,2,3] + col_indices = [0,2] + expected_output = csc_matrix([[0,3],[4,5],[1,3]]) + output = subset_csc(some_csc_matrix,rows_indices=row_indices,cols_indices=col_indices) + assert allclose(expected_output.A, output.A) + + def test_subset_csc_with_colIndices_as_sequence(self): + some_csc_matrix = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5], [1,2,3]]) + row_indices = [0,2] + col_indices = [0,1,2] + expected_output = csc_matrix([[1,2,0],[4,0,5]]) + output = subset_csc(some_csc_matrix,rows_indices=row_indices,cols_indices=col_indices) + assert allclose(expected_output.A, output.A) + + def test_subset_with_rows_and_columns_not_as_sequence(self): + some_csc_matrix = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5], [1,2,3]]) + row_indices = [1,3] + col_indices = [0,2] + expected_output = csc_matrix([[0,3],[1,3]]) + output = subset_csc(some_csc_matrix,rows_indices=row_indices,cols_indices=col_indices) + assert allclose(expected_output.A, output.A) + + def test_subset_with_rows_and_columns_as_sequence(self): + some_csc_matrix = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5], [1,2,3]]) + row_indices = [1,2,3] + col_indices = [0,1,2] + expected_output = csc_matrix([[0,0,3],[4,0,5],[1,2,3]]) + output = subset_csc(some_csc_matrix,rows_indices=row_indices,cols_indices=col_indices) + assert allclose(expected_output.A, output.A) + + def test_subset_csc_when_IndexError_in_rows(self): + some_csc_matrix = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5], [1,2,3]]) + row_indices = [2,4] + col_indices = [0,1,2] + + with self.assertRaises(IndexError): + subset_csc(some_csc_matrix,row_indices,col_indices) + + def test_subset_csc_when_IndexError_in_cols(self): + some_csc_matrix = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5], [1,2,3]]) + row_indices = [1,2,3] + col_indices = [0,3] + + with self.assertRaises(IndexError): + subset_csc(some_csc_matrix,row_indices,col_indices) + + def test_GraphKernel_subset(self): + my_graph_kernel = get_kernel(my_graph, method='ed', alpha=0.01) + row_indices = [0,1,2,3] + col_indices = [0,1,2] + print(my_graph_kernel.shape) + assert allclose(my_graph_kernel.subset(rows=row_indices,cols=col_indices).A, + subset_csc(my_graph_kernel.matrix,rows_indices=row_indices,cols_indices=col_indices).A) + + diff --git a/tests/test_pathways.py b/tests/test_pathways.py index 986f2fe9035d7f68d74e26e091038a42eaab64cf..d5ef05b6780840274256c69b960e3d658c6a6ad5 100644 --- a/tests/test_pathways.py +++ b/tests/test_pathways.py @@ -7,7 +7,7 @@ import unittest from click.testing import CliRunner -from pathways import pathways +#from pathways import pathways from pathways import cli diff --git a/tests/test_pathways_relevance.py b/tests/test_pathways_relevance.py new file mode 100644 index 0000000000000000000000000000000000000000..63d4a7958e855579e971a4d464704259c5b75dc4 --- /dev/null +++ b/tests/test_pathways_relevance.py @@ -0,0 +1,76 @@ +import unittest +import warnings +from pathways.graph import create_global_graph +from pathways.pathways_relevance import random_means_for_subset +from pathways.kernel import get_kernel +import pandas as pd +import numpy as np +import time + + +class TestPathwaysRelevanceCase(unittest.TestCase): + """pathways.kernel test cases""" + + def setUp(self): + """Create simple graph object""" + global my_graph, sample_ids, snp_ids, gene_ids, \ + snp_sample_edges, snp_gene_edges, gene_gene_edges, \ + snp_sample_binary, snp_gene_binary, gene_gene_df + + sample_ids = ['sample1','sample2','sample3','sample4','sample5'] + snp_ids = ['snp1','snp2','snp3','snp4'] + gene_ids = ['gene1','gene2','gene3','gene4','gene5'] + + snp_sample_edges = [('snp1','sample1'), + ('snp2','sample2'), + ('snp3','sample3'), + ('snp4','sample4'), + ('snp4','sample5')] + snp_gene_edges = [('snp1','gene1'), + ('snp2','gene1'), + ('snp3','gene2'), + ('snp4','gene3')] + + gene_gene_edges = [('gene1','gene5'), + ('gene2','gene4'), + ('gene3','gene4')] + + data_zeros = np.zeros((len(snp_ids), len(sample_ids))) + + snp_sample_binary = pd.DataFrame(data_zeros, index=snp_ids, columns=sample_ids) + for snp_sample_edge in snp_sample_edges: + snp_sample_binary.loc[snp_sample_edge[0],snp_sample_edge[1]] = 1 + + data_zeros = np.zeros((len(snp_ids), len(gene_ids))) + + snp_gene_binary = pd.DataFrame(data_zeros, index=snp_ids, columns=gene_ids) + for snp_gene_edge in snp_gene_edges: + snp_gene_binary.loc[snp_gene_edge[0],snp_gene_edge[1]] = 1 + + gene_gene_df = pd.DataFrame(gene_gene_edges, columns=list('AB'), index=np.arange(0,len(gene_gene_edges))) + + my_graph = create_global_graph(snp_sample=snp_sample_binary, + snp_gene=snp_gene_binary, + gene_gene=gene_gene_df) + + def ignore_warnings(test_func): + def do_test(self, *args, **kwargs): + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + test_func(self, *args, **kwargs) + + return do_test + + def tearDown(self): + """Teardown.""" + pass + + def test_permutation_scores(self): + my_graph_kernel = get_kernel(my_graph,method='ed',alpha=0.01) + sample_gene_submatrix = my_graph_kernel.subset(my_graph.indices(sample_ids),my_graph.indices(gene_ids)) + gene_set_size = 3 + + start = time.clock() + gene_set_permutation_scores = random_means_for_subset(sample_gene_submatrix.todense(), gene_set_size, permutations=10) + end = time.clock() + print('Took {} seconds'.format(end - start))