From f279f5b1ae44f7bc977dac9848eadef9ac95e938 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Fri, 12 Jan 2018 16:36:56 +0900 Subject: [PATCH] Added network inference files --- .gitignore | 2 + EvaluationMetrics.py | 68 +++++ LinkPrediction.py | 429 +++++++++++++++++++++++++++++ LinkPredictionResults.py | 124 +++++++++ NetworkInference.py | 576 +++++++++++++++++++++++++++++++++++++++ NetworkInferencePlots.py | 122 +++++++++ NetworkStatistics.py | 236 ++++++++++++++++ 7 files changed, 1557 insertions(+) create mode 100644 EvaluationMetrics.py create mode 100644 LinkPrediction.py create mode 100644 LinkPredictionResults.py create mode 100644 NetworkInference.py create mode 100644 NetworkInferencePlots.py create mode 100644 NetworkStatistics.py diff --git a/.gitignore b/.gitignore index 21e29ac..e7d996a 100755 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,5 @@ .DS_Store? ._* *.idea/ + +Qiery Parser.py diff --git a/EvaluationMetrics.py b/EvaluationMetrics.py new file mode 100644 index 0000000..1e60cb2 --- /dev/null +++ b/EvaluationMetrics.py @@ -0,0 +1,68 @@ +######################################################################################### +# EvaluationMetrics.py +# Purpose: script contains methods for evaluating link prediction algorithm performance +# version 1.0 +# date: 01.28.2017 +######################################################################################### + + +# import module/script dependencies +import random +import heapq + + + +def AUC(nonexist_scores, missing_scores): + ''' + Function calculates the probability that a randomly chosen missing link is given a higher score than a randomly + chosen nonexistent link using procedures described by Lu & Zhou (2010) + (doi:http://dx.doi.org/10.1016/j.physa.2010.11.027T). The function takes a list containing missing, non-existent + edges, and predictions, and returns an AUC score. Current method is currently set to perform 1,000 comparisons. + :param nonexist_scores: list of nonexistent edges and scores + :param missing_scores: list of test edges and scores + :return: an integer which is the AUC score for that comparison + ''' + + # comparisons = len(nonexist_scores)*len(missing_scores) #HOW MANY?? + comparisons = 1000 + count = 0.0 + + for i in xrange(comparisons): + TN = random.sample(nonexist_scores.values(), 1) + TP = random.sample(missing_scores.values(), 1) + + if TP > TN: + count += 1.0 + if TP == TN: + count += 0.5 + + auc = count/comparisons + + return auc + + + +def KPrecision(auc, scores, testing_edges): + ''' + Function calculates the ratio of relevant items selected from the top n items using procedures described by Lu & + Zhou (2010) (doi:http://dx.doi.org/10.1016/j.physa.2010.11.027T). The function takes a list containing missing and + non-existent edges, the list of all predictions, and returns a top k-precision score for 20% of scores. 
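+
+    Illustrative example (hypothetical scores; with an AUC of 0.8 the top of the list is used,
+    the top 20% here is a single edge, and that edge is a true test edge, so precision is 1.0):
+
+        scores = {('a', 'b'): 0.9, ('a', 'c'): 0.1, ('b', 'c'): 0.5,
+                  ('b', 'd'): 0.2, ('c', 'd'): 0.4}
+        KPrecision(0.8, scores, [('a', 'b')])  # -> 1.0
+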
+ :param auc: integer representing AUC score - to indicate whether the top or bottom of list should be assessed + :param scores: list of test edges and scores + :param testing_edges: list of nonexistent edges and scores + :return: an integer which is the precision for the top number of selected links + ''' + + #get 20% of edges + links = [int(len(scores)*0.20) if int(len(scores)*0.20) >= 1 else 1][0] + + if auc < 0.5: + # pred_list = heapq.nsmallest(links, set([x[1] for x in scores.items()])) #returns n highest likelihood scores + pred_list = heapq.nsmallest(links, scores, key=lambda k: scores[k]) + else: + # pred_list = heapq.nlargest(links, set([x[1] for x in scores.items()])) #returns n highest likelihood scores + pred_list = heapq.nlargest(links, scores, key=lambda k: scores[k]) + + y_pred = [0 if (i[0], i[1]) in testing_edges else 0 if (i[1], i[0]) in testing_edges else 1 for i in pred_list] + + return float(y_pred.count(0))/links \ No newline at end of file diff --git a/LinkPrediction.py b/LinkPrediction.py new file mode 100644 index 0000000..682c629 --- /dev/null +++ b/LinkPrediction.py @@ -0,0 +1,429 @@ +########################################################################################## +# LinkPrediction.py - http://www.research.rutgers.edu/~ss2078/papers/LinkPrediction.pdf +# Purpose: script contains methods for 10 link prediction algorithms +# version 1.1.0 +# date: 01.28.2017 +########################################################################################## + + +# import module/script dependencies +import networkx as nx +import numpy as np +import random +import six + + + +def DegreeProduct(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Degree Product for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + + scores[edge] = (nx.degree(graph, i) * nx.degree(graph, j)) + + return scores + + +def CommonNeighbors(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Common Neighbors for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + scores[edge] = float(len(n_intersection)) + + return scores + + +def Jaccard(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Jaccard for these edges given the + structure of the graph. 
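+
+    Illustrative example (toy 4-node path graph; nodes 0 and 2 share neighbour 1 out of the
+    union {1, 3}, giving 1/2):
+
+        G = nx.path_graph(4)
+        Jaccard(G, [(0, 2)])  # -> {(0, 2): 0.5}
+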
+ :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + n_union = set(graph[i].keys()).union(set(graph[j].keys())) + scores[edge] = float(len(n_intersection))/len(n_union) + + return scores + + +def Sorensen(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Sorenson Similarity for these edges + given the structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + n_degree = graph.degree(i) + graph.degree(j) + scores[edge] = float(len(n_intersection))/n_degree + + return scores + + +def LHN(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Leicht-Holme-Newman for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + n_degree = graph.degree(i) * graph.degree(j) + scores[edge] = float(len(n_intersection))/n_degree + + return scores + + +def ShortestPath(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the shortest path for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + + if nx.has_path(graph, i, j): + scores[edge] = 1.0/len(nx.shortest_path(graph, i, j)) + + else: + scores[edge] = 0.0 + + return scores + + +def ResourceAllocation(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the resource allocation for these edges + given the structure of the graph. 
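+
+    Illustrative example (toy 4-node path graph; the single common neighbour of nodes 0 and 2
+    has degree 2, so the score is 1/2):
+
+        G = nx.path_graph(4)
+        ResourceAllocation(G, [(0, 2)])  # -> {(0, 2): 0.5}
+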
+ :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + w = [] + n_intersection = i_neighbors.intersection(j_neighbors) + + for c in n_intersection: + w.append(graph.degree(c)) + scores[edge] = 1.0/np.sum(w) + + return scores + + +def AdamicAdvar(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Adamic Advar for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + w = [] + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + + for c in n_intersection: + w.append(graph.degree(c)) + scores[edge] = 1.0/np.log(np.sum(w)) + + return scores + + +##for the following algorithms parameter values were chosen to be consistent with: +#Liben-Nowell D, Kleinberg J. The link-prediction problem for social networks. Journal of the American society for information science and technology. + +def katz(G, beta=0.001, max_power=5, weight=None, dtype=None): #https://github.com/rafguns/linkpred/blob/master/linkpred/predictors/path.py + """Predict by Katz (1953) measure + Let 'A' be an adjacency matrix for the directed network `G`. + Then, each element 'a_{ij}' of 'A^k' (the `k`-th power of `A`) has a + value equal to the number of walks with length `k` from `i` to `j`. + The probability of a link rapidly decreases as the walks grow longer. + Katz therefore introduces an extra parameter (here beta) to weigh + longer walks less. + Parameters + ---------- + beta : a float + the value of beta in the formula of the Katz equation + max_power : an int + the maximum number of powers to take into account + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. 
+ dtype : a data type + data type of edge weights (default numpy.int32) + """ + ineligible = G.edges() + nodelist = G.nodes() + adj = nx.to_scipy_sparse_matrix(G, dtype=np.int32, weight=weight) + res = {} + + for k in range(1, max_power + 1): + matrix = (adj ** k).tocoo() + for i, j, d in zip(matrix.row, matrix.col, matrix.data): + if i == j: + continue + u, v = nodelist[i], nodelist[j] + # if nx.has_path(G, u, v): + if (u,v) not in ineligible: + w = d * (beta ** k) + res[(u,v)] = 0.0 + res[(u,v)] += w + # if not G.is_directed(): + # We count double in case of undirected networks ((i, j) and (j, i)) + for pair in res: + res[pair]/= 2 + + return res + + +def raw_google_matrix(G, nodelist=None, weight=None): + """Calculate the raw Google matrix (stochastic without teleportation); taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py""" + M = nx.to_numpy_matrix(G, nodelist=nodelist, dtype=np.float32, weight=weight) + n, m = M.shape # should be square + assert n == m and n > 0 + # Find 'dangling' nodes, i.e. nodes whose row's sum = 0 + dangling = np.where(M.sum(axis=1) == 0) + # add constant to dangling nodes' row + for d in dangling[0]: + M[d] = 1.0 / n + # Normalize. We now have the 'raw' Google matrix (cf. example on p. 11 of + # Langville & Meyer (2006)). + M = M / M.sum(axis=1) + return M + + +def SimRank(G, c=0.8, num_iterations=10, weight=None): + """Predict using SimRank; taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py + .. math :: + sim(u, v) = \frac{c}{|N(u)| \cdot |N(v)|} \sum_{p \in N(u)} + \sum_{q \in N(v)} sim(p, q) + where 'N(v)' is the set of neighbours of node 'v'. + Parameters + ---------- + c : float, optional + decay factor, determines how quickly similarity decreases + num_iterations : int, optional + number of iterations to calculate + weight: string or None, optional + If None, all edge weights are considered equal. + Otherwise holds the name of the edge attribute used as weight. + """ + #set-up inital variables + res = {} + nodelist = G.nodes() + ineligible = G.edges() + n = len(G) + M = raw_google_matrix(G, nodelist=nodelist, weight=weight) + sim = np.identity(n, dtype=np.float32) + + for i in range(num_iterations): + temp = c * M.T * sim * M + sim = temp + np.identity(n) - np.diag(np.diag(temp)) + + (m, n) = sim.shape + assert m == n + + for i in range(m): + # sim(a, b) = sim(b, a), leading to a 'mirrored' matrix. + # We start the column range at i + 1, such that we only look at the + # upper triangle in the matrix, excluding the diagonal: + # sim(a, a) = 1. + u = nodelist[i] + for j in range(i + 1, n): + if sim[i, j] > 0: + v = nodelist[j] + if (u, v) not in ineligible: + res[(u, v)] = 0.0 + res[(u, v)] = sim[i, j] + return res + + +def RPR(G, alpha=0.15, beta=0): + """Return the rooted PageRank of all nodes with respect to node 'root' + taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py + Parameters + ---------- + G : a networkx.(Di)Graph + network to compute PR on + root : a node from the network + the node that will be the starting point of all random walks + alpha : float + PageRank probability that we will advance to a neighbour of the + current node in a random walk + beta : float or int + Normally, we return to the root node with probability 1 - alpha. + With this parameter, we can also advance to a random other node in the + network with probability beta. Thus, we get back to the root node with + probability 1 - alpha - beta. 
This is off (0) by default. + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. + """ + #set default variables + weight = None + res = {} #stores results + eligible_node = G.nodes() + personalization = dict.fromkeys(G, beta) + + for u in G.nodes(): + personalization[u] = 1 - beta + pagerank_scores = nx.pagerank_scipy(G, alpha, personalization, weight=weight) + + for v, w in six.iteritems(pagerank_scores): + if w > 0 and u != v and v in eligible_node: + res[(u, v)] = 0.0 + res[(u, v)] += w + + return res + + +# def simrank(G, c=0.8, num_iterations= 10): +# r"""Calculate SimRank matrix for nodes in nodelist; taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py +# SimRank is defined as: +# .. math :: +# sim(u, v) = \frac{c}{|N(u)| |N(v)|} \sum_{p \in N(u)} +# \sum_{q \in N(v)} sim(p, q) +# Parameters +# ---------- +# G : a networkx.Graph +# network +# nodelist : collection of nodes, optional +# nodes to calculate SimRank for (default: all) +# c : float, optional +# decay factor, determines how quickly similarity decreases +# num_iterations : int, optional +# number of iterations to calculate +# weight: string or None, optional +# If None, all edge weights are considered equal. +# Otherwise holds the name of the edge attribute used as weight. +# """ +# labels = nx.convert_node_labels_to_integers(G, first_label=0, ordering='default', label_attribute='names') # stores the node label +# +# nodelist = None +# weight = None +# n = len(labels) +# M = raw_google_matrix(labels, nodelist=nodelist, weight=weight) +# sim = np.identity(n, dtype=np.float32) +# for i in range(num_iterations): +# temp = c * M.T * sim * M +# sim = temp + np.identity(n) - np.diag(np.diag(temp)) +# +# sim_res = nx.from_numpy_matrix(sim) +# sim_res = nx.Graph(sim_res) +# sim_res.remove_edges_from(sim_res.selfloop_edges()) +# +# #save edges to dictionary +# sim_edges = {} +# for edge in sim_res.edges(data=True): +# node0 = labels.node[edge[0]]['names'] +# node1 = labels.node[edge[1]]['names'] +# sim_edges[(node0, node1)] = edge[2]['weight'] +# +# return sim_edges diff --git a/LinkPredictionResults.py b/LinkPredictionResults.py new file mode 100644 index 0000000..cc7e707 --- /dev/null +++ b/LinkPredictionResults.py @@ -0,0 +1,124 @@ +####################################################################################################### +# LinkPredictionResults.py +# Purpose: script runs 10 link prediction algorithms on training and testing network data in parallel +# version 1.2.0 +# date: 01.28.2017 +####################################################################################################### + + +# import module/script dependencies +import networkx as nx +import numpy as np +from collections import Counter +import operator +import LinkPrediction +import csv + + +def LabelDict(results, id, var): + ''' + Function takes a json file of results (containing ice ids and labels) and two variables storing the id and labels + and returns a dictionary where the keys are the id and the values are the corresponding labels. 
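+
+    Illustrative example (hypothetical SPARQL-style JSON result; the variable names 'ice' and
+    'label' are assumed here):
+
+        results = {'results': {'bindings': [
+            {'ice': {'value': 'http://example.org/ICE_001'},
+             'label': {'value': 'trametinib'}}]}}
+        LabelDict(results, 'ice', 'label')  # -> {'ICE_001': 'trametinib'}
+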
+ :param results: json file of ice ids and labels + :param id: variable storing ids + :param var: variable storing labels + :return: + ''' + + label_dict = {} + + for res in results['results']['bindings']: + label_dict[str(res[str(id)]['value'].split('/')[-1]).encode('utf8')] = str( + res[str(var)]['value'].encode('utf8')) + + return label_dict + + +def EdgeChecker(scores, edges): + ''' + Function takes a dictionary of edges (keys) and scores (values) and a list of edges. Using the intersection of the + edges and keys edges a new dictionary is created. + :param scores: dictionary of edges (keys) and scores (values) + :param edges: list of tuples + :return: dictionary of tuples (keys) and scores (values) + ''' + final_dict = {} + + for edge in set(edges).intersection(set(scores.keys())): + final_dict[edge] = scores[edge] + + return final_dict + + + +def main(): + + #read in graphs + owl_graph = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml').to_undirected() + nets_graph = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() + mid_graph = nx.read_gml('Network_Data/Trametinib_query_PART_network').to_undirected() + + #run link predictions for each graph + nets_scores = LinkPrediction.katz(nets_graph, beta=0.001, max_power=5, weight=None, dtype=None) + nets_nonexist = list(nx.non_edges(nets_graph)) + nets_preds = EdgeChecker(nets_scores, nets_nonexist) + + owl_nonexist = list(nx.non_edges(owl_graph)) + owl_scores = LinkPrediction.RPR(owl_graph, alpha = 0.15, beta = 0) + owl_preds = EdgeChecker(owl_scores, owl_nonexist) + + #explore predictions + len(nets_preds) #1652 + np.min(nets_preds.values()) + np.mean(nets_preds.values()) + np.max(nets_preds.values()) + + #print top 20 edges + sorted(Counter(sorted(nets_preds.values())).items(), key=lambda i: i[0]) #get distribution of counts + sorted_scores = sorted(owl_preds.items(), key=operator.itemgetter(1), reverse=True) #biggest first + sorted_scores = sorted(nets_preds.items(), key=operator.itemgetter(1), reverse=False) #smallest first + + #investigate the top n items + edges = sorted_scores[0:20] + + + ## Write results for use with with ranking methods + + # graphs + graph = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml').to_undirected() + + # graph = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() + + graph = nx.read_gml('Network_Data/DDI_reactome_query_NETS_network.gml').to_undirected() + + + methods = [LinkPrediction.DegreeProduct(graph, list(nx.non_edges(graph))), + LinkPrediction.ShortestPath(graph, list(nx.non_edges(graph))), + LinkPrediction.CommonNeighbors(graph, list(nx.non_edges(graph))), + LinkPrediction.AdamicAdvar(graph, list(nx.non_edges(graph))), + LinkPrediction.Jaccard(graph, list(nx.non_edges(graph))), + LinkPrediction.LHN(graph, list(nx.non_edges(graph))), + LinkPrediction.ResourceAllocation(graph, list(nx.non_edges(graph))), + LinkPrediction.Sorensen(graph, list(nx.non_edges(graph))), + LinkPrediction.katz(graph, beta=0.001, max_power=5, weight=None, dtype=None), + LinkPrediction.RPR(graph, alpha = 0.15, beta = 0)] + + + # method counter for labeling csv files + count = 0 + + for method in methods: + updated_res = EdgeChecker(method, list(nx.non_edges(graph))) + + with open('Results/DDI_reactome/NETS_DDI ' + str(count) + '.csv', 'wb') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) + for key, values in updated_res.items(): + writer.writerow([key, values]) + + count += 1 + + + + +if __name__ == '__main__': + 
main() \ No newline at end of file diff --git a/NetworkInference.py b/NetworkInference.py new file mode 100644 index 0000000..0a8168c --- /dev/null +++ b/NetworkInference.py @@ -0,0 +1,576 @@ +####################################################################################################### +# NetworkInference.py +# Purpose: script runs 10 link prediction algorithms on training and testing network data in parallel +# version 1.1.0 +# date: 01.28.2017 +####################################################################################################### + + +# import module/script dependencies +import multiprocessing +from datetime import datetime +from functools import partial +import networkx as nx +import random +import json +import EvaluationMetrics +import LinkPrediction + + + +def GraphMaker(graph, percent): + ''' + Function takes a Networkx graph object and percent of edges to sample. The function creates a training graph by + randomly sampling a certain percent of edges to remove from the full graph. The percent of randomly sampled edges + is stored as a list. + :param graph: networkx graph object + :param percent: an integer of edges to sample + :return: training_graph - network graph with randomly sampled edges removed; testing_edges - list of randomly + sampled edges + ''' + + training = set(random.sample(graph.edges(), int(nx.number_of_edges(graph) * percent))) + testing_edges = set(set(graph.edges()) - training) + + if len(training) + len(testing_edges) != len(graph.edges()): #verify that training graph/testing edges are correct + raise ValueError('# of training + testing edges != total # of edges in graph') + + # training graph + training_graph = nx.Graph() + + # original graph nodes + training_graph.add_nodes_from(graph.nodes()) + + # add training edges between nodes + training_graph.add_edges_from(training) + + if len(training_graph.nodes()) != len(graph.nodes()): #verify training graph contains original graph + raise ValueError('Training graph does not contain all of the original graph nodes') + + return training_graph, testing_edges + + +def DPFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Degree Product scoring function over each sampled network for the specified number of + iterations. 
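+
+    Illustrative usage (hypothetical small graph; 5 iterations, with 50% of edges sampled into
+    the training graph on each iteration):
+
+        G = nx.karate_club_graph()
+        auc_scores, precisions = DPFracAUC(G, list(nx.non_edges(G)), 5, 0.5)
+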
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.DegreeProduct(training_graph, testing_edges) + nonexist_scores = LinkPrediction.DegreeProduct(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def SPFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Shortest Path scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.ShortestPath(training_graph, testing_edges) + nonexist_scores = LinkPrediction.ShortestPath(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def CNFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Common Neighbors scoring function over each sampled network for the specified number of + iterations. 
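+
+    Note: when this function is dispatched through pool.map in main(), `steps` receives a single
+    sampling fraction (one element of the step list) per call, and that value is passed directly
+    to GraphMaker.
+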
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.CommonNeighbors(training_graph, testing_edges) + nonexist_scores = LinkPrediction.CommonNeighbors(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def JFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Jaccard scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.Jaccard(training_graph, testing_edges) + nonexist_scores = LinkPrediction.Jaccard(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def SSFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Sorenson Similarity scoring function over each sampled network for the specified number of + iterations. 
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.Sorensen(training_graph, testing_edges) + nonexist_scores = LinkPrediction.Sorensen(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def LHNFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Leicht-Holme-Newman scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.LHN(training_graph, testing_edges) + nonexist_scores = LinkPrediction.LHN(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def AAFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Adamic Advar scoring function over each sampled network for the specified number of + iterations. 
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.AdamicAdvar(training_graph, testing_edges) + nonexist_scores = LinkPrediction.AdamicAdvar(training_graph, nonexist_edges) + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def RAFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Resource Allocaiton scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.ResourceAllocation(training_graph, testing_edges) + nonexist_scores = LinkPrediction.ResourceAllocation(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def KFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Katz scoring function over each sampled network for the specified number of + iterations. 
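+
+    Note: LinkPrediction.katz only scores eligible node pairs joined by at least one walk of
+    length <= max_power in the training graph, so the AUC loop below treats sampled pairs that
+    are absent from the score dictionary as having a score of 0.0.
+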
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + scores = LinkPrediction.katz(training_graph, beta=0.001, max_power=5, weight=None, dtype=None) + + #get AUC + count = 0.0 + for i in xrange(1000): + TN = random.sample(nonexist_edges, 1)[0] + TP = random.sample(testing_edges, 1)[0] + + if (TN[0], TN[1]) in scores.keys(): + TN_val = scores[(TN[0], TN[1])] + else: TN_val = 0.0 + + if (TP[0], TP[1]) in scores.keys(): + TP_val = scores[(TP[0], TP[1])] + else: TP_val = 0.0 + + if TP_val > TN_val: + count += 1.0 + if TP_val == TN_val: + count += 0.5 + + auc_round = count/1000 + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges) + prec.append(prec_round) + + return auc, prec + + +def SFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the SimRank scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + scores = LinkPrediction.SimRank(training_graph, c=0.8, num_iterations= 10) + + # get AUC + count = 0.0 + for i in xrange(1000): + TN = random.sample(nonexist_edges, 1)[0] + TP = random.sample(testing_edges, 1)[0] + + if (TN[0], TN[1]) in scores.keys(): + TN_val = scores[(TN[0], TN[1])] + else: + TN_val = 0.0 + + if (TP[0], TP[1]) in scores.keys(): + TP_val = scores[(TP[0], TP[1])] + else: + TP_val = 0.0 + + if TP_val > TN_val: + count += 1.0 + if TP_val == TN_val: + count += 0.5 + + auc_round = count/1000 + auc.append(auc_round) + + # precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges) + prec.append(prec_round) + + return auc, prec + + +def PRFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Rooted Page Rank scoring function over each sampled network for the specified number of + iterations. 
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + scores = LinkPrediction.RPR(training_graph, alpha=0.15, beta=0) + + # get AUC + count = 0.0 + for i in xrange(1000): + TN = random.sample(nonexist_edges, 1)[0] + TP = random.sample(testing_edges, 1)[0] + + if (TN[0], TN[1]) in scores.keys(): + TN_val = scores[(TN[0], TN[1])] + else: + TN_val = 0.0 + + if (TP[0], TP[1]) in scores.keys(): + TP_val = scores[(TP[0], TP[1])] + else: + TP_val = 0.0 + + if TP_val > TN_val: + count += 1.0 + if TP_val == TN_val: + count += 0.5 + + auc_round = count/1000 + auc.append(auc_round) + + # precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges) + prec.append(prec_round) + + return auc, prec + + +def main(): + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + + #specify initial arguments for all functions + manager = multiprocessing.Manager() + network = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() + nonexist_edges = manager.list(list(nx.non_edges(network))) + nonexist_edges = list(nx.non_edges(network)) # non-existent edges in graph + iterations = 100 + steps = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95] + file = 'Results/Trametinib/NETS_Tram_' + + pool = multiprocessing.Pool(processes=4) # set up pool + + #Degree Product + func = partial(DPFracAUC, network, nonexist_edges, iterations) + DPres = pool.map(func, steps) + print 'Finished running Degree Product' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'DP.json', 'w') as fout: + json.dump(DPres, fout) + + #Shortest Path + func2 = partial(SPFracAUC, network, nonexist_edges, iterations) + SPres = pool.map(func2, steps) + print 'Finished running Shortest Path' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'SP.json', 'w') as fout: + json.dump(SPres, fout) + + #Common Neighbors + func3 = partial(CNFracAUC, network, nonexist_edges, iterations) + CNres = pool.map(func3, steps) + print 'Finished running Common Neighbors' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'CN.json', 'w') as fout: + json.dump(CNres, fout) + + #Jaccard + func4 = partial(JFracAUC, network, nonexist_edges, iterations) + Jres = pool.map(func4, steps) + print 'Finished running Jaccard Index' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'J.json', 'w') as fout: + json.dump(Jres, fout) + + #Sorensen Similarity + func5 = partial(SSFracAUC, network, nonexist_edges, iterations) + SSres = pool.map(func5, steps) + print 'Finished running Sorensen Similarity' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'SS.json', 'w') as fout: + json.dump(SSres, fout) + + 
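+    # The per-algorithm blocks below repeat the same partial / pool.map / json.dump pattern;
+    # a table-driven loop could express them more compactly (sketch only, left commented out
+    # so the explicit blocks below remain the ones that run):
+    # for suffix, fn in [('LHN', LHNFracAUC), ('AA', AAFracAUC), ('RA', RAFracAUC),
+    #                    ('K', KFracAUC), ('RPR', PRFracAUC)]:
+    #     res = pool.map(partial(fn, network, nonexist_edges, iterations), steps)
+    #     with open(str(file) + suffix + '.json', 'w') as fout:
+    #         json.dump(res, fout)
+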
#Leicht-Holme-Newman + func6 = partial(LHNFracAUC, network, nonexist_edges, iterations) + LHNres = pool.map(func6, steps) + print 'Finished running Leicht-Holme-Newman' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'LHN.json', 'w') as fout: + json.dump(LHNres, fout) + + #Adamic Advar + func7 = partial(AAFracAUC, network, nonexist_edges, iterations) + AAres = pool.map(func7, steps) + print 'Finished running Adamic Advar' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'AA.json', 'w') as fout: + json.dump(AAres, fout) + + #Resource Allocation + func8 = partial(RAFracAUC, network, nonexist_edges, iterations) + RAres = pool.map(func8, steps) + print 'Finished running Resource Allocation' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'RA.json', 'w') as fout: + json.dump(RAres, fout) + + #Katz + func9 = partial(KFracAUC, network, nonexist_edges, iterations) + Kres = pool.map(func9, steps) + print 'Finished running Katz' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'K.json', 'w') as fout: + json.dump(Kres, fout) + + # #Simrank + # func10 = partial(SFracAUC, network, nonexist_edges, iterations) + # Sres = pool.map(func10, steps) + # print 'Finished running SimRank' + # # write dictionary to json file + # with open(str(file) + 'SR.json', 'w') as fout: + # json.dump(Sres, fout) + + # Rooted Page Rank + func11 = partial(PRFracAUC, network, nonexist_edges, iterations) + RPRres = pool.map(func11, steps) + print 'Finished running Rooted Page Rank' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + # write dictionary to json file + with open(str(file) + 'RPR.json', 'w') as fout: + json.dump(RPRres, fout) + + + pool.close() + pool.join() + + print str('Finished running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/NetworkInferencePlots.py b/NetworkInferencePlots.py new file mode 100644 index 0000000..b071dfb --- /dev/null +++ b/NetworkInferencePlots.py @@ -0,0 +1,122 @@ +############################################################################################## +# NetworkInferencePlots.py +# Purpose: script generates AUC plots to compare performance of network inference algorithms +# version 1.0.8 +# date: 01.28.2017 +############################################################################################## + + +# import module/script dependencies +import json +import numpy as np +import matplotlib.pyplot as plt + + + +def ResultParser(file): + ''' + Function takes as input a file that stores a list of lists (length of steps) and outputs a list of the average AUC + and precision values across n steps. 
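+
+    Illustrative layout of the expected JSON (hypothetical values; one entry per sampling step,
+    each holding the per-iteration AUC scores and precisions written by NetworkInference.py):
+
+        [[[0.71, 0.69], [0.20, 0.25]],
+         [[0.75, 0.74], [0.30, 0.20]]]
+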
+ :param file: string storing the json file containing link prediction results + :return: list storing the average AUC values across n steps + ''' + + auc_results = []; prec_results = []; auc = []; prec = []; auc_mod = [] + + # open the json file + with open(file) as json_data: + results = json.load(json_data) + + for i in results: + if len(i) == 2: + auc_results.append(i[0]) + auc = [np.mean(x) for x in auc_results] + prec_results.append(i[1]) + prec = [np.mean(x) for x in prec_results] + else: + auc_results.append(i) + auc = [np.mean(x) for x in auc_results] + + for j in auc: + if j < 0.5: + auc_mod.append(1 - j) + else: + auc_mod.append(j) + + return auc_mod, prec + + + +def ResultPlotter(steps, data, xlabel, ylabel, title, output): + ''' + Function takes a list of data, plotting parameters and plots the data. The plot is saved according to the file + directory specified by the user. + :param steps: list storing the x-axis values + :param data: list of lists of AUC scores for plotting + :param xlabel: string storing x-axis label + :param ylabel: string storing y-axis label + :param title: string storing plot title + :param output: string storing information for saving plot + :return: plot is created and written to user-specified location + ''' + + plt1, = plt.plot(steps, data[0], color = 'purple', marker = '>', linestyle = ':', linewidth=2) + plt2, = plt.plot(steps, data[1], color = 'magenta', marker = 's', linestyle = ':',linewidth=2) + plt3, = plt.plot(steps, data[2], color = 'forestgreen', marker = 'o', linestyle = ':',linewidth=2) + plt4, = plt.plot(steps, data[3], color = 'orange', marker = 'd', linestyle = ':',linewidth=2) + plt5, = plt.plot(steps, data[4], color = 'gold', marker = '*', linestyle = ':',linewidth=2) + plt6, = plt.plot(steps, data[5], color = 'slategray', marker = '>', linestyle = ':',linewidth=2) + plt7, = plt.plot(steps, data[6], color = 'limegreen', marker = 's', linestyle = ':',linewidth=2) + plt8, = plt.plot(steps, data[7], color = 'cyan', marker = 'd', linestyle = ':',linewidth=2) + plt9, = plt.plot(steps, data[8], color = 'red', marker = '*', linestyle = ':',linewidth=2) + plt10, = plt.plot(steps, data[9], color = 'royalblue', marker = '>', linestyle = ':',linewidth=2) + # plt11, = plt.plot(steps, data[10], color='orange', marker='o', linestyle=':', linewidth=2) + + hline = plt.axhline(y=0.5, xmax=1, linestyle ='--', color='black') + # plt.legend([plt1, plt2, plt3, plt4, plt5, plt6, plt7, plt8, plt9, plt10, hline], + # ['Degree Product', 'Common Neighbors', 'Shortest Path', 'Jaccard Index', 'Sorenson Similarity', + # 'Leicht-Holme-Newman', 'Adamic Advar', 'Resource Allocation', 'Katz', 'Rooted Page Rank', + # 'Pure Chance'], loc='upper center', prop={'size': 10}, ncol=3) + + + plt.legend([plt1, plt2, plt3, plt4, plt5, plt6, plt7, plt8, plt9, plt10, hline], ['Degree Product', 'Common Neighbors', 'Shortest Path', 'Jaccard Index', 'Sorenson Similarity', 'Leicht-Holme-Newman', 'Adamic Advar', 'Resource Allocation', 'Katz', 'Rooted Page Rank', 'Pure Chance'],loc=9,prop={'size': 9}, bbox_to_anchor=(0.5, -0.15), ncol=3) + + plt.subplots_adjust(bottom=0.25) + #plt.tight_layout(pad=6.5) + plt.xlabel(xlabel, fontsize=14) + plt.ylabel(ylabel, fontsize=14) + plt.grid(True, color ='black') + plt.title(title, fontsize=18) + plt.ylim(0.45, 1.0) + plt.yticks(fontsize=12) + plt.yticks(fontsize=12) + + return plt.savefig(output, dpi=600) #save plot + + + +def main(): + #load results and format for plotting + file = 'Results/Trametinib/OWL_Tram' + dp_res = 
ResultParser(str(file) + '_DP.json') + sp_res = ResultParser(str(file) + '_SP.json') + cn_res = ResultParser(str(file) + '_CN.json') + j_res = ResultParser(str(file) + '_J.json') + ss_res = ResultParser(str(file) + '_SS.json') + lhn_res = ResultParser(str(file) + '_LHN.json') + aa_res = ResultParser(str(file) + '_AA.json') + ra_res = ResultParser(str(file) + '_RA.json') + k_res = ResultParser(str(file) + '_K.json') + # s_res = ResultParser(str(file) + '_SR.json') + rpr_res = ResultParser(str(file) + '_RPR.json') + # hrg_resa = ResultParser('Results/Trematinib/nice_trem_HRG_AUC.json') + # hrg_resp = ResultParser2('Results/Trematinib/nice_trem_HRG_PREC.json') + + #plot results + data = [dp_res[0], cn_res[0], sp_res[0], j_res[0], ss_res[0], lhn_res[0], aa_res[0], ra_res[0], k_res[0], rpr_res[0]] + steps = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95] + ResultPlotter(steps, data, 'Fraction of Observed Edges', 'Average AUC', '', 'Results/Trametinib/AUCplot_OWL.png') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/NetworkStatistics.py b/NetworkStatistics.py new file mode 100644 index 0000000..29dcf99 --- /dev/null +++ b/NetworkStatistics.py @@ -0,0 +1,236 @@ +#################################################################### +# NetworkStatistics.py +# Purpose: script explores different characteristics of a newtork +# version 1.1.0 +# date: 07.21.2017 +#################################################################### + + +# import module/script dependencies +import networkx as nx +import numpy as np +from collections import Counter +import matplotlib.pyplot as plt +from pylab import show +from powerlaw import plot_pdf, Fit, pdf +import powerlaw + + + +# produce network statistics +# read in graphs +trem_owl = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml') +trem_nets = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml') +ddi_owl = nx.read_gml('Network_Data/DDI_reactome_query_OWL_network.gml') +ddi_nets=nx.read_gml('Network_Data/DDI_reactome_query_NETS_network.gml') + +len(list(nx.non_edges(ddi_nets.to_undirected()))) + + +count = set +for node in trem_nets.nodes(data=True): + + + + +nx.number_connected_components(ddi_owl.to_undirected()) + +## Descriptives + +for i in [ddi_nets, ddi_owl]: + print 'Clustering Coefficient' + print nx.average_clustering(i.to_undirected()) + + print 'Number of Connected Components' + print nx.number_connected_components(i.to_undirected()) + + print 'Average Network Closeness (Centrality)' + print sum(nx.closeness_centrality(i.to_undirected()).values())/len(nx.closeness_centrality(i.to_undirected()).values()) + + print 'degree heterogeneity' + print float(np.mean(map(lambda x: x ** 2, i.to_undirected().degree().values()))) / ( + (i.to_undirected().number_of_edges() * 2 /i.to_undirected().number_of_nodes()) ** 2) + + print 'Average number of neighbors' + print sum(nx.average_neighbor_degree(i.to_undirected()).values()) + + print 'Nodes' + print nx.number_of_nodes(i.to_undirected()) + + print 'edges' + print nx.number_of_edges(i.to_undirected()) + + print 'Density' + print nx.density(trem_nets.to_undirected()) + + print 'Number of cliques' + print len(nx.number_of_cliques(i.to_undirected())) + + print 'Average Degree' + print 2.0 * (len(i.to_undirected().edges())) / len(i.to_undirected().nodes()) + + print 'Average Degree Assortativity' + print nx.degree_assortativity_coefficient(i.to_undirected()) + +# things for only connected graphs +ddi = max(nx.connected_component_subgraphs(ddi_nets.to_undirected()), key=len) 
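+# Note: nx.diameter is only defined for connected graphs (NetworkX raises an error on a
+# disconnected graph), hence the largest connected component is extracted above before
+# computing it.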
+print 'Network Diameter' +nx.diameter(ddi.to_undirected()) + +print 'Shorest paths (sum)' +sum(nx.all_pairs_shortest_path_length(ddi_owl.to_undirected()).values()[0].values()) + +print 'Average shortest_path (Characteristic path length)' +nx.average_shortest_path_length(ddi_owl.to_undirected()) + +ddi_owl_nodes = ddi_owl.nodes() +## CCDF with power-law fit +trem_owl_freq = Counter(nx.degree(trem_owl.to_undirected()).values()).values() +trem_nets_freq = Counter(nx.degree(trem_nets.to_undirected()).values()).values() +ddi_owl_freq = np.sort(Counter(list(nx.degree(ddi_owl.to_undirected()).values())).values()) +ddi_nets_freq = np.sort(Counter(list(nx.degree(ddi_nets.to_undirected()).values())).values()) + + +nx.degree(trem_owl.to_undirected()).values() + +fit1 = powerlaw.Fit(nx.degree(trem_owl.to_undirected()).values(), discrete = True) +fit2 = powerlaw.Fit(nx.degree(trem_nets.to_undirected()).values(), discrete = True) + +fit1 = powerlaw.Fit(nx.degree(ddi_owl.to_undirected()).values(), discrete = True) +fit2 = powerlaw.Fit(nx.degree(ddi_nets.to_undirected()).values(), discrete = True) + +fig2 = fit1.plot_ccdf(color='mediumaquamarine', linewidth=2) +fit1.power_law.plot_ccdf(color='mediumaquamarine', linestyle='--', ax=fig2) +fit2.plot_ccdf(color='m', linewidth=2, ax=fig2) +fit2.power_law.plot_ccdf(color='m', linestyle='--', ax=fig2) + +fig2.legend(['OWL CCDF', + 'OWL CCDF Power-Law Fit', + 'OWL-NETS CCDF', + 'OWL-NETS CCDF Power-Law Fit'],loc='lower left', fontsize=14, numpoints=1) +# +# from mpl_toolkits.axes_grid.inset_locator import inset_axes +# fig2in = inset_axes(fig2, width="30%", height="30%", loc=3) +# fig2in.hist(nx.degree(ddi_owl).values(), bins = 5000, normed = True, color='deepskyblue') +# fig2in.set_xlim(0, 100) +# +# fig2in.hist(nx.degree(ddi_nets).values(), bins = 50, normed = True, color='red', alpha = 0.65) +# # fig2in.set_xlim(0, 0.1) +# fig2in.set_xticks([]) +# fig2in.set_yticks([]) + +fig2.patch.set_facecolor('whitesmoke') +fig2.set_xlabel(r'Degree $k$', fontsize=16) +fig2.set_ylabel(r'$P(K \geq k)$', fontsize=16) +fig2.grid(True, color='black') +fig2.tick_params(axis='both', which='major', labelsize=14) +fig2.tick_params(axis='both', which='minor', labelsize=14) +# plt.ylim(0.01, 1.01) +# plt.xlim(0.98, 31622) +plt.tight_layout() +plt.savefig("Results/DDI_reactome/DDI_degreeDistribution.png", dpi=600) # save plot + + + +# create degree distribution plots +plot_res = [] +for r in [trem_owl, trem_nets, ddi_owl, ddi_nets]: + # #get counts of node occurrence in target list + degree = Counter(list(nx.degree(r.to_undirected()).values())).values() # for r = 1 + + # update indegree to include a value of zero for nodes not in target list + pdf = np.sort(degree) + cdf = np.arange(len(pdf)) / float(len(pdf)) + ccdf = 1 - cdf + plot_res.append([pdf, ccdf]) + +# generate plot +plt1, = plt.loglog(plot_res[2][0], plot_res[2][1], '.', color='turquoise', markersize=12, alpha=0.75) +plt2, = plt.loglog(plot_res[0][0], plot_res[0][1], '.', color='turquoise', markersize=12, alpha=0.75) +plt3, = plt.loglog(plot_res[3][0], plot_res[3][1], '.', color='red', markersize=12, alpha=0.75) +plt4, = plt.loglog(plot_res[1][0], plot_res[1][1], '.', color='lime', markersize=12) + + +plt.legend([plt1, plt3], ['OWL Representation', 'OWL-NETS'], + loc='upper right', fontsize=12, numpoints=1) + +plt.legend([plt2, plt4], ['OWL Representation', 'OWL-NETS'], + loc='upper right', fontsize=12, numpoints=1) + +plt.xlabel(r'Degree $k$', fontsize=12) +plt.ylabel(r'$Pr(k)$', fontsize=12) +plt.grid(True, 
color='black') +plt.ylim(0.01, 1.01) +plt.xlim(0.98, 31622) +plt.tight_layout() +plt.savefig("Network_visualizations/Tram_degreeDistribution.png", dpi=600) # save plot +# save plot + + +## Betweenness + +def most_important(G): + ''' + returns a copy of G with the most important nodes according to the pagerank + ''' + ranking = nx.betweenness_centrality(G).items() + print ranking + + r = [x[1] for x in ranking] + m = sum(r) / len(r) # mean centrality + t = m * 3 #threshold + Gt = G.copy() + + # threshold, we keep only the nodes with 3 times the mean + for k, v in ranking: + if v < t: + Gt.remove_node(k) + + return Gt + +G = graph.to_undirected() +G=owl.to_undirected() +Gt = most_important(G) # trimming + +Gt.nodes(data = True) + +# create the layout +pos = nx.spring_layout(G) +# draw the nodes and the edges (all) +nx.draw_networkx_nodes(G,pos,node_color='b',alpha=0.3,node_size=50) +nx.draw_networkx_edges(G,pos,alpha=0.1) + +# draw the most important nodes with a different style +nx.draw_networkx_nodes(Gt,pos,node_color='r',alpha=0.4,node_size=200) + + +# also the labels this time +labels = nx.get_node_attributes(Gt, 'type') +nx.draw_networkx_labels(Gt,pos,labels,font_size=5,font_color='black') +plt.axis('off') +plt.tight_layout() +plt.savefig("Network_visualizations/Trem_betweeness_importance_NETS.png", dpi=1000) + +res = [] +count1 = 0 +count2 = 0 +count3 = 0 +count4 = 0 +for i in ddi.to_undirected().edges(data =True): + if i[2]['nice'] == ['drug1_ice', 'target_ice']: + count1 += 1 + + if i[2]['nice'] == ['pathway_target_ice', 'drug2_ice']: + count2 += 1 + + if i[2]['nice'] == ['target_ice', 'pathway_ice']: + count3 += 1 + + if i[2]['nice'] == ['pathway_ice', 'pathway_target_ice']: + count4 += 1 + + + + +if __name__ == '__main__': + main() \ No newline at end of file