From f279f5b1ae44f7bc977dac9848eadef9ac95e938 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Fri, 12 Jan 2018 16:36:56 +0900 Subject: [PATCH] Added network inference files --- .gitignore | 2 + EvaluationMetrics.py | 68 +++++ LinkPrediction.py | 429 +++++++++++++++++++++++++++++ LinkPredictionResults.py | 124 +++++++++ NetworkInference.py | 576 +++++++++++++++++++++++++++++++++++++++ NetworkInferencePlots.py | 122 +++++++++ NetworkStatistics.py | 236 ++++++++++++++++ 7 files changed, 1557 insertions(+) create mode 100644 EvaluationMetrics.py create mode 100644 LinkPrediction.py create mode 100644 LinkPredictionResults.py create mode 100644 NetworkInference.py create mode 100644 NetworkInferencePlots.py create mode 100644 NetworkStatistics.py diff --git a/.gitignore b/.gitignore index 21e29ac..e7d996a 100755 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,5 @@ .DS_Store? ._* *.idea/ + +Qiery Parser.py diff --git a/EvaluationMetrics.py b/EvaluationMetrics.py new file mode 100644 index 0000000..1e60cb2 --- /dev/null +++ b/EvaluationMetrics.py @@ -0,0 +1,68 @@ +######################################################################################### +# EvaluationMetrics.py +# Purpose: script contains methods for evaluating link prediction algorithm performance +# version 1.0 +# date: 01.28.2017 +######################################################################################### + + +# import module/script dependencies +import random +import heapq + + + +def AUC(nonexist_scores, missing_scores): + ''' + Function calculates the probability that a randomly chosen missing link is given a higher score than a randomly + chosen nonexistent link using procedures described by Lu & Zhou (2010) + (doi:http://dx.doi.org/10.1016/j.physa.2010.11.027T). The function takes a list containing missing, non-existent + edges, and predictions, and returns an AUC score. Current method is currently set to perform 1,000 comparisons. + :param nonexist_scores: list of nonexistent edges and scores + :param missing_scores: list of test edges and scores + :return: an integer which is the AUC score for that comparison + ''' + + # comparisons = len(nonexist_scores)*len(missing_scores) #HOW MANY?? + comparisons = 1000 + count = 0.0 + + for i in xrange(comparisons): + TN = random.sample(nonexist_scores.values(), 1) + TP = random.sample(missing_scores.values(), 1) + + if TP > TN: + count += 1.0 + if TP == TN: + count += 0.5 + + auc = count/comparisons + + return auc + + + +def KPrecision(auc, scores, testing_edges): + ''' + Function calculates the ratio of relevant items selected from the top n items using procedures described by Lu & + Zhou (2010) (doi:http://dx.doi.org/10.1016/j.physa.2010.11.027T). The function takes a list containing missing and + non-existent edges, the list of all predictions, and returns a top k-precision score for 20% of scores. 
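+
+    Illustrative example (hypothetical scores; with an AUC of 0.8 the top of the list is used,
+    the top 20% here is a single edge, and that edge is a true test edge, so precision is 1.0):
+
+        scores = {('a', 'b'): 0.9, ('a', 'c'): 0.1, ('b', 'c'): 0.5,
+                  ('b', 'd'): 0.2, ('c', 'd'): 0.4}
+        KPrecision(0.8, scores, [('a', 'b')])  # -> 1.0
+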
+ :param auc: integer representing AUC score - to indicate whether the top or bottom of list should be assessed + :param scores: list of test edges and scores + :param testing_edges: list of nonexistent edges and scores + :return: an integer which is the precision for the top number of selected links + ''' + + #get 20% of edges + links = [int(len(scores)*0.20) if int(len(scores)*0.20) >= 1 else 1][0] + + if auc < 0.5: + # pred_list = heapq.nsmallest(links, set([x[1] for x in scores.items()])) #returns n highest likelihood scores + pred_list = heapq.nsmallest(links, scores, key=lambda k: scores[k]) + else: + # pred_list = heapq.nlargest(links, set([x[1] for x in scores.items()])) #returns n highest likelihood scores + pred_list = heapq.nlargest(links, scores, key=lambda k: scores[k]) + + y_pred = [0 if (i[0], i[1]) in testing_edges else 0 if (i[1], i[0]) in testing_edges else 1 for i in pred_list] + + return float(y_pred.count(0))/links \ No newline at end of file diff --git a/LinkPrediction.py b/LinkPrediction.py new file mode 100644 index 0000000..682c629 --- /dev/null +++ b/LinkPrediction.py @@ -0,0 +1,429 @@ +########################################################################################## +# LinkPrediction.py - http://www.research.rutgers.edu/~ss2078/papers/LinkPrediction.pdf +# Purpose: script contains methods for 10 link prediction algorithms +# version 1.1.0 +# date: 01.28.2017 +########################################################################################## + + +# import module/script dependencies +import networkx as nx +import numpy as np +import random +import six + + + +def DegreeProduct(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Degree Product for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + + scores[edge] = (nx.degree(graph, i) * nx.degree(graph, j)) + + return scores + + +def CommonNeighbors(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Common Neighbors for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + scores[edge] = float(len(n_intersection)) + + return scores + + +def Jaccard(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Jaccard for these edges given the + structure of the graph. 
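+
+    Illustrative example (toy 4-node path graph; nodes 0 and 2 share neighbour 1 out of the
+    union {1, 3}, giving 1/2):
+
+        G = nx.path_graph(4)
+        Jaccard(G, [(0, 2)])  # -> {(0, 2): 0.5}
+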
+ :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + n_union = set(graph[i].keys()).union(set(graph[j].keys())) + scores[edge] = float(len(n_intersection))/len(n_union) + + return scores + + +def Sorensen(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Sorenson Similarity for these edges + given the structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + n_degree = graph.degree(i) + graph.degree(j) + scores[edge] = float(len(n_intersection))/n_degree + + return scores + + +def LHN(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Leicht-Holme-Newman for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + n_degree = graph.degree(i) * graph.degree(j) + scores[edge] = float(len(n_intersection))/n_degree + + return scores + + +def ShortestPath(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the shortest path for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + + if nx.has_path(graph, i, j): + scores[edge] = 1.0/len(nx.shortest_path(graph, i, j)) + + else: + scores[edge] = 0.0 + + return scores + + +def ResourceAllocation(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the resource allocation for these edges + given the structure of the graph. 
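+
+    Illustrative example (toy 4-node path graph; the single common neighbour of nodes 0 and 2
+    has degree 2, so the score is 1/2):
+
+        G = nx.path_graph(4)
+        ResourceAllocation(G, [(0, 2)])  # -> {(0, 2): 0.5}
+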
+ :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + w = [] + n_intersection = i_neighbors.intersection(j_neighbors) + + for c in n_intersection: + w.append(graph.degree(c)) + scores[edge] = 1.0/np.sum(w) + + return scores + + +def AdamicAdvar(graph, edges): + ''' Function takes a networkx graph object and list of edges calculates the Adamic Advar for these edges given the + structure of the graph. + :param graph: networkx graph object + :param edges: list of tuples + :return: a dictionary of scores for the edges + ''' + + scores = {} + + for edge in edges: + i = edge[0] + j = edge[1] + i_neighbors = set(graph[i].keys()) + j_neighbors = set(graph[j].keys()) + + if len(i_neighbors) == 0 or len(j_neighbors) == 0: + scores[edge] = 0.0 + + elif len(i_neighbors.intersection(j_neighbors)) == 0: + scores[edge] = 0.0 + + else: + w = [] + n_intersection = set(graph[i].keys()).intersection(set(graph[j].keys())) + + for c in n_intersection: + w.append(graph.degree(c)) + scores[edge] = 1.0/np.log(np.sum(w)) + + return scores + + +##for the following algorithms parameter values were chosen to be consistent with: +#Liben-Nowell D, Kleinberg J. The link-prediction problem for social networks. Journal of the American society for information science and technology. + +def katz(G, beta=0.001, max_power=5, weight=None, dtype=None): #https://github.com/rafguns/linkpred/blob/master/linkpred/predictors/path.py + """Predict by Katz (1953) measure + Let 'A' be an adjacency matrix for the directed network `G`. + Then, each element 'a_{ij}' of 'A^k' (the `k`-th power of `A`) has a + value equal to the number of walks with length `k` from `i` to `j`. + The probability of a link rapidly decreases as the walks grow longer. + Katz therefore introduces an extra parameter (here beta) to weigh + longer walks less. + Parameters + ---------- + beta : a float + the value of beta in the formula of the Katz equation + max_power : an int + the maximum number of powers to take into account + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. 
+ dtype : a data type + data type of edge weights (default numpy.int32) + """ + ineligible = G.edges() + nodelist = G.nodes() + adj = nx.to_scipy_sparse_matrix(G, dtype=np.int32, weight=weight) + res = {} + + for k in range(1, max_power + 1): + matrix = (adj ** k).tocoo() + for i, j, d in zip(matrix.row, matrix.col, matrix.data): + if i == j: + continue + u, v = nodelist[i], nodelist[j] + # if nx.has_path(G, u, v): + if (u,v) not in ineligible: + w = d * (beta ** k) + res[(u,v)] = 0.0 + res[(u,v)] += w + # if not G.is_directed(): + # We count double in case of undirected networks ((i, j) and (j, i)) + for pair in res: + res[pair]/= 2 + + return res + + +def raw_google_matrix(G, nodelist=None, weight=None): + """Calculate the raw Google matrix (stochastic without teleportation); taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py""" + M = nx.to_numpy_matrix(G, nodelist=nodelist, dtype=np.float32, weight=weight) + n, m = M.shape # should be square + assert n == m and n > 0 + # Find 'dangling' nodes, i.e. nodes whose row's sum = 0 + dangling = np.where(M.sum(axis=1) == 0) + # add constant to dangling nodes' row + for d in dangling[0]: + M[d] = 1.0 / n + # Normalize. We now have the 'raw' Google matrix (cf. example on p. 11 of + # Langville & Meyer (2006)). + M = M / M.sum(axis=1) + return M + + +def SimRank(G, c=0.8, num_iterations=10, weight=None): + """Predict using SimRank; taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py + .. math :: + sim(u, v) = \frac{c}{|N(u)| \cdot |N(v)|} \sum_{p \in N(u)} + \sum_{q \in N(v)} sim(p, q) + where 'N(v)' is the set of neighbours of node 'v'. + Parameters + ---------- + c : float, optional + decay factor, determines how quickly similarity decreases + num_iterations : int, optional + number of iterations to calculate + weight: string or None, optional + If None, all edge weights are considered equal. + Otherwise holds the name of the edge attribute used as weight. + """ + #set-up inital variables + res = {} + nodelist = G.nodes() + ineligible = G.edges() + n = len(G) + M = raw_google_matrix(G, nodelist=nodelist, weight=weight) + sim = np.identity(n, dtype=np.float32) + + for i in range(num_iterations): + temp = c * M.T * sim * M + sim = temp + np.identity(n) - np.diag(np.diag(temp)) + + (m, n) = sim.shape + assert m == n + + for i in range(m): + # sim(a, b) = sim(b, a), leading to a 'mirrored' matrix. + # We start the column range at i + 1, such that we only look at the + # upper triangle in the matrix, excluding the diagonal: + # sim(a, a) = 1. + u = nodelist[i] + for j in range(i + 1, n): + if sim[i, j] > 0: + v = nodelist[j] + if (u, v) not in ineligible: + res[(u, v)] = 0.0 + res[(u, v)] = sim[i, j] + return res + + +def RPR(G, alpha=0.15, beta=0): + """Return the rooted PageRank of all nodes with respect to node 'root' + taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py + Parameters + ---------- + G : a networkx.(Di)Graph + network to compute PR on + root : a node from the network + the node that will be the starting point of all random walks + alpha : float + PageRank probability that we will advance to a neighbour of the + current node in a random walk + beta : float or int + Normally, we return to the root node with probability 1 - alpha. + With this parameter, we can also advance to a random other node in the + network with probability beta. Thus, we get back to the root node with + probability 1 - alpha - beta. 
This is off (0) by default. + weight : string or None + The edge attribute that holds the numerical value used for + the edge weight. If None then treat as unweighted. + """ + #set default variables + weight = None + res = {} #stores results + eligible_node = G.nodes() + personalization = dict.fromkeys(G, beta) + + for u in G.nodes(): + personalization[u] = 1 - beta + pagerank_scores = nx.pagerank_scipy(G, alpha, personalization, weight=weight) + + for v, w in six.iteritems(pagerank_scores): + if w > 0 and u != v and v in eligible_node: + res[(u, v)] = 0.0 + res[(u, v)] += w + + return res + + +# def simrank(G, c=0.8, num_iterations= 10): +# r"""Calculate SimRank matrix for nodes in nodelist; taken from: https://github.com/rafguns/linkpred/blob/master/linkpred/network/algorithms.py +# SimRank is defined as: +# .. math :: +# sim(u, v) = \frac{c}{|N(u)| |N(v)|} \sum_{p \in N(u)} +# \sum_{q \in N(v)} sim(p, q) +# Parameters +# ---------- +# G : a networkx.Graph +# network +# nodelist : collection of nodes, optional +# nodes to calculate SimRank for (default: all) +# c : float, optional +# decay factor, determines how quickly similarity decreases +# num_iterations : int, optional +# number of iterations to calculate +# weight: string or None, optional +# If None, all edge weights are considered equal. +# Otherwise holds the name of the edge attribute used as weight. +# """ +# labels = nx.convert_node_labels_to_integers(G, first_label=0, ordering='default', label_attribute='names') # stores the node label +# +# nodelist = None +# weight = None +# n = len(labels) +# M = raw_google_matrix(labels, nodelist=nodelist, weight=weight) +# sim = np.identity(n, dtype=np.float32) +# for i in range(num_iterations): +# temp = c * M.T * sim * M +# sim = temp + np.identity(n) - np.diag(np.diag(temp)) +# +# sim_res = nx.from_numpy_matrix(sim) +# sim_res = nx.Graph(sim_res) +# sim_res.remove_edges_from(sim_res.selfloop_edges()) +# +# #save edges to dictionary +# sim_edges = {} +# for edge in sim_res.edges(data=True): +# node0 = labels.node[edge[0]]['names'] +# node1 = labels.node[edge[1]]['names'] +# sim_edges[(node0, node1)] = edge[2]['weight'] +# +# return sim_edges diff --git a/LinkPredictionResults.py b/LinkPredictionResults.py new file mode 100644 index 0000000..cc7e707 --- /dev/null +++ b/LinkPredictionResults.py @@ -0,0 +1,124 @@ +####################################################################################################### +# LinkPredictionResults.py +# Purpose: script runs 10 link prediction algorithms on training and testing network data in parallel +# version 1.2.0 +# date: 01.28.2017 +####################################################################################################### + + +# import module/script dependencies +import networkx as nx +import numpy as np +from collections import Counter +import operator +import LinkPrediction +import csv + + +def LabelDict(results, id, var): + ''' + Function takes a json file of results (containing ice ids and labels) and two variables storing the id and labels + and returns a dictionary where the keys are the id and the values are the corresponding labels. 
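+
+    Illustrative example (hypothetical SPARQL-style JSON result; the variable names 'ice' and
+    'label' are assumed here):
+
+        results = {'results': {'bindings': [
+            {'ice': {'value': 'http://example.org/ICE_001'},
+             'label': {'value': 'trametinib'}}]}}
+        LabelDict(results, 'ice', 'label')  # -> {'ICE_001': 'trametinib'}
+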
+ :param results: json file of ice ids and labels + :param id: variable storing ids + :param var: variable storing labels + :return: + ''' + + label_dict = {} + + for res in results['results']['bindings']: + label_dict[str(res[str(id)]['value'].split('/')[-1]).encode('utf8')] = str( + res[str(var)]['value'].encode('utf8')) + + return label_dict + + +def EdgeChecker(scores, edges): + ''' + Function takes a dictionary of edges (keys) and scores (values) and a list of edges. Using the intersection of the + edges and keys edges a new dictionary is created. + :param scores: dictionary of edges (keys) and scores (values) + :param edges: list of tuples + :return: dictionary of tuples (keys) and scores (values) + ''' + final_dict = {} + + for edge in set(edges).intersection(set(scores.keys())): + final_dict[edge] = scores[edge] + + return final_dict + + + +def main(): + + #read in graphs + owl_graph = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml').to_undirected() + nets_graph = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() + mid_graph = nx.read_gml('Network_Data/Trametinib_query_PART_network').to_undirected() + + #run link predictions for each graph + nets_scores = LinkPrediction.katz(nets_graph, beta=0.001, max_power=5, weight=None, dtype=None) + nets_nonexist = list(nx.non_edges(nets_graph)) + nets_preds = EdgeChecker(nets_scores, nets_nonexist) + + owl_nonexist = list(nx.non_edges(owl_graph)) + owl_scores = LinkPrediction.RPR(owl_graph, alpha = 0.15, beta = 0) + owl_preds = EdgeChecker(owl_scores, owl_nonexist) + + #explore predictions + len(nets_preds) #1652 + np.min(nets_preds.values()) + np.mean(nets_preds.values()) + np.max(nets_preds.values()) + + #print top 20 edges + sorted(Counter(sorted(nets_preds.values())).items(), key=lambda i: i[0]) #get distribution of counts + sorted_scores = sorted(owl_preds.items(), key=operator.itemgetter(1), reverse=True) #biggest first + sorted_scores = sorted(nets_preds.items(), key=operator.itemgetter(1), reverse=False) #smallest first + + #investigate the top n items + edges = sorted_scores[0:20] + + + ## Write results for use with with ranking methods + + # graphs + graph = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml').to_undirected() + + # graph = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() + + graph = nx.read_gml('Network_Data/DDI_reactome_query_NETS_network.gml').to_undirected() + + + methods = [LinkPrediction.DegreeProduct(graph, list(nx.non_edges(graph))), + LinkPrediction.ShortestPath(graph, list(nx.non_edges(graph))), + LinkPrediction.CommonNeighbors(graph, list(nx.non_edges(graph))), + LinkPrediction.AdamicAdvar(graph, list(nx.non_edges(graph))), + LinkPrediction.Jaccard(graph, list(nx.non_edges(graph))), + LinkPrediction.LHN(graph, list(nx.non_edges(graph))), + LinkPrediction.ResourceAllocation(graph, list(nx.non_edges(graph))), + LinkPrediction.Sorensen(graph, list(nx.non_edges(graph))), + LinkPrediction.katz(graph, beta=0.001, max_power=5, weight=None, dtype=None), + LinkPrediction.RPR(graph, alpha = 0.15, beta = 0)] + + + # method counter for labeling csv files + count = 0 + + for method in methods: + updated_res = EdgeChecker(method, list(nx.non_edges(graph))) + + with open('Results/DDI_reactome/NETS_DDI ' + str(count) + '.csv', 'wb') as csvfile: + writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) + for key, values in updated_res.items(): + writer.writerow([key, values]) + + count += 1 + + + + +if __name__ == '__main__': + 
main() \ No newline at end of file diff --git a/NetworkInference.py b/NetworkInference.py new file mode 100644 index 0000000..0a8168c --- /dev/null +++ b/NetworkInference.py @@ -0,0 +1,576 @@ +####################################################################################################### +# NetworkInference.py +# Purpose: script runs 10 link prediction algorithms on training and testing network data in parallel +# version 1.1.0 +# date: 01.28.2017 +####################################################################################################### + + +# import module/script dependencies +import multiprocessing +from datetime import datetime +from functools import partial +import networkx as nx +import random +import json +import EvaluationMetrics +import LinkPrediction + + + +def GraphMaker(graph, percent): + ''' + Function takes a Networkx graph object and percent of edges to sample. The function creates a training graph by + randomly sampling a certain percent of edges to remove from the full graph. The percent of randomly sampled edges + is stored as a list. + :param graph: networkx graph object + :param percent: an integer of edges to sample + :return: training_graph - network graph with randomly sampled edges removed; testing_edges - list of randomly + sampled edges + ''' + + training = set(random.sample(graph.edges(), int(nx.number_of_edges(graph) * percent))) + testing_edges = set(set(graph.edges()) - training) + + if len(training) + len(testing_edges) != len(graph.edges()): #verify that training graph/testing edges are correct + raise ValueError('# of training + testing edges != total # of edges in graph') + + # training graph + training_graph = nx.Graph() + + # original graph nodes + training_graph.add_nodes_from(graph.nodes()) + + # add training edges between nodes + training_graph.add_edges_from(training) + + if len(training_graph.nodes()) != len(graph.nodes()): #verify training graph contains original graph + raise ValueError('Training graph does not contain all of the original graph nodes') + + return training_graph, testing_edges + + +def DPFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Degree Product scoring function over each sampled network for the specified number of + iterations. 
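+
+    Illustrative usage (hypothetical small graph; 5 iterations, with 50% of edges sampled into
+    the training graph on each iteration):
+
+        G = nx.karate_club_graph()
+        auc_scores, precisions = DPFracAUC(G, list(nx.non_edges(G)), 5, 0.5)
+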
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.DegreeProduct(training_graph, testing_edges) + nonexist_scores = LinkPrediction.DegreeProduct(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def SPFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Shortest Path scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.ShortestPath(training_graph, testing_edges) + nonexist_scores = LinkPrediction.ShortestPath(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def CNFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Common Neighbors scoring function over each sampled network for the specified number of + iterations. 
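+
+    Note: when this function is dispatched through pool.map in main(), `steps` receives a single
+    sampling fraction (one element of the step list) per call, and that value is passed directly
+    to GraphMaker.
+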
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.CommonNeighbors(training_graph, testing_edges) + nonexist_scores = LinkPrediction.CommonNeighbors(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def JFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Jaccard scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.Jaccard(training_graph, testing_edges) + nonexist_scores = LinkPrediction.Jaccard(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def SSFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Sorenson Similarity scoring function over each sampled network for the specified number of + iterations. 
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.Sorensen(training_graph, testing_edges) + nonexist_scores = LinkPrediction.Sorensen(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def LHNFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Leicht-Holme-Newman scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.LHN(training_graph, testing_edges) + nonexist_scores = LinkPrediction.LHN(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def AAFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Adamic Advar scoring function over each sampled network for the specified number of + iterations. 
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.AdamicAdvar(training_graph, testing_edges) + nonexist_scores = LinkPrediction.AdamicAdvar(training_graph, nonexist_edges) + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def RAFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Resource Allocaiton scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + missing_scores = LinkPrediction.ResourceAllocation(training_graph, testing_edges) + nonexist_scores = LinkPrediction.ResourceAllocation(training_graph, nonexist_edges) + + #get AUC + auc_round = EvaluationMetrics.AUC(nonexist_scores, missing_scores) + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, dict(missing_scores, **nonexist_scores), testing_edges) + prec.append(prec_round) + + return auc, prec + + +def KFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Katz scoring function over each sampled network for the specified number of + iterations. 
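+
+    Note: LinkPrediction.katz only scores eligible node pairs joined by at least one walk of
+    length <= max_power in the training graph, so the AUC loop below treats sampled pairs that
+    are absent from the score dictionary as having a score of 0.0.
+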
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + scores = LinkPrediction.katz(training_graph, beta=0.001, max_power=5, weight=None, dtype=None) + + #get AUC + count = 0.0 + for i in xrange(1000): + TN = random.sample(nonexist_edges, 1)[0] + TP = random.sample(testing_edges, 1)[0] + + if (TN[0], TN[1]) in scores.keys(): + TN_val = scores[(TN[0], TN[1])] + else: TN_val = 0.0 + + if (TP[0], TP[1]) in scores.keys(): + TP_val = scores[(TP[0], TP[1])] + else: TP_val = 0.0 + + if TP_val > TN_val: + count += 1.0 + if TP_val == TN_val: + count += 0.5 + + auc_round = count/1000 + auc.append(auc_round) + + #precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges) + prec.append(prec_round) + + return auc, prec + + +def SFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the SimRank scoring function over each sampled network for the specified number of + iterations. + :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + scores = LinkPrediction.SimRank(training_graph, c=0.8, num_iterations= 10) + + # get AUC + count = 0.0 + for i in xrange(1000): + TN = random.sample(nonexist_edges, 1)[0] + TP = random.sample(testing_edges, 1)[0] + + if (TN[0], TN[1]) in scores.keys(): + TN_val = scores[(TN[0], TN[1])] + else: + TN_val = 0.0 + + if (TP[0], TP[1]) in scores.keys(): + TP_val = scores[(TP[0], TP[1])] + else: + TP_val = 0.0 + + if TP_val > TN_val: + count += 1.0 + if TP_val == TN_val: + count += 0.5 + + auc_round = count/1000 + auc.append(auc_round) + + # precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges) + prec.append(prec_round) + + return auc, prec + + +def PRFracAUC(network, nonexist_edges, iterations, steps): + ''' + Function takes a network, list of non-existent edges, the number of iterations, and the percent of edges to sample + (steps) and runs the Rooted Page Rank scoring function over each sampled network for the specified number of + iterations. 
+ :param network: undirected graph + :param nonexist_edges: list of non-existent edges from the graph + :param iterations: integer representing the number of iterations to run + :param steps: list of percent of edges to sample + :return: + ''' + auc = []; prec = [] + + for j in xrange(iterations): + iteration_data = GraphMaker(network, steps) + training_graph = iteration_data[0] + testing_edges = iteration_data[1] + scores = LinkPrediction.RPR(training_graph, alpha=0.15, beta=0) + + # get AUC + count = 0.0 + for i in xrange(1000): + TN = random.sample(nonexist_edges, 1)[0] + TP = random.sample(testing_edges, 1)[0] + + if (TN[0], TN[1]) in scores.keys(): + TN_val = scores[(TN[0], TN[1])] + else: + TN_val = 0.0 + + if (TP[0], TP[1]) in scores.keys(): + TP_val = scores[(TP[0], TP[1])] + else: + TP_val = 0.0 + + if TP_val > TN_val: + count += 1.0 + if TP_val == TN_val: + count += 0.5 + + auc_round = count/1000 + auc.append(auc_round) + + # precision - getting top or bottom K links depends on whether or not AUC is >/< 0.5 + prec_round = EvaluationMetrics.KPrecision(auc_round, scores, testing_edges) + prec.append(prec_round) + + return auc, prec + + +def main(): + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + + #specify initial arguments for all functions + manager = multiprocessing.Manager() + network = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml').to_undirected() + nonexist_edges = manager.list(list(nx.non_edges(network))) + nonexist_edges = list(nx.non_edges(network)) # non-existent edges in graph + iterations = 100 + steps = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95] + file = 'Results/Trametinib/NETS_Tram_' + + pool = multiprocessing.Pool(processes=4) # set up pool + + #Degree Product + func = partial(DPFracAUC, network, nonexist_edges, iterations) + DPres = pool.map(func, steps) + print 'Finished running Degree Product' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'DP.json', 'w') as fout: + json.dump(DPres, fout) + + #Shortest Path + func2 = partial(SPFracAUC, network, nonexist_edges, iterations) + SPres = pool.map(func2, steps) + print 'Finished running Shortest Path' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'SP.json', 'w') as fout: + json.dump(SPres, fout) + + #Common Neighbors + func3 = partial(CNFracAUC, network, nonexist_edges, iterations) + CNres = pool.map(func3, steps) + print 'Finished running Common Neighbors' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'CN.json', 'w') as fout: + json.dump(CNres, fout) + + #Jaccard + func4 = partial(JFracAUC, network, nonexist_edges, iterations) + Jres = pool.map(func4, steps) + print 'Finished running Jaccard Index' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'J.json', 'w') as fout: + json.dump(Jres, fout) + + #Sorensen Similarity + func5 = partial(SSFracAUC, network, nonexist_edges, iterations) + SSres = pool.map(func5, steps) + print 'Finished running Sorensen Similarity' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'SS.json', 'w') as fout: + json.dump(SSres, fout) + + 
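+    # The per-algorithm blocks below repeat the same partial / pool.map / json.dump pattern;
+    # a table-driven loop could express them more compactly (sketch only, left commented out
+    # so the explicit blocks below remain the ones that run):
+    # for suffix, fn in [('LHN', LHNFracAUC), ('AA', AAFracAUC), ('RA', RAFracAUC),
+    #                    ('K', KFracAUC), ('RPR', PRFracAUC)]:
+    #     res = pool.map(partial(fn, network, nonexist_edges, iterations), steps)
+    #     with open(str(file) + suffix + '.json', 'w') as fout:
+    #         json.dump(res, fout)
+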
#Leicht-Holme-Newman + func6 = partial(LHNFracAUC, network, nonexist_edges, iterations) + LHNres = pool.map(func6, steps) + print 'Finished running Leicht-Holme-Newman' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'LHN.json', 'w') as fout: + json.dump(LHNres, fout) + + #Adamic Advar + func7 = partial(AAFracAUC, network, nonexist_edges, iterations) + AAres = pool.map(func7, steps) + print 'Finished running Adamic Advar' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'AA.json', 'w') as fout: + json.dump(AAres, fout) + + #Resource Allocation + func8 = partial(RAFracAUC, network, nonexist_edges, iterations) + RAres = pool.map(func8, steps) + print 'Finished running Resource Allocation' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'RA.json', 'w') as fout: + json.dump(RAres, fout) + + #Katz + func9 = partial(KFracAUC, network, nonexist_edges, iterations) + Kres = pool.map(func9, steps) + print 'Finished running Katz' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + #write dictionary to json file + with open(str(file) + 'K.json', 'w') as fout: + json.dump(Kres, fout) + + # #Simrank + # func10 = partial(SFracAUC, network, nonexist_edges, iterations) + # Sres = pool.map(func10, steps) + # print 'Finished running SimRank' + # # write dictionary to json file + # with open(str(file) + 'SR.json', 'w') as fout: + # json.dump(Sres, fout) + + # Rooted Page Rank + func11 = partial(PRFracAUC, network, nonexist_edges, iterations) + RPRres = pool.map(func11, steps) + print 'Finished running Rooted Page Rank' + print str('Started running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + # write dictionary to json file + with open(str(file) + 'RPR.json', 'w') as fout: + json.dump(RPRres, fout) + + + pool.close() + pool.join() + + print str('Finished running predictions ' + datetime.now().strftime('%Y-%m-%d %H:%M:%S')) + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/NetworkInferencePlots.py b/NetworkInferencePlots.py new file mode 100644 index 0000000..b071dfb --- /dev/null +++ b/NetworkInferencePlots.py @@ -0,0 +1,122 @@ +############################################################################################## +# NetworkInferencePlots.py +# Purpose: script generates AUC plots to compare performance of network inference algorithms +# version 1.0.8 +# date: 01.28.2017 +############################################################################################## + + +# import module/script dependencies +import json +import numpy as np +import matplotlib.pyplot as plt + + + +def ResultParser(file): + ''' + Function takes as input a file that stores a list of lists (length of steps) and outputs a list of the average AUC + and precision values across n steps. 
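+
+    Illustrative layout of the expected JSON (hypothetical values; one entry per sampling step,
+    each holding the per-iteration AUC scores and precisions written by NetworkInference.py):
+
+        [[[0.71, 0.69], [0.20, 0.25]],
+         [[0.75, 0.74], [0.30, 0.20]]]
+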
+ :param file: string storing the json file containing link prediction results + :return: list storing the average AUC values across n steps + ''' + + auc_results = []; prec_results = []; auc = []; prec = []; auc_mod = [] + + # open the json file + with open(file) as json_data: + results = json.load(json_data) + + for i in results: + if len(i) == 2: + auc_results.append(i[0]) + auc = [np.mean(x) for x in auc_results] + prec_results.append(i[1]) + prec = [np.mean(x) for x in prec_results] + else: + auc_results.append(i) + auc = [np.mean(x) for x in auc_results] + + for j in auc: + if j < 0.5: + auc_mod.append(1 - j) + else: + auc_mod.append(j) + + return auc_mod, prec + + + +def ResultPlotter(steps, data, xlabel, ylabel, title, output): + ''' + Function takes a list of data, plotting parameters and plots the data. The plot is saved according to the file + directory specified by the user. + :param steps: list storing the x-axis values + :param data: list of lists of AUC scores for plotting + :param xlabel: string storing x-axis label + :param ylabel: string storing y-axis label + :param title: string storing plot title + :param output: string storing information for saving plot + :return: plot is created and written to user-specified location + ''' + + plt1, = plt.plot(steps, data[0], color = 'purple', marker = '>', linestyle = ':', linewidth=2) + plt2, = plt.plot(steps, data[1], color = 'magenta', marker = 's', linestyle = ':',linewidth=2) + plt3, = plt.plot(steps, data[2], color = 'forestgreen', marker = 'o', linestyle = ':',linewidth=2) + plt4, = plt.plot(steps, data[3], color = 'orange', marker = 'd', linestyle = ':',linewidth=2) + plt5, = plt.plot(steps, data[4], color = 'gold', marker = '*', linestyle = ':',linewidth=2) + plt6, = plt.plot(steps, data[5], color = 'slategray', marker = '>', linestyle = ':',linewidth=2) + plt7, = plt.plot(steps, data[6], color = 'limegreen', marker = 's', linestyle = ':',linewidth=2) + plt8, = plt.plot(steps, data[7], color = 'cyan', marker = 'd', linestyle = ':',linewidth=2) + plt9, = plt.plot(steps, data[8], color = 'red', marker = '*', linestyle = ':',linewidth=2) + plt10, = plt.plot(steps, data[9], color = 'royalblue', marker = '>', linestyle = ':',linewidth=2) + # plt11, = plt.plot(steps, data[10], color='orange', marker='o', linestyle=':', linewidth=2) + + hline = plt.axhline(y=0.5, xmax=1, linestyle ='--', color='black') + # plt.legend([plt1, plt2, plt3, plt4, plt5, plt6, plt7, plt8, plt9, plt10, hline], + # ['Degree Product', 'Common Neighbors', 'Shortest Path', 'Jaccard Index', 'Sorenson Similarity', + # 'Leicht-Holme-Newman', 'Adamic Advar', 'Resource Allocation', 'Katz', 'Rooted Page Rank', + # 'Pure Chance'], loc='upper center', prop={'size': 10}, ncol=3) + + + plt.legend([plt1, plt2, plt3, plt4, plt5, plt6, plt7, plt8, plt9, plt10, hline], ['Degree Product', 'Common Neighbors', 'Shortest Path', 'Jaccard Index', 'Sorenson Similarity', 'Leicht-Holme-Newman', 'Adamic Advar', 'Resource Allocation', 'Katz', 'Rooted Page Rank', 'Pure Chance'],loc=9,prop={'size': 9}, bbox_to_anchor=(0.5, -0.15), ncol=3) + + plt.subplots_adjust(bottom=0.25) + #plt.tight_layout(pad=6.5) + plt.xlabel(xlabel, fontsize=14) + plt.ylabel(ylabel, fontsize=14) + plt.grid(True, color ='black') + plt.title(title, fontsize=18) + plt.ylim(0.45, 1.0) + plt.yticks(fontsize=12) + plt.yticks(fontsize=12) + + return plt.savefig(output, dpi=600) #save plot + + + +def main(): + #load results and format for plotting + file = 'Results/Trametinib/OWL_Tram' + dp_res = 
ResultParser(str(file) + '_DP.json') + sp_res = ResultParser(str(file) + '_SP.json') + cn_res = ResultParser(str(file) + '_CN.json') + j_res = ResultParser(str(file) + '_J.json') + ss_res = ResultParser(str(file) + '_SS.json') + lhn_res = ResultParser(str(file) + '_LHN.json') + aa_res = ResultParser(str(file) + '_AA.json') + ra_res = ResultParser(str(file) + '_RA.json') + k_res = ResultParser(str(file) + '_K.json') + # s_res = ResultParser(str(file) + '_SR.json') + rpr_res = ResultParser(str(file) + '_RPR.json') + # hrg_resa = ResultParser('Results/Trematinib/nice_trem_HRG_AUC.json') + # hrg_resp = ResultParser2('Results/Trematinib/nice_trem_HRG_PREC.json') + + #plot results + data = [dp_res[0], cn_res[0], sp_res[0], j_res[0], ss_res[0], lhn_res[0], aa_res[0], ra_res[0], k_res[0], rpr_res[0]] + steps = [0.05, 0.1, 0.3, 0.5, 0.7, 0.9, 0.95] + ResultPlotter(steps, data, 'Fraction of Observed Edges', 'Average AUC', '', 'Results/Trametinib/AUCplot_OWL.png') + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/NetworkStatistics.py b/NetworkStatistics.py new file mode 100644 index 0000000..29dcf99 --- /dev/null +++ b/NetworkStatistics.py @@ -0,0 +1,236 @@ +#################################################################### +# NetworkStatistics.py +# Purpose: script explores different characteristics of a newtork +# version 1.1.0 +# date: 07.21.2017 +#################################################################### + + +# import module/script dependencies +import networkx as nx +import numpy as np +from collections import Counter +import matplotlib.pyplot as plt +from pylab import show +from powerlaw import plot_pdf, Fit, pdf +import powerlaw + + + +# produce network statistics +# read in graphs +trem_owl = nx.read_gml('Network_Data/Trametinib_query_OWL_network.gml') +trem_nets = nx.read_gml('Network_Data/Trametinib_query_NETS_network.gml') +ddi_owl = nx.read_gml('Network_Data/DDI_reactome_query_OWL_network.gml') +ddi_nets=nx.read_gml('Network_Data/DDI_reactome_query_NETS_network.gml') + +len(list(nx.non_edges(ddi_nets.to_undirected()))) + + +count = set +for node in trem_nets.nodes(data=True): + + + + +nx.number_connected_components(ddi_owl.to_undirected()) + +## Descriptives + +for i in [ddi_nets, ddi_owl]: + print 'Clustering Coefficient' + print nx.average_clustering(i.to_undirected()) + + print 'Number of Connected Components' + print nx.number_connected_components(i.to_undirected()) + + print 'Average Network Closeness (Centrality)' + print sum(nx.closeness_centrality(i.to_undirected()).values())/len(nx.closeness_centrality(i.to_undirected()).values()) + + print 'degree heterogeneity' + print float(np.mean(map(lambda x: x ** 2, i.to_undirected().degree().values()))) / ( + (i.to_undirected().number_of_edges() * 2 /i.to_undirected().number_of_nodes()) ** 2) + + print 'Average number of neighbors' + print sum(nx.average_neighbor_degree(i.to_undirected()).values()) + + print 'Nodes' + print nx.number_of_nodes(i.to_undirected()) + + print 'edges' + print nx.number_of_edges(i.to_undirected()) + + print 'Density' + print nx.density(trem_nets.to_undirected()) + + print 'Number of cliques' + print len(nx.number_of_cliques(i.to_undirected())) + + print 'Average Degree' + print 2.0 * (len(i.to_undirected().edges())) / len(i.to_undirected().nodes()) + + print 'Average Degree Assortativity' + print nx.degree_assortativity_coefficient(i.to_undirected()) + +# things for only connected graphs +ddi = max(nx.connected_component_subgraphs(ddi_nets.to_undirected()), key=len) 
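+# Note: nx.diameter is only defined for connected graphs (NetworkX raises an error on a
+# disconnected graph), hence the largest connected component is extracted above before
+# computing it.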
+print 'Network Diameter' +nx.diameter(ddi.to_undirected()) + +print 'Shorest paths (sum)' +sum(nx.all_pairs_shortest_path_length(ddi_owl.to_undirected()).values()[0].values()) + +print 'Average shortest_path (Characteristic path length)' +nx.average_shortest_path_length(ddi_owl.to_undirected()) + +ddi_owl_nodes = ddi_owl.nodes() +## CCDF with power-law fit +trem_owl_freq = Counter(nx.degree(trem_owl.to_undirected()).values()).values() +trem_nets_freq = Counter(nx.degree(trem_nets.to_undirected()).values()).values() +ddi_owl_freq = np.sort(Counter(list(nx.degree(ddi_owl.to_undirected()).values())).values()) +ddi_nets_freq = np.sort(Counter(list(nx.degree(ddi_nets.to_undirected()).values())).values()) + + +nx.degree(trem_owl.to_undirected()).values() + +fit1 = powerlaw.Fit(nx.degree(trem_owl.to_undirected()).values(), discrete = True) +fit2 = powerlaw.Fit(nx.degree(trem_nets.to_undirected()).values(), discrete = True) + +fit1 = powerlaw.Fit(nx.degree(ddi_owl.to_undirected()).values(), discrete = True) +fit2 = powerlaw.Fit(nx.degree(ddi_nets.to_undirected()).values(), discrete = True) + +fig2 = fit1.plot_ccdf(color='mediumaquamarine', linewidth=2) +fit1.power_law.plot_ccdf(color='mediumaquamarine', linestyle='--', ax=fig2) +fit2.plot_ccdf(color='m', linewidth=2, ax=fig2) +fit2.power_law.plot_ccdf(color='m', linestyle='--', ax=fig2) + +fig2.legend(['OWL CCDF', + 'OWL CCDF Power-Law Fit', + 'OWL-NETS CCDF', + 'OWL-NETS CCDF Power-Law Fit'],loc='lower left', fontsize=14, numpoints=1) +# +# from mpl_toolkits.axes_grid.inset_locator import inset_axes +# fig2in = inset_axes(fig2, width="30%", height="30%", loc=3) +# fig2in.hist(nx.degree(ddi_owl).values(), bins = 5000, normed = True, color='deepskyblue') +# fig2in.set_xlim(0, 100) +# +# fig2in.hist(nx.degree(ddi_nets).values(), bins = 50, normed = True, color='red', alpha = 0.65) +# # fig2in.set_xlim(0, 0.1) +# fig2in.set_xticks([]) +# fig2in.set_yticks([]) + +fig2.patch.set_facecolor('whitesmoke') +fig2.set_xlabel(r'Degree $k$', fontsize=16) +fig2.set_ylabel(r'$P(K \geq k)$', fontsize=16) +fig2.grid(True, color='black') +fig2.tick_params(axis='both', which='major', labelsize=14) +fig2.tick_params(axis='both', which='minor', labelsize=14) +# plt.ylim(0.01, 1.01) +# plt.xlim(0.98, 31622) +plt.tight_layout() +plt.savefig("Results/DDI_reactome/DDI_degreeDistribution.png", dpi=600) # save plot + + + +# create degree distribution plots +plot_res = [] +for r in [trem_owl, trem_nets, ddi_owl, ddi_nets]: + # #get counts of node occurrence in target list + degree = Counter(list(nx.degree(r.to_undirected()).values())).values() # for r = 1 + + # update indegree to include a value of zero for nodes not in target list + pdf = np.sort(degree) + cdf = np.arange(len(pdf)) / float(len(pdf)) + ccdf = 1 - cdf + plot_res.append([pdf, ccdf]) + +# generate plot +plt1, = plt.loglog(plot_res[2][0], plot_res[2][1], '.', color='turquoise', markersize=12, alpha=0.75) +plt2, = plt.loglog(plot_res[0][0], plot_res[0][1], '.', color='turquoise', markersize=12, alpha=0.75) +plt3, = plt.loglog(plot_res[3][0], plot_res[3][1], '.', color='red', markersize=12, alpha=0.75) +plt4, = plt.loglog(plot_res[1][0], plot_res[1][1], '.', color='lime', markersize=12) + + +plt.legend([plt1, plt3], ['OWL Representation', 'OWL-NETS'], + loc='upper right', fontsize=12, numpoints=1) + +plt.legend([plt2, plt4], ['OWL Representation', 'OWL-NETS'], + loc='upper right', fontsize=12, numpoints=1) + +plt.xlabel(r'Degree $k$', fontsize=12) +plt.ylabel(r'$Pr(k)$', fontsize=12) +plt.grid(True, 
color='black') +plt.ylim(0.01, 1.01) +plt.xlim(0.98, 31622) +plt.tight_layout() +plt.savefig("Network_visualizations/Tram_degreeDistribution.png", dpi=600) # save plot +# save plot + + +## Betweenness + +def most_important(G): + ''' + returns a copy of G with the most important nodes according to the pagerank + ''' + ranking = nx.betweenness_centrality(G).items() + print ranking + + r = [x[1] for x in ranking] + m = sum(r) / len(r) # mean centrality + t = m * 3 #threshold + Gt = G.copy() + + # threshold, we keep only the nodes with 3 times the mean + for k, v in ranking: + if v < t: + Gt.remove_node(k) + + return Gt + +G = graph.to_undirected() +G=owl.to_undirected() +Gt = most_important(G) # trimming + +Gt.nodes(data = True) + +# create the layout +pos = nx.spring_layout(G) +# draw the nodes and the edges (all) +nx.draw_networkx_nodes(G,pos,node_color='b',alpha=0.3,node_size=50) +nx.draw_networkx_edges(G,pos,alpha=0.1) + +# draw the most important nodes with a different style +nx.draw_networkx_nodes(Gt,pos,node_color='r',alpha=0.4,node_size=200) + + +# also the labels this time +labels = nx.get_node_attributes(Gt, 'type') +nx.draw_networkx_labels(Gt,pos,labels,font_size=5,font_color='black') +plt.axis('off') +plt.tight_layout() +plt.savefig("Network_visualizations/Trem_betweeness_importance_NETS.png", dpi=1000) + +res = [] +count1 = 0 +count2 = 0 +count3 = 0 +count4 = 0 +for i in ddi.to_undirected().edges(data =True): + if i[2]['nice'] == ['drug1_ice', 'target_ice']: + count1 += 1 + + if i[2]['nice'] == ['pathway_target_ice', 'drug2_ice']: + count2 += 1 + + if i[2]['nice'] == ['target_ice', 'pathway_ice']: + count3 += 1 + + if i[2]['nice'] == ['pathway_ice', 'pathway_target_ice']: + count4 += 1 + + + + +if __name__ == '__main__': + main() \ No newline at end of file