From d77eb80f0a9f0659dda44c6f851cbb5ac24de5e3 Mon Sep 17 00:00:00 2001 From: jessicawwen Date: Wed, 1 Jan 2025 08:19:10 +0800 Subject: [PATCH] feat:try to add reverse_csr --- cpp_easygraph/classes/graph.cpp | 266 +++++++++ cpp_easygraph/classes/graph.h | 4 + .../functions/structural_holes/evaluation.cpp | 36 +- .../functions/structural_holes/constraint.cpp | 20 +- .../functions/structural_holes/constraint.cu | 517 ++++++++++-------- .../functions/structural_holes/constraint.cuh | 16 +- gpu_easygraph/gpu_easygraph.h | 11 +- 7 files changed, 599 insertions(+), 271 deletions(-) diff --git a/cpp_easygraph/classes/graph.cpp b/cpp_easygraph/classes/graph.cpp index 122469bc..5d127714 100644 --- a/cpp_easygraph/classes/graph.cpp +++ b/cpp_easygraph/classes/graph.cpp @@ -9,6 +9,7 @@ Graph::Graph() { this->dirty_adj = true; this->linkgraph_dirty = true; this->csr_graph = nullptr; + this->in_csr_graph = nullptr; this->node_to_id = py::dict(); this->id_to_node = py::dict(); this->graph = py::dict(); @@ -715,6 +716,81 @@ void Graph::drop_cache() { csr_graph = nullptr; } +std::shared_ptr Graph::gen_CSR_fast(const std::string& weight_key) { + // Step 2: 收集所有节点 ID 并排序 + std::vector node_vec; + for (const auto& item : this->node_to_id) { + node_vec.push_back(item.second.cast()); + } + std::sort(node_vec.begin(), node_vec.end()); + + // Step 3: 遍历邻接表,构造 edges + int num_nodes = node_vec.size(); + std::vector rowPtrOut(num_nodes + 1, 0); + std::vector rowPtrIn(num_nodes + 1, 0); + std::vector> edges; + + for (const auto& adj_item : this->adj) { + node_t u_id = adj_item.first; + for (const auto& neighbor : adj_item.second) { + node_t v_id = neighbor.first; + const auto& edge_attrs = neighbor.second; + + double weight = 1.0; // 默认权重 + if (edge_attrs.find(weight_key) != edge_attrs.end()) { + weight = edge_attrs.at(weight_key); + } + + edges.emplace_back(u_id, v_id, weight); + rowPtrOut[u_id + 1]++; + rowPtrIn[v_id + 1]++; + } + } + + // Step 4: 计算 rowPtr 的前缀和 + for (int i = 1; i <= num_nodes; i++) { + rowPtrOut[i] += rowPtrOut[i - 1]; + rowPtrIn[i] += rowPtrIn[i - 1]; + } + + // Step 5: 填充 colIdx 和 val + std::vector colIdxOut(edges.size()); + std::vector valOut(edges.size()); + std::vector colIdxIn(edges.size()); + std::vector valIn(edges.size()); + std::vector offsetOut(num_nodes, 0); + std::vector offsetIn(num_nodes, 0); + + for (const auto& [u_id, v_id, weight] : edges) { + int posOut = rowPtrOut[u_id] + offsetOut[u_id]; + colIdxOut[posOut] = v_id; + valOut[posOut] = weight; + offsetOut[u_id]++; + + int posIn = rowPtrIn[v_id] + offsetIn[v_id]; + colIdxIn[posIn] = u_id; + valIn[posIn] = weight; + offsetIn[v_id]++; + } + + // Step 6: 保存到 CSRGraph 对象 + this->csr_graph = std::make_shared(); + auto& csr = *this->csr_graph; + csr.nodes = node_vec; + csr.V = rowPtrOut; + csr.E = colIdxOut; + csr.unweighted_W = valOut; + + this->in_csr_graph = std::make_shared(); + auto& in_csr = *this->in_csr_graph; + in_csr.nodes = node_vec; + in_csr.V = rowPtrIn; + in_csr.E = colIdxIn; + in_csr.unweighted_W = valIn; + + return this->csr_graph; +} + std::shared_ptr Graph::gen_CSR(const std::string& weight) { if (csr_graph != nullptr) { if (csr_graph->W_map.find(weight) == csr_graph->W_map.end()) { @@ -849,6 +925,196 @@ std::shared_ptr> Graph::gen_CSR_sources(const py::object& py_so return sources; } +std::shared_ptr Graph::gen_reverse_CSR(const std::string& weight) { + // 如果 in_csr_graph 已经存在,检查有没有对应 weight 的权重 + if (in_csr_graph != nullptr) { + // 如果没有该字段的权重,则需要补一份 + if (in_csr_graph->W_map.find(weight) == in_csr_graph->W_map.end()) { + // 构造一个新的权重向量 + auto W = std::make_shared>(); + W->reserve(in_csr_graph->E.size()); + + // 反向 CSR 的节点顺序、V、E 在 in_csr_graph 中已经确定 + // 这里只需要根据 E 来补上权重 + // E[i] 表示从第 u = in_csr_graph->idx2node[...] 个节点 + // 指向 E[i] 对应的节点,这里是反向边,所以在原图中是 E[i] -> u + // + // 我们可以通过 V 来找出每个节点对应的边区间 + const auto& V = in_csr_graph->V; + const auto& E = in_csr_graph->E; + const auto& idx2node = in_csr_graph->nodes; // 排过序的节点列表 + const auto& node2idx = in_csr_graph->node2idx; // 节点到索引 + + // 遍历每个节点在 in_csr_graph 的所有反向边 + for (size_t u_idx = 0; u_idx < idx2node.size(); ++u_idx) { + int start = V[u_idx]; + int end = V[u_idx + 1]; + node_t u_node = idx2node[u_idx]; // 反向图中的“目标”节点 + + for (int edge_pos = start; edge_pos < end; ++edge_pos) { + int v_idx = E[edge_pos]; + // v_idx 对应的是原图中的“源”节点 + node_t v_node = idx2node[v_idx]; + + // 在原图中 v_node -> u_node + // 查找它的 edge_attr + const auto& v_adjs = adj.find(v_node)->second; + const auto& edge_attr = v_adjs.find(u_node)->second; + + auto edge_it = edge_attr.find(weight); + double w = (edge_it != edge_attr.end()) ? edge_it->second : 1.0; + + W->push_back(w); + } + } + + // 存到 in_csr_graph->W_map + in_csr_graph->W_map[weight] = W; + } + } + else { + // 如果从未构造过 in_csr_graph 或者图数据修改过,就需要重新构造 + in_csr_graph = std::make_shared(); + + // 1. 收集并排序所有节点 + auto& nodes = in_csr_graph->nodes; // 用于存储排好序的节点 + nodes.reserve(node.size()); + for (auto it = node.begin(); it != node.end(); ++it) { + nodes.push_back(it->first); + } + std::sort(nodes.begin(), nodes.end()); + + // 2. 建立 node2idx + auto& node2idx = in_csr_graph->node2idx; + for (int i = 0; i < (int)nodes.size(); ++i) { + node2idx[nodes[i]] = i; + } + + // 3. 构造【反向】邻接表(临时结构),以便后续统一写入 V、E、W + // 这里我们需要把 adj 中的 (src -> dst) 翻转成 (dst -> src) + std::unordered_map> rev_adj; + for (auto& kv : adj) { + node_t src = kv.first; + for (const auto& kv : adj) { + node_t src = kv.first; + for (const auto& kv2 : kv.second) { + node_t dst = kv2.first; + // kv2.second 是 edge_attr_dict_factory 类型 + rev_adj[dst][src] = kv2.second; + } + } + } + + // 4. 分配 V、E、W 并写入 + auto& V = in_csr_graph->V; + auto& E = in_csr_graph->E; + auto W = std::make_shared>(); + + V.reserve(nodes.size() + 1); + + // 对每个节点(按排序后顺序)填充 V, E, W + for (int i = 0; i < (int)nodes.size(); ++i) { + node_t cur_node = nodes[i]; + // 记录当前 E 的起始位置 + V.push_back((int)E.size()); + + // 查看反向邻接 rev_adj[cur_node],即有哪些节点指向 cur_node + auto rev_it = rev_adj.find(cur_node); + if (rev_it != rev_adj.end()) { + const auto& neighbors = rev_it->second; // map + for (auto& nb_kv : neighbors) { + node_t src_node = nb_kv.first; + const auto& edge_attr = nb_kv.second; + + // src_node -> cur_node 在原图中 + // 在反向 CSR 中,我们在“cur_node”这行添加一个从 cur_node -> src_node 的记录 + E.push_back(node2idx[src_node]); + + // 权重处理 + auto edge_it = edge_attr.find(weight); + double w_val = (edge_it != edge_attr.end()) ? edge_it->second : 1.0; + W->push_back(w_val); + } + } + } + + // 最后一个节点的“结束位置”,相当于 E.size() + V.push_back((int)E.size()); + + // 存储权重向量 + in_csr_graph->W_map[weight] = W; + } + + return in_csr_graph; +} + +// 生成无权反向 CSR +std::shared_ptr Graph::gen_reverse_CSR() { + // 如果已经有 in_csr_graph,则只需要检查 unweighted_W 是否跟 E.size() 对齐 + if (in_csr_graph != nullptr) { + if (in_csr_graph->unweighted_W.size() != in_csr_graph->E.size()) { + in_csr_graph->unweighted_W = std::vector(in_csr_graph->E.size(), 1.0); + } + } + else { + // 和上面同理,重新构造 + in_csr_graph = std::make_shared(); + + // 1. 收集并排序节点 + auto& nodes = in_csr_graph->nodes; + nodes.reserve(node.size()); + for (auto it = node.begin(); it != node.end(); ++it) { + nodes.push_back(it->first); + } + std::sort(nodes.begin(), nodes.end()); + + // 2. 建立 node2idx + auto& node2idx = in_csr_graph->node2idx; + for (int i = 0; i < (int)nodes.size(); ++i) { + node2idx[nodes[i]] = i; + } + + // 3. 构造反向邻接表 + std::unordered_map> rev_adj; + for (auto& kv : adj) { + node_t src = kv.first; + for (const auto& kv : adj) { + node_t src = kv.first; + for (const auto& kv2 : kv.second) { + node_t dst = kv2.first; + // kv2.second 是 edge_attr_dict_factory 类型 + rev_adj[dst][src] = kv2.second; + } + } + } + + // 4. 写入 V、E + auto& V = in_csr_graph->V; + auto& E = in_csr_graph->E; + V.reserve(nodes.size() + 1); + + for (int i = 0; i < (int)nodes.size(); ++i) { + node_t cur_node = nodes[i]; + V.push_back((int)E.size()); + + auto rev_it = rev_adj.find(cur_node); + if (rev_it != rev_adj.end()) { + const auto& neighbors = rev_it->second; + for (auto& nb_kv : neighbors) { + node_t src_node = nb_kv.first; + E.push_back(node2idx[src_node]); + } + } + } + V.push_back((int)E.size()); + + // 无权时,统一填 1.0 + in_csr_graph->unweighted_W = std::vector(E.size(), 1.0); + } + + return in_csr_graph; +} + std::shared_ptr Graph::gen_COO() { if (coo_graph != nullptr) { if (coo_graph->unweighted_W.size() != coo_graph->row.size()) { diff --git a/cpp_easygraph/classes/graph.h b/cpp_easygraph/classes/graph.h index a10eceeb..8f451850 100644 --- a/cpp_easygraph/classes/graph.h +++ b/cpp_easygraph/classes/graph.h @@ -11,6 +11,7 @@ struct Graph { adj_dict_factory adj; Graph_L linkgraph_structure; std::shared_ptr csr_graph; + std::shared_ptr in_csr_graph; py::kwargs node_to_id, id_to_node, graph; node_t id; bool dirty_nodes, dirty_adj, linkgraph_dirty; @@ -32,10 +33,13 @@ struct Graph { std::shared_ptr gen_CSR(const std::string& weight); std::shared_ptr gen_CSR(); + std::shared_ptr gen_reverse_CSR(const std::string& weight); + std::shared_ptr gen_reverse_CSR(); std::shared_ptr> gen_CSR_sources(const py::object& py_sources); std::shared_ptr gen_COO(); std::shared_ptr gen_COO(const std::string& weight); std::shared_ptr transfer_csr_to_coo(const std::shared_ptr& csr_graph); + std::shared_ptr gen_CSR_fast(const std::string& weight_key); }; py::object Graph__init__(py::args args, py::kwargs kwargs); diff --git a/cpp_easygraph/functions/structural_holes/evaluation.cpp b/cpp_easygraph/functions/structural_holes/evaluation.cpp index 9118349b..427e04a7 100644 --- a/cpp_easygraph/functions/structural_holes/evaluation.cpp +++ b/cpp_easygraph/functions/structural_holes/evaluation.cpp @@ -212,22 +212,27 @@ static py::object invoke_gpu_constraint(py::object G, py::object nodes, py::obje Graph& G_ = G.cast(); if (weight.is_none()) { G_.gen_CSR(); + G_.gen_reverse_CSR(); } else { G_.gen_CSR(weight_to_string(weight)); - } - auto csr_graph = G_.csr_graph; - auto coo_graph = G_.transfer_csr_to_coo(csr_graph); - std::vector& V = csr_graph->V; - std::vector& E = csr_graph->E; - std::vector& row = coo_graph->row; - std::vector& col = coo_graph->col; - std::vector *W_p = weight.is_none() ? &(coo_graph->unweighted_W) - : coo_graph->W_map.find(weight_to_string(weight))->second.get(); - std::unordered_map& node2idx = coo_graph->node2idx; - int num_nodes = coo_graph->node2idx.size(); + G_.gen_reverse_CSR(weight_to_string(weight)); + } + // G_.gen_CSR_fast(weight_to_string(weight)); + auto out_csr_graph = G_.csr_graph; + auto in_csr_graph = G_.in_csr_graph; + std::vector& rowPtrOut = out_csr_graph->V; + std::vector& colIdxOut = out_csr_graph->E; + std::vector *valOut = weight.is_none() ? &(out_csr_graph->unweighted_W) + : out_csr_graph->W_map.find(weight_to_string(weight))->second.get(); + std::vector& rowPtrIn = in_csr_graph->V; + + std::vector& colIdxIn = in_csr_graph->E; + std::vector *valIn = weight.is_none() ? &(in_csr_graph->unweighted_W) + : in_csr_graph->W_map.find(weight_to_string(weight))->second.get(); + std::unordered_map& node2idx = out_csr_graph->node2idx; + int num_nodes = out_csr_graph->node2idx.size(); bool is_directed = G.attr("is_directed")().cast(); std::vector constraint_results(num_nodes, 0.0); - std::vector node_mask(num_nodes, 0); py::list nodes_list; if (!nodes.is_none()) { @@ -240,8 +245,11 @@ static py::object invoke_gpu_constraint(py::object G, py::object nodes, py::obje nodes_list = py::list(G.attr("nodes")); std::fill(node_mask.begin(), node_mask.end(), 1); } - - int gpu_r = gpu_easygraph::constraint(V, E, row, col, num_nodes, *W_p, is_directed, node_mask, constraint_results); + + int gpu_r = gpu_easygraph::constraint(num_nodes, + rowPtrOut, colIdxOut, *valOut, + rowPtrIn, colIdxIn, *valIn, + is_directed, node_mask, constraint_results); if (gpu_r != gpu_easygraph::EG_GPU_SUCC) { py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r)); } diff --git a/gpu_easygraph/functions/structural_holes/constraint.cpp b/gpu_easygraph/functions/structural_holes/constraint.cpp index 834c37c2..678ea9c4 100644 --- a/gpu_easygraph/functions/structural_holes/constraint.cpp +++ b/gpu_easygraph/functions/structural_holes/constraint.cpp @@ -10,20 +10,22 @@ namespace gpu_easygraph { using std::vector; int constraint( - _IN_ const vector& V, - _IN_ const vector& E, - _IN_ const vector& row, - _IN_ const vector& col, _IN_ int num_nodes, - _IN_ const vector& W, + _IN_ const std::vector& rowPtrOut, + _IN_ const std::vector& colIdxOut, + _IN_ const std::vector& valOut, + _IN_ const std::vector& rowPtrIn, + _IN_ const std::vector& colIdxIn, + _IN_ const std::vector& valIn, _IN_ bool is_directed, _IN_ vector& node_mask, - _OUT_ vector& constraint + _OUT_ vector& constraints ) { - int num_edges = row.size(); + int len_rowPtrOut = rowPtrOut.size(); + int len_colIdxOut = colIdxOut.size(); - constraint = vector(num_nodes); - int r = cuda_constraint(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), constraint.data()); + constraints = vector(num_nodes); + int r = cuda_constraint(num_nodes, len_rowPtrOut, len_colIdxOut, rowPtrOut.data(), colIdxOut.data(), valOut.data(), rowPtrIn.data(), colIdxIn.data(), valIn.data(), is_directed, node_mask.data(), constraints.data()); return r; } diff --git a/gpu_easygraph/functions/structural_holes/constraint.cu b/gpu_easygraph/functions/structural_holes/constraint.cu index 5124d6ca..0f22115a 100644 --- a/gpu_easygraph/functions/structural_holes/constraint.cu +++ b/gpu_easygraph/functions/structural_holes/constraint.cu @@ -3,252 +3,257 @@ #include #include "common.h" -#define NODES_PER_BLOCK 1 +// #define NODES_PER_BLOCK 1 namespace gpu_easygraph { enum norm_t { SUM = 0, MAX = 1 }; -static __device__ double mutual_weight( - const int* V, - const int* E, - const double* W, +// static __device__ double mutual_weight( +// const int* V, +// const int* E, +// const double* W, +// int u, +// int v +// ) { +// double a_uv = 0.0; +// for (int i = V[u]; i < V[u+1]; i++) { +// if (E[i] == v) { +// a_uv = W[i]; +// break; +// } +// } +// return a_uv; +// } + +// static __device__ double normalized_mutual_weight( +// const int* V, +// const int* E, +// const double* W, +// int u, +// int v, +// norm_t norm +// ) { +// double weight_uv = mutual_weight(V, E, W, u, v); + +// double scale = 0.0; +// if (norm == SUM) { +// for (int i = V[u]; i < V[u+1]; i++) { +// int neighbor = E[i]; +// double weight_uw = mutual_weight(V, E, W, u, neighbor); +// scale += weight_uw; +// } +// } else if (norm == MAX) { +// for (int i = V[u]; i < V[u+1]; i++) { +// int neighbor = E[i]; +// double weight_uw = mutual_weight(V, E, W, u, neighbor); +// scale = fmax(scale,weight_uw); +// } +// } +// return (scale==0.0) ? 0.0 : (weight_uv / scale); +// } + +// static __device__ double local_constraint( +// const int* V, +// const int* E, +// const double* W, +// int u, +// int v +// ) { +// double direct = normalized_mutual_weight(V,E,W,u,v,SUM); +// double indirect = 0.0; +// for (int i = V[u]; i < V[u+1]; i++) { +// int neighbor = E[i]; +// double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM); +// double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM); +// indirect += norm_uw * norm_wv; +// } +// double local_constraint_of_uv = (direct + indirect) * (direct + indirect); +// return local_constraint_of_uv; +// } + +// __global__ void calculate_constraints( +// const int* __restrict__ V, +// const int* __restrict__ E, +// const double* __restrict__ W, +// const int num_nodes, +// const int* __restrict__ node_mask, +// double* __restrict__ constraint_results +// ) { +// int start_node = blockIdx.x * NODES_PER_BLOCK; +// int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); + +// for (int v = start_node; v < end_node; ++v) { +// if (!node_mask[v]) continue; + +// double constraint_of_v = 0.0; +// bool is_nan = true; + +// __shared__ double shared_constraint[256]; +// double local_sum = 0.0; + +// for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { +// is_nan = false; +// int neighbor = E[i]; +// local_sum += local_constraint(V, E, W, v, neighbor); +// } + +// shared_constraint[threadIdx.x] = local_sum; +// __syncthreads(); + +// for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { +// if (threadIdx.x < offset) { +// shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset]; +// } +// __syncthreads(); +// } + +// if (threadIdx.x == 0) { +// constraint_results[v] = (is_nan) ? NAN : shared_constraint[0]; +// } +// } +// } + +__device__ double mutual_weight( + const int* rowPtrOut, + const int* colIdxOut, + const double* valOut, + const int* rowPtrIn, + const int* colIdxIn, + const double* valIn, int u, int v -) { - double a_uv = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - if (E[i] == v) { - a_uv = W[i]; +) +{ + double w_uv = 0.0, w_vu = 0.0; + for (int i = rowPtrOut[u]; i < rowPtrOut[u+1]; i++) { + if (colIdxOut[i] == v) { + w_uv = valOut[i]; break; } } - return a_uv; -} - -static __device__ double normalized_mutual_weight( - const int* V, - const int* E, - const double* W, - int u, - int v, - norm_t norm -) { - double weight_uv = mutual_weight(V, E, W, u, v); - - double scale = 0.0; - if (norm == SUM) { - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = mutual_weight(V, E, W, u, neighbor); - scale += weight_uw; - } - } else if (norm == MAX) { - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = mutual_weight(V, E, W, u, neighbor); - scale = fmax(scale,weight_uw); + for (int i = rowPtrIn[u]; i < rowPtrIn[u+1]; i++) { + if (colIdxIn[i] == v) { + w_vu = valIn[i]; + break; } } - return (scale==0.0) ? 0.0 : (weight_uv / scale); -} - -static __device__ double local_constraint( - const int* V, - const int* E, - const double* W, - int u, - int v -) { - double direct = normalized_mutual_weight(V,E,W,u,v,SUM); - double indirect = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM); - double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM); - indirect += norm_uw * norm_wv; - } - double local_constraint_of_uv = (direct + indirect) * (direct + indirect); - return local_constraint_of_uv; + // printf("u=%d, v=%d, w_uv=%f, w_vu=%f\n", u, v, w_uv, w_vu); + return w_uv + w_vu; } -__global__ void calculate_constraints( - const int* __restrict__ V, - const int* __restrict__ E, - const double* __restrict__ W, - const int num_nodes, - const int* __restrict__ node_mask, - double* __restrict__ constraint_results -) { - int start_node = blockIdx.x * NODES_PER_BLOCK; - int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); - - for (int v = start_node; v < end_node; ++v) { - if (!node_mask[v]) continue; - - double constraint_of_v = 0.0; - bool is_nan = true; - - __shared__ double shared_constraint[256]; - double local_sum = 0.0; - - for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { - is_nan = false; - int neighbor = E[i]; - local_sum += local_constraint(V, E, W, v, neighbor); - } +__global__ void compute_out_in_sum( + const int* rowPtrOut, + const double* valOut, + const int* rowPtrIn, + const double* valIn, + int num_nodes, + const int* node_mask, + double* d_sum +) +{ + int u = blockIdx.x * blockDim.x + threadIdx.x; + if (u >= num_nodes || !node_mask[u]) return; // 跳过未标记的节点 - shared_constraint[threadIdx.x] = local_sum; - __syncthreads(); + double sum_val = 0.0; - for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { - if (threadIdx.x < offset) { - shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset]; - } - __syncthreads(); - } + for (int i = rowPtrOut[u]; i < rowPtrOut[u + 1]; i++) { + sum_val += valOut[i]; + } - if (threadIdx.x == 0) { - constraint_results[v] = (is_nan) ? NAN : shared_constraint[0]; - } + for (int i = rowPtrIn[u]; i < rowPtrIn[u + 1]; i++) { + sum_val += valIn[i]; } + d_sum[u] = sum_val; + // printf("Node %d: sum_val = %f\n", u, sum_val); } -static __device__ double directed_mutual_weight( - const int* V, - const int* E, - const double* W, +__device__ double local_constraint( + const int* rowPtrOut, + const int* colIdxOut, + const double* valOut, + const int* rowPtrIn, + const int* colIdxIn, + const double* valIn, + const double* d_sum, int u, int v ) { - double a_uv = 0.0, a_vu = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - if (E[i] == v) { - a_uv = W[i]; - break; - } + double weight_uv = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, u, v); + double sum_u = 0.0; + for (int i = rowPtrOut[u]; i < rowPtrOut[u+1]; i++) { + sum_u += valOut[i]; } - for (int i = V[v]; i < V[v+1]; i++) { - if (E[i] == u) { - a_vu = W[i]; - break; - } + for (int i = rowPtrIn[u]; i < rowPtrIn[u+1]; i++) { + sum_u += valIn[i]; } - return a_uv + a_vu; -} + double direct = (sum_u == 0.0) ? 0.0 : (weight_uv / sum_u); -static __device__ double directed_normalized_mutual_weight( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - int num_edges, - int u, - int v, - norm_t norm -) { - double weight_uv = directed_mutual_weight(V, E, W, u, v); - - double scale = 0.0; - if(norm==SUM){ - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); - scale += weight_uw; - } + double indirect = 0.0; + double scale_u = d_sum[u]; + for (int i = rowPtrOut[u]; i < rowPtrOut[u+1]; i++) { + int nbr = colIdxOut[i]; + double w_uw = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, u, nbr); + double w_wv = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, nbr, v); - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int neighbor = row[i]; - double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); - scale += weight_wu; - } - } - }else if(norm==MAX){ - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); - scale = fmax(scale,weight_uw); - } + // double scale_u = d_sum[u]; // u的分母 + double scale_nbr = d_sum[nbr]; // neighbor的分母 - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int neighbor = row[i]; - double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); - scale = fmax(scale,weight_wu); - } - } - } - return (scale==0.0) ? 0.0 : (weight_uv / scale); -} + double norm_uw = (scale_u == 0.0) ? 0.0 : (w_uw / scale_u); + double norm_wv = (scale_nbr == 0.0) ? 0.0 : (w_wv / scale_nbr); -static __device__ double directed_local_constraint( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - int num_edges, - int u, - int v -) { - double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM); - double indirect = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); - double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); indirect += norm_uw * norm_wv; } + for (int i = rowPtrIn[u]; i < rowPtrIn[u+1]; i++) { + int nbr = colIdxIn[i]; + double w_uw = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, u, nbr); + double w_wv = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, nbr, v); - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int neighbor = row[i]; - double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); - double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); - indirect += norm_uw * norm_wv; - } + + double scale_nbr = d_sum[nbr]; + + double norm_uw = (scale_u == 0.0) ? 0.0 : (w_uw / scale_u); + double norm_wv = (scale_nbr == 0.0) ? 0.0 : (w_wv / scale_nbr); + + indirect += norm_uw * norm_wv; } - double local_constraint_of_uv = (direct + indirect) * (direct + indirect); - return local_constraint_of_uv; + double result = (direct + indirect) * (direct + indirect); + printf("u=%d, v=%d, direct=%f, indirect=%f\n", u, v, direct, indirect); + return result; } -__global__ void directed_calculate_constraints( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, +__global__ void calculate_constraints( int num_nodes, - int num_edges, - int* node_mask, + int NODES_PER_BLOCK, + const int* rowPtrOut, + const int* colIdxOut, + const double* valOut, + const int* rowPtrIn, + const int* colIdxIn, + const double* valIn, + const double* d_sum, + const int* node_mask, double* constraint_results -) { +){ int start_node = blockIdx.x * NODES_PER_BLOCK; int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); for (int v = start_node; v < end_node; ++v) { if (!node_mask[v]) continue; - - double constraint_of_v = 0.0; bool is_nan = true; - + // bool is_nan = false; __shared__ double shared_constraint[256]; double local_sum = 0.0; - for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { + for (int i = rowPtrOut[v] + threadIdx.x; i < rowPtrOut[v + 1]; i += blockDim.x) { is_nan = false; - int neighbor = E[i]; - local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor); + int neighbor = colIdxOut[i]; + local_sum += local_constraint(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, d_sum, v, neighbor); } - for (int i = threadIdx.x; i < num_edges; i += blockDim.x) { - if (col[i] == v) { - // is_nan = false; - int neighbor = row[i]; - local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor); - } + for (int i = rowPtrIn[v] + threadIdx.x; i < rowPtrIn[v + 1]; i += blockDim.x) { + int neighbor = colIdxIn[i]; + local_sum += local_constraint(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, d_sum, v, neighbor); } shared_constraint[threadIdx.x] = local_sum; @@ -267,63 +272,103 @@ __global__ void directed_calculate_constraints( } } - int cuda_constraint( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, _IN_ int num_nodes, - _IN_ int num_edges, + _IN_ int len_rowPtrOut, + _IN_ int len_colIdxOut, + _IN_ const int* rowPtrOut, + _IN_ const int* colIdxOut, + _IN_ const double* valOut, + _IN_ const int* rowPtrIn, + _IN_ const int* colIdxIn, + _IN_ const double* valIn, _IN_ bool is_directed, _IN_ int* node_mask, - _OUT_ double* constraint_results + _OUT_ double* constraints ) { int cuda_ret = cudaSuccess; int EG_ret = EG_GPU_SUCC; - int* d_V; - int* d_E; - int* d_row; - int* d_col; - double* d_W; - int* d_node_mask; - double* d_constraint_results; - int block_size = 256; - int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; - - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); + int* d_rowPtrOut = nullptr; + int* d_colIdxOut = nullptr; + double* d_valOut = nullptr; + int* d_rowPtrIn = nullptr; + int* d_colIdxIn = nullptr; + double* d_valIn = nullptr; + int* d_node_mask = nullptr; + double* d_constraints = nullptr; + double* d_sum = nullptr; + + int device; + cudaGetDevice(&device); + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device); + + int maxThreadsPerBlock = prop.maxThreadsPerBlock; + int maxBlocks = prop.maxGridSize[0]; + int NODES_PER_BLOCK =128; + int blockSize = (num_nodes < maxThreadsPerBlock) ? num_nodes : 256; + int numBlocks = (num_nodes + blockSize - 1) / blockSize; + + if (numBlocks > maxBlocks) { + NODES_PER_BLOCK = numBlocks / maxBlocks + 1; + numBlocks = (num_nodes + NODES_PER_BLOCK * blockSize - 1) / (NODES_PER_BLOCK * blockSize); + } + + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_rowPtrOut, len_rowPtrOut * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_colIdxOut, len_colIdxOut * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_valOut, len_colIdxOut * sizeof(double))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_rowPtrIn, len_rowPtrOut * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_colIdxIn, len_colIdxOut * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_valIn, len_colIdxOut * sizeof(double))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraints, num_nodes * sizeof(double))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_sum, num_nodes * sizeof(double))); EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraint_results, num_nodes * sizeof(double))); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_rowPtrOut, rowPtrOut, len_rowPtrOut * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_colIdxOut, colIdxOut, len_colIdxOut * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_valOut, valOut, len_colIdxOut * sizeof(double), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_rowPtrIn, rowPtrIn, len_rowPtrOut * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_colIdxIn, colIdxIn, len_colIdxOut * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_valIn, valIn, len_colIdxOut * sizeof(double), cudaMemcpyHostToDevice)); EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); + + if(is_directed){ - directed_calculate_constraints<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_constraint_results); + compute_out_in_sum<<>>(d_rowPtrOut, d_valOut, d_rowPtrIn, d_valIn, num_nodes, d_node_mask, d_sum); + cudaDeviceSynchronize(); + + calculate_constraints<<>>( + num_nodes, + NODES_PER_BLOCK, + d_rowPtrOut, + d_colIdxOut, + d_valOut, + d_rowPtrIn, + d_colIdxIn, + d_valIn, + d_sum, + d_node_mask, + d_constraints + ); + cudaDeviceSynchronize(); + // constraints.resize(num_nodes); }else{ - calculate_constraints<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results); + // calculate_constraints<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results); } - - EXIT_IF_CUDA_FAILED(cudaMemcpy(constraint_results, d_constraint_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(constraints, d_constraints, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); exit: - cudaFree(d_V); - cudaFree(d_E); - cudaFree(d_row); - cudaFree(d_col); - cudaFree(d_W); + cudaFree(d_rowPtrOut); + cudaFree(d_colIdxOut); + cudaFree(d_valOut); + cudaFree(d_rowPtrIn); + cudaFree(d_colIdxIn); + cudaFree(d_valIn); + cudaFree(d_sum); + cudaFree(d_constraints); cudaFree(d_node_mask); - cudaFree(d_constraint_results); if (cuda_ret != cudaSuccess) { switch (cuda_ret) { case cudaErrorMemoryAllocation: diff --git a/gpu_easygraph/functions/structural_holes/constraint.cuh b/gpu_easygraph/functions/structural_holes/constraint.cuh index 7086e44a..844f090b 100644 --- a/gpu_easygraph/functions/structural_holes/constraint.cuh +++ b/gpu_easygraph/functions/structural_holes/constraint.cuh @@ -5,16 +5,18 @@ namespace gpu_easygraph { int cuda_constraint( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, _IN_ int num_nodes, - _IN_ int num_edges, + _IN_ int len_rowPtrOut, + _IN_ int len_colIdxOut, + _IN_ const int* rowPtrOut, + _IN_ const int* colIdxOut, + _IN_ const double* valOut, + _IN_ const int* rowPtrIn, + _IN_ const int* colIdxIn, + _IN_ const double* valIn, _IN_ bool is_directed, _IN_ int* node_mask, - _OUT_ double* constraint_results + _OUT_ double* constraints ); } // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/gpu_easygraph.h b/gpu_easygraph/gpu_easygraph.h index 91c32973..7b66edec 100644 --- a/gpu_easygraph/gpu_easygraph.h +++ b/gpu_easygraph/gpu_easygraph.h @@ -58,12 +58,13 @@ int pagerank( int constraint( - const std::vector& V, - const std::vector& E, - const std::vector& row, - const std::vector& col, int num_nodes, - const std::vector& W, + const std::vector& rowPtrOut, + const std::vector& colIdxOut, + const std::vector& valOut, + const std::vector& rowPtrIn, + const std::vector& colIdxIn, + const std::vector& valIn, bool is_directed, std::vector& node_mask, std::vector& constraint