diff --git a/cpp_easygraph/functions/structural_holes/evaluation.cpp b/cpp_easygraph/functions/structural_holes/evaluation.cpp index 418a0c01..354b689a 100644 --- a/cpp_easygraph/functions/structural_holes/evaluation.cpp +++ b/cpp_easygraph/functions/structural_holes/evaluation.cpp @@ -38,7 +38,6 @@ weight_t mutual_weight(Graph& G, node_t u, node_t v, std::string weight) { return a_uv + a_vu; } - weight_t directed_mutual_weight(DiGraph& G, node_t u, node_t v, std::string weight) { weight_t a_uv = 0, a_vu = 0; if (G.adj.count(u) && G.adj[u].count(v)) { @@ -173,33 +172,40 @@ std::pair directed_compute_constraint_of_v(DiGraph& G, node_t py::object invoke_cpp_constraint(py::object G, py::object nodes, py::object weight) { std::string weight_key = weight_to_string(weight); rec_type sum_nmw_rec, local_constraint_rec; + if (nodes.is_none()) { nodes = G.attr("nodes"); } + py::list nodes_list = py::list(nodes); - py::list constraint_results = py::list(); int nodes_list_len = py::len(nodes_list); - if(G.attr("is_directed")().cast()){ + std::vector constraint_results(nodes_list_len, 0.0); + + if (G.attr("is_directed")().cast()) { DiGraph& G_ = G.cast(); for (int i = 0; i < nodes_list_len; i++) { py::object v = nodes_list[i]; node_t v_id = G_.node_to_id[v].cast(); - std::pair constraint_pair = directed_compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec); - py::tuple constraint_of_v = py::make_tuple(G_.id_to_node[py::cast(constraint_pair.first)], constraint_pair.second); - constraint_results.append(constraint_of_v); + std::pair constraint_pair = + directed_compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec); + constraint_results[i] = constraint_pair.second; } - }else{ + } else { Graph& G_ = G.cast(); for (int i = 0; i < nodes_list_len; i++) { py::object v = nodes_list[i]; node_t v_id = G_.node_to_id[v].cast(); - std::pair constraint_pair = compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec); - py::tuple constraint_of_v = py::make_tuple(G_.id_to_node[py::cast(constraint_pair.first)], constraint_pair.second); - constraint_results.append(constraint_of_v); + std::pair constraint_pair = + compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec); + constraint_results[i] = constraint_pair.second; } } - py::dict constraint = py::dict(constraint_results); - return constraint; + + std::reverse(constraint_results.begin(), constraint_results.end()); + + py::array::ShapeContainer ret_shape{nodes_list_len}; + py::array_t ret(ret_shape, constraint_results.data()); + return ret; } #ifdef EASYGRAPH_ENABLE_GPU @@ -242,19 +248,13 @@ static py::object invoke_gpu_constraint(py::object G, py::object nodes, py::obje py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r)); } - py::dict constraint_dict; - for (auto node : nodes_list) { - int node_id = G_.node_to_id[node].cast(); - int idx = node2idx[node_id]; + py::array::ShapeContainer ret_shape{(int)constraint_results.size()}; + py::array_t ret(ret_shape, constraint_results.data()); - py::object node_name = G_.id_to_node.attr("get")(py::cast(node_id)); - constraint_dict[node_name] = py::cast(constraint_results[idx]); - } - return constraint_dict; + return ret; } #endif - py::object constraint(py::object G, py::object nodes, py::object weight, py::object n_workers) { #ifdef EASYGRAPH_ENABLE_GPU return invoke_gpu_constraint(G, nodes, weight); @@ -269,9 +269,6 @@ weight_t redundancy(Graph& G, node_t u, node_t v, std::string weight, rec_type& for (const auto& n : G.adj[v]) { neighbors.insert(n.first); } - // for (const auto& n : G.pred[v]) { - // neighbors.insert(n.first); - // } for (const auto& w : neighbors) { r += normalized_mutual_weight(G, u, w, weight, sum, sum_nmw_rec) * normalized_mutual_weight(G, v, w, weight, max, max_nmw_rec); } @@ -294,42 +291,43 @@ weight_t directed_redundancy(DiGraph& G, node_t u, node_t v, std::string weight, } py::object invoke_cpp_effective_size(py::object G, py::object nodes, py::object weight) { + std::string weight_key = weight_to_string(weight); rec_type sum_nmw_rec, max_nmw_rec; - py::dict effective_size = py::dict(); + if (nodes.is_none()) { - nodes = G; + nodes = G.attr("nodes"); } - nodes = py::list(nodes); + + py::list nodes_list = py::list(nodes); + int nodes_list_len = py::len(nodes_list); + std::vector effective_size_results(nodes_list_len, 0.0); + if (!G.attr("is_directed")().cast()){ Graph& G_ = G.cast(); - std::string weight_key = weight_to_string(weight); - int nodes_len = py::len(nodes); - for (int i = 0; i < nodes_len; i++) { - py::object v = nodes[py::cast(i)]; + for (int i = 0; i < nodes_list_len; i++) { + weight_t redundancy_sum = 0; + py::object v = nodes_list[i]; + node_t v_id = G_.node_to_id[v].cast(); if (py::len(G[v]) == 0) { - effective_size[v] = py::cast(Py_NAN); + effective_size_results[i] = Py_NAN; continue; } - weight_t redundancy_sum = 0; - node_t v_id = G_.node_to_id[v].cast(); for (const auto& neighbor_info : G_.adj[v_id]) { node_t u_id = neighbor_info.first; redundancy_sum += redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec); } - effective_size[v] = redundancy_sum; + effective_size_results[i] = redundancy_sum; } } else{ DiGraph& G_ = G.cast(); - std::string weight_key = weight_to_string(weight); - int nodes_len = py::len(nodes); - for (int i = 0; i < nodes_len; i++) { - py::object v = nodes[py::cast(i)]; + for (int i = 0; i < nodes_list_len; i++) { + weight_t redundancy_sum = 0; + py::object v = nodes_list[i]; + node_t v_id = G_.node_to_id[v].cast(); if (py::len(G[v]) == 0) { - effective_size[v] = py::cast(Py_NAN); + effective_size_results[i] = Py_NAN; continue; } - weight_t redundancy_sum = 0; - node_t v_id = G_.node_to_id[v].cast(); for (const auto& neighbor_info : G_.adj[v_id]) { node_t u_id = neighbor_info.first; redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec); @@ -338,16 +336,21 @@ py::object invoke_cpp_effective_size(py::object G, py::object nodes, py::object node_t u_id = neighbor_info.first; redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec); } - effective_size[v] = redundancy_sum; + effective_size_results[i] = redundancy_sum; } } - return effective_size; + + std::reverse(effective_size_results.begin(), effective_size_results.end()); + + py::array::ShapeContainer ret_shape{nodes_list_len}; + py::array_t ret(ret_shape, effective_size_results.data()); + return ret; } #ifdef EASYGRAPH_ENABLE_GPU static py::object invoke_gpu_effective_size(py::object G, py::object nodes, py::object weight) { Graph& G_ = G.cast(); - py::dict effective_size = py::dict(); + if (weight.is_none()) { G_.gen_CSR(); } else { @@ -369,18 +372,17 @@ static py::object invoke_gpu_effective_size(py::object G, py::object nodes, py:: std::vector effective_size_results(num_nodes); bool is_directed = G.attr("is_directed")().cast(); - // 设置节点掩码数组 std::vector node_mask(num_nodes, 0); py::list nodes_list; if (!nodes.is_none()) { nodes_list = py::list(nodes); for (auto node : nodes_list) { int node_id = node2idx[G_.node_to_id[node].cast()]; - node_mask[node_id] = 1; // 标记需要计算的节点 + node_mask[node_id] = 1; } } else { nodes_list = py::list(G.attr("nodes")); - std::fill(node_mask.begin(), node_mask.end(), 1); // 计算所有节点 + std::fill(node_mask.begin(), node_mask.end(), 1); } int gpu_r = gpu_easygraph::effective_size(V, E, row, col, num_nodes, *W_p, is_directed, node_mask, effective_size_results); @@ -389,15 +391,10 @@ static py::object invoke_gpu_effective_size(py::object G, py::object nodes, py:: py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r)); } - py::dict effective_size_dict; - for (auto node : nodes_list) { - int node_id = G_.node_to_id[node].cast(); - int idx = node2idx[node_id]; + py::array::ShapeContainer ret_shape{(int)effective_size_results.size()}; + py::array_t ret(ret_shape, effective_size_results.data()); - py::object node_name = G_.id_to_node.attr("get")(py::cast(node_id)); - effective_size_dict[node_name] = py::cast(effective_size_results[idx]); - } - return effective_size_dict; + return ret; } #endif @@ -411,9 +408,58 @@ py::object effective_size(py::object G, py::object nodes, py::object weight, py: #ifdef EASYGRAPH_ENABLE_GPU static py::object invoke_gpu_efficiency(py::object G, py::object nodes, py::object weight) { - py::dict effective_size_dict = invoke_gpu_effective_size(G, nodes, weight); + Graph& G_ = G.cast(); + py::dict effective_size = py::dict(); + if (weight.is_none()) { + G_.gen_CSR(); + } else { + G_.gen_CSR(weight_to_string(weight)); + } + auto csr_graph = G_.csr_graph; + auto coo_graph = G_.transfer_csr_to_coo(csr_graph); - py::dict degree; + std::vector& V = csr_graph->V; + std::vector& E = csr_graph->E; + std::vector& row = coo_graph->row; + std::vector& col = coo_graph->col; + + std::vector* W_p = weight.is_none() ? &(coo_graph->unweighted_W) + : coo_graph->W_map.find(weight_to_string(weight))->second.get(); + + std::unordered_map& node2idx = coo_graph->node2idx; + int num_nodes = coo_graph->node2idx.size(); + std::vector effective_size_results(num_nodes); + bool is_directed = G.attr("is_directed")().cast(); + + // 设置节点掩码数组 + std::vector node_mask(num_nodes, 0); + py::list nodes_list; + if (!nodes.is_none()) { + nodes_list = py::list(nodes); + for (auto node : nodes_list) { + int node_id = node2idx[G_.node_to_id[node].cast()]; + node_mask[node_id] = 1; // 标记需要计算的节点 + } + } else { + nodes_list = py::list(G.attr("nodes")); + std::fill(node_mask.begin(), node_mask.end(), 1); // 计算所有节点 + } + + int gpu_r = gpu_easygraph::effective_size(V, E, row, col, num_nodes, *W_p, is_directed, node_mask, effective_size_results); + + if (gpu_r != gpu_easygraph::EG_GPU_SUCC) { + py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r)); + } + + py::dict effective_size_dict; + for (auto node : nodes_list) { + int node_id = G_.node_to_id[node].cast(); + int idx = node2idx[node_id]; + + py::object node_name = G_.id_to_node.attr("get")(py::cast(node_id)); + effective_size_dict[node_name] = py::cast(effective_size_results[idx]); + } + py::dict degree; if (weight.is_none()) { degree = G.attr("degree")(py::none()).cast(); } else { @@ -442,8 +488,55 @@ static py::object invoke_gpu_efficiency(py::object G, py::object nodes, py::obje } #endif + py::object invoke_cpp_efficiency(py::object G, py::object nodes, py::object weight) { - py::dict effective_size_dict = invoke_cpp_effective_size(G, nodes, weight); + rec_type sum_nmw_rec, max_nmw_rec; + py::dict effective_size_dict = py::dict(); + if (nodes.is_none()) { + nodes = G; + } + nodes = py::list(nodes); + if (!G.attr("is_directed")().cast()){ + Graph& G_ = G.cast(); + std::string weight_key = weight_to_string(weight); + int nodes_len = py::len(nodes); + for (int i = 0; i < nodes_len; i++) { + py::object v = nodes[py::cast(i)]; + if (py::len(G[v]) == 0) { + effective_size_dict[v] = py::cast(Py_NAN); + continue; + } + weight_t redundancy_sum = 0; + node_t v_id = G_.node_to_id[v].cast(); + for (const auto& neighbor_info : G_.adj[v_id]) { + node_t u_id = neighbor_info.first; + redundancy_sum += redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec); + } + effective_size_dict[v] = redundancy_sum; + } + } else{ + DiGraph& G_ = G.cast(); + std::string weight_key = weight_to_string(weight); + int nodes_len = py::len(nodes); + for (int i = 0; i < nodes_len; i++) { + py::object v = nodes[py::cast(i)]; + if (py::len(G[v]) == 0) { + effective_size_dict[v] = py::cast(Py_NAN); + continue; + } + weight_t redundancy_sum = 0; + node_t v_id = G_.node_to_id[v].cast(); + for (const auto& neighbor_info : G_.adj[v_id]) { + node_t u_id = neighbor_info.first; + redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec); + } + for (const auto& neighbor_info : G_.pred[v_id]) { + node_t u_id = neighbor_info.first; + redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec); + } + effective_size_dict[v] = redundancy_sum; + } + } py::dict degree; if (weight.is_none()) { @@ -536,65 +629,18 @@ py::object invoke_cpp_hierarchy(py::object G, py::object nodes, py::object weigh } py::list nodes_list = py::list(nodes); int nodes_list_len = py::len(nodes_list); - // Graph& G_ = G.cast(); py::dict hierarchy = py::dict(); if(G.attr("is_directed")().cast()){ DiGraph& G_ = G.cast(); - // if (!n_workers.is_none()) { - // std::vector node_ids; - // int n_workers_num = n_workers.cast(); - // for (int i = 0;i < py::len(nodes_list);i++) { - // py::object node = nodes_list[i]; - // node_ids.push_back(G_.node_to_id[node].cast()); - // } - // std::shuffle(node_ids.begin(), node_ids.end(), std::random_device()); - // std::vector > split_nodes; - // if (node_ids.size() > n_workers_num * 30000) { - // split_nodes = split_len(node_ids, 30000); - // } - // else { - // split_nodes = split(node_ids, n_workers_num); - // } - // while (split_nodes.size() < n_workers_num) { - // split_nodes.push_back(std::vector()); - // } - // std::vector > rets(n_workers_num); - // Py_BEGIN_ALLOW_THREADS - - // std::vector threads; - // for (int i = 0;i < n_workers_num; i++) { - // threads.push_back(std::thread(hierarchy_parallel, &G_, &split_nodes[i], weight_key, &rets[i])); - // } - // for (int i = 0;i < n_workers_num;i++) { - // threads[i].join(); - // } - - // Py_END_ALLOW_THREADS - - // for (int i = 1;i < rets.size();i++) { - // rets[0].insert(rets[i].begin(), rets[i].end()); - // } - // for (const auto& hierarchy_pair : rets[0]) { - // py::object node = G_.id_to_node[py::cast(hierarchy_pair.first)]; - // hierarchy[node] = hierarchy_pair.second; - // } - // } - // else { for (int i = 0; i < nodes_list_len; i++) { py::object v = nodes_list[i]; - // py::object E = G.attr("ego_subgraph")(v); - - // int n = py::len(E) - 1; - weight_t C = 0; std::map c; - // 获取 successors 和 predecessors py::list successors_of_v = py::list(G.attr("successors")(v)); py::list predecessors_of_v = py::list(G.attr("predecessors")(v)); - // 使用 set 去重合并 std::set neighbors_of_v; for (const auto& w : successors_of_v) { neighbors_of_v.insert(G_.node_to_id[w].cast()); @@ -603,17 +649,14 @@ py::object invoke_cpp_hierarchy(py::object G, py::object nodes, py::object weigh neighbors_of_v.insert(G_.node_to_id[w].cast()); } - // 遍历 neighbors_of_v for (const auto& w_id : neighbors_of_v) { node_t v_id = G_.node_to_id[v].cast(); - // std::cout << "Node: " << v_id << ", Neighbor: " << w_id << std::endl; - // 计算约束值并存储 C += directed_local_constraint(G_, v_id, w_id, weight_key, local_constraint_rec, sum_nmw_rec); c[w_id] = directed_local_constraint(G_, v_id, w_id, weight_key, local_constraint_rec, sum_nmw_rec); } int n = neighbors_of_v.size(); - // 如果邻居数大于 1,计算层级性 + if (n > 1) { weight_t hierarchy_sum = 0; for (const auto& w_id : neighbors_of_v) { @@ -622,12 +665,11 @@ py::object invoke_cpp_hierarchy(py::object G, py::object nodes, py::object weigh hierarchy[v] = hierarchy_sum; } - // 如果层级性未定义,设置为 0 if (!hierarchy.contains(v)) { hierarchy[v] = 0; } } - // } + }else{ Graph& G_ = G.cast(); if (!n_workers.is_none()) { diff --git a/gpu_easygraph/functions/structural_holes/constraint.cpp b/gpu_easygraph/functions/structural_holes/constraint.cpp new file mode 100644 index 00000000..a8b5731b --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/constraint.cpp @@ -0,0 +1,31 @@ +#include +#include +#include // 需要包含memory头文件 + +#include "structural_holes/constraint.cuh" +#include "common.h" + +namespace gpu_easygraph { + +using std::vector; + +int constraint( + _IN_ const vector& V, + _IN_ const vector& E, + _IN_ const vector& row, + _IN_ const vector& col, + _IN_ int num_nodes, + _IN_ const vector& W, + _IN_ bool is_directed, + _IN_ vector& node_mask, // 添加节点掩码参数 + _OUT_ vector& constraint +) { + int num_edges = row.size(); + + constraint = vector(num_nodes); + int r = cuda_constraint(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), constraint.data()); + + return r; // 成功 +} + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/constraint.cu b/gpu_easygraph/functions/structural_holes/constraint.cu new file mode 100644 index 00000000..31c7de17 --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/constraint.cu @@ -0,0 +1,354 @@ +#include +#include +#include + +#include "common.h" +#define NODES_PER_BLOCK 1 + +namespace gpu_easygraph { + +enum norm_t { SUM = 0, MAX = 1 }; + +static __device__ double mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double a_uv = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + if (E[i] == v) { + a_uv = W[i]; + break; + } + } + return a_uv; +} + +static __device__ double normalized_mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v, + norm_t norm +) { + double weight_uv = mutual_weight(V, E, W, u, v); + + double scale = 0.0; + if (norm == SUM) { + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = mutual_weight(V, E, W, u, neighbor); + scale += weight_uw; + } + } else if (norm == MAX) { + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_uw); + } + } + return (scale==0.0) ? 0.0 : (weight_uv / scale); +} + +static __device__ double local_constraint( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double direct = normalized_mutual_weight(V,E,W,u,v,SUM); + double indirect = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM); + double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM); + indirect += norm_uw * norm_wv; + } + double local_constraint_of_uv = (direct + indirect) * (direct + indirect); + return local_constraint_of_uv; +} + +__global__ void calculate_constraints( + const int* __restrict__ V, + const int* __restrict__ E, + const double* __restrict__ W, + const int num_nodes, + const int* __restrict__ node_mask, + double* __restrict__ constraint_results +) { + int start_node = blockIdx.x * NODES_PER_BLOCK; // 当前块处理的起始节点 + int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); // 结束节点,确保不越界 + + for (int v = start_node; v < end_node; ++v) { + if (!node_mask[v]) continue; + + double constraint_of_v = 0.0; + bool is_nan = true; + + // 使用所有线程并行处理邻居 + __shared__ double shared_constraint[256]; // 假设 blockDim.x <= 256 + double local_sum = 0.0; + + for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { + is_nan = false; + int neighbor = E[i]; + local_sum += local_constraint(V, E, W, v, neighbor); + } + + // 归约邻居结果到一个值 + shared_constraint[threadIdx.x] = local_sum; + __syncthreads(); + + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { + if (threadIdx.x < offset) { + shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset]; + } + __syncthreads(); + } + + // 保存最终结果 + if (threadIdx.x == 0) { + constraint_results[v] = (is_nan) ? NAN : shared_constraint[0]; + } + } +} + +static __device__ double directed_mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double a_uv = 0.0, a_vu = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + if (E[i] == v) { + a_uv = W[i]; + break; + } + } + for (int i = V[v]; i < V[v+1]; i++) { + if (E[i] == u) { + a_vu = W[i]; + break; + } + } + return a_uv + a_vu; +} + +static __device__ double directed_normalized_mutual_weight( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + int num_edges, + int u, + int v, + norm_t norm +) { + double weight_uv = directed_mutual_weight(V, E, W, u, v); + + double scale = 0.0; + if(norm==SUM){ + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); + scale += weight_uw; + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); + scale += weight_wu; + } + } + }else if(norm==MAX){ + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_uw); + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_wu); + } + } + } + return (scale==0.0) ? 0.0 : (weight_uv / scale); +} + +static __device__ double directed_local_constraint( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + int num_edges, + int u, + int v +) { + double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM); + double indirect = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); + double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); + indirect += norm_uw * norm_wv; + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); + double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); + indirect += norm_uw * norm_wv; + } + } + double local_constraint_of_uv = (direct + indirect) * (direct + indirect); + return local_constraint_of_uv; +} + +__global__ void directed_calculate_constraints( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + int num_nodes, + int num_edges, + int* node_mask, + double* constraint_results +) { + int start_node = blockIdx.x * NODES_PER_BLOCK; // 当前块处理的起始节点 + int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); // 结束节点,确保不越界 + + for (int v = start_node; v < end_node; ++v) { + if (!node_mask[v]) continue; + + double constraint_of_v = 0.0; + bool is_nan = true; + + // 使用所有线程并行处理邻居 + __shared__ double shared_constraint[256]; // 假设 blockDim.x <= 256 + double local_sum = 0.0; + + // 计算出邻居约束(out-neighbors) + for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { + is_nan = false; + int neighbor = E[i]; + local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor); + } + + // 计算入邻居约束(in-neighbors) + for (int i = threadIdx.x; i < num_edges; i += blockDim.x) { + if (col[i] == v) { + // is_nan = false; + int neighbor = row[i]; + local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor); + } + } + + // 将所有线程的局部结果写入共享内存 + shared_constraint[threadIdx.x] = local_sum; + __syncthreads(); + + // 归约邻居结果到一个值 + for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { + if (threadIdx.x < offset) { + shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset]; + } + __syncthreads(); + } + + // 保存最终结果 + if (threadIdx.x == 0) { + constraint_results[v] = (is_nan) ? NAN : shared_constraint[0]; + } + } +} + + +int cuda_constraint( + _IN_ const int* V, + _IN_ const int* E, + _IN_ const int* row, + _IN_ const int* col, + _IN_ const double* W, + _IN_ int num_nodes, + _IN_ int num_edges, + _IN_ bool is_directed, + _IN_ int* node_mask, + _OUT_ double* constraint_results +) { + int cuda_ret = cudaSuccess; + int EG_ret = EG_GPU_SUCC; + + int* d_V; + int* d_E; + int* d_row; + int* d_col; + double* d_W; + int* d_node_mask; + double* d_constraint_results; + int block_size = 256; // 每个块的线程数,可以根据设备进行调整 + int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; // 调整为每个块处理NODES_PER_BLOCK个节点 + + // 分配CUDA设备内存 + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraint_results, num_nodes * sizeof(double))); + + // 将数据从主机拷贝到设备 + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); + + + // 调用 CUDA 内核 + if(is_directed){ + directed_calculate_constraints<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_constraint_results); + }else{ + calculate_constraints<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results); + } + + EXIT_IF_CUDA_FAILED(cudaMemcpy(constraint_results, d_constraint_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); +exit: + + cudaFree(d_V); + cudaFree(d_E); + cudaFree(d_row); + cudaFree(d_col); + cudaFree(d_W); + cudaFree(d_node_mask); + cudaFree(d_constraint_results); + if (cuda_ret != cudaSuccess) { + switch (cuda_ret) { + case cudaErrorMemoryAllocation: + EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM; + break; + default: + EG_ret = EG_GPU_DEVICE_ERR; + break; + } + } + + return EG_ret; +} + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/constraint.cuh b/gpu_easygraph/functions/structural_holes/constraint.cuh new file mode 100644 index 00000000..7086e44a --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/constraint.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include "common.h" + +namespace gpu_easygraph { + +int cuda_constraint( + _IN_ const int* V, + _IN_ const int* E, + _IN_ const int* row, + _IN_ const int* col, + _IN_ const double* W, + _IN_ int num_nodes, + _IN_ int num_edges, + _IN_ bool is_directed, + _IN_ int* node_mask, + _OUT_ double* constraint_results +); + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/effective_size.cpp b/gpu_easygraph/functions/structural_holes/effective_size.cpp new file mode 100644 index 00000000..771022a5 --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/effective_size.cpp @@ -0,0 +1,31 @@ +#include +#include +#include // 需要包含memory头文件 + +#include "structural_holes/effective_size.cuh" +#include "common.h" + +namespace gpu_easygraph { + +using std::vector; + +int effective_size( + _IN_ const vector& V, + _IN_ const vector& E, + _IN_ const vector& row, + _IN_ const vector& col, + _IN_ int num_nodes, + _IN_ const vector& W, + _IN_ bool is_directed, + _IN_ vector& node_mask, // 添加节点掩码参数 + _OUT_ vector& effective_size +) { + int num_edges = row.size(); + + effective_size = vector(num_nodes); + int r = cuda_effective_size(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), effective_size.data()); + + return r; // 成功 +} + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/effective_size.cu b/gpu_easygraph/functions/structural_holes/effective_size.cu new file mode 100644 index 00000000..38d34af2 --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/effective_size.cu @@ -0,0 +1,362 @@ +#include +#include +#include + +#include "common.h" +#define NODES_PER_BLOCK 1 + +namespace gpu_easygraph { + +enum norm_t { SUM = 0, MAX = 1 }; + +static __device__ double mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double a_uv = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + if (E[i] == v) { + a_uv = W[i]; + break; + } + } + return a_uv; +} + +static __device__ double normalized_mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v, + norm_t norm +) { + double weight_uv = mutual_weight(V, E, W, u, v); + + double scale = 0.0; + if (norm == SUM) { + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = mutual_weight(V, E, W, u, neighbor); + scale += weight_uw; + } + } else if (norm == MAX) { + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_uw); + } + } + return (scale==0.0) ? 0.0 : (weight_uv / scale); +} + +static __device__ double directed_mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double a_uv = 0.0, a_vu = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + if (E[i] == v) { + a_uv = W[i]; + break; + } + } + for (int i = V[v]; i < V[v+1]; i++) { + if (E[i] == u) { + a_vu = W[i]; + break; + } + } + return a_uv + a_vu; +} + +static __device__ double directed_normalized_mutual_weight( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + int num_edges, + int u, + int v, + norm_t norm +) { + double weight_uv = directed_mutual_weight(V, E, W, u, v); + + double scale = 0.0; + if(norm==SUM){ + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); + scale += weight_uw; + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); + scale += weight_wu; + } + } + }else if(norm==MAX){ + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_uw); + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_wu); + } + } + } + return (scale==0.0) ? 0.0 : (weight_uv / scale); +} + +static __device__ double redundancy( + const int* V, + const int* E, + const double* W, + const int num_nodes, + int u, + int v +) { + double r = 0.0; + for (int i = V[v]; i < V[v + 1]; i++) { + int w = E[i]; + r += normalized_mutual_weight(V, E, W, u, w, SUM) * normalized_mutual_weight(V, E, W, v, w, MAX); + } + return 1-r; +} + + +// Warp 级别归约函数 +__inline__ __device__ double warp_reduce_sum(double val) +{ + for (int offset = warpSize / 2; offset > 0; offset /= 2) + { + val += __shfl_down_sync(0xffffffff, val, offset); + } + return val; +} + +// 线程块内归约函数,利用 warp 归约 +__inline__ __device__ double block_reduce_sum(double val) +{ + // 每个 warp 进行归约 + val = warp_reduce_sum(val); + + // 将每个 warp 的结果存入共享内存 + __shared__ double shared[32]; // 一个线程块最多有 32 个 warp + int warp_id = threadIdx.x / warpSize; + if (threadIdx.x % warpSize == 0) + { + shared[warp_id] = val; + } + __syncthreads(); + + // 由第一个 warp 进行最终的归约 + if (warp_id == 0) + { + val = (threadIdx.x < (blockDim.x / warpSize)) ? shared[threadIdx.x] : 0.0; + val = warp_reduce_sum(val); + } + return val; +} + +__global__ void calculate_effective_size( + const int* __restrict__ V, + const int* __restrict__ E, + const double* __restrict__ W, + const int num_nodes, + const int* __restrict__ node_mask, + double* __restrict__ effective_size_results +) { + // 每个线程块处理一个节点 + int u = blockIdx.x; + if (u >= num_nodes || !node_mask[u]) return; + + // 获取节点 u 的邻居范围 + int neighbor_start = V[u]; + int neighbor_end = V[u + 1]; + int degree = neighbor_end - neighbor_start; + + // 计算线程块的线程数量 + int threads_per_block = blockDim.x; + + // 每个线程处理节点 u 的一个或多个邻居 + double redundancy_sum = 0.0; + for (int idx = threadIdx.x; idx < degree; idx += threads_per_block) { + int i = neighbor_start + idx; + int v = E[i]; + if (v != u) { + // redundancy_sum += redundancy(V, E, W, num_nodes, u, v); + double r = 0.0; + for (int j = V[v]; j < V[v + 1]; j++) { + int w = E[j]; + r += normalized_mutual_weight(V, E, W, u, w, SUM) * + normalized_mutual_weight(V, E, W, v, w, MAX); + } + redundancy_sum += 1 - r; + } + } + + // 线程块内归约 + redundancy_sum = block_reduce_sum(redundancy_sum); + + // 将结果写入全局内存 + if (threadIdx.x == 0) { + effective_size_results[u] = redundancy_sum; + } +} + +static __device__ double directed_redundancy( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + const int num_nodes, + const int num_edges, + int u, + int v +) { + double r = 0.0; + for (int i = V[v]; i < V[v + 1]; i++) { + int w = E[i]; + r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX); + } + for (int i = 0; i < num_edges; i++) { + if (col[i] == v) { + int w = row[i]; + r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX); + } + } + return 1-r; +} + +__global__ void directed_calculate_effective_size( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + const int num_nodes, + const int num_edges, + const int* node_mask, + double* effective_size_results +) { + int u = blockIdx.x * blockDim.x + threadIdx.x; + if (u >= num_nodes || !node_mask[u]) return; + double redundancy_sum = 0.0; + bool is_nan = true; + + // 遍历 u 的所有邻居 + for (int i = V[u]; i < V[u + 1]; i++) { + int v = E[i]; + if (v == u) continue; // 排除自连接的情况 + is_nan = false; + redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v); + } + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int v = row[i]; + redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v); + } + } + effective_size_results[u] = is_nan ? NAN : redundancy_sum; +} + + +int cuda_effective_size( + _IN_ const int* V, + _IN_ const int* E, + _IN_ const int* row, + _IN_ const int* col, + _IN_ const double* W, + _IN_ int num_nodes, + _IN_ int num_edges, + _IN_ bool is_directed, + _IN_ int* node_mask, + _OUT_ double* effective_size_results +) { + int cuda_ret = cudaSuccess; + int EG_ret = EG_GPU_SUCC; + int min_grid_size = 0; + int block_size = 0; + + + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_effective_size, 0, 0); + + int grid_size = (num_nodes + block_size * NODES_PER_BLOCK - 1) / (block_size * NODES_PER_BLOCK); + + int* d_V; + int* d_E; + int* d_row; + int* d_col; + double* d_W; + int* d_node_mask; + double* d_effective_size_results; + + // 分配 CUDA 设备内存 + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_effective_size_results, num_nodes * sizeof(double))); + + // 将数据从主机拷贝到设备 + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); + + if(is_directed){ + directed_calculate_effective_size<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_effective_size_results); + }else{ + int block_size = 256; // 每个块的线程数,可以根据设备进行调整 + int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; // 调整为每个块处理NODES_PER_BLOCK个节点 + // int shared_memory_size = blockDim.x * sizeof(double); + calculate_effective_size<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_effective_size_results); + + } + + EXIT_IF_CUDA_FAILED(cudaMemcpy(effective_size_results, d_effective_size_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); + +exit: + cudaFree(d_V); + cudaFree(d_E); + cudaFree(d_row); + cudaFree(d_col); + cudaFree(d_W); + cudaFree(d_node_mask); + cudaFree(d_effective_size_results); + + if (cuda_ret != cudaSuccess) { + switch (cuda_ret) { + case cudaErrorMemoryAllocation: + EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM; + break; + default: + EG_ret = EG_GPU_DEVICE_ERR; + break; + } + } + + return EG_ret; +} + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/effective_size.cuh b/gpu_easygraph/functions/structural_holes/effective_size.cuh new file mode 100644 index 00000000..e69751e7 --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/effective_size.cuh @@ -0,0 +1,20 @@ +#pragma once + +#include "common.h" + +namespace gpu_easygraph { + +int cuda_effective_size( + _IN_ const int* V, + _IN_ const int* E, + _IN_ const int* row, + _IN_ const int* col, + _IN_ const double* W, + _IN_ int num_nodes, + _IN_ int num_edges, + _IN_ bool is_directed, + _IN_ int* node_mask, + _OUT_ double* effective_size_results +); + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/evaluation.cpp b/gpu_easygraph/functions/structural_holes/evaluation.cpp deleted file mode 100644 index 957be654..00000000 --- a/gpu_easygraph/functions/structural_holes/evaluation.cpp +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include // 需要包含memory头文件 - -#include "structural_holes/evaluation.cuh" -#include "common.h" - -namespace gpu_easygraph { - -using std::vector; - -// static int decide_warp_size( -// _IN_ int len_V, -// _IN_ int len_E -// ) -// { -// vector warp_size_cand{1, 2, 4, 8, 16, 32}; - -// if (len_E / len_V < warp_size_cand.front()) { -// return warp_size_cand.front(); -// } - -// for (int i = 0; i + 1 < warp_size_cand.size(); ++i) { -// if (warp_size_cand[i] <= len_E / len_V -// && len_E / len_V < warp_size_cand[i + 1]) { -// return warp_size_cand[i + 1]; -// } -// } -// return warp_size_cand.back(); -// } - -int effective_size( - _IN_ const vector& V, - _IN_ const vector& E, - _IN_ const vector& row, - _IN_ const vector& col, - _IN_ int num_nodes, - _IN_ const vector& W, - _IN_ bool is_directed, - _IN_ vector& node_mask, // 添加节点掩码参数 - _OUT_ vector& effective_size -) { - int num_edges = row.size(); - - effective_size = vector(num_nodes); - int r = cuda_effective_size(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), effective_size.data()); - - return r; // 成功 -} - -int constraint( - _IN_ const vector& V, - _IN_ const vector& E, - _IN_ const vector& row, - _IN_ const vector& col, - _IN_ int num_nodes, - _IN_ const vector& W, - _IN_ bool is_directed, - _IN_ vector& node_mask, // 添加节点掩码参数 - _OUT_ vector& constraint -) { - int num_edges = row.size(); - - constraint = vector(num_nodes); - int r = cuda_constraint(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), constraint.data()); - - return r; // 成功 -} - -int hierarchy( - _IN_ const vector& V, - _IN_ const vector& E, - _IN_ const vector& row, - _IN_ const vector& col, - _IN_ int num_nodes, - _IN_ const vector& W, - _IN_ bool is_directed, - _IN_ vector& node_mask, // 添加节点掩码参数 - _OUT_ vector& hierarchy -) { - int num_edges = row.size(); - - hierarchy = vector(num_nodes); - int r = cuda_hierarchy(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), hierarchy.data()); - - return r; // 成功 -} - -} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/evaluation.cu b/gpu_easygraph/functions/structural_holes/evaluation.cu deleted file mode 100644 index 6c6d615d..00000000 --- a/gpu_easygraph/functions/structural_holes/evaluation.cu +++ /dev/null @@ -1,888 +0,0 @@ -#include -#include -#include - -#include "common.h" -#define NODES_PER_BLOCK 1 - -namespace gpu_easygraph { - -enum norm_t { SUM = 0, MAX = 1 }; - -static __device__ double mutual_weight( - const int* V, - const int* E, - const double* W, - int u, - int v -) { - double a_uv = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - if (E[i] == v) { - a_uv = W[i]; - break; - } - } - return a_uv; -} - -static __device__ double normalized_mutual_weight( - const int* V, - const int* E, - const double* W, - int u, - int v, - norm_t norm -) { - double weight_uv = mutual_weight(V, E, W, u, v); - - double scale = 0.0; - if (norm == SUM) { - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = mutual_weight(V, E, W, u, neighbor); - scale += weight_uw; - } - } else if (norm == MAX) { - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = mutual_weight(V, E, W, u, neighbor); - scale = fmax(scale,weight_uw); - } - } - return (scale==0.0) ? 0.0 : (weight_uv / scale); -} - -static __device__ double local_constraint( - const int* V, - const int* E, - const double* W, - int u, - int v -) { - double direct = normalized_mutual_weight(V,E,W,u,v,SUM); - double indirect = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM); - double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM); - indirect += norm_uw * norm_wv; - } - double local_constraint_of_uv = (direct + indirect) * (direct + indirect); - return local_constraint_of_uv; -} - - - -__global__ void calculate_constraints( - const int* __restrict__ V, - const int* __restrict__ E, - const double* __restrict__ W, - const int num_nodes, - const int* __restrict__ node_mask, - double* __restrict__ constraint_results -) { - int start_node = blockIdx.x * NODES_PER_BLOCK; // 当前块处理的起始节点 - int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); // 结束节点,确保不越界 - - for (int v = start_node; v < end_node; ++v) { - if (!node_mask[v]) continue; - - double constraint_of_v = 0.0; - bool is_nan = true; - - // 使用所有线程并行处理邻居 - __shared__ double shared_constraint[256]; // 假设 blockDim.x <= 256 - double local_sum = 0.0; - - for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { - is_nan = false; - int neighbor = E[i]; - local_sum += local_constraint(V, E, W, v, neighbor); - } - - // 归约邻居结果到一个值 - shared_constraint[threadIdx.x] = local_sum; - __syncthreads(); - - for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { - if (threadIdx.x < offset) { - shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset]; - } - __syncthreads(); - } - - // 保存最终结果 - if (threadIdx.x == 0) { - constraint_results[v] = (is_nan) ? NAN : shared_constraint[0]; - } - } -} - -static __device__ double directed_mutual_weight( - const int* V, - const int* E, - const double* W, - int u, - int v -) { - double a_uv = 0.0, a_vu = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - if (E[i] == v) { - a_uv = W[i]; - break; - } - } - for (int i = V[v]; i < V[v+1]; i++) { - if (E[i] == u) { - a_vu = W[i]; - break; - } - } - return a_uv + a_vu; -} - -static __device__ double directed_normalized_mutual_weight( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - int num_edges, - int u, - int v, - norm_t norm -) { - double weight_uv = directed_mutual_weight(V, E, W, u, v); - - double scale = 0.0; - if(norm==SUM){ - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); - scale += weight_uw; - } - - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int neighbor = row[i]; - double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); - scale += weight_wu; - } - } - }else if(norm==MAX){ - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); - scale = fmax(scale,weight_uw); - } - - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int neighbor = row[i]; - double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); - scale = fmax(scale,weight_wu); - } - } - } - return (scale==0.0) ? 0.0 : (weight_uv / scale); -} - -static __device__ double directed_local_constraint( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - int num_edges, - int u, - int v -) { - double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM); - double indirect = 0.0; - for (int i = V[u]; i < V[u+1]; i++) { - int neighbor = E[i]; - double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); - double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); - indirect += norm_uw * norm_wv; - } - - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int neighbor = row[i]; - double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); - double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); - indirect += norm_uw * norm_wv; - } - } - double local_constraint_of_uv = (direct + indirect) * (direct + indirect); - return local_constraint_of_uv; -} - -__global__ void directed_calculate_constraints( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - int num_nodes, - int num_edges, - int* node_mask, - double* constraint_results -) { - int start_node = blockIdx.x * NODES_PER_BLOCK; // 当前块处理的起始节点 - int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); // 结束节点,确保不越界 - - for (int v = start_node; v < end_node; ++v) { - if (!node_mask[v]) continue; - - double constraint_of_v = 0.0; - bool is_nan = true; - - // 使用所有线程并行处理邻居 - __shared__ double shared_constraint[256]; // 假设 blockDim.x <= 256 - double local_sum = 0.0; - - // 计算出邻居约束(out-neighbors) - for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { - is_nan = false; - int neighbor = E[i]; - local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor); - } - - // 计算入邻居约束(in-neighbors) - for (int i = threadIdx.x; i < num_edges; i += blockDim.x) { - if (col[i] == v) { - // is_nan = false; - int neighbor = row[i]; - local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor); - } - } - - // 将所有线程的局部结果写入共享内存 - shared_constraint[threadIdx.x] = local_sum; - __syncthreads(); - - // 归约邻居结果到一个值 - for (int offset = blockDim.x / 2; offset > 0; offset /= 2) { - if (threadIdx.x < offset) { - shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset]; - } - __syncthreads(); - } - - // 保存最终结果 - if (threadIdx.x == 0) { - constraint_results[v] = (is_nan) ? NAN : shared_constraint[0]; - } - } -} - -static __device__ double redundancy( - const int* V, - const int* E, - const double* W, - const int num_nodes, - int u, - int v -) { - double r = 0.0; - for (int i = V[v]; i < V[v + 1]; i++) { - int w = E[i]; - r += normalized_mutual_weight(V, E, W, u, w, SUM) * normalized_mutual_weight(V, E, W, v, w, MAX); - } - return 1-r; -} - -// __global__ void calculate_effective_size( -// const int* V, -// const int* E, -// const double* W, -// const int num_nodes, -// const int* node_mask, -// double* effective_size_results -// ) { -// int u = blockIdx.x * blockDim.x + threadIdx.x; -// if (u >= num_nodes || !node_mask[u]) return; -// double redundancy_sum = 0.0; -// bool is_nan = true; - -// // 遍历 v 的所有邻居 -// for (int i = V[u]; i < V[u + 1]; i++) { -// int v = E[i]; -// if (v == u) continue; // 排除自连接的情况 -// is_nan = false; -// redundancy_sum += redundancy(V,E,W,num_nodes,u,v); -// } -// effective_size_results[u] = is_nan ? NAN : redundancy_sum; -// } - - -static __device__ double atomicAdd ( - _OUT_ double* address, - _IN_ double val -) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} - -// 自定义 atomicAdd 函数,用于 double 类型的原子加法(如果您的架构支持,可以使用内置的 atomicAdd) -static __device__ double atomicAdd_double(double* address, double val) -{ - unsigned long long int* address_as_ull = - (unsigned long long int*)address; - unsigned long long int old = *address_as_ull, assumed; - do { - assumed = old; - old = atomicCAS(address_as_ull, assumed, - __double_as_longlong(val + - __longlong_as_double(assumed))); - } while (assumed != old); - return __longlong_as_double(old); -} - -// Warp 级别归约函数 -__inline__ __device__ double warp_reduce_sum(double val) -{ - for (int offset = warpSize / 2; offset > 0; offset /= 2) - { - val += __shfl_down_sync(0xffffffff, val, offset); - } - return val; -} - -// 线程块内归约函数,利用 warp 归约 -__inline__ __device__ double block_reduce_sum(double val) -{ - // 每个 warp 进行归约 - val = warp_reduce_sum(val); - - // 将每个 warp 的结果存入共享内存 - __shared__ double shared[32]; // 一个线程块最多有 32 个 warp - int warp_id = threadIdx.x / warpSize; - if (threadIdx.x % warpSize == 0) - { - shared[warp_id] = val; - } - __syncthreads(); - - // 由第一个 warp 进行最终的归约 - if (warp_id == 0) - { - val = (threadIdx.x < (blockDim.x / warpSize)) ? shared[threadIdx.x] : 0.0; - val = warp_reduce_sum(val); - } - return val; -} - -__global__ void calculate_effective_size( - const int* __restrict__ V, - const int* __restrict__ E, - const double* __restrict__ W, - const int num_nodes, - const int* __restrict__ node_mask, - double* __restrict__ effective_size_results -) { - // 每个线程块处理一个节点 - int u = blockIdx.x; - if (u >= num_nodes || !node_mask[u]) return; - - // 获取节点 u 的邻居范围 - int neighbor_start = V[u]; - int neighbor_end = V[u + 1]; - int degree = neighbor_end - neighbor_start; - - // 计算线程块的线程数量 - int threads_per_block = blockDim.x; - - // 每个线程处理节点 u 的一个或多个邻居 - double redundancy_sum = 0.0; - for (int idx = threadIdx.x; idx < degree; idx += threads_per_block) { - int i = neighbor_start + idx; - int v = E[i]; - if (v != u) { - // redundancy_sum += redundancy(V, E, W, num_nodes, u, v); - double r = 0.0; - for (int j = V[v]; j < V[v + 1]; j++) { - int w = E[j]; - r += normalized_mutual_weight(V, E, W, u, w, SUM) * - normalized_mutual_weight(V, E, W, v, w, MAX); - } - redundancy_sum += 1 - r; - } - } - - // 线程块内归约 - redundancy_sum = block_reduce_sum(redundancy_sum); - - // 将结果写入全局内存 - if (threadIdx.x == 0) { - effective_size_results[u] = redundancy_sum; - } -} - -static __device__ double directed_redundancy( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - const int num_nodes, - const int num_edges, - int u, - int v -) { - double r = 0.0; - for (int i = V[v]; i < V[v + 1]; i++) { - int w = E[i]; - r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX); - } - for (int i = 0; i < num_edges; i++) { - if (col[i] == v) { - int w = row[i]; - r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX); - } - } - return 1-r; -} - -__global__ void directed_calculate_effective_size( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - const int num_nodes, - const int num_edges, - const int* node_mask, - double* effective_size_results -) { - int u = blockIdx.x * blockDim.x + threadIdx.x; - if (u >= num_nodes || !node_mask[u]) return; - double redundancy_sum = 0.0; - bool is_nan = true; - - // 遍历 u 的所有邻居 - for (int i = V[u]; i < V[u + 1]; i++) { - int v = E[i]; - if (v == u) continue; // 排除自连接的情况 - is_nan = false; - redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v); - } - for (int i = 0; i < num_edges; i++) { - if (col[i] == u) { - int v = row[i]; - redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v); - } - } - effective_size_results[u] = is_nan ? NAN : redundancy_sum; -} - - - -// __global__ void calculate_hierarchy( -// const int* V, -// const int* E, -// const double* W, -// int num_nodes, -// const int* node_mask, -// double* hierarchy_results -// ) { -// int v = blockIdx.x * blockDim.x + threadIdx.x; -// if (v >= num_nodes || !node_mask[v]) return; -// int n = V[v + 1] - V[v]; -// double *c = new double[n]; -// double C = 0.0; -// double hierarchy_sum = 0.0; -// int neighbor = 0; -// for (int i = V[v]; i < V[v + 1]; i++) { -// int w = E[i]; -// c[neighbor] = local_constraint(V, E, W, v, w); -// C += c[neighbor++]; -// } -// __syncthreads(); -// if (n > 1) { -// for (int i = 0; i < neighbor; i++) { -// hierarchy_sum += (c[i] / C) * n * logf((c[i] / C) * n) / (n * logf(n)); -// } -// hierarchy_results[v] = hierarchy_sum; -// }else{ -// hierarchy_results[v] = 0; -// } -// delete[] c; -// } - - -__global__ void calculate_hierarchy( - const int* V, - const int* E, - const double* W, - int num_nodes, - const int* node_mask, - double* hierarchy_results -) { - int start_node = blockIdx.x * NODES_PER_BLOCK; // 当前块处理的起始节点 - int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); // 确保不越界 - - extern __shared__ double shared_mem[]; // 动态分配共享内存 - double* shared_c = shared_mem; // 每个邻居的局部约束值 - double* shared_C = &shared_mem[blockDim.x]; // 用于存储约束总和 C - - // 每个块处理多个节点 - for (int v = start_node; v < end_node; ++v) { - if (!node_mask[v]) continue; - - int n = V[v + 1] - V[v]; // 邻居数量 - if (n <= 1) { - hierarchy_results[v] = 0.0; // 如果邻居数 <= 1,层级性为 0 - continue; - } - if (threadIdx.x == 0) shared_C[0] = 0.0; // 清零 - __syncthreads(); - - double local_C = 0.0; - - - // 每个线程负责部分邻居 - for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { - int w = E[i]; - double constraint = local_constraint(V, E, W, v, w); // 计算约束值 - shared_c[threadIdx.x] = constraint; // 写入共享内存 - local_C += constraint; // 累加局部约束值 - } - - // 使用原子操作归约约束总和 C - atomicAdd(&shared_C[0], local_C); - __syncthreads(); // 确保所有线程完成归约 - - // 计算层级性 - if (threadIdx.x == 0) { // 仅由线程 0 计算最终层级性 - double C = shared_C[0]; // 获取约束总和 - double hierarchy_sum = 0.0; - for (int i = 0; i < n; i++) { - double normalized_c = shared_c[i] / C; - hierarchy_sum += normalized_c * n * logf(normalized_c * n) / (n * logf(n)); - } - hierarchy_results[v] = hierarchy_sum; // 保存结果 - } - - __syncthreads(); // 确保当前节点的所有线程完成计算 - } -} - - - -__global__ void directed_calculate_hierarchy( - const int* V, - const int* E, - const int* row, - const int* col, - const double* W, - const int num_nodes, - const int num_edges, - const int* node_mask, - double* hierarchy_results -) { - int v = blockIdx.x * blockDim.x + threadIdx.x; - if (v >= num_nodes || !node_mask[v]) return; - int in_neighbor = V[v + 1] - V[v]; - int out_neighbor = 0; - double C = 0.0; - double hierarchy_sum = 0.0; - int neighbor = 0; - for (int i = 0; i < num_edges; i++) { - if (col[i] == v) { - out_neighbor++; - } - } - double *c = new double[in_neighbor+out_neighbor]; - for (int i = V[v]; i < V[v + 1]; i++) { - int w = E[i]; - c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w); - C += c[neighbor]; - neighbor++; - } - for (int i = 0; i < num_edges; i++) { - if (col[i] == v) { - int w = row[i]; - c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w); - C += c[neighbor]; - neighbor++; - } - } - __syncthreads(); - if (neighbor > 1) { - for (int i = 0; i < neighbor; i++) { - hierarchy_sum += (c[i] / C) * neighbor * logf((c[i] / C) * neighbor) / (neighbor * logf(neighbor)); - } - hierarchy_results[v] = hierarchy_sum; - }else{ - hierarchy_results[v] = 0; - } - delete[] c; -} - - - -int cuda_hierarchy( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, - _IN_ int num_nodes, - _IN_ int num_edges, - _IN_ bool is_directed, - _IN_ int* node_mask, - _OUT_ double* hierarchy_results -){ - int cuda_ret = cudaSuccess; - int EG_ret = EG_GPU_SUCC; - int min_grid_size = 0; - int block_size = 0; - - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_constraints, 0, 0); - int grid_size = (num_nodes + block_size - 1) / block_size; - - int* d_V; - int* d_E; - int* d_row; - int* d_col; - double* d_W; - int* d_node_mask; - double* d_hierarchy_results; - - // 分配CUDA设备内存 - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_hierarchy_results, num_nodes * sizeof(double))); - - // 将数据从主机拷贝到设备 - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); - - - // 调用 CUDA 内核 - if(is_directed){ - directed_calculate_hierarchy<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_hierarchy_results); - }else{ - int block_size = 256; // 每个块的线程数,可以根据设备进行调整 - int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; // 调整为每个块处理NODES_PER_BLOCK个节点 - int shared_memory_size = 2 * sizeof(double) * block_size; - calculate_hierarchy<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_hierarchy_results); - } - - EXIT_IF_CUDA_FAILED(cudaMemcpy(hierarchy_results, d_hierarchy_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); -exit: - - cudaFree(d_V); - cudaFree(d_E); - cudaFree(d_row); - cudaFree(d_col); - cudaFree(d_W); - cudaFree(d_node_mask); - cudaFree(d_hierarchy_results); - if (cuda_ret != cudaSuccess) { - switch (cuda_ret) { - case cudaErrorMemoryAllocation: - EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM; - break; - default: - EG_ret = EG_GPU_DEVICE_ERR; - break; - } - } - - return EG_ret; -} - - - -int cuda_effective_size( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, - _IN_ int num_nodes, - _IN_ int num_edges, - _IN_ bool is_directed, - _IN_ int* node_mask, - _OUT_ double* effective_size_results -) { - int cuda_ret = cudaSuccess; - int EG_ret = EG_GPU_SUCC; - int min_grid_size = 0; - int block_size = 0; - - - // cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, - // is_directed ? directed_calculate_effective_size : undirected_calculate_effective_size, 0, 0); - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_effective_size, 0, 0); - // int grid_size = (num_nodes + block_size - 1) / block_size; - // 我们设定每个节点使用的线程数 - // int threads_per_node = 32; // 通常设置为 32 的倍数 - // dim3 blockDim(threads_per_node, NODES_PER_BLOCK); // 二维线程块 - int grid_size = (num_nodes + block_size * NODES_PER_BLOCK - 1) / (block_size * NODES_PER_BLOCK); - - int* d_V; - int* d_E; - int* d_row; - int* d_col; - double* d_W; - int* d_node_mask; - double* d_effective_size_results; - - // 分配 CUDA 设备内存 - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_effective_size_results, num_nodes * sizeof(double))); - - // 将数据从主机拷贝到设备 - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); - - if(is_directed){ - directed_calculate_effective_size<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_effective_size_results); - }else{ - int block_size = 256; // 每个块的线程数,可以根据设备进行调整 - int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; // 调整为每个块处理NODES_PER_BLOCK个节点 - // int shared_memory_size = blockDim.x * sizeof(double); - calculate_effective_size<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_effective_size_results); - - } - - EXIT_IF_CUDA_FAILED(cudaMemcpy(effective_size_results, d_effective_size_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); - -exit: - cudaFree(d_V); - cudaFree(d_E); - cudaFree(d_row); - cudaFree(d_col); - cudaFree(d_W); - cudaFree(d_node_mask); - cudaFree(d_effective_size_results); - - if (cuda_ret != cudaSuccess) { - switch (cuda_ret) { - case cudaErrorMemoryAllocation: - EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM; - break; - default: - EG_ret = EG_GPU_DEVICE_ERR; - break; - } - } - - return EG_ret; -} - -int cuda_constraint( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, - _IN_ int num_nodes, - _IN_ int num_edges, - _IN_ bool is_directed, - _IN_ int* node_mask, - _OUT_ double* constraint_results -) { - int cuda_ret = cudaSuccess; - int EG_ret = EG_GPU_SUCC; - - - - // calculate_constraints<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results); - - int* d_V; - int* d_E; - int* d_row; - int* d_col; - double* d_W; - int* d_node_mask; - double* d_constraint_results; - int block_size = 256; // 每个块的线程数,可以根据设备进行调整 - int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; // 调整为每个块处理NODES_PER_BLOCK个节点 - - // 分配CUDA设备内存 - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); - EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraint_results, num_nodes * sizeof(double))); - - // 将数据从主机拷贝到设备 - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); - EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); - - - // 调用 CUDA 内核 - if(is_directed){ - // int min_grid_size = 0; - // int block_size = 0; - // cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_constraints, 0, 0); - // int grid_size = (num_nodes + block_size - 1) / block_size; - directed_calculate_constraints<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_constraint_results); - }else{ - - calculate_constraints<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results); - } - - EXIT_IF_CUDA_FAILED(cudaMemcpy(constraint_results, d_constraint_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); -exit: - - cudaFree(d_V); - cudaFree(d_E); - cudaFree(d_row); - cudaFree(d_col); - cudaFree(d_W); - cudaFree(d_node_mask); - cudaFree(d_constraint_results); - if (cuda_ret != cudaSuccess) { - switch (cuda_ret) { - case cudaErrorMemoryAllocation: - EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM; - break; - default: - EG_ret = EG_GPU_DEVICE_ERR; - break; - } - } - - return EG_ret; -} - -} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/evaluation.cuh b/gpu_easygraph/functions/structural_holes/evaluation.cuh deleted file mode 100644 index 6624800d..00000000 --- a/gpu_easygraph/functions/structural_holes/evaluation.cuh +++ /dev/null @@ -1,45 +0,0 @@ -#pragma once - -#include "common.h" - -namespace gpu_easygraph { - -int cuda_effective_size( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, - _IN_ int num_nodes, - _IN_ int num_edges, - _IN_ bool is_directed, - _IN_ int* node_mask, - _OUT_ double* effective_size_results -); - -int cuda_constraint( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, - _IN_ int num_nodes, - _IN_ int num_edges, - _IN_ bool is_directed, - _IN_ int* node_mask, - _OUT_ double* constraint_results -); - -int cuda_hierarchy( - _IN_ const int* V, - _IN_ const int* E, - _IN_ const int* row, - _IN_ const int* col, - _IN_ const double* W, - _IN_ int num_nodes, - _IN_ int num_edges, - _IN_ bool is_directed, - _IN_ int* node_mask, - _OUT_ double* hierarchy_results -); -} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/hierarchy.cpp b/gpu_easygraph/functions/structural_holes/hierarchy.cpp new file mode 100644 index 00000000..1e7c2fa2 --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/hierarchy.cpp @@ -0,0 +1,31 @@ +#include +#include +#include // 需要包含memory头文件 + +#include "structural_holes/hierarchy.cuh" +#include "common.h" + +namespace gpu_easygraph { + +using std::vector; + +int hierarchy( + _IN_ const vector& V, + _IN_ const vector& E, + _IN_ const vector& row, + _IN_ const vector& col, + _IN_ int num_nodes, + _IN_ const vector& W, + _IN_ bool is_directed, + _IN_ vector& node_mask, // 添加节点掩码参数 + _OUT_ vector& hierarchy +) { + int num_edges = row.size(); + + hierarchy = vector(num_nodes); + int r = cuda_hierarchy(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), hierarchy.data()); + + return r; // 成功 +} + +} // namespace gpu_easygraph \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/hierarchy.cu b/gpu_easygraph/functions/structural_holes/hierarchy.cu new file mode 100644 index 00000000..3826eaba --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/hierarchy.cu @@ -0,0 +1,378 @@ +#include +#include +#include + +#include "common.h" +#define NODES_PER_BLOCK 1 + +namespace gpu_easygraph { + +enum norm_t { SUM = 0, MAX = 1 }; + +static __device__ double mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double a_uv = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + if (E[i] == v) { + a_uv = W[i]; + break; + } + } + return a_uv; +} + +static __device__ double normalized_mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v, + norm_t norm +) { + double weight_uv = mutual_weight(V, E, W, u, v); + + double scale = 0.0; + if (norm == SUM) { + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = mutual_weight(V, E, W, u, neighbor); + scale += weight_uw; + } + } else if (norm == MAX) { + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_uw); + } + } + return (scale==0.0) ? 0.0 : (weight_uv / scale); +} + +static __device__ double directed_mutual_weight( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double a_uv = 0.0, a_vu = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + if (E[i] == v) { + a_uv = W[i]; + break; + } + } + for (int i = V[v]; i < V[v+1]; i++) { + if (E[i] == u) { + a_vu = W[i]; + break; + } + } + return a_uv + a_vu; +} + +static __device__ double directed_normalized_mutual_weight( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + int num_edges, + int u, + int v, + norm_t norm +) { + double weight_uv = directed_mutual_weight(V, E, W, u, v); + + double scale = 0.0; + if(norm==SUM){ + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); + scale += weight_uw; + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); + scale += weight_wu; + } + } + }else if(norm==MAX){ + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double weight_uw = directed_mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_uw); + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double weight_wu = directed_mutual_weight(V, E, W, u, neighbor); + scale = fmax(scale,weight_wu); + } + } + } + return (scale==0.0) ? 0.0 : (weight_uv / scale); +} + +static __device__ double atomicAdd ( + _OUT_ double* address, + _IN_ double val +) +{ + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + } while (assumed != old); + return __longlong_as_double(old); +} + +static __device__ double local_constraint( + const int* V, + const int* E, + const double* W, + int u, + int v +) { + double direct = normalized_mutual_weight(V,E,W,u,v,SUM); + double indirect = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM); + double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM); + indirect += norm_uw * norm_wv; + } + double local_constraint_of_uv = (direct + indirect) * (direct + indirect); + return local_constraint_of_uv; +} + +__global__ void calculate_hierarchy( + const int* V, + const int* E, + const double* W, + int num_nodes, + const int* node_mask, + double* hierarchy_results +) { + int start_node = blockIdx.x * NODES_PER_BLOCK; // 当前块处理的起始节点 + int end_node = min(start_node + NODES_PER_BLOCK, num_nodes); // 确保不越界 + + extern __shared__ double shared_mem[]; // 动态分配共享内存 + double* shared_c = shared_mem; // 每个邻居的局部约束值 + double* shared_C = &shared_mem[blockDim.x]; // 用于存储约束总和 C + + // 每个块处理多个节点 + for (int v = start_node; v < end_node; ++v) { + if (!node_mask[v]) continue; + + int n = V[v + 1] - V[v]; // 邻居数量 + if (n <= 1) { + hierarchy_results[v] = 0.0; // 如果邻居数 <= 1,层级性为 0 + continue; + } + if (threadIdx.x == 0) shared_C[0] = 0.0; // 清零 + __syncthreads(); + + double local_C = 0.0; + + + // 每个线程负责部分邻居 + for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) { + int w = E[i]; + double constraint = local_constraint(V, E, W, v, w); // 计算约束值 + shared_c[threadIdx.x] = constraint; // 写入共享内存 + local_C += constraint; // 累加局部约束值 + } + + // 使用原子操作归约约束总和 C + atomicAdd(&shared_C[0], local_C); + __syncthreads(); // 确保所有线程完成归约 + + // 计算层级性 + if (threadIdx.x == 0) { // 仅由线程 0 计算最终层级性 + double C = shared_C[0]; // 获取约束总和 + double hierarchy_sum = 0.0; + for (int i = 0; i < n; i++) { + double normalized_c = shared_c[i] / C; + hierarchy_sum += normalized_c * n * logf(normalized_c * n) / (n * logf(n)); + } + hierarchy_results[v] = hierarchy_sum; // 保存结果 + } + + __syncthreads(); // 确保当前节点的所有线程完成计算 + } +} + +static __device__ double directed_local_constraint( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + int num_edges, + int u, + int v +) { + double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM); + double indirect = 0.0; + for (int i = V[u]; i < V[u+1]; i++) { + int neighbor = E[i]; + double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); + double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); + indirect += norm_uw * norm_wv; + } + + for (int i = 0; i < num_edges; i++) { + if (col[i] == u) { + int neighbor = row[i]; + double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM); + double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM); + indirect += norm_uw * norm_wv; + } + } + double local_constraint_of_uv = (direct + indirect) * (direct + indirect); + return local_constraint_of_uv; +} + +__global__ void directed_calculate_hierarchy( + const int* V, + const int* E, + const int* row, + const int* col, + const double* W, + const int num_nodes, + const int num_edges, + const int* node_mask, + double* hierarchy_results +) { + int v = blockIdx.x * blockDim.x + threadIdx.x; + if (v >= num_nodes || !node_mask[v]) return; + int in_neighbor = V[v + 1] - V[v]; + int out_neighbor = 0; + double C = 0.0; + double hierarchy_sum = 0.0; + int neighbor = 0; + for (int i = 0; i < num_edges; i++) { + if (col[i] == v) { + out_neighbor++; + } + } + double *c = new double[in_neighbor+out_neighbor]; + for (int i = V[v]; i < V[v + 1]; i++) { + int w = E[i]; + c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w); + C += c[neighbor]; + neighbor++; + } + for (int i = 0; i < num_edges; i++) { + if (col[i] == v) { + int w = row[i]; + c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w); + C += c[neighbor]; + neighbor++; + } + } + __syncthreads(); + if (neighbor > 1) { + for (int i = 0; i < neighbor; i++) { + hierarchy_sum += (c[i] / C) * neighbor * logf((c[i] / C) * neighbor) / (neighbor * logf(neighbor)); + } + hierarchy_results[v] = hierarchy_sum; + }else{ + hierarchy_results[v] = 0; + } + delete[] c; +} + +int cuda_hierarchy( + _IN_ const int* V, + _IN_ const int* E, + _IN_ const int* row, + _IN_ const int* col, + _IN_ const double* W, + _IN_ int num_nodes, + _IN_ int num_edges, + _IN_ bool is_directed, + _IN_ int* node_mask, + _OUT_ double* hierarchy_results +){ + int cuda_ret = cudaSuccess; + int EG_ret = EG_GPU_SUCC; + int min_grid_size = 0; + int block_size = 0; + + cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_hierarchy, 0, 0); + int grid_size = (num_nodes + block_size - 1) / block_size; + + int* d_V; + int* d_E; + int* d_row; + int* d_col; + double* d_W; + int* d_node_mask; + double* d_hierarchy_results; + + // 分配CUDA设备内存 + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int))); + EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_hierarchy_results, num_nodes * sizeof(double))); + + // 将数据从主机拷贝到设备 + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice)); + EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice)); + + + // 调用 CUDA 内核 + if(is_directed){ + directed_calculate_hierarchy<<>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_hierarchy_results); + }else{ + int block_size = 256; // 每个块的线程数,可以根据设备进行调整 + int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK; // 调整为每个块处理NODES_PER_BLOCK个节点 + int shared_memory_size = 2 * sizeof(double) * block_size; + calculate_hierarchy<<>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_hierarchy_results); + } + + EXIT_IF_CUDA_FAILED(cudaMemcpy(hierarchy_results, d_hierarchy_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost)); +exit: + + cudaFree(d_V); + cudaFree(d_E); + cudaFree(d_row); + cudaFree(d_col); + cudaFree(d_W); + cudaFree(d_node_mask); + cudaFree(d_hierarchy_results); + if (cuda_ret != cudaSuccess) { + switch (cuda_ret) { + case cudaErrorMemoryAllocation: + EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM; + break; + default: + EG_ret = EG_GPU_DEVICE_ERR; + break; + } + } + + return EG_ret; +} +} \ No newline at end of file diff --git a/gpu_easygraph/functions/structural_holes/hierarchy.cuh b/gpu_easygraph/functions/structural_holes/hierarchy.cuh new file mode 100644 index 00000000..ac0f596c --- /dev/null +++ b/gpu_easygraph/functions/structural_holes/hierarchy.cuh @@ -0,0 +1,19 @@ +#pragma once + +#include "common.h" + +namespace gpu_easygraph { + +int cuda_hierarchy( + _IN_ const int* V, + _IN_ const int* E, + _IN_ const int* row, + _IN_ const int* col, + _IN_ const double* W, + _IN_ int num_nodes, + _IN_ int num_edges, + _IN_ bool is_directed, + _IN_ int* node_mask, + _OUT_ double* hierarchy_results +); +} // namespace gpu_easygraph \ No newline at end of file