From d77eb80f0a9f0659dda44c6f851cbb5ac24de5e3 Mon Sep 17 00:00:00 2001
From: jessicawwen <wwen24@m.fudan.edu.cn>
Date: Wed, 1 Jan 2025 08:19:10 +0800
Subject: [PATCH] feat:try to add reverse_csr

---
 cpp_easygraph/classes/graph.cpp               | 266 +++++++++
 cpp_easygraph/classes/graph.h                 |   4 +
 .../functions/structural_holes/evaluation.cpp |  36 +-
 .../functions/structural_holes/constraint.cpp |  20 +-
 .../functions/structural_holes/constraint.cu  | 517 ++++++++++--------
 .../functions/structural_holes/constraint.cuh |  16 +-
 gpu_easygraph/gpu_easygraph.h                 |  11 +-
 7 files changed, 599 insertions(+), 271 deletions(-)
diff --git a/cpp_easygraph/classes/graph.cpp b/cpp_easygraph/classes/graph.cpp
index 122469bc..5d127714 100644
--- a/cpp_easygraph/classes/graph.cpp
+++ b/cpp_easygraph/classes/graph.cpp
@@ -9,6 +9,7 @@ Graph::Graph() {
     this->dirty_adj = true;
     this->linkgraph_dirty = true;
     this->csr_graph = nullptr;
+    this->in_csr_graph = nullptr;
     this->node_to_id = py::dict();
     this->id_to_node = py::dict();
     this->graph = py::dict();
@@ -715,6 +716,81 @@ void Graph::drop_cache() {
     csr_graph = nullptr;
 }
 
+std::shared_ptr<CSRGraph> Graph::gen_CSR_fast(const std::string& weight_key) {
+    // Step 2: 收集所有节点 ID 并排序
+    std::vector<node_t> node_vec;
+    for (const auto& item : this->node_to_id) {
+        node_vec.push_back(item.second.cast<node_t>());
+    }
+    std::sort(node_vec.begin(), node_vec.end());
+
+    // Step 3: 遍历邻接表，构造 edges
+    int num_nodes = node_vec.size();
+    std::vector<int> rowPtrOut(num_nodes + 1, 0);
+    std::vector<int> rowPtrIn(num_nodes + 1, 0);
+    std::vector<std::tuple<int, int, double>> edges;
+
+    for (const auto& adj_item : this->adj) {
+        node_t u_id = adj_item.first;
+        for (const auto& neighbor : adj_item.second) {
+            node_t v_id = neighbor.first;
+            const auto& edge_attrs = neighbor.second;
+
+            double weight = 1.0; // 默认权重
+            if (edge_attrs.find(weight_key) != edge_attrs.end()) {
+                weight = edge_attrs.at(weight_key);
+            }
+
+            edges.emplace_back(u_id, v_id, weight);
+            rowPtrOut[u_id + 1]++;
+            rowPtrIn[v_id + 1]++;
+        }
+    }
+
+    // Step 4: 计算 rowPtr 的前缀和
+    for (int i = 1; i <= num_nodes; i++) {
+        rowPtrOut[i] += rowPtrOut[i - 1];
+        rowPtrIn[i] += rowPtrIn[i - 1];
+    }
+
+    // Step 5: 填充 colIdx 和 val
+    std::vector<int> colIdxOut(edges.size());
+    std::vector<double> valOut(edges.size());
+    std::vector<int> colIdxIn(edges.size());
+    std::vector<double> valIn(edges.size());
+    std::vector<int> offsetOut(num_nodes, 0);
+    std::vector<int> offsetIn(num_nodes, 0);
+
+    for (const auto& [u_id, v_id, weight] : edges) {
+        int posOut = rowPtrOut[u_id] + offsetOut[u_id];
+        colIdxOut[posOut] = v_id;
+        valOut[posOut] = weight;
+        offsetOut[u_id]++;
+
+        int posIn = rowPtrIn[v_id] + offsetIn[v_id];
+        colIdxIn[posIn] = u_id;
+        valIn[posIn] = weight;
+        offsetIn[v_id]++;
+    }
+
+    // Step 6: 保存到 CSRGraph 对象
+    this->csr_graph = std::make_shared<CSRGraph>();
+    auto& csr = *this->csr_graph;
+    csr.nodes = node_vec;
+    csr.V = rowPtrOut;
+    csr.E = colIdxOut;
+    csr.unweighted_W = valOut;
+
+    this->in_csr_graph = std::make_shared<CSRGraph>();
+    auto& in_csr = *this->in_csr_graph;
+    in_csr.nodes = node_vec;
+    in_csr.V = rowPtrIn;
+    in_csr.E = colIdxIn;
+    in_csr.unweighted_W = valIn;
+
+    return this->csr_graph;
+}
+
 std::shared_ptr<CSRGraph> Graph::gen_CSR(const std::string& weight) {
     if (csr_graph != nullptr) {
         if (csr_graph->W_map.find(weight) == csr_graph->W_map.end()) {
@@ -849,6 +925,196 @@ std::shared_ptr<std::vector<int>> Graph::gen_CSR_sources(const py::object& py_so
     return sources;
 }
 
+std::shared_ptr<CSRGraph> Graph::gen_reverse_CSR(const std::string& weight) {
+    // 如果 in_csr_graph 已经存在，检查有没有对应 weight 的权重
+    if (in_csr_graph != nullptr) {
+        // 如果没有该字段的权重，则需要补一份
+        if (in_csr_graph->W_map.find(weight) == in_csr_graph->W_map.end()) {
+            // 构造一个新的权重向量
+            auto W = std::make_shared<std::vector<double>>();
+            W->reserve(in_csr_graph->E.size());
+            
+            // 反向 CSR 的节点顺序、V、E 在 in_csr_graph 中已经确定
+            // 这里只需要根据 E 来补上权重
+            // E[i] 表示从第 u = in_csr_graph->idx2node[...] 个节点
+            // 指向 E[i] 对应的节点，这里是反向边，所以在原图中是 E[i] -> u
+            // 
+            // 我们可以通过 V 来找出每个节点对应的边区间
+            const auto& V = in_csr_graph->V;
+            const auto& E = in_csr_graph->E;
+            const auto& idx2node = in_csr_graph->nodes;    // 排过序的节点列表
+            const auto& node2idx = in_csr_graph->node2idx; // 节点到索引
+
+            // 遍历每个节点在 in_csr_graph 的所有反向边
+            for (size_t u_idx = 0; u_idx < idx2node.size(); ++u_idx) {
+                int start = V[u_idx];
+                int end   = V[u_idx + 1];
+                node_t u_node = idx2node[u_idx]; // 反向图中的“目标”节点
+                
+                for (int edge_pos = start; edge_pos < end; ++edge_pos) {
+                    int v_idx = E[edge_pos];
+                    // v_idx 对应的是原图中的“源”节点
+                    node_t v_node = idx2node[v_idx];
+
+                    // 在原图中 v_node -> u_node
+                    // 查找它的 edge_attr
+                    const auto& v_adjs = adj.find(v_node)->second;
+                    const auto& edge_attr = v_adjs.find(u_node)->second;
+
+                    auto edge_it = edge_attr.find(weight);
+                    double w = (edge_it != edge_attr.end()) ? edge_it->second : 1.0;
+
+                    W->push_back(w);
+                }
+            }
+
+            // 存到 in_csr_graph->W_map
+            in_csr_graph->W_map[weight] = W;
+        }
+    } 
+    else {
+        // 如果从未构造过 in_csr_graph 或者图数据修改过，就需要重新构造
+        in_csr_graph = std::make_shared<CSRGraph>();
+
+        // 1. 收集并排序所有节点
+        auto& nodes = in_csr_graph->nodes;  // 用于存储排好序的节点
+        nodes.reserve(node.size());
+        for (auto it = node.begin(); it != node.end(); ++it) {
+            nodes.push_back(it->first);
+        }
+        std::sort(nodes.begin(), nodes.end());
+
+        // 2. 建立 node2idx
+        auto& node2idx = in_csr_graph->node2idx;
+        for (int i = 0; i < (int)nodes.size(); ++i) {
+            node2idx[nodes[i]] = i;
+        }
+
+        // 3. 构造【反向】邻接表（临时结构），以便后续统一写入 V、E、W
+        //    这里我们需要把 adj 中的 (src -> dst) 翻转成 (dst -> src)
+        std::unordered_map<node_t, std::unordered_map<node_t, edge_attr_dict_factory>> rev_adj;
+        for (auto& kv : adj) {
+            node_t src = kv.first;
+            for (const auto& kv : adj) {
+                node_t src = kv.first;
+                for (const auto& kv2 : kv.second) {
+                    node_t dst = kv2.first;
+                    // kv2.second 是 edge_attr_dict_factory 类型
+                    rev_adj[dst][src] = kv2.second;
+                }
+            }
+        }
+
+        // 4. 分配 V、E、W 并写入
+        auto& V = in_csr_graph->V;
+        auto& E = in_csr_graph->E;
+        auto W  = std::make_shared<std::vector<double>>();
+
+        V.reserve(nodes.size() + 1);
+
+        // 对每个节点(按排序后顺序)填充 V, E, W
+        for (int i = 0; i < (int)nodes.size(); ++i) {
+            node_t cur_node = nodes[i];
+            // 记录当前 E 的起始位置
+            V.push_back((int)E.size());
+
+            // 查看反向邻接 rev_adj[cur_node]，即有哪些节点指向 cur_node
+            auto rev_it = rev_adj.find(cur_node);
+            if (rev_it != rev_adj.end()) {
+                const auto& neighbors = rev_it->second; // map<node_t, edge_attr_dict_factory>
+                for (auto& nb_kv : neighbors) {
+                    node_t src_node = nb_kv.first; 
+                    const auto& edge_attr = nb_kv.second;
+
+                    // src_node -> cur_node 在原图中
+                    // 在反向 CSR 中，我们在“cur_node”这行添加一个从 cur_node -> src_node 的记录
+                    E.push_back(node2idx[src_node]);
+
+                    // 权重处理
+                    auto edge_it = edge_attr.find(weight);
+                    double w_val = (edge_it != edge_attr.end()) ? edge_it->second : 1.0;
+                    W->push_back(w_val);
+                }
+            }
+        }
+
+        // 最后一个节点的“结束位置”，相当于 E.size()
+        V.push_back((int)E.size());
+
+        // 存储权重向量
+        in_csr_graph->W_map[weight] = W;
+    }
+
+    return in_csr_graph;
+}
+
+// 生成无权反向 CSR
+std::shared_ptr<CSRGraph> Graph::gen_reverse_CSR() {
+    // 如果已经有 in_csr_graph，则只需要检查 unweighted_W 是否跟 E.size() 对齐
+    if (in_csr_graph != nullptr) {
+        if (in_csr_graph->unweighted_W.size() != in_csr_graph->E.size()) {
+            in_csr_graph->unweighted_W = std::vector<double>(in_csr_graph->E.size(), 1.0);
+        }
+    } 
+    else {
+        // 和上面同理，重新构造
+        in_csr_graph = std::make_shared<CSRGraph>();
+
+        // 1. 收集并排序节点
+        auto& nodes = in_csr_graph->nodes;
+        nodes.reserve(node.size());
+        for (auto it = node.begin(); it != node.end(); ++it) {
+            nodes.push_back(it->first);
+        }
+        std::sort(nodes.begin(), nodes.end());
+
+        // 2. 建立 node2idx
+        auto& node2idx = in_csr_graph->node2idx;
+        for (int i = 0; i < (int)nodes.size(); ++i) {
+            node2idx[nodes[i]] = i;
+        }
+
+        // 3. 构造反向邻接表
+        std::unordered_map<node_t, std::unordered_map<node_t, edge_attr_dict_factory>> rev_adj;
+        for (auto& kv : adj) {
+            node_t src = kv.first;
+            for (const auto& kv : adj) {
+                node_t src = kv.first;
+                for (const auto& kv2 : kv.second) {
+                    node_t dst = kv2.first;
+                    // kv2.second 是 edge_attr_dict_factory 类型
+                    rev_adj[dst][src] = kv2.second;
+                }
+            }
+        }
+
+        // 4. 写入 V、E
+        auto& V = in_csr_graph->V;
+        auto& E = in_csr_graph->E;
+        V.reserve(nodes.size() + 1);
+
+        for (int i = 0; i < (int)nodes.size(); ++i) {
+            node_t cur_node = nodes[i];
+            V.push_back((int)E.size());
+
+            auto rev_it = rev_adj.find(cur_node);
+            if (rev_it != rev_adj.end()) {
+                const auto& neighbors = rev_it->second;
+                for (auto& nb_kv : neighbors) {
+                    node_t src_node = nb_kv.first; 
+                    E.push_back(node2idx[src_node]);
+                }
+            }
+        }
+        V.push_back((int)E.size());
+
+        // 无权时，统一填 1.0
+        in_csr_graph->unweighted_W = std::vector<double>(E.size(), 1.0);
+    }
+
+    return in_csr_graph;
+}
+
 std::shared_ptr<COOGraph> Graph::gen_COO() {
     if (coo_graph != nullptr) {
         if (coo_graph->unweighted_W.size() != coo_graph->row.size()) {
diff --git a/cpp_easygraph/classes/graph.h b/cpp_easygraph/classes/graph.h
index a10eceeb..8f451850 100644
--- a/cpp_easygraph/classes/graph.h
+++ b/cpp_easygraph/classes/graph.h
@@ -11,6 +11,7 @@ struct Graph {
     adj_dict_factory adj;
     Graph_L linkgraph_structure;
     std::shared_ptr<CSRGraph> csr_graph;
+    std::shared_ptr<CSRGraph> in_csr_graph;
     py::kwargs node_to_id, id_to_node, graph;
     node_t id;
     bool dirty_nodes, dirty_adj, linkgraph_dirty;
@@ -32,10 +33,13 @@ struct Graph {
 
     std::shared_ptr<CSRGraph> gen_CSR(const std::string& weight);
     std::shared_ptr<CSRGraph> gen_CSR();
+    std::shared_ptr<CSRGraph> gen_reverse_CSR(const std::string& weight);
+    std::shared_ptr<CSRGraph> gen_reverse_CSR();
     std::shared_ptr<std::vector<int>> gen_CSR_sources(const py::object& py_sources);
     std::shared_ptr<COOGraph> gen_COO();
     std::shared_ptr<COOGraph> gen_COO(const std::string& weight);
     std::shared_ptr<COOGraph> transfer_csr_to_coo(const std::shared_ptr<CSRGraph>& csr_graph);
+    std::shared_ptr<CSRGraph> gen_CSR_fast(const std::string& weight_key);
 };
 
 py::object Graph__init__(py::args args, py::kwargs kwargs);
diff --git a/cpp_easygraph/functions/structural_holes/evaluation.cpp b/cpp_easygraph/functions/structural_holes/evaluation.cpp
index 9118349b..427e04a7 100644
--- a/cpp_easygraph/functions/structural_holes/evaluation.cpp
+++ b/cpp_easygraph/functions/structural_holes/evaluation.cpp
@@ -212,22 +212,27 @@ static py::object invoke_gpu_constraint(py::object G, py::object nodes, py::obje
     Graph& G_ = G.cast<Graph&>();
     if (weight.is_none()) {
         G_.gen_CSR();
+        G_.gen_reverse_CSR();
     } else {
         G_.gen_CSR(weight_to_string(weight));
-    }
-    auto csr_graph = G_.csr_graph;
-    auto coo_graph = G_.transfer_csr_to_coo(csr_graph);
-    std::vector<int>& V = csr_graph->V;
-    std::vector<int>& E = csr_graph->E;
-    std::vector<int>& row = coo_graph->row;
-    std::vector<int>& col = coo_graph->col;
-    std::vector<double> *W_p = weight.is_none() ? &(coo_graph->unweighted_W) 
-                            : coo_graph->W_map.find(weight_to_string(weight))->second.get();
-    std::unordered_map<node_t, int>& node2idx = coo_graph->node2idx;
-    int num_nodes = coo_graph->node2idx.size();
+        G_.gen_reverse_CSR(weight_to_string(weight));
+    }
+    // G_.gen_CSR_fast(weight_to_string(weight));
+    auto out_csr_graph = G_.csr_graph;
+    auto in_csr_graph = G_.in_csr_graph;
+    std::vector<int>& rowPtrOut = out_csr_graph->V;
+    std::vector<int>& colIdxOut = out_csr_graph->E;
+    std::vector<double> *valOut = weight.is_none() ? &(out_csr_graph->unweighted_W) 
+                            : out_csr_graph->W_map.find(weight_to_string(weight))->second.get();
+    std::vector<int>& rowPtrIn = in_csr_graph->V;
+    
+    std::vector<int>& colIdxIn = in_csr_graph->E;
+    std::vector<double> *valIn = weight.is_none() ? &(in_csr_graph->unweighted_W) 
+                            : in_csr_graph->W_map.find(weight_to_string(weight))->second.get();
+    std::unordered_map<node_t, int>& node2idx = out_csr_graph->node2idx;
+    int num_nodes = out_csr_graph->node2idx.size();
     bool is_directed = G.attr("is_directed")().cast<bool>();
     std::vector<double> constraint_results(num_nodes, 0.0);
-
     std::vector<int> node_mask(num_nodes, 0);
     py::list nodes_list;
     if (!nodes.is_none()) {
@@ -240,8 +245,11 @@ static py::object invoke_gpu_constraint(py::object G, py::object nodes, py::obje
         nodes_list = py::list(G.attr("nodes"));
         std::fill(node_mask.begin(), node_mask.end(), 1);
     }
-
-    int gpu_r = gpu_easygraph::constraint(V, E, row, col, num_nodes, *W_p, is_directed, node_mask, constraint_results);
+    
+    int gpu_r = gpu_easygraph::constraint(num_nodes,
+        rowPtrOut, colIdxOut, *valOut,
+        rowPtrIn, colIdxIn, *valIn,
+        is_directed, node_mask, constraint_results);
     if (gpu_r != gpu_easygraph::EG_GPU_SUCC) {
         py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r));
     }
diff --git a/gpu_easygraph/functions/structural_holes/constraint.cpp b/gpu_easygraph/functions/structural_holes/constraint.cpp
index 834c37c2..678ea9c4 100644
--- a/gpu_easygraph/functions/structural_holes/constraint.cpp
+++ b/gpu_easygraph/functions/structural_holes/constraint.cpp
@@ -10,20 +10,22 @@ namespace gpu_easygraph {
 using std::vector;
 
 int constraint(
-    _IN_ const vector<int>& V,
-    _IN_ const vector<int>& E,
-    _IN_ const vector<int>& row,
-    _IN_ const vector<int>& col,
     _IN_ int num_nodes,
-    _IN_ const vector<double>& W,
+    _IN_ const std::vector<int>& rowPtrOut,
+    _IN_ const std::vector<int>& colIdxOut,
+    _IN_ const std::vector<double>& valOut,
+    _IN_ const std::vector<int>& rowPtrIn,
+    _IN_ const std::vector<int>& colIdxIn,
+    _IN_ const std::vector<double>& valIn,
     _IN_ bool is_directed,
     _IN_ vector<int>& node_mask,
-    _OUT_ vector<double>& constraint
+    _OUT_ vector<double>& constraints
 ) {
-    int num_edges = row.size();
+    int len_rowPtrOut = rowPtrOut.size();
+    int len_colIdxOut = colIdxOut.size();
     
-    constraint = vector<double>(num_nodes);
-    int r = cuda_constraint(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), constraint.data());
+    constraints = vector<double>(num_nodes);
+    int r = cuda_constraint(num_nodes, len_rowPtrOut, len_colIdxOut, rowPtrOut.data(), colIdxOut.data(), valOut.data(), rowPtrIn.data(), colIdxIn.data(), valIn.data(), is_directed, node_mask.data(), constraints.data());
 
     return r;
 }
diff --git a/gpu_easygraph/functions/structural_holes/constraint.cu b/gpu_easygraph/functions/structural_holes/constraint.cu
index 5124d6ca..0f22115a 100644
--- a/gpu_easygraph/functions/structural_holes/constraint.cu
+++ b/gpu_easygraph/functions/structural_holes/constraint.cu
@@ -3,252 +3,257 @@
 #include <stdlib.h>
 
 #include "common.h"
-#define NODES_PER_BLOCK 1
+// #define NODES_PER_BLOCK 1
 
 namespace gpu_easygraph {
 
 enum norm_t { SUM = 0, MAX = 1 };
 
-static __device__ double mutual_weight(
-    const int* V,
-    const int* E,
-    const double* W,
+// static __device__ double mutual_weight(
+//     const int* V,
+//     const int* E,
+//     const double* W,
+//     int u,
+//     int v
+// ) {
+//     double a_uv = 0.0;
+//     for (int i = V[u]; i < V[u+1]; i++) {
+//         if (E[i] == v) {
+//             a_uv = W[i];
+//             break;
+//         }
+//     }
+//     return a_uv;
+// }
+
+// static __device__ double normalized_mutual_weight(
+//     const int* V,
+//     const int* E,
+//     const double* W, 
+//     int u,
+//     int v,
+//     norm_t norm
+// ) {
+//     double weight_uv = mutual_weight(V, E, W, u, v);
+
+//     double scale = 0.0;
+//     if (norm == SUM) {
+//         for (int i = V[u]; i < V[u+1]; i++) {
+//             int neighbor = E[i];
+//             double weight_uw = mutual_weight(V, E, W, u, neighbor);
+//             scale += weight_uw;
+//         }
+//     } else if (norm == MAX) {
+//         for (int i = V[u]; i < V[u+1]; i++) {
+//             int neighbor = E[i];
+//             double weight_uw = mutual_weight(V, E, W, u, neighbor);
+//             scale = fmax(scale,weight_uw);
+//         }
+//     }
+//     return (scale==0.0) ? 0.0 : (weight_uv / scale);
+// }
+
+// static __device__ double local_constraint(
+//     const int* V,
+//     const int* E,
+//     const double* W,
+//     int u,
+//     int v
+// ) {
+//     double direct = normalized_mutual_weight(V,E,W,u,v,SUM);
+//     double indirect = 0.0;
+//     for (int i = V[u]; i < V[u+1]; i++) {
+//         int neighbor = E[i];
+//         double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM);
+//         double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM);
+//         indirect += norm_uw * norm_wv;
+//     }
+//     double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
+//     return local_constraint_of_uv;
+// }
+
+// __global__ void calculate_constraints(
+//     const int* __restrict__ V,
+//     const int* __restrict__ E,
+//     const double* __restrict__ W, 
+//     const int num_nodes, 
+//     const int* __restrict__ node_mask,
+//     double* __restrict__ constraint_results
+// ) {
+//     int start_node = blockIdx.x * NODES_PER_BLOCK;
+//     int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);
+
+//     for (int v = start_node; v < end_node; ++v) {
+//         if (!node_mask[v]) continue;
+
+//         double constraint_of_v = 0.0;
+//         bool is_nan = true;
+
+//         __shared__ double shared_constraint[256];
+//         double local_sum = 0.0;
+
+//         for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
+//             is_nan = false;
+//             int neighbor = E[i];
+//             local_sum += local_constraint(V, E, W, v, neighbor);
+//         }
+
+//         shared_constraint[threadIdx.x] = local_sum;
+//         __syncthreads();
+
+//         for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
+//             if (threadIdx.x < offset) {
+//                 shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset];
+//             }
+//             __syncthreads();
+//         }
+
+//         if (threadIdx.x == 0) {
+//             constraint_results[v] = (is_nan) ? NAN : shared_constraint[0];
+//         }
+//     }
+// }
+
+__device__ double mutual_weight(
+    const int* rowPtrOut,
+    const int* colIdxOut,
+    const double* valOut,
+    const int* rowPtrIn,
+    const int* colIdxIn,
+    const double* valIn,
     int u,
     int v
-) {
-    double a_uv = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        if (E[i] == v) {
-            a_uv = W[i];
+)
+{
+    double w_uv = 0.0, w_vu = 0.0;
+    for (int i = rowPtrOut[u]; i < rowPtrOut[u+1]; i++) {
+        if (colIdxOut[i] == v) {
+            w_uv = valOut[i];
             break;
         }
     }
-    return a_uv;
-}
-
-static __device__ double normalized_mutual_weight(
-    const int* V,
-    const int* E,
-    const double* W, 
-    int u,
-    int v,
-    norm_t norm
-) {
-    double weight_uv = mutual_weight(V, E, W, u, v);
-
-    double scale = 0.0;
-    if (norm == SUM) {
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = mutual_weight(V, E, W, u, neighbor);
-            scale += weight_uw;
-        }
-    } else if (norm == MAX) {
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = mutual_weight(V, E, W, u, neighbor);
-            scale = fmax(scale,weight_uw);
+    for (int i = rowPtrIn[u]; i < rowPtrIn[u+1]; i++) {
+        if (colIdxIn[i] == v) {
+            w_vu = valIn[i];
+            break;
         }
     }
-    return (scale==0.0) ? 0.0 : (weight_uv / scale);
-}
-
-static __device__ double local_constraint(
-    const int* V,
-    const int* E,
-    const double* W,
-    int u,
-    int v
-) {
-    double direct = normalized_mutual_weight(V,E,W,u,v,SUM);
-    double indirect = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        int neighbor = E[i];
-        double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM);
-        double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM);
-        indirect += norm_uw * norm_wv;
-    }
-    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
-    return local_constraint_of_uv;
+    // printf("u=%d, v=%d, w_uv=%f, w_vu=%f\n", u, v, w_uv, w_vu);
+    return w_uv + w_vu;
 }
 
-__global__ void calculate_constraints(
-    const int* __restrict__ V,
-    const int* __restrict__ E,
-    const double* __restrict__ W, 
-    const int num_nodes, 
-    const int* __restrict__ node_mask,
-    double* __restrict__ constraint_results
-) {
-    int start_node = blockIdx.x * NODES_PER_BLOCK;
-    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);
-
-    for (int v = start_node; v < end_node; ++v) {
-        if (!node_mask[v]) continue;
-
-        double constraint_of_v = 0.0;
-        bool is_nan = true;
-
-        __shared__ double shared_constraint[256];
-        double local_sum = 0.0;
-
-        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
-            is_nan = false;
-            int neighbor = E[i];
-            local_sum += local_constraint(V, E, W, v, neighbor);
-        }
+__global__ void compute_out_in_sum(
+    const int* rowPtrOut,
+    const double* valOut,
+    const int* rowPtrIn,
+    const double* valIn,
+    int num_nodes,
+    const int* node_mask,
+    double* d_sum
+)
+{
+    int u = blockIdx.x * blockDim.x + threadIdx.x;
+    if (u >= num_nodes || !node_mask[u]) return; // 跳过未标记的节点
 
-        shared_constraint[threadIdx.x] = local_sum;
-        __syncthreads();
+    double sum_val = 0.0;
 
-        for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
-            if (threadIdx.x < offset) {
-                shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset];
-            }
-            __syncthreads();
-        }
+    for (int i = rowPtrOut[u]; i < rowPtrOut[u + 1]; i++) {
+        sum_val += valOut[i];
+    }
 
-        if (threadIdx.x == 0) {
-            constraint_results[v] = (is_nan) ? NAN : shared_constraint[0];
-        }
+    for (int i = rowPtrIn[u]; i < rowPtrIn[u + 1]; i++) {
+        sum_val += valIn[i];
     }
+    d_sum[u] = sum_val;
+    // printf("Node %d: sum_val = %f\n", u, sum_val);
 }
 
-static __device__ double directed_mutual_weight(
-    const int* V,
-    const int* E,
-    const double* W,
+__device__ double local_constraint(
+    const int* rowPtrOut,
+    const int* colIdxOut,
+    const double* valOut,
+    const int* rowPtrIn,
+    const int* colIdxIn,
+    const double* valIn,
+    const double* d_sum,
     int u,
     int v
 ) {
-    double a_uv = 0.0, a_vu = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        if (E[i] == v) {
-            a_uv = W[i];
-            break;
-        }
+    double weight_uv = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, u, v);
+    double sum_u = 0.0;
+    for (int i = rowPtrOut[u]; i < rowPtrOut[u+1]; i++) {
+        sum_u += valOut[i];
     }
-    for (int i = V[v]; i < V[v+1]; i++) {
-        if (E[i] == u) {
-            a_vu = W[i];
-            break;
-        }
+    for (int i = rowPtrIn[u]; i < rowPtrIn[u+1]; i++) {
+        sum_u += valIn[i];
     }
-    return a_uv + a_vu;
-}
+    double direct = (sum_u == 0.0) ? 0.0 : (weight_uv / sum_u);
 
-static __device__ double directed_normalized_mutual_weight(
-    const int* V,
-    const int* E,
-    const int* row, 
-    const int* col, 
-    const double* W, 
-    int num_edges,
-    int u,
-    int v,
-    norm_t norm
-) {
-    double weight_uv = directed_mutual_weight(V, E, W, u, v);
-
-    double scale = 0.0;
-    if(norm==SUM){
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
-            scale += weight_uw;
-        }
+    double indirect = 0.0;
+    double scale_u = d_sum[u];
+    for (int i = rowPtrOut[u]; i < rowPtrOut[u+1]; i++) {
+        int nbr = colIdxOut[i];
+        double w_uw = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, u, nbr);
+        double w_wv = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, nbr, v);
 
-        for (int i = 0; i < num_edges; i++) {
-            if (col[i] == u) {
-                int neighbor = row[i];
-                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
-                scale += weight_wu;
-            }
-        }
-    }else if(norm==MAX){
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
-            scale = fmax(scale,weight_uw);
-        }
+        // double scale_u = d_sum[u];   // u的分母
+        double scale_nbr = d_sum[nbr];  // neighbor的分母
 
-        for (int i = 0; i < num_edges; i++) {
-            if (col[i] == u) {
-                int neighbor = row[i];
-                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
-                scale = fmax(scale,weight_wu);
-            }
-        }
-    }
-    return (scale==0.0) ? 0.0 : (weight_uv / scale);
-}
+        double norm_uw = (scale_u == 0.0) ? 0.0 : (w_uw / scale_u);
+        double norm_wv = (scale_nbr == 0.0) ? 0.0 : (w_wv / scale_nbr);
 
-static __device__ double directed_local_constraint(
-    const int* V,
-    const int* E,
-    const int* row, 
-    const int* col, 
-    const double* W,
-    int num_edges,
-    int u,
-    int v
-) {
-    double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM);
-    double indirect = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        int neighbor = E[i];
-        double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
-        double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
         indirect += norm_uw * norm_wv;
     }
+    for (int i = rowPtrIn[u]; i < rowPtrIn[u+1]; i++) {
+        int nbr = colIdxIn[i];
+        double w_uw = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, u, nbr);
+        double w_wv = mutual_weight(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, nbr, v);
 
-    for (int i = 0; i < num_edges; i++) {
-        if (col[i] == u) {
-            int neighbor = row[i];
-            double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
-            double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
-            indirect += norm_uw * norm_wv;
-        }
+        
+        double scale_nbr = d_sum[nbr];
+
+        double norm_uw = (scale_u == 0.0) ? 0.0 : (w_uw / scale_u);
+        double norm_wv = (scale_nbr == 0.0) ? 0.0 : (w_wv / scale_nbr);
+
+        indirect += norm_uw * norm_wv;
     }
-    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
-    return local_constraint_of_uv;
+    double result = (direct + indirect) * (direct + indirect);
+    printf("u=%d, v=%d, direct=%f, indirect=%f\n", u, v, direct, indirect);
+    return result;
 }
 
-__global__ void directed_calculate_constraints(
-    const int* V,
-    const int* E,
-    const int* row, 
-    const int* col, 
-    const double* W,  
+__global__ void calculate_constraints(
     int num_nodes,
-    int num_edges,
-    int* node_mask,
+    int NODES_PER_BLOCK,
+    const int* rowPtrOut,
+    const int* colIdxOut,
+    const double* valOut,
+    const int* rowPtrIn,
+    const int* colIdxIn,
+    const double* valIn,
+    const double* d_sum,
+    const int* node_mask,
     double* constraint_results
-) {
+){
     int start_node = blockIdx.x * NODES_PER_BLOCK;
     int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);
 
     for (int v = start_node; v < end_node; ++v) {
         if (!node_mask[v]) continue;
-
-        double constraint_of_v = 0.0;
         bool is_nan = true;
-
+        // bool is_nan = false;
         __shared__ double shared_constraint[256];
         double local_sum = 0.0;
 
-        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
+        for (int i = rowPtrOut[v] + threadIdx.x; i < rowPtrOut[v + 1]; i += blockDim.x) {
             is_nan = false;
-            int neighbor = E[i];
-            local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor);
+            int neighbor = colIdxOut[i];
+            local_sum += local_constraint(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, d_sum, v, neighbor);
         }
 
-        for (int i = threadIdx.x; i < num_edges; i += blockDim.x) {
-            if (col[i] == v) {
-                // is_nan = false;
-                int neighbor = row[i];
-                local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor);
-            }
+        for (int i = rowPtrIn[v] + threadIdx.x; i < rowPtrIn[v + 1]; i += blockDim.x) {
+            int neighbor = colIdxIn[i];
+            local_sum += local_constraint(rowPtrOut, colIdxOut, valOut, rowPtrIn, colIdxIn, valIn, d_sum, v, neighbor);
         }
 
         shared_constraint[threadIdx.x] = local_sum;
@@ -267,63 +272,103 @@ __global__ void directed_calculate_constraints(
     }
 }
 
-
 int cuda_constraint(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
     _IN_ int num_nodes,
-    _IN_ int num_edges,
+    _IN_ int len_rowPtrOut,
+    _IN_ int len_colIdxOut,
+    _IN_ const int* rowPtrOut,
+    _IN_ const int* colIdxOut,
+    _IN_ const double* valOut,
+    _IN_ const int* rowPtrIn,
+    _IN_ const int* colIdxIn,
+    _IN_ const double* valIn,
     _IN_ bool is_directed,
     _IN_ int* node_mask,
-    _OUT_ double* constraint_results
+    _OUT_ double* constraints
 ) {
     int cuda_ret = cudaSuccess;
     int EG_ret = EG_GPU_SUCC;
     
-    int* d_V;
-    int* d_E;
-    int* d_row;
-    int* d_col;
-    double* d_W;
-    int* d_node_mask;
-    double* d_constraint_results;
-    int block_size = 256;
-    int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;
-
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
+    int* d_rowPtrOut = nullptr;
+    int* d_colIdxOut = nullptr;
+    double* d_valOut = nullptr;
+    int* d_rowPtrIn = nullptr;
+    int* d_colIdxIn = nullptr;
+    double* d_valIn = nullptr;
+    int* d_node_mask = nullptr;
+    double* d_constraints = nullptr;
+    double* d_sum = nullptr;
+
+    int device;
+    cudaGetDevice(&device);
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, device);
+
+    int maxThreadsPerBlock = prop.maxThreadsPerBlock;
+    int maxBlocks = prop.maxGridSize[0];
+    int NODES_PER_BLOCK =128;
+    int blockSize = (num_nodes < maxThreadsPerBlock) ? num_nodes : 256;
+    int numBlocks = (num_nodes + blockSize - 1) / blockSize;
+
+    if (numBlocks > maxBlocks) {
+        NODES_PER_BLOCK = numBlocks / maxBlocks + 1;
+        numBlocks = (num_nodes + NODES_PER_BLOCK * blockSize - 1) / (NODES_PER_BLOCK * blockSize);
+    }
+
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_rowPtrOut, len_rowPtrOut * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_colIdxOut, len_colIdxOut * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_valOut, len_colIdxOut * sizeof(double)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_rowPtrIn, len_rowPtrOut * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_colIdxIn, len_colIdxOut * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_valIn, len_colIdxOut * sizeof(double)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraints, num_nodes * sizeof(double)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_sum, num_nodes * sizeof(double)));
     EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraint_results, num_nodes * sizeof(double)));
 
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_rowPtrOut, rowPtrOut, len_rowPtrOut * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_colIdxOut, colIdxOut, len_colIdxOut * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_valOut, valOut, len_colIdxOut * sizeof(double), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_rowPtrIn, rowPtrIn, len_rowPtrOut * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_colIdxIn, colIdxIn, len_colIdxOut * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_valIn, valIn, len_colIdxOut * sizeof(double), cudaMemcpyHostToDevice));
     EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
+
+    
 
     if(is_directed){
-        directed_calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_constraint_results);
+        compute_out_in_sum<<<numBlocks, blockSize>>>(d_rowPtrOut, d_valOut, d_rowPtrIn, d_valIn, num_nodes, d_node_mask, d_sum);
+        cudaDeviceSynchronize();
+
+        calculate_constraints<<<numBlocks, blockSize>>>(
+            num_nodes,
+            NODES_PER_BLOCK,
+            d_rowPtrOut,
+            d_colIdxOut,
+            d_valOut,
+            d_rowPtrIn,
+            d_colIdxIn,
+            d_valIn,
+            d_sum, 
+            d_node_mask,
+            d_constraints
+        );
+        cudaDeviceSynchronize();
+        // constraints.resize(num_nodes);
     }else{
-        calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results);
+        // calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results);
     }
-
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(constraint_results, d_constraint_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(constraints, d_constraints, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
 exit:
 
-    cudaFree(d_V);
-    cudaFree(d_E);
-    cudaFree(d_row);
-    cudaFree(d_col);
-    cudaFree(d_W);
+    cudaFree(d_rowPtrOut);
+    cudaFree(d_colIdxOut);
+    cudaFree(d_valOut);
+    cudaFree(d_rowPtrIn);
+    cudaFree(d_colIdxIn);
+    cudaFree(d_valIn);
+    cudaFree(d_sum);
+    cudaFree(d_constraints);
     cudaFree(d_node_mask);
-    cudaFree(d_constraint_results);
     if (cuda_ret != cudaSuccess) {
         switch (cuda_ret) {
             case cudaErrorMemoryAllocation:
diff --git a/gpu_easygraph/functions/structural_holes/constraint.cuh b/gpu_easygraph/functions/structural_holes/constraint.cuh
index 7086e44a..844f090b 100644
--- a/gpu_easygraph/functions/structural_holes/constraint.cuh
+++ b/gpu_easygraph/functions/structural_holes/constraint.cuh
@@ -5,16 +5,18 @@
 namespace gpu_easygraph {
 
 int cuda_constraint(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
     _IN_ int num_nodes,
-    _IN_ int num_edges,
+    _IN_ int len_rowPtrOut,
+    _IN_ int len_colIdxOut,
+    _IN_ const int* rowPtrOut,
+    _IN_ const int* colIdxOut,
+    _IN_ const double* valOut,
+    _IN_ const int* rowPtrIn,
+    _IN_ const int* colIdxIn,
+    _IN_ const double* valIn,
     _IN_ bool is_directed,
     _IN_ int* node_mask,
-    _OUT_ double* constraint_results
+    _OUT_ double* constraints
 );
 
 } // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/gpu_easygraph.h b/gpu_easygraph/gpu_easygraph.h
index 91c32973..7b66edec 100644
--- a/gpu_easygraph/gpu_easygraph.h
+++ b/gpu_easygraph/gpu_easygraph.h
@@ -58,12 +58,13 @@ int pagerank(
 
 
 int constraint(
-    const std::vector<int>& V,
-    const std::vector<int>& E,
-    const std::vector<int>& row,
-    const std::vector<int>& col,
     int num_nodes,
-    const std::vector<double>& W,
+    const std::vector<int>& rowPtrOut,
+    const std::vector<int>& colIdxOut,
+    const std::vector<double>& valOut,
+    const std::vector<int>& rowPtrIn,
+    const std::vector<int>& colIdxIn,
+    const std::vector<double>& valIn,
     bool is_directed,
     std::vector<int>& node_mask,
     std::vector<double>& constraint