diff --git a/cpp_easygraph/functions/structural_holes/evaluation.cpp b/cpp_easygraph/functions/structural_holes/evaluation.cpp
index 418a0c01..354b689a 100644
--- a/cpp_easygraph/functions/structural_holes/evaluation.cpp
+++ b/cpp_easygraph/functions/structural_holes/evaluation.cpp
@@ -38,7 +38,6 @@ weight_t mutual_weight(Graph& G, node_t u, node_t v, std::string weight) {
     return a_uv + a_vu;
 }
 
-
 weight_t directed_mutual_weight(DiGraph& G, node_t u, node_t v, std::string weight) {
     weight_t a_uv = 0, a_vu = 0;
     if (G.adj.count(u) && G.adj[u].count(v)) {
@@ -173,33 +172,40 @@ std::pair<node_t, weight_t> directed_compute_constraint_of_v(DiGraph& G, node_t
 py::object invoke_cpp_constraint(py::object G, py::object nodes, py::object weight) {
     std::string weight_key = weight_to_string(weight);
     rec_type sum_nmw_rec, local_constraint_rec;
+
     if (nodes.is_none()) {
         nodes = G.attr("nodes");
     }
+
     py::list nodes_list = py::list(nodes);
-    py::list constraint_results = py::list();
     int nodes_list_len = py::len(nodes_list);
-    if(G.attr("is_directed")().cast<bool>()){
+    std::vector<double> constraint_results(nodes_list_len, 0.0);
+
+    if (G.attr("is_directed")().cast<bool>()) {
         DiGraph& G_ = G.cast<DiGraph&>();
         for (int i = 0; i < nodes_list_len; i++) {
             py::object v = nodes_list[i];
             node_t v_id = G_.node_to_id[v].cast<node_t>();
-            std::pair<node_t, weight_t> constraint_pair = directed_compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec);
-            py::tuple constraint_of_v = py::make_tuple(G_.id_to_node[py::cast(constraint_pair.first)], constraint_pair.second);
-            constraint_results.append(constraint_of_v);
+            std::pair<node_t, weight_t> constraint_pair =
+                directed_compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec);
+            constraint_results[i] = constraint_pair.second;
         }
-    }else{
+    } else {
         Graph& G_ = G.cast<Graph&>();
         for (int i = 0; i < nodes_list_len; i++) {
             py::object v = nodes_list[i];
             node_t v_id = G_.node_to_id[v].cast<node_t>();
-            std::pair<node_t, weight_t> constraint_pair = compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec);
-            py::tuple constraint_of_v = py::make_tuple(G_.id_to_node[py::cast(constraint_pair.first)], constraint_pair.second);
-            constraint_results.append(constraint_of_v);
+            std::pair<node_t, weight_t> constraint_pair =
+                compute_constraint_of_v(G_, v_id, weight_key, local_constraint_rec, sum_nmw_rec);
+            constraint_results[i] = constraint_pair.second;
         }
     }
-    py::dict constraint = py::dict(constraint_results);
-    return constraint;
+
+    std::reverse(constraint_results.begin(), constraint_results.end());
+
+    py::array::ShapeContainer ret_shape{nodes_list_len};
+    py::array_t<double> ret(ret_shape, constraint_results.data());
+    return ret;
 }
 
 #ifdef EASYGRAPH_ENABLE_GPU
@@ -242,19 +248,13 @@ static py::object invoke_gpu_constraint(py::object G, py::object nodes, py::obje
         py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r));
     }
 
-    py::dict constraint_dict;
-    for (auto node : nodes_list) {
-        int node_id = G_.node_to_id[node].cast<node_t>();
-        int idx = node2idx[node_id];
+    py::array::ShapeContainer ret_shape{(int)constraint_results.size()};
+    py::array_t<double> ret(ret_shape, constraint_results.data());
 
-        py::object node_name = G_.id_to_node.attr("get")(py::cast(node_id));
-        constraint_dict[node_name] = py::cast(constraint_results[idx]);
-    }
-    return constraint_dict;
+    return ret;
 }
 #endif
 
-
 py::object constraint(py::object G, py::object nodes, py::object weight, py::object n_workers) {
 #ifdef EASYGRAPH_ENABLE_GPU
     return invoke_gpu_constraint(G, nodes, weight);
@@ -269,9 +269,6 @@ weight_t redundancy(Graph& G, node_t u, node_t v, std::string weight, rec_type&
     for (const auto& n : G.adj[v]) {
         neighbors.insert(n.first);
     }
-    // for (const auto& n : G.pred[v]) {
-    //     neighbors.insert(n.first);
-    // }
     for (const auto& w : neighbors) {
         r += normalized_mutual_weight(G, u, w, weight, sum, sum_nmw_rec) * normalized_mutual_weight(G, v, w, weight, max, max_nmw_rec);
     }
@@ -294,42 +291,43 @@ weight_t directed_redundancy(DiGraph& G, node_t u, node_t v, std::string weight,
 }
 
 py::object invoke_cpp_effective_size(py::object G, py::object nodes, py::object weight) {
+    std::string weight_key = weight_to_string(weight);
     rec_type sum_nmw_rec, max_nmw_rec;
-    py::dict effective_size = py::dict();
+
     if (nodes.is_none()) {
-        nodes = G;
+        nodes = G.attr("nodes");
     }
-    nodes = py::list(nodes);
+
+    py::list nodes_list = py::list(nodes);
+    int nodes_list_len = py::len(nodes_list);
+    std::vector<double> effective_size_results(nodes_list_len, 0.0);
+    
     if (!G.attr("is_directed")().cast<bool>()){
         Graph& G_ = G.cast<Graph&>();
-        std::string weight_key = weight_to_string(weight);
-        int nodes_len = py::len(nodes);
-        for (int i = 0; i < nodes_len; i++) {
-            py::object v = nodes[py::cast(i)];
+        for (int i = 0; i < nodes_list_len; i++) {
+            weight_t redundancy_sum = 0;
+            py::object v = nodes_list[i];
+            node_t v_id = G_.node_to_id[v].cast<node_t>();
             if (py::len(G[v]) == 0) {
-                effective_size[v] = py::cast(Py_NAN);
+                effective_size_results[i] = Py_NAN;
                 continue;
             }
-            weight_t redundancy_sum = 0;
-            node_t v_id = G_.node_to_id[v].cast<node_t>();
             for (const auto& neighbor_info : G_.adj[v_id]) {
                 node_t u_id = neighbor_info.first;
                 redundancy_sum += redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec);
             }
-            effective_size[v] = redundancy_sum;
+            effective_size_results[i] = redundancy_sum;
         }
     } else{
         DiGraph& G_ = G.cast<DiGraph&>();
-        std::string weight_key = weight_to_string(weight);
-        int nodes_len = py::len(nodes);
-        for (int i = 0; i < nodes_len; i++) {
-            py::object v = nodes[py::cast(i)];
+        for (int i = 0; i < nodes_list_len; i++) {
+            weight_t redundancy_sum = 0;
+            py::object v = nodes_list[i];
+            node_t v_id = G_.node_to_id[v].cast<node_t>();
             if (py::len(G[v]) == 0) {
-                effective_size[v] = py::cast(Py_NAN);
+                effective_size_results[i] = Py_NAN;
                 continue;
             }
-            weight_t redundancy_sum = 0;
-            node_t v_id = G_.node_to_id[v].cast<node_t>();
             for (const auto& neighbor_info : G_.adj[v_id]) {
                 node_t u_id = neighbor_info.first;
                 redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec);
@@ -338,16 +336,21 @@ py::object invoke_cpp_effective_size(py::object G, py::object nodes, py::object
                 node_t u_id = neighbor_info.first;
                 redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec);
             }
-            effective_size[v] = redundancy_sum;
+            effective_size_results[i] = redundancy_sum;
         }
     }
-    return effective_size;
+
+    std::reverse(effective_size_results.begin(), effective_size_results.end());
+    
+    py::array::ShapeContainer ret_shape{nodes_list_len};
+    py::array_t<double> ret(ret_shape, effective_size_results.data());
+    return ret;
 }
 
 #ifdef EASYGRAPH_ENABLE_GPU
 static py::object invoke_gpu_effective_size(py::object G, py::object nodes, py::object weight) {
     Graph& G_ = G.cast<Graph&>();
-    py::dict effective_size = py::dict();
+
     if (weight.is_none()) {
         G_.gen_CSR();
     } else {
@@ -369,18 +372,17 @@ static py::object invoke_gpu_effective_size(py::object G, py::object nodes, py::
     std::vector<double> effective_size_results(num_nodes);
     bool is_directed = G.attr("is_directed")().cast<bool>();
 
-    // 设置节点掩码数组
     std::vector<int> node_mask(num_nodes, 0);
     py::list nodes_list;
     if (!nodes.is_none()) {
         nodes_list = py::list(nodes);
         for (auto node : nodes_list) {
             int node_id = node2idx[G_.node_to_id[node].cast<node_t>()];
-            node_mask[node_id] = 1;  // 标记需要计算的节点
+            node_mask[node_id] = 1;
         }
     } else {
         nodes_list = py::list(G.attr("nodes"));
-        std::fill(node_mask.begin(), node_mask.end(), 1);  // 计算所有节点
+        std::fill(node_mask.begin(), node_mask.end(), 1);
     }
 
     int gpu_r = gpu_easygraph::effective_size(V, E, row, col, num_nodes, *W_p, is_directed, node_mask, effective_size_results);
@@ -389,15 +391,10 @@ static py::object invoke_gpu_effective_size(py::object G, py::object nodes, py::
         py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r));
     }
 
-    py::dict effective_size_dict;
-    for (auto node : nodes_list) {
-        int node_id = G_.node_to_id[node].cast<node_t>();
-        int idx = node2idx[node_id];
+    py::array::ShapeContainer ret_shape{(int)effective_size_results.size()};
+    py::array_t<double> ret(ret_shape, effective_size_results.data());
 
-        py::object node_name = G_.id_to_node.attr("get")(py::cast(node_id));
-        effective_size_dict[node_name] = py::cast(effective_size_results[idx]);
-    }
-    return effective_size_dict;
+    return ret;
 }
 #endif
 
@@ -411,9 +408,58 @@ py::object effective_size(py::object G, py::object nodes, py::object weight, py:
 
 #ifdef EASYGRAPH_ENABLE_GPU
 static py::object invoke_gpu_efficiency(py::object G, py::object nodes, py::object weight) {
-    py::dict effective_size_dict = invoke_gpu_effective_size(G, nodes, weight);
+    Graph& G_ = G.cast<Graph&>();
+    py::dict effective_size = py::dict();
+    if (weight.is_none()) {
+        G_.gen_CSR();
+    } else {
+        G_.gen_CSR(weight_to_string(weight));
+    }
+    auto csr_graph = G_.csr_graph;
+    auto coo_graph = G_.transfer_csr_to_coo(csr_graph);
 
-    py::dict degree;
+    std::vector<int>& V = csr_graph->V;
+    std::vector<int>& E = csr_graph->E;
+    std::vector<int>& row = coo_graph->row;
+    std::vector<int>& col = coo_graph->col;
+
+    std::vector<double>* W_p = weight.is_none() ? &(coo_graph->unweighted_W)
+                                                : coo_graph->W_map.find(weight_to_string(weight))->second.get();
+
+    std::unordered_map<node_t, int>& node2idx = coo_graph->node2idx;
+    int num_nodes = coo_graph->node2idx.size();
+    std::vector<double> effective_size_results(num_nodes);
+    bool is_directed = G.attr("is_directed")().cast<bool>();
+
+    // 设置节点掩码数组
+    std::vector<int> node_mask(num_nodes, 0);
+    py::list nodes_list;
+    if (!nodes.is_none()) {
+        nodes_list = py::list(nodes);
+        for (auto node : nodes_list) {
+            int node_id = node2idx[G_.node_to_id[node].cast<node_t>()];
+            node_mask[node_id] = 1;  // 标记需要计算的节点
+        }
+    } else {
+        nodes_list = py::list(G.attr("nodes"));
+        std::fill(node_mask.begin(), node_mask.end(), 1);  // 计算所有节点
+    }
+
+    int gpu_r = gpu_easygraph::effective_size(V, E, row, col, num_nodes, *W_p, is_directed, node_mask, effective_size_results);
+
+    if (gpu_r != gpu_easygraph::EG_GPU_SUCC) {
+        py::pybind11_fail(gpu_easygraph::err_code_detail(gpu_r));
+    }
+
+    py::dict effective_size_dict;
+    for (auto node : nodes_list) {
+        int node_id = G_.node_to_id[node].cast<node_t>();
+        int idx = node2idx[node_id];
+
+        py::object node_name = G_.id_to_node.attr("get")(py::cast(node_id));
+        effective_size_dict[node_name] = py::cast(effective_size_results[idx]);
+    }
+        py::dict degree;
     if (weight.is_none()) {
         degree = G.attr("degree")(py::none()).cast<py::dict>();
     } else {
@@ -442,8 +488,55 @@ static py::object invoke_gpu_efficiency(py::object G, py::object nodes, py::obje
 }
 #endif
 
+
 py::object invoke_cpp_efficiency(py::object G, py::object nodes, py::object weight) {
-    py::dict effective_size_dict = invoke_cpp_effective_size(G, nodes, weight);
+    rec_type sum_nmw_rec, max_nmw_rec;
+    py::dict effective_size_dict = py::dict();
+    if (nodes.is_none()) {
+        nodes = G;
+    }
+    nodes = py::list(nodes);
+    if (!G.attr("is_directed")().cast<bool>()){
+        Graph& G_ = G.cast<Graph&>();
+        std::string weight_key = weight_to_string(weight);
+        int nodes_len = py::len(nodes);
+        for (int i = 0; i < nodes_len; i++) {
+            py::object v = nodes[py::cast(i)];
+            if (py::len(G[v]) == 0) {
+                effective_size_dict[v] = py::cast(Py_NAN);
+                continue;
+            }
+            weight_t redundancy_sum = 0;
+            node_t v_id = G_.node_to_id[v].cast<node_t>();
+            for (const auto& neighbor_info : G_.adj[v_id]) {
+                node_t u_id = neighbor_info.first;
+                redundancy_sum += redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec);
+            }
+            effective_size_dict[v] = redundancy_sum;
+        }
+    } else{
+        DiGraph& G_ = G.cast<DiGraph&>();
+        std::string weight_key = weight_to_string(weight);
+        int nodes_len = py::len(nodes);
+        for (int i = 0; i < nodes_len; i++) {
+            py::object v = nodes[py::cast(i)];
+            if (py::len(G[v]) == 0) {
+                effective_size_dict[v] = py::cast(Py_NAN);
+                continue;
+            }
+            weight_t redundancy_sum = 0;
+            node_t v_id = G_.node_to_id[v].cast<node_t>();
+            for (const auto& neighbor_info : G_.adj[v_id]) {
+                node_t u_id = neighbor_info.first;
+                redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec);
+            }
+            for (const auto& neighbor_info : G_.pred[v_id]) {
+                node_t u_id = neighbor_info.first;
+                redundancy_sum += directed_redundancy(G_, v_id, u_id, weight_key, sum_nmw_rec, max_nmw_rec);
+            }
+            effective_size_dict[v] = redundancy_sum;
+        }
+    }
 
     py::dict degree;
     if (weight.is_none()) {
@@ -536,65 +629,18 @@ py::object invoke_cpp_hierarchy(py::object G, py::object nodes, py::object weigh
     }
     py::list nodes_list = py::list(nodes);
     int nodes_list_len = py::len(nodes_list);
-    // Graph& G_ = G.cast<Graph&>();
     py::dict hierarchy = py::dict();
     
     if(G.attr("is_directed")().cast<bool>()){
         DiGraph& G_ = G.cast<DiGraph&>();
-        // if (!n_workers.is_none()) {
-        //     std::vector<node_t> node_ids;
-        //     int n_workers_num = n_workers.cast<unsigned>();
-        //     for (int i = 0;i < py::len(nodes_list);i++) {
-        //         py::object node = nodes_list[i];
-        //         node_ids.push_back(G_.node_to_id[node].cast<node_t>());
-        //     }
-        //     std::shuffle(node_ids.begin(), node_ids.end(), std::random_device());
-        //     std::vector<std::vector<node_t> > split_nodes;
-        //     if (node_ids.size() > n_workers_num * 30000) {
-        //         split_nodes = split_len(node_ids, 30000);
-        //     }
-        //     else {
-        //         split_nodes = split(node_ids, n_workers_num);
-        //     }
-        //     while (split_nodes.size() < n_workers_num) {
-        //         split_nodes.push_back(std::vector<node_t>());
-        //     }
-        //     std::vector<std::unordered_map<node_t, weight_t> > rets(n_workers_num);
-        //     Py_BEGIN_ALLOW_THREADS
-
-        //         std::vector<std::thread> threads;
-        //         for (int i = 0;i < n_workers_num; i++) {
-        //             threads.push_back(std::thread(hierarchy_parallel, &G_, &split_nodes[i], weight_key, &rets[i]));
-        //         }
-        //         for (int i = 0;i < n_workers_num;i++) {
-        //             threads[i].join();
-        //         }
-
-        //     Py_END_ALLOW_THREADS
-
-        //     for (int i = 1;i < rets.size();i++) {
-        //         rets[0].insert(rets[i].begin(), rets[i].end());
-        //     }
-        //     for (const auto& hierarchy_pair : rets[0]) {
-        //         py::object node = G_.id_to_node[py::cast(hierarchy_pair.first)];
-        //         hierarchy[node] = hierarchy_pair.second;
-        //     }
-        // }
-        // else {
         for (int i = 0; i < nodes_list_len; i++) {
             py::object v = nodes_list[i];
-            // py::object E = G.attr("ego_subgraph")(v);
-
-            // int n = py::len(E) - 1;
-
             weight_t C = 0;
             std::map<node_t, weight_t> c;
 
-            // 获取 successors 和 predecessors
             py::list successors_of_v = py::list(G.attr("successors")(v));
             py::list predecessors_of_v = py::list(G.attr("predecessors")(v));
 
-            // 使用 set 去重合并
             std::set<node_t> neighbors_of_v;
             for (const auto& w : successors_of_v) {
                 neighbors_of_v.insert(G_.node_to_id[w].cast<node_t>());
@@ -603,17 +649,14 @@ py::object invoke_cpp_hierarchy(py::object G, py::object nodes, py::object weigh
                 neighbors_of_v.insert(G_.node_to_id[w].cast<node_t>());
             }
 
-            // 遍历 neighbors_of_v
             for (const auto& w_id : neighbors_of_v) {
                 node_t v_id = G_.node_to_id[v].cast<node_t>();
-                // std::cout << "Node: " << v_id << ", Neighbor: " << w_id << std::endl;
 
-                // 计算约束值并存储
                 C += directed_local_constraint(G_, v_id, w_id, weight_key, local_constraint_rec, sum_nmw_rec);
                 c[w_id] = directed_local_constraint(G_, v_id, w_id, weight_key, local_constraint_rec, sum_nmw_rec);
             }
             int n = neighbors_of_v.size();
-            // 如果邻居数大于 1，计算层级性
+
             if (n > 1) {
                 weight_t hierarchy_sum = 0;
                 for (const auto& w_id : neighbors_of_v) {
@@ -622,12 +665,11 @@ py::object invoke_cpp_hierarchy(py::object G, py::object nodes, py::object weigh
                 hierarchy[v] = hierarchy_sum;
             }
 
-            // 如果层级性未定义，设置为 0
             if (!hierarchy.contains(v)) {
                 hierarchy[v] = 0;
             }
         }
-        // }
+
     }else{
         Graph& G_ = G.cast<Graph&>();
         if (!n_workers.is_none()) {
diff --git a/gpu_easygraph/functions/structural_holes/constraint.cpp b/gpu_easygraph/functions/structural_holes/constraint.cpp
new file mode 100644
index 00000000..a8b5731b
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/constraint.cpp
@@ -0,0 +1,31 @@
+#include <string>
+#include <vector>
+#include <memory> // 需要包含memory头文件
+
+#include "structural_holes/constraint.cuh"
+#include "common.h"
+
+namespace gpu_easygraph {
+
+using std::vector;
+
+int constraint(
+    _IN_ const vector<int>& V,
+    _IN_ const vector<int>& E,
+    _IN_ const vector<int>& row,
+    _IN_ const vector<int>& col,
+    _IN_ int num_nodes,
+    _IN_ const vector<double>& W,
+    _IN_ bool is_directed,
+    _IN_ vector<int>& node_mask,  // 添加节点掩码参数
+    _OUT_ vector<double>& constraint
+) {
+    int num_edges = row.size();
+    
+    constraint = vector<double>(num_nodes);
+    int r = cuda_constraint(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), constraint.data());
+
+    return r;  // 成功
+}
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/constraint.cu b/gpu_easygraph/functions/structural_holes/constraint.cu
new file mode 100644
index 00000000..31c7de17
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/constraint.cu
@@ -0,0 +1,354 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdlib.h>
+
+#include "common.h"
+#define NODES_PER_BLOCK 1
+
+namespace gpu_easygraph {
+
+enum norm_t { SUM = 0, MAX = 1 };
+
+static __device__ double mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double a_uv = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        if (E[i] == v) {
+            a_uv = W[i];
+            break;
+        }
+    }
+    return a_uv;
+}
+
+static __device__ double normalized_mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W, 
+    int u,
+    int v,
+    norm_t norm
+) {
+    double weight_uv = mutual_weight(V, E, W, u, v);
+
+    double scale = 0.0;
+    if (norm == SUM) {
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = mutual_weight(V, E, W, u, neighbor);
+            scale += weight_uw;
+        }
+    } else if (norm == MAX) {
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = mutual_weight(V, E, W, u, neighbor);
+            scale = fmax(scale,weight_uw);
+        }
+    }
+    return (scale==0.0) ? 0.0 : (weight_uv / scale);
+}
+
+static __device__ double local_constraint(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double direct = normalized_mutual_weight(V,E,W,u,v,SUM);
+    double indirect = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        int neighbor = E[i];
+        double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM);
+        double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM);
+        indirect += norm_uw * norm_wv;
+    }
+    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
+    return local_constraint_of_uv;
+}
+
+__global__ void calculate_constraints(
+    const int* __restrict__ V,
+    const int* __restrict__ E,
+    const double* __restrict__ W, 
+    const int num_nodes, 
+    const int* __restrict__ node_mask,
+    double* __restrict__ constraint_results
+) {
+    int start_node = blockIdx.x * NODES_PER_BLOCK;  // 当前块处理的起始节点
+    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);  // 结束节点，确保不越界
+
+    for (int v = start_node; v < end_node; ++v) {
+        if (!node_mask[v]) continue;
+
+        double constraint_of_v = 0.0;
+        bool is_nan = true;
+
+        // 使用所有线程并行处理邻居
+        __shared__ double shared_constraint[256];  // 假设 blockDim.x <= 256
+        double local_sum = 0.0;
+
+        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
+            is_nan = false;
+            int neighbor = E[i];
+            local_sum += local_constraint(V, E, W, v, neighbor);
+        }
+
+        // 归约邻居结果到一个值
+        shared_constraint[threadIdx.x] = local_sum;
+        __syncthreads();
+
+        for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
+            if (threadIdx.x < offset) {
+                shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset];
+            }
+            __syncthreads();
+        }
+
+        // 保存最终结果
+        if (threadIdx.x == 0) {
+            constraint_results[v] = (is_nan) ? NAN : shared_constraint[0];
+        }
+    }
+}
+
+static __device__ double directed_mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double a_uv = 0.0, a_vu = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        if (E[i] == v) {
+            a_uv = W[i];
+            break;
+        }
+    }
+    for (int i = V[v]; i < V[v+1]; i++) {
+        if (E[i] == u) {
+            a_vu = W[i];
+            break;
+        }
+    }
+    return a_uv + a_vu;
+}
+
+static __device__ double directed_normalized_mutual_weight(
+    const int* V,
+    const int* E,
+    const int* row, 
+    const int* col, 
+    const double* W, 
+    int num_edges,
+    int u,
+    int v,
+    norm_t norm
+) {
+    double weight_uv = directed_mutual_weight(V, E, W, u, v);
+
+    double scale = 0.0;
+    if(norm==SUM){
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
+            scale += weight_uw;
+        }
+
+        for (int i = 0; i < num_edges; i++) {
+            if (col[i] == u) {
+                int neighbor = row[i];
+                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
+                scale += weight_wu;
+            }
+        }
+    }else if(norm==MAX){
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
+            scale = fmax(scale,weight_uw);
+        }
+
+        for (int i = 0; i < num_edges; i++) {
+            if (col[i] == u) {
+                int neighbor = row[i];
+                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
+                scale = fmax(scale,weight_wu);
+            }
+        }
+    }
+    return (scale==0.0) ? 0.0 : (weight_uv / scale);
+}
+
+static __device__ double directed_local_constraint(
+    const int* V,
+    const int* E,
+    const int* row, 
+    const int* col, 
+    const double* W,
+    int num_edges,
+    int u,
+    int v
+) {
+    double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM);
+    double indirect = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        int neighbor = E[i];
+        double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
+        double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
+        indirect += norm_uw * norm_wv;
+    }
+
+    for (int i = 0; i < num_edges; i++) {
+        if (col[i] == u) {
+            int neighbor = row[i];
+            double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
+            double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
+            indirect += norm_uw * norm_wv;
+        }
+    }
+    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
+    return local_constraint_of_uv;
+}
+
+__global__ void directed_calculate_constraints(
+    const int* V,
+    const int* E,
+    const int* row, 
+    const int* col, 
+    const double* W,  
+    int num_nodes,
+    int num_edges,
+    int* node_mask,
+    double* constraint_results
+) {
+    int start_node = blockIdx.x * NODES_PER_BLOCK;  // 当前块处理的起始节点
+    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);  // 结束节点，确保不越界
+
+    for (int v = start_node; v < end_node; ++v) {
+        if (!node_mask[v]) continue;
+
+        double constraint_of_v = 0.0;
+        bool is_nan = true;
+
+        // 使用所有线程并行处理邻居
+        __shared__ double shared_constraint[256];  // 假设 blockDim.x <= 256
+        double local_sum = 0.0;
+
+        // 计算出邻居约束（out-neighbors）
+        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
+            is_nan = false;
+            int neighbor = E[i];
+            local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor);
+        }
+
+        // 计算入邻居约束（in-neighbors）
+        for (int i = threadIdx.x; i < num_edges; i += blockDim.x) {
+            if (col[i] == v) {
+                // is_nan = false;
+                int neighbor = row[i];
+                local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor);
+            }
+        }
+
+        // 将所有线程的局部结果写入共享内存
+        shared_constraint[threadIdx.x] = local_sum;
+        __syncthreads();
+
+        // 归约邻居结果到一个值
+        for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
+            if (threadIdx.x < offset) {
+                shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset];
+            }
+            __syncthreads();
+        }
+
+        // 保存最终结果
+        if (threadIdx.x == 0) {
+            constraint_results[v] = (is_nan) ? NAN : shared_constraint[0];
+        }
+    }
+}
+
+
+int cuda_constraint(
+    _IN_ const int* V,
+    _IN_ const int* E,
+    _IN_ const int* row,
+    _IN_ const int* col,
+    _IN_ const double* W,
+    _IN_ int num_nodes,
+    _IN_ int num_edges,
+    _IN_ bool is_directed,
+    _IN_ int* node_mask,
+    _OUT_ double* constraint_results
+) {
+    int cuda_ret = cudaSuccess;
+    int EG_ret = EG_GPU_SUCC;
+    
+    int* d_V;
+    int* d_E;
+    int* d_row;
+    int* d_col;
+    double* d_W;
+    int* d_node_mask;
+    double* d_constraint_results;
+    int block_size = 256;  // 每个块的线程数，可以根据设备进行调整
+    int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;  // 调整为每个块处理NODES_PER_BLOCK个节点
+
+    // 分配CUDA设备内存
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraint_results, num_nodes * sizeof(double)));
+
+    // 将数据从主机拷贝到设备
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
+
+    
+    // 调用 CUDA 内核
+    if(is_directed){
+        directed_calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_constraint_results);
+    }else{
+        calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results);
+    }
+
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(constraint_results, d_constraint_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
+exit:
+
+    cudaFree(d_V);
+    cudaFree(d_E);
+    cudaFree(d_row);
+    cudaFree(d_col);
+    cudaFree(d_W);
+    cudaFree(d_node_mask);
+    cudaFree(d_constraint_results);
+    if (cuda_ret != cudaSuccess) {
+        switch (cuda_ret) {
+            case cudaErrorMemoryAllocation:
+                EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM;
+                break;
+            default:
+                EG_ret = EG_GPU_DEVICE_ERR;
+                break;
+        }
+    }
+
+    return EG_ret; 
+}
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/constraint.cuh b/gpu_easygraph/functions/structural_holes/constraint.cuh
new file mode 100644
index 00000000..7086e44a
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/constraint.cuh
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "common.h"
+
+namespace gpu_easygraph {
+
+int cuda_constraint(
+    _IN_ const int* V,
+    _IN_ const int* E,
+    _IN_ const int* row,
+    _IN_ const int* col,
+    _IN_ const double* W,
+    _IN_ int num_nodes,
+    _IN_ int num_edges,
+    _IN_ bool is_directed,
+    _IN_ int* node_mask,
+    _OUT_ double* constraint_results
+);
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/effective_size.cpp b/gpu_easygraph/functions/structural_holes/effective_size.cpp
new file mode 100644
index 00000000..771022a5
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/effective_size.cpp
@@ -0,0 +1,31 @@
+#include <string>
+#include <vector>
+#include <memory> // 需要包含memory头文件
+
+#include "structural_holes/effective_size.cuh"
+#include "common.h"
+
+namespace gpu_easygraph {
+
+using std::vector;
+
+int effective_size(
+    _IN_ const vector<int>& V,
+    _IN_ const vector<int>& E,
+    _IN_ const vector<int>& row,
+    _IN_ const vector<int>& col,
+    _IN_ int num_nodes,
+    _IN_ const vector<double>& W,
+    _IN_ bool is_directed,
+    _IN_ vector<int>& node_mask,  // 添加节点掩码参数
+    _OUT_ vector<double>& effective_size
+) {
+    int num_edges = row.size();
+    
+    effective_size = vector<double>(num_nodes);
+    int r = cuda_effective_size(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), effective_size.data());
+
+    return r;  // 成功
+}
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/effective_size.cu b/gpu_easygraph/functions/structural_holes/effective_size.cu
new file mode 100644
index 00000000..38d34af2
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/effective_size.cu
@@ -0,0 +1,362 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdlib.h>
+
+#include "common.h"
+#define NODES_PER_BLOCK 1
+
+namespace gpu_easygraph {
+
+enum norm_t { SUM = 0, MAX = 1 };
+
+static __device__ double mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double a_uv = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        if (E[i] == v) {
+            a_uv = W[i];
+            break;
+        }
+    }
+    return a_uv;
+}
+
+static __device__ double normalized_mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W, 
+    int u,
+    int v,
+    norm_t norm
+) {
+    double weight_uv = mutual_weight(V, E, W, u, v);
+
+    double scale = 0.0;
+    if (norm == SUM) {
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = mutual_weight(V, E, W, u, neighbor);
+            scale += weight_uw;
+        }
+    } else if (norm == MAX) {
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = mutual_weight(V, E, W, u, neighbor);
+            scale = fmax(scale,weight_uw);
+        }
+    }
+    return (scale==0.0) ? 0.0 : (weight_uv / scale);
+}
+
+static __device__ double directed_mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double a_uv = 0.0, a_vu = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        if (E[i] == v) {
+            a_uv = W[i];
+            break;
+        }
+    }
+    for (int i = V[v]; i < V[v+1]; i++) {
+        if (E[i] == u) {
+            a_vu = W[i];
+            break;
+        }
+    }
+    return a_uv + a_vu;
+}
+
+static __device__ double directed_normalized_mutual_weight(
+    const int* V,
+    const int* E,
+    const int* row, 
+    const int* col, 
+    const double* W, 
+    int num_edges,
+    int u,
+    int v,
+    norm_t norm
+) {
+    double weight_uv = directed_mutual_weight(V, E, W, u, v);
+
+    double scale = 0.0;
+    if(norm==SUM){
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
+            scale += weight_uw;
+        }
+
+        for (int i = 0; i < num_edges; i++) {
+            if (col[i] == u) {
+                int neighbor = row[i];
+                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
+                scale += weight_wu;
+            }
+        }
+    }else if(norm==MAX){
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
+            scale = fmax(scale,weight_uw);
+        }
+
+        for (int i = 0; i < num_edges; i++) {
+            if (col[i] == u) {
+                int neighbor = row[i];
+                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
+                scale = fmax(scale,weight_wu);
+            }
+        }
+    }
+    return (scale==0.0) ? 0.0 : (weight_uv / scale);
+}
+
+static __device__ double redundancy(
+    const int* V,
+    const int* E,
+    const double* W,
+    const int num_nodes,
+    int u,
+    int v
+) {
+    double r = 0.0;
+    for (int i = V[v]; i < V[v + 1]; i++) {
+        int w = E[i];
+        r += normalized_mutual_weight(V, E, W, u, w, SUM) * normalized_mutual_weight(V, E, W, v, w, MAX);
+    }
+    return 1-r;
+}
+
+
+// Warp 级别归约函数
+__inline__ __device__ double warp_reduce_sum(double val)
+{
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+    {
+        val += __shfl_down_sync(0xffffffff, val, offset);
+    }
+    return val;
+}
+
+// 线程块内归约函数，利用 warp 归约
+__inline__ __device__ double block_reduce_sum(double val)
+{
+    // 每个 warp 进行归约
+    val = warp_reduce_sum(val);
+
+    // 将每个 warp 的结果存入共享内存
+    __shared__ double shared[32]; // 一个线程块最多有 32 个 warp
+    int warp_id = threadIdx.x / warpSize;
+    if (threadIdx.x % warpSize == 0)
+    {
+        shared[warp_id] = val;
+    }
+    __syncthreads();
+
+    // 由第一个 warp 进行最终的归约
+    if (warp_id == 0)
+    {
+        val = (threadIdx.x < (blockDim.x / warpSize)) ? shared[threadIdx.x] : 0.0;
+        val = warp_reduce_sum(val);
+    }
+    return val;
+}
+
+__global__ void calculate_effective_size(
+    const int* __restrict__ V,
+    const int* __restrict__ E,
+    const double* __restrict__ W,
+    const int num_nodes,
+    const int* __restrict__ node_mask,
+    double* __restrict__ effective_size_results
+) {
+    // 每个线程块处理一个节点
+    int u = blockIdx.x;
+    if (u >= num_nodes || !node_mask[u]) return;
+
+    // 获取节点 u 的邻居范围
+    int neighbor_start = V[u];
+    int neighbor_end = V[u + 1];
+    int degree = neighbor_end - neighbor_start;
+
+    // 计算线程块的线程数量
+    int threads_per_block = blockDim.x;
+
+    // 每个线程处理节点 u 的一个或多个邻居
+    double redundancy_sum = 0.0;
+    for (int idx = threadIdx.x; idx < degree; idx += threads_per_block) {
+        int i = neighbor_start + idx;
+        int v = E[i];
+        if (v != u) {
+            // redundancy_sum += redundancy(V, E, W, num_nodes, u, v);
+            double r = 0.0;
+            for (int j = V[v]; j < V[v + 1]; j++) {
+                int w = E[j];
+                r += normalized_mutual_weight(V, E, W, u, w, SUM) * 
+                     normalized_mutual_weight(V, E, W, v, w, MAX);
+            }
+            redundancy_sum += 1 - r;
+        }
+    }
+
+    // 线程块内归约
+    redundancy_sum = block_reduce_sum(redundancy_sum);
+
+    // 将结果写入全局内存
+    if (threadIdx.x == 0) {
+        effective_size_results[u] = redundancy_sum;
+    }
+}
+
+static __device__ double directed_redundancy(
+    const int* V,
+    const int* E,
+    const int* row,
+    const int* col,
+    const double* W,
+    const int num_nodes,
+    const int num_edges,
+    int u,
+    int v
+) {
+    double r = 0.0;
+    for (int i = V[v]; i < V[v + 1]; i++) {
+        int w = E[i];
+        r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX);
+    }
+    for (int i = 0; i < num_edges; i++) {
+        if (col[i] == v) {
+            int w = row[i];
+            r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX);
+        }
+    }
+    return 1-r;
+}
+
+__global__ void directed_calculate_effective_size(
+    const int* V,
+    const int* E,
+    const int* row,
+    const int* col,
+    const double* W, 
+    const int num_nodes,
+    const int num_edges,
+    const int* node_mask,
+    double* effective_size_results
+) {
+    int u = blockIdx.x * blockDim.x + threadIdx.x;
+    if (u >= num_nodes || !node_mask[u]) return;
+    double redundancy_sum = 0.0;
+    bool is_nan = true;
+
+    // 遍历 u 的所有邻居
+    for (int i = V[u]; i < V[u + 1]; i++) {
+        int v = E[i];
+        if (v == u) continue; // 排除自连接的情况
+        is_nan = false;
+        redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v);
+    }
+    for (int i = 0; i < num_edges; i++) {
+        if (col[i] == u) {
+            int v = row[i];
+            redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v);
+        }
+    }
+    effective_size_results[u] = is_nan ? NAN : redundancy_sum;
+}
+
+
+int cuda_effective_size(
+    _IN_ const int* V,
+    _IN_ const int* E,
+    _IN_ const int* row,
+    _IN_ const int* col,
+    _IN_ const double* W,
+    _IN_ int num_nodes,
+    _IN_ int num_edges,
+    _IN_ bool is_directed,
+    _IN_ int* node_mask,
+    _OUT_ double* effective_size_results
+) {
+    int cuda_ret = cudaSuccess;
+    int EG_ret = EG_GPU_SUCC;
+    int min_grid_size = 0;
+    int block_size = 0;
+
+    
+    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_effective_size, 0, 0);
+
+    int grid_size = (num_nodes + block_size * NODES_PER_BLOCK - 1) / (block_size * NODES_PER_BLOCK);
+
+    int* d_V;
+    int* d_E;
+    int* d_row;
+    int* d_col;
+    double* d_W;
+    int* d_node_mask;
+    double* d_effective_size_results;
+
+    // 分配 CUDA 设备内存
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_effective_size_results, num_nodes * sizeof(double)));
+
+    // 将数据从主机拷贝到设备
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
+
+    if(is_directed){
+        directed_calculate_effective_size<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_effective_size_results);
+    }else{
+        int block_size = 256;  // 每个块的线程数，可以根据设备进行调整
+        int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;  // 调整为每个块处理NODES_PER_BLOCK个节点
+        // int shared_memory_size = blockDim.x * sizeof(double); 
+        calculate_effective_size<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_effective_size_results);
+        
+    }
+
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(effective_size_results, d_effective_size_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
+
+exit:
+    cudaFree(d_V);
+    cudaFree(d_E);
+    cudaFree(d_row);
+    cudaFree(d_col);
+    cudaFree(d_W);
+    cudaFree(d_node_mask);
+    cudaFree(d_effective_size_results);
+
+    if (cuda_ret != cudaSuccess) {
+        switch (cuda_ret) {
+            case cudaErrorMemoryAllocation:
+                EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM;
+                break;
+            default:
+                EG_ret = EG_GPU_DEVICE_ERR;
+                break;
+        }
+    }
+
+    return EG_ret;
+}
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/effective_size.cuh b/gpu_easygraph/functions/structural_holes/effective_size.cuh
new file mode 100644
index 00000000..e69751e7
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/effective_size.cuh
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "common.h"
+
+namespace gpu_easygraph {
+
+int cuda_effective_size(
+    _IN_ const int* V,
+    _IN_ const int* E,
+    _IN_ const int* row,
+    _IN_ const int* col,
+    _IN_ const double* W,
+    _IN_ int num_nodes,
+    _IN_ int num_edges,
+    _IN_ bool is_directed,
+    _IN_ int* node_mask,
+    _OUT_ double* effective_size_results
+);
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/evaluation.cpp b/gpu_easygraph/functions/structural_holes/evaluation.cpp
deleted file mode 100644
index 957be654..00000000
--- a/gpu_easygraph/functions/structural_holes/evaluation.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-#include <string>
-#include <vector>
-#include <memory> // 需要包含memory头文件
-
-#include "structural_holes/evaluation.cuh"
-#include "common.h"
-
-namespace gpu_easygraph {
-
-using std::vector;
-
-// static int decide_warp_size(
-//     _IN_ int len_V,
-//     _IN_ int len_E
-// )
-// {
-//     vector<int> warp_size_cand{1, 2, 4, 8, 16, 32};
-
-//     if (len_E / len_V < warp_size_cand.front()) {
-//         return warp_size_cand.front();
-//     }
-
-//     for (int i = 0; i + 1 < warp_size_cand.size(); ++i) {
-//         if (warp_size_cand[i] <= len_E / len_V
-//                 && len_E / len_V < warp_size_cand[i + 1]) {
-//             return warp_size_cand[i + 1];
-//         }
-//     }
-//     return warp_size_cand.back();
-// }
-
-int effective_size(
-    _IN_ const vector<int>& V,
-    _IN_ const vector<int>& E,
-    _IN_ const vector<int>& row,
-    _IN_ const vector<int>& col,
-    _IN_ int num_nodes,
-    _IN_ const vector<double>& W,
-    _IN_ bool is_directed,
-    _IN_ vector<int>& node_mask,  // 添加节点掩码参数
-    _OUT_ vector<double>& effective_size
-) {
-    int num_edges = row.size();
-    
-    effective_size = vector<double>(num_nodes);
-    int r = cuda_effective_size(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), effective_size.data());
-
-    return r;  // 成功
-}
-
-int constraint(
-    _IN_ const vector<int>& V,
-    _IN_ const vector<int>& E,
-    _IN_ const vector<int>& row,
-    _IN_ const vector<int>& col,
-    _IN_ int num_nodes,
-    _IN_ const vector<double>& W,
-    _IN_ bool is_directed,
-    _IN_ vector<int>& node_mask,  // 添加节点掩码参数
-    _OUT_ vector<double>& constraint
-) {
-    int num_edges = row.size();
-    
-    constraint = vector<double>(num_nodes);
-    int r = cuda_constraint(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), constraint.data());
-
-    return r;  // 成功
-}
-
-int hierarchy(
-    _IN_ const vector<int>& V,
-    _IN_ const vector<int>& E,
-    _IN_ const vector<int>& row,
-    _IN_ const vector<int>& col,
-    _IN_ int num_nodes,
-    _IN_ const vector<double>& W,
-    _IN_ bool is_directed,
-    _IN_ vector<int>& node_mask,  // 添加节点掩码参数
-    _OUT_ vector<double>& hierarchy
-) {
-    int num_edges = row.size();
-    
-    hierarchy = vector<double>(num_nodes);
-    int r = cuda_hierarchy(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), hierarchy.data());
-
-    return r;  // 成功
-}
-
-} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/evaluation.cu b/gpu_easygraph/functions/structural_holes/evaluation.cu
deleted file mode 100644
index 6c6d615d..00000000
--- a/gpu_easygraph/functions/structural_holes/evaluation.cu
+++ /dev/null
@@ -1,888 +0,0 @@
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <stdlib.h>
-
-#include "common.h"
-#define NODES_PER_BLOCK 1
-
-namespace gpu_easygraph {
-
-enum norm_t { SUM = 0, MAX = 1 };
-
-static __device__ double mutual_weight(
-    const int* V,
-    const int* E,
-    const double* W,
-    int u,
-    int v
-) {
-    double a_uv = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        if (E[i] == v) {
-            a_uv = W[i];
-            break;
-        }
-    }
-    return a_uv;
-}
-
-static __device__ double normalized_mutual_weight(
-    const int* V,
-    const int* E,
-    const double* W, 
-    int u,
-    int v,
-    norm_t norm
-) {
-    double weight_uv = mutual_weight(V, E, W, u, v);
-
-    double scale = 0.0;
-    if (norm == SUM) {
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = mutual_weight(V, E, W, u, neighbor);
-            scale += weight_uw;
-        }
-    } else if (norm == MAX) {
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = mutual_weight(V, E, W, u, neighbor);
-            scale = fmax(scale,weight_uw);
-        }
-    }
-    return (scale==0.0) ? 0.0 : (weight_uv / scale);
-}
-
-static __device__ double local_constraint(
-    const int* V,
-    const int* E,
-    const double* W,
-    int u,
-    int v
-) {
-    double direct = normalized_mutual_weight(V,E,W,u,v,SUM);
-    double indirect = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        int neighbor = E[i];
-        double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM);
-        double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM);
-        indirect += norm_uw * norm_wv;
-    }
-    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
-    return local_constraint_of_uv;
-}
-
-
-
-__global__ void calculate_constraints(
-    const int* __restrict__ V,
-    const int* __restrict__ E,
-    const double* __restrict__ W, 
-    const int num_nodes, 
-    const int* __restrict__ node_mask,
-    double* __restrict__ constraint_results
-) {
-    int start_node = blockIdx.x * NODES_PER_BLOCK;  // 当前块处理的起始节点
-    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);  // 结束节点，确保不越界
-
-    for (int v = start_node; v < end_node; ++v) {
-        if (!node_mask[v]) continue;
-
-        double constraint_of_v = 0.0;
-        bool is_nan = true;
-
-        // 使用所有线程并行处理邻居
-        __shared__ double shared_constraint[256];  // 假设 blockDim.x <= 256
-        double local_sum = 0.0;
-
-        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
-            is_nan = false;
-            int neighbor = E[i];
-            local_sum += local_constraint(V, E, W, v, neighbor);
-        }
-
-        // 归约邻居结果到一个值
-        shared_constraint[threadIdx.x] = local_sum;
-        __syncthreads();
-
-        for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
-            if (threadIdx.x < offset) {
-                shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset];
-            }
-            __syncthreads();
-        }
-
-        // 保存最终结果
-        if (threadIdx.x == 0) {
-            constraint_results[v] = (is_nan) ? NAN : shared_constraint[0];
-        }
-    }
-}
-
-static __device__ double directed_mutual_weight(
-    const int* V,
-    const int* E,
-    const double* W,
-    int u,
-    int v
-) {
-    double a_uv = 0.0, a_vu = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        if (E[i] == v) {
-            a_uv = W[i];
-            break;
-        }
-    }
-    for (int i = V[v]; i < V[v+1]; i++) {
-        if (E[i] == u) {
-            a_vu = W[i];
-            break;
-        }
-    }
-    return a_uv + a_vu;
-}
-
-static __device__ double directed_normalized_mutual_weight(
-    const int* V,
-    const int* E,
-    const int* row, 
-    const int* col, 
-    const double* W, 
-    int num_edges,
-    int u,
-    int v,
-    norm_t norm
-) {
-    double weight_uv = directed_mutual_weight(V, E, W, u, v);
-
-    double scale = 0.0;
-    if(norm==SUM){
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
-            scale += weight_uw;
-        }
-
-        for (int i = 0; i < num_edges; i++) {
-            if (col[i] == u) {
-                int neighbor = row[i];
-                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
-                scale += weight_wu;
-            }
-        }
-    }else if(norm==MAX){
-        for (int i = V[u]; i < V[u+1]; i++) {
-            int neighbor = E[i];
-            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
-            scale = fmax(scale,weight_uw);
-        }
-
-        for (int i = 0; i < num_edges; i++) {
-            if (col[i] == u) {
-                int neighbor = row[i];
-                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
-                scale = fmax(scale,weight_wu);
-            }
-        }
-    }
-    return (scale==0.0) ? 0.0 : (weight_uv / scale);
-}
-
-static __device__ double directed_local_constraint(
-    const int* V,
-    const int* E,
-    const int* row, 
-    const int* col, 
-    const double* W,
-    int num_edges,
-    int u,
-    int v
-) {
-    double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM);
-    double indirect = 0.0;
-    for (int i = V[u]; i < V[u+1]; i++) {
-        int neighbor = E[i];
-        double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
-        double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
-        indirect += norm_uw * norm_wv;
-    }
-
-    for (int i = 0; i < num_edges; i++) {
-        if (col[i] == u) {
-            int neighbor = row[i];
-            double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
-            double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
-            indirect += norm_uw * norm_wv;
-        }
-    }
-    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
-    return local_constraint_of_uv;
-}
-
-__global__ void directed_calculate_constraints(
-    const int* V,
-    const int* E,
-    const int* row, 
-    const int* col, 
-    const double* W,  
-    int num_nodes,
-    int num_edges,
-    int* node_mask,
-    double* constraint_results
-) {
-    int start_node = blockIdx.x * NODES_PER_BLOCK;  // 当前块处理的起始节点
-    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);  // 结束节点，确保不越界
-
-    for (int v = start_node; v < end_node; ++v) {
-        if (!node_mask[v]) continue;
-
-        double constraint_of_v = 0.0;
-        bool is_nan = true;
-
-        // 使用所有线程并行处理邻居
-        __shared__ double shared_constraint[256];  // 假设 blockDim.x <= 256
-        double local_sum = 0.0;
-
-        // 计算出邻居约束（out-neighbors）
-        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
-            is_nan = false;
-            int neighbor = E[i];
-            local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor);
-        }
-
-        // 计算入邻居约束（in-neighbors）
-        for (int i = threadIdx.x; i < num_edges; i += blockDim.x) {
-            if (col[i] == v) {
-                // is_nan = false;
-                int neighbor = row[i];
-                local_sum += directed_local_constraint(V, E, row, col, W, num_edges, v, neighbor);
-            }
-        }
-
-        // 将所有线程的局部结果写入共享内存
-        shared_constraint[threadIdx.x] = local_sum;
-        __syncthreads();
-
-        // 归约邻居结果到一个值
-        for (int offset = blockDim.x / 2; offset > 0; offset /= 2) {
-            if (threadIdx.x < offset) {
-                shared_constraint[threadIdx.x] += shared_constraint[threadIdx.x + offset];
-            }
-            __syncthreads();
-        }
-
-        // 保存最终结果
-        if (threadIdx.x == 0) {
-            constraint_results[v] = (is_nan) ? NAN : shared_constraint[0];
-        }
-    }
-}
-
-static __device__ double redundancy(
-    const int* V,
-    const int* E,
-    const double* W,
-    const int num_nodes,
-    int u,
-    int v
-) {
-    double r = 0.0;
-    for (int i = V[v]; i < V[v + 1]; i++) {
-        int w = E[i];
-        r += normalized_mutual_weight(V, E, W, u, w, SUM) * normalized_mutual_weight(V, E, W, v, w, MAX);
-    }
-    return 1-r;
-}
-
-// __global__ void calculate_effective_size(
-//     const int* V,
-//     const int* E,
-//     const double* W,
-//     const int num_nodes,
-//     const int* node_mask,
-//     double* effective_size_results
-// ) {
-//     int u = blockIdx.x * blockDim.x + threadIdx.x;
-//     if (u >= num_nodes || !node_mask[u]) return;
-//     double redundancy_sum = 0.0;
-//     bool is_nan = true;
-
-//     // 遍历 v 的所有邻居
-//     for (int i = V[u]; i < V[u + 1]; i++) {
-//         int v = E[i];
-//         if (v == u) continue; // 排除自连接的情况
-//         is_nan = false;
-//         redundancy_sum += redundancy(V,E,W,num_nodes,u,v);
-//     }
-//     effective_size_results[u] = is_nan ? NAN : redundancy_sum;
-// }
-
-
-static __device__ double atomicAdd (
-    _OUT_ double* address, 
-    _IN_ double val
-)
-{
-	unsigned long long int* address_as_ull =
-		(unsigned long long int*)address;
-	unsigned long long int old = *address_as_ull, assumed;
-	do {
-		assumed = old;
-		old = atomicCAS(address_as_ull, assumed,
-			__double_as_longlong(val +
-			__longlong_as_double(assumed)));
-	} while (assumed != old);
-	return __longlong_as_double(old);
-}
-
-// 自定义 atomicAdd 函数，用于 double 类型的原子加法（如果您的架构支持，可以使用内置的 atomicAdd）
-static __device__ double atomicAdd_double(double* address, double val)
-{
-    unsigned long long int* address_as_ull =
-        (unsigned long long int*)address;
-    unsigned long long int old = *address_as_ull, assumed;
-    do {
-        assumed = old;
-        old = atomicCAS(address_as_ull, assumed,
-            __double_as_longlong(val +
-            __longlong_as_double(assumed)));
-    } while (assumed != old);
-    return __longlong_as_double(old);
-}
-
-// Warp 级别归约函数
-__inline__ __device__ double warp_reduce_sum(double val)
-{
-    for (int offset = warpSize / 2; offset > 0; offset /= 2)
-    {
-        val += __shfl_down_sync(0xffffffff, val, offset);
-    }
-    return val;
-}
-
-// 线程块内归约函数，利用 warp 归约
-__inline__ __device__ double block_reduce_sum(double val)
-{
-    // 每个 warp 进行归约
-    val = warp_reduce_sum(val);
-
-    // 将每个 warp 的结果存入共享内存
-    __shared__ double shared[32]; // 一个线程块最多有 32 个 warp
-    int warp_id = threadIdx.x / warpSize;
-    if (threadIdx.x % warpSize == 0)
-    {
-        shared[warp_id] = val;
-    }
-    __syncthreads();
-
-    // 由第一个 warp 进行最终的归约
-    if (warp_id == 0)
-    {
-        val = (threadIdx.x < (blockDim.x / warpSize)) ? shared[threadIdx.x] : 0.0;
-        val = warp_reduce_sum(val);
-    }
-    return val;
-}
-
-__global__ void calculate_effective_size(
-    const int* __restrict__ V,
-    const int* __restrict__ E,
-    const double* __restrict__ W,
-    const int num_nodes,
-    const int* __restrict__ node_mask,
-    double* __restrict__ effective_size_results
-) {
-    // 每个线程块处理一个节点
-    int u = blockIdx.x;
-    if (u >= num_nodes || !node_mask[u]) return;
-
-    // 获取节点 u 的邻居范围
-    int neighbor_start = V[u];
-    int neighbor_end = V[u + 1];
-    int degree = neighbor_end - neighbor_start;
-
-    // 计算线程块的线程数量
-    int threads_per_block = blockDim.x;
-
-    // 每个线程处理节点 u 的一个或多个邻居
-    double redundancy_sum = 0.0;
-    for (int idx = threadIdx.x; idx < degree; idx += threads_per_block) {
-        int i = neighbor_start + idx;
-        int v = E[i];
-        if (v != u) {
-            // redundancy_sum += redundancy(V, E, W, num_nodes, u, v);
-            double r = 0.0;
-            for (int j = V[v]; j < V[v + 1]; j++) {
-                int w = E[j];
-                r += normalized_mutual_weight(V, E, W, u, w, SUM) * 
-                     normalized_mutual_weight(V, E, W, v, w, MAX);
-            }
-            redundancy_sum += 1 - r;
-        }
-    }
-
-    // 线程块内归约
-    redundancy_sum = block_reduce_sum(redundancy_sum);
-
-    // 将结果写入全局内存
-    if (threadIdx.x == 0) {
-        effective_size_results[u] = redundancy_sum;
-    }
-}
-
-static __device__ double directed_redundancy(
-    const int* V,
-    const int* E,
-    const int* row,
-    const int* col,
-    const double* W,
-    const int num_nodes,
-    const int num_edges,
-    int u,
-    int v
-) {
-    double r = 0.0;
-    for (int i = V[v]; i < V[v + 1]; i++) {
-        int w = E[i];
-        r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX);
-    }
-    for (int i = 0; i < num_edges; i++) {
-        if (col[i] == v) {
-            int w = row[i];
-            r += directed_normalized_mutual_weight(V, E, row,col,W,num_edges, u, w,SUM) * directed_normalized_mutual_weight(V, E, row,col,W, num_edges, v,w,MAX);
-        }
-    }
-    return 1-r;
-}
-
-__global__ void directed_calculate_effective_size(
-    const int* V,
-    const int* E,
-    const int* row,
-    const int* col,
-    const double* W, 
-    const int num_nodes,
-    const int num_edges,
-    const int* node_mask,
-    double* effective_size_results
-) {
-    int u = blockIdx.x * blockDim.x + threadIdx.x;
-    if (u >= num_nodes || !node_mask[u]) return;
-    double redundancy_sum = 0.0;
-    bool is_nan = true;
-
-    // 遍历 u 的所有邻居
-    for (int i = V[u]; i < V[u + 1]; i++) {
-        int v = E[i];
-        if (v == u) continue; // 排除自连接的情况
-        is_nan = false;
-        redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v);
-    }
-    for (int i = 0; i < num_edges; i++) {
-        if (col[i] == u) {
-            int v = row[i];
-            redundancy_sum += directed_redundancy(V,E,row,col,W,num_nodes,num_edges,u,v);
-        }
-    }
-    effective_size_results[u] = is_nan ? NAN : redundancy_sum;
-}
-
-
-
-// __global__ void calculate_hierarchy(
-//     const int* V, 
-//     const int* E, 
-//     const double* W,
-//     int num_nodes,
-//     const int* node_mask,
-//     double* hierarchy_results
-// ) {
-//     int v = blockIdx.x * blockDim.x + threadIdx.x;
-//     if (v >= num_nodes || !node_mask[v]) return;
-//     int n = V[v + 1] - V[v];  
-//     double *c = new double[n]; 
-//     double C = 0.0;
-//     double hierarchy_sum = 0.0;
-//     int neighbor = 0;
-//     for (int i = V[v]; i < V[v + 1]; i++) {
-//         int w = E[i];
-//         c[neighbor] = local_constraint(V, E, W, v, w);
-//         C += c[neighbor++];
-//     }
-//     __syncthreads();
-//     if (n > 1) {
-//         for (int i = 0; i < neighbor; i++) {
-//             hierarchy_sum += (c[i] / C) * n * logf((c[i] / C) * n) / (n * logf(n));
-//         }
-//         hierarchy_results[v] = hierarchy_sum;
-//     }else{
-//         hierarchy_results[v] = 0;
-//     }
-//     delete[] c;
-// }
-
-
-__global__ void calculate_hierarchy(
-    const int* V, 
-    const int* E, 
-    const double* W,
-    int num_nodes,
-    const int* node_mask,
-    double* hierarchy_results
-) {
-    int start_node = blockIdx.x * NODES_PER_BLOCK;  // 当前块处理的起始节点
-    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);  // 确保不越界
-
-    extern __shared__ double shared_mem[];  // 动态分配共享内存
-    double* shared_c = shared_mem;          // 每个邻居的局部约束值
-    double* shared_C = &shared_mem[blockDim.x];  // 用于存储约束总和 C
-
-    // 每个块处理多个节点
-    for (int v = start_node; v < end_node; ++v) {
-        if (!node_mask[v]) continue;
-
-        int n = V[v + 1] - V[v];  // 邻居数量
-        if (n <= 1) {
-            hierarchy_results[v] = 0.0;  // 如果邻居数 <= 1，层级性为 0
-            continue;
-        }
-        if (threadIdx.x == 0) shared_C[0] = 0.0; // 清零
-        __syncthreads();
-
-        double local_C = 0.0;
-        
-
-        // 每个线程负责部分邻居
-        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
-            int w = E[i];
-            double constraint = local_constraint(V, E, W, v, w);  // 计算约束值
-            shared_c[threadIdx.x] = constraint;  // 写入共享内存
-            local_C += constraint;  // 累加局部约束值
-        }
-
-        // 使用原子操作归约约束总和 C
-        atomicAdd(&shared_C[0], local_C);
-        __syncthreads();  // 确保所有线程完成归约
-
-        // 计算层级性
-        if (threadIdx.x == 0) {  // 仅由线程 0 计算最终层级性
-            double C = shared_C[0];  // 获取约束总和
-            double hierarchy_sum = 0.0;
-            for (int i = 0; i < n; i++) {
-                double normalized_c = shared_c[i] / C;
-                hierarchy_sum += normalized_c * n * logf(normalized_c * n) / (n * logf(n));
-            }
-            hierarchy_results[v] = hierarchy_sum;  // 保存结果
-        }
-
-        __syncthreads();  // 确保当前节点的所有线程完成计算
-    }
-}
-
-
-
-__global__ void directed_calculate_hierarchy(
-    const int* V,
-    const int* E,
-    const int* row,
-    const int* col,
-    const double* W, 
-    const int num_nodes,
-    const int num_edges,
-    const int* node_mask,
-    double* hierarchy_results
-) {
-    int v = blockIdx.x * blockDim.x + threadIdx.x;
-    if (v >= num_nodes || !node_mask[v]) return;
-    int in_neighbor = V[v + 1] - V[v];  
-    int out_neighbor = 0;
-    double C = 0.0;
-    double hierarchy_sum = 0.0;
-    int neighbor = 0;
-    for (int i = 0; i < num_edges; i++) {
-        if (col[i] == v) {
-            out_neighbor++;
-        }
-    }
-    double *c = new double[in_neighbor+out_neighbor]; 
-    for (int i = V[v]; i < V[v + 1]; i++) {
-        int w = E[i];
-        c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w);
-        C += c[neighbor];
-        neighbor++;
-    }
-    for (int i = 0; i < num_edges; i++) {
-        if (col[i] == v) {
-            int w = row[i];
-            c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w);
-            C += c[neighbor];
-            neighbor++;
-        }
-    }
-    __syncthreads();
-    if (neighbor > 1) {
-        for (int i = 0; i < neighbor; i++) {
-            hierarchy_sum += (c[i] / C) * neighbor * logf((c[i] / C) * neighbor) / (neighbor * logf(neighbor));
-        }
-        hierarchy_results[v] = hierarchy_sum;
-    }else{
-        hierarchy_results[v] = 0;
-    }
-    delete[] c;
-}
-
-
-
-int cuda_hierarchy(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
-    _IN_ int num_nodes,
-    _IN_ int num_edges,
-    _IN_ bool is_directed,
-    _IN_ int* node_mask,
-    _OUT_ double* hierarchy_results
-){
-    int cuda_ret = cudaSuccess;
-    int EG_ret = EG_GPU_SUCC;
-    int min_grid_size = 0;
-    int block_size = 0;
-    
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_constraints, 0, 0);
-    int grid_size = (num_nodes + block_size - 1) / block_size;
-    
-    int* d_V;
-    int* d_E;
-    int* d_row;
-    int* d_col;
-    double* d_W;
-    int* d_node_mask;
-    double* d_hierarchy_results;
-
-    // 分配CUDA设备内存
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_hierarchy_results, num_nodes * sizeof(double)));
-
-    // 将数据从主机拷贝到设备
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
-
-
-    // 调用 CUDA 内核
-    if(is_directed){
-        directed_calculate_hierarchy<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_hierarchy_results);
-    }else{
-        int block_size = 256;  // 每个块的线程数，可以根据设备进行调整
-        int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;  // 调整为每个块处理NODES_PER_BLOCK个节点
-        int shared_memory_size = 2 * sizeof(double) * block_size; 
-        calculate_hierarchy<<<grid_size, block_size, shared_memory_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_hierarchy_results);
-    }
-
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(hierarchy_results, d_hierarchy_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
-exit:
-
-    cudaFree(d_V);
-    cudaFree(d_E);
-    cudaFree(d_row);
-    cudaFree(d_col);
-    cudaFree(d_W);
-    cudaFree(d_node_mask);
-    cudaFree(d_hierarchy_results);
-    if (cuda_ret != cudaSuccess) {
-        switch (cuda_ret) {
-            case cudaErrorMemoryAllocation:
-                EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM;
-                break;
-            default:
-                EG_ret = EG_GPU_DEVICE_ERR;
-                break;
-        }
-    }
-
-    return EG_ret; 
-}
-
-
-
-int cuda_effective_size(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
-    _IN_ int num_nodes,
-    _IN_ int num_edges,
-    _IN_ bool is_directed,
-    _IN_ int* node_mask,
-    _OUT_ double* effective_size_results
-) {
-    int cuda_ret = cudaSuccess;
-    int EG_ret = EG_GPU_SUCC;
-    int min_grid_size = 0;
-    int block_size = 0;
-
-    
-    // cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, 
-    // is_directed ? directed_calculate_effective_size : undirected_calculate_effective_size, 0, 0);
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_effective_size, 0, 0);
-    // int grid_size = (num_nodes + block_size - 1) / block_size;
-    // 我们设定每个节点使用的线程数
-    // int threads_per_node = 32;  // 通常设置为 32 的倍数
-    // dim3 blockDim(threads_per_node, NODES_PER_BLOCK);  // 二维线程块
-    int grid_size = (num_nodes + block_size * NODES_PER_BLOCK - 1) / (block_size * NODES_PER_BLOCK);
-
-    int* d_V;
-    int* d_E;
-    int* d_row;
-    int* d_col;
-    double* d_W;
-    int* d_node_mask;
-    double* d_effective_size_results;
-
-    // 分配 CUDA 设备内存
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_effective_size_results, num_nodes * sizeof(double)));
-
-    // 将数据从主机拷贝到设备
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
-
-    if(is_directed){
-        directed_calculate_effective_size<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_effective_size_results);
-    }else{
-        int block_size = 256;  // 每个块的线程数，可以根据设备进行调整
-        int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;  // 调整为每个块处理NODES_PER_BLOCK个节点
-        // int shared_memory_size = blockDim.x * sizeof(double); 
-        calculate_effective_size<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_effective_size_results);
-        
-    }
-
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(effective_size_results, d_effective_size_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
-
-exit:
-    cudaFree(d_V);
-    cudaFree(d_E);
-    cudaFree(d_row);
-    cudaFree(d_col);
-    cudaFree(d_W);
-    cudaFree(d_node_mask);
-    cudaFree(d_effective_size_results);
-
-    if (cuda_ret != cudaSuccess) {
-        switch (cuda_ret) {
-            case cudaErrorMemoryAllocation:
-                EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM;
-                break;
-            default:
-                EG_ret = EG_GPU_DEVICE_ERR;
-                break;
-        }
-    }
-
-    return EG_ret;
-}
-
-int cuda_constraint(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
-    _IN_ int num_nodes,
-    _IN_ int num_edges,
-    _IN_ bool is_directed,
-    _IN_ int* node_mask,
-    _OUT_ double* constraint_results
-) {
-    int cuda_ret = cudaSuccess;
-    int EG_ret = EG_GPU_SUCC;
-    
-    
-
-    // calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results);
-    
-    int* d_V;
-    int* d_E;
-    int* d_row;
-    int* d_col;
-    double* d_W;
-    int* d_node_mask;
-    double* d_constraint_results;
-    int block_size = 256;  // 每个块的线程数，可以根据设备进行调整
-    int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;  // 调整为每个块处理NODES_PER_BLOCK个节点
-
-    // 分配CUDA设备内存
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
-    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_constraint_results, num_nodes * sizeof(double)));
-
-    // 将数据从主机拷贝到设备
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
-
-    
-    // 调用 CUDA 内核
-    if(is_directed){
-        // int min_grid_size = 0;
-        // int block_size = 0;
-        // cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_constraints, 0, 0);
-        // int grid_size = (num_nodes + block_size - 1) / block_size;
-        directed_calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_constraint_results);
-    }else{
-        
-        calculate_constraints<<<grid_size, block_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_constraint_results);
-    }
-
-    EXIT_IF_CUDA_FAILED(cudaMemcpy(constraint_results, d_constraint_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
-exit:
-
-    cudaFree(d_V);
-    cudaFree(d_E);
-    cudaFree(d_row);
-    cudaFree(d_col);
-    cudaFree(d_W);
-    cudaFree(d_node_mask);
-    cudaFree(d_constraint_results);
-    if (cuda_ret != cudaSuccess) {
-        switch (cuda_ret) {
-            case cudaErrorMemoryAllocation:
-                EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM;
-                break;
-            default:
-                EG_ret = EG_GPU_DEVICE_ERR;
-                break;
-        }
-    }
-
-    return EG_ret; 
-}
-
-} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/evaluation.cuh b/gpu_easygraph/functions/structural_holes/evaluation.cuh
deleted file mode 100644
index 6624800d..00000000
--- a/gpu_easygraph/functions/structural_holes/evaluation.cuh
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include "common.h"
-
-namespace gpu_easygraph {
-
-int cuda_effective_size(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
-    _IN_ int num_nodes,
-    _IN_ int num_edges,
-    _IN_ bool is_directed,
-    _IN_ int* node_mask,
-    _OUT_ double* effective_size_results
-);
-
-int cuda_constraint(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
-    _IN_ int num_nodes,
-    _IN_ int num_edges,
-    _IN_ bool is_directed,
-    _IN_ int* node_mask,
-    _OUT_ double* constraint_results
-);
-
-int cuda_hierarchy(
-    _IN_ const int* V,
-    _IN_ const int* E,
-    _IN_ const int* row,
-    _IN_ const int* col,
-    _IN_ const double* W,
-    _IN_ int num_nodes,
-    _IN_ int num_edges,
-    _IN_ bool is_directed,
-    _IN_ int* node_mask,
-    _OUT_ double* hierarchy_results
-);
-} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/hierarchy.cpp b/gpu_easygraph/functions/structural_holes/hierarchy.cpp
new file mode 100644
index 00000000..1e7c2fa2
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/hierarchy.cpp
@@ -0,0 +1,31 @@
+#include <string>
+#include <vector>
+#include <memory> // 需要包含memory头文件
+
+#include "structural_holes/hierarchy.cuh"
+#include "common.h"
+
+namespace gpu_easygraph {
+
+using std::vector;
+
+int hierarchy(
+    _IN_ const vector<int>& V,
+    _IN_ const vector<int>& E,
+    _IN_ const vector<int>& row,
+    _IN_ const vector<int>& col,
+    _IN_ int num_nodes,
+    _IN_ const vector<double>& W,
+    _IN_ bool is_directed,
+    _IN_ vector<int>& node_mask,  // 添加节点掩码参数
+    _OUT_ vector<double>& hierarchy
+) {
+    int num_edges = row.size();
+    
+    hierarchy = vector<double>(num_nodes);
+    int r = cuda_hierarchy(V.data(), E.data(), row.data(), col.data(), W.data(), num_nodes, num_edges, is_directed, node_mask.data(), hierarchy.data());
+
+    return r;  // 成功
+}
+
+} // namespace gpu_easygraph
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/hierarchy.cu b/gpu_easygraph/functions/structural_holes/hierarchy.cu
new file mode 100644
index 00000000..3826eaba
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/hierarchy.cu
@@ -0,0 +1,378 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <stdlib.h>
+
+#include "common.h"
+#define NODES_PER_BLOCK 1
+
+namespace gpu_easygraph {
+
+enum norm_t { SUM = 0, MAX = 1 };
+
+static __device__ double mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double a_uv = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        if (E[i] == v) {
+            a_uv = W[i];
+            break;
+        }
+    }
+    return a_uv;
+}
+
+static __device__ double normalized_mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W, 
+    int u,
+    int v,
+    norm_t norm
+) {
+    double weight_uv = mutual_weight(V, E, W, u, v);
+
+    double scale = 0.0;
+    if (norm == SUM) {
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = mutual_weight(V, E, W, u, neighbor);
+            scale += weight_uw;
+        }
+    } else if (norm == MAX) {
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = mutual_weight(V, E, W, u, neighbor);
+            scale = fmax(scale,weight_uw);
+        }
+    }
+    return (scale==0.0) ? 0.0 : (weight_uv / scale);
+}
+
+static __device__ double directed_mutual_weight(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double a_uv = 0.0, a_vu = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        if (E[i] == v) {
+            a_uv = W[i];
+            break;
+        }
+    }
+    for (int i = V[v]; i < V[v+1]; i++) {
+        if (E[i] == u) {
+            a_vu = W[i];
+            break;
+        }
+    }
+    return a_uv + a_vu;
+}
+
+static __device__ double directed_normalized_mutual_weight(
+    const int* V,
+    const int* E,
+    const int* row, 
+    const int* col, 
+    const double* W, 
+    int num_edges,
+    int u,
+    int v,
+    norm_t norm
+) {
+    double weight_uv = directed_mutual_weight(V, E, W, u, v);
+
+    double scale = 0.0;
+    if(norm==SUM){
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
+            scale += weight_uw;
+        }
+
+        for (int i = 0; i < num_edges; i++) {
+            if (col[i] == u) {
+                int neighbor = row[i];
+                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
+                scale += weight_wu;
+            }
+        }
+    }else if(norm==MAX){
+        for (int i = V[u]; i < V[u+1]; i++) {
+            int neighbor = E[i];
+            double weight_uw = directed_mutual_weight(V, E, W, u, neighbor);
+            scale = fmax(scale,weight_uw);
+        }
+
+        for (int i = 0; i < num_edges; i++) {
+            if (col[i] == u) {
+                int neighbor = row[i];
+                double weight_wu = directed_mutual_weight(V, E, W, u, neighbor);
+                scale = fmax(scale,weight_wu);
+            }
+        }
+    }
+    return (scale==0.0) ? 0.0 : (weight_uv / scale);
+}
+
+static __device__ double atomicAdd (
+    _OUT_ double* address, 
+    _IN_ double val
+)
+{
+	unsigned long long int* address_as_ull =
+		(unsigned long long int*)address;
+	unsigned long long int old = *address_as_ull, assumed;
+	do {
+		assumed = old;
+		old = atomicCAS(address_as_ull, assumed,
+			__double_as_longlong(val +
+			__longlong_as_double(assumed)));
+	} while (assumed != old);
+	return __longlong_as_double(old);
+}
+
+static __device__ double local_constraint(
+    const int* V,
+    const int* E,
+    const double* W,
+    int u,
+    int v
+) {
+    double direct = normalized_mutual_weight(V,E,W,u,v,SUM);
+    double indirect = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        int neighbor = E[i];
+        double norm_uw = normalized_mutual_weight(V, E, W, u, neighbor,SUM);
+        double norm_wv = normalized_mutual_weight(V, E, W, neighbor, v,SUM);
+        indirect += norm_uw * norm_wv;
+    }
+    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
+    return local_constraint_of_uv;
+}
+
+__global__ void calculate_hierarchy(
+    const int* V, 
+    const int* E, 
+    const double* W,
+    int num_nodes,
+    const int* node_mask,
+    double* hierarchy_results
+) {
+    int start_node = blockIdx.x * NODES_PER_BLOCK;  // 当前块处理的起始节点
+    int end_node = min(start_node + NODES_PER_BLOCK, num_nodes);  // 确保不越界
+
+    extern __shared__ double shared_mem[];  // 动态分配共享内存
+    double* shared_c = shared_mem;          // 每个邻居的局部约束值
+    double* shared_C = &shared_mem[blockDim.x];  // 用于存储约束总和 C
+
+    // 每个块处理多个节点
+    for (int v = start_node; v < end_node; ++v) {
+        if (!node_mask[v]) continue;
+
+        int n = V[v + 1] - V[v];  // 邻居数量
+        if (n <= 1) {
+            hierarchy_results[v] = 0.0;  // 如果邻居数 <= 1，层级性为 0
+            continue;
+        }
+        if (threadIdx.x == 0) shared_C[0] = 0.0; // 清零
+        __syncthreads();
+
+        double local_C = 0.0;
+        
+
+        // 每个线程负责部分邻居
+        for (int i = V[v] + threadIdx.x; i < V[v + 1]; i += blockDim.x) {
+            int w = E[i];
+            double constraint = local_constraint(V, E, W, v, w);  // 计算约束值
+            shared_c[threadIdx.x] = constraint;  // 写入共享内存
+            local_C += constraint;  // 累加局部约束值
+        }
+
+        // 使用原子操作归约约束总和 C
+        atomicAdd(&shared_C[0], local_C);
+        __syncthreads();  // 确保所有线程完成归约
+
+        // 计算层级性
+        if (threadIdx.x == 0) {  // 仅由线程 0 计算最终层级性
+            double C = shared_C[0];  // 获取约束总和
+            double hierarchy_sum = 0.0;
+            for (int i = 0; i < n; i++) {
+                double normalized_c = shared_c[i] / C;
+                hierarchy_sum += normalized_c * n * logf(normalized_c * n) / (n * logf(n));
+            }
+            hierarchy_results[v] = hierarchy_sum;  // 保存结果
+        }
+
+        __syncthreads();  // 确保当前节点的所有线程完成计算
+    }
+}
+
+static __device__ double directed_local_constraint(
+    const int* V,
+    const int* E,
+    const int* row, 
+    const int* col, 
+    const double* W,
+    int num_edges,
+    int u,
+    int v
+) {
+    double direct = directed_normalized_mutual_weight(V,E,row,col,W,num_edges,u,v,SUM);
+    double indirect = 0.0;
+    for (int i = V[u]; i < V[u+1]; i++) {
+        int neighbor = E[i];
+        double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
+        double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
+        indirect += norm_uw * norm_wv;
+    }
+
+    for (int i = 0; i < num_edges; i++) {
+        if (col[i] == u) {
+            int neighbor = row[i];
+            double norm_uw = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, u, neighbor,SUM);
+            double norm_wv = directed_normalized_mutual_weight(V, E, row, col, W, num_edges, neighbor, v,SUM);
+            indirect += norm_uw * norm_wv;
+        }
+    }
+    double local_constraint_of_uv = (direct + indirect) * (direct + indirect);
+    return local_constraint_of_uv;
+}
+
+__global__ void directed_calculate_hierarchy(
+    const int* V,
+    const int* E,
+    const int* row,
+    const int* col,
+    const double* W, 
+    const int num_nodes,
+    const int num_edges,
+    const int* node_mask,
+    double* hierarchy_results
+) {
+    int v = blockIdx.x * blockDim.x + threadIdx.x;
+    if (v >= num_nodes || !node_mask[v]) return;
+    int in_neighbor = V[v + 1] - V[v];  
+    int out_neighbor = 0;
+    double C = 0.0;
+    double hierarchy_sum = 0.0;
+    int neighbor = 0;
+    for (int i = 0; i < num_edges; i++) {
+        if (col[i] == v) {
+            out_neighbor++;
+        }
+    }
+    double *c = new double[in_neighbor+out_neighbor]; 
+    for (int i = V[v]; i < V[v + 1]; i++) {
+        int w = E[i];
+        c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w);
+        C += c[neighbor];
+        neighbor++;
+    }
+    for (int i = 0; i < num_edges; i++) {
+        if (col[i] == v) {
+            int w = row[i];
+            c[neighbor] = directed_local_constraint(V, E, row, col, W, num_edges, v, w);
+            C += c[neighbor];
+            neighbor++;
+        }
+    }
+    __syncthreads();
+    if (neighbor > 1) {
+        for (int i = 0; i < neighbor; i++) {
+            hierarchy_sum += (c[i] / C) * neighbor * logf((c[i] / C) * neighbor) / (neighbor * logf(neighbor));
+        }
+        hierarchy_results[v] = hierarchy_sum;
+    }else{
+        hierarchy_results[v] = 0;
+    }
+    delete[] c;
+}
+
+int cuda_hierarchy(
+    _IN_ const int* V,
+    _IN_ const int* E,
+    _IN_ const int* row,
+    _IN_ const int* col,
+    _IN_ const double* W,
+    _IN_ int num_nodes,
+    _IN_ int num_edges,
+    _IN_ bool is_directed,
+    _IN_ int* node_mask,
+    _OUT_ double* hierarchy_results
+){
+    int cuda_ret = cudaSuccess;
+    int EG_ret = EG_GPU_SUCC;
+    int min_grid_size = 0;
+    int block_size = 0;
+    
+    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, calculate_hierarchy, 0, 0);
+    int grid_size = (num_nodes + block_size - 1) / block_size;
+    
+    int* d_V;
+    int* d_E;
+    int* d_row;
+    int* d_col;
+    double* d_W;
+    int* d_node_mask;
+    double* d_hierarchy_results;
+
+    // 分配CUDA设备内存
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_V, (num_nodes+1) * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_E, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_row, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_col, num_edges * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_W, num_edges * sizeof(double)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_node_mask, num_nodes * sizeof(int)));
+    EXIT_IF_CUDA_FAILED(cudaMalloc((void**)&d_hierarchy_results, num_nodes * sizeof(double)));
+
+    // 将数据从主机拷贝到设备
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_V, V, (num_nodes+1) * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_E, E, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_row, row, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_col, col, num_edges * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_node_mask, node_mask, num_nodes * sizeof(int), cudaMemcpyHostToDevice));
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(d_W, W, num_edges * sizeof(double), cudaMemcpyHostToDevice));
+
+
+    // 调用 CUDA 内核
+    if(is_directed){
+        directed_calculate_hierarchy<<<grid_size, block_size>>>(d_V, d_E, d_row, d_col, d_W, num_nodes, num_edges, d_node_mask, d_hierarchy_results);
+    }else{
+        int block_size = 256;  // 每个块的线程数，可以根据设备进行调整
+        int grid_size = (num_nodes + NODES_PER_BLOCK - 1) / NODES_PER_BLOCK;  // 调整为每个块处理NODES_PER_BLOCK个节点
+        int shared_memory_size = 2 * sizeof(double) * block_size; 
+        calculate_hierarchy<<<grid_size, block_size, shared_memory_size>>>(d_V, d_E, d_W, num_nodes, d_node_mask, d_hierarchy_results);
+    }
+
+    EXIT_IF_CUDA_FAILED(cudaMemcpy(hierarchy_results, d_hierarchy_results, num_nodes * sizeof(double), cudaMemcpyDeviceToHost));
+exit:
+
+    cudaFree(d_V);
+    cudaFree(d_E);
+    cudaFree(d_row);
+    cudaFree(d_col);
+    cudaFree(d_W);
+    cudaFree(d_node_mask);
+    cudaFree(d_hierarchy_results);
+    if (cuda_ret != cudaSuccess) {
+        switch (cuda_ret) {
+            case cudaErrorMemoryAllocation:
+                EG_ret = EG_GPU_FAILED_TO_ALLOCATE_DEVICE_MEM;
+                break;
+            default:
+                EG_ret = EG_GPU_DEVICE_ERR;
+                break;
+        }
+    }
+
+    return EG_ret; 
+}
+}
\ No newline at end of file
diff --git a/gpu_easygraph/functions/structural_holes/hierarchy.cuh b/gpu_easygraph/functions/structural_holes/hierarchy.cuh
new file mode 100644
index 00000000..ac0f596c
--- /dev/null
+++ b/gpu_easygraph/functions/structural_holes/hierarchy.cuh
@@ -0,0 +1,19 @@
+#pragma once
+
+#include "common.h"
+
+namespace gpu_easygraph {
+
+int cuda_hierarchy(
+    _IN_ const int* V,
+    _IN_ const int* E,
+    _IN_ const int* row,
+    _IN_ const int* col,
+    _IN_ const double* W,
+    _IN_ int num_nodes,
+    _IN_ int num_edges,
+    _IN_ bool is_directed,
+    _IN_ int* node_mask,
+    _OUT_ double* hierarchy_results
+);
+} // namespace gpu_easygraph
\ No newline at end of file