Skip to content

Commit

Permalink
Add ccv_cnnp_debug.
Browse files Browse the repository at this point in the history
  • Loading branch information
liuliu committed Sep 28, 2024
1 parent d2622a3 commit ef7bd53
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 0 deletions.
91 changes: 91 additions & 0 deletions lib/nnc/ccv_cnnp_model_addons.c
Original file line number Diff line number Diff line change
Expand Up @@ -4003,3 +4003,94 @@ static ccv_cnnp_model_t* _ccv_cnnp_scaled_dot_product_attention_copy(const ccv_c
const ccv_cnnp_model_scaled_dot_product_attention_t* const self = (const ccv_cnnp_model_scaled_dot_product_attention_t*)super;
return ccv_cnnp_scaled_dot_product_attention(self->scale, self->is_causal, self->has_attn_mask, self->flags, self->fused_unify_head_weights, self->no_bias, self->super.is_trainable, self->super.name);
}

// MARK - Debug Layer

typedef struct {
	ccv_cnnp_model_t super; // Base model; must be first member so this struct can be cast to ccv_cnnp_model_t.
	ccv_nnc_tensor_symbol_t output; // Backing storage for the single output symbol (super.outputs points here).
	ccv_cnnp_model_debug_f debugger; // User callback invoked with the input tensors at execution time.
	ccv_cnnp_model_debug_context_deinit_f debug_deinit; // Optional destructor for debug_context; may be 0.
	ccv_cnnp_model_debug_context_copy_f debug_copy; // Optional deep-copier for debug_context when the model is copied; may be 0.
	void* debug_context; // Opaque user context passed through to the callbacks above.
} ccv_cnnp_model_debug_t;

// Custom-op trampoline: forwards the input tensors to the user-provided debugger
// callback. No compute happens here — the output is an alias of the first input,
// set up in _ccv_cnnp_debug_build.
static int _ccv_cnnp_debug_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	if (cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
	{
		assert(0 && "don't support debug backward pass yet");
		// In NDEBUG builds the assert above compiles out; bail out explicitly so we
		// never invoke the debugger with gradient tensors it was not written to see.
		return CCV_NNC_EXEC_INVALID;
	}
	// cmd.data carries the model pointer, stashed by _ccv_cnnp_debug_build.
	ccv_cnnp_model_debug_t* const self = (ccv_cnnp_model_debug_t*)cmd.data;
	self->debugger(inputs, input_size, stream_context, self->debug_context);
	return CCV_NNC_EXEC_SUCCESS;
}

// Command vtab for the debug custom op: forward only (backward is rejected in
// _ccv_cnnp_debug_exec).
static ccv_nnc_cmd_vtab_t ccv_cnnp_debug_exec_isa = {
	.exec = _ccv_cnnp_debug_exec
};

// Build step for the debug model: emits one output symbol that aliases inputs[0]
// (identity pass-through, so the op deliberately violates single-static-assignment),
// plus a custom exec node that calls back into the user debugger at run time.
static void _ccv_cnnp_debug_build(ccv_cnnp_model_t* const self, ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, ccv_nnc_tensor_symbol_t* const outputs, const int output_size)
{
	PRINT(CCV_CLI_VERBOSE, "[cnnp_debug_build] -\n");
	assert(input_size >= 1);
	assert(output_size == 1);
	// If inputs[0] is itself an alias, `to` is the underlying symbol; otherwise no-symbol.
	ccv_nnc_tensor_symbol_t to = ccv_nnc_tensor_symbol_alias_to(graph, inputs[0]);
	ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(graph, inputs[0]);
	if (to.d == CCV_NNC_NO_TENSOR_SYMBOL) // If we are not reshape an alias, it is straightforward.
	{
		// Plain symbol: alias it directly with zero offsets and the natural (contiguous) stride.
		int ofs[CCV_NNC_MAX_DIM_ALLOC] = {0};
		int stride[CCV_NNC_MAX_DIM_ALLOC];
		ccv_nnc_tensor_get_stride(output_params.dim, stride);
		outputs[0] = ccv_nnc_tensor_symbol_alias_new(graph, inputs[0], ofs, stride, output_params, 0);
	} else {
		// inputs[0] is an alias: re-create an equivalent alias over the same underlying
		// symbol `to`, reusing the original offsets/strides so the view is unchanged.
		int old_ofs[CCV_NNC_MAX_DIM_ALLOC];
		int old_stride[CCV_NNC_MAX_DIM_ALLOC];
		ccv_nnc_tensor_symbol_alias_params(graph, inputs[0], old_ofs, old_stride);
		outputs[0] = ccv_nnc_tensor_symbol_alias_new(graph, to, old_ofs, old_stride, output_params, 0);
	}
	ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, (ccv_nnc_cmd_vtab_t*)&ccv_cnnp_debug_exec_isa, (ccv_nnc_cmd_param_t){}, 0);
	// Smuggle the model pointer through cmd.data so _ccv_cnnp_debug_exec can reach
	// the user callback and context.
	cmd.data = self;
	ccv_nnc_graph_exec_symbol_t make_debug = ccv_nnc_graph_exec_symbol_new(graph, cmd, inputs, input_size, outputs, 1, "debug");
	// Disable any optimizations.
	// Without this flag, simplification passes (ops fusion / pruning) could elide the
	// node since its output merely aliases its input.
	ccv_nnc_graph_exec_symbol_set_flags(graph, make_debug, CCV_NNC_GRAPH_EXEC_DISABLE_OPT);
}

// Tear-down hook: release the user-supplied debug context through its destructor,
// when both a destructor and a context were provided.
static void _ccv_cnnp_debug_deinit(ccv_cnnp_model_t* const super)
{
	const ccv_cnnp_model_debug_t* const self = (const ccv_cnnp_model_debug_t*)super;
	if (!self->debug_deinit || !self->debug_context)
		return; // Nothing to free, or no way to free it.
	self->debug_deinit(self->debug_context);
}

static ccv_cnnp_model_t* _ccv_cnnp_debug_copy(const ccv_cnnp_model_t* const super, void* const context);

// Model vtab for the debug layer: build / deinit / copy hooks defined above and below.
static const ccv_cnnp_model_vtab_t ccv_cnnp_debug_isa = {
	.build = _ccv_cnnp_debug_build,
	.deinit = _ccv_cnnp_debug_deinit,
	.copy = _ccv_cnnp_debug_copy,
};

// Public constructor for the debug layer. `func` is invoked with the layer's input
// tensors whenever the model executes; `context` is threaded through to it, with
// optional `deinit`/`copy` hooks managing the context's lifetime.
ccv_cnnp_model_t* ccv_cnnp_debug(ccv_cnnp_model_debug_f func, void* const context, ccv_cnnp_model_debug_context_deinit_f deinit, ccv_cnnp_model_debug_context_copy_f copy, const char* const name)
{
	// Zero-initialized allocation: anything not set explicitly stays 0.
	ccv_cnnp_model_debug_t* const debug = (ccv_cnnp_model_debug_t*)cccalloc(1, sizeof(ccv_cnnp_model_debug_t));
	// User callback plumbing.
	debug->debugger = func;
	debug->debug_context = context;
	debug->debug_deinit = deinit;
	debug->debug_copy = copy;
	// Model base: variable input count (0), exactly one output symbol backed by
	// the struct's own `output` field.
	debug->super.isa = &ccv_cnnp_debug_isa;
	debug->super.input_size = 0;
	debug->super.outputs = &debug->output;
	debug->super.output_size = 1;
	ccv_cnnp_model_copy_name(&debug->super, name);
	return (ccv_cnnp_model_t*)debug;
}

// Copy hook: duplicate the debug model, deep-copying the user context when a
// copier was supplied, otherwise sharing the same context pointer.
static ccv_cnnp_model_t* _ccv_cnnp_debug_copy(const ccv_cnnp_model_t* const super, void* const context)
{
	const ccv_cnnp_model_debug_t* const self = (const ccv_cnnp_model_debug_t*)super;
	void* const duplicated_context = (self->debug_copy && self->debug_context) ? self->debug_copy(self->debug_context) : self->debug_context;
	return ccv_cnnp_debug(self->debugger, duplicated_context, self->debug_deinit, self->debug_copy, self->super.name);
}
28 changes: 28 additions & 0 deletions lib/nnc/ccv_nnc.h
Original file line number Diff line number Diff line change
Expand Up @@ -4705,6 +4705,34 @@ CCV_WARN_UNUSED(ccv_cnnp_model_t*) ccv_cnnp_contiguous(const char* const name);
* @return A model that can apply scaled dot product attention compute.
*/
CCV_WARN_UNUSED(ccv_cnnp_model_t*) ccv_cnnp_scaled_dot_product_attention(const float scale, const int is_causal, const int has_attn_mask, const int flags, const int fused_unify_head_weights, const int no_bias, const int is_trainable, const char* const name);
/**
 * The function prototype for the callback invoked during model execution at this position.
 */
typedef void (*ccv_cnnp_model_debug_f)(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context, void* const context);
/**
 * The function prototype to destruct the context.
 */
typedef void (*ccv_cnnp_model_debug_context_deinit_f)(void* const context);
/**
 * The function prototype to copy the context.
 */
typedef void* (*ccv_cnnp_model_debug_context_copy_f)(void* const context);
/**
 * A special model that takes n inputs and outputs the first one. This is a special model because it
 * generates a graph that violates the single-static-assignment rule by having the output be an alias
 * of the input. It also inserts a custom op that allows you to intercept the model execution and
 * extract useful information from it (i.e. debug-print tensors, generate stats like max / min / nan
 * etc.). This is safe to insert anywhere because it doesn't impact the graph execution process, but
 * you are also advised not to use this method to modify the tensors during execution. There will
 * be another method for you to insert a custom op in the model.
 * @param func The function to call during the model execution.
 * @param context The context object to be passed along to the callback.
 * @param deinit The deinit method used to free up the context.
 * @param copy The copy method used to make a duplicate of the context.
 * @param name The unique name of the model.
 * @return A model that passes its first input through while invoking the callback.
 */
CCV_WARN_UNUSED(ccv_cnnp_model_t*) ccv_cnnp_debug(ccv_cnnp_model_debug_f func, void* const context, ccv_cnnp_model_debug_context_deinit_f deinit, ccv_cnnp_model_debug_context_copy_f copy, const char* const name);

/** @} */

Expand Down
4 changes: 4 additions & 0 deletions lib/nnc/ccv_nnc_symbolic_graph_simplify.c
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,8 @@ static void _ccv_nnc_symbolic_graph_ops_fusion(ccv_nnc_symbolic_graph_simplify_t
// If already marked as dead, skip.
if (exec_dead[idx >> 5] & (1u << (idx & 0x1f)))
continue;
if (node->flags & CCV_NNC_GRAPH_EXEC_DISABLE_OPT) // If optimization pass disabled, skip.
continue;
// Run through rudimentary pattern matching for ops_seq. There are better ways to do this (immediately come to mind, Boyer-Moore). However, this is simpler to code.
// If I am running into performance issues with this, I would be very happy.
for (i = 0; i < sizeof(ccv_nnc_ops_fusions) / sizeof(ccv_nnc_ops_fusion_t); i++)
Expand Down Expand Up @@ -891,6 +893,8 @@ static void _ccv_nnc_symbolic_graph_pruning(ccv_nnc_symbolic_graph_simplify_t* c
_ccv_nnc_symbolic_graph_simplify_update_output_execs(simplify);
// Mark everything visited as dead.
ccv_nnc_graph_visit_for(simplify->visit, simplify->exec_symbol_info, node, idx) {
if (node->flags & CCV_NNC_GRAPH_EXEC_DISABLE_OPT) // If optimization pass disabled, skip.
continue;
exec_dead[idx >> 5] |= (1u << (idx & 0x1f));
for (i = 0; i < node->input_size; i++)
{
Expand Down
47 changes: 47 additions & 0 deletions test/unit/nnc/cnnp.core.tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -2023,6 +2023,53 @@ TEST_CASE("chunk a tensor into several smaller ones, variant 2")
ccv_cnnp_model_free(final);
}

// Captured by the debug callback below so the test can assert on the intermediate value.
static float _debug_value = 0;

// Debug callback for the test: records the first scalar of the first input tensor.
static void _debug_test(ccv_nnc_tensor_t* const *const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context, void* const context)
{
	_debug_value = inputs[0]->data.f32[0];
}

TEST_CASE("debug can intercept model execution")
{
	// Graph under test: (in0 + in1) -> debug tap -> * in2 -> + in3.
	ccv_cnnp_model_io_t in0 = ccv_cnnp_input();
	ccv_cnnp_model_io_t in1 = ccv_cnnp_input();
	ccv_cnnp_model_io_t in2 = ccv_cnnp_input();
	ccv_cnnp_model_io_t in3 = ccv_cnnp_input();
	ccv_cnnp_model_io_t sum01 = ccv_cnnp_model_apply(ccv_cnnp_sum(0), MODEL_IO_LIST(in0, in1));
	// Insert the debug layer on the intermediate sum; _debug_test records its value.
	ccv_cnnp_model_io_t tapped = ccv_cnnp_model_apply(ccv_cnnp_debug(_debug_test, 0, 0, 0, 0), MODEL_IO_LIST(tapped ? tapped : sum01));
	ccv_cnnp_model_io_t out = ccv_cnnp_model_apply(ccv_cnnp_mul(1, 0), MODEL_IO_LIST(tapped, in2));
	out = ccv_cnnp_model_apply(ccv_cnnp_sum(0), MODEL_IO_LIST(out, in3));
	ccv_cnnp_model_t* const final = ccv_cnnp_model_new(MODEL_IO_LIST(in0, in1, in2, in3), MODEL_IO_LIST(out), 1, 0);
	// All inputs are 1-element float tensors on CPU.
	const ccv_nnc_tensor_param_t p0 = CPU_TENSOR_NCHW(32F, 1);
	const ccv_nnc_tensor_param_t p1 = CPU_TENSOR_NCHW(32F, 1);
	const ccv_nnc_tensor_param_t p2 = CPU_TENSOR_NCHW(32F, 1);
	const ccv_nnc_tensor_param_t p3 = CPU_TENSOR_NCHW(32F, 1);
	ccv_cnnp_model_compile(final, TENSOR_PARAM_LIST(p0, p1, p2, p3), CMD_NOOP(), CMD_NOOP());
	ccv_nnc_tensor_t* const x0 = ccv_nnc_tensor_new(0, p0, 0);
	ccv_nnc_tensor_t* const x1 = ccv_nnc_tensor_new(0, p1, 0);
	ccv_nnc_tensor_t* const x2 = ccv_nnc_tensor_new(0, p2, 0);
	ccv_nnc_tensor_t* const x3 = ccv_nnc_tensor_new(0, p3, 0);
	ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, p0, 0);
	x0->data.f32[0] = 0.5;
	x1->data.f32[0] = 0.75;
	x2->data.f32[0] = 1.75;
	x3->data.f32[0] = 2.5;
	_debug_value = 0; // Reset the side channel before evaluation.
	ccv_cnnp_model_evaluate(final, (ccv_cnnp_evaluate_param_t){
		.is_test = 1
	}, TENSOR_LIST(x0, x1, x2, x3), TENSOR_LIST(y), 0, 0);
	CNNP_MODEL_GEN(final, CCV_NNC_LONG_DOT_GRAPH);
	REQUIRE_EQ_WITH_TOLERANCE(y->data.f32[0], (0.5 + 0.75) * 1.75 + 2.5, 1e-5, "should match the final result");
	// The debug callback must have observed the intermediate sum, not the final output.
	REQUIRE_EQ_WITH_TOLERANCE(_debug_value, 0.5 + 0.75, 1e-5, "should match the intermediate result");
	ccv_cnnp_model_free(final);
	ccv_nnc_tensor_free(x0);
	ccv_nnc_tensor_free(x1);
	ccv_nnc_tensor_free(x2);
	ccv_nnc_tensor_free(x3);
	ccv_nnc_tensor_free(y);
}

TEST_CASE("LoRA fine-tuning GEMM set is_trainable to false and with gradient checkpointing")
{
const ccv_cnnp_model_io_t input = ccv_cnnp_input();
Expand Down

0 comments on commit ef7bd53

Please sign in to comment.