Skip to content

Commit

Permalink
Add ccv_cnnp_debug.
Browse files Browse the repository at this point in the history
  • Loading branch information
liuliu committed Sep 28, 2024
1 parent d2622a3 commit ef7bd53
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 0 deletions.
91 changes: 91 additions & 0 deletions lib/nnc/ccv_cnnp_model_addons.c
Original file line number Diff line number Diff line change
Expand Up @@ -4003,3 +4003,94 @@ static ccv_cnnp_model_t* _ccv_cnnp_scaled_dot_product_attention_copy(const ccv_c
const ccv_cnnp_model_scaled_dot_product_attention_t* const self = (const ccv_cnnp_model_scaled_dot_product_attention_t*)super;
return ccv_cnnp_scaled_dot_product_attention(self->scale, self->is_causal, self->has_attn_mask, self->flags, self->fused_unify_head_weights, self->no_bias, self->super.is_trainable, self->super.name);
}

// MARK - Debug Layer

typedef struct {
	ccv_cnnp_model_t super; // Base model; must be first member so this struct can be cast to ccv_cnnp_model_t.
	ccv_nnc_tensor_symbol_t output; // Backing storage for the single output symbol (super.outputs points here).
	ccv_cnnp_model_debug_f debugger; // User callback invoked with the input tensors at execution time.
	ccv_cnnp_model_debug_context_deinit_f debug_deinit; // Optional destructor for debug_context; may be 0.
	ccv_cnnp_model_debug_context_copy_f debug_copy; // Optional deep-copier for debug_context when the model is copied; may be 0.
	void* debug_context; // Opaque user context passed through to the callbacks above.
} ccv_cnnp_model_debug_t;

// Custom-op trampoline: forwards the input tensors to the user-provided debugger
// callback. No compute happens here — the output is an alias of the first input,
// set up in _ccv_cnnp_debug_build.
static int _ccv_cnnp_debug_exec(const ccv_nnc_cmd_t cmd, const ccv_nnc_hint_t hint, const int flags, ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_tensor_t* const* const outputs, const int output_size, ccv_nnc_stream_context_t* const stream_context)
{
	if (cmd.cmd == CCV_NNC_CUSTOM_BACKWARD)
	{
		assert(0 && "don't support debug backward pass yet");
		// In NDEBUG builds the assert above compiles out; bail out explicitly so we
		// never invoke the debugger with gradient tensors it was not written to see.
		return CCV_NNC_EXEC_INVALID;
	}
	// cmd.data carries the model pointer, stashed by _ccv_cnnp_debug_build.
	ccv_cnnp_model_debug_t* const self = (ccv_cnnp_model_debug_t*)cmd.data;
	self->debugger(inputs, input_size, stream_context, self->debug_context);
	return CCV_NNC_EXEC_SUCCESS;
}

// Command vtab for the debug custom op: forward only (backward is rejected in
// _ccv_cnnp_debug_exec).
static ccv_nnc_cmd_vtab_t ccv_cnnp_debug_exec_isa = {
	.exec = _ccv_cnnp_debug_exec
};

// Build step for the debug model: emits one output symbol that aliases inputs[0]
// (identity pass-through, so the op deliberately violates single-static-assignment),
// plus a custom exec node that calls back into the user debugger at run time.
static void _ccv_cnnp_debug_build(ccv_cnnp_model_t* const self, ccv_nnc_symbolic_graph_t* const graph, const ccv_nnc_tensor_symbol_t* const inputs, const int input_size, ccv_nnc_tensor_symbol_t* const outputs, const int output_size)
{
	PRINT(CCV_CLI_VERBOSE, "[cnnp_debug_build] -\n");
	assert(input_size >= 1);
	assert(output_size == 1);
	// If inputs[0] is itself an alias, `to` is the underlying symbol; otherwise no-symbol.
	ccv_nnc_tensor_symbol_t to = ccv_nnc_tensor_symbol_alias_to(graph, inputs[0]);
	ccv_nnc_tensor_param_t output_params = ccv_nnc_tensor_symbol_params(graph, inputs[0]);
	if (to.d == CCV_NNC_NO_TENSOR_SYMBOL) // If we are not reshape an alias, it is straightforward.
	{
		// Plain symbol: alias it directly with zero offsets and the natural (contiguous) stride.
		int ofs[CCV_NNC_MAX_DIM_ALLOC] = {0};
		int stride[CCV_NNC_MAX_DIM_ALLOC];
		ccv_nnc_tensor_get_stride(output_params.dim, stride);
		outputs[0] = ccv_nnc_tensor_symbol_alias_new(graph, inputs[0], ofs, stride, output_params, 0);
	} else {
		// inputs[0] is an alias: re-create an equivalent alias over the same underlying
		// symbol `to`, reusing the original offsets/strides so the view is unchanged.
		int old_ofs[CCV_NNC_MAX_DIM_ALLOC];
		int old_stride[CCV_NNC_MAX_DIM_ALLOC];
		ccv_nnc_tensor_symbol_alias_params(graph, inputs[0], old_ofs, old_stride);
		outputs[0] = ccv_nnc_tensor_symbol_alias_new(graph, to, old_ofs, old_stride, output_params, 0);
	}
	ccv_nnc_cmd_t cmd = ccv_nnc_cmd(CCV_NNC_CUSTOM_FORWARD, (ccv_nnc_cmd_vtab_t*)&ccv_cnnp_debug_exec_isa, (ccv_nnc_cmd_param_t){}, 0);
	// Smuggle the model pointer through cmd.data so _ccv_cnnp_debug_exec can reach
	// the user callback and context.
	cmd.data = self;
	ccv_nnc_graph_exec_symbol_t make_debug = ccv_nnc_graph_exec_symbol_new(graph, cmd, inputs, input_size, outputs, 1, "debug");
	// Disable any optimizations.
	// Without this flag, simplification passes (ops fusion / pruning) could elide the
	// node since its output merely aliases its input.
	ccv_nnc_graph_exec_symbol_set_flags(graph, make_debug, CCV_NNC_GRAPH_EXEC_DISABLE_OPT);
}

// Tear-down hook: release the user-supplied debug context through its destructor,
// when both a destructor and a context were provided.
static void _ccv_cnnp_debug_deinit(ccv_cnnp_model_t* const super)
{
	const ccv_cnnp_model_debug_t* const self = (const ccv_cnnp_model_debug_t*)super;
	if (!self->debug_deinit || !self->debug_context)
		return; // Nothing to free, or no way to free it.
	self->debug_deinit(self->debug_context);
}

static ccv_cnnp_model_t* _ccv_cnnp_debug_copy(const ccv_cnnp_model_t* const super, void* const context);

// Model vtab for the debug layer: build / deinit / copy hooks defined above and below.
static const ccv_cnnp_model_vtab_t ccv_cnnp_debug_isa = {
	.build = _ccv_cnnp_debug_build,
	.deinit = _ccv_cnnp_debug_deinit,
	.copy = _ccv_cnnp_debug_copy,
};

// Public constructor for the debug layer. `func` is invoked with the layer's input
// tensors whenever the model executes; `context` is threaded through to it, with
// optional `deinit`/`copy` hooks managing the context's lifetime.
ccv_cnnp_model_t* ccv_cnnp_debug(ccv_cnnp_model_debug_f func, void* const context, ccv_cnnp_model_debug_context_deinit_f deinit, ccv_cnnp_model_debug_context_copy_f copy, const char* const name)
{
	// Zero-initialized allocation: anything not set explicitly stays 0.
	ccv_cnnp_model_debug_t* const debug = (ccv_cnnp_model_debug_t*)cccalloc(1, sizeof(ccv_cnnp_model_debug_t));
	// User callback plumbing.
	debug->debugger = func;
	debug->debug_context = context;
	debug->debug_deinit = deinit;
	debug->debug_copy = copy;
	// Model base: variable input count (0), exactly one output symbol backed by
	// the struct's own `output` field.
	debug->super.isa = &ccv_cnnp_debug_isa;
	debug->super.input_size = 0;
	debug->super.outputs = &debug->output;
	debug->super.output_size = 1;
	ccv_cnnp_model_copy_name(&debug->super, name);
	return (ccv_cnnp_model_t*)debug;
}

// Copy hook: duplicate the debug model, deep-copying the user context when a
// copier was supplied, otherwise sharing the same context pointer.
static ccv_cnnp_model_t* _ccv_cnnp_debug_copy(const ccv_cnnp_model_t* const super, void* const context)
{
	const ccv_cnnp_model_debug_t* const self = (const ccv_cnnp_model_debug_t*)super;
	void* const duplicated_context = (self->debug_copy && self->debug_context) ? self->debug_copy(self->debug_context) : self->debug_context;
	return ccv_cnnp_debug(self->debugger, duplicated_context, self->debug_deinit, self->debug_copy, self->super.name);
}
28 changes: 28 additions & 0 deletions lib/nnc/ccv_nnc.h
Original file line number Diff line number Diff line change
Expand Up @@ -4705,6 +4705,34 @@ CCV_WARN_UNUSED(ccv_cnnp_model_t*) ccv_cnnp_contiguous(const char* const name);
* @return A model that can apply scaled dot product attention compute.
*/
CCV_WARN_UNUSED(ccv_cnnp_model_t*) ccv_cnnp_scaled_dot_product_attention(const float scale, const int is_causal, const int has_attn_mask, const int flags, const int fused_unify_head_weights, const int no_bias, const int is_trainable, const char* const name);
/**
 * The function prototype for the callback invoked during model execution at this position.
 */
typedef void (*ccv_cnnp_model_debug_f)(ccv_nnc_tensor_t* const* const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context, void* const context);
/**
 * The function prototype to destruct the context.
 */
typedef void (*ccv_cnnp_model_debug_context_deinit_f)(void* const context);
/**
 * The function prototype to copy the context.
 */
typedef void* (*ccv_cnnp_model_debug_context_copy_f)(void* const context);
/**
 * A special model that takes n inputs and outputs the first one. This is a special model because it
 * generates a graph that violates the single-static-assignment rule by having the output be an alias
 * of the input. It also inserts a custom op that allows you to intercept the model execution and
 * extract useful information from it (i.e. debug-print tensors, generate stats like max / min / nan
 * etc.). This is safe to insert anywhere because it doesn't impact the graph execution process, but
 * you are also advised not to use this method to modify the tensors during execution. There will
 * be another method for you to insert a custom op in the model.
 * @param func The function to call during the model execution.
 * @param context The context object to be passed along to the callback.
 * @param deinit The deinit method used to free up the context.
 * @param copy The copy method used to make a duplicate of the context.
 * @param name The unique name of the model.
 * @return A model that passes its first input through while invoking the callback.
 */
CCV_WARN_UNUSED(ccv_cnnp_model_t*) ccv_cnnp_debug(ccv_cnnp_model_debug_f func, void* const context, ccv_cnnp_model_debug_context_deinit_f deinit, ccv_cnnp_model_debug_context_copy_f copy, const char* const name);

/** @} */

Expand Down
4 changes: 4 additions & 0 deletions lib/nnc/ccv_nnc_symbolic_graph_simplify.c
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,8 @@ static void _ccv_nnc_symbolic_graph_ops_fusion(ccv_nnc_symbolic_graph_simplify_t
// If already marked as dead, skip.
if (exec_dead[idx >> 5] & (1u << (idx & 0x1f)))
continue;
if (node->flags & CCV_NNC_GRAPH_EXEC_DISABLE_OPT) // If optimization pass disabled, skip.
continue;
// Run through rudimentary pattern matching for ops_seq. There are better ways to do this (immediately come to mind, Boyer-Moore). However, this is simpler to code.
// If I am running into performance issues with this, I would be very happy.
for (i = 0; i < sizeof(ccv_nnc_ops_fusions) / sizeof(ccv_nnc_ops_fusion_t); i++)
Expand Down Expand Up @@ -891,6 +893,8 @@ static void _ccv_nnc_symbolic_graph_pruning(ccv_nnc_symbolic_graph_simplify_t* c
_ccv_nnc_symbolic_graph_simplify_update_output_execs(simplify);
// Mark everything visited as dead.
ccv_nnc_graph_visit_for(simplify->visit, simplify->exec_symbol_info, node, idx) {
if (node->flags & CCV_NNC_GRAPH_EXEC_DISABLE_OPT) // If optimization pass disabled, skip.
continue;
exec_dead[idx >> 5] |= (1u << (idx & 0x1f));
for (i = 0; i < node->input_size; i++)
{
Expand Down
47 changes: 47 additions & 0 deletions test/unit/nnc/cnnp.core.tests.c
Original file line number Diff line number Diff line change
Expand Up @@ -2023,6 +2023,53 @@ TEST_CASE("chunk a tensor into several smaller ones, variant 2")
ccv_cnnp_model_free(final);
}

// Captured by the debug callback below so the test can assert on the intermediate value.
static float _debug_value = 0;

// Debug callback for the test: records the first scalar of the first input tensor.
static void _debug_test(ccv_nnc_tensor_t* const *const inputs, const int input_size, ccv_nnc_stream_context_t* const stream_context, void* const context)
{
	_debug_value = inputs[0]->data.f32[0];
}

TEST_CASE("debug can intercept model execution")
{
	// Graph under test: (in0 + in1) -> debug tap -> * in2 -> + in3.
	ccv_cnnp_model_io_t in0 = ccv_cnnp_input();
	ccv_cnnp_model_io_t in1 = ccv_cnnp_input();
	ccv_cnnp_model_io_t in2 = ccv_cnnp_input();
	ccv_cnnp_model_io_t in3 = ccv_cnnp_input();
	ccv_cnnp_model_io_t sum01 = ccv_cnnp_model_apply(ccv_cnnp_sum(0), MODEL_IO_LIST(in0, in1));
	// Insert the debug layer on the intermediate sum; _debug_test records its value.
	ccv_cnnp_model_io_t tapped = ccv_cnnp_model_apply(ccv_cnnp_debug(_debug_test, 0, 0, 0, 0), MODEL_IO_LIST(tapped ? tapped : sum01));
	ccv_cnnp_model_io_t out = ccv_cnnp_model_apply(ccv_cnnp_mul(1, 0), MODEL_IO_LIST(tapped, in2));
	out = ccv_cnnp_model_apply(ccv_cnnp_sum(0), MODEL_IO_LIST(out, in3));
	ccv_cnnp_model_t* const final = ccv_cnnp_model_new(MODEL_IO_LIST(in0, in1, in2, in3), MODEL_IO_LIST(out), 1, 0);
	// All inputs are 1-element float tensors on CPU.
	const ccv_nnc_tensor_param_t p0 = CPU_TENSOR_NCHW(32F, 1);
	const ccv_nnc_tensor_param_t p1 = CPU_TENSOR_NCHW(32F, 1);
	const ccv_nnc_tensor_param_t p2 = CPU_TENSOR_NCHW(32F, 1);
	const ccv_nnc_tensor_param_t p3 = CPU_TENSOR_NCHW(32F, 1);
	ccv_cnnp_model_compile(final, TENSOR_PARAM_LIST(p0, p1, p2, p3), CMD_NOOP(), CMD_NOOP());
	ccv_nnc_tensor_t* const x0 = ccv_nnc_tensor_new(0, p0, 0);
	ccv_nnc_tensor_t* const x1 = ccv_nnc_tensor_new(0, p1, 0);
	ccv_nnc_tensor_t* const x2 = ccv_nnc_tensor_new(0, p2, 0);
	ccv_nnc_tensor_t* const x3 = ccv_nnc_tensor_new(0, p3, 0);
	ccv_nnc_tensor_t* const y = ccv_nnc_tensor_new(0, p0, 0);
	x0->data.f32[0] = 0.5;
	x1->data.f32[0] = 0.75;
	x2->data.f32[0] = 1.75;
	x3->data.f32[0] = 2.5;
	_debug_value = 0; // Reset the side channel before evaluation.
	ccv_cnnp_model_evaluate(final, (ccv_cnnp_evaluate_param_t){
		.is_test = 1
	}, TENSOR_LIST(x0, x1, x2, x3), TENSOR_LIST(y), 0, 0);
	CNNP_MODEL_GEN(final, CCV_NNC_LONG_DOT_GRAPH);
	REQUIRE_EQ_WITH_TOLERANCE(y->data.f32[0], (0.5 + 0.75) * 1.75 + 2.5, 1e-5, "should match the final result");
	// The debug callback must have observed the intermediate sum, not the final output.
	REQUIRE_EQ_WITH_TOLERANCE(_debug_value, 0.5 + 0.75, 1e-5, "should match the intermediate result");
	ccv_cnnp_model_free(final);
	ccv_nnc_tensor_free(x0);
	ccv_nnc_tensor_free(x1);
	ccv_nnc_tensor_free(x2);
	ccv_nnc_tensor_free(x3);
	ccv_nnc_tensor_free(y);
}

TEST_CASE("LoRA fine-tuning GEMM set is_trainable to false and with gradient checkpointing")
{
const ccv_cnnp_model_io_t input = ccv_cnnp_input();
Expand Down

0 comments on commit ef7bd53

Please sign in to comment.