From a76509dcb49f5ed2380dde15012a00a1a5759623 Mon Sep 17 00:00:00 2001 From: Marc Gilleron Date: Fri, 23 Aug 2024 00:18:32 +0100 Subject: [PATCH] Fix OutputWeight nodes when locally constant --- generators/graph/voxel_generator_graph.cpp | 48 +++++++-- generators/graph/voxel_graph_runtime.h | 10 ++ tests/tests.cpp | 1 + tests/voxel/test_voxel_graph.cpp | 107 +++++++++++++++++++++ tests/voxel/test_voxel_graph.h | 1 + 5 files changed, 158 insertions(+), 9 deletions(-) diff --git a/generators/graph/voxel_generator_graph.cpp b/generators/graph/voxel_generator_graph.cpp index 9b7d4ece1..1bfb065a6 100644 --- a/generators/graph/voxel_generator_graph.cpp +++ b/generators/graph/voxel_generator_graph.cpp @@ -142,25 +142,52 @@ void VoxelGeneratorGraph::gather_indices_and_weights( // TODO Optimization: exclude up-front outputs that are known to be zero? // So we choose the cases below based on non-zero outputs instead of total output count + // It is unfortunately not practical to handle all combinations of single-value/array-of-values for every + // layer, so for now we try to workaround that + struct WeightBufferArray { + FixedArray, 16> array; + + inline float get_weight(unsigned int output_index, size_t value_index) const { + Span s = array[output_index]; +#ifdef DEV_ENABLED + // That span can either have size 1 (means all values would be the same) or have regular buffer + // size. We use min() to quickly access that without branching, but that could hide actual OOB bugs, so + // make sure they are detected + ZN_ASSERT(s.size() > 0 && (value_index < s.size() || s.size() == 1)); +#endif + const float weight = s[math::min(value_index, s.size() - 1)]; + return weight; + } + }; + // TODO Could maybe put this part outside? - FixedArray, 16> buffers; + WeightBufferArray buffers; const unsigned int buffers_count = weight_outputs.size(); for (unsigned int oi = 0; oi < buffers_count; ++oi) { const WeightOutput &info = weight_outputs[oi]; - const pg::Runtime::Buffer &buffer = state.get_buffer(info.output_buffer_index); - buffers[oi] = Span(buffer.data, buffer.size); + const math::Interval &range = state.get_range_const_ref(info.output_buffer_index); + + if (range.is_single_value()) { + // The weight is locally constant (or compile-time constant). + // We can't use the usual buffer because if we use optimized execution mapping, they won't be filled by any + // operation + buffers.array[oi] = Span(&range.min, 1); + } else { + const pg::Runtime::Buffer &buffer = state.get_buffer(info.output_buffer_index); + buffers.array[oi] = Span(buffer.data, buffer.size); + } } if (buffers_count <= 4) { // Pick all results and fill with spare indices to keep semantic - unsigned int value_index = 0; + size_t value_index = 0; for (int rz = rmin.z; rz < rmax.z; ++rz) { for (int rx = rmin.x; rx < rmax.x; ++rx) { FixedArray weights; FixedArray indices = spare_indices; fill(weights, uint8_t(0)); for (unsigned int oi = 0; oi < buffers_count; ++oi) { - const float weight = buffers[oi][value_index]; + const float weight = buffers.get_weight(oi, value_index); // TODO Optimization: weight output nodes could already multiply by 255 and clamp afterward // so we would not need to do it here weights[oi] = math::clamp(weight * 255.f, 0.f, 255.f); @@ -180,13 +207,13 @@ void VoxelGeneratorGraph::gather_indices_and_weights( } else if (buffers_count == 4) { // Pick all results - unsigned int value_index = 0; + size_t value_index = 0; for (int rz = rmin.z; rz < rmax.z; ++rz) { for (int rx = rmin.x; rx < rmax.x; ++rx) { FixedArray weights; FixedArray indices; for (unsigned int oi = 0; oi < buffers_count; ++oi) { - const float weight = buffers[oi][value_index]; + const float weight = buffers.get_weight(oi, value_index); weights[oi] = math::clamp(weight * 255.f, 0.f, 255.f); indices[oi] = weight_outputs[oi].layer_index; } @@ -204,7 +231,7 @@ void VoxelGeneratorGraph::gather_indices_and_weights( } else { // More weights than we can have per voxel. Will need to pick most represented weights const float pivot = 1.f / 5.f; - unsigned int value_index = 0; + size_t value_index = 0; FixedArray skipped_outputs; for (int rz = rmin.z; rz < rmax.z; ++rz) { for (int rx = rmin.x; rx < rmax.x; ++rx) { @@ -219,7 +246,8 @@ void VoxelGeneratorGraph::gather_indices_and_weights( unsigned int recorded_weights = 0; // Pick up weights above pivot (this is not as correct as a sort but faster) for (unsigned int oi = 0; oi < buffers_count && recorded_weights < indices.size(); ++oi) { - const float weight = buffers[oi][value_index]; + const float weight = buffers.get_weight(oi, value_index); + if (weight > pivot) { weights[recorded_weights] = math::clamp(weight * 255.f, 0.f, 255.f); indices[recorded_weights] = weight_outputs[oi].layer_index; @@ -648,6 +676,8 @@ VoxelGenerator::Result VoxelGeneratorGraph::generate_block(VoxelGenerator::Voxel if (runtime_ptr->weight_outputs_count > 0 && !sdf_is_air) { // We can skip this when SDF is air because there won't be any matter to give a texture to // TODO Range analysis on that? + // Not easy to do that from here, they would have to ALL be locally constant in order to use a + // short-circuit... for (unsigned int i = 0; i < runtime_ptr->weight_outputs_count; ++i) { required_outputs.push_back(runtime_ptr->weight_output_indices[i]); } diff --git a/generators/graph/voxel_graph_runtime.h b/generators/graph/voxel_graph_runtime.h index 4efe4a5da..0046a2682 100644 --- a/generators/graph/voxel_graph_runtime.h +++ b/generators/graph/voxel_graph_runtime.h @@ -135,6 +135,16 @@ class Runtime { return ranges[address]; } + inline const math::Interval &get_range_const_ref(uint16_t address) const { + // TODO Just for convenience because STL bound checks aren't working in Godot 3 + ZN_ASSERT(address < buffers.size()); + return ranges[address]; + } + + inline uint32_t get_buffer_size() const { + return buffer_size; + } + void clear() { buffer_size = 0; // buffer_capacity = 0; diff --git a/tests/tests.cpp b/tests/tests.cpp index 418d590eb..7dc6bb8c6 100644 --- a/tests/tests.cpp +++ b/tests/tests.cpp @@ -127,6 +127,7 @@ void run_voxel_tests() { VOXEL_TEST(test_voxel_stream_sqlite_key_blob80_encoding); VOXEL_TEST(test_voxel_stream_sqlite_basic); VOXEL_TEST(test_voxel_stream_sqlite_coordinate_format); + VOXEL_TEST(test_voxel_graph_4_default_weights); print_line("------------ Voxel tests end -------------"); } diff --git a/tests/voxel/test_voxel_graph.cpp b/tests/voxel/test_voxel_graph.cpp index 03b538250..eb1809fb3 100644 --- a/tests/voxel/test_voxel_graph.cpp +++ b/tests/voxel/test_voxel_graph.cpp @@ -2222,4 +2222,111 @@ void test_voxel_graph_non_square_image() { ZN_TEST_ASSERT(sd.f > 2.9f && sd.f < 3.1); } +void test_voxel_graph_4_default_weights() { // Related to issue #686 + static constexpr uint32_t block_size = 16; + + struct L { + // The idea is to generate data into the memory cache, which may affect the result if there is a garbage buffer + // bug + static void warmup() { + Ref warmup_generator; + warmup_generator.instantiate(); + + VoxelGraphFunction &g = **warmup_generator->get_main_function(); + + // X --- Sin --- Add --- OutSDF + // / + // Y + + const uint32_t n_x = g.create_node(VoxelGraphFunction::NODE_INPUT_X); + const uint32_t n_y = g.create_node(VoxelGraphFunction::NODE_INPUT_Y); + const uint32_t n_sin = g.create_node(VoxelGraphFunction::NODE_SIN); + const uint32_t n_add = g.create_node(VoxelGraphFunction::NODE_ADD); + const uint32_t n_out_sdf = g.create_node(VoxelGraphFunction::NODE_OUTPUT_SDF); + + g.add_connection(n_x, 0, n_sin, 0); + g.add_connection(n_sin, 0, n_add, 0); + g.add_connection(n_add, 0, n_out_sdf, 0); + g.add_connection(n_y, 0, n_add, 1); + + CompilationResult result = warmup_generator->compile(false); + ZN_TEST_ASSERT(result.success); + + VoxelBuffer buffer(VoxelBuffer::ALLOCATOR_DEFAULT); + buffer.create(Vector3iUtil::create(block_size)); + + VoxelGenerator::VoxelQueryData q{ buffer, Vector3i(0, 0, 0), 0 }; + warmup_generator->generate_block(q); + } + + static void test(const float test_w0, const float test_w1, const float test_w2, const float test_w3) { + const uint8_t test_w0b = math::clamp(static_cast(255.0 * test_w0), 0, 255); + const uint8_t test_w1b = math::clamp(static_cast(255.0 * test_w1), 0, 255); + const uint8_t test_w2b = math::clamp(static_cast(255.0 * test_w2), 0, 255); + const uint8_t test_w3b = math::clamp(static_cast(255.0 * test_w3), 0, 255); + const uint32_t test_ew = encode_weights_to_packed_u16_lossy(test_w0b, test_w1b, test_w2b, test_w3b); + + Ref generator; + generator.instantiate(); + { + VoxelGraphFunction &g = **generator->get_main_function(); + + // Y --- Plane --- OutSDF + // OutWeight1 + // OutWeight2 + // OutWeight3 + // OutWeight4 + + const uint32_t n_y = g.create_node(VoxelGraphFunction::NODE_INPUT_Y); + const uint32_t n_plane = g.create_node(VoxelGraphFunction::NODE_SDF_PLANE); + const uint32_t n_ow0 = g.create_node(VoxelGraphFunction::NODE_OUTPUT_WEIGHT); + const uint32_t n_ow1 = g.create_node(VoxelGraphFunction::NODE_OUTPUT_WEIGHT); + const uint32_t n_ow2 = g.create_node(VoxelGraphFunction::NODE_OUTPUT_WEIGHT); + const uint32_t n_ow3 = g.create_node(VoxelGraphFunction::NODE_OUTPUT_WEIGHT); + // Putting this one last can make bugs show up. Unusual setups increase entropy. + const uint32_t n_out_sdf = g.create_node(VoxelGraphFunction::NODE_OUTPUT_SDF); + + g.set_node_default_input(n_plane, 1, 1.0); + + g.set_node_default_input(n_ow0, 0, test_w0); + g.set_node_default_input(n_ow1, 0, test_w1); + g.set_node_default_input(n_ow2, 0, test_w2); + g.set_node_default_input(n_ow3, 0, test_w3); + + g.set_node_param(n_ow0, 0, 0); + g.set_node_param(n_ow1, 0, 1); + g.set_node_param(n_ow2, 0, 2); + g.set_node_param(n_ow3, 0, 3); + + g.add_connection(n_y, 0, n_plane, 0); + g.add_connection(n_plane, 0, n_out_sdf, 0); + + CompilationResult result = generator->compile(false); + ZN_TEST_ASSERT(result.success); + } + + VoxelBuffer buffer(VoxelBuffer::ALLOCATOR_DEFAULT); + buffer.create(Vector3iUtil::create(block_size)); + + VoxelGenerator::VoxelQueryData q{ buffer, Vector3i(0, 0, 0), 0 }; + // generator->set_use_optimized_execution_map(false); + generator->generate_block(q); + + const uint32_t ei = buffer.get_voxel(10, 0, 0, VoxelBuffer::CHANNEL_INDICES); + const uint32_t ew = buffer.get_voxel(10, 0, 0, VoxelBuffer::CHANNEL_WEIGHTS); + + const FixedArray indices = decode_indices_from_packed_u16(ei); + const FixedArray weights = decode_weights_from_packed_u16(ew); + + ZN_TEST_ASSERT(indices[0] == 0 && indices[1] == 1 && indices[2] == 2 && indices[3] == 3); + ZN_TEST_ASSERT(ew == test_ew); + } + }; + + // L::warmup(); + L::test(0.5, 0.2, 0.4, 0.8); + L::test(0.5, 0.0, 0.25, 1.0); + L::test(0.5, 0.2, 0.4, 0.8); +} + } // namespace zylann::voxel::tests diff --git a/tests/voxel/test_voxel_graph.h b/tests/voxel/test_voxel_graph.h index f005fc179..a34939ebc 100644 --- a/tests/voxel/test_voxel_graph.h +++ b/tests/voxel/test_voxel_graph.h @@ -35,6 +35,7 @@ void test_voxel_graph_many_weight_outputs(); void test_image_range_grid(); void test_voxel_graph_many_subdivisions(); void test_voxel_graph_non_square_image(); +void test_voxel_graph_4_default_weights(); } // namespace zylann::voxel::tests