diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py
index e9ed1439cd2..bfd3bababc4 100644
--- a/backends/vulkan/partitioner/supported_ops.py
+++ b/backends/vulkan/partitioner/supported_ops.py
@@ -70,6 +70,7 @@ def __contains__(self, op):
     exir_ops.edge.aten.sin.default,
     exir_ops.edge.aten.sqrt.default,
     exir_ops.edge.aten.tanh.default,
+    exir_ops.edge.aten._to_copy.default,
     # Matrix Multiplication
     exir_ops.edge.aten.bmm.default,
     exir_ops.edge.aten.mm.default,
diff --git a/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp b/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp
new file mode 100644
index 00000000000..35280eb2e1e
--- /dev/null
+++ b/backends/vulkan/runtime/graph/ops/impl/ToCopy.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+namespace vkcompute {
+
+void resize_to_copy_op_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr self = graph->get_tensor(args[1].refs[0]);
+
+  out->virtual_resize(self->sizes());
+}
+
+void add_to_copy_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
+  ValueRef arg = prepack_if_tensor_ref(graph, in);
+  vTensorPtr t_in = graph.get_tensor(in);
+  vTensorPtr t_out = graph.get_tensor(out);
+  std::set<vkapi::ScalarType> supported_types = {
+      vkapi::ScalarType::Float, vkapi::ScalarType::Half};
+  VK_CHECK_COND(
+      supported_types.find(t_in->dtype()) != supported_types.end() &&
+          supported_types.find(t_out->dtype()) != supported_types.end(),
+      "Unsupported dtype for to_copy, only Float and Half are currently supported");
+  graph.execute_nodes().emplace_back(new BlitNode(graph, arg, out));
+}
+
+void to_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_to_copy_node(graph, args[0], args[7]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten._to_copy.default, to_copy);
+}
+} // namespace vkcompute
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
index 2eb3563ca29..b706bc7dd8e 100644
--- a/backends/vulkan/test/vulkan_compute_api_test.cpp
+++ b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -3206,3 +3206,104 @@ TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
     test_transpose_view_mm(2, 7, 17, 5, storage_type);
   }
 }
+
+void test_to_copy() {
+  GraphConfig config;
+  config.set_storage_type_override(utils::kTexture3D);
+  ComputeGraph graph(config);
+  int M = 8;
+  int N = 8;
+  int K = 8;
+  // Build graph
+  IOValueRef in = graph.add_input_tensor(
+      {1, M, N, K},
+      vkapi::kFloat,
+      utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
+
+  std::vector<float> data_in =
+      create_random_float_buffer(M * N * K, -1024, 1024);
+  graph.copy_into_staging(in.staging, data_in.data(), data_in.size());
+
+  IOValueRef out;
+  out.value = graph.add_tensor(
+      {1, M, N, K},
+      vkapi::kHalf,
+      utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
+
+  auto op = VK_GET_OP_FN("aten._to_copy.default");
+  op(graph,
+     {in.value,
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      out.value});
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+  graph.propagate_resize();
+  graph.execute();
+
+  std::vector<torch::executor::Half> output_data(graph.numel_of(out.value));
+  graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
+
+  EXPECT_EQ(data_in.size(), output_data.size());
+
+  float mse_ex = 0.0f;
+  float mse_vk = 0.0f;
+
+  // check results
+  for (size_t i = 0; i < output_data.size(); ++i) {
+    float input = data_in[i];
+    torch::executor::Half expected_output =
+        static_cast<torch::executor::Half>(input);
+    uint16_t* expected_bits = reinterpret_cast<uint16_t*>(&expected_output);
+    torch::executor::Half output = output_data[i];
+    uint16_t* output_bits = reinterpret_cast<uint16_t*>(&output);
+
+    std::cout << "input = " << input << "(0b"
+              << std::bitset<32>(*reinterpret_cast<uint32_t*>(&input))
+              << "), expected output = " << expected_output << "(0b"
+              << std::bitset<16>(*expected_bits)
+              << "), received output = " << output << "(0b"
+              << std::bitset<16>(*output_bits) << ")" << std::endl;
+
+    // Note: torch::executor::Half rounds to the nearest fp16 value when
+    // converting from fp32, whereas most driver implementations of Vulkan's
+    // OpFConvert just truncate the extra mantissa bits for performance
+    // (rounding introduces a conditional).
+    // Example:
+    //   INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011},
+    //     mantissa{0b10010011111101111100111}),
+    //   TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011},
+    //     mantissa{0b1001010000}),
+    //   VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011},
+    //     mantissa{0b1001001111})
+    // The Vulkan mantissa exactly matches the first 10 bits of the input's
+    // 23-bit mantissa. But since the 11th bit is 1, the torch half output is
+    // rounded up (essentially adding 1):
+    //   Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000}
+
+    EXPECT_TRUE(
+        (*output_bits == *expected_bits) ||
+        /*rounding error*/ ((*output_bits + 1u) == *expected_bits));
+    mse_ex += std::pow(expected_output - input, 2);
+    mse_vk += std::pow(output - input, 2);
+  }
+
+  mse_ex /= output_data.size();
+  mse_vk /= output_data.size();
+  std::cout << "========================================================="
+            << std::endl;
+  std::cout << "mse_ex = " << mse_ex << ", mse_vk = " << mse_vk << std::endl;
+}
+
+TEST(VulkanComputeGraphOpsTest, test_to_copy) {
+  test_to_copy();
+}
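
Side note on the one-ULP tolerance used in the test above: torch::executor::Half converts fp32 to fp16 with rounding, while many Vulkan drivers implement OpFConvert by simply truncating the extra mantissa bits, so the two fp16 bit patterns can legitimately differ by one. The standalone sketch below is not part of the patch; the helper functions are invented for illustration, only handle normal in-range positive values, and approximate round-to-nearest by rounding ties up. It reproduces the 25.248 example from the test comment.

#include <bitset>
#include <cstdint>
#include <cstring>
#include <iostream>

// fp32 -> fp16 by dropping the low 13 mantissa bits (no rounding), which is
// what the test comment describes for typical OpFConvert implementations.
static uint16_t fp32_to_fp16_truncate(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
  int32_t exp = static_cast<int32_t>((bits >> 23) & 0xFFu) - 127 + 15;
  uint16_t mant = static_cast<uint16_t>((bits >> 13) & 0x3FFu);
  return static_cast<uint16_t>(sign | (exp << 10) | mant);
}

// fp32 -> fp16 rounding the dropped bits to nearest (ties rounded up for
// brevity), approximating what torch::executor::Half does.
static uint16_t fp32_to_fp16_round(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  uint16_t truncated = fp32_to_fp16_truncate(f);
  uint32_t dropped = bits & 0x1FFFu; // the 13 discarded mantissa bits
  return static_cast<uint16_t>(truncated + (dropped >= 0x1000u ? 1u : 0u));
}

int main() {
  float input = 25.248f;
  uint16_t truncated = fp32_to_fp16_truncate(input);
  uint16_t rounded = fp32_to_fp16_round(input);
  std::cout << "truncated (Vulkan-style): 0b" << std::bitset<16>(truncated) << "\n"
            << "rounded   (Torch-style) : 0b" << std::bitset<16>(rounded) << "\n"
            << "bit-pattern delta       : " << (rounded - truncated) << std::endl;
  return 0;
}

Run on its own, this should print the two mantissas from the comment (0b1001001111 truncated vs 0b1001010000 rounded) with a bit-pattern delta of one, which is exactly the slack the EXPECT_TRUE in the test allows.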