diff --git a/sw/dnn/fused_linear_gelu/data/params.json b/sw/dnn/fused_linear_gelu/data/params.json new file mode 100644 index 000000000..60e068341 --- /dev/null +++ b/sw/dnn/fused_linear_gelu/data/params.json @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +{ + setup_ssr: 1, + parallelize_m: 0, + parallelize_k: 0, + m_tiles: 2, // number of tiles in M dimension + n_tiles: 1, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + transa: false, + transb: true, // must be true for SIMD + M: 16, + N: 16, + K: 16, + alpha: 1, + beta: 0, + a_gelu: -0.2888, + b_gelu: -1.769, + flg_fp: "fused_linear_gelu_fp64_naive" +} \ No newline at end of file diff --git a/sw/dnn/fused_linear_gelu/scripts/datagen.py b/sw/dnn/fused_linear_gelu/scripts/datagen.py new file mode 100755 index 000000000..c651b7cca --- /dev/null +++ b/sw/dnn/fused_linear_gelu/scripts/datagen.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +# Copyright 2022 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Authors: Viviane Potocnik + +import numpy as np +import os +import re +import pyflexfloat as ff +import sys +import torch + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import DataGen, format_array_declaration, format_struct_definition, \ + format_array_definition, format_ifdef_wrapper # noqa: E402 + + +np.random.seed(42) + +def sigmoid_gelu(x, x_shape, a, b): + # print(dir(ff.np)) + # print(type(x)) + x = ff.np.float32(x) + # reshape and convert back to torch tensor + # x = x.reshape(x_shape) + x = torch.tensor(x) + result = torch.sign(x) * (a * (torch.clamp(torch.abs(x), max=-b) + b)**2 + 1) + # return as same type as input + return ff.array(result.numpy(), x_shape) + + +class FlgDataGen(DataGen): + + # AXI splits bursts crossing 4KB address boundaries. 
To minimize + # the occurrence of these splits the data should be aligned to 4KB + BURST_ALIGNMENT = 4096 + + def exact_golden_model(self, alpha, a, b, beta, c, a_gelu, b_gelu): + M, N, K = a.shape[0], b.shape[1], b.shape[0] + result = beta * c + result_shape = result.shape + for m in range(M): + for n in range(N): + for k in range(K): + result[m][n] += a[m][k] * b[k][n] + result[m][n] = sigmoid_gelu(result[m][n], result_shape, a_gelu, b_gelu) + return result + + def infer_implementation(self, flg_fp): + # flg_fp: "fused_linear_gelu_fp64_opt" + # create a regex with fp__ + prec, impl = re.search(r'fused_linear_gelu_fp(\d+)_(\w+)', flg_fp).group(1, 2) + return (int(prec) / 8), impl + + def validate_config(self, flg_fp, parallelize_m, + parallelize_k, m_tiles, n_tiles, k_tiles, transa, + transb, M, N, K, beta, **kwargs): + frac_m = M / m_tiles + frac_n = N / n_tiles + frac_k = K / k_tiles + + dtype, impl = self.infer_implementation(flg_fp) + + # Calculate total TCDM occupation + # Note: doesn't account for double buffering + prec = data_utils.size_from_precision_t(dtype) + a_size = frac_m * frac_k * prec + b_size = frac_k * frac_n * prec + c_size = frac_m * frac_n * prec + total_size = a_size + total_size += b_size + total_size += c_size + data_utils.validate_tcdm_footprint(total_size) + + assert (M % m_tiles) == 0, 'M is not an integer multiple of tile size' + assert (N % n_tiles) == 0, 'N is not an integer multiple of tile size' + assert (K % k_tiles) == 0, 'K is not an integer multiple of tile size' + assert not (parallelize_m and parallelize_k), 'Cannot parallelize K and M simultaneously' + assert not transa, 'SIMD kernels don\'t support transposed A matrix' + assert (dtype == 8) or (impl == 'baseline') or (impl == 'naive') \ + or transb, 'Optimized SIMD kernels only support transposed B matrix' + assert not transb or n_tiles == 1, 'Tiling in the N dimension not supported' \ + ' if B is transposed' + assert not transb or k_tiles == 1, 'Tiling in the K dimension not supported' \ + ' if B is transposed' + assert (impl == 'baseline') or (impl == 'naive') or frac_n >= 8, \ + 'N dimension of tile size must be greater or equal to the unrolling factor (8) ' \ + 'when using optimized kernels' + assert beta == 0 or beta == 1, 'Only values of 0 or 1 supported for beta' + assert not (dtype == 8 and impl == "baseline"), 'No baseline implemented' \ + ' for FP64 (switch to NAIVE)' + assert not (((dtype == 8) or (dtype == 4)) and impl == "opt_ex"), \ + 'Expanding GEMM kernels' \ + ' not supported for FP64 and FP32' + assert not (dtype == 1 and impl == "opt"), 'FP8 not supported in' \ + ' optimized implementation' \ + ' (switch to opt_ex)' + + def emit_header(self, **kwargs): + header = [super().emit_header()] + + # Validate parameters + self.validate_config(**kwargs) + + M, N, K = kwargs['M'], kwargs['N'], kwargs['K'] + + prec, _ = self.infer_implementation(kwargs['flg_fp']) + + ff_desc = data_utils.ff_desc_from_precision_t(prec) + ctype = data_utils.ctype_from_precision_t(prec) + + a = ff.array(np.random.rand(M, K), ff_desc) + b = ff.array(np.random.rand(K, N), ff_desc) + c = ff.array(np.random.rand(M, N), ff_desc) + + # a = -0.2888 + # b = -1.769 + a_gelu = kwargs['a_gelu'] + b_gelu = kwargs['b_gelu'] + result = self.exact_golden_model(1, a, b, kwargs['beta'], c, a_gelu, b_gelu) + + # Store matrices in transposed form if requested + a = a.T if kwargs['transa'] else a + b = b.T if kwargs['transb'] else b + + a_uid = 'a' + b_uid = 'b' + c_uid = 'c' + + cfg = { + 'prec': prec, + **kwargs, + 'a_gelu': 
a_gelu, + 'b_gelu': b_gelu, + 'a': a_uid, + 'b': b_uid, + 'c': c_uid, + } + + a = a.flatten() + b = b.flatten() + c = c.flatten() + + header += [format_array_declaration(ctype, a_uid, a.shape)] + header += [format_array_declaration(ctype, b_uid, b.shape)] + header += [format_array_declaration(ctype, c_uid, c.shape)] + header += [format_struct_definition('flg_args_t', 'flg_args', cfg)] + header += [format_array_definition(ctype, a_uid, a, + section=kwargs['section'])] + header += [format_array_definition(ctype, b_uid, b, + section=kwargs['section'])] + header += [format_array_definition(ctype, c_uid, c, + section=kwargs['section'])] + result_def = format_array_definition(ctype, 'result', result.flatten()) + header += [format_ifdef_wrapper('BIST', result_def)] + header = '\n\n'.join(header) + + return header + + +if __name__ == "__main__": + sys.exit(FlgDataGen().main()) diff --git a/sw/dnn/fused_linear_gelu/src/fused_linear_gelu.h b/sw/dnn/fused_linear_gelu/src/fused_linear_gelu.h new file mode 100644 index 000000000..c6b5547cb --- /dev/null +++ b/sw/dnn/fused_linear_gelu/src/fused_linear_gelu.h @@ -0,0 +1,277 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik + +#include + +#include "snrt.h" +#include "blas.h" + +#pragma once + +#include "fused_linear_gelu_fp32.h" +#include "fused_linear_gelu_fp64.h" + +/* + * @struct flg_args_t + * @brief This structure contains all parameters necessary + * for computing the fused linear GELU layer. + * + * @param flg_cfg: Pointer to the Fused Linear GELU configuration structure. + * @param a_gelu: Pointer to the alpha parameter of the i-GELU function. + * @param b_gelu: Pointer to the beta parameter of the i-GELU function. 
+ */ + +typedef void (*flg_fp_t)(uint32_t m, uint32_t n, uint32_t k, void* a, + uint32_t lda, uint32_t transa, void* b, + uint32_t transb, uint32_t ldb, void* c, uint32_t ldc, + uint32_t beta, uint32_t setup_ss, double a_gelu, double b_gelu); + +typedef struct flg_args_struct { + double alpha; + uint32_t prec; + uint32_t setup_ssr; + uint32_t parallelize_m; + uint32_t parallelize_k; + uint32_t m_tiles; + uint32_t n_tiles; + uint32_t k_tiles; + uint32_t load_a; + uint32_t load_b; + uint32_t load_c; + uint32_t transa; + uint32_t transb; + uint32_t M; + uint32_t N; + uint32_t K; + void* a; + void* b; + uint32_t beta; + void* c; + void* flg_fp; + double a_gelu; + double b_gelu; +} flg_args_t; + + +void sc_st_flg(flg_args_t* flg_args, void* a, void* b, uint32_t beta, + void* c) { + flg_fp_t impl = (flg_fp_t)flg_args->flg_fp; + precision_t prec = flg_args->prec; + uint32_t setup_ssr = flg_args->setup_ssr; + uint32_t transa = flg_args->transa; + uint32_t transb = flg_args->transb; + + uint32_t m = flg_args->M / flg_args->m_tiles; + uint32_t n = flg_args->N / flg_args->n_tiles; + uint32_t k = flg_args->K / flg_args->k_tiles; + + uint32_t lda = k; + uint32_t ldb; + if (transb) { + ldb = k; + } else { + ldb = n; + } + uint32_t ldc = n; + + double alpha = flg_args->alpha; + + double a_gelu = flg_args->a_gelu; + double b_gelu = flg_args->b_gelu; + + if (snrt_is_compute_core()) { + const uint32_t compute_num = snrt_cluster_compute_core_num(); + const uint32_t compute_id = snrt_cluster_core_idx(); + + // Compute cores work not on contiguous blocks but on strided rows + uint32_t lda_strided = compute_num * lda; + uint32_t ldc_strided = compute_num * ldc; + + // Compute cores access A and C at offsets of one row from each other + uint32_t offsetA = compute_id * lda * prec; + uint32_t offsetC = compute_id * ldc * prec; + + // Compute fraction of C rows every core computes + uint32_t frac_m = m / compute_num; + uint32_t rem_m = m % compute_num; + if (snrt_cluster_core_idx() < rem_m) frac_m++; + + if (frac_m > 0) + impl(frac_m, n, k, a + offsetA, lda_strided, transa, b, ldb, transb, + c + offsetC, ldc_strided, (float)beta, setup_ssr, a_gelu, b_gelu); + } +} + +// Derivate of the Multiple-cluster multiple-tile GEMM implementation. +// This function computes the fused linear GELU layer. 
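+// Each cluster iterates over its assigned M and K tiles: the DM core stages
+// the A, B and C tiles in TCDM, the compute cores run sc_st_flg() on the
+// local tile, and the result tile is DMA-copied back to the output matrix c.
+// When parallelize_k is set, the per-cluster partial C tiles are first
+// combined with snrt_global_reduction_dma() and only cluster 0 writes back.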
+int fused_linear_gelu(flg_args_t* args) { + flg_args_t* local_args = snrt_l1_next(); + + // Copy the arguments to local memory + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_args, args, sizeof(flg_args_t)); + snrt_dma_wait_all(); + } + snrt_cluster_hw_barrier(); + + uint32_t m = local_args->M; + uint32_t n = local_args->N; + uint32_t k = local_args->K; + precision_t prec = (precision_t)local_args->prec; + uint32_t setup_ssr = local_args->setup_ssr; + uint32_t parallelize_m = local_args->parallelize_m; + uint32_t parallelize_k = local_args->parallelize_k; + uint32_t m_tiles = local_args->m_tiles; + uint32_t n_tiles = local_args->n_tiles; + uint32_t k_tiles = local_args->k_tiles; + uint32_t load_a = local_args->load_a; + uint32_t load_b = local_args->load_b; + uint32_t load_c = local_args->load_c; + uint32_t transa = local_args->transa; + uint32_t transb = local_args->transb; + double alpha = local_args->alpha; + void* a = local_args->a; + void* b = local_args->b; + uint32_t beta = local_args->beta; + void* c = local_args->c; + + // Calculate tile sizes + uint32_t frac_m = m / m_tiles; + uint32_t frac_n = n / n_tiles; + uint32_t frac_k = k / k_tiles; + uint32_t frac_a = frac_m * frac_k; + uint32_t frac_c = frac_m * frac_n; + uint32_t size_frac_a = frac_a * prec; + uint32_t size_frac_b = frac_k * frac_n * prec; + uint32_t size_frac_c = frac_c * prec; + + // Allocate space in TCDM + void *local_a, *local_b, *local_c_partial, *local_c; + void* heap_ptr = (void*)local_args + sizeof(gemm_args_t); + if (load_a) { + local_a = heap_ptr; + heap_ptr += size_frac_a; + } else + local_a = a; + if (load_b) { + local_b = heap_ptr; + heap_ptr += size_frac_b; + } else + local_b = b; + if (load_c) { + local_c_partial = heap_ptr; + heap_ptr += size_frac_c; + } else + local_c_partial = c; + local_c = parallelize_k ? heap_ptr : local_c_partial; + + // Assign m and k tiles to clusters + uint32_t m_tiles_per_cluster = + parallelize_m ? m_tiles / snrt_cluster_num() : m_tiles; + uint32_t k_tiles_per_cluster = + parallelize_k ? k_tiles / snrt_cluster_num() : k_tiles; + + // Every cluster iterates over its subset of m tiles + for (uint32_t m_tile = 0; m_tile < m_tiles_per_cluster; m_tile++) { + for (uint32_t n_tile = 0; n_tile < n_tiles; n_tile++) { + // Calculate absolute m tile index for the current cluster + uint32_t abs_m_tile_idx = + !parallelize_m + ? m_tile + : snrt_cluster_idx() * m_tiles_per_cluster + m_tile; + + // k accumulation loop + for (uint32_t k_tile = 0; k_tile < k_tiles_per_cluster; k_tile++) { + // Calculate absolute k tile index for the current cluster + uint32_t abs_k_tile_idx = + !parallelize_k + ? 
k_tile + : snrt_cluster_idx() * k_tiles_per_cluster + k_tile; + + // Copy data in TCDM + if (snrt_is_dm_core()) { + if (load_a) { + snrt_dma_load_2d_tile(local_a, a, abs_m_tile_idx, + abs_k_tile_idx, frac_m, frac_k, k, + prec); + } + if (load_b) { + snrt_dma_load_2d_tile(local_b, b, abs_k_tile_idx, + n_tile, frac_k, frac_n, n, prec); + } + // C tile is loaded only upon first iteration, then the C + // array will contain the partial results from the + // previous iteration + if (load_c) { + if (abs_k_tile_idx == 0) { + snrt_dma_load_2d_tile(local_c_partial, c, + abs_m_tile_idx, n_tile, + frac_m, frac_n, n, prec); + } else if (k_tile == 0) { + // Clusters other than the first need to initialize + // the C array to zero in their first iteration + snrt_dma_start_1d(local_c_partial, + (void*)snrt_zero_memory_ptr(), + frac_m * frac_n * prec); + } + } + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + // Compute + if (!snrt_is_dm_core()) { + uint32_t start_cycle = snrt_mcycle(); + + volatile uint32_t ldb = frac_n; + volatile uint32_t ldc = frac_n; + + if (transb) { + ldb = frac_k; + } + + // In the first K iteration we accumulate with the C matrix + // scaled by beta, in successive iterations we accumulate + // the previous partial result for the tile + uint32_t beta_k; + if (abs_k_tile_idx == 0) { + beta_k = beta; + } else { + beta_k = 1; + } + + sc_st_flg(local_args, local_a, local_b, beta_k, + local_c_partial); + + uint32_t end_cycle = snrt_mcycle(); + } + + snrt_cluster_hw_barrier(); + } + + // Add the partial results from the various clusters together in a + // logarithmic reduction fashion + if (parallelize_k) { + snrt_global_reduction_dma((double*)local_c, + (double*)local_c_partial, + frac_m * frac_n); + } + + // Copy data out of TCDM + if (snrt_is_dm_core()) { + // If parallelize_k, then only cluster 0 must writeback + if ((snrt_cluster_idx() == 0) || !parallelize_k) { + snrt_dma_store_2d_tile(c, local_c, abs_m_tile_idx, n_tile, + frac_m, frac_n, n, prec); + snrt_dma_wait_all(); + } + } + } + } + + return 0; +} \ No newline at end of file diff --git a/sw/dnn/fused_linear_gelu/src/fused_linear_gelu_fp32.h b/sw/dnn/fused_linear_gelu/src/fused_linear_gelu_fp32.h new file mode 100644 index 000000000..65fdea5c7 --- /dev/null +++ b/sw/dnn/fused_linear_gelu/src/fused_linear_gelu_fp32.h @@ -0,0 +1,315 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik + +void fused_linear_gelu_fp32_naive(uint32_t M, uint32_t N, uint32_t K, void* A_p, + uint32_t ldA, uint32_t ta, void* B_p, uint32_t ldB, + uint32_t tb, void* C_p, uint32_t ldC, uint32_t BETA, + uint32_t setup_SSR, float a_gelu, float b_gelu) { + + float* A = (float*)A_p; + float* B = (float*)B_p; + float* C = (float*)C_p; + + if (!ta && !tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + float c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k + m * ldA] * B[k * ldB + n]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu); + } + } + } else if (ta && !tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + float c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k * M * ldA + m * ldA] * B[k * ldB + n]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu); + } + } + } else if (!ta && tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + float c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = c0; + } + } + } else { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + float c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k * M * ldA + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu); + } + } + } +} + +void fused_linear_gelu_fp32_naive_unrolled(uint32_t M, uint32_t N, uint32_t K, void* A_p, + uint32_t ldA, uint32_t ta, void* B_p, + uint32_t ldB, uint32_t tb, void* C_p, + uint32_t ldC, uint32_t BETA, uint32_t setup_SSR, + float a_gelu, float b_gelu) { + + float* A = (float*)A_p; + float* B = (float*)B_p; + float* C = (float*)C_p; + + float c0 = 0.0f; + float c1 = 0.0f; + float c2 = 0.0f; + float c3 = 0.0f; + if (!ta && !tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + if (BETA == 0) { + c0 = 0.0f; + } else { + c0 = BETA * C[m * ldC + n]; + } + c1 = 0.0f; + c2 = 0.0f; + c3 = 0.0f; + for (uint32_t k = 0; k < K; k += 4) { + c0 += A[(k + 0) + m * ldA] * B[(k + 0) * ldB + n]; + c1 += A[(k + 1) + m * ldA] * B[(k + 1) * ldB + n]; + c2 += A[(k + 2) + m * ldA] * B[(k + 2) * ldB + n]; + c3 += A[(k + 3) + m * ldA] * B[(k + 3) * ldB + n]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c1, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c2, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c3, a_gelu, b_gelu); + } + } + } else if (ta && !tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + if (BETA == 0) { + c0 = 0.0f; + } else { + c0 = BETA * C[m * ldC + n]; + } + c1 = 0.0f; + c2 = 0.0f; + c3 = 0.0f; + for (uint32_t k = 0; k < K; k += 4) { + c0 += A[(k + 0) * M * ldA + m * ldA] * B[(k + 0) * ldB + n]; + c1 += A[(k + 1) * M * ldA + m * ldA] * B[(k + 1) * ldB + n]; + c2 += A[(k + 2) * M * ldA + m * ldA] * B[(k + 2) * ldB + n]; + c3 += A[(k + 3) * M * ldA + m * ldA] * B[(k + 3) * ldB + n]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c1, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c2, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c3, a_gelu, b_gelu); + } + } + } else if (!ta && tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + if (BETA == 0) { + c0 = 0.0f; + } else { + c0 = BETA * C[m * ldC + n]; + } + c1 = 0.0f; + c2 = 0.0f; + c3 = 0.0f; + 
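+                // Four independent accumulators hide the FMA latency; K is
+                // assumed to be a multiple of 4 (no remainder loop).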
for (uint32_t k = 0; k < K; k += 4) { + c0 += A[(k + 0) + m * ldA] * B[(k + 0) + n * ldB]; + c1 += A[(k + 1) + m * ldA] * B[(k + 1) + n * ldB]; + c2 += A[(k + 2) + m * ldA] * B[(k + 2) + n * ldB]; + c3 += A[(k + 3) + m * ldA] * B[(k + 3) + n * ldB]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c1, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c2, a_gelu, b_gelu) + + sigmoid_gelu_fp32(c3, a_gelu, b_gelu); + } + } + } else { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + register float c0 = BETA * C[m * ldC + n]; + for (uint32_t k = 0; k < K; k++) { + c0 += A[k * M * ldA + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c0, a_gelu, b_gelu); + } + } + } +} + +void fused_linear_gelu_fp32_opt(uint32_t M, uint32_t N, uint32_t K, void* A_p, uint32_t ldA, + uint32_t ta, void* B_p, uint32_t ldB, uint32_t tb, void* C_p, + uint32_t ldC, uint32_t BETA, uint32_t setup_SSR, float a_gelu, float b_gelu) { + + // cast void pointers to float pointers + float* A = (float*)A_p; + float* B = (float*)B_p; + float* C = (float*)C_p; + // Unrolling factor of most inner loop. + // Should be at least as high as the FMA delay + // for maximum utilization + const uint32_t unroll = 8; + + // SSR strides and bounds only have to be configured + // once in the beginning + if (setup_SSR) { + uint32_t ssr0_b[4] = {unroll, K / 2, N / unroll, M}; + uint32_t ssr0_i[4] = {0, sizeof(float) * 2, 0, sizeof(float) * ldA}; + + uint32_t ssr1_b[4] = {unroll, K / 2, N / unroll, M}; + uint32_t ssr1_i[4] = {sizeof(float) * ldB, sizeof(float) * 2, + sizeof(float) * unroll * ldB, 0}; + + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], ssr1_i[3]); + } + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); + snrt_ssr_enable(); + + // Kernel progresses by 2 values each step + const uint32_t n_frep = K / 2 - 1; + + for (uint32_t m = 0; m < M; m++) { + uint32_t n = 0; + for (uint32_t n0 = 0; n0 < N / unroll; n0++) { + float* _C = &C[m * ldC + n / 2]; + const register float zero = 0.0; + v2f32 c[unroll], reduce_reg[unroll]; + + asm volatile( + "beqz %[BETA], 1f \n" + // Load intermediate results + "flw %[reduce_reg0], 0(%[C]) \n" + "flw %[reduce_reg1], 4(%[C]) \n" + "flw %[reduce_reg2], 8(%[C]) \n" + "flw %[reduce_reg3], 12(%[C]) \n" + "flw %[reduce_reg4], 16(%[C]) \n" + "flw %[reduce_reg5], 20(%[C]) \n" + "flw %[reduce_reg6], 24(%[C]) \n" + "flw %[reduce_reg7], 28(%[C]) \n" + // Pack intermediate results into SIMD vector + "vfcpka.s.s %[reduce_reg0], %[reduce_reg0], %[zero]\n" + "vfcpka.s.s %[reduce_reg1], %[reduce_reg1], %[zero]\n" + "vfcpka.s.s %[reduce_reg2], %[reduce_reg2], %[zero]\n" + "vfcpka.s.s %[reduce_reg3], %[reduce_reg3], %[zero]\n" + "vfcpka.s.s %[reduce_reg4], %[reduce_reg4], %[zero]\n" + "vfcpka.s.s %[reduce_reg5], %[reduce_reg5], %[zero]\n" + "vfcpka.s.s %[reduce_reg6], %[reduce_reg6], %[zero]\n" + "vfcpka.s.s %[reduce_reg7], %[reduce_reg7], %[zero]\n" + "j 2f \n" + "1: \n" + // Initialize SIMD vector with zeros + "vfcpka.s.s %[reduce_reg0], %[zero], %[zero]\n" + "vfcpka.s.s %[reduce_reg1], %[zero], %[zero]\n" + "vfcpka.s.s %[reduce_reg2], %[zero], %[zero]\n" + "vfcpka.s.s %[reduce_reg3], %[zero], %[zero]\n" + "vfcpka.s.s %[reduce_reg4], %[zero], 
%[zero]\n" + "vfcpka.s.s %[reduce_reg5], %[zero], %[zero]\n" + "vfcpka.s.s %[reduce_reg6], %[zero], %[zero]\n" + "vfcpka.s.s %[reduce_reg7], %[zero], %[zero]\n" + + "2: \n" + // Don't accumulate in first iteration + "vfmul.s %[c0], ft1, ft0 \n" + "vfmul.s %[c1], ft1, ft0 \n" + "vfmul.s %[c2], ft1, ft0 \n" + "vfmul.s %[c3], ft1, ft0 \n" + "vfmul.s %[c4], ft1, ft0 \n" + "vfmul.s %[c5], ft1, ft0 \n" + "vfmul.s %[c6], ft1, ft0 \n" + "vfmul.s %[c7], ft1, ft0 \n" + // frep over MACs + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "vfmac.s %[c0], ft1, ft0 \n" + "vfmac.s %[c1], ft1, ft0 \n" + "vfmac.s %[c2], ft1, ft0 \n" + "vfmac.s %[c3], ft1, ft0 \n" + "vfmac.s %[c4], ft1, ft0 \n" + "vfmac.s %[c5], ft1, ft0 \n" + "vfmac.s %[c6], ft1, ft0 \n" + "vfmac.s %[c7], ft1, ft0 \n" + // Sum-reduce vector + "vfsum.s %[reduce_reg0], %[c0] \n" + "vfsum.s %[reduce_reg1], %[c1] \n" + "vfsum.s %[reduce_reg2], %[c2] \n" + "vfsum.s %[reduce_reg3], %[c3] \n" + "vfsum.s %[reduce_reg4], %[c4] \n" + "vfsum.s %[reduce_reg5], %[c5] \n" + "vfsum.s %[reduce_reg6], %[c6] \n" + "vfsum.s %[reduce_reg7], %[c7] \n" + // Pack results together again into vectors + "vfcpka.s.s %[c0], %[reduce_reg0], %[reduce_reg1] \n" + "vfcpka.s.s %[c1], %[reduce_reg2], %[reduce_reg3] \n" + "vfcpka.s.s %[c2], %[reduce_reg4], %[reduce_reg5] \n" + "vfcpka.s.s %[c3], %[reduce_reg6], %[reduce_reg7] \n" + : [ c0 ] "+f"(c[0]), [ c1 ] "+f"(c[1]), [ c2 ] "+f"(c[2]), + [ c3 ] "+f"(c[3]), [ c4 ] "+f"(c[4]), [ c5 ] "+f"(c[5]), + [ c6 ] "+f"(c[6]), [ c7 ] "+f"(c[7]), + [ reduce_reg0 ] "+f"(reduce_reg[0]), + [ reduce_reg1 ] "+f"(reduce_reg[1]), + [ reduce_reg2 ] "+f"(reduce_reg[2]), + [ reduce_reg3 ] "+f"(reduce_reg[3]), + [ reduce_reg4 ] "+f"(reduce_reg[4]), + [ reduce_reg5 ] "+f"(reduce_reg[5]), + [ reduce_reg6 ] "+f"(reduce_reg[6]), + [ reduce_reg7 ] "+f"(reduce_reg[7]) + : [ C ] "r"(_C), [ zero ] "f"(zero), [ n_frep ] "r"(n_frep - 1), + [ unroll ] "i"(unroll), [ BETA ] "r"(BETA) + : "ft0", "ft1", "ft2"); + + // Store results + c[0][0] = sigmoid_gelu_fp32(c[0][0], a_gelu, b_gelu); + c[0][1] = sigmoid_gelu_fp32(c[1][0], a_gelu, b_gelu); + c[1][0] = sigmoid_gelu_fp32(c[2][0], a_gelu, b_gelu); + c[1][1] = sigmoid_gelu_fp32(c[3][0], a_gelu, b_gelu); + c[2][0] = sigmoid_gelu_fp32(c[4][0], a_gelu, b_gelu); + c[2][1] = sigmoid_gelu_fp32(c[5][0], a_gelu, b_gelu); + c[3][0] = sigmoid_gelu_fp32(c[6][0], a_gelu, b_gelu); + c[3][1] = sigmoid_gelu_fp32(c[7][0], a_gelu, b_gelu); + ((v2f32*)_C)[0] = c[0]; + ((v2f32*)_C)[1] = c[1]; + ((v2f32*)_C)[2] = c[2]; + ((v2f32*)_C)[3] = c[3]; + + // progress by 2 columns each iteration of the loop + n += unroll * 2; + } + + // Clean up of leftover columns + snrt_ssr_disable(); + + for (; n < N; n++) { + float c = BETA ? C[m * ldC + n] : 0.0; + for (uint32_t k = 0; k < K; k++) { + c += A[k + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = sigmoid_gelu_fp32(c, a_gelu, b_gelu); + } + + snrt_ssr_enable(); + } + + snrt_ssr_disable(); +} diff --git a/sw/dnn/fused_linear_gelu/src/fused_linear_gelu_fp64.h b/sw/dnn/fused_linear_gelu/src/fused_linear_gelu_fp64.h new file mode 100644 index 000000000..b5969a28c --- /dev/null +++ b/sw/dnn/fused_linear_gelu/src/fused_linear_gelu_fp64.h @@ -0,0 +1,189 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik + +void fused_linear_gelu_fp64_naive(uint32_t M, uint32_t N, uint32_t K, void* A_p, + uint32_t ldA, uint32_t ta, void* B_p, uint32_t ldB, + uint32_t tb, void* C_p, uint32_t ldC, uint32_t BETA, + uint32_t setup_SSR, double a_gelu, double b_gelu) { + + double* A = (double*)A_p; + double* B = (double*)B_p; + double* C = (double*)C_p; + + if (!ta && !tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + double c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k + m * ldA] * B[k * ldB + n]; + } + C[m * ldC + n] = sigmoid_gelu_fp64(c0, a_gelu, b_gelu); + } + } + } else if (ta && !tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + double c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k * M * ldA + m * ldA] * B[k * ldB + n]; + } + C[m * ldC + n] = sigmoid_gelu_fp64(c0, a_gelu, b_gelu); + } + } + } else if (!ta && tb) { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + double c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = sigmoid_gelu_fp64(c0, a_gelu, b_gelu); + } + } + } else { + for (uint32_t m = 0; m < M; m++) { + for (uint32_t n = 0; n < N; n++) { + double c0 = multiply_opt(C[m * ldC + n], BETA); + for (uint32_t k = 0; k < K; k++) { + c0 += A[k * M * ldA + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = sigmoid_gelu_fp64(c0, a_gelu, b_gelu); + } + } + } +} + +void fused_linear_gelu_fp64_opt(uint32_t M, uint32_t N, uint32_t K, void* A_p, + uint32_t ldA, uint32_t ta, void* B_p, uint32_t ldB, + uint32_t tb, void* C_p, uint32_t ldC, uint32_t BETA, + uint32_t setup_SSR, double a_gelu, double b_gelu) { + + double* A = (double*)A_p; + double* B = (double*)B_p; + double* C = (double*)C_p; + + // Unrolling factor of most inner loop. 
+ // Should be at least as high as the FMA delay + // for maximum utilization + const uint32_t unroll = 8; + + // SSR strides and bounds only have to be configured + // once in the beginning + if (setup_SSR) { + // First matrix is stored in transposed format + if (ta) { + const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; + const uint32_t ssr0_i[4] = {0, 8 * ldA, 0, 8 * 8}; + + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + } else { + const uint32_t ssr0_b[4] = {unroll, K, N / unroll, M}; + const uint32_t ssr0_i[4] = {0, 8, 0, 8 * ldA}; + + snrt_ssr_loop_3d(SNRT_SSR_DM0, ssr0_b[1], ssr0_b[2], ssr0_b[3], + ssr0_i[1], ssr0_i[2], ssr0_i[3]); + snrt_ssr_repeat(SNRT_SSR_DM0, unroll); + } + + // Second matrix is stored in transposed format + if (tb) { + const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; + const uint32_t ssr1_i[4] = {8 * ldB, 8, 8 * ldB * unroll, 0}; + + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], + ssr1_i[3]); + } else { + const uint32_t ssr1_b[4] = {unroll, K, N / unroll, M}; + const uint32_t ssr1_i[4] = {8, 8 * ldB, 8 * unroll, 0}; + + snrt_ssr_loop_4d(SNRT_SSR_DM1, ssr1_b[0], ssr1_b[1], ssr1_b[2], + ssr1_b[3], ssr1_i[0], ssr1_i[1], ssr1_i[2], + ssr1_i[3]); + } + } + + // SSR start address need to be configured each time + snrt_ssr_read(SNRT_SSR_DM0, SNRT_SSR_4D, A); + snrt_ssr_read(SNRT_SSR_DM1, SNRT_SSR_4D, B); + snrt_ssr_enable(); + + for (uint32_t m = 0; m < M; m++) { + uint32_t n = 0; + for (uint32_t n0 = 0; n0 < N / unroll; n0++) { + double c[unroll]; + + // Load intermediate result + if (BETA != 0) { + c[0] = C[m * ldC + n + 0]; + c[1] = C[m * ldC + n + 1]; + c[2] = C[m * ldC + n + 2]; + c[3] = C[m * ldC + n + 3]; + c[4] = C[m * ldC + n + 4]; + c[5] = C[m * ldC + n + 5]; + c[6] = C[m * ldC + n + 6]; + c[7] = C[m * ldC + n + 7]; + } else { + c[0] = 0.0; + c[1] = 0.0; + c[2] = 0.0; + c[3] = 0.0; + c[4] = 0.0; + c[5] = 0.0; + c[6] = 0.0; + c[7] = 0.0; + } + asm volatile( + "frep.o %[n_frep], %[unroll], 0, 0 \n" + "fmadd.d %[c0], ft0, ft1, %[c0] \n" + "fmadd.d %[c1], ft0, ft1, %[c1] \n" + "fmadd.d %[c2], ft0, ft1, %[c2] \n" + "fmadd.d %[c3], ft0, ft1, %[c3] \n" + "fmadd.d %[c4], ft0, ft1, %[c4] \n" + "fmadd.d %[c5], ft0, ft1, %[c5] \n" + "fmadd.d %[c6], ft0, ft1, %[c6] \n" + "fmadd.d %[c7], ft0, ft1, %[c7] \n" + : [ c0 ] "+f"(c[0]), [ c1 ] "+f"(c[1]), [ c2 ] "+f"(c[2]), + [ c3 ] "+f"(c[3]), [ c4 ] "+f"(c[4]), [ c5 ] "+f"(c[5]), + [ c6 ] "+f"(c[6]), [ c7 ] "+f"(c[7]) + : [ n_frep ] "r"(K - 1), [ unroll ] "i"(unroll) + : "ft0", "ft1", "ft2"); + + // Store results back + C[m * ldC + n + 0] = sigmoid_gelu_fp64(c[0], a_gelu, b_gelu); + C[m * ldC + n + 1] = sigmoid_gelu_fp64(c[1], a_gelu, b_gelu); + C[m * ldC + n + 2] = sigmoid_gelu_fp64(c[2], a_gelu, b_gelu); + C[m * ldC + n + 3] = sigmoid_gelu_fp64(c[3], a_gelu, b_gelu); + C[m * ldC + n + 4] = sigmoid_gelu_fp64(c[4], a_gelu, b_gelu); + C[m * ldC + n + 5] = sigmoid_gelu_fp64(c[5], a_gelu, b_gelu); + C[m * ldC + n + 6] = sigmoid_gelu_fp64(c[6], a_gelu, b_gelu); + C[m * ldC + n + 7] = sigmoid_gelu_fp64(c[7], a_gelu, b_gelu); + n += unroll; + } + + // Clean up of leftover columns + snrt_ssr_disable(); + + for (; n < N; n++) { + double c; + if (BETA != 0) { + c = C[m * ldC + n]; + } else { + c = 0.0; + } + for (uint32_t k = 0; k < K; k++) { + c += A[k + m * ldA] * B[k + n * ldB]; + } + C[m * ldC + n] = c; + } + + snrt_ssr_enable(); + } + + snrt_ssr_disable(); +} 
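For reference, the operation implemented by the kernels above reduces to the following host-side sketch. This is an illustration only and not part of the patch: the helper names igelu_ref() and flg_ref() are hypothetical, A and B are assumed row-major and non-transposed, and the activation mirrors the sigmoid_gelu_fp32() helper added to gelu.h, i.e. GELU(x) ~= 0.5 * x * (1 + L(x / sqrt(2))) with L(x) = sgn(x) * (a * (clip(|x|, max = -b) + b)^2 + 1) and the default coefficients a_gelu = -0.2888, b_gelu = -1.769 from data/params.json.

    #include <math.h>
    #include <stdint.h>

    // Hypothetical scalar reference of the i-GELU approximation.
    static double igelu_ref(double x, double a, double b) {
        double xs = fabs(x / sqrt(2.0));
        double arg = xs > -b ? -b : xs;  // clip(|x / sqrt(2)|, max = -b)
        double sign = x > 0.0 ? 1.0 : -1.0;
        return 0.5 * x * (1.0 + sign * (a * arg * arg + 1.0));
    }

    // Hypothetical reference of the fused layer: C = GELU(beta * C + A * B),
    // with A (M x K), B (K x N) and C (M x N) stored row-major.
    static void flg_ref(uint32_t M, uint32_t N, uint32_t K, const double *A,
                        const double *B, double *C, double beta,
                        double a_gelu, double b_gelu) {
        for (uint32_t m = 0; m < M; m++) {
            for (uint32_t n = 0; n < N; n++) {
                double acc = beta * C[m * N + n];
                for (uint32_t k = 0; k < K; k++) {
                    acc += A[m * K + k] * B[k * N + n];
                }
                C[m * N + n] = igelu_ref(acc, a_gelu, b_gelu);
            }
        }
    }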
diff --git a/sw/dnn/fused_linear_gelu/src/main.c b/sw/dnn/fused_linear_gelu/src/main.c new file mode 100644 index 000000000..b14d2f031 --- /dev/null +++ b/sw/dnn/fused_linear_gelu/src/main.c @@ -0,0 +1,20 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik + +#include +#include + +#include "blas.h" +#include "dnn.h" +#include "data.h" +// #include "fused_linear_gelu.h" + +#include "snrt.h" + +int main() { + fused_linear_gelu(&flg_args); + return 0; +} \ No newline at end of file diff --git a/sw/dnn/gelu/src/gelu.h b/sw/dnn/gelu/src/gelu.h index 5947f5a7e..ef19778e0 100644 --- a/sw/dnn/gelu/src/gelu.h +++ b/sw/dnn/gelu/src/gelu.h @@ -45,6 +45,18 @@ static inline double sigmoid_gelu_fp64(double x, float a, float b) { return x * 0.5 * (1 + l); } +static inline float sigmoid_gelu_fp32(float x, float a, float b) { + // L(x) = sgn(x) [a(clip(|x|, max = −b) + b)^2 + 1] + // a = -0.2888, b = -1.769 + float sign = x > 0.0 ? 1.0 : -1.0; + // float arg = clip(fabs(x), -b); + float sqrt2_inv = 1 / 1.4142135623730951; + float x_scaled = sqrt2_inv * x; + float arg = fabs(x_scaled) > -b ? -b : fabs(x_scaled); + float l = sign * (a * arg * arg + 1.0); + return x * 0.5 * (1 + l); +} + // Single-cluster GeLU static inline void gelu_fp64(double *input, double *output, uint32_t size) { if (snrt_is_compute_core()) { diff --git a/sw/dnn/mlp/data/params.json b/sw/dnn/mlp/data/params.json new file mode 100644 index 000000000..ac5ae1cde --- /dev/null +++ b/sw/dnn/mlp/data/params.json @@ -0,0 +1,57 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Solderpad Hardware License, Version 0.51, see LICENSE for details. +// SPDX-License-Identifier: SHL-0.51 + +{ + layernorm : { + input_dim: { + batch_size: 1, + seq_len: 16, + embeddings: 128 + }, + eps: 1e-5, + prec: "FP32", + n_tiles: 2, + implementation: "OPT" + }, + 'fused_linear_gelu' : { + setup_ssr: 1, + parallelize_m: 0, + parallelize_k: 0, + m_tiles: 2, // number of tiles in M dimension + n_tiles: 1, // number of tiles in N dimension + k_tiles: 1, // number of tiles in K dimension + load_a: 1, + load_b: 1, + load_c: 1, + transa: false, + transb: true, // must be true for SIMD + M: 16, + N: 64, + K: 128, + alpha: 1, + beta: 0, + a_gelu: -0.2888, + b_gelu: -1.769, + flg_fp: "fused_linear_gelu_fp64_naive" + }, + gemm : { + setup_ssr: 1, + parallelize_m: 0, + parallelize_k: 0, + m_tiles: 2, + n_tiles: 1, + k_tiles: 1, + load_a: 1, + load_b: 1, + load_c: 0, + transa: false, + transb: true, + M: 16, + N: 64, + K: 128, + alpha: 1, + beta: 0, + gemm_fp: "gemm_fp32_opt" + } +} \ No newline at end of file diff --git a/sw/dnn/mlp/scripts/datagen.py b/sw/dnn/mlp/scripts/datagen.py new file mode 100755 index 000000000..ece07f818 --- /dev/null +++ b/sw/dnn/mlp/scripts/datagen.py @@ -0,0 +1,324 @@ +#!/usr/bin/env python3 +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Viviane Potocnik + +import argparse +import pathlib +import json5 +import sys +import os +import torch +import numpy as np +import re + +import pyflexfloat as ff + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../util/sim/")) +import data_utils # noqa: E402 +from data_utils import emit_license, format_array_declaration, format_struct_definition, \ + format_array_definition, format_ifdef_wrapper # noqa: E402 + +torch.manual_seed(42) + +# AXI splits bursts crossing 4KB address boundaries. To minimize +# the occurrence of these splits the data should be aligned to 4KB +BURST_ALIGNMENT = 4096 + +def infer_flg_implementation(flg_fp): + # flg_fp: "fused_linear_gelu_fp64_opt" + # create a regex with fp__ + prec, impl = re.search(r'fused_linear_gelu_fp(\d+)_(\w+)', flg_fp).group(1, 2) + return (int(prec) / 8), impl + + +def infer_gemm_implementation(gemm_fp): + # gemm_fp: "gemm_fp64_opt" + # create a regex with fp__ + prec, impl = re.search(r'gemm_fp(\d+)_(\w+)', gemm_fp).group(1, 2) + return (int(prec) / 8), impl + + +def sigmoid_gelu(x, x_shape, a, b): + x = ff.np.float32(x) + x = torch.tensor(x) + result = torch.sign(x) * (a * (torch.clamp(torch.abs(x), max=-b) + b)**2 + 1) + return ff.array(result.numpy(), x_shape) + + +def layernorm_golden_model(ifmap, eps): + # Compute the mean and variance considering the last dimension (embeddings) + # The dimensions for mean and variance calculations are (-1,), meaning the last dimension. + # Keep the dimensions for broadcasting in normalization. + mean = np.mean(ifmap, axis=(-1,), keepdims=True) + diff = ifmap - mean + var = np.mean(diff*diff, axis=(-1,), keepdims=True) + + # Normalize the input tensor + ofmap = (ifmap - mean) / np.sqrt(var + eps) + + return ofmap + + +def flg_golden_model(self, alpha, a, b, beta, c, a_gelu, b_gelu): + M, N, K = a.shape[0], b.shape[1], b.shape[0] + result = beta * c + result_shape = result.shape + for m in range(M): + for n in range(N): + for k in range(K): + result[m][n] += a[m][k] * b[k][n] + result[m][n] = sigmoid_gelu(result[m][n], result_shape, a_gelu, b_gelu) + return result + + +def gemm_exact_golden_model(alpha, a, b, beta, c): + M, N, K = a.shape[0], b.shape[1], b.shape[0] + result = beta * c + for m in range(M): + for n in range(N): + for k in range(K): + result[m][n] += a[m][k] * b[k][n] + return result + + +def validate_config(**kwargs): + """ + Validation of LayerNorm configuration parameters + """ + # Aliases + batch_size = kwargs['layernorm']['input_dim']['batch_size'] + seq_len = kwargs['layernorm']['input_dim']['seq_len'] + embeddings = kwargs['layernorm']['input_dim']['embeddings'] + + # Calculate total TCDM occupation + prec = data_utils.size_from_precision_t(kwargs['layernorm']['prec']) + tiled_seq_len = seq_len / kwargs['layernorm']['n_tiles'] + total_size = batch_size * tiled_seq_len * embeddings * prec + data_utils.validate_tcdm_footprint(total_size) + + assert kwargs['layernorm']['input_dim']['seq_len'] % kwargs['layernorm']['n_tiles'] == 0, 'Input dimension is not' \ + ' an integer multiple of' \ + ' tile size' + assert kwargs['layernorm']['prec'] != "FP64", 'FP64 not supported' + assert not (kwargs['layernorm']['implementation'] == "BASELINE"), 'No baseline implementations' \ + ' (switch to NAIVE)' + assert not (kwargs['layernorm']['implementation'] == + "OPT_EX"), 'Expanding layernorm kernels not supported' + assert not (kwargs['layernorm']['prec'] == "FP8" and kwargs['layernorm']['implementation'] == "NAIVE"), 'FP8 not ' \ + 
'supported in' \ + 'naive ' \ + 'implementation' + + """ + Validation of GEMM configuration parameters + """ + M = kwargs['gemm']['M'] + N = kwargs['gemm']['N'] + K = kwargs['gemm']['K'] + m_tiles = kwargs['gemm']['m_tiles'] + n_tiles = kwargs['gemm']['n_tiles'] + k_tiles = kwargs['gemm']['k_tiles'] + parallelize_m = kwargs['gemm']['parallelize_m'] + parallelize_k = kwargs['gemm']['parallelize_k'] + transa = kwargs['gemm']['transa'] + transb = kwargs['gemm']['transb'] + beta = kwargs['gemm']['beta'] + + frac_m = M / m_tiles + frac_n = N / n_tiles + frac_k = K / k_tiles + + dtype, impl = infer_gemm_implementation(kwargs['gemm']['gemm_fp']) + + prec = data_utils.size_from_precision_t(dtype) + a_size = frac_m * frac_k * prec + b_size = frac_k * frac_n * prec + c_size = frac_m * frac_n * prec + total_size = a_size + total_size += b_size + total_size += c_size + data_utils.validate_tcdm_footprint(total_size) + + assert (M % m_tiles) == 0, 'M is not an integer multiple of tile size' + assert (N % n_tiles) == 0, 'N is not an integer multiple of tile size' + assert (K % k_tiles) == 0, 'K is not an integer multiple of tile size' + assert not (parallelize_m and parallelize_k), 'Cannot parallelize K and M simultaneously' + assert not transa, 'SIMD kernels don\'t support transposed A matrix' + assert (dtype == 8) or (impl == 'baseline') or (impl == 'naive') \ + or transb, 'Optimized SIMD kernels only support transposed B matrix' + assert not transb or n_tiles == 1, 'Tiling in the N dimension not supported' \ + ' if B is transposed' + assert not transb or k_tiles == 1, 'Tiling in the K dimension not supported' \ + ' if B is transposed' + assert (impl == 'baseline') or (impl == 'naive') or frac_n >= 8, \ + 'N dimension of tile size must be greater or equal to the unrolling factor (8) ' \ + 'when using optimized kernels' + assert beta == 0 or beta == 1, 'Only values of 0 or 1 supported for beta' + assert not (dtype == 8 and impl == "baseline"), 'No baseline implemented' \ + ' for FP64 (switch to NAIVE)' + assert not (((dtype == 8) or (dtype == 4)) and impl == "opt_ex"), \ + 'Expanding GEMM kernels' \ + ' not supported for FP64 and FP32' + assert not (dtype == 1 and impl == "opt"), 'FP8 not supported in' \ + ' optimized implementation' \ + ' (switch to opt_ex)' + +def emit_header(**kwargs): + + # Validate parameters + validate_config(**kwargs) + + """ + LayerNorm data generation + """ + batch_size = kwargs['layernorm']['input_dim']['batch_size'] + seq_len = kwargs['layernorm']['input_dim']['seq_len'] + embeddings = kwargs['layernorm']['input_dim']['embeddings'] + eps = kwargs['layernorm']['eps'] + prec = kwargs['layernorm']['prec'] + n_tiles = kwargs['layernorm']['n_tiles'] + implementation = kwargs['layernorm']['implementation'] + + ff_desc = data_utils.ff_desc_from_precision_t(prec) + ctype = data_utils.ctype_from_precision_t(prec) + + # Generate random input + mlp_input = ff.array(np.random.rand(batch_size, seq_len, embeddings), ff_desc) + x1_output = layernorm_golden_model(mlp_input, eps) + + mlp_input_uid = 'mlp_input' + + layernorm_cfg = { + **kwargs['layernorm']['input_dim'], + 'n_tiles': n_tiles, + 'implementation': implementation, + 'eps': eps, + 'dtype': prec, + } + + + """ + FLG data generation + """ + + M, N, K = kwargs['fused_linear_gelu']['M'], kwargs['fused_linear_gelu']['N'], kwargs['fused_linear_gelu']['K'] + + prec, _ = infer_flg_implementation(kwargs['fused_linear_gelu']['flg_fp']) + + ff_desc = data_utils.ff_desc_from_precision_t(prec) + ctype = data_utils.ctype_from_precision_t(prec) 
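+    # Only the feed-forward weight W_ff is generated for the FLG stage here;
+    # this generator emits inputs and weights but no golden outputs for the
+    # fused layer.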
+ + W_ff = ff.array(np.random.rand(K, N), ff_desc) + + W_ff = W_ff.T if kwargs['fused_linear_gelu']['transb'] else W_ff + + W_ff_uid = 'W_ff' + + flg_cfg = { + 'prec': int(prec), + **kwargs['fused_linear_gelu'], + } + + W_ff = W_ff.flatten() + + + """ + GEMM data generation + """ + + M, N, K = kwargs['gemm']['M'], kwargs['gemm']['N'], kwargs['gemm']['K'] + + prec, _ = infer_gemm_implementation(kwargs['gemm']['gemm_fp']) + + ff_desc = data_utils.ff_desc_from_precision_t(prec) + ctype = data_utils.ctype_from_precision_t(prec) + + W_mlp = ff.array(np.random.rand(K, N), ff_desc) + + W_mlp = W_mlp.T if kwargs['gemm']['transb'] else W_mlp + + W_mlp_uid = 'W_mlp' + + gemm_cfg = { + 'prec': int(prec), + **kwargs['gemm'], + } + + W_mlp = W_mlp.flatten() + + """ + MLP Configuration + """ + mlp_args_cfg = { + 'layernorm_cfg': '&layernorm_cfg', + 'flg_cfg': '&flg_cfg', + 'gemm_cfg': '&gemm_cfg', + 'mlp_input': mlp_input_uid, + 'W_ff': W_ff_uid, + 'W_mlp': W_mlp_uid, + } + + + """ + Emit header file + """ + data_str = [emit_license()] + data_str += [format_array_declaration(ctype, mlp_input_uid, mlp_input.shape, + alignment=BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, W_ff_uid, W_ff.shape, + alignment=BURST_ALIGNMENT)] + data_str += [format_array_declaration(ctype, W_mlp_uid, W_mlp.shape, + alignment=BURST_ALIGNMENT)] + + """ + Emit struct definitions + """ + data_str += [format_struct_definition('layernorm_layer_t', 'layernorm_cfg', layernorm_cfg)] + data_str += [format_struct_definition('flg_args_t', 'flg_cfg', flg_cfg)] + data_str += [format_struct_definition('gemm_args_t', 'gemm_cfg', gemm_cfg)] + data_str += [format_struct_definition('mlp_args_t', 'mlp_args', mlp_args_cfg)] + + data_str += [format_array_definition(ctype, mlp_input_uid, mlp_input, + alignment=BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, W_ff_uid, W_ff, + alignment=BURST_ALIGNMENT)] + data_str += [format_array_definition(ctype, W_mlp_uid, W_mlp, + alignment=BURST_ALIGNMENT)] + + data_str = '\n\n'.join(data_str) + return data_str + +def main(): + + parser = argparse.ArgumentParser(description='Generate data for layernorm kernel') + parser.add_argument( + "-c", "--cfg", + type=pathlib.Path, + required=True, + help='Select param config file kernel' + ) + parser.add_argument( + '--section', + type=str, + help='Section to store matrices in') + parser.add_argument( + 'output', + type=pathlib.Path, + help='Path of the output header file') + args = parser.parse_args() + + # Load param config file + with args.cfg.open() as f: + param = json5.loads(f.read()) + param['section'] = args.section + + # Emit header file + with open(args.output, 'w') as f: + f.write(emit_header(**param)) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/sw/dnn/mlp/src/main.c b/sw/dnn/mlp/src/main.c new file mode 100644 index 000000000..dabd41c65 --- /dev/null +++ b/sw/dnn/mlp/src/main.c @@ -0,0 +1,19 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik + +#include +#include + +#include "blas.h" +#include "dnn.h" +#include "data.h" + +#include "snrt.h" + +int main() { + mlp_layer(&mlp_args); + return 0; +} \ No newline at end of file diff --git a/sw/dnn/mlp/src/mlp.h b/sw/dnn/mlp/src/mlp.h new file mode 100644 index 000000000..9f6ad587f --- /dev/null +++ b/sw/dnn/mlp/src/mlp.h @@ -0,0 +1,71 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Author: Viviane Potocnik + +#include "blas.h" +#include "snrt.h" + +/* + * @struct mlp_args_t + * @brief This structure contains all parameters necessary + * for computing the Multi-layer Perceptron (MLP) layer. + * @var mlp_args_t::layernorm_layer_cfg + * Parameters for the LayerNorm layer + * @var mlp_args_t::flg_cfg + * Parameters for the Fused Linear GELU layer + * @var mlp_args_t::gemm_cfg + * Parameters for the GEMM layer + */ + +typedef struct { + struct layernorm_layer_struct *layernorm_cfg; + struct flg_args_struct *flg_cfg; + struct gemm_args_struct *gemm_cfg; + void* mlp_input; + void* flg_output; + void* mlp_output; + void* W_ff; + void* W_mlp; +} mlp_args_t; + +static inline int mlp_layer(mlp_args_t *mlp_args) { + + // LayerNorm of MLP + layernorm_layer_t layernorm_layer_cfg = { + .batch_size = mlp_args->layernorm_cfg->batch_size, + .seq_len = mlp_args->layernorm_cfg->seq_len, + .embeddings = mlp_args->layernorm_cfg->embeddings, + .n_tiles = mlp_args->layernorm_cfg->n_tiles, + .implementation = mlp_args->layernorm_cfg->implementation, + .eps = mlp_args->layernorm_cfg->eps, + .ifmap = mlp_args->mlp_input, + .ofmap = mlp_args->mlp_input, + .dtype = mlp_args->layernorm_cfg->dtype, + }; + layernorm_layer_cfg.ifmap = mlp_args->mlp_input; + layernorm_layer_cfg.ofmap = mlp_args->mlp_input; + layernorm_layer(layernorm_layer_cfg); + snrt_cluster_hw_barrier(); + + // Fused Linear GELU + mlp_args->flg_cfg->a = mlp_args->mlp_input; + mlp_args->flg_cfg->b = mlp_args->W_ff; + fused_linear_gelu(mlp_args->flg_cfg); + snrt_cluster_hw_barrier(); + + mlp_args->flg_output = mlp_args->flg_cfg->c; + snrt_cluster_hw_barrier(); + + // GEMM + mlp_args->gemm_cfg->a = mlp_args->flg_output; + mlp_args->gemm_cfg->b = mlp_args->W_mlp; + gemm(mlp_args->gemm_cfg); + snrt_cluster_hw_barrier(); + + mlp_args->mlp_output = mlp_args->gemm_cfg->c; + snrt_cluster_hw_barrier(); + + return 0; +} \ No newline at end of file diff --git a/sw/dnn/src/dnn.h b/sw/dnn/src/dnn.h index 8667804ac..a87dd0090 100644 --- a/sw/dnn/src/dnn.h +++ b/sw/dnn/src/dnn.h @@ -208,7 +208,9 @@ typedef struct network_single_cluster_t_ { #include "../flashattention_2/src/flashattention_2.h" #include "../fused_concat_linear/src/fused_concat_linear.h" #include "../gelu/src/gelu.h" +#include "../fused_linear_gelu/src/fused_linear_gelu.h" #include "../layernorm/src/layernorm.h" #include "../maxpool/src/maxpool.h" #include "../mha/src/mha.h" +#include "../mlp/src/mlp.h" #include "../softmax/src/softmax.h" diff --git a/target/snitch_cluster/sw.mk b/target/snitch_cluster/sw.mk index 57ba7e558..ae56307e3 100644 --- a/target/snitch_cluster/sw.mk +++ b/target/snitch_cluster/sw.mk @@ -53,8 +53,10 @@ APPS += sw/apps/dnn/softmax APPS += sw/apps/dnn/flashattention_2 APPS += sw/apps/dnn/concat APPS += sw/apps/dnn/fused_concat_linear +APPS += sw/apps/dnn/fused_linear_gelu APPS += sw/apps/dnn/transpose APPS += sw/apps/dnn/mha +APPS += sw/apps/dnn/mlp APPS += 
sw/apps/montecarlo/pi_estimation APPS += sw/apps/atax APPS += sw/apps/correlation diff --git a/target/snitch_cluster/sw/apps/dnn/fused_linear_gelu/Makefile b/target/snitch_cluster/sw/apps/dnn/fused_linear_gelu/Makefile new file mode 100644 index 000000000..27365440c --- /dev/null +++ b/target/snitch_cluster/sw/apps/dnn/fused_linear_gelu/Makefile @@ -0,0 +1,12 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Luca Colagrande + +APP ?= fused_linear_gelu + +include ../../../../../../sw/dnn/common.mk +include ../../common.mk + +$(DEP): $(DATA_H) diff --git a/target/snitch_cluster/sw/apps/dnn/mlp/Makefile b/target/snitch_cluster/sw/apps/dnn/mlp/Makefile new file mode 100644 index 000000000..12099bd3d --- /dev/null +++ b/target/snitch_cluster/sw/apps/dnn/mlp/Makefile @@ -0,0 +1,12 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 +# +# Viviane Potocnik + +APP ?= mlp + +include ../../../../../../sw/dnn/common.mk +include ../../common.mk + +$(DEP): $(DATA_H) \ No newline at end of file
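To tie the pieces together: the data.h consumed by sw/dnn/fused_linear_gelu/src/main.c is produced by scripts/datagen.py and essentially consists of the a, b and c array definitions plus one flg_args_t instance. A hand-written equivalent for the default data/params.json configuration would look roughly like the sketch below (illustrative only; the generated header is the authoritative source, and the array contents come from the random data emitted by the generator):

    #include <stdint.h>

    #include "fused_linear_gelu.h"

    // 16 x 16 FP64 operands; transb = 1, so b holds B transposed.
    extern double a[16 * 16], b[16 * 16], c[16 * 16];

    flg_args_t flg_args = {
        .alpha = 1.0,
        .prec = 8,  // FP64, element size in bytes
        .setup_ssr = 1,
        .parallelize_m = 0,
        .parallelize_k = 0,
        .m_tiles = 2,
        .n_tiles = 1,
        .k_tiles = 1,
        .load_a = 1,
        .load_b = 1,
        .load_c = 1,
        .transa = 0,
        .transb = 1,
        .M = 16,
        .N = 16,
        .K = 16,
        .a = a,
        .b = b,
        .beta = 0,
        .c = c,
        .flg_fp = (void *)fused_linear_gelu_fp64_naive,
        .a_gelu = -0.2888,
        .b_gelu = -1.769,
    };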