diff --git a/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp16.json b/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp16.json
new file mode 100644
index 000000000..dcc871106
--- /dev/null
+++ b/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp16.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 40,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 2048
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 66,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 2,
+            k_tiles: 8,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 128,
+            K: 2048,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        flashattention_2: {
+            L: 2048,
+            S: 2048,
+            d: 128,
+            B_r: 16,
+            B_c: 16,
+            dtype: "FP16",
+            use_mask: true,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 128],
+            output_shape: [16, 128],
+            dtype: "FP16",
+            gemm_implementation: "gemm_fp16_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 2048
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 66,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256, // number of tiles in M dimension
+            n_tiles: 2, // number of tiles in N dimension
+            k_tiles: 8, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 2048,
+            N: 128,
+            K: 2048,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 32,
+            k_tiles: 32,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 2048,
+            K: 8192,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp32.json b/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp32.json
new file mode 100644
index 000000000..8ff3e596f
--- /dev/null
+++ b/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp32.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 40,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 2048
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 132,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 2,
+            k_tiles: 16,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 128,
+            K: 2048,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        flashattention_2: {
+            L: 2048,
+            S: 2048,
+            d: 128,
+            B_r: 8,
+            B_c: 8,
+            dtype: "FP32",
+            use_mask: true,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 128],
+            output_shape: [16, 128],
+            dtype: "FP32",
+            gemm_implementation: "gemm_fp32_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 2048
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 132,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256, // number of tiles in M dimension
+            n_tiles: 2, // number of tiles in N dimension
+            k_tiles: 16, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 2048,
+            N: 128,
+            K: 2048,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 32,
+            k_tiles: 64,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 2048,
+            K: 8192,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp8.json b/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp8.json
new file mode 100644
index 000000000..1627f6512
--- /dev/null
+++ b/sw/dnn/decoder/data/gpt-3xl/gpt-3xl-fp8.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 40,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 2048
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 33,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 2,
+            k_tiles: 4,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 128,
+            K: 2048,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        flashattention_2: {
+            L: 2048,
+            S: 2048,
+            d: 128,
+            B_r: 32,
+            B_c: 32,
+            dtype: "FP8",
+            use_mask: true,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 128],
+            output_shape: [16, 128],
+            dtype: "FP8",
+            gemm_implementation: "gemm_fp8_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 2048
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 33,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256, // number of tiles in M dimension
+            n_tiles: 2, // number of tiles in N dimension
+            k_tiles: 4, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 2048,
+            N: 128,
+            K: 2048,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 32,
+            k_tiles: 16,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 2048,
+            K: 8192,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/decoder/data/gpt-j/gpt-j-fp16.json b/sw/dnn/decoder/data/gpt-j/gpt-j-fp16.json
new file mode 100644
index 000000000..8d63923ef
--- /dev/null
+++ b/sw/dnn/decoder/data/gpt-j/gpt-j-fp16.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 28,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 4096
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 132,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 4,
+            k_tiles: 16,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 256,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        flashattention_2: {
+            L: 2048,
+            S: 2048,
+            d: 256,
+            B_r: 8,
+            B_c: 8,
+            dtype: "FP16",
+            use_mask: true,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 256],
+            output_shape: [16, 128],
+            dtype: "FP16",
+            gemm_implementation: "gemm_fp16_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 4096
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 132,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256, // number of tiles in M dimension
+            n_tiles: 4, // number of tiles in N dimension
+            k_tiles: 16, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 2048,
+            N: 256,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 64,
+            k_tiles: 64,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 4096,
+            K: 16384,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/decoder/data/gpt-j/gpt-j-fp32.json b/sw/dnn/decoder/data/gpt-j/gpt-j-fp32.json
new file mode 100644
index 000000000..e43fd2f33
--- /dev/null
+++ b/sw/dnn/decoder/data/gpt-j/gpt-j-fp32.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 28,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 4096
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 264,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 4,
+            k_tiles: 32,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 256,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        flashattention_2: {
+            L: 2048,
+            S: 2048,
+            d: 256,
+            B_r: 4,
+            B_c: 4,
+            dtype: "FP32",
+            use_mask: true,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 256],
+            output_shape: [16, 128],
+            dtype: "FP32",
+            gemm_implementation: "gemm_fp32_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 4096
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 264,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256, // number of tiles in M dimension
+            n_tiles: 4, // number of tiles in N dimension
+            k_tiles: 32, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 2048,
+            N: 256,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 64,
+            k_tiles: 128,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 4096,
+            K: 16384,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/decoder/data/gpt-j/gpt-j-fp8.json b/sw/dnn/decoder/data/gpt-j/gpt-j-fp8.json
new file mode 100644
index 000000000..fa8948ddf
--- /dev/null
+++ b/sw/dnn/decoder/data/gpt-j/gpt-j-fp8.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 28,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 4096
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 66,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 4,
+            k_tiles: 8,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 256,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        flashattention_2: {
+            L: 2048,
+            S: 2048,
+            d: 256,
+            B_r: 16,
+            B_c: 16,
+            dtype: "FP8",
+            use_mask: true,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 256],
+            output_shape: [16, 128],
+            dtype: "FP8",
+            gemm_implementation: "gemm_fp8_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 2048,
+                embeddings: 4096
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 66,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256, // number of tiles in M dimension
+            n_tiles: 4, // number of tiles in N dimension
+            k_tiles: 8, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 2048,
+            N: 256,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 256,
+            n_tiles: 64,
+            k_tiles: 32,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 2048,
+            N: 4096,
+            K: 16384,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-b/vit-b-fp16.json b/sw/dnn/encoder/data/vit-b/vit-b-fp16.json
new file mode 100644
index 000000000..803c736d4
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-b/vit-b-fp16.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 12,
+    mha: {
+        num_heads: 12,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 768
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 3,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 3,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 768,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 64,
+            B_r: 32,
+            B_c: 32,
+            dtype: "FP16",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 12,
+            input_shape: [16, 64],
+            output_shape: [16, 128],
+            dtype: "FP16",
+            gemm_implementation: "gemm_fp16_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 768
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 3,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 3, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 768,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 12,
+            k_tiles: 12,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 768,
+            K: 3072,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-b/vit-b-fp32.json b/sw/dnn/encoder/data/vit-b/vit-b-fp32.json
new file mode 100644
index 000000000..61a363da1
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-b/vit-b-fp32.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 12,
+    mha: {
+        num_heads: 12,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 768
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 5,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 6,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 768,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 64,
+            B_r: 16,
+            B_c: 16,
+            dtype: "FP32",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 12,
+            input_shape: [16, 64],
+            output_shape: [16, 128],
+            dtype: "FP32",
+            gemm_implementation: "gemm_fp32_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 768
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 5,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 6, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 768,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 12,
+            k_tiles: 24,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 768,
+            K: 3072,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-b/vit-b-fp8.json b/sw/dnn/encoder/data/vit-b/vit-b-fp8.json
new file mode 100644
index 000000000..c4953d4e2
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-b/vit-b-fp8.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 12,
+    mha: {
+        num_heads: 12,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 768
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 2,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 2,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 768,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 64,
+            B_r: 64,
+            B_c: 64,
+            dtype: "FP8",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 12,
+            input_shape: [16, 64],
+            output_shape: [16, 128],
+            dtype: "FP8",
+            gemm_implementation: "gemm_fp8_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 768
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 2,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 2, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 768,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 12,
+            k_tiles: 6,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 768,
+            K: 3072,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-h/vit-h-fp16.json b/sw/dnn/encoder/data/vit-h/vit-h-fp16.json
new file mode 100644
index 000000000..60e84585b
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-h/vit-h-fp16.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 32,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1280
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 8,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 10,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 1280,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 80,
+            B_r: 32,
+            B_c: 32,
+            dtype: "FP16",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 80],
+            output_shape: [16, 128],
+            dtype: "FP16",
+            gemm_implementation: "gemm_fp16_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1280
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 8,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 8, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 1280,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 30,
+            k_tiles: 20,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 1280,
+            K: 5120,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-h/vit-h-fp32.json b/sw/dnn/encoder/data/vit-h/vit-h-fp32.json
new file mode 100644
index 000000000..ada7da7fc
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-h/vit-h-fp32.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 32,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1280
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 15,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 20,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 1280,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 80,
+            B_r: 16,
+            B_c: 16,
+            dtype: "FP32",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 80],
+            output_shape: [16, 128],
+            dtype: "FP32",
+            gemm_implementation: "gemm_fp32_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1280
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 15,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 15, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 1280,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 30,
+            k_tiles: 40,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 1280,
+            K: 5120,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-h/vit-h-fp8.json b/sw/dnn/encoder/data/vit-h/vit-h-fp8.json
new file mode 100644
index 000000000..ed86d3cae
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-h/vit-h-fp8.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 32,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1280
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 4,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 5,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 1280,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 80,
+            B_r: 64,
+            B_c: 64,
+            dtype: "FP8",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 80],
+            output_shape: [16, 128],
+            dtype: "FP8",
+            gemm_implementation: "gemm_fp8_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1280
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 4,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 4, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 1280,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 30,
+            k_tiles: 10,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 1280,
+            K: 5120,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-l/vit-l-fp16.json b/sw/dnn/encoder/data/vit-l/vit-l-fp16.json
new file mode 100644
index 000000000..d53a57b28
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-l/vit-l-fp16.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 24,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1024
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 6,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 8,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 1024,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 64,
+            B_r: 32,
+            B_c: 32,
+            dtype: "FP16",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 64],
+            output_shape: [16, 128],
+            dtype: "FP16",
+            gemm_implementation: "gemm_fp16_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1024
+            },
+            eps: 1e-5,
+            prec: "FP16",
+            n_tiles: 6,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 6, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 1024,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 24,
+            k_tiles: 16,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 1024,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp16_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-l/vit-l-fp32.json b/sw/dnn/encoder/data/vit-l/vit-l-fp32.json
new file mode 100644
index 000000000..5c9d4dcb6
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-l/vit-l-fp32.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 24,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1024
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 12,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 16,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 1024,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 64,
+            B_r: 16,
+            B_c: 16,
+            dtype: "FP32",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 64],
+            output_shape: [16, 128],
+            dtype: "FP32",
+            gemm_implementation: "gemm_fp32_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1024
+            },
+            eps: 1e-5,
+            prec: "FP32",
+            n_tiles: 12,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 12, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 1024,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 24,
+            k_tiles: 32,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 1024,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp32_opt"
+        }
+    }
+}
\ No newline at end of file
diff --git a/sw/dnn/encoder/data/vit-l/vit-l-fp8.json b/sw/dnn/encoder/data/vit-l/vit-l-fp8.json
new file mode 100644
index 000000000..58a1bece1
--- /dev/null
+++ b/sw/dnn/encoder/data/vit-l/vit-l-fp8.json
@@ -0,0 +1,108 @@
+// Copyright 2024 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+{
+    num_layers : 24,
+    mha: {
+        num_heads: 16,
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1024
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 3,
+            implementation: "OPT"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 1,
+            k_tiles: 4,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 64,
+            K: 1024,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        flashattention_2: {
+            L: 197,
+            S: 197,
+            d: 64,
+            B_r: 64,
+            B_c: 64,
+            dtype: "FP8",
+            use_mask: false,
+            baseline: false
+        },
+        'fused_concat_linear': {
+            num_inputs: 16,
+            input_shape: [16, 64],
+            output_shape: [16, 128],
+            dtype: "FP8",
+            gemm_implementation: "gemm_fp8_opt"
+        }
+    },
+    mlp: {
+        layernorm: {
+            input_dim: {
+                batch_size: 1,
+                seq_len: 197,
+                embeddings: 1024
+            },
+            eps: 1e-5,
+            prec: "FP8",
+            n_tiles: 3,
+            implementation: "OPT"
+        },
+        'fused_linear_gelu': {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25, // number of tiles in M dimension
+            n_tiles: 1, // number of tiles in N dimension
+            k_tiles: 3, // number of tiles in K dimension
+            load_a: 1,
+            load_b: 1,
+            load_c: 1,
+            transa: false,
+            transb: true, // must be true for SIMD
+            M: 197,
+            N: 64,
+            K: 1024,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        },
+        gemm: {
+            setup_ssr: 1,
+            parallelize_m: 0,
+            parallelize_k: 0,
+            m_tiles: 25,
+            n_tiles: 24,
+            k_tiles: 8,
+            load_a: 1,
+            load_b: 1,
+            load_c: 0,
+            transa: false,
+            transb: true,
+            M: 197,
+            N: 1024,
+            K: 4096,
+            alpha: 1,
+            beta: 0,
+            gemm_fp: "gemm_fp8_opt"
+        }
+    }
+}
\ No newline at end of file