diff --git a/CITATION.cff b/CITATION.cff
index 82bcac5a7b750..10b7290022aef 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -3,8 +3,7 @@ title: ONNX Runtime
message: "Please use this information to cite ONNX Runtime in
research or other publications."
authors:
- - affiliation: Microsoft Corporation
- given-names: ONNX Runtime developers
+ - name: ONNX Runtime developers
date-released: 2018-11-29
url: "https://onnxruntime.ai"
repository-code: "https://github.com/microsoft/onnxruntime"
diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake
index a56864ebf4644..8161ea574b8cc 100644
--- a/cmake/adjust_global_compile_flags.cmake
+++ b/cmake/adjust_global_compile_flags.cmake
@@ -92,13 +92,8 @@ if (onnxruntime_MINIMAL_BUILD)
endif()
endif()
-# Enable stream for all the non-minimal build, except for DML. There's currently a bug
-# in the allocation planner when reusing buffers and more than one streams are used that
-# make it possible (although rarely) to reach a reference count of 0 for a buffer that is
-# still being used. Since DML doesn't benefit from multiple streams, disabling it is the
-# safest option for now.
-# https://github.com/microsoft/onnxruntime/issues/19480
-if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_DML)
+# Enable streams for all non-minimal builds
+if (NOT onnxruntime_MINIMAL_BUILD)
add_compile_definitions(ORT_ENABLE_STREAM)
endif()
diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
index 85a9bf50460d3..1bb70e9c2ed27 100644
--- a/cmake/onnxruntime_rocm_hipify.cmake
+++ b/cmake/onnxruntime_rocm_hipify.cmake
@@ -20,10 +20,6 @@ set(contrib_ops_excluded_files
"bert/fastertransformer_decoder_attention/*"
"bert/multihead_attention.cc"
"bert/multihead_attention.h"
- "bert/fast_gelu_impl.cu"
- "bert/fast_gelu_impl.h"
- "bert/fast_gelu.cc"
- "bert/fast_gelu.h"
"bert/relative_attn_bias.cc"
"bert/relative_attn_bias.h"
"bert/relative_attn_bias_impl.cu"
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
index 715aed7e1d64f..7f3d5d6624b07 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.NetCoreApp/InferenceTest.netcore.cs
@@ -145,7 +145,7 @@ private void TestCUDAProviderOptions()
private void CanRunInferenceOnAModelWithTensorRT()
{
string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "squeezenet.onnx");
-
+
int deviceId = 0;
string deviceIdStr = System.Environment.GetEnvironmentVariable("ONNXRUNTIME_TEST_GPU_DEVICE_ID");
if (!string.IsNullOrEmpty(deviceIdStr) && int.TryParse(deviceIdStr, out int parsedValue) && parsedValue >= 0)
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 8ff2135c6b1f6..b0ed68d595c42 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -127,6 +127,7 @@ Do not modify directly.*
|GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)|
|||12|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)|
|||11|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)
**indices** = tensor(int64)|
+|Gelu|*in* X:**T**
*out* Y:**T**|20+|**T** = tensor(float)|
|Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[11, 12]|**T** = tensor(double), tensor(float)|
|||[9, 10]|**T** = tensor(double), tensor(float)|
@@ -606,6 +607,7 @@ Do not modify directly.*
|GatherND|*in* data:**T**
*in* indices:**tensor(int64)**
*out* output:**T**|13+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)|
|||12|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)|
|||11|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int64)
**indices** = tensor(int64)|
+|Gelu|*in* X:**T**
*out* Y:**T**|20+|**T** = tensor(double), tensor(float), tensor(float16)|
|Gemm|*in* A:**T**
*in* B:**T**
*in* C:**T**
*out* Y:**T**|13+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
|||[11, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
|||[9, 10]|**T** = tensor(double), tensor(float), tensor(float16)|
@@ -617,6 +619,7 @@ Do not modify directly.*
|||[7, 8]|**T** = tensor(double), tensor(float), tensor(float16)|
|GreaterOrEqual|*in* A:**T**
*in* B:**T**
*out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T1** = tensor(bool)|
|||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)
**T1** = tensor(bool)|
+|GridSample|*in* X:**T1**
*in* grid:**T2**
*out* Y:**T1**|16+|**T1** = tensor(float)
**T2** = tensor(float)|
|HardSigmoid|*in* X:**T**
*out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
|Identity|*in* input:**T**
*out* output:**T**
or
*in* input:**V**
*out* output:**V**|19+|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
|||[14, 18]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h
index 31c988f500779..c1cc69edc17d8 100644
--- a/include/onnxruntime/core/framework/execution_provider.h
+++ b/include/onnxruntime/core/framework/execution_provider.h
@@ -33,6 +33,8 @@ class Node;
#include "core/framework/stream_handles.h"
#include "core/framework/tuning_context.h"
+struct OrtRunOptions;
+
namespace onnxruntime {
/**
@@ -51,6 +53,8 @@ struct NodeComputeInfo {
DestroyFunctionStateFunc release_state_func;
};
+using RunOptions = OrtRunOptions;
+
enum class DataLayout {
NCHW,
NHWC,
@@ -184,7 +188,7 @@ class IExecutionProvider {
Run may not be finished on device This function should be regarded as the
point after which a new Run would start to submit commands from CPU
*/
- virtual common::Status OnRunStart() { return Status::OK(); }
+ virtual common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) { return Status::OK(); }
/**
Called when InferenceSession::Run ended
@@ -192,7 +196,9 @@ class IExecutionProvider {
may not be finished on device This function should be regarded as the point
that all commands of current Run has been submmited by CPU
*/
- virtual common::Status OnRunEnd(bool /*sync_stream*/) { return Status::OK(); }
+ virtual common::Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) {
+ return Status::OK();
+ }
/**
Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for
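
Outside the diff proper, here is a minimal sketch of how a derived execution provider would pick up the widened `OnRunStart`/`OnRunEnd` hooks above. The class name `MyExecutionProvider` is hypothetical and the `run_options.h` include path is an assumption:

```cpp
// Sketch only: a hypothetical EP adopting the RunOptions-aware hooks.
#include "core/framework/execution_provider.h"
#include "core/framework/run_options.h"  // assumed location of OrtRunOptions

namespace onnxruntime {

class MyExecutionProvider : public IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider("MyExecutionProvider") {}

  // Called before InferenceSession::Run submits work for this provider.
  common::Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) override {
    // Per-Run configuration (e.g. config entries) could be inspected here.
    return Status::OK();
  }

  // Called after all commands of the current Run have been submitted by the CPU.
  common::Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) override {
    if (sync_stream) {
      // Synchronize the device stream associated with this Run, if any.
    }
    return Status::OK();
  }
};

}  // namespace onnxruntime
```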
diff --git a/include/onnxruntime/core/providers/cuda/cuda_resource.h b/include/onnxruntime/core/providers/cuda/cuda_resource.h
index 1fef077860be3..00e7dec5727d1 100644
--- a/include/onnxruntime/core/providers/cuda/cuda_resource.h
+++ b/include/onnxruntime/core/providers/cuda/cuda_resource.h
@@ -19,4 +19,4 @@ enum CudaResource : int {
enable_skip_layer_norm_strict_mode_t,
prefer_nhwc_t,
use_tf32_t,
-};
\ No newline at end of file
+};
diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
index 1f5fcd50e185c..b0a17e175fef3 100644
--- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
@@ -30,3 +30,15 @@ static const char* const kOrtRunOptionsConfigEnableMemoryArenaShrinkage = "memor
// Per default it will be set to '0'
// Taking CUDA EP as an example, it omit triggering cudaStreamSynchronize on the compute stream.
static const char* const kOrtRunOptionsConfigDisableSynchronizeExecutionProviders = "disable_synchronize_execution_providers";
+
+// Set HTP performance mode for the QNN HTP backend before session run.
+// Options for HTP performance mode: "burst", "balanced", "default", "high_performance",
+// "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver",
+// "sustained_high_performance". Defaults to "default".
+static const char* const kOrtRunOptionsConfigQnnPerfMode = "qnn.htp_perf_mode";
+
+// Set HTP performance mode for the QNN HTP backend after session run.
+static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_mode_post_run";
+
+// Set RPC control latency for QNN HTP backend
+static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";
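
As a hedged usage sketch (not part of this change), the new keys could be set per Run through the public C++ API roughly as follows; the model path, chosen values, and include paths are placeholders/assumptions:

```cpp
// Sketch only: exercise the new QNN run-option keys via the C++ API.
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_run_options_config_keys.h>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_run_options");
  Ort::SessionOptions session_options;
  Ort::Session session(env, ORT_TSTR("model.onnx"), session_options);

  Ort::RunOptions run_options;
  // Burst while the Run is in flight, then drop to a power-saving mode afterwards.
  run_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfMode, "burst");
  run_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, "low_power_saver");
  run_options.AddConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, "100");

  // run_options would then be passed to session.Run(...) as usual.
  (void)session;
  return 0;
}
```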
diff --git a/js/node/lib/backend.ts b/js/node/lib/backend.ts
index e8eb0e9babf5a..927953b4f1dd6 100644
--- a/js/node/lib/backend.ts
+++ b/js/node/lib/backend.ts
@@ -36,7 +36,7 @@ class OnnxruntimeSessionHandler implements InferenceSessionHandler {
async run(feeds: SessionHandler.FeedsType, fetches: SessionHandler.FetchesType, options: InferenceSession.RunOptions):
Promise {
return new Promise((resolve, reject) => {
- process.nextTick(() => {
+ setImmediate(() => {
try {
resolve(this.#inferenceSession.run(feeds, fetches, options));
} catch (e) {
@@ -56,7 +56,7 @@ class OnnxruntimeBackend implements Backend {
async createInferenceSessionHandler(pathOrBuffer: string|Uint8Array, options?: InferenceSession.SessionOptions):
Promise {
return new Promise((resolve, reject) => {
- process.nextTick(() => {
+ setImmediate(() => {
try {
resolve(new OnnxruntimeSessionHandler(pathOrBuffer, options || {}));
} catch (e) {
diff --git a/js/react_native/e2e/yarn.lock b/js/react_native/e2e/yarn.lock
index 9e20a286c4e27..6f05faf046098 100644
--- a/js/react_native/e2e/yarn.lock
+++ b/js/react_native/e2e/yarn.lock
@@ -3351,9 +3351,9 @@ invariant@^2.2.4:
loose-envify "^1.0.0"
ip@^1.1.5:
- version "1.1.8"
- resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48"
- integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg==
+ version "1.1.9"
+ resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396"
+ integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ==
is-accessor-descriptor@^0.1.6:
version "0.1.6"
diff --git a/js/react_native/yarn.lock b/js/react_native/yarn.lock
index 4dca90d7415cf..bbb0c4f3d1e22 100644
--- a/js/react_native/yarn.lock
+++ b/js/react_native/yarn.lock
@@ -3701,9 +3701,9 @@ invariant@^2.2.4:
loose-envify "^1.0.0"
ip@^1.1.5:
- version "1.1.8"
- resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.8.tgz#ae05948f6b075435ed3307acce04629da8cdbf48"
- integrity sha512-PuExPYUiu6qMBQb4l06ecm6T6ujzhmh+MeJcW9wa89PoAz5pvd4zPgN5WJV104mb6S2T1AwNIAaB70JNrLQWhg==
+ version "1.1.9"
+ resolved "https://registry.yarnpkg.com/ip/-/ip-1.1.9.tgz#8dfbcc99a754d07f425310b86a99546b1151e396"
+ integrity sha512-cyRxvOEpNHNtchU3Ln9KC/auJgup87llfQpQ+t5ghoC/UhL16SWzbueiCsdTnWmqAWl7LadfuwhlqmtOaqMHdQ==
is-absolute@^1.0.0:
version "1.0.0"
diff --git a/js/web/README.md b/js/web/README.md
index c75a40ad6da28..906c78a1b7ec4 100644
--- a/js/web/README.md
+++ b/js/web/README.md
@@ -12,7 +12,7 @@ The [Open Neural Network Exchange](http://onnx.ai/) (ONNX) is an open standard f
With ONNX Runtime Web, web developers can score models directly on browsers with various benefits including reducing server-client communication and protecting user privacy, as well as offering install-free and cross-platform in-browser ML experience.
-ONNX Runtime Web can run on both CPU and GPU. On CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web complies the native ONNX Runtime CPU engine into WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We are keeping improving op coverage and optimizing performance in WebGL backend.
+ONNX Runtime Web can run on both CPU and GPU. On the CPU side, [WebAssembly](https://developer.mozilla.org/en-US/docs/WebAssembly) is adopted to execute the model at near-native speed. ONNX Runtime Web compiles the native ONNX Runtime CPU engine into a WebAssembly backend by using Emscripten, so it supports most functionalities native ONNX Runtime offers, including full ONNX operator coverage, multi-threading, [ONNX Runtime Quantization](https://www.onnxruntime.ai/docs/how-to/quantization.html) as well as [ONNX Runtime Mobile](https://onnxruntime.ai/docs/tutorials/mobile/). For performance acceleration with GPUs, ONNX Runtime Web leverages WebGL, a popular standard for accessing GPU capabilities. We keep improving op coverage and optimizing performance in the WebGL backend.
See [Compatibility](#Compatibility) and [Operators Supported](#Operators) for a list of platforms and operators ONNX Runtime Web currently supports.
@@ -22,7 +22,7 @@ Refer to [ONNX Runtime JavaScript examples](https://github.com/microsoft/onnxrun
## Documents
-### Developement
+### Development
Refer to the following links for development information:
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts
index b5b6a2a15cd8c..11c8778b72335 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts
@@ -23,17 +23,17 @@ import {DataType} from '../../../../wasm-common';
import {LOG_DEBUG} from '../../../log';
import {TensorView} from '../../../tensor-view';
import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types';
-import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from '../common';
+import {createTensorShapeVariables, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common';
import {ConvTransposeAttributes} from '../conv-transpose';
import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils';
-import {biasSnippet, typeSnippet} from './activation_util';
+import {biasSnippet} from './activation_util';
import {utilFunctions} from './conv_util';
import {makeMatMulPackedSource, makeMatMulPackedVec4Source} from './matmul_packed_webgpu';
const conv2dTransposeCommonSnippet =
- (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, innerElementSize = 4): string => {
- const type = typeSnippet(innerElementSize, 'f32');
+ (isChannelsLast: boolean, addBias = false, attributes: ConvTransposeAttributes, type: string,
+ innerElementSize = 4): string => {
const getWSnippet = (innerElementSize: number) => {
switch (innerElementSize) {
case 1:
@@ -47,7 +47,7 @@ const conv2dTransposeCommonSnippet =
let v1 = w[getIndexFromCoords4D(coord1, vec4(uniforms.w_shape))];
let v2 = w[getIndexFromCoords4D(coord2, vec4(uniforms.w_shape))];
let v3 = w[getIndexFromCoords4D(coord3, vec4(uniforms.w_shape))];
- return vec4(v0, v1, v2, v3);
+ return ${type}(v0, v1, v2, v3);
`;
default:
throw new Error(`innerElementSize ${innerElementSize} is not supported.`);
@@ -224,7 +224,7 @@ export const createConv2DTransposeMatMulProgramInfo =
const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims.length, components);
inputVariables.push(bias);
declareFunctions += `
- fn getBiasByOutputCoords(coords : vec4) -> ${isVec4 ? 'vec4' : 'f32'} {
+ fn getBiasByOutputCoords(coords : vec4) -> ${bias.type.value} {
return bias[coords.${isChannelsLast ? 'w' : 'y'}${isVec4 ? '/ 4' : ''}];
}`;
}
@@ -236,16 +236,20 @@ export const createConv2DTransposeMatMulProgramInfo =
{name: 'pads', type: 'i32', length: pads.length}
];
appendActivationUniforms(attributes, uniforms);
+ const elemType = tensorTypeToWsglStorageType(inputs[0].dataType, 1);
+ if (elemType !== 'f16' && elemType !== 'f32') {
+ throw new Error(`elemType ${elemType} is not supported.`);
+ }
return `
${utilFunctions('uniforms.result_strides')}
${shaderHelper.registerUniforms(uniforms).declareVariables(...inputVariables, output)};
${declareFunctions}
- ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, innerElementSize)}
+ ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, attributes, x.type.value, innerElementSize)}
${
isVec4 ? makeMatMulPackedVec4Source(
- elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner) :
+ elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner) :
makeMatMulPackedSource(
- elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner, false,
+ elementsPerThread, workGroupSize, elemType, undefined, !isChannelsLast, tileInner, false,
undefined, sequentialAccessByThreads)}`;
};
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/where.ts b/js/web/lib/wasm/jsep/webgpu/ops/where.ts
index cfee07a9239d7..a6375847fc42f 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/where.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/where.ts
@@ -27,7 +27,7 @@ const createWhereOpProgramShader =
const expressionA = `a_data[index_a${x}][component_a${x}]`;
const expressionB = `b_data[index_b${x}][component_b${x}]`;
// eslint-disable-next-line no-bitwise
- const expressionC = `bool(c_data[index_c${x}] & ${0xff000000 >>> ((3 - x) * 8)}u)`;
+ const expressionC = `bool(c_data[index_c${x}] & (0xffu << (component_c${x} * 8)))`;
return `
let output_indices${x} = ${output.offsetToIndices(`global_idx * 4u + ${x}u`)};
let offset_a${x} = ${a.broadcastedIndicesToOffset(`output_indices${x}`, output)};
@@ -38,6 +38,7 @@ const createWhereOpProgramShader =
let index_c${x} = offset_c${x} / 4u;
let component_a${x} = offset_a${x} % 4u;
let component_b${x} = offset_b${x} % 4u;
+ let component_c${x} = offset_c${x} % 4u;
${resStr}[${x}] = ${typeCast}(${expression(expressionA, expressionB, expressionC)});
`;
};
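
For clarity (an illustration, not code from the PR), the corrected indexing reads the condition byte using the condition tensor's own component offset rather than the output lane; in scalar form the unpacking looks like this:

```cpp
// Four byte-sized booleans are packed into one 32-bit word, as in the WebGPU
// Where shader above. The component index must come from the condition
// tensor's offset (offset_c % 4), not from the output lane index.
#include <cassert>
#include <cstdint>

static bool UnpackBool(uint32_t word, uint32_t component) {
  return (word & (0xffu << (component * 8))) != 0;
}

int main() {
  // Word packs {true, false, true, false} in components 0..3.
  const uint32_t word = 0x00010001u;
  assert(UnpackBool(word, 0));
  assert(!UnpackBool(word, 1));
  assert(UnpackBool(word, 2));
  assert(!UnpackBool(word, 3));
  return 0;
}
```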
diff --git a/js/web/test/data/ops/where.jsonc b/js/web/test/data/ops/where.jsonc
index 047fd6fd7511b..990120dd3708e 100644
--- a/js/web/test/data/ops/where.jsonc
+++ b/js/web/test/data/ops/where.jsonc
@@ -168,5 +168,39 @@
]
}
]
+ },
+ {
+ "name": "Where with no attributes",
+ "operator": "Where",
+ "attributes": [],
+ "cases": [
+ {
+ "name": "T[1 1 2 1] T[1 4] T[1 1 2 4] float32 broadcast 1",
+ "inputs": [
+ {
+ "data": [true, false],
+ "dims": [1, 1, 2, 1],
+ "type": "bool"
+ },
+ {
+ "data": [1, 2, 3, 4],
+ "dims": [1, 4],
+ "type": "float32"
+ },
+ {
+ "data": [5, 6, 7, 8, 9, 10, 11, 12],
+ "dims": [1, 1, 2, 4],
+ "type": "float32"
+ }
+ ],
+ "outputs": [
+ {
+ "data": [1, 2, 3, 4, 9, 10, 11, 12],
+ "dims": [1, 1, 2, 4],
+ "type": "float32"
+ }
+ ]
+ }
+ ]
}
]
diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts
index ecc7d4b4a09a5..a4adf5c4ce144 100644
--- a/js/web/test/test-runner.ts
+++ b/js/web/test/test-runner.ts
@@ -627,8 +627,8 @@ export async function runModelTestSet(
try {
const feeds: Record = {};
const outputsMetaInfo: Record = {};
- testCase.inputs!.forEach((tensor, i) => feeds[context.session.inputNames[i]] = tensor);
- testCase.outputs!.forEach((tensor, i) => outputsMetaInfo[context.session.outputNames[i]] = tensor);
+ testCase.inputs!.forEach((tensor) => feeds[tensor.name] = tensor);
+ testCase.outputs!.forEach((tensor) => outputsMetaInfo[tensor.name] = tensor);
const [start, end, outputs] =
await sessionRun({session: context.session, feeds, outputsMetaInfo, ioBinding: context.ioBinding});
if (context.perfData.count === 0) {
diff --git a/onnxruntime/contrib_ops/cpu/activations.cc b/onnxruntime/contrib_ops/cpu/activations.cc
index 556699192d2eb..3e0533dd8b9e5 100644
--- a/onnxruntime/contrib_ops/cpu/activations.cc
+++ b/onnxruntime/contrib_ops/cpu/activations.cc
@@ -2,7 +2,7 @@
// Licensed under the MIT License.
#include "core/providers/cpu/activation/activations.h"
-#include "activations.h"
+#include "contrib_ops/cpu/activations.h"
namespace onnxruntime {
namespace contrib {
@@ -26,14 +26,6 @@ ONNX_CPU_OPERATOR_VERSIONED_KERNEL(
KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", DataTypeImpl::GetTensorType()),
ThresholdedRelu);
-ONNX_OPERATOR_KERNEL_EX(
- Gelu,
- kMSDomain,
- 1,
- kCpuExecutionProvider,
- KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()),
- Gelu);
-
ONNX_OPERATOR_KERNEL_EX(
QuickGelu,
kMSDomain,
diff --git a/onnxruntime/contrib_ops/cpu/activations.h b/onnxruntime/contrib_ops/cpu/activations.h
index aed4c2229215d..7e64235d3fc3d 100644
--- a/onnxruntime/contrib_ops/cpu/activations.h
+++ b/onnxruntime/contrib_ops/cpu/activations.h
@@ -54,47 +54,6 @@ namespace contrib {
DEFINE_ELE_KERNEL(ScaledTanh);
DEFINE_ELE_KERNEL(ParametricSoftplus);
-template
-class Gelu : public OpKernel {
- public:
- Gelu(const OpKernelInfo& info) : OpKernel(info) {
- }
-
- Status Compute(OpKernelContext* context) const override {
- const Tensor* input = context->Input(0);
- const T* input_data = input->Data();
-
- Tensor* output = context->Output(0, input->Shape());
- T* output_data = output->MutableData();
-
- concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
- int64_t elem_count = input->Shape().Size();
- constexpr int64_t length_per_task = 4096; // this number comes from FastGelu.
- int64_t task_count = (elem_count + length_per_task - 1) / length_per_task;
- concurrency::ThreadPool::TryBatchParallelFor(
- tp, static_cast(task_count),
- [&](ptrdiff_t task_idx) {
- const auto start = task_idx * length_per_task;
- const T* p_input = input_data + start;
- T* p_output = output_data + start;
- int64_t count = std::min(length_per_task, elem_count - start);
-
- for (int64_t i = 0; i < count; i++) {
- T value = p_input[i];
- p_output[i] = value * static_cast(M_SQRT1_2);
- }
-
- MlasComputeErf(p_output, p_output, narrow(count));
-
- for (int64_t i = 0; i < count; i++) {
- p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f);
- }
- },
- 0);
- return Status::OK();
- }
-};
-
// Implement a new one instead of inheriting from ElementWiseRangedTransform so that we can call
// MlasComputeLogistic instead of using Eigen for better perf.
template
diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.cc b/onnxruntime/contrib_ops/cuda/activation/activations.cc
index 1a86c5dbece5a..6303858b9bd48 100644
--- a/onnxruntime/contrib_ops/cuda/activation/activations.cc
+++ b/onnxruntime/contrib_ops/cuda/activation/activations.cc
@@ -49,7 +49,6 @@ namespace cuda {
UNARY_ACTIVATION_OP_HFD(Affine, 1, kOnnxDomain);
UNARY_ACTIVATION_OP_HFD(ParametricSoftplus, 1, kOnnxDomain);
UNARY_ACTIVATION_OP_HFD(ScaledTanh, 1, kOnnxDomain);
-UNARY_ACTIVATION_OP_HFD(Gelu, 1, kMSDomain);
UNARY_ACTIVATION_OP_HFD(QuickGelu, 1, kMSDomain);
REGISTER_ACTIVATION_KERNEL(ThresholdedRelu, 1, kOnnxDomain, MLFloat16)
diff --git a/onnxruntime/contrib_ops/cuda/activation/activations.h b/onnxruntime/contrib_ops/cuda/activation/activations.h
index ab339f276c2bd..fc9a71b0b7fa1 100644
--- a/onnxruntime/contrib_ops/cuda/activation/activations.h
+++ b/onnxruntime/contrib_ops/cuda/activation/activations.h
@@ -66,17 +66,6 @@ class ScaledTanh final : public UnaryElementwise {
float beta_;
};
-template
-class Gelu final : public UnaryElementwise {
- public:
- Gelu(const OpKernelInfo& info) : UnaryElementwise(info) {}
-
- Status ComputeInternal(OpKernelContext* context) const override;
-
- private:
- MAKE_FUNC_CTX_NULL()
-};
-
template
class QuickGelu final : public UnaryElementwise {
public:
diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu
index 0c856815fd437..36f33fbb24c18 100644
--- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.cu
@@ -36,20 +36,6 @@ struct OP_ScaledTanh : public CtxScaledTanh {
}
};
-template
-struct OP_Gelu : public CtxGelu {
- __device__ __inline__ T operator()(const T& a) const {
- return _Gelu(a);
- }
-};
-
-template <>
-struct OP_Gelu : public CtxGelu {
- __device__ __inline__ half operator()(const half& a) const {
- return static_cast(_Gelu(static_cast(a)));
- }
-};
-
template
struct OP_QuickGelu : public CtxQuickGelu {
__device__ __inline__ T operator()(const T& a) const {
diff --git a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h
index 5d18283a395e3..782d4bf59a5ad 100644
--- a/onnxruntime/contrib_ops/cuda/activation/activations_impl.h
+++ b/onnxruntime/contrib_ops/cuda/activation/activations_impl.h
@@ -11,14 +11,12 @@ namespace cuda {
typedef onnxruntime::cuda::CtxAlphaBeta CtxAffine;
typedef onnxruntime::cuda::CtxAlphaBeta CtxParametricSoftplus;
typedef onnxruntime::cuda::CtxAlphaBeta CtxScaledTanh;
-typedef onnxruntime::cuda::CtxNull CtxGelu;
typedef onnxruntime::cuda::CtxAlpha CtxQuickGelu;
#define UNARY_CONTRIB_ACTIVATION_OPS() \
UNARY_ACTIVATION_OP_NAME(ScaledTanh) \
UNARY_ACTIVATION_OP_NAME(Affine) \
UNARY_ACTIVATION_OP_NAME(ParametricSoftplus) \
- UNARY_ACTIVATION_OP_NAME(Gelu) \
UNARY_ACTIVATION_OP_NAME(QuickGelu)
#define UNARY_ACTIVATION_OP_NAME(name) UNARY_ACTIVATION_IMPL_DECLARATION(name);
diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc
index 892f5c181a607..e8974a29476b6 100644
--- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc
+++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.cc
@@ -4,9 +4,14 @@
#include "core/providers/cuda/cuda_common.h"
#include "core/providers/cuda/cudnn_common.h"
#include "fast_gelu.h"
-#include "fast_gelu_impl.h"
+#include "core/providers/cuda/tensor/gelu_impl.h"
#include "contrib_ops/cpu/bert/bias_gelu_helper.h"
-#include "transformer_common.h"
+#ifdef USE_ROCM
+#include "contrib_ops/rocm/bert/elementwise.h"
+#endif
+#ifdef USE_CUDA
+#include "contrib_ops/cuda/bert/transformer_common.h"
+#endif
namespace onnxruntime {
namespace contrib {
@@ -31,8 +36,10 @@ using namespace ONNX_NAMESPACE;
template
FastGelu::FastGelu(const OpKernelInfo& op_kernel_info) : CudaKernel(op_kernel_info) {
+#ifdef USE_CUDA
const TransformerOptions* options = TransformerOptions::GetInstance();
use_half2_ = !options->DisableHalf2();
+#endif
}
template
@@ -50,6 +57,14 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const {
int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size();
typedef typename ToCudaType::MappedType CudaT;
+#ifdef USE_ROCM
+ return LaunchElementwiseKernel(
+ GetTuningContext(), context->GetComputeStream(),
+ reinterpret_cast(input->Data()), static_cast(input_length),
+ (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr, static_cast(bias_length),
+ reinterpret_cast(output->MutableData()));
+#endif
+#ifdef USE_CUDA
return LaunchFastGeluKernel(GetDeviceProp(),
Stream(context),
static_cast(input_length),
@@ -58,6 +73,7 @@ Status FastGelu::ComputeInternal(OpKernelContext* context) const {
(nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr,
reinterpret_cast(output->MutableData()),
use_half2_);
+#endif
}
} // namespace cuda
diff --git a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h
index 3e642a70afef5..d563556593e6e 100644
--- a/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h
+++ b/onnxruntime/contrib_ops/cuda/bert/fast_gelu.h
@@ -18,7 +18,7 @@ class FastGelu final : public CudaKernel {
Status ComputeInternal(OpKernelContext* ctx) const override;
private:
- bool use_half2_;
+  bool use_half2_;  // Only applicable to the CUDA kernel (not ROCm).
};
} // namespace cuda
diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
index be8c0dc86c135..57e951d3a68ff 100644
--- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc
@@ -203,6 +203,10 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, MLFloat16, DistributedSqueeze);
#endif
+#ifdef ENABLE_CUDA_NHWC_OPS
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 16, float, GridSample);
+#endif
+
template <>
KernelCreateInfo BuildKernelCreateInfo() {
KernelCreateInfo info;
@@ -408,6 +412,9 @@ Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
#endif
+#ifdef ENABLE_CUDA_NHWC_OPS
+ BuildKernelCreateInfo,
+#endif
};
for (auto& function_table_entry : function_table) {
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/contrib_ops/cuda/grid_sample.cc
index 4c2999c279e0a..2500de39d3536 100644
--- a/onnxruntime/contrib_ops/cuda/grid_sample.cc
+++ b/onnxruntime/contrib_ops/cuda/grid_sample.cc
@@ -9,22 +9,23 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {
-#define REGISTER_KERNEL_TYPED(T) \
+#define REGISTER_KERNEL_TYPED(T, VERSION, LAYOUT, DOMAIN) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
GridSample, \
- kMSDomain, \
- 1, \
+ DOMAIN, \
+ VERSION, \
T, \
kCudaExecutionProvider, \
(*KernelDefBuilder::Create()) \
.TypeConstraint("T1", DataTypeImpl::GetTensorType()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType()), \
- GridSample);
+ onnxruntime::contrib::cuda::GridSample);
-REGISTER_KERNEL_TYPED(float)
+REGISTER_KERNEL_TYPED(float, 1, LAYOUT_NCHW, kMSDomain)
+REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NHWC, kMSInternalNHWCDomain)
-template
-GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) {
+template
+GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) {
std::string mode_str = info.GetAttrOrDefault("mode", "bilinear");
std::string padding_mode_str = info.GetAttrOrDefault("padding_mode", "zeros");
align_corners_ = static_cast(info.GetAttrOrDefault("align_corners", 0));
@@ -48,8 +49,8 @@ GridSample::GridSample(const OpKernelInfo& info) : CudaKernel(info) {
}
}
-template
-Status GridSample::ComputeInternal(OpKernelContext* context) const {
+template
+Status GridSample::ComputeInternal(OpKernelContext* context) const {
const Tensor* X = context->Input(0);
const auto& dims_input = X->Shape().GetDims();
const Tensor* Grid = context->Input(1);
@@ -61,11 +62,13 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const {
ORT_ENFORCE(dims_grid[0] == dims_input[0], "Grid batch size ", dims_grid[0], " does not match input batch size ", dims_input[0]);
ORT_ENFORCE(dims_grid[3] == 2, "Last dimension of grid: ", dims_grid[3], ", expect 2");
+ using Ch = Channels;
+
TensorShapeVector dims_output(4);
- dims_output[0] = dims_input[0];
- dims_output[1] = dims_input[1];
- dims_output[2] = dims_grid[1];
- dims_output[3] = dims_grid[2];
+ dims_output[Ch::N] = dims_input[Ch::N];
+ dims_output[Ch::C] = dims_input[Ch::C];
+ dims_output[Ch::H] = dims_grid[1 /* Grid::H */];
+ dims_output[Ch::W] = dims_grid[2 /* Grid::W */];
Tensor* Y = context->Output(0, dims_output);
// Return early if the output tensor is going to be of size 0
if (Y->Shape().Size() == 0) {
@@ -74,7 +77,7 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const {
typedef typename ToCudaType::MappedType CudaT;
CudaT* Y_data = reinterpret_cast(Y->MutableData());
- GridSampleImpl(
+ GridSampleImpl(
Stream(context),
reinterpret_cast(X->Data()),
reinterpret_cast(Grid->Data()),
@@ -89,4 +92,8 @@ Status GridSample::ComputeInternal(OpKernelContext* context) const {
}
} // namespace cuda
} // namespace contrib
+
+namespace cuda {
+REGISTER_KERNEL_TYPED(float, 16, LAYOUT_NCHW, kOnnxDomain)
+} // namespace cuda
} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/contrib_ops/cuda/grid_sample.h
index 08ca58c7cc458..16581bfe77482 100644
--- a/onnxruntime/contrib_ops/cuda/grid_sample.h
+++ b/onnxruntime/contrib_ops/cuda/grid_sample.h
@@ -12,7 +12,7 @@ namespace cuda {
using namespace onnxruntime::cuda;
-template
+template
class GridSample final : public CudaKernel {
public:
explicit GridSample(const OpKernelInfo& info);
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu
index 8a391eca7e86a..b23da635bc83d 100644
--- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu
+++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu
@@ -50,28 +50,34 @@ __device__ T GsReflect(T x, float x_min, float x_max) {
return static_cast(fx);
}
-template
+template
__device__ T PixelAtGrid(const T* input_data, int64_t bIdx, int64_t cIdx, int64_t y, int64_t x,
- int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) {
+ int64_t padding_mode, int64_t N, int64_t C, int64_t H, int64_t W, float border[4]) {
T pixel = 0.0f;
+
+ auto PixelOffset = [bIdx, cIdx, C, H, W](int64_t x, int64_t y) -> int64_t {
+ return Layout == LAYOUT_NCHW
+ ? (bIdx * C * H * W + cIdx * H * W + y * W + x)
+ : (bIdx * H * W * C + y * W * C + x * C + cIdx);
+ };
+
if (padding_mode == 0) { // zeros
if (x >= 0 && x < W && y >= 0 && y < H) {
- pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x];
+ pixel = input_data[PixelOffset(x, y)];
}
- } else if (padding_mode == 1) { //border
+ } else if (padding_mode == 1) { // border
x = max((int64_t)0, min((int64_t)W - 1, (int64_t)x));
y = max((int64_t)0, min((int64_t)H - 1, (int64_t)y));
- pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x];
+ pixel = input_data[PixelOffset(x, y)];
} else { // Reflection
- x = (int64_t) GsReflect(x, border[0], border[2]);
- y = (int64_t) GsReflect(y, border[1], border[3]);
- pixel = input_data[bIdx * C * H * W + cIdx * H * W + y * W + x];
+ x = (int64_t)GsReflect(x, border[0], border[2]);
+ y = (int64_t)GsReflect(y, border[1], border[3]);
+ pixel = input_data[PixelOffset(x, y)];
}
return pixel;
}
-__device__ void GsGetCubicCoeffs(float x, float coeffs[4])
-{
+__device__ void GsGetCubicCoeffs(float x, float coeffs[4]) {
float cubic_alpha = -0.75f;
x = abs(x);
coeffs[0] = (((cubic_alpha * (x + 1) - 5 * cubic_alpha) * (x + 1) + 8 * cubic_alpha) * (x + 1) - 4 * cubic_alpha);
@@ -93,7 +99,7 @@ __device__ T GsBicubicInterpolate(T p[4][4], float x, float y) {
return pixel;
}
-template
+template
__global__ void _GridSampleKernel(
const T* input_data,
const T* grid_data,
@@ -110,16 +116,32 @@ __global__ void _GridSampleKernel(
{
CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(idx, N * C * H_out * W_out);
// extract batch index, channel index, y index, x index for current thread
- int BIdx = idx / (C * H_out * W_out );
- int tmpBCnt = BIdx * (C * H_out * W_out);
+ int BIdx, yIdx, xIdx, cIdx;
+ if constexpr (Layout == LAYOUT_NCHW) {
+ BIdx = idx / (C * H_out * W_out);
+ int tmpBCnt = BIdx * (C * H_out * W_out);
+
+ cIdx = (idx - tmpBCnt) / (H_out * W_out);
+ int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out);
- int cIdx = (idx - tmpBCnt) / (H_out * W_out);
- int tmpCCnt = tmpBCnt + cIdx * (H_out * W_out);
+ yIdx = (idx - tmpCCnt) / W_out;
+ int tmpHCnt = tmpCCnt + yIdx * W_out;
- int yIdx = (idx - tmpCCnt) / W_out;
- int tmpHCnt = tmpCCnt + yIdx * W_out;
+ xIdx = (idx - tmpHCnt);
+ } else {
+ static_assert(Layout == LAYOUT_NHWC, "Unsupported layout");
- int xIdx = (idx - tmpHCnt);
+ BIdx = idx / (H_out * W_out * C);
+ int tmpBCnt = BIdx * (H_out * W_out * C);
+
+ yIdx = (idx - tmpBCnt) / (W_out * C);
+ int tmpHCnt = tmpBCnt + yIdx * (W_out * C);
+
+ xIdx = (idx - tmpHCnt) / C;
+ int tmpWCnt = tmpHCnt + xIdx * C;
+
+ cIdx = (idx - tmpWCnt);
+ }
int grid_idx = BIdx * H_out * W_out + yIdx * W_out + xIdx;
T grid_X = grid_data[grid_idx * 2 + 0];
@@ -147,8 +169,9 @@ __global__ void _GridSampleKernel(
if (grid_x_imgSpace < x_min || grid_x_imgSpace > x_max ||
grid_y_imgSpace < y_min || grid_y_imgSpace > y_max) { // out of bound
if (padding_mode == 1) { // border
- grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f));
- grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f));
+ // Clamping must not be done here, see #10607
+ // grid_x_imgSpace = max(0.0f, min(grid_x_imgSpace, W_in - 1.0f));
+ // grid_y_imgSpace = max(0.0f, min(grid_y_imgSpace, H_in - 1.0f));
} else if (padding_mode == 2) { // reflection
grid_x_imgSpace = GsReflect(grid_x_imgSpace, x_min, x_max);
grid_y_imgSpace = GsReflect(grid_y_imgSpace, y_min, y_max);
@@ -175,10 +198,10 @@ __global__ void _GridSampleKernel(
w_lb = w_b * w_l;
w_rb = w_b * w_r;
- T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border);
- T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border);
- T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border);
- T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border);
+ T lt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x1, padding_mode, N, C, H_in, W_in, border);
+ T rt_v = PixelAtGrid(input_data, BIdx, cIdx, y1, x2, padding_mode, N, C, H_in, W_in, border);
+ T lb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x1, padding_mode, N, C, H_in, W_in, border);
+ T rb_v = PixelAtGrid(input_data, BIdx, cIdx, y2, x2, padding_mode, N, C, H_in, W_in, border);
T interpoV = w_lt * lt_v + w_rt * rt_v + w_lb * lb_v + w_rb * rb_v;
output_data[outIdx] = interpoV;
return;
@@ -186,7 +209,8 @@ __global__ void _GridSampleKernel(
if (mode == 1) { // nearest
int x_n = grid_x_imgSpace;
int y_n = grid_y_imgSpace;
- output_data[outIdx] = PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border);
+ output_data[outIdx] =
+ PixelAtGrid(input_data, BIdx, cIdx, y_n, x_n, padding_mode, N, C, H_in, W_in, border);
return;
}
if (mode == 2) { // bicubic
@@ -195,7 +219,8 @@ __global__ void _GridSampleKernel(
T p[4][4] = {}; // [H][W]
for (int64_t h = 0; h < 4; h++) {
for (int64_t w = 0; w < 4; w++) {
- p[h][w] = PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border);
+ p[h][w] =
+ PixelAtGrid(input_data, BIdx, cIdx, h + y0, w + x0, padding_mode, N, C, H_in, W_in, border);
}
}
T dx = grid_x_imgSpace - x0 - 1;
@@ -204,7 +229,7 @@ __global__ void _GridSampleKernel(
}
}
-template
+template
void GridSampleImpl(
cudaStream_t stream,
const T* input_data,
@@ -216,17 +241,23 @@ void GridSampleImpl(
const int64_t H_out,
const int64_t W_out,
T* output_data) {
- int blocksPerGrid = (int)(ceil(static_cast(dims[0] * dims[1] * H_out * W_out) / GridDim::maxThreadsPerBlock));
- _GridSampleKernel<<>>(
- input_data, grid_data, mode, padding_mode, align_corners, dims[0], dims[1], dims[2], dims[3], H_out, W_out, output_data);
+ using Ch = Channels;
+
+ int blocksPerGrid = static_cast(
+ ceil(static_cast(dims[Ch::N] * dims[Ch::C] * H_out * W_out) / GridDim::maxThreadsPerBlock));
+ _GridSampleKernel<<>>(
+ input_data, grid_data, mode, padding_mode, align_corners,
+ dims[Ch::N], dims[Ch::C], dims[Ch::H], dims[Ch::W],
+ H_out, W_out, output_data);
}
-#define SPECIALIZED_IMPL(T) \
- template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \
- const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \
- const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data);
+#define SPECIALIZED_IMPL(T, IsNHWC) \
+ template void GridSampleImpl(cudaStream_t stream, const T* input_data, const T* grid_data, \
+ const int64_t mode, const int64_t padding_mode, const int64_t align_corners, \
+ const int64_t[4], const int64_t H_out, const int64_t W_out, T* output_data);
-SPECIALIZED_IMPL(float)
+SPECIALIZED_IMPL(float, false) // NCHW
+SPECIALIZED_IMPL(float, true) // NHWC
} // namespace cuda
} // namespace contrib
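
As an aside (not part of the kernel), the two offset formulas mirrored by the layout-templated PixelOffset lambda can be sanity-checked on the host; the shape values below are arbitrary:

```cpp
// Host-side check that the NCHW and NHWC offset formulas each address every
// linear slot exactly once for a given shape.
#include <cassert>
#include <cstdint>
#include <vector>

static int64_t OffsetNCHW(int64_t n, int64_t c, int64_t y, int64_t x,
                          int64_t C, int64_t H, int64_t W) {
  return n * C * H * W + c * H * W + y * W + x;
}

static int64_t OffsetNHWC(int64_t n, int64_t c, int64_t y, int64_t x,
                          int64_t C, int64_t H, int64_t W) {
  return n * H * W * C + y * W * C + x * C + c;
}

int main() {
  const int64_t N = 2, C = 3, H = 4, W = 5;
  std::vector<int> hits_nchw(N * C * H * W, 0), hits_nhwc(N * C * H * W, 0);
  for (int64_t n = 0; n < N; ++n)
    for (int64_t c = 0; c < C; ++c)
      for (int64_t y = 0; y < H; ++y)
        for (int64_t x = 0; x < W; ++x) {
          ++hits_nchw[OffsetNCHW(n, c, y, x, C, H, W)];
          ++hits_nhwc[OffsetNHWC(n, c, y, x, C, H, W)];
        }
  for (int64_t i = 0; i < N * C * H * W; ++i) {
    assert(hits_nchw[i] == 1 && hits_nhwc[i] == 1);
  }
  return 0;
}
```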
diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h
index 6df86ce161908..62cd66a48fa84 100644
--- a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h
+++ b/onnxruntime/contrib_ops/cuda/grid_sample_impl.h
@@ -8,7 +8,7 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {
-template
+template
void GridSampleImpl(
cudaStream_t stream,
const T* input_data,
diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc
deleted file mode 100644
index 9cb414e4e8980..0000000000000
--- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "contrib_ops/rocm/bert/fast_gelu.h"
-
-#include "core/providers/rocm/rocm_common.h"
-#include "core/providers/rocm/miopen_common.h"
-#include "contrib_ops/cpu/bert/bias_gelu_helper.h"
-#include "contrib_ops/rocm/bert/elementwise.h"
-#include "contrib_ops/rocm/bert/transformer_common.h"
-
-namespace onnxruntime {
-namespace contrib {
-namespace rocm {
-
-#define REGISTER_KERNEL_TYPED(T) \
- ONNX_OPERATOR_TYPED_KERNEL_EX( \
- FastGelu, \
- kMSDomain, \
- 1, \
- T, \
- kRocmExecutionProvider, \
- (*KernelDefBuilder::Create()) \
- .TypeConstraint("T", DataTypeImpl::GetTensorType()), \
- FastGelu);
-
-REGISTER_KERNEL_TYPED(float)
-REGISTER_KERNEL_TYPED(MLFloat16)
-REGISTER_KERNEL_TYPED(BFloat16)
-
-using namespace ONNX_NAMESPACE;
-
-template
-Status FastGelu::ComputeInternal(OpKernelContext* context) const {
- ORT_RETURN_IF_ERROR(bias_gelu_helper::CheckInputs(context));
-
- const Tensor* input = context->Input(0);
- const Tensor* bias = context->Input(1);
- Tensor* output = context->Output(0, input->Shape());
-
- int64_t input_length = input->Shape().Size();
- if (input_length == 0) {
- return Status::OK();
- }
- int64_t bias_length = (nullptr == bias) ? 0 : bias->Shape().Size();
- typedef typename ToHipType::MappedType HipT;
-
- const HipT* input_buffer = reinterpret_cast(input->Data());
- const HipT* bias_buffer = (nullptr != bias) ? reinterpret_cast(bias->Data()) : nullptr;
- return LaunchElementwiseKernel(
- GetTuningContext(), context->GetComputeStream(),
- input_buffer, static_cast(input_length),
- bias_buffer, static_cast(bias_length),
- reinterpret_cast(output->MutableData()));
-}
-
-} // namespace rocm
-} // namespace contrib
-} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h b/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h
deleted file mode 100644
index 42bfe5a0b0246..0000000000000
--- a/onnxruntime/contrib_ops/rocm/bert/fast_gelu.h
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#pragma once
-
-#include "core/common/common.h"
-#include "core/providers/rocm/rocm_kernel.h"
-
-namespace onnxruntime {
-namespace contrib {
-namespace rocm {
-
-using namespace onnxruntime::rocm;
-
-template
-class FastGelu final : public RocmKernel {
- public:
- FastGelu(const OpKernelInfo& op_kernel_info) : RocmKernel(op_kernel_info) {}
- Status ComputeInternal(OpKernelContext* ctx) const override;
-};
-
-} // namespace rocm
-} // namespace contrib
-} // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh
index b3d3e92209b39..c6ca16bfdfc80 100644
--- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh
+++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.cuh
@@ -46,8 +46,6 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() {
auto block_size = metadata->constants.at("BLOCK_SIZE");
auto hw_size = metadata->constants.at("HW_SIZE");
auto impl = [i, block_size, hw_size](const GroupNormNHWCTunableParams* params) -> Status {
- TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF((params->skip != nullptr || params->bias != nullptr),
- "Input skip or bias is not supported by triton kernel.");
TUNABLE_OP_RETURN_UNSUPPORTED_ARGUMENT_IF(
params->channels_per_group > block_size || params->channels_per_group * 2 <= block_size,
"Arg block_size (", block_size, ") is not the next power of 2 of channels_per_group (",
@@ -61,23 +59,36 @@ auto GetTritonGroupNormNHWCTypeStringAndOps() {
}
// Construct args for launch kernel
struct {
- void* X;
- void* Y;
+ const void* src;
+ const void* skip;
+ const void* bias;
+ void* out;
+ void* add_out;
const void* gamma;
const void* beta;
int hw;
int c;
int c_per_group;
float eps;
+ bool has_skip;
+ bool has_bias;
+ bool broadcast_skip;
} args = {
- (void*)params->src,
+ (const void*)params->src,
+ (const void*)params->skip,
+ (const void*)params->bias,
(void*)params->dst,
+ (void*)params->skip_workspace,
(const void*)params->gamma,
(const void*)params->beta,
params->hw,
params->c,
params->channels_per_group,
- params->epsilon};
+ params->epsilon,
+ params->skip != nullptr,
+ params->bias != nullptr,
+ params->broadcast_skip,
+ };
// Grid dim is (batch_count, groups, 1)
return LaunchTritonKernel(params->StreamHandle(), i, params->n, params->groups, 1, &args, sizeof(args));
diff --git a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py
index 5368cb1cf635b..5ba96ebc117f0 100644
--- a/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py
+++ b/onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py
@@ -12,13 +12,19 @@
@triton.jit
def group_norm_kernel(
input_ptr,
+ skip_ptr,
+ bias_ptr,
output_ptr,
+ add_out_ptr,
gamma_ptr,
beta_ptr,
img_size,
c,
c_per_group,
eps,
+ has_skip,
+ has_bias,
+ broadcast_skip,
BLOCK_SIZE: tl.constexpr,
HW_SIZE: tl.constexpr,
ACTIVATION_SILU: tl.constexpr,
@@ -36,14 +42,35 @@ def group_norm_kernel(
offsets = hw[:, None] * c + cols[None, :]
mask = (cols < c_per_group)[None, :]
+ bias = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+ if has_skip:
+ add_out_ptr += row_x * stride + row_y * c_per_group
+ if broadcast_skip:
+ broadcast_skip_ptr = skip_ptr + row_x * c + row_y * c_per_group
+ bias += tl.load(broadcast_skip_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32)
+ else:
+ skip_ptr += row_x * stride + row_y * c_per_group
+ if has_bias:
+ bias_ptr += row_y * c_per_group
+ bias += tl.load(bias_ptr + cols, mask=cols < c_per_group, other=0.0).to(tl.float32)
+
# Calculate mean and variance
_sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32)
_square_sum = tl.zeros([HW_SIZE, BLOCK_SIZE], dtype=tl.float32)
for i in range(tl.cdiv(img_size, HW_SIZE)):
x_ptr = input_ptr + i * HW_SIZE * c
a = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+ if has_skip and not broadcast_skip:
+ s_ptr = skip_ptr + i * HW_SIZE * c
+ s = tl.load(s_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+ a += s
+ if has_bias or broadcast_skip:
+ a += bias
_sum += a
_square_sum += a * a
+ if has_skip:
+ add_y_ptr = add_out_ptr + i * HW_SIZE * c
+ tl.store(add_y_ptr + offsets, a, mask=mask)
# Set axis=None (or leave it unspecified) to reduce all axes.
# TODO: In older Triton we have to reduce an axis at a time, but in our case
@@ -57,9 +84,13 @@ def group_norm_kernel(
gamma = tl.load(gamma_ptr + cols, mask=cols < c_per_group).to(tl.float32)
beta = tl.load(beta_ptr + cols, mask=cols < c_per_group).to(tl.float32)
for i in range(tl.cdiv(img_size, HW_SIZE)):
- x_ptr = input_ptr + i * HW_SIZE * c
y_ptr = output_ptr + i * HW_SIZE * c
- x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+ if has_skip:
+ add_y_ptr = add_out_ptr + i * HW_SIZE * c
+ x = tl.load(add_y_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
+ else:
+ x_ptr = input_ptr + i * HW_SIZE * c
+ x = tl.load(x_ptr + offsets, mask=mask, other=0.0).to(tl.float32)
x_hat = (x - group_mean) * rstd
y = x_hat * gamma + beta
if ACTIVATION_SILU:
@@ -77,7 +108,7 @@ def group_norm_kernel(
hw_sizes = [8, 16, 32, 64, 128, 256]
warps = [1, 2, 4, 8, 16]
name_pattern = "GroupNormTriton_{}_{}_b{}_hw{}_w{}"
-sig_pattern = "*{},*{},*fp32,*fp32,i32,i32,i32,fp32"
+sig_pattern = "*{},*{},*{},*{},*{},*fp32,*fp32,i32,i32,i32,fp32,i1,i1,i1"
group_pattern = "GroupNormTriton_{}_{}"
@@ -88,7 +119,7 @@ def get_function_table():
silu_suffix = "Silu" if silu else "Pass"
name = name_pattern.format(silu_suffix, dtype, b, hw_size, warp)
group = group_pattern.format(silu_suffix, dtype)
- sig = sig_pattern.format(dtype, dtype)
+ sig = sig_pattern.format(dtype, dtype, dtype, dtype, dtype)
kwargs = {
"num_warps": warp,
"constants": {"BLOCK_SIZE": b, "HW_SIZE": hw_size, "ACTIVATION_SILU": int(silu)},
diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
index ea7a6432a7507..158ab8ed610f4 100644
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@@ -182,7 +182,6 @@ class PlannerImpl {
// upstream_node_0 and upstream_node_1 are the immmediate upstream nodes of downstream_node
// upstream_node_2 is the immediate nodes ahead of downstream_node in the same logic stream
InlinedHashMap> dependence_graph_;
- InlinedHashMap> value_consumer_map_;
InlinedHashMap value_node_map_;
// OrtValueInfo: Auxiliary information about an OrtValue used only during plan-generation:
@@ -295,7 +294,7 @@ class PlannerImpl {
}
#endif
- // Find if there exists some input tensor that we can use in-place for output_arg_num-th input in the node.
+  // Find if there exists some input tensor that we can use in-place for the output_arg_num-th output of the node.
bool FindReusableInput(const onnxruntime::Node& node, int output_arg_num, OrtValueIndex* reusable_input,
bool* is_strided_tensor) {
*is_strided_tensor = false;
@@ -530,6 +529,7 @@ class PlannerImpl {
// Initialize allocation plan:
plan_.allocation_plan.resize(num_ml_values);
+ for (int i = 0; static_cast(i) < num_ml_values; i++) AllocPlan(i).reused_buffer = i;
}
bool HasExternalOutputs(const Node& node) const {
@@ -1065,7 +1065,8 @@ class PlannerImpl {
// build the consumer list for each value
int num_ml_values = ort_value_name_idx_map_.MaxIdx() + 1;
- value_consumer_map_.reserve(num_ml_values);
+ InlinedHashMap> value_consumer_map;
+ value_consumer_map.reserve(num_ml_values);
// iterate each stream from back, so the first element is the last consumer in single stream case
for (auto& stream : stream_nodes_) {
@@ -1078,10 +1079,10 @@ class PlannerImpl {
const auto& name = input.Name();
int value_idx;
ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx));
- auto origin = Buffer(value_idx);
- if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) {
+ auto origin = AllocPlan(value_idx).reused_buffer;
+ if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) {
// add current node as consumer for origin buffer
- value_consumer_map_[origin].insert(node_index);
+ value_consumer_map[origin].insert(node_index);
}
}
return Status::OK();
@@ -1138,8 +1139,8 @@ class PlannerImpl {
std::cout << p_input_arg->Name() << " reused by " << p_output_arg->Name() << " as input" << std::endl;
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
allocation_plan[output_idx_global].reused_buffer = reusable_input;
- value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(),
- value_consumer_map_[output_idx_global].end());
+ value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(),
+ value_consumer_map[output_idx_global].end());
reused.insert(reusable_input);
found_reusable = true;
break;
@@ -1168,8 +1169,8 @@ class PlannerImpl {
allocation_plan[reusable_input].alloc_kind == AllocKind::kAllocate) {
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
allocation_plan[output_idx_global].reused_buffer = reusable_input;
- value_consumer_map_[reusable_input].insert(value_consumer_map_[output_idx_global].begin(),
- value_consumer_map_[output_idx_global].end());
+ value_consumer_map[reusable_input].insert(value_consumer_map[output_idx_global].begin(),
+ value_consumer_map[output_idx_global].end());
reused.insert(reusable_input);
continue;
} // if
@@ -1187,11 +1188,11 @@ class PlannerImpl {
OrtValueIndex input_arg_index{};
if (value_map.GetIdx(p_input_arg->Name(), input_arg_index).IsOK() &&
allocation_plan[input_arg_index].alloc_kind == AllocKind::kAllocate) {
- if (value_consumer_map_[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) {
+ if (value_consumer_map[input_arg_index].size() == 1 && SameSize(*p_input_arg, *p_output_arg)) {
allocation_plan[output_idx_global].alloc_kind = AllocKind::kReuse;
allocation_plan[output_idx_global].reused_buffer = input_arg_index;
- value_consumer_map_[input_arg_index].insert(value_consumer_map_[output_idx_global].begin(),
- value_consumer_map_[output_idx_global].end());
+ value_consumer_map[input_arg_index].insert(value_consumer_map[output_idx_global].begin(),
+ value_consumer_map[output_idx_global].end());
reused.insert(input_arg_index);
}
}
@@ -1266,7 +1267,7 @@ class PlannerImpl {
}
bool all_covered = true;
- for (auto consumer : value_consumer_map_[output_idx_global]) {
+ for (auto consumer : value_consumer_map[output_idx_global]) {
if (deps->find(consumer) == deps->end()) {
all_covered = false;
break;
@@ -1277,9 +1278,9 @@ class PlannerImpl {
allocation_plan[downstream_value].reused_buffer = output_idx_global;
get_reused = true;
// add new consumer for the value to be reused
- value_consumer_map_[output_idx_global].insert(value_node_map_[downstream_value]);
- value_consumer_map_[output_idx_global].insert(value_consumer_map_[downstream_value].begin(),
- value_consumer_map_[downstream_value].end());
+ value_consumer_map[output_idx_global].insert(value_node_map_[downstream_value]);
+ value_consumer_map[output_idx_global].insert(value_consumer_map[downstream_value].begin(),
+ value_consumer_map[downstream_value].end());
node_iter = size_iter->second.erase(node_iter);
if (size_iter->second.empty()) {
local_iter->second.erase(size_iter);
@@ -1342,8 +1343,9 @@ class PlannerImpl {
ort_value_usecount.reserve(ort_value_info_.size());
#endif
for (size_t i = 0; i < stream_nodes_.size(); ++i) {
- // compute use count first
+      // compute use count first. TODO(leca): it is enough to call ComputeReuseCount() only once!
ORT_RETURN_IF_ERROR(ComputeReuseCount());
+ for (int j = 0; static_cast(j) < ort_value_info_.size(); j++) Buffer(j) = j;
#if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
if (i == 0) {
for (auto ort_value_info : ort_value_info_) {
@@ -1693,8 +1695,8 @@ class PlannerImpl {
const auto& name = input.Name();
int value_idx;
ORT_RETURN_IF_ERROR(ort_value_name_idx_map_.GetIdx(name, value_idx));
- auto origin = Buffer(value_idx);
- if (origin != -1 && plan_.allocation_plan[origin].alloc_kind == AllocKind::kAllocate) {
+ auto origin = AllocPlan(value_idx).reused_buffer;
+ if (AllocPlan(origin).alloc_kind == AllocKind::kAllocate) {
// add current node as consumer for origin buffer
value_consumers[origin].push_back(node_index);
}
@@ -1889,7 +1891,7 @@ class PlannerImpl {
// 2. the consumer is in the same stream(non-cpu device), but it consumes a CPU tensor from an non-shape op.
// for example, a resize cuda kernel consumer a tensor from MemCpyToHost cuda kernel on the same stream.
// in this case, the FIFO can't guarantee the cpu tensor is ready when resize kernel is launching
- OrtDevice::DeviceType output_arg_device = plan_.allocation_plan[output_arg_idx].location.Type();
+ OrtDevice::DeviceType output_arg_device = AllocPlan(output_arg_idx).location.Type();
WaitNotificationFn wait_handle = stream_handle_registry.GetWaitHandle(stream_device, output_arg_device);
if ((node_stream_map_[it->Index()] != i || output_arg_device == OrtDevice::CPU) && wait_handle != nullptr) {
if (node_to_notification.find(node_index) == node_to_notification.end()) {
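For context on the planner change above: when an output reuses another value's buffer, the reused value has to stay alive until every consumer of the aliasing output has run, which is why the consumer sets are merged. Below is a minimal, self-contained sketch of that bookkeeping with simplified types; it is illustrative only and not the planner's real interface.

```cpp
#include <cstddef>
#include <unordered_map>
#include <unordered_set>

using OrtValueIndex = int;
using NodeIndex = std::size_t;
using ValueConsumerMap = std::unordered_map<OrtValueIndex, std::unordered_set<NodeIndex>>;

// When `reusing_output` aliases the buffer of `reused_value`, the nodes that consume the
// output effectively consume the reused buffer too, so their indices are folded into the
// reused value's consumer set before lifetimes are computed.
void MergeConsumersOnReuse(ValueConsumerMap& value_consumer_map,
                           OrtValueIndex reused_value,
                           OrtValueIndex reusing_output) {
  const auto& output_consumers = value_consumer_map[reusing_output];
  value_consumer_map[reused_value].insert(output_consumers.begin(), output_consumers.end());
}
```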
diff --git a/onnxruntime/core/framework/stream_execution_context.cc b/onnxruntime/core/framework/stream_execution_context.cc
index 875e7f395bfa8..dd7f4d35b34bd 100644
--- a/onnxruntime/core/framework/stream_execution_context.cc
+++ b/onnxruntime/core/framework/stream_execution_context.cc
@@ -181,11 +181,13 @@ void RunSince(size_t stream_idx, StreamExecutionContext& ctx, SessionScope& sess
}
#ifdef USE_CANN
+ // Leave it to CANN EP to fill the gap if they want to use run_options
+ static onnxruntime::RunOptions run_options;
// For CANN EP, it is necessary to explicitly create a corresponding Context for each thread in the thread pool,
// which is different from CUDA Runtime API, but similar to CUDA Driver API.
auto& execution_providers = ctx.GetSessionState().GetExecutionProviders();
for (auto& xp : execution_providers) {
- auto status = xp->OnRunStart();
+ auto status = xp->OnRunStart(run_options);
if (!status.IsOK()) {
ctx.SetStatus(status);
return;
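The run_options plumbing above changes the IExecutionProvider hooks that several providers override later in this patch (OnRunStart gains a RunOptions parameter, and OnRunEnd becomes OnRunEnd(bool, RunOptions)). A rough sketch of the new overrides for a provider that does not yet consume run_options is shown below; MyExecutionProvider is a hypothetical name and the body is illustrative only.

```cpp
// Sketch only: assumes the usual onnxruntime provider headers are available.
class MyExecutionProvider : public IExecutionProvider {
 public:
  // Previously OnRunStart(); the per-call RunOptions is now passed through.
  Status OnRunStart(const onnxruntime::RunOptions& /*run_options*/) override {
    // Per-run setup (e.g. binding the device for the calling thread) goes here.
    return Status::OK();
  }

  // Previously OnRunEnd(bool); providers that ignore run_options only need the signature update.
  Status OnRunEnd(bool /*sync_stream*/, const onnxruntime::RunOptions& /*run_options*/) override {
    return Status::OK();
  }
};
```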
diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc
index 4505d4afdf1e0..a8717b99a8750 100644
--- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc
+++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc
@@ -31,6 +31,7 @@ CostCheckResult PostLayoutTransformCostCheck(const api::GraphRef& graph, const a
}
#if defined(USE_CUDA) && ENABLE_CUDA_NHWC_OPS
+// TODO(mtavenrath) generate list from registered kernels using nhwc domain
const std::unordered_set& GetCUDALayoutSensitiveOps() {
static std::unordered_set cuda_nhwc_ops = []() {
return std::unordered_set{
@@ -41,6 +42,7 @@ const std::unordered_set& GetCUDALayoutSensitiveOps() {
"MaxPool",
"GlobalAveragePool",
"AveragePool",
+ "GridSample",
};
}();
return cuda_nhwc_ops;
diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc
index 752b742805a7c..9a242919665bb 100644
--- a/onnxruntime/core/providers/cann/cann_execution_provider.cc
+++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc
@@ -1045,7 +1045,7 @@ CANNExecutionProvider::~CANNExecutionProvider() {
}
// All threads share the same context and stream
-Status CANNExecutionProvider::OnRunStart() {
+Status CANNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
CANN_RETURN_IF_ERROR(aclrtSetDevice(info_.device_id));
return Status::OK();
diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h
index 63ae980869c65..d83bd88d6958f 100644
--- a/onnxruntime/core/providers/cann/cann_execution_provider.h
+++ b/onnxruntime/core/providers/cann/cann_execution_provider.h
@@ -33,7 +33,7 @@ class CANNExecutionProvider : public IExecutionProvider {
explicit CANNExecutionProvider(const CANNExecutionProviderInfo& info);
virtual ~CANNExecutionProvider();
- Status OnRunStart() override;
+ Status OnRunStart(const onnxruntime::RunOptions& run_options) override;
template <typename T>
Status Fill(Tensor* y, void* addr, aclrtStream stream) const {
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index 813fdc54ecd0d..48e4617b33b4d 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -1035,6 +1035,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, float, IsNaN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, double, IsNaN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, MLFloat16, IsNaN);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Gelu);
#if !defined(DISABLE_FLOAT8_TYPES)
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FN, IsNaN);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Float8E4M3FNUZ, IsNaN);
@@ -2562,6 +2563,7 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 20, Gelu)>,
#if !defined(DISABLE_FLOAT8_TYPES)
BuildKernelCreateInfo,
diff --git a/onnxruntime/core/providers/cpu/tensor/gelu.cc b/onnxruntime/core/providers/cpu/tensor/gelu.cc
new file mode 100644
index 0000000000000..d55973eda180f
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/tensor/gelu.cc
@@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/common/common.h"
+#include "core/common/narrow.h"
+#include "core/framework/op_kernel.h"
+#include "core/util/math_cpuonly.h"
+#include "core/mlas/inc/mlas.h"
+
+#include "core/platform/threadpool.h"
+#include
+#include "core/providers/cpu/element_wise_ranged_transform.h"
+#include "core/providers/cpu/tensor/gelu.h"
+
+using onnxruntime::narrow;
+using namespace onnxruntime::common;
+
+namespace onnxruntime {
+
+// May revisit the implementations to support inplace computation, if needed.
+
+ONNX_CPU_OPERATOR_KERNEL(
+ Gelu,
+ 20,
+ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+ Gelu<float>);
+
+#ifndef DISABLE_CONTRIB_OPS
+namespace contrib {
+ONNX_OPERATOR_KERNEL_EX(
+ Gelu,
+ kMSDomain,
+ 1,
+ kCpuExecutionProvider,
+ KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+ Gelu<float>);
+}
+#endif
+
+template <typename T>
+Status Gelu<T>::Compute(OpKernelContext* context) const {
+ const Tensor* input = context->Input<Tensor>(0);
+ const T* input_data = input->Data<T>();
+
+ Tensor* output = context->Output(0, input->Shape());
+ T* output_data = output->MutableData<T>();
+
+ concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
+ int64_t elem_count = input->Shape().Size();
+ constexpr int64_t length_per_task = 4096; // this number comes from FastGelu.
+ int64_t task_count = (elem_count + length_per_task - 1) / length_per_task;
+
+ if (approximation_algorithm_ == "tanh") {
+ // FastGelu allows an optional bias. Here we split the input data into chunks. Each chunk
+ // has N elements (except the last chunk), and the thread pool processes the chunks in parallel.
+ // N = 4096 is selected based on performance test results on input shape 1x128x768.
+ // FastGelu uses approximation for Gelu. The formula is 0.5 * (1 + Tanh(x * (C * x * x + B))) * x.
+ static constexpr float B = 0.7978845608028654f; // sqrt(2.0 / M_PI)
+ static constexpr float C = 0.035677408136300125f; // 0.044715 * sqrt(2.0 / M_PI)
+
+ concurrency::ThreadPool::TryBatchParallelFor(
+ tp, static_cast<int32_t>(task_count),
+ [&](ptrdiff_t task_idx) {
+ const auto start = task_idx * length_per_task;
+ const T* p_input = input_data + start;
+ T* p_output = output_data + start;
+ int64_t count = std::min(length_per_task, elem_count - start);
+
+ for (int64_t i = 0; i < count; i++) {
+ T value = p_input[i];
+ p_output[i] = value * (static_cast<T>(C) * value * value + static_cast<T>(B));
+ }
+
+ MlasComputeTanh(p_output, p_output, narrow<size_t>(count));
+
+ for (int64_t i = 0; i < count; i++) {
+ p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f);
+ }
+ },
+ 0);
+ return Status::OK();
+ } else if (approximation_algorithm_ == "none") {
+ concurrency::ThreadPool::TryBatchParallelFor(
+ tp, static_cast<int32_t>(task_count),
+ [&](ptrdiff_t task_idx) {
+ const auto start = task_idx * length_per_task;
+ const T* p_input = input_data + start;
+ T* p_output = output_data + start;
+ int64_t count = std::min(length_per_task, elem_count - start);
+
+ for (int64_t i = 0; i < count; i++) {
+ T value = p_input[i];
+ p_output[i] = value * static_cast<T>(M_SQRT1_2);
+ }
+
+ MlasComputeErf(p_output, p_output, narrow<size_t>(count));
+
+ for (int64_t i = 0; i < count; i++) {
+ p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f);
+ }
+ },
+ 0);
+ return Status::OK();
+ }
+ return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported approximation_algorithm: ", approximation_algorithm_);
+}
+
+} // namespace onnxruntime
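As a sanity check on the two code paths in the kernel above, the following standalone snippet compares exact Gelu (the erf form used when approximate="none") against the tanh approximation (approximate="tanh"); the constants B and C mirror the ones in the kernel. This is illustrative only and not part of the patch.

```cpp
#include <cmath>
#include <cstdio>

// Exact Gelu: 0.5 * x * (1 + erf(x / sqrt(2)))
float GeluExact(float x) {
  constexpr float kInvSqrt2 = 0.70710678118654752440f;  // 1 / sqrt(2)
  return 0.5f * x * (1.0f + std::erf(x * kInvSqrt2));
}

// Tanh approximation: 0.5 * x * (1 + tanh(x * (C * x * x + B)))
float GeluTanh(float x) {
  constexpr float B = 0.7978845608028654f;    // sqrt(2 / pi)
  constexpr float C = 0.035677408136300125f;  // 0.044715 * sqrt(2 / pi)
  return 0.5f * x * (1.0f + std::tanh(x * (C * x * x + B)));
}

int main() {
  for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f}) {
    std::printf("x=%5.2f  exact=%9.6f  tanh=%9.6f\n", x, GeluExact(x), GeluTanh(x));
  }
  return 0;
}
```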
diff --git a/onnxruntime/core/providers/cpu/tensor/gelu.h b/onnxruntime/core/providers/cpu/tensor/gelu.h
new file mode 100644
index 0000000000000..13238028d878a
--- /dev/null
+++ b/onnxruntime/core/providers/cpu/tensor/gelu.h
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+namespace onnxruntime {
+
+template <typename T>
+class Gelu final : public OpKernel {
+ public:
+ explicit Gelu(const OpKernelInfo& info) : OpKernel(info) {
+ approximation_algorithm_ = info.GetAttrOrDefault<std::string>("approximate", "none");
+ }
+ Status Compute(OpKernelContext* ctx) const override;
+
+ private:
+ std::string approximation_algorithm_;
+};
+
+} // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 48a952e6dd98f..00783bcbc2665 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -386,7 +386,7 @@ Status CUDAExecutionProvider::Sync() const {
return Status::OK();
}
-Status CUDAExecutionProvider::OnRunStart() {
+Status CUDAExecutionProvider::OnRunStart(const onnxruntime::RunOptions& /*run_options*/) {
// always set CUDA device when session::Run() in case it runs in a worker thread
CUDA_RETURN_IF_ERROR(cudaSetDevice(GetDeviceId()));
if (IsGraphCaptureEnabled() && GetPerThreadContext().IsGraphCaptureAllowed() && !GetPerThreadContext().IsGraphCaptured()) {
@@ -396,7 +396,7 @@ Status CUDAExecutionProvider::OnRunStart() {
return Status::OK();
}
-Status CUDAExecutionProvider::OnRunEnd(bool sync_stream) {
+Status CUDAExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) {
if (IsGraphCaptureEnabled() && !GetPerThreadContext().IsGraphCaptured()) {
if (GetPerThreadContext().IsGraphCaptureAllowed()) {
GetPerThreadContext().CaptureEnd();
@@ -1256,6 +1256,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, double, LessOrEqual);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, MLFloat16, LessOrEqual);
class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, 17, ScatterElements);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, float, GridSample);
// Opset 17
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 17, float, LayerNormalization);
@@ -1329,6 +1330,11 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, S
class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 19, Shape);
#endif
+// Opset 20
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, Gelu);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu);
+
template <>
KernelCreateInfo BuildKernelCreateInfo<void>() {
return {};
@@ -2143,6 +2149,7 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+ BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 16, float, GridSample)>,
// Opset 17
BuildKernelCreateInfo,
@@ -2222,6 +2229,11 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo,
BuildKernelCreateInfo,
BuildKernelCreateInfo,
+
+ // Opset 20
+ BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, float, Gelu)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, double, Gelu)>,
+ BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 20, MLFloat16, Gelu)>,
#endif
};
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
index 55f0b5570e0ee..5f62f313b86a2 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -29,9 +29,9 @@ class CUDAExecutionProvider : public IExecutionProvider {
Status Sync() const override;
- Status OnRunStart() override;
+ Status OnRunStart(const onnxruntime::RunOptions& run_options) override;
- Status OnRunEnd(bool sync_stream) override;
+ Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override;
DataLayout GetPreferredLayout() const override;
@@ -115,6 +115,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy,
CUDAExecutionProviderExternalAllocatorInfo external_alloc_info, OrtArenaCfg* arena_cfg);
~PerThreadContext();
+ ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext);
cublasHandle_t CublasHandle() const {
return cublas_handle_;
diff --git a/onnxruntime/core/providers/cuda/cudnn_common.h b/onnxruntime/core/providers/cuda/cudnn_common.h
index fdd14dedad47e..2cbeb13696270 100644
--- a/onnxruntime/core/providers/cuda/cudnn_common.h
+++ b/onnxruntime/core/providers/cuda/cudnn_common.h
@@ -24,12 +24,12 @@ class CudnnTensor final {
operator cudnnTensorDescriptor_t() const { return tensor_; }
+ Status CreateTensorIfNeeded();
+
template <typename T>
static cudnnDataType_t GetDataType();
private:
- Status CreateTensorIfNeeded();
-
cudnnTensorDescriptor_t tensor_;
};
diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
index 99c1f48e21c74..b61b104790fe5 100644
--- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
+++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
@@ -9,40 +9,49 @@ namespace onnxruntime {
namespace cuda {
template <typename T>
-void CudnnRnnBase<T>::SetWeightBias(const cudnnHandle_t handle,
- const cudnnRNNDescriptor_t rnn_desc,
- const int pseudo_layer,
- const cudnnTensorDescriptor_t x_desc,
- const cudnnFilterDescriptor_t w_desc,
- const cudnnFilterDescriptor_t filter_desc,
- const void* reorganized_w_data,
- const int lin_layer_id,
- const T* pos,
- int& offset,
- bool is_matrix,
- cudaStream_t cuda_stream) const {
+Status CudnnRnnBase<T>::SetWeightBias(const cudnnHandle_t handle,
+ const cudnnRNNDescriptor_t rnn_desc,
+ const int pseudo_layer,
+ size_t reorganized_w_data_size,
+ const void* reorganized_w_data,
+ const int lin_layer_id,
+ const T* pos,
+ int& offset,
+ bool is_matrix,
+ cudaStream_t cuda_stream) const {
int numDims;
- std::vector<int> matDims(3);
+ std::array<int, 3> matDims;
+ std::array<int, 3> strideA;
cudnnDataType_t dt;
- cudnnTensorFormat_t tf;
T* mem_offset;
- if (is_matrix) {
- cudnnGetRNNLinLayerMatrixParams(handle, rnn_desc, pseudo_layer, x_desc, w_desc, reorganized_w_data, lin_layer_id, filter_desc, (void**)&mem_offset);
- } else {
- cudnnGetRNNLinLayerBiasParams(handle, rnn_desc, pseudo_layer, x_desc, w_desc, reorganized_w_data, lin_layer_id, filter_desc, (void**)&mem_offset);
- }
+ CudnnTensor tensor_desc_matrix, tensor_desc_bias;
+ ORT_RETURN_IF_ERROR(tensor_desc_bias.CreateTensorIfNeeded());
+ ORT_RETURN_IF_ERROR(tensor_desc_matrix.CreateTensorIfNeeded());
- cudnnGetFilterNdDescriptor(filter_desc, 3, &dt, &tf, &numDims, matDims.data());
+ T *mem_offset_matrix, *mem_offset_bias;
+ CUDNN_RETURN_IF_ERROR(cudnnGetRNNWeightParams(
+ handle, rnn_desc, pseudo_layer, reorganized_w_data_size, reorganized_w_data,
+ lin_layer_id, tensor_desc_matrix, (void**)&mem_offset_matrix, tensor_desc_bias, (void**)&mem_offset_bias));
+ CUDNN_RETURN_IF_ERROR(cudnnGetTensorNdDescriptor(
+ is_matrix ? tensor_desc_matrix : tensor_desc_bias, 3, &dt, &numDims, matDims.data(), strideA.data()));
+
+ mem_offset = is_matrix ? mem_offset_matrix : mem_offset_bias;
int count = matDims[0] * matDims[1] * matDims[2];
+
+ if (strideA[0] != count) {
+ return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::INVALID_ARGUMENT, "Stride is not packed");
+ }
CUDA_CALL_THROW(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, cuda_stream));
+
offset += count;
+
+ return Status::OK();
}
template <typename T>
Status CudnnRnnBase<T>::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle,
const cudnnRNNDescriptor_t rnn_desc,
- const cudnnTensorDescriptor_t x_desc,
- const cudnnFilterDescriptor_t w_desc,
+ size_t reorganized_w_data_size,
void* reorganized_w_data,
const T* W_data,
const T* R_data,
@@ -51,18 +60,22 @@ Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle,
int w_offset = 0;
int r_offset = 0;
int bias_offset = 0;
- CudnnFilterDescriptor filter_desc;
for (int layer = 0; layer < RNN_NUM_LAYERS * num_directions_; ++layer) {
for (size_t idx = 0; idx < W_lin_layer_id_.size(); ++idx) {
- SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, W_lin_layer_id_[idx], W_data, w_offset, true, cuda_stream);
+ ORT_RETURN_IF_ERROR(SetWeightBias(
+ cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data,
+ W_lin_layer_id_[idx], W_data, w_offset, true, cuda_stream));
if (B_data != nullptr) {
- SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, W_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream);
+ ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data,
+ W_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream));
}
}
for (size_t idx = 0; idx < R_lin_layer_id_.size(); ++idx) {
- SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, R_lin_layer_id_[idx], R_data, r_offset, true, cuda_stream);
+ ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data,
+ R_lin_layer_id_[idx], R_data, r_offset, true, cuda_stream));
if (B_data != nullptr) {
- SetWeightBias(cudnn_handle, rnn_desc, layer, x_desc, w_desc, filter_desc, reorganized_w_data, R_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream);
+ ORT_RETURN_IF_ERROR(SetWeightBias(cudnn_handle, rnn_desc, layer, reorganized_w_data_size, reorganized_w_data,
+ R_lin_layer_id_[idx], B_data, bias_offset, false, cuda_stream));
}
}
}
@@ -72,6 +85,7 @@ Status CudnnRnnBase::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle,
template <typename T>
Status CudnnRnnBase<T>::ReorganizeWeights(const Tensor* W, const Tensor* R, const Tensor* B,
+ size_t& reorganized_w_data_size_in_bytes,
IAllocatorUniquePtr<void>& reorganized_w_data,
CudnnFilterDescriptor& target_w_desc,
CudnnRNN& rnn_desc, onnxruntime::Stream* ort_stream) const {
@@ -91,19 +105,16 @@ Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, cons
TensorShapeVector dims_w({w_size, 1, 1});
ORT_RETURN_IF_ERROR(target_w_desc.Set(dims_w, CudnnTensor::GetDataType()));
- TensorShapeVector fake_dims_x({1, input_size, 1});
- CudnnTensor fake_x_desc;
- ORT_RETURN_IF_ERROR(fake_x_desc.Set(fake_dims_x, CudnnTensor::GetDataType()));
-
// Prepare the weight data
- reorganized_w_data = GetScratchBuffer<void>(w_size * sizeof(T), ort_stream);
+ reorganized_w_data_size_in_bytes = w_size * sizeof(T);
+ reorganized_w_data = GetScratchBuffer<void>(reorganized_w_data_size_in_bytes, ort_stream);
// In many cases, this allocation is bigger than needed, leaving part of
- // the buffer unintialized. non-zero garbage data leads to wrong result
+ // the buffer uninitialized. non-zero garbage data leads to wrong result
// in call to cudnnRNNForwardInference()
// TODO! refine allocation size for each case.
cudaStream_t cuda_stream = ort_stream ? static_cast<cudaStream_t>(ort_stream->GetHandle()) : nullptr;
- cudaMemsetAsync(reorganized_w_data.get(), 0, w_size * sizeof(T), cuda_stream);
+ CUDA_RETURN_IF_ERROR(cudaMemsetAsync(reorganized_w_data.get(), 0, reorganized_w_data_size_in_bytes, cuda_stream));
const T* W_data = W->Data();
const T* R_data = R->Data();
@@ -111,8 +122,9 @@ Status CudnnRnnBase::ReorganizeWeights(const Tensor* W, const Tensor* R, cons
auto* ort_cuda_stream = dynamic_cast<CudaStream*>(ort_stream);
cudnnHandle_t cudnn_handle = ort_cuda_stream ? ort_cuda_stream->cudnn_handle_ : DefaultCudnnHandle();
- ORT_RETURN_IF_ERROR(SetCudnnRnnWeightBias(cudnn_handle, rnn_desc, fake_x_desc, target_w_desc,
- reorganized_w_data.get(), W_data, R_data, B_data, cuda_stream));
+ ORT_RETURN_IF_ERROR(SetCudnnRnnWeightBias(cudnn_handle, rnn_desc,
+ reorganized_w_data_size_in_bytes, reorganized_w_data.get(),
+ W_data, R_data, B_data, cuda_stream));
return Status::OK();
}
@@ -128,22 +140,31 @@ Status CudnnRnnBase<T>::CacheCudnnRnnWeights(const OpKernelInfo& info) {
bool get_R = info.TryGetConstantInput(RNN_Input_Index::R, &R);
bool get_B = info.TryGetConstantInput(RNN_Input_Index::B, &B);
+ bool has_bias = B != nullptr;
+
if (get_W && get_R) {
CudnnRNN tmp_rnn_desc;
- ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(DefaultCudnnHandle(),
+ auto proj_size = hidden_size_;
+ ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(W->Shape()[2], // input_size
hidden_size_,
+ proj_size,
RNN_NUM_LAYERS,
cudnn_dropout_desc_,
cudnn_direction_mode_,
rnn_mode_,
- CudnnTensor::GetDataType(),
- GetDeviceProp()));
+ has_bias,
+ CudnnTensor::GetDataType()));
if (get_B) {
- ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, B, w_data_cache_, w_desc_cache_, tmp_rnn_desc, nullptr));
+ ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, B,
+ w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_,
+ tmp_rnn_desc, nullptr));
} else {
- ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, nullptr, w_data_cache_, w_desc_cache_, tmp_rnn_desc, nullptr));
+ ORT_RETURN_IF_ERROR(ReorganizeWeights(W, R, nullptr,
+ w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_,
+ tmp_rnn_desc, nullptr));
}
cudaStreamSynchronize(nullptr);
+
weight_cached_ = true;
}
@@ -158,17 +179,72 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
ORT_ENFORCE(nullptr != X);
// optional inputs
- const Tensor* sequence_lens = ctx->Input<Tensor>(RNN_Input_Index::sequence_lens); // [batch_size]
- const Tensor* initial_h = ctx->Input<Tensor>(RNN_Input_Index::initial_h); // initial hidden. [num_directions_, batch_size, hidden_size_]
+ // [batch_size]
+ const Tensor* sequence_lens = ctx->Input<Tensor>(RNN_Input_Index::sequence_lens);
+ // initial hidden. [num_directions_, batch_size, hidden_size_]
+ const Tensor* initial_h = ctx->Input<Tensor>(RNN_Input_Index::initial_h);
const Tensor* initial_c(nullptr);
if (rnn_mode_ == CUDNN_LSTM) {
- initial_c = ctx->Input<Tensor>(RNN_Input_Index::initial_c); // initial cell. [num_directions_, batch_size, hidden_size_]
+ // initial cell. [num_directions_, batch_size, hidden_size_]
+ initial_c = ctx->Input<Tensor>(RNN_Input_Index::initial_c);
}
+ size_t proj_size = hidden_size_;
int64_t seq_length = X->Shape()[0];
int64_t batch_size = X->Shape()[1];
int64_t input_size = X->Shape()[2];
+ // when sequence_lens is not provided, treat every sequence as full length and expand to [batch_size]
+ std::vector<int32_t> sequence_lengths_temp;
+ if (!sequence_lens) {
+ sequence_lengths_temp.resize(batch_size, gsl::narrow_cast<int32_t>(seq_length));
+ }
+
+ const int32_t* sequence_lens_data = (sequence_lens == nullptr)
+ ? sequence_lengths_temp.data()
+ : sequence_lens->Data<int32_t>();
+
+ // cuDNN doesn't support zero-length sequences inside the batch; find them and set their length to 1.
+ // A ZeroMask kernel later resets the result to 0 for those sequences.
+ int64_t zero_seq_count = 0;
+ std::vector<int32_t> zero_seq_index_cache(batch_size, 0);
+
+ CudaAsyncBuffer<int32_t> sequence_lens_buffer(this, batch_size);
+ int32_t* seq_len_array = sequence_lens_buffer.CpuPtr();
+
+ // 0-len sequences are not supported by cuDNN.
+ // Replace them by sequences of len 1 and mask them out with SetZeroSequences
+ for (int i = 0; i < batch_size; ++i) {
+ if (0 == sequence_lens_data[i]) {
+ seq_len_array[i] = 1;
+ zero_seq_index_cache[zero_seq_count] = i;
+ ++zero_seq_count;
+ } else {
+ seq_len_array[i] = sequence_lens_data[i];
+ }
+ }
+
+ // Calculate the zero position cache for reverse direction if it's bidirectional
+ // The cache is used for Y_h or Y_c, and for the first time step of Y; the other steps of Y don't
+ // need it since the zero-length sequences were bumped to length 1
+ if (zero_seq_count && num_directions_ > 1) {
+ zero_seq_index_cache.resize(zero_seq_count * num_directions_);
+ for (int64_t i = 0; i < zero_seq_count; ++i) {
+ zero_seq_index_cache[static_cast<size_t>(zero_seq_count) + i] =
+ static_cast<int32_t>(batch_size + zero_seq_index_cache[i]);
+ }
+ zero_seq_count *= num_directions_;
+ }
+
+ // Prior to cuDNN 8.9.1 the sequence lens buffer must be passed to cudnnRNNForward and thus it must
+ // be copied to the GPU always.
+ ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream()));
+ // Starting with cuDNN 8.9.1 the sequence lens buffer is ignored by cudnnRNNForward and thus it must
+ // be copied to the GPU only for the ReverseBySequence kernels.
+ // if (reverse_) {
+ // ORT_RETURN_IF_ERROR(sequence_lens_buffer.CopyToGpu(ctx->GetComputeStream()));
+ // }
+
// optional outputs
TensorShapeVector dims_Y({seq_length, num_directions_, batch_size, hidden_size_});
TensorShapeVector dims_hxy({RNN_NUM_LAYERS * num_directions_, batch_size, hidden_size_});
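To make the zero-length-sequence handling in the hunk above concrete, here is a small standalone rendition of the preprocessing: zero entries are bumped to 1 so cuDNN accepts them, their batch positions are recorded, and for a bidirectional run the recorded positions are duplicated with a batch_size offset so the reverse-direction outputs can be zeroed as well. Names are illustrative, not the kernel's real interface.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct ZeroSeqInfo {
  std::vector<int32_t> seq_len_array;         // lengths handed to cuDNN (no zeros)
  std::vector<int32_t> zero_seq_index_cache;  // output positions to zero afterwards
};

ZeroSeqInfo PrepareSequenceLengths(const std::vector<int32_t>& sequence_lens, int64_t num_directions) {
  const int64_t batch_size = static_cast<int64_t>(sequence_lens.size());
  ZeroSeqInfo info;
  info.seq_len_array.resize(batch_size);
  std::vector<int32_t> zero_index(batch_size, 0);
  int64_t zero_seq_count = 0;
  for (int64_t i = 0; i < batch_size; ++i) {
    if (sequence_lens[i] == 0) {
      info.seq_len_array[i] = 1;  // cuDNN rejects zero-length sequences
      zero_index[zero_seq_count++] = static_cast<int32_t>(i);
    } else {
      info.seq_len_array[i] = sequence_lens[i];
    }
  }
  zero_index.resize(zero_seq_count);
  if (zero_seq_count > 0 && num_directions > 1) {
    // Mirror the positions for the reverse direction, offset by batch_size.
    zero_index.resize(zero_seq_count * num_directions);
    for (int64_t i = 0; i < zero_seq_count; ++i) {
      zero_index[zero_seq_count + i] = static_cast<int32_t>(batch_size) + zero_index[i];
    }
  }
  info.zero_seq_index_cache = std::move(zero_index);
  return info;
}

int main() {
  auto info = PrepareSequenceLengths({3, 0, 2, 0}, /*num_directions=*/2);
  // seq_len_array -> 3 1 2 1; zero_seq_index_cache -> 1 3 5 7
  for (int32_t v : info.seq_len_array) std::printf("%d ", v);
  std::printf("| ");
  for (int32_t v : info.zero_seq_index_cache) std::printf("%d ", v);
  std::printf("\n");
  return 0;
}
```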
@@ -177,25 +253,6 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const {
Tensor* Y_h = ctx->Output(Output_Index::Y_h, dims_hxy);
Tensor* Y_c = ctx->Output(Output_Index::Y_c, dims_yc);
- std::vector<int64_t> dims_x({batch_size, input_size, 1});
- std::vector<int64_t> dims_y({batch_size, hidden_size_ * num_directions_, 1});
-
- CudnnTensor x_desc_temp;
- ORT_RETURN_IF_ERROR(x_desc_temp.Set(dims_x, CudnnTensor::GetDataType()));
- CudnnTensor y_desc_temp;
- ORT_RETURN_IF_ERROR(y_desc_temp.Set(dims_y, CudnnTensor::GetDataType()));
- std::vector x_desc(seq_length, x_desc_temp);
- std::vector y_desc(seq_length, y_desc_temp);
-
- CudnnTensor hx_desc;
- CudnnTensor cx_desc;
- CudnnTensor y_h_desc;
- CudnnTensor y_c_desc;
- ORT_RETURN_IF_ERROR(hx_desc.Set(dims_hxy, CudnnTensor::GetDataType()));
- ORT_RETURN_IF_ERROR(cx_desc.Set(dims_hxy, CudnnTensor::GetDataType()));
- ORT_RETURN_IF_ERROR(y_h_desc.Set(dims_hxy, CudnnTensor::GetDataType()));
- ORT_RETURN_IF_ERROR(y_c_desc.Set(dims_hxy, CudnnTensor::GetDataType()));
-
IAllocatorUniquePtr<T> x_reversed_data;
const T* x_data = X->Data<T>();
if (reverse_) {
@@ -203,6 +260,7 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const {
x_reversed_data = GetScratchBuffer<T>(seq_length * batch_size * input_size, ctx->GetComputeStream());
ReverseBySequence(Stream(ctx),
gsl::narrow_cast<int32_t>(seq_length),
+ sequence_lens_buffer.GpuPtr(),
gsl::narrow_cast<int32_t>(batch_size),
gsl::narrow_cast<int32_t>(input_size),
reinterpret_cast<const CudaT*>(x_data),
@@ -226,115 +284,82 @@ Status CudnnRnnBase::ComputeInternal(OpKernelContext* ctx) const {
y_data = y_alloc_data.get();
}
- const int32_t* sequence_lens_data = (sequence_lens == nullptr) ? nullptr : sequence_lens->Data<int32_t>();
+ const Tensor* B = ctx->Input<Tensor>(RNN_Input_Index::B);
+ bool has_bias = B != nullptr;
CudnnRNN rnn_desc;
- ORT_RETURN_IF_ERROR(rnn_desc.Set(GetCudnnHandle(ctx),
+ ORT_RETURN_IF_ERROR(rnn_desc.Set(input_size,
hidden_size_,
+ proj_size,
RNN_NUM_LAYERS,
cudnn_dropout_desc_,
cudnn_direction_mode_,
rnn_mode_,
- CudnnTensor::GetDataType(),
- GetDeviceProp()));
+ has_bias,
+ CudnnTensor::GetDataType()));
// Prepare the weight data
+ size_t w_data_size_in_bytes = 0;
IAllocatorUniquePtr<void> w_data;
CudnnFilterDescriptor w_desc;
if (!weight_cached_) {
const Tensor& W = *ctx->Input<Tensor>(RNN_Input_Index::W);
const Tensor& R = *ctx->Input<Tensor>(RNN_Input_Index::R);
const Tensor* B = ctx->Input<Tensor>(RNN_Input_Index::B);
- ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data, w_desc, rnn_desc, ctx->GetComputeStream()));
+ ORT_RETURN_IF_ERROR(ReorganizeWeights(&W, &R, B, w_data_size_in_bytes, w_data, w_desc,
+ rnn_desc, ctx->GetComputeStream()));
}
- // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences
- CUDNN_RETURN_IF_ERROR(cudnnSetRNNPaddingMode(rnn_desc, CUDNN_RNN_PADDED_IO_ENABLED));
+ CudnnDataTensor x_desc1;
+ ORT_RETURN_IF_ERROR(x_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size,
+ input_size, seq_len_array));
+ CudnnDataTensor y_desc1;
+ ORT_RETURN_IF_ERROR(y_desc1.Set(CudnnTensor::GetDataType(), seq_length, batch_size,
+ ((rnn_mode_ == CUDNN_LSTM) ? proj_size : hidden_size_) * num_directions_,
+ seq_len_array));
- size_t workspace_bytes;
- CUDNN_RETURN_IF_ERROR(cudnnGetRNNWorkspaceSize(GetCudnnHandle(ctx), rnn_desc, gsl::narrow_cast(seq_length), x_desc.data(), &workspace_bytes));
- auto workspace_cuda = GetScratchBuffer(workspace_bytes, ctx->GetComputeStream());
- int64_t zero_seq_count = 0;
- std::vector