From 7b1337140c44f3fbc0c48edaa677f4c8ecca9dad Mon Sep 17 00:00:00 2001 From: junpeiz Date: Mon, 16 Sep 2024 13:36:32 -0700 Subject: [PATCH] 8.0 Release (#2342) --- coremlpython/CoreMLPython.h | 3 +- coremlpython/CoreMLPython.mm | 44 +- coremltools/__init__.py | 34 +- coremltools/_deps/__init__.py | 26 +- coremltools/converters/mil/frontend/_utils.py | 27 +- .../mil/frontend/tensorflow/test/test_ops.py | 46 +- .../tensorflow2/test/test_v2_ops_tf_keras.py | 5 - .../mil/frontend/torch/converter.py | 113 +- .../mil/frontend/torch/exir_utils.py | 15 +- .../mil/frontend/torch/internal_graph.py | 46 +- .../converters/mil/frontend/torch/load.py | 43 +- .../converters/mil/frontend/torch/ops.py | 1302 ++++++--- .../mil/frontend/torch/quantization_ops.py | 273 +- .../torch/test/test_internal_graph.py | 3 +- .../torch/test/test_torch_conversion_api.py | 114 + .../test/test_torch_export_conversion_api.py | 270 +- .../test/test_torch_export_quantization.py | 31 +- .../mil/frontend/torch/test/test_torch_ops.py | 2602 ++++++++++------- .../torch/test/test_torch_quantization_ops.py | 465 ++- .../torch/test/test_torch_stateful_model.py | 37 +- .../mil/frontend/torch/test/testing_utils.py | 65 +- .../converters/mil/frontend/torch/utils.py | 29 +- coremltools/converters/mil/mil/operation.py | 4 +- .../mil/mil/ops/defs/iOS18/compression.py | 4 +- .../mil/ops/tests/iOS16/test_constexpr_ops.py | 31 + .../defs/cleanup/const_deduplication.py | 45 +- .../defs/optimize_elementwise_binary.py | 5 +- .../mil/passes/defs/optimize_quantization.py | 112 + .../mil/mil/passes/defs/quantization.py | 6 + .../mil/mil/passes/pass_pipeline.py | 2 + .../mil/passes/tests/test_cleanup_passes.py | 132 + .../passes/tests/test_quantization_passes.py | 112 + coremltools/converters/mil/testing_utils.py | 53 +- coremltools/models/_compiled_model.py | 39 +- coremltools/models/model.py | 62 +- coremltools/optimize/__init__.py | 4 +- .../optimize/coreml/_quantization_passes.py | 2 + .../_post_training_quantization.py | 44 +- .../torch/quantization/_backend_config.py | 39 +- .../optimize/torch/quantization/_utils.py | 17 +- .../optimize/torch/quantization/quantizer.py | 3 +- coremltools/test/api/test_api_visibilities.py | 2 + .../test/ml_program/test_compression.py | 28 +- .../test/modelpackage/test_modelpackage.py | 44 +- .../neural_network/test_compiled_model.py | 31 +- .../test/neural_network/test_tf_numeric.py | 5 - .../test/optimize/api/test_optimize_api.py | 40 + .../coreml/test_post_training_quantization.py | 122 +- .../torch/quantization/test_configure.py | 10 +- .../torch/quantization/test_quantizer.py | 15 +- .../optimize/torch/quantization/test_utils.py | 10 +- coremltools/version.py | 2 +- docs-guides/source/flexible-inputs.md | 12 + docs-guides/source/model-prediction.md | 11 + reqs/test.pip | 13 +- 55 files changed, 4797 insertions(+), 1857 deletions(-) diff --git a/coremlpython/CoreMLPython.h b/coremlpython/CoreMLPython.h index 6bd6554f5..a1735842d 100644 --- a/coremlpython/CoreMLPython.h +++ b/coremlpython/CoreMLPython.h @@ -57,7 +57,7 @@ namespace CoreML { Model(const Model&) = delete; Model& operator=(const Model&) = delete; ~Model(); - explicit Model(const std::string& urlStr, const std::string& computeUnits, const std::string& functionName); + explicit Model(const std::string& urlStr, const std::string& computeUnits, const std::string& functionName, const py::dict& optimizationHints); explicit Model(MLModel* m_model, NSURL* compiledUrl, bool deleteCompiledModelOnExit); py::list batchPredict(const py::list& batch) const; 
@@ -67,6 +67,7 @@ namespace CoreML { py::dict predict(const py::dict& input, State* state=NULL) const; #if BUILT_WITH_MACOS15_SDK + static void setOptimizationHints(MLModelConfiguration *configuration, const py::dict& optimizationHints); State newState() const; #endif diff --git a/coremlpython/CoreMLPython.mm b/coremlpython/CoreMLPython.mm index f818f4985..0bd060d4a 100644 --- a/coremlpython/CoreMLPython.mm +++ b/coremlpython/CoreMLPython.mm @@ -42,7 +42,12 @@ bool usingMacOS13OrHigher() { } } -Model::Model(const std::string& urlStr, const std::string& computeUnits, const std::string& functionName) { +Model::Model( + const std::string& urlStr, + const std::string& computeUnits, + const std::string& functionName, + const py::dict& optimizationHints + ) { @autoreleasepool { NSError *error = nil; @@ -80,6 +85,10 @@ bool usingMacOS13OrHigher() { MLModelConfiguration *configuration = [MLModelConfiguration new]; setComputeUnit(configuration, computeUnits); +#if BUILT_WITH_MACOS15_SDK + setOptimizationHints(configuration, optimizationHints); +#endif + if (!functionName.empty()) { #if BUILT_WITH_MACOS15_SDK configuration.functionName = [NSString stringWithUTF8String:functionName.c_str()]; @@ -148,6 +157,37 @@ bool usingMacOS13OrHigher() { } +#if BUILT_WITH_MACOS15_SDK +void Model::setOptimizationHints(MLModelConfiguration *configuration, const py::dict& optimizationHints) { + // This function does minimal validation. It assumes Python layer has already validated. + + // Reshape frequency optimization hint + if (optimizationHints.contains("reshapeFrequency")) { + const std::string val = optimizationHints["reshapeFrequency"].cast(); + if (val == "Frequent") { + configuration.optimizationHints.reshapeFrequency = MLReshapeFrequencyHintFrequent; + } else { + assert(val == "Infrequent"); + configuration.optimizationHints.reshapeFrequency = MLReshapeFrequencyHintInfrequent; + } + } + + // Specialization strategy optimization hint + if (optimizationHints.contains("specializationStrategy")) { + const std::string val = optimizationHints["specializationStrategy"].cast(); + if (val == "Default") { + configuration.optimizationHints.specializationStrategy = MLSpecializationStrategyDefault; + } else { + assert(val == "FastPrediction"); + configuration.optimizationHints.specializationStrategy = MLSpecializationStrategyFastPrediction; + } + } + + +} +#endif + + py::list Model::batchPredict(const py::list& batch) const { @autoreleasepool { NSError* error = nil; @@ -237,7 +277,7 @@ bool usingMacOS13OrHigher() { py::module m("libcoremlpython", "CoreML.Framework Python bindings"); py::class_(m, "_MLModelProxy") - .def(py::init()) + .def(py::init()) .def("predict", &Model::predict) .def("batchPredict", &Model::batchPredict) .def("get_compiled_model_path", &Model::getCompiledModelPath) diff --git a/coremltools/__init__.py b/coremltools/__init__.py index db16e8bf6..4af9f558d 100644 --- a/coremltools/__init__.py +++ b/coremltools/__init__.py @@ -72,11 +72,35 @@ class ComputeUnit(_Enum): ''' The set of processing-unit configurations the model can use to make predictions. ''' - ALL = 1 # Allows the model to use all compute units available, including the neural engine - CPU_AND_GPU = 2 # Allows the model to use both the CPU and GPU, but not the neural engine - CPU_ONLY = 3 # Limit the model to only use the CPU - CPU_AND_NE = 4 # Allows the model to use both the CPU and neural engine, but not the GPU. - # Only available on macOS >= 13.0 + ALL = 1 # Allows model to use all compute units available, including the neural engine. 
+ CPU_AND_GPU = 2 # Allows model to use both the CPU and GPU, but not the neural engine. + CPU_ONLY = 3 # Limits model to only use the CPU. + CPU_AND_NE = 4 # Allows model to use both the CPU and neural engine, but not the GPU. + # Only available on macOS >= 13.0 + + +class ReshapeFrequency(_Enum): + ''' + https://developer.apple.com/documentation/coreml/mlreshapefrequencyhint?language=objc + ''' + Frequent = 1 + Infrequent = 2 + + +class SpecializationStrategy(_Enum): + ''' + The optimization strategy for the model specialization. + + https://developer.apple.com/documentation/coreml/mlspecializationstrategy?language=objc + ''' + + # The strategy that works well for most applications. + Default = 1 + + # Prefer the prediction latency at the potential cost of specialization time, memory footprint, + # and the disk space usage of specialized artifacts. + FastPrediction = 2 + # A dictionary that maps the CoreML model specification version to the MLProgram/MIL opset string _OPSET = { diff --git a/coremltools/_deps/__init__.py b/coremltools/_deps/__init__.py index f8f0f28db..8e9d0b8ec 100644 --- a/coremltools/_deps/__init__.py +++ b/coremltools/_deps/__init__.py @@ -153,18 +153,33 @@ def __get_sklearn_version(version): # --------------------------------------------------------------------------------------- _HAS_TORCH = True -_TORCH_MAX_VERSION = "2.3.0" +_TORCH_MAX_VERSION = "2.4.0" _HAS_TORCH_EXPORT_API = False +_CT_OPTIMIZE_TORCH_MIN_VERSION = "2.1.0" +_IMPORT_CT_OPTIMIZE_TORCH = False try: import torch _warn_if_above_max_supported_version("Torch", torch.__version__, _TORCH_MAX_VERSION) - if _get_version(torch.__version__) >= Version("2.1.0"): + torch_version = _get_version(torch.__version__) + + if torch_version >= Version("2.1.0"): _HAS_TORCH_EXPORT_API = True + if torch_version >= Version(_CT_OPTIMIZE_TORCH_MIN_VERSION): + _IMPORT_CT_OPTIMIZE_TORCH = True + else: + logger.warning( + ( + f"Minimum required torch version for importing coremltools.optimize.torch is {_CT_OPTIMIZE_TORCH_MIN_VERSION}. " + f"Got torch version {torch_version}." + ) + ) + except: _HAS_TORCH = False MSG_TORCH_NOT_FOUND = "PyTorch not found." +MSG_TORCH_EXPORT_API_NOT_FOUND = "Torch.Export API not found." _HAS_TORCH_VISION = True @@ -189,6 +204,13 @@ def __get_sklearn_version(version): _HAS_EXECUTORCH = False MSG_EXECUTORCH_NOT_FOUND = "Executorch not found." +_HAS_TORCHAO = True +try: + import torchao +except: + _HAS_TORCHAO = False +MSG_TORCHAO_NOT_FOUND = "Torchao not found." 
+ # --------------------------------------------------------------------------------------- try: import scipy diff --git a/coremltools/converters/mil/frontend/_utils.py b/coremltools/converters/mil/frontend/_utils.py index 4da82cee3..6ed160392 100644 --- a/coremltools/converters/mil/frontend/_utils.py +++ b/coremltools/converters/mil/frontend/_utils.py @@ -513,7 +513,13 @@ def _concat_dims(dims, none_if_empty=False): def _decompose_scaled_dot_product_attention( - q: Var, k: Var, v: Var, mask: Var, name: str, before_op: Optional[Operation] = None + q: Var, + k: Var, + v: Var, + mask: Var, + name: str, + scale: Optional[Var] = None, + before_op: Optional[Operation] = None, ) -> Var: # scale the query input embed_size = q.shape[-1] @@ -524,9 +530,12 @@ def _decompose_scaled_dot_product_attention( ) q, k, v = promote_input_dtypes([q, k, v]) - multiplicative_scale_factor = 1 / math.sqrt(embed_size) - if types.builtin_to_string(q.dtype) == "fp16": - multiplicative_scale_factor = np.float16(multiplicative_scale_factor) + if scale is None: + multiplicative_scale_factor = 1 / math.sqrt(embed_size) + if types.builtin_to_string(q.dtype) == "fp16": + multiplicative_scale_factor = np.float16(multiplicative_scale_factor) + else: + multiplicative_scale_factor = scale q = mb.mul(x=q, y=multiplicative_scale_factor, before_op=before_op) # multiply query and key input tensors @@ -583,6 +592,11 @@ def _construct_constexpr_dequant_op( scale = np.squeeze(scale) if isinstance(zero_point, (np.ndarray, np.generic)): zero_point = np.squeeze(zero_point) + if len(scale.shape) > 1 or len(zero_point.shape) > 1: + raise ValueError( + "The more fine-grained quantization (such as blockwise) is only supported since iOS18." + "Please set minimum_deployment_target to iOS18 for using it." + ) kwargs = { "quantized_data": quantized_weights, @@ -631,7 +645,10 @@ def _construct_constexpr_dequant_op( } if zero_point is not None and np.any(zero_point): # Only pass the offset parameter when not all elements in `zero_point` are zeroes. - zero_point = zero_point.reshape(scale.shape).astype(quantized_weights.dtype) + zero_point = zero_point.reshape(scale.shape) + # When zero_point is integer, it's required to have the same dtype as the quantized weight. + if np.issubdtype(zero_point.dtype, np.integer): + zero_point = zero_point.astype(quantized_weights.dtype) kwargs["offset"] = zero_point if name is not None: kwargs["name"] = name diff --git a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py index f100ac901..d5351a8bc 100644 --- a/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py +++ b/coremltools/converters/mil/frontend/tensorflow/test/test_ops.py @@ -2622,15 +2622,6 @@ def test_ios17_resize_bilinear_dynamic_shape( target_shape, align_corners, ): - if ( - backend == ("mlprogram", "fp16") - and input_shape == (2, 5, 2, 3) - and target_shape == (20, 60) - ): - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) - """ Since iOS17, dynamic shape is supported by lowering to `resize` MIL op. """ @@ -2732,15 +2723,6 @@ def test_ios17_resize_nearest_neighbor_dynamic_shape( input_shape, target_shape, ): - if ( - backend == ("mlprogram", "fp16") - and input_shape == (2, 5, 2, 3) - and target_shape == (20, 60) - ): - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) - """ Since iOS17, dynamic shape is supported by lowering to `resize` MIL op. 
""" @@ -5706,10 +5688,8 @@ def test_sort(self, compute_unit, backend, rank, dynamic): """ tf.sort dispatches to tf.math.top_k, and k = size of the axis to be sorted """ - if backend[0] == "mlprogram" and dynamic: - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) + if platform.machine() == "x86_64" and dynamic: + pytest.xfail("rdar://135843153 ([Bug] Models failed on x86_64 platform)") # Here we test the conversion of tf.sort(x, axis=0) # If dynamic, we prepend None to x shape as the dynamic shape axis @@ -6720,7 +6700,6 @@ def build_model(x): def test_programmatic( self, compute_unit, backend, input_block_rank, dynamic_input, dynamic_paddings ): - input_rank, block_rank = input_block_rank # generate data @@ -6733,6 +6712,9 @@ def test_programmatic( if block_shape[0] == 1: pytest.skip("neuralnetwork backend doesn't support unity block shape.") + if input_block_rank == (4, 1) and dynamic_input and not dynamic_paddings: + pytest.xfail("rdar://133558007 shape deduction failure") + paddings = [] for i in range(block_rank): while True: @@ -6832,14 +6814,12 @@ def test_programmatic( self, compute_unit, backend, input_block_rank, dynamic_input, dynamic_crops ): if ( - backend == ("mlprogram", "fp16") - and input_block_rank == (3, 1) or (3,2) + platform.machine() == "x86_64" + and input_block_rank == (3, 1) and dynamic_input and not dynamic_crops ): - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) + pytest.xfail("rdar://135843153 ([Bug] Models failed on x86_64 platform)") input_rank, block_rank = input_block_rank @@ -6939,16 +6919,6 @@ def test_smoke_new_op( input_shape, block_shape, crops = shape_block_crops crops = np.array(crops, dtype=np.int32) - if ( - backend == ("mlprogram", "fp16") - and shape_block_crops == [(4, 4, 6, 1), [1, 2], [[2, 1], [3, 3]]] - and dynamic_input - and not dynamic_crops - ): - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) - # The neuralnetwork backend doesn't support these tests if backend[0] == "neuralnetwork": return diff --git a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py index 20ab10d01..ec41a251c 100644 --- a/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py +++ b/coremltools/converters/mil/frontend/tensorflow2/test/test_v2_ops_tf_keras.py @@ -1389,11 +1389,6 @@ def test_lstm_time_distributed_dense(self, compute_unit, backend): "compute_unit, backend", itertools.product(compute_units, backends) ) def test_lstm_dynamic_batch(self, compute_unit, backend): - if backend == ("mlprogram", "fp16"): - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) - input_shape = (1, 1280) inp = tf.keras.layers.Input(shape=input_shape) out, hn, cn = tf.keras.layers.LSTM(512, diff --git a/coremltools/converters/mil/frontend/torch/converter.py b/coremltools/converters/mil/frontend/torch/converter.py index de94511d3..b5cd8277e 100644 --- a/coremltools/converters/mil/frontend/torch/converter.py +++ b/coremltools/converters/mil/frontend/torch/converter.py @@ -29,6 +29,7 @@ from coremltools.optimize.coreml import _utils as optimize_utils from coremltools.optimize.coreml._quantization_passes import prune_weights +from .exir_utils import WRAPPED_SCALAR_INPUT_SUFFIX from .internal_graph import InternalTorchIRGraph, InternalTorchIRNode from .ops 
import convert_nodes from .quantization_ops import _dequantized_weight @@ -41,7 +42,13 @@ remove_getattr_nodes, transform_inplace_ops, ) -from .utils import NUM_TO_NUMPY_DTYPE, TORCH_DTYPE_TO_MIL_DTYPE, TORCH_DTYPE_TO_NUM, TorchFrontend +from .utils import ( + NUM_TO_NUMPY_DTYPE, + TORCH_DTYPE_TO_MIL_DTYPE, + TORCH_DTYPE_TO_NUM, + TORCH_EXPORT_BASED_FRONTENDS, + TorchFrontend, +) if _HAS_TORCH_EXPORT_API: from torch.export import ExportedProgram @@ -329,8 +336,8 @@ def prepare_for_conversion(self, node: InternalTorchIRNode) -> None: state feeds into only one ``read_state`` op. """ - # EXIR has nothing to prepare - if self.frontend == TorchFrontend.EXIR: + # Only torch script needs to prepare + if self.frontend != TorchFrontend.TORCHSCRIPT: return for val in node.inputs: @@ -431,7 +438,7 @@ def process_inplace_op(self, node: InternalTorchIRNode) -> None: } """ - assert self.frontend != TorchFrontend.EXIR, "EXIR has no in-place op" + assert self.frontend == TorchFrontend.TORCHSCRIPT, "Only torch script has no in-place op" if len(node.inputs) == 0: return @@ -475,7 +482,11 @@ def __getitem__(self, torch_name: str) -> Var: def __contains__(self, torch_name): """Returns whether or not the torch var exist in context.""" - return torch_name in self._current_graph[-1] + for idx in reversed(range(len(self._current_graph))): + current_graph = self._current_graph[idx] + if torch_name in current_graph: + return True + return False def push(self, inputs=None): """ @@ -594,10 +605,19 @@ def __init__( p(self.graph) elif _HAS_TORCH_EXPORT_API and isinstance(loaded_model, ExportedProgram): - self.context = TranscriptionContext(frontend=TorchFrontend.EXIR) + if loaded_model.dialect == "ATEN": + frontend = TorchFrontend.TORCHEXPORT + elif loaded_model.dialect == "EDGE": + frontend = TorchFrontend.EXECUTORCH + else: + raise NotImplementedError( + "Conversion for models with only ATEN or EDGE dialect is supported/tested. " + f"Provided Dialect: {loaded_model.dialect}" + ) + self.context = TranscriptionContext(frontend=frontend) self.graph = InternalTorchIRGraph.from_exir(exir=loaded_model) # For iOS 18+, create states for all mutable buffers - if self.opset_version >= _target.iOS18: + if self.opset_version is not None and self.opset_version >= _target.iOS18: self.states = [] for name, tensor in self.graph.buffers.items(): dtype = NUM_TO_NUMPY_DTYPE[TORCH_DTYPE_TO_NUM[tensor.dtype]] @@ -640,6 +660,18 @@ def __init__( self.param_to_compression_info = self._construct_compression_info( state_dict() if callable(state_dict) else state_dict ) + if self.context.frontend in TORCH_EXPORT_BASED_FRONTENDS: + # For EXIR, all param names are lifted as input names (in the format of `argx_x`), so we need to + # change names accordingly to make sure the compression info could be found later. 
+ for ( + arg_name, + param_name, + ) in loaded_model.graph_signature.inputs_to_parameters.items(): + if param_name in self.param_to_compression_info: + self.param_to_compression_info[arg_name] = self.param_to_compression_info[ + param_name + ] + del self.param_to_compression_info[param_name] def _validate_states(self) -> None: """ @@ -780,7 +812,7 @@ def _construct_compression_info( """ compression_info = dict() for torch_key_name in state_dict.keys(): - if torch_key_name == f"{_COMPRESSION_INFO_PREFIX}/metadata_version": + if f"{_COMPRESSION_INFO_PREFIX}/metadata_version" in torch_key_name: # TODO: rdar://124707382 ([Compression] Support versioning in CompressionInfo) continue @@ -1189,15 +1221,15 @@ def convert_const(self) -> None: ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=scope_name), ): self._add_const(name, val) - elif self.context.frontend == TorchFrontend.EXIR: - # ExecuTorch has constants lifted as inputs, yet we have not sorted out + elif self.context.frontend in TORCH_EXPORT_BASED_FRONTENDS: + # Torch.Export has constants lifted as inputs, yet we have not sorted out # how to support IO metadata, so for now just put a dummy metadata # since inputs/constants will not contribute to debugging/profiling # TODO (rdar://125572392): Support torch.export IO metadata - with mb.scope( - ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=[None]), - ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None]), - ): + scopes = [ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=[None])] + if self.context.frontend == TorchFrontend.EXECUTORCH: + scopes.append(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None])) + with mb.scope(*scopes): self._add_const(name, val) else: raise ValueError(f"Invalid PyTorch frontend {self.context.frontend}") @@ -1249,7 +1281,7 @@ def convert(self) -> Program: ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=torch_name), ): input_var = mb.cast(x=input_var, dtype="fp32") - elif self.context.frontend == TorchFrontend.EXIR: + elif self.context.frontend in TORCH_EXPORT_BASED_FRONTENDS: # EXIR has dtypes all determined, so for now we just stick to EXIR dtypes # TODO (rdar://115845792): Handle fp16 IO dtypes # When handle user provided IO dtypes, we will also need to handle IO metadata @@ -1261,21 +1293,41 @@ def convert(self) -> Program: raise ValueError( "To use fp16 input, please set minimum deployment target to iOS16+" ) + # Torch.export may produce scalar input, + # which then gets wrapped as rank-1 size-1 tensor for Core ML residency + # during our internal graph construction. 
+ # Here we squeeze it back to scalar + if torch_name.endswith(WRAPPED_SCALAR_INPUT_SUFFIX): + torch_name = torch_name[: -len(WRAPPED_SCALAR_INPUT_SUFFIX)] + scopes = [ + ScopeInfo( + source=ScopeSource.EXIR_STACK_TRACE, + data=f"unwrap_scalar_input_{torch_name}", + ) + ] + if self.context.frontend == TorchFrontend.EXECUTORCH: + scopes.append( + ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None]) + ) + with mb.scope(*scopes): + input_var = mb.squeeze(x=input_var, name=torch_name) else: raise ValueError(f"Invalid PyTorch frontend {self.context.frontend}") self.context.add(input_var, torch_name=torch_name) # EXIR lifts buffer references as inputs, so we need to create them by reading states - if self.context.frontend == TorchFrontend.EXIR: + if self.context.frontend in TORCH_EXPORT_BASED_FRONTENDS: for ( input_name, buffer_name, ) in self.context.torch_graph.input_name_to_source_buffer_name.items(): buffer_var = self.context[buffer_name] - with mb.scope( - ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=f"read_{buffer_name}"), - ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None]), - ): + scopes = [ + ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=f"read_{buffer_name}") + ] + if self.context.frontend == TorchFrontend.EXECUTORCH: + scopes.append(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None])) + with mb.scope(*scopes): input_var = mb.read_state(input=buffer_var) # As of iOS 18, Core ML state can only be fp16 # In torch converter, we convert everything under fp32 @@ -1295,17 +1347,19 @@ def convert(self) -> Program: # EXIR represents stateful execution as buffer mutation at output, # i.e. buffer.copy_(...) at the end of EXIR program, # so analogously we update state at the end of pymil function - if self.context.frontend == TorchFrontend.EXIR: + if self.context.frontend in TORCH_EXPORT_BASED_FRONTENDS: for ( output_name, buffer_name, ) in self.context.torch_graph.output_name_to_target_buffer_name.items(): output_var = self.context[output_name] buffer_var = self.context[buffer_name] - with mb.scope( - ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=f"write_{buffer_name}"), - ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None]), - ): + scopes = [ + ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=f"write_{buffer_name}") + ] + if self.context.frontend == TorchFrontend.EXECUTORCH: + scopes.append(ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[None])) + with mb.scope(*scopes): cast_value = mb.cast( x=output_var, dtype=builtin_to_string(buffer_var.dtype) ) @@ -1350,11 +1404,10 @@ def convert(self) -> Program: ScopeSource.TORCHSCRIPT_MODULE_NAME, ScopeSource.TORCHSCRIPT_MODULE_TYPE, ] - elif self.context.frontend == TorchFrontend.EXIR: - essential_scope_sources = [ - ScopeSource.EXIR_STACK_TRACE, - ScopeSource.EXIR_DEBUG_HANDLE, - ] + elif self.context.frontend in TORCH_EXPORT_BASED_FRONTENDS: + essential_scope_sources = [ScopeSource.EXIR_STACK_TRACE] + if self.context.frontend == TorchFrontend.EXECUTORCH: + essential_scope_sources.append(ScopeSource.EXIR_DEBUG_HANDLE) else: raise ValueError(f"Invalid PyTorch frontend {self.context.frontend}") prog._add_essential_scope_source(essential_scope_sources) diff --git a/coremltools/converters/mil/frontend/torch/exir_utils.py b/coremltools/converters/mil/frontend/torch/exir_utils.py index 810e5ac57..7dd1d0999 100644 --- a/coremltools/converters/mil/frontend/torch/exir_utils.py +++ b/coremltools/converters/mil/frontend/torch/exir_utils.py @@ -15,6 +15,8 @@ from .utils import TORCH_DTYPE_TO_MIL_DTYPE 
+WRAPPED_SCALAR_INPUT_SUFFIX = "_wrapped_as_tensor_for_coreml" + def _map_sympy_number_to_int(sympy_number: sympy.core.numbers.Number) -> int: MAX_DIM = 2147483647 @@ -25,7 +27,6 @@ def _map_sympy_number_to_int(sympy_number: sympy.core.numbers.Number) -> int: def _construct_ct_range_dim_from_torch_value_ranges( - symbol_name: str, value_ranges, # torch.utils._sympy.value_ranges.ValueRanges ) -> RangeDim: if value_ranges.is_bool: @@ -33,7 +34,7 @@ def _construct_ct_range_dim_from_torch_value_ranges( lower = _map_sympy_number_to_int(value_ranges.lower) upper = _map_sympy_number_to_int(value_ranges.upper) - return RangeDim(lower_bound=lower, upper_bound=upper, symbol=symbol_name) + return RangeDim(lower_bound=lower, upper_bound=upper) def _construct_symbol_name_to_ct_range_dim_dict( @@ -43,7 +44,7 @@ def _construct_symbol_name_to_ct_range_dim_dict( for symbol, value_ranges in exported_program.range_constraints.items(): symbol_name = str(symbol) symbol_name_to_ct_range_dim[symbol_name] = _construct_ct_range_dim_from_torch_value_ranges( - symbol_name, value_ranges + value_ranges ) return symbol_name_to_ct_range_dim @@ -69,6 +70,14 @@ def _construct_ct_tensor_type_from_torch( else: shape.append(int(size)) + if len(shape) == 0: + shape = [1] + logger.warning( + "Core ML does not support scalar input, " + f"so {name} has been wrapped as rank-1 size-1 tensor" + ) + name = name + WRAPPED_SCALAR_INPUT_SUFFIX + return TensorType(name=name, dtype=coreml_dtype, shape=shape) diff --git a/coremltools/converters/mil/frontend/torch/internal_graph.py b/coremltools/converters/mil/frontend/torch/internal_graph.py index ea121f71a..129ecd2a8 100644 --- a/coremltools/converters/mil/frontend/torch/internal_graph.py +++ b/coremltools/converters/mil/frontend/torch/internal_graph.py @@ -13,21 +13,25 @@ from coremltools.converters.mil.input_types import TensorType from .exir_utils import extract_io_from_exir_program +from .torch_op_registry import _TORCH_OPS_REGISTRY from .torchscript_utils import _expand_and_optimize_ir from .utils import TORCH_DTYPE_TO_NUM, sanitize_op_kind -def _make_ssa_name(name: str) -> str: +def _make_ssa_name(name: Optional[Union[str, int]]) -> str: """ Converts a symbol name (string) into an SSA name, by prepending '%'. + If the name is a parameter value (int), directly printing it without prepending '%'. Only used for pretty printing the graph. """ if name is None: return "None" + if type(name) is int: + return str(name) return "%" + name -def _ssa_name_list(names: List[str]) -> List[str]: +def _ssa_name_list(names: List[Optional[Union[str, int]]]) -> List[str]: """ Take a list of symbol names (strings) and return them as SSA names. Only used for pretty printing the graph. @@ -161,6 +165,7 @@ def __init__( kind: str, inputs: List[str], outputs: List[str], + kwinputs: Optional[Dict[str, str]] = None, name: Optional[str] = None, parent: Optional[Union["InternalTorchIRGraph", "InternalTorchIRBlock"]] = None, attr: Optional[Dict[str, Any]] = None, @@ -174,6 +179,7 @@ def __init__( kind: the kind (op) of the node. inputs: list of input symbols. outputs: list of output symbols. + kwinputs: dict of keyword input symbols. parent: The InternalTorchIRGraph/Block this node belongs to. attr: dict of named attributes. blocks: list of InternalTorchIRBlock. 
@@ -188,6 +194,7 @@ def __init__( self.kind = kind self.inputs = inputs self.outputs = outputs + self.kwinputs = kwinputs self.parent = parent self.attr = attr if attr is not None else {"value": None} self.blocks = blocks if blocks is not None else [] @@ -233,14 +240,14 @@ def from_torchscript_node(cls, node, parent): @classmethod def from_exir_node(cls, node): - def get_arguments(alist): + def get_arguments(alist: List) -> Tuple: args = [] for i in alist: if isinstance(i, torch.fx.Node): args.append(i.name) elif isinstance(i, torch.fx.immutable_collections.immutable_list): args.append(get_arguments(i)) - elif isinstance(i, (int, float)): + elif isinstance(i, (int, float, str)): args.append(i) # This is necessitated by backward compatibility: # * TorchScript used to store dtype as integers/enums @@ -251,17 +258,21 @@ def get_arguments(alist): # to leverage the existing TorchScript converter infra elif isinstance(i, torch.dtype): args.append(TORCH_DTYPE_TO_NUM[i]) + elif ( + isinstance(i, torch.device) + or isinstance(i, torch.layout) + or isinstance(i, torch.memory_format) + ): + # PyMIL graph does not care about these things + pass elif i is None: args.append(None) else: - raise AssertionError(f"Unhandled type of the node: {type(i)}") + raise AssertionError( + f"Unhandled node type {type(i)}. Node content is: {str(i)}" + ) return tuple(args) - # TODO (rdar://128768037) handle kwargs - inputs = get_arguments(node.args) - # TODO: rdar://115846125 ([Executorch] Handle Models/Layers with Multiple outputs) - outputs = [node.name] - try: kind = node.target.name() except: @@ -270,6 +281,20 @@ def get_arguments(alist): else: kind = str(node.target) kind = sanitize_op_kind(kind) + if not kind in _TORCH_OPS_REGISTRY: + raise ValueError(f"Unsupported fx node {str(node)}, kind {kind}") + + # TODO (rdar://134015126) handle kwargs + inputs = get_arguments(node.args) + # TODO: rdar://115846125 ([Executorch] Handle Models/Layers with Multiple outputs) + outputs = [node.name] + + kwinputs = {} + for keyword, arg in node.kwargs.items(): + if arg is not None: + kwinputs[keyword] = get_arguments([arg]) + if len(kwinputs) == 0: + kwinputs = None name = node.name return cls( @@ -277,6 +302,7 @@ def get_arguments(alist): kind=kind, inputs=inputs, outputs=outputs, + kwinputs=kwinputs, parent=None, attr=None, blocks=None, diff --git a/coremltools/converters/mil/frontend/torch/load.py b/coremltools/converters/mil/frontend/torch/load.py index eda38fe28..e84877a67 100644 --- a/coremltools/converters/mil/frontend/torch/load.py +++ b/coremltools/converters/mil/frontend/torch/load.py @@ -10,16 +10,20 @@ from torch.jit._script import RecursiveScriptModule from coremltools import _logger as logger -from coremltools._deps import _HAS_TORCH_EXPORT_API +from coremltools._deps import _HAS_EXECUTORCH, _HAS_TORCH_EXPORT_API from coremltools.converters.mil.frontend.torch.converter import TorchConverter from coremltools.converters.mil.input_types import StateType, TensorType from coremltools.converters.mil.mil.program import Program from .converter import TorchConverter +from .utils import TorchFrontend if _HAS_TORCH_EXPORT_API: from torch.export import ExportedProgram +if _HAS_EXECUTORCH: + import executorch.exir + def load( spec: Union[RecursiveScriptModule, "ExportedProgram", str], @@ -108,16 +112,43 @@ def _torchscript_from_spec(model_spec: Union[str, RecursiveScriptModule]) -> Rec elif isinstance(model_spec, _torch.jit.ScriptModule): return model_spec - elif _HAS_TORCH_EXPORT_API and isinstance(model_spec, ExportedProgram): 
- return model_spec + else: raise TypeError( - "A PyTorch model must either be a .pt or .pth file, or a TorchScript object. Received: {}".format( - type(model_spec) - ) + "A PyTorch model must either be a .pt or .pth file, or a TorchScript object. " + f"Received: {type(model_spec)}" ) +if _HAS_TORCH_EXPORT_API: + + def _torchexport_from_spec( + model_spec: Union[str, ExportedProgram], + frontend=TorchFrontend.TORCHEXPORT, + ) -> ExportedProgram: + # Load torch.export serialization + if isinstance(model_spec, str) and model_spec.endswith(".pt2"): + filename = _os_path.abspath(model_spec) + try: + model = _torch.export.load(filename) + except Exception as e: + logger.error( + "\n\nERROR - Could not load the PyTorch model. Got the following error:\n" + ) + raise e + elif isinstance(model_spec, ExportedProgram): + model = model_spec + else: + raise TypeError( + "A PyTorch model must either be a .pt2 file, or an ExportedProgram object. " + f"Received: {type(model_spec)}" + ) + # To edge if edge dialect is desired + if frontend == TorchFrontend.EXECUTORCH and model.dialect != "EDGE": + model = executorch.exir.to_edge(model).exported_program() + return model + + def _perform_torch_convert(converter: TorchConverter, debug: bool) -> Program: try: prog = converter.convert() diff --git a/coremltools/converters/mil/frontend/torch/ops.py b/coremltools/converters/mil/frontend/torch/ops.py index 1047bfc4d..e38bc0473 100644 --- a/coremltools/converters/mil/frontend/torch/ops.py +++ b/coremltools/converters/mil/frontend/torch/ops.py @@ -8,7 +8,7 @@ import numbers import re from collections.abc import Iterable -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as _np import numpy as np @@ -42,6 +42,7 @@ NUM_TO_TORCH_DTYPE, NUMPY_DTYPE_TO_TORCH_NUM, TORCH_DTYPE_TO_NUM, + TORCH_EXPORT_BASED_FRONTENDS, TYPE_TO_DTYPE_STRING, TorchFrontend, dtype_to_32bit, @@ -56,6 +57,22 @@ VALUE_CLOSE_TO_INFINITY = 1e+38 +TORCH_STRING_ARGS = { + # conv padding + "same", + "valid", + + # meshgrid indexing + "ij", + "xy", + + # pad mode + "circular", + "constant", + "reflect", + "replicate", +} + def _all_outputs_present(context, graph): """ @@ -127,11 +144,16 @@ def convert_single_node(context: TranscriptionContext, node: InternalTorchIRNode ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_TYPE, data=scope_type), ScopeInfo(source=ScopeSource.TORCHSCRIPT_MODULE_NAME, data=scope_name), ] - elif context.frontend == TorchFrontend.EXIR: + elif context.frontend in TORCH_EXPORT_BASED_FRONTENDS: scopes = [ - ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=[node.meta.get("stack_trace")]), - ScopeInfo(source=ScopeSource.EXIR_DEBUG_HANDLE, data=[node.meta.get("debug_handle")]), + ScopeInfo(source=ScopeSource.EXIR_STACK_TRACE, data=[node.meta.get("stack_trace")]) ] + if context.frontend == TorchFrontend.EXECUTORCH: + scopes.append( + ScopeInfo( + source=ScopeSource.EXIR_DEBUG_HANDLE, data=[node.meta.get("debug_handle")] + ) + ) else: raise ValueError(f"Invalid PyTorch frontend {context.frontend}") @@ -180,6 +202,40 @@ def _assert_torch_dtype_num_is_not_complex_number(num): "This op does not support complex number dtype." +def _get_bindings(context, alist) -> List[Var]: + """ + This utility is needed in order to handle following cases: + With EXIR, + - Some of the inputs can be literals (like axis, perms) and thus can be of types: list, int etc. 
+ - An Input Parameter of an op could be a list/tuple similar to our concat layer + """ + results = [] + + for i in alist: + if isinstance(i, str): + if i in context: + results.append(context[i]) + elif i in TORCH_STRING_ARGS: + results.append(i) + else: + raise ValueError( + f"Binding {i} is neither a name of exisitng var in context, " + "nor a torch string argument" + ) + elif isinstance(i, (list, tuple)) and all(isinstance(j, int) for j in i): + results.append(mb.const(val=i)) + elif isinstance(i, (list, tuple)): + results.append(_get_bindings(context, i)) + elif isinstance(i, (int, float)): + results.append(mb.const(val=i)) + elif i is None: + results.append(None) + else: + raise NotImplementedError(f"Binding of inputs of type {type(i)} not handled yet") + + return results + + def _get_inputs( context, node, @@ -192,49 +248,21 @@ def _get_inputs( value of @expected. """ - def get_bindings(alist) -> List[Any]: - """ - This utility is needed in order to handle following cases: - With EXIR, - - Some of the inputs can be literals (like axis, perms) and thus can be of types: list, int etc. - - An Input Parameter of an op could be a list/tuple similar to our concat layer - """ - results = [] - - for i in alist: - if isinstance(i, str): - results.append(context[i]) - elif isinstance(i, (list, tuple)) and all(isinstance(j, int) for j in i): - results.append(mb.const(val=i)) - elif isinstance(i, (list, tuple)): - results.append(get_bindings(i)) - elif isinstance(i, (int, float)): - results.append(mb.const(val=i)) - elif i is None: - results.append(None) - else: - raise NotImplementedError(f"Binding of inputs of type {type(i)} not handled yet") - - return results - def check_if_number_of_inputs_expected(num_inputs: int, expected: Union[int, List, Tuple]) -> None: expected = [expected] if isinstance(expected, int) else expected if num_inputs not in expected: raise ValueError( - "node {} ({}) got {} input(s), expected {}".format( - node.name, node.kind, num_inputs, expected - ) + f"node {node.name} ({node.kind}) got {num_inputs} input(s), expected {expected}" ) def check_if_number_of_inputs_more_than_min_expected(num_inputs: int, min_expected: int) -> None: if num_inputs < min_expected: raise ValueError( - "node {} ({}) got {} input(s), expected minimum {} inputs".format( - node.name, node.kind, num_inputs, min_expected - ) + f"node {node.name} ({node.kind}) got {num_inputs} input(s), " + f"expected minimum {min_expected} inputs" ) - inputs = get_bindings(node.inputs) + inputs = _get_bindings(context, node.inputs) if expected is not None: if isinstance(expected, dict): @@ -253,6 +281,17 @@ def check_if_number_of_inputs_more_than_min_expected(num_inputs: int, min_expect return inputs +def _get_kwinputs(context, node, keyword: str, default: Optional[List[Var]] = None) -> List[Var]: + if node.kwinputs is None: + return default + else: + bindings = node.kwinputs.get(keyword) + if bindings is None: + return default + else: + return _get_bindings(context, bindings) + + def _list_select(shape_var, index): """ Sometimes we need to select a specific item from a list. 
If that item @@ -337,7 +376,7 @@ def _construct_constant(val, name): @register_torch_op def native_dropout(context, node): - if context.frontend == TorchFrontend.EXIR: + if context.frontend in TORCH_EXPORT_BASED_FRONTENDS: inputs = _get_inputs(context, node, min_expected=2) context.add((inputs[0],), node.name) else: @@ -825,7 +864,7 @@ def gt(context, node): context.add(greater) -@register_torch_op(torch_alias=["t", "numpy_t", "transpose.int"]) +@register_torch_op(torch_alias=["t", "numpy_t"]) def transpose(context, node): assert len(node.outputs) == 1 inputs = _get_inputs(context, node) @@ -976,76 +1015,203 @@ def linear(context, node): context.add(res, torch_name=node.name) -@register_torch_op(torch_alias=["conv2d", "convolution"]) +@register_torch_op( + torch_alias=[ + "convolution", + "conv1d", + "conv2d", + "conv3d", + "conv1d.padding", + "conv2d.padding", + "conv3d.padding", + "conv_transpose1d", + "conv_transpose2d.input", + "conv_transpose3d.input", + ] +) def _convolution(context, node): - inputs = _get_inputs(context, node) - - x = inputs[0] - # PyTorch and MIL has same weight layout - # Conv: [Cout, Cin, *D] - # ConvTranspose: [Cin, Cout, *D] - weight = inputs[1] - bias = inputs[2] - strides = inputs[3] - - x, weight = promote_input_dtypes([x, weight]) - - # Expand padding. Torch accepts either an int (for all dimensions) or an n-tuple of ints (one per dimension), but - # we require a (2 * n)-tuple, where n is the number of spatial dimensions, start and end for each spatial dimension - pad = inputs[4].val - - if len(weight.shape) in (3, 4): - # 1D and 2D: Need to explicitly state L-R, T-B pad - pad = _np.repeat(pad, 2) - elif len(weight.shape) == 5: - # 3D: Need to explicitly state F-Bk, L-R, T-B pad - if type(pad) == int: - pad = _np.repeat(pad, 6) - elif len(pad) == 3: - pad = _np.repeat(pad, 2) - else: - raise ValueError( - "Invalid weight dimension. Must be 3, 4, or 5 for 1D, 2D, or 3D convolution, respectively." 
+ default_torch_padding = "valid" if node.kind.endswith(".padding") else 0 + + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs( + context, + node, + min_expected={ + TorchFrontend.TORCHSCRIPT: 7, + TorchFrontend.TORCHEXPORT: 2, + TorchFrontend.EXECUTORCH: 2, + }, ) + nargs = len(inputs) - dilations = inputs[5] - out_pad = None - if len(inputs) >= 9: - transposed = inputs[6].val - out_pad = inputs[7].val - group = inputs[8] - elif len(inputs) == 7: - transposed = False - group = inputs[6] - else: - raise ValueError( - "unexpected number of inputs for node {} ({}): {}".format( - node.name, node.kind, len(inputs) - ) - ) + x = inputs[0] + # PyTorch and MIL has same weight layout + # Conv: [Cout, Cin, *D] + # ConvTranspose: [Cin, Cout, *D] + weight = inputs[1] + x, weight = promote_input_dtypes([x, weight]) + + bias = inputs[2] if nargs > 2 else None + stride = inputs[3] if nargs > 3 else 1 + padding = inputs[4] if nargs > 4 else default_torch_padding + + if node.kind in ("_convolution", "convolution"): + dilation = inputs[5] if nargs > 5 else 1 + transposed = inputs[6].val if nargs > 6 else False + out_padding = inputs[7] if nargs > 7 else 0 + groups = inputs[8] if nargs > 8 else 1 + elif re.match(r"conv_transpose[123]d.*", node.kind): + out_padding = inputs[5] if nargs > 5 else 0 + groups = inputs[6] if nargs > 6 else 1 + dilation = inputs[7] if nargs > 7 else 1 + transposed = True + else: + dilation = inputs[5] if nargs > 5 else 1 + groups = inputs[6] if nargs > 6 else 1 + transposed = False + out_padding = 0 + + return x, weight, bias, stride, padding, dilation, groups, transposed, out_padding + + def _parse_keyword_args( + context, node, bias, stride, padding, dilation, groups, out_padding + ) -> Tuple[Var]: + # Only torch.export may have kwargs + if context.frontend != TorchFrontend.TORCHEXPORT: + return bias, stride, padding, dilation, groups, out_padding + + bias = _get_kwinputs(context, node, "bias", default=[bias])[0] + stride = _get_kwinputs(context, node, "stride", default=[stride])[0] + padding = _get_kwinputs(context, node, "padding", default=[padding])[0] + dilation = _get_kwinputs(context, node, "dilation", default=[dilation])[0] + groups = _get_kwinputs(context, node, "groups", default=[groups])[0] + out_padding = _get_kwinputs(context, node, "out_padding", default=[out_padding])[0] + + return bias, stride, padding, dilation, groups, out_padding + + def _translate_torch_args(node, weight, stride, padding, dilation, groups, out_padding): + spatial_rank = weight.rank - 2 + + # Core ML strides comes from torch stride + if isinstance(stride, Var): + stride = stride.val + assert stride is not None, "torch conv stride must be constant" + # Torch stride is an int (for all spatial dims) or an n-tuple of ints (one per spatial dim) + # Core ML requires an n-tuple + if isinstance(stride, int) or len(stride) == 1: + strides = _np.array([np.squeeze(stride)] * spatial_rank) + else: + strides = stride + # 1 is Core ML default value, so using None is preferred + if _np.all(strides == 1): + strides = None + + # Core ML pad_type and pad come from torch padding + # For torch conv op .padding variants, torch padding is a string, + # with possible values ("valid", "same") + if node.kind.endswith(".padding"): + pad_type = padding + if isinstance(pad_type, Var): + assert pad_type.val is not None + pad_type = pad_type.val + assert pad_type in ("valid", "same") + # Core ML pad is None for pad_type "valid" / "same" + pad = None + # For other torch conv op variants, torch 
padding is + # an int (for all spatial dims) or an n-tuple of ints (one per spatial dim) + else: + if isinstance(padding, Var): + padding = padding.val + assert padding is not None, "torch conv padding must be constant" + # Core ML requires a (2 * n)-tuple, start and end for each spatial dim + if isinstance(padding, int) or len(padding) == 1: + pad = _np.array([np.squeeze(padding)] * (2 * spatial_rank)) + else: + assert len(padding) == spatial_rank + pad = _np.repeat(padding, 2) + # Create Core ML pad_type according to Core ML pad + if _np.all(pad == 0): + pad_type = "valid" + # 0 is Core ML default value, so using None is preferred + pad = None + else: + pad_type = "custom" + + # Core ML dilations comes from torch dilation + if isinstance(dilation, Var): + dilation = dilation.val + assert dilation is not None, "torch conv dilation must be constant" + # Torch dilation is an int (for all spatial dims) or an n-tuple of ints (one per spatial dim) + # Core ML requires an n-tuple + if isinstance(dilation, int) or len(dilation) == 1: + dilations = _np.array([np.squeeze(dilation)] * spatial_rank) + else: + dilations = dilation + # 1 is Core ML default value, so using None is preferred + if _np.all(dilations == 1): + dilations = None + + # Core ML groups is torch groups + if isinstance(groups, Var): + groups = groups.val + assert groups is not None, "torch conv groups must be constant" + # 1 is Core ML default value, so using None is preferred + if groups == 1: + groups = None + + if isinstance(out_padding, Var): + out_padding = out_padding.val + assert out_padding is not None, "torch out_padding must be constant" + # 0 is Core ML default value, so using None is preferred + if _np.all(out_padding == 0): + out_padding = None + + return strides, pad_type, pad, dilations, groups, out_padding + + ( + x, + weight, + bias, + stride, + padding, + dilation, + groups, + transposed, + out_padding, + ) = _parse_positional_args(context, node) + bias, stride, padding, dilation, groups, out_padding = _parse_keyword_args( + context, node, bias, stride, padding, dilation, groups, out_padding + ) + strides, pad_type, pad, dilations, groups, out_padding = _translate_torch_args( + node, weight, stride, padding, dilation, groups, out_padding + ) kwargs = { "x": x, "weight": weight, - "strides": strides, - "pad_type": "custom", - "pad": pad, - "dilations": dilations, - "groups": group, + "pad_type": pad_type, "name": node.name, } - # Bias is optional in PyTorch's convolution. if bias is not None: kwargs["bias"] = bias + if pad_type == "custom": + kwargs["pad"] = pad + if strides is not None: + kwargs["strides"] = strides + if dilations is not None: + kwargs["dilations"] = dilations + if groups is not None: + kwargs["groups"] = groups if transposed is True: + pad_len = 2 * (weight.rank - 2) # Transposed convolution # Handle output_padding using pre-pad or post-crop - pre_pad = [0] * len(pad) - post_crop = [0] * len(pad) + pre_pad = [0] * pad_len + post_crop = [0] * pad_len - if out_pad is not None and any(out_pad): - output_padding = [0] * len(pad) + if out_padding is not None and any(out_padding): + output_padding = [0] * pad_len # output padding adds additional padding on one of the side of dimension # i.e. bottom from top-bottom, # right from left-right @@ -1054,16 +1220,14 @@ def _convolution(context, node): # mapping output_padding to simplify further processing! 
# # For ConvTranspose2d: [bottom, right] -> [0, b, 0, r] - output_padding = [ - 0 if i % 2 == 0 else out_pad[i // 2] for i in range(len(pad)) - ] + output_padding = [0 if i % 2 == 0 else out_padding[i // 2] for i in range(pad_len)] if sum(pad) == 0 and any(output_padding): raise ValueError( "ConvTranspose configuration of padding=0 and output_padding > 0 not supported!" ) post_crop = pad.copy() pad *= 0 - for i in range(0, len(pad)): + for i in range(0, pad_len): if post_crop[i] >= output_padding[i]: post_crop[i] -= output_padding[i] else: @@ -1273,9 +1437,20 @@ def relu6(context, node): @register_torch_op def einsum(context, node): - vars = context[node.inputs[1]] - vars = promote_input_dtypes(vars) - equation = context[node.inputs[0]].val + if context.frontend == TorchFrontend.TORCHSCRIPT: + vars = context[node.inputs[1]] + vars = promote_input_dtypes(vars) + equation = context[node.inputs[0]].val + else: + equation = node.inputs[0] + if isinstance(equation, str) and equation in context: + equation = context[equation].val + tensor_names = node.inputs[1] + if isinstance(tensor_names, str) and tensor_names in context: + vars = context[tensor_names] + else: + assert isinstance(tensor_names, tuple) + vars = [context[tensor_name] for tensor_name in tensor_names] x = build_einsum_mil(vars, equation, node.name) context.add(x) @@ -1412,7 +1587,18 @@ def _calculate_pool_output_size(in_dim, kernel, stride, pad_l, pad_r, ceil_mode) return new_pad -def _max_pool(context, node, inputs): +@register_torch_op( + torch_alias=[ + "max_pool2d", + "max_pool3d", + "max_pool1d_with_indices", + "max_pool2d_with_indices", + "max_pool3d_with_indices", + ] +) +def max_pool1d(context, node): + inputs = _get_inputs(context, node, min_expected=3) + x = inputs[0] kernel_sizes = inputs[1] strides = inputs[2] @@ -1447,31 +1633,13 @@ def _max_pool(context, node, inputs): ceil_mode=ceil_mode if spatial_rank <= 2 else False, ) - if node.kind == "max_pool2d_with_indices": + if re.match(r"max_pool[123]d_with_indices", node.kind): # TODO(rdar://117038432) ([Executorch] Handle/Bind other outputs of `max_pool2d_with_indices` op during lowering) context.add((pool, None), torch_name=node.name) else: context.add(pool) -@register_torch_op -def max_pool1d(context, node): - inputs = _get_inputs(context, node, expected=6) - _max_pool(context, node, inputs) - - -@register_torch_op(torch_alias=["max_pool2d_with_indices"]) -def max_pool2d(context, node): - inputs = _get_inputs(context, node, min_expected=3) - _max_pool(context, node, inputs) - - -@register_torch_op -def max_pool3d(context, node): - inputs = _get_inputs(context, node, expected=6) - _max_pool(context, node, inputs) - - @register_torch_op def minimum(context, node): inputs = _get_inputs(context, node, expected=2) @@ -1606,34 +1774,41 @@ def sub(context, node): ] ) def mean(context, node): - inputs = _get_inputs(context, node, min_expected=1) + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=1) + nargs = len(inputs) + + x = inputs[0] + dim = inputs[1] if nargs > 1 else None + keepdim = inputs[2] if nargs > 2 else False + return x, dim, keepdim + + x, dim, keepdim = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if keepdim == False: + keepdim = _get_kwinputs(context, node, "keepdim", default=[keepdim])[0] - x = inputs[0] if types.is_bool(x.dtype): # TODO: In the future when MIL op supports bool, we need to use curr_opset_version to decide 
# if we want to cast or not. x = mb.cast(x=x, dtype="fp32") kwargs = {"x": x, "name": node.name} - # @axes is optional, so omit if None. - axes = None if len(inputs) < 2 else inputs[1] - if axes is not None: - # @axes needs to be a list, but if only one axis was specified in the - # model, it will be constructed as an int. Construct a new constant as a - # list. - if not isinstance(axes.val, _np.ndarray): - axes = mb.const(val=[axes.val], name=axes.name + "_list") - context.add(axes) + # torch dim means Core ML axes + if dim is not None: + # Core ML axes needs to be a list, but if only one dim was specified in torch, + # it will be constructed as an int, so we construct a new constant as a list + if not isinstance(dim.val, _np.ndarray): + axes = mb.const(val=[dim.val], name=dim.name + "_list") + else: + axes = dim.val kwargs["axes"] = axes - # @keep_dims is optional. - if len(inputs) >= 3: - keep_dims = inputs[2] - kwargs["keep_dims"] = keep_dims + # torch keepdim means Core ML keep_dims + if keepdim != False: + kwargs["keep_dims"] = keepdim - # Last input to mean is an optional output tensor. We always expect this to - # be None or absent. - assert len(inputs) <= 3 or inputs[3] is None if node.kind == "sum": res = mb.reduce_sum(**kwargs) elif node.kind == "logsumexp": @@ -1665,7 +1840,7 @@ def unsqueeze(context, node): context.add(unsqueeze) -@register_torch_op(torch_alias=["sym_size.int"]) +@register_torch_op(torch_alias=["sym_size"]) def size(context, node): inputs = _get_inputs(context, node, expected=[1, 2]) x = inputs[0] @@ -1692,7 +1867,7 @@ def _shape_as_tensor(context, node): context.add(shape_node, node.name) -@register_torch_op(torch_alias=["view_copy", "reshape"]) +@register_torch_op(torch_alias=["view_copy", "_unsafe_view", "reshape"]) def view(context, node): inputs = _get_inputs(context, node, expected=2) x = inputs[0] @@ -1727,28 +1902,54 @@ def view(context, node): context.add(view) -@register_torch_op(torch_alias=['constant_pad_nd']) +@register_torch_op(torch_alias=["constant_pad_nd"]) def pad(context, node): - inputs = _get_inputs(context, node) - x = inputs[0] + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs( + context, + node, + expected={TorchFrontend.TORCHSCRIPT: [3, 4]}, + min_expected={TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2}, + ) + nargs = len(inputs) + if context.frontend == TorchFrontend.TORCHSCRIPT: + assert (node.kind == "pad") == (nargs == 4) + assert (node.kind == "constant_pad_nd") == (nargs == 3) - pad = inputs[1] - if pad.val is not None: - pad = pad.val.reshape((-1, 2))[::-1].reshape(-1).tolist() - missing_dims = x.rank - (len(pad) // 2) - pad = [0, 0] * missing_dims + pad + x = inputs[0] + pad = inputs[1] + if pad.val is not None: + pad = pad.val.reshape((-1, 2))[::-1].reshape(-1).tolist() + missing_dims = x.rank - (len(pad) // 2) + pad = [0, 0] * missing_dims + pad + + if node.kind == "pad": + mode = "constant" + if nargs > 2: + if isinstance(inputs[2], str): + mode = inputs[2] + else: + if isinstance(inputs[2], Var) and inputs[2].val is not None: + mode = inputs[2].val + else: + raise ValueError( + "if pad mode is specified, then it must either be a string, " + "or a constant pymil variable" + ) + assert mode in ("circular", "constant", "reflect", "replicate") + scalar_val = inputs[3] if nargs > 3 else 0.0 + else: + mode = "constant" + scalar_val = inputs[2] if nargs > 2 else 0.0 + if scalar_val is None: + scalar_val = 0.0 + elif isinstance(scalar_val, Var): + assert scalar_val.val is not None + 
scalar_val = float(scalar_val.val) - if len(inputs) == 4: - mode = inputs[2].val - assert mode in ('constant', 'reflect', 'replicate') - val_index = 3 - else: - mode = 'constant' - val_index = 2 + return x, pad, mode, scalar_val - scalar_val = inputs[val_index] if inputs[val_index] else 0.0 - if inputs[val_index] and inputs[val_index].op.op_type == "const": - scalar_val = float(scalar_val.val) + x, pad, mode, scalar_val = _parse_positional_args(context, node) if types.is_complex(x.dtype): real, imag = (mb.pad(x=x, pad=pad, mode=mode, constant_val=scalar_val, name=node.name) for x in (mb.complex_real(data=x), mb.complex_imag(data=x))) @@ -2036,12 +2237,36 @@ def instance_norm(context, node): @register_torch_op def group_norm(context, node): - inputs = _get_inputs(context, node, expected=6) - x = inputs[0] - num_groups = inputs[1].val - weight = inputs[2] - bias = inputs[3] - eps = inputs[4] + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs( + context, + node, + expected={TorchFrontend.TORCHSCRIPT: 6}, + min_expected={TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2}, + ) + nargs = len(inputs) + + x = inputs[0] + num_groups = inputs[1].val + + weight = inputs[2] if nargs > 2 else None + bias = inputs[3] if nargs > 3 else None + eps = inputs[4].val if nargs > 4 else 1e-5 + + return x, num_groups, weight, bias, eps + + def _parse_keyword_args(context, node, weight, bias) -> Tuple[Var]: + # Only torch.export may have kwargs + if context.frontend != TorchFrontend.TORCHEXPORT: + return weight, bias + + weight = _get_kwinputs(context, node, "weight", default=[weight])[0] + bias = _get_kwinputs(context, node, "bias", default=[bias])[0] + return weight, bias + + x, num_groups, weight, bias, eps = _parse_positional_args(context, node) + weight, bias = _parse_keyword_args(context, node, weight, bias) + n,c = x.shape[0],x.shape[1] # at minimum (N, C) required num_groups = builtins.min(num_groups,c) new_shape = [n, num_groups, c//num_groups] @@ -2062,9 +2287,9 @@ def group_norm(context, node): x = mb.reshape(x=x, shape=new_shape) mean = mb.reduce_mean(x=x, axes=axes_, keep_dims=True) - var = _std(x,axes_,True,False,eps.val) - x = mb.sub(x=x,y=mean) - x = mb.real_div(x=x,y=var) + var = _std(x, axes_, True, False, eps) + x = mb.sub(x=x, y=mean) + x = mb.real_div(x=x, y=var) x = mb.reshape(x=x, shape=input_shape) if weight is not None: weight = mb.reshape(x=weight, shape=weight_shape) @@ -2121,37 +2346,59 @@ def cat(context, node): def is_tensor_empty(var: Var) -> bool: return np.any([size == 0 for size in var.shape]) - inputs = _get_inputs(context, node, min_expected=1) + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=1) + nargs = len(inputs) - xs = inputs[0] - # PyTorch can have empty tensor, which is then ignored - # However, CoreML does not allow such empty tensor, so remove them now - if np.any([is_tensor_empty(x) for x in xs]): - xs = [x for x in xs if not is_tensor_empty(x)] + xs = inputs[0] + # PyTorch can have empty tensor, which is then ignored + # However, CoreML does not allow such empty tensor, so remove them now + if np.any([is_tensor_empty(x) for x in xs]): + xs = [x for x in xs if not is_tensor_empty(x)] - axis = 0 if len(inputs) == 1 else inputs[1] + dim = inputs[1] if nargs > 1 else 0 - concat = mb.concat( - values=promote_input_dtypes(xs), axis=axis, name=node.name - ) + return xs, dim + + def _parse_keyword_args(context, node, dim) -> Var: + # Only torch.export may have kwargs + if 
context.frontend != TorchFrontend.TORCHEXPORT: + return dim + + dim = _get_kwinputs(context, node, "dim", default=[dim])[0] + return dim + + xs, dim = _parse_positional_args(context, node) + dim = _parse_keyword_args(context, node, dim) + + concat = mb.concat(values=promote_input_dtypes(xs), axis=dim, name=node.name) context.add(concat) @register_torch_op def stack(context, node): - inputs = _get_inputs(context, node) + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=1) + nargs = len(inputs) - values = inputs[0] + tensors = inputs[0] - if len(inputs) < 2: - axis = 0 - else: - axis = inputs[1] + dim = inputs[1] if nargs > 1 else 0 - if len(values) == 1: - res = mb.expand_dims(x=values[0], axes=[axis.val], name=node.name) + return tensors, dim + + tensors, dim = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if dim == 0: + dim = _get_kwinputs(context, node, "dim", default=[dim])[0] + if isinstance(dim, Var): + dim = dim.val + + if len(tensors) == 1: + res = mb.expand_dims(x=tensors[0], axes=[dim], name=node.name) else: - res = mb.stack(values=values, axis=axis, name=node.name) + res = mb.stack(values=tensors, axis=dim, name=node.name) context.add(res) @@ -2225,16 +2472,26 @@ def _int(context, node): @register_torch_op(torch_alias=["native_layer_norm"]) def layer_norm(context, node): - inputs = _get_inputs(context, node, min_expected=5) - _input = inputs[0] - normalized_shape = inputs[1] - weight = inputs[2] - bias = inputs[3] - eps = inputs[4] - # cudnn_enable = inputs[5] unused + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs( + context, + node, + expected={TorchFrontend.TORCHSCRIPT: [5, 6]}, + min_expected={TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2}, + ) + nargs = len(inputs) + + x, normalized_shape = inputs[:2] + + weight = inputs[2] if nargs > 2 else None + bias = inputs[3] if nargs > 3 else None + eps = inputs[4] if nargs > 4 else None + return x, normalized_shape, weight, bias, eps + + x, normalized_shape, weight, bias, eps = _parse_positional_args(context, node) layer_norm = mb.layer_norm( - x=_input, + x=x, axes=list(range(-len(normalized_shape.val), 0)), gamma=weight, beta=bias, @@ -3075,7 +3332,7 @@ def upsample_linear1d(context, node): context.add(x) -@register_torch_op +@register_torch_op(torch_alias=["upsample_bilinear2d.vec"]) def upsample_bilinear2d(context, node): inputs = _get_inputs(context, node) _input = inputs[0] @@ -3200,7 +3457,7 @@ def upsample_nearest1d(context, node): context.add(x) -@register_torch_op +@register_torch_op(torch_alias=["upsample_nearest2d.vec"]) def upsample_nearest2d(context, node): inputs = _get_inputs(context, node) _input = inputs[0] @@ -3211,6 +3468,7 @@ def upsample_nearest2d(context, node): if ( scale_factors is not None + and isinstance(scale_factors, Var) and scale_factors.val is not None and scale_factors.rank == 1 and scale_factors.shape[0] == 2 @@ -3219,6 +3477,10 @@ def upsample_nearest2d(context, node): scale_factors = scale_factors.val scales_h = scale_factors[0] scales_w = scale_factors[1] + elif scale_factors is not None and isinstance(scale_factors, list) and len(scale_factors) == 2: + # get scale factors from provided inputs + scales_h = scale_factors[0] + scales_w = scale_factors[1] elif ( isinstance(output_size, list) and output_size[0].val is None @@ -3531,7 +3793,7 @@ def _false_path(): context.add(output_var, torch_name=output_name) 
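
The converters rewritten above (group_norm, cat, stack, layer_norm) all follow the same two-step argument handling: parse positional args with a frontend-dependent arity check, then let torch.export keyword arguments override the defaults via _get_kwinputs. A minimal sketch of that pattern, assuming the same module context and helpers (_get_inputs, _get_kwinputs, register_torch_op, TorchFrontend) as ops.py; the op name example_op and its alpha argument are hypothetical, not ops added by this patch:

    @register_torch_op
    def example_op(context, node):
        def _parse_positional_args(context, node) -> Tuple[Var]:
            # TorchScript serializes every positional arg, so the count is exact;
            # torch.export may drop trailing defaults, so only a lower bound is checked.
            inputs = _get_inputs(
                context,
                node,
                expected={TorchFrontend.TORCHSCRIPT: 2},
                min_expected={TorchFrontend.TORCHEXPORT: 1, TorchFrontend.EXECUTORCH: 1},
            )
            nargs = len(inputs)
            x = inputs[0]
            alpha = inputs[1] if nargs > 1 else None
            return x, alpha

        def _parse_keyword_args(context, node, alpha) -> Var:
            # Only torch.export graphs carry kwargs; other frontends keep the default.
            if context.frontend != TorchFrontend.TORCHEXPORT:
                return alpha
            return _get_kwinputs(context, node, "alpha", default=[alpha])[0]

        x, alpha = _parse_positional_args(context, node)
        alpha = _parse_keyword_args(context, node, alpha)
        # ... build Core ML ops from x and alpha, then context.add(...) as usual ...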
-@register_torch_op(torch_alias=["select.int", "select_copy.int"]) +@register_torch_op(torch_alias=["select_copy"]) def select(context, node): inputs = _get_inputs(context, node, expected=3) _input = inputs[0] @@ -3706,29 +3968,27 @@ def _expand_list_to_rank_1(arr): def _translate_torch_tensor_assign( - x, - updates, - begin, - end, - stride, - begin_mask, - end_mask, - squeeze_mask, - name, + x: Var, + updates: Var, + begin: Var, + end: Var, + stride=None, + begin_mask=None, + end_mask=None, + squeeze_mask=None, + name=None, ): - - def torch_tensor_assign_implementation() -> Var: - return mb.torch_tensor_assign( - x=x, - updates=updates, - begin=begin, - end=end, - stride=stride, - begin_mask=begin_mask, - end_mask=end_mask, - squeeze_mask=squeeze_mask, - name=name, - ) + translation_kwargs = {} + if stride is not None: + translation_kwargs["stride"] = stride + if begin_mask is not None: + translation_kwargs["begin_mask"] = begin_mask + if end_mask is not None: + translation_kwargs["end_mask"] = end_mask + if squeeze_mask is not None: + translation_kwargs["squeeze_mask"] = squeeze_mask + if name is not None: + translation_kwargs["name"] = name if is_current_opset_version_compatible_with(target.iOS18): # slice_update is not supporting scalar update at runtime. @@ -3742,7 +4002,13 @@ def torch_tensor_assign_implementation() -> Var: if isinstance(var, Var) and var.val is None: is_begin_or_end_dynamic = True if is_begin_or_end_dynamic or any_symbolic(x.shape): - return torch_tensor_assign_implementation() + return mb.torch_tensor_assign( + x=x, + updates=updates, + begin=begin, + end=end, + **translation_kwargs, + ) # First pick up the ``dim`` in which ``squeeze_mask[dim] = True``, # and do the following transformation: @@ -3775,14 +4041,16 @@ def torch_tensor_assign_implementation() -> Var: update=updates, begin=begin, end=end, - stride=stride, - begin_mask=begin_mask, - end_mask=end_mask, - squeeze_mask=squeeze_mask, - name=name, + **translation_kwargs, ) - return torch_tensor_assign_implementation() + return mb.torch_tensor_assign( + x=x, + updates=updates, + begin=begin, + end=end, + **translation_kwargs, + ) @register_torch_op @@ -3875,9 +4143,6 @@ def select_scatter(context, node): updates=updates, begin=begin, end=end, - stride=None, - begin_mask=None, - end_mask=None, squeeze_mask=squeeze_mask, name=node.name, ) @@ -4023,6 +4288,14 @@ def index_put(context, node): ), f"indices shape {indices.shape} must equal to input shape {x.shape} for index put operation." 
indices = mb.cast(x=indices, dtype="int32") indices = mb.non_zero(x=indices) + + # if the indices is all False, + # we translate the op into identity + if 0 in indices.shape: + result = mb.identity(x=x, name=node.name) + context.add(result) + return + # values if values.shape == (): values = mb.expand_dims(x=values, axes=[0]) @@ -4262,7 +4535,7 @@ def ones(context, node): context, node, expected={TorchFrontend.TORCHSCRIPT: [5, 6]}, - min_expected={TorchFrontend.EXIR: 1} + min_expected={TorchFrontend.TORCHEXPORT: 1, TorchFrontend.EXECUTORCH: 1}, ) size = inputs[0] # dtype = NUM_TO_TORCH_DTYPE[inputs[1].val] unused @@ -4293,6 +4566,15 @@ def ones_like(context, node): context.add(fill) +@register_torch_op +def fill(context, node): + inputs = _get_inputs(context, node, expected=2) + shape = inputs[0].shape + value = inputs[1].val + result = mb.fill(shape=shape, value=value, name=node.name) + context.add(result) + + def _make_fill_op(size, val, name): assert val is not None if isinstance(size, list): @@ -4352,12 +4634,23 @@ def new_full(context, node): result = _make_fill_op(size, val, node.name) context.add(result) -@register_torch_op + +@register_torch_op(torch_alias=["randint.low"]) def randint(context, node): - inputs = _get_inputs(context, node, expected=(7, 8)) - low = mb.cast(x=inputs[0], dtype="fp32") - high = mb.cast(x=inputs[1], dtype="fp32") - shape = inputs[2] + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=2) + if context.frontend == TorchFrontend.TORCHSCRIPT or node.kind == "randint.low": + low = mb.cast(x=inputs[0], dtype="fp32") + high = mb.cast(x=inputs[1], dtype="fp32") + shape = inputs[2].val + else: + assert node.kind == "randint" + low = 0.0 + high = mb.cast(x=inputs[0], dtype="fp32") + shape = inputs[1].val + return low, high, shape + + low, high, shape = _parse_positional_args(context, node) rand_uniform = mb.random_uniform(shape=shape, low=low, high=high) rand_int = mb.cast(x=rand_uniform, dtype="int32", name=node.name) context.add(rand_int) @@ -4485,7 +4778,7 @@ def avg_pool1d(context, node): context, node, expected={TorchFrontend.TORCHSCRIPT : 6}, - min_expected={TorchFrontend.EXIR : 2}, + min_expected={TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2}, ) _avg_pool(context, node, inputs) @@ -4495,7 +4788,11 @@ def avg_pool2d(context, node): inputs = _get_inputs( context, node, - min_expected={TorchFrontend.TORCHSCRIPT : 6, TorchFrontend.EXIR : 2}, + min_expected={ + TorchFrontend.TORCHSCRIPT: 6, + TorchFrontend.TORCHEXPORT: 2, + TorchFrontend.EXECUTORCH: 2, + }, ) divisor_override = None if len(inputs) < 7 else inputs[6] if divisor_override is not None: @@ -4509,7 +4806,7 @@ def avg_pool3d(context, node): context, node, expected={TorchFrontend.TORCHSCRIPT : 7}, - min_expected={TorchFrontend.EXIR : 2}, + min_expected={TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2}, ) divisor_override = inputs[6] if divisor_override is not None: @@ -4519,14 +4816,17 @@ def avg_pool3d(context, node): @register_torch_op(torch_alias=["_log_softmax"]) def log_softmax(context, node): - inputs = _get_inputs(context, node) + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=2) + nargs = len(inputs) - x = inputs[0] - axis = inputs[1] + x = inputs[0] + axis = inputs[1] + # input 2 is dtype, so we ignore + + return x, axis - # input 2 is either out or half_to_float, so we ignore - ignored = inputs[2] - assert ignored is None or ignored.dtype == types.bool + 
x, axis = _parse_positional_args(context, node) res = mb.softmax(x=x, axis=axis, name=node.name + "_softmax") res = mb.log(x=res, name=node.name) @@ -4613,26 +4913,47 @@ def gelu(context, node): @register_torch_op(torch_alias=["_slice", "slice_copy"]) def slice(context, node): - inputs = _get_inputs( - context, - node, - expected={TorchFrontend.TORCHSCRIPT : 5}, - min_expected={TorchFrontend.EXIR : 1}, - ) - x = inputs[0] - dim = 0 if len(inputs) < 2 else inputs[1].val - - start = 0 - if len(inputs) > 2 and inputs[2] is not None: - start = inputs[2].val if inputs[2].val is not None else inputs[2] - - end = None - if len(inputs) > 3 and inputs[3] is not None: - end = inputs[3].val if inputs[3].val is not None else inputs[3] + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs( + context, + node, + expected=(1, 2, 3, 4, 5), + ) + nargs = len(inputs) - step = 1 - if len(inputs) > 4 and inputs[4] is not None: - step = inputs[4].val if inputs[4].val is not None else inputs[4] + x = inputs[0] + dim = inputs[1].val if nargs > 1 else 0 + start = None + if nargs > 2: + start = inputs[2] + if isinstance(start, Var) and start.val is not None: + start = start.val + end = None + if nargs > 3: + end = inputs[3] + if isinstance(end, Var) and end.val is not None: + end = end.val + step = inputs[4].val if nargs > 4 else 1 + return x, dim, start, end, step + + x, dim, start, end, step = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if dim == 0: + dim = _get_kwinputs(context, node, "dim", default=[dim])[0] + if start is None: + start = _get_kwinputs(context, node, "start", default=[start])[0] + if end is None: + end = _get_kwinputs(context, node, "end", default=[end])[0] + if step == 1: + step = _get_kwinputs(context, node, "step", default=[step])[0] + # torch start = None means Core ML start = 0 + if start is None: + start = 0 + # dim must be constant + if isinstance(dim, Var): + dim = dim.val + assert dim is not None if start == 0 and end is None and step == 1: # Handling x[:], just pass through the tensor. 
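
For context on what the slice parser above receives: Python slicing in a traced or exported graph lowers to aten::slice.Tensor(x, dim, start, end, step), with omitted bounds arriving as None, which is why the converter maps a None start to 0 and defaults dim/step to 0/1. A small illustration of that correspondence (tensor contents are arbitrary; the exact number of slice nodes emitted can vary by torch version):

    import torch

    x = torch.arange(12).reshape(3, 4)
    # x[:, 1::2] corresponds roughly to aten::slice.Tensor(x, dim=1, start=1, end=None, step=2)
    y = x[:, 1::2]
    assert y.shape == (3, 2)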
@@ -4674,10 +4995,31 @@ def slice(context, node): @register_torch_op(torch_alias=["split_with_sizes", "split_with_sizes_copy"]) def split(context, node): - inputs = _get_inputs(context, node, expected=3) - x = inputs[0] - split_sizes = inputs[1] - dim = inputs[2].val + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=2) + nargs = len(inputs) + + x = inputs[0] + split_sizes = inputs[1] + dim = inputs[2] if nargs > 2 else 0 + return x, split_sizes, dim + + def _parse_keyword_args(context, node, dim) -> Var: + # Only torch.export may have kwargs + if context.frontend != TorchFrontend.TORCHEXPORT: + return dim + + dim = _get_kwinputs(context, node, "dim", default=[dim])[0] + return dim + + def _translate_torch_args(dim) -> Var: + if isinstance(dim, Var): + dim = dim.val + return dim + + x, split_sizes, dim = _parse_positional_args(context, node) + dim = _parse_keyword_args(context, node, dim) + dim = _translate_torch_args(dim) if not isinstance(split_sizes.val, _np.ndarray): shape = mb.shape(x=x) @@ -4703,21 +5045,25 @@ def split(context, node): context.add(res, torch_name=node.name) -@register_torch_op(torch_alias=["unbind.int"]) +@register_torch_op def unbind(context, node): - inputs = _get_inputs( - context, - node, - expected={ - TorchFrontend.TORCHSCRIPT: 2, - TorchFrontend.EXIR: [1, 2], - }, - ) - x = inputs[0] - if len(inputs) == 1: - dim = 0 - else: - dim = inputs[1].val + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, expected=(1, 2)) + nargs = len(inputs) + + x = inputs[0] + dim = inputs[1] if nargs > 1 else 0 + + return x, dim + + x, dim = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if dim == 0: + dim = _get_kwinputs(context, node, "dim", default=[dim])[0] + if isinstance(dim, Var): + dim = dim.val + split_sizes = [1] * x.shape[dim] if len(split_sizes) == 1: res = [mb.squeeze(x=x, axes=[dim])] @@ -4890,6 +5236,49 @@ def expand_as(context, node): context.add(res) +@register_torch_op( + torch_alias=[ + "atleast_2d", + "atleast_3d", + "atleast_1d.sequence", + "atleast_2d.sequence", + "atleast_3d.sequence", + ] +) +def atleast_1d(context, node): + def _maybe_expand_dims(x: Var, rank: int, name: Optional[str] = None) -> Var: + if x.rank < rank: + if rank == 3: + if x.rank == 2: + axes = [2] + elif x.rank == 1: + axes = [0, 2] + else: + axes = [0, 1, 2] + else: + axes = [*range(rank - x.rank)] + kwargs = {"x": x, "axes": axes} + if name is not None: + kwargs["name"] = name + x = mb.expand_dims(**kwargs) + return x + + inputs = _get_inputs(context, node, expected=1)[0] + rank = int(node.kind[8]) + assert rank in (1, 2, 3) + + if isinstance(inputs, (tuple, list)): + results = [] + for x in inputs: + results.append(_maybe_expand_dims(x, rank)) + else: + assert isinstance(inputs, Var) + x = inputs + results = _maybe_expand_dims(x, rank, node.name) + + context.add(results, torch_name=node.name) + + def _arange( context, node_name: str, @@ -4905,32 +5294,60 @@ def _arange( context.add(res) -@register_torch_op +@register_torch_op(torch_alias=["arange.start"]) def arange(context, node): - inputs = _get_inputs(context, node) - # dtype = inputs[-4] - # layout = inputs[-3] - # device = inputs[-2] - # pin_memory = inputs[-1] - if len(inputs) == 1 or len(inputs) == 5: - # inputs are [end] or [end, dtype, layout, device, pin_memory] - start = 0 - end = inputs[0] - step = 1 - elif len(inputs) == 6: - # inputs are 
[start, end, dtype, layout, device, pin_memory] - start = inputs[0] - end = inputs[1] - step = 1 - elif len(inputs) == 7: - # inputs are [start, end, step, dtype, layout, device, pin_memory] - start = inputs[0] - end = inputs[1] - step = inputs[2] - else: - raise ValueError( - "arange must have exactly 5, 6, or 7 inputs, got {}".format(len(inputs)) - ) + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=1) + nargs = len(inputs) + + if context.frontend == TorchFrontend.TORCHSCRIPT: + # dtype = inputs[-4] + # layout = inputs[-3] + # device = inputs[-2] + # pin_memory = inputs[-1] + if nargs == 1 or nargs == 5: + # inputs are [end] or [end, dtype, layout, device, pin_memory] + start = 0 + end = inputs[0] + step = 1 + elif nargs == 6: + # inputs are [start, end, dtype, layout, device, pin_memory] + start = inputs[0] + end = inputs[1] + step = 1 + elif nargs == 7: + # inputs are [start, end, step, dtype, layout, device, pin_memory] + start = inputs[0] + end = inputs[1] + step = inputs[2] + else: + raise ValueError(f"arange must have exactly 5, 6, or 7 inputs, got {nargs}") + else: + if re.match(r"arange\.start.*", node.kind): + start = inputs[0] + assert nargs > 1, "arange.start has at least 2 positional args: start, end" + end = inputs[1] + if node.kind == "arange.start_step": + step = inputs[2] if nargs > 2 else 1 + else: + step = 1 + else: + start = 0 + end = inputs[0] + step = 1 + + return start, end, step + + def _parse_keyword_args(context, node, step) -> Var: + # Only torch.export may have kwargs + if context.frontend != TorchFrontend.TORCHEXPORT: + return step + + step = _get_kwinputs(context, node, "step", default=[step])[0] + return step + + start, end, step = _parse_positional_args(context, node) + step = _parse_keyword_args(context, node, step) _arange(context, node.name, start, end, step) @@ -4964,7 +5381,7 @@ def masked_fill(context, node): context.add(res) -@register_torch_op +@register_torch_op(torch_alias=["meshgrid.indexing"]) def meshgrid(context, node): """ For N input tensors, a meshgrid is constructed by viewing each tensor as an N-dimension tensor @@ -4976,22 +5393,31 @@ def meshgrid(context, node): N, N-dimenional grids, where the ith grid is defined as expanding the ith input over dimensions defined by the other inputs. 
""" - supported_indexing_modes = ("ij", "xy") - indexing = "ij" - inputs = _get_inputs(context, node, expected=[1, 2]) - if len(inputs) == 2: - indexing = inputs[1].val - if indexing not in supported_indexing_modes: - raise ValueError("indexing mode {} not supported".format(indexing)) + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, expected=[1, 2]) + nargs = len(inputs) + + tensor_inputs = inputs[0] + indexing = inputs[1].val if nargs > 1 else "ij" + return tensor_inputs, indexing - tensor_inputs = inputs[0] - assert isinstance(tensor_inputs, (list, tuple)) - if len(tensor_inputs) < 2: - raise ValueError("Requires >= 2 tensor inputs.") + def _check_args(tensor_inputs, indexing) -> None: + assert isinstance(tensor_inputs, (list, tuple)) + if len(tensor_inputs) < 2: + raise ValueError("Requires >= 2 tensor inputs.") + if any([len(tensor_var.shape) > 1 for tensor_var in tensor_inputs]): + raise ValueError("meshgrid received non-1d tensor.") - if any([len(tensor_var.shape) > 1 for tensor_var in tensor_inputs]): - raise ValueError("meshgrid received non-1d tensor.") + if indexing not in ("ij", "xy"): + raise ValueError(f"indexing mode {indexing} not supported") + + tensor_inputs, indexing = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if indexing == "ij": + indexing = _get_kwinputs(context, node, "indexing", default=[indexing])[0] + _check_args(tensor_inputs, indexing) dim_tuple = tuple(tensor_var.shape[0] for tensor_var in tensor_inputs) @@ -5025,6 +5451,9 @@ def meshgrid(context, node): # Defines all the nodes that are noOps @register_torch_op( torch_alias=[ + "_assert_async.msg", + "_assert_scalar", + "_local_scalar_dense", "alias_copy", "clone", "contiguous", @@ -5038,9 +5467,14 @@ def meshgrid(context, node): ) def noop(context, node): logger.info(f"Setting pytorch op: {node.kind} to no-op.") - inputs = _get_inputs(context, node) - _input = inputs[0] - context.add(_input, torch_name=node.name) + # These noops do not produce output + if node.kind in ("_assert_scalar",): + return + # Other noops return input as output + else: + inputs = _get_inputs(context, node) + _input = inputs[0] + context.add(_input, torch_name=node.name) @register_torch_op @@ -5062,7 +5496,7 @@ def zeros_like(context, node): context, node, expected={TorchFrontend.TORCHSCRIPT: 6}, - min_expected={TorchFrontend.EXIR: 1}, + min_expected={TorchFrontend.TORCHEXPORT: 1, TorchFrontend.EXECUTORCH: 1}, ) x = inputs[0] shape = mb.shape(x=x) @@ -5295,12 +5729,26 @@ def repeat(context, node): context.add(mb.tile(x=x, reps=reps, name=node.name)) -@register_torch_op +@register_torch_op(torch_alias=["repeat_interleave.self_tensor", "repeat_interleave.self_int"]) def repeat_interleave(context, node): """ For now, we only support scalar repeats + None or 0 dim """ + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs( + context, + node, + expected={TorchFrontend.TORCHSCRIPT: 4}, + min_expected={TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2}, + ) + nargs = len(inputs) + + x = inputs[0] + repeats = inputs[1] + dim = inputs[2] if nargs > 2 else None + return x, repeats, dim + def repeat_interleave_dim0(x: Var, repeats_val: int, name: str = None) -> Var: """ on a high level: @@ -5321,27 +5769,35 @@ def repeat_interleave_dim0(x: Var, repeats_val: int, name: str = None) -> Var: result """ + translation_kwargs = {} + if name is not None: + translation_kwargs["name"] = name + + 
x_shape = mb.shape(x=x) + reps = [1] * x.rank reps[0] = repeats_val x_tiled = mb.tile(x=x, reps=reps) - split_reps = [repeats_val] + list(x.shape) - x_reshaped = mb.reshape(x=x_tiled, shape=list(split_reps)) + split_reps_shape = mb.concat(values=([repeats_val], x_shape), axis=0) + x_reshaped = mb.reshape(x=x_tiled, shape=split_reps_shape) perm = [*range(x.rank + 1)] perm[0] = 1 perm[1] = 0 x_transposed = mb.transpose(x=x_reshaped, perm=perm) - result_shape = list(x.shape) - result_shape[0] = -1 - if name is None: - result = mb.reshape(x=x_transposed, shape=result_shape) - else: - result = mb.reshape(x=x_transposed, shape=result_shape, name=node.name) + x_unaffected_sizes = mb.slice_by_index(x=x_shape, begin=[1], end=[x.rank]) + result_shape = mb.concat(values=([-1], x_unaffected_sizes), axis=0) + result = mb.reshape(x=x_transposed, shape=result_shape, **translation_kwargs) + return result - x, repeats, dim, _ = _get_inputs(context, node, expected=4) + x, repeats, dim = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if dim is None: + dim = _get_kwinputs(context, node, "dim", default=[dim])[0] repeats_val = repeats.val if isinstance(repeats_val, np.ndarray): @@ -5357,17 +5813,26 @@ def repeat_interleave_dim0(x: Var, repeats_val: int, name: str = None) -> Var: if dim is None: x = mb.reshape(x=x, shape=(-1,)) else: + dim_val = dim.val + assert dim_val is not None, "torch.repeat_interleave uses static dim" + if dim_val < 0: + dim_val += x.rank # non-0 dim requires additional pre and post treatment - if dim.val != 0: + if dim_val != 0: is_dim_0 = False + # quick return: repeat 1 is noop + if repeats_val == 1: + context.add(x, torch_name=node.name) + return + if is_dim_0: result = repeat_interleave_dim0(x, repeats_val, node.name) else: # pre treatment: permute to have dim 0 - perm2dim0 = [dim.val] + perm2dim0 = [dim_val] for i in range(x.rank): - if i != dim.val: + if i != dim_val: perm2dim0.append(i) x = mb.transpose(x=x, perm=perm2dim0) @@ -5523,11 +5988,18 @@ def clamp(context, node): @register_torch_op def triu(context, node): - inputs = _get_inputs(context, node, expected=2) + assert context.frontend != TorchFrontend.EXECUTORCH, "triu is not a core aten op" + inputs = _get_inputs( + context, + node, + expected={ + TorchFrontend.TORCHSCRIPT: 2, + TorchFrontend.TORCHEXPORT: [1, 2], + }, + ) x = inputs[0] - diagonal = inputs[1] - if diagonal is not None and diagonal.val is not None: - diagonal = diagonal.val + if len(inputs) > 1 and inputs[1] is not None and inputs[1].val is not None: + diagonal = inputs[1].val else: diagonal = 0 if diagonal <= 0: @@ -5540,11 +6012,18 @@ def triu(context, node): @register_torch_op def tril(context, node): - inputs = _get_inputs(context, node, expected=2) + assert context.frontend != TorchFrontend.EXECUTORCH, "tril is not a core aten op" + inputs = _get_inputs( + context, + node, + expected={ + TorchFrontend.TORCHSCRIPT: 2, + TorchFrontend.TORCHEXPORT: [1, 2], + }, + ) x = inputs[0] - diagonal = inputs[1] - if diagonal is not None and diagonal.val is not None: - diagonal = diagonal.val + if len(inputs) > 1 and inputs[1] is not None and inputs[1].val is not None: + diagonal = inputs[1].val else: diagonal = 0 if diagonal >= 0: @@ -5911,7 +6390,7 @@ def copy(context, node): "In torch script frontend, by graph pass `generate_tensor_assignment_ops`, " "`torch.copy_` should have been replaced with `_internal_op_tensor_inplace_copy`" ) - if context.frontend == TorchFrontend.EXIR: + if 
context.frontend in TORCH_EXPORT_BASED_FRONTENDS: src = inputs[1] if inputs[0].shape != src.shape: _, src = _broadcast_tensors(inputs[: 2]) @@ -6173,10 +6652,31 @@ def _solve_broadcast_shape(shapes: List[List[int]]) -> List[np.ndarray]: dims = [shapes[j][i] for j in range(len(shapes))] if any_symbolic(dims): # rdar://85559497 (Handle dynamic shapes inputs broadcast for pytorch) - raise NotImplementedError( - "Only static shaped inputs are supported for torch.broadcast_tensors conversion." - ) - result_shape.append(_np.max(dims)) + symbols = set() + integers = set() + for dim in dims: + if is_symbolic(dim): + symbols.add(dim) + else: + integers.add(dim) + # Integers can be safely ignored + if integers == {1} or integers == set(): + result_dim = list(symbols)[0] + result_shape.append(result_dim) + # In principle, there must be only 1 symbol + # In practise, since our symbol propagation is imperfect, + # we may see multiple symbols, even if they must equal to each other / 1 + if len(symbols) != 1: + logger.warning(f"Recklessly broadcast {symbols} to {result_dim}") + # In principle, in such case the symbols must be 1 or equal to the integer + # In practise, since our symbol propagation is imperfect, + # we may still see symbols, even if they must equal to max integer / 1 + else: + result_dim = _np.max(list(integers)) + result_shape.append(result_dim) + logger.warning(f"Recklessly broadcast {symbols} and {integers} to {result_dim}") + else: + result_shape.append(_np.max(dims)) return result_shape def _broadcast_tensors(tensors): @@ -6859,7 +7359,13 @@ def _cast_bool_attn_mask(attn_mask: Var, query_var: Var) -> Var: ) return mb.mul(x=-3e4, y=compliment_of_mask) -@register_torch_op(torch_alias=["_scaled_dot_product_flash_attention_for_cpu"]) +@register_torch_op( + torch_alias=[ + "_scaled_dot_product_flash_attention_for_cpu", + "coreml.sdpa", + "coreml::sdpa", + ] +) def scaled_dot_product_attention(context, node): """ Input shapes/types: @@ -6888,47 +7394,70 @@ def _broadcast_tensor_to_same_batch_dims(x: Var, batch_dims: List[int]) -> Var: broadcast_shape = batch_dims + list(x.shape[-2:]) return _broadcast(x.name + "_broadcast_same_batch_dims", x, broadcast_shape) - inputs = _get_inputs(context, node, min_expected=3) - q, k, v = inputs[:3] - attn_mask = None if len(inputs) < 4 else inputs[3] - dropout = 0.0 if len(inputs) < 5 else inputs[4] - is_causal = False if len(inputs) < 6 else inputs[5].val + def _parse_positional_args(context, node) -> Tuple[Var]: + inputs = _get_inputs(context, node, min_expected=3) + nargs = len(inputs) + + q, k, v = inputs[:3] + + if node.kind == "scaled_dot_product_attention": + attn_mask = inputs[3] if nargs > 3 else None + dropout = inputs[4] if nargs > 4 else 0.0 + is_causal = inputs[5].val if nargs > 5 else False + scale = inputs[6] if nargs > 6 else None + elif node.kind == "_scaled_dot_product_flash_attention_for_cpu": + dropout = inputs[3] if nargs > 3 else 0.0 + is_causal = inputs[4].val if nargs > 4 else False + attn_mask = inputs[5] if nargs > 5 else None + scale = inputs[6] if nargs > 6 else None + else: + assert node.kind in ("coreml.sdpa", "coreml::sdpa") + attn_mask = inputs[3] if nargs > 3 else None + dropout = 0.0 + is_causal = False + scale = None - # When len(inputs) == 7, the inputs are (q, k, v, attn_mask, dropout, is_causal, scale) - if len(inputs) == 7 and inputs[6] is not None: - raise NotImplementedError( - "scaled_dot_product_attention op: scale parameter is not handled." 
- ) + return q, k, v, attn_mask, dropout, is_causal, scale - if attn_mask is not None and is_causal: - raise ValueError( - "scaled_dot_product_attention op: attn_mask cannot be provided when is_causal is set to True." - ) + def _check_args(q, k, v, attn_mask, dropout, is_causal, scale) -> None: + if attn_mask is not None and is_causal: + raise ValueError( + "scaled_dot_product_attention op: attn_mask cannot be provided when is_causal is set to True." + ) - if dropout is not None: - if isinstance(dropout, Var): - if dropout.val is None: - raise NotImplementedError( - "A variable dropout probability is specified. Since Core ML " - "does not support dropout yet, we cowardly refuse to convert it" + if dropout is not None: + if isinstance(dropout, Var): + if dropout.val is None: + raise NotImplementedError( + "A variable dropout probability is specified. Since Core ML " + "does not support dropout yet, we cowardly refuse to convert it" + ) + else: + dropout = dropout.val + if dropout != 0.0: + raise ValueError( + "A non-zero dropout probability is specified. Since Core ML " + "does not support dropout yet, we cannot convert it" ) - else: - dropout = dropout.val - if dropout != 0.0: + + # check that ranks of q, k, v and attn_mask match + if k.rank != q.rank: raise ValueError( - "A non-zero dropout probability is specified. Since Core ML " - "does not support dropout yet, we cannot convert it" + "Rank of query and key do not match in scaled_dot_product_attention torch op" + ) + if v.rank != q.rank: + raise ValueError( + "Rank of query and value do not match in scaled_dot_product_attention torch op" ) - # check that ranks of q, k, v and attn_mask match - if k.rank != q.rank: - raise ValueError( - "Rank of query and key do not match in scaled_dot_product_attention torch op" - ) - if v.rank != q.rank: - raise ValueError( - "Rank of query and value do not match in scaled_dot_product_attention torch op" - ) + q, k, v, attn_mask, dropout, is_causal, scale = _parse_positional_args(context, node) + # torch.export may have kwargs + if context.frontend == TorchFrontend.TORCHEXPORT: + if attn_mask is None: + attn_mask = _get_kwinputs(context, node, "attn_mask", default=[attn_mask])[0] + if scale is None: + scale = _get_kwinputs(context, node, "scale", default=[scale])[0] + _check_args(q, k, v, attn_mask, dropout, is_causal, scale) mask = None if is_causal: @@ -6941,7 +7470,8 @@ def _broadcast_tensor_to_same_batch_dims(x: Var, batch_dims: List[int]) -> Var: mask = attn_mask # Since ios18, Core ML supports scaled_dot_product_attention op - if is_current_opset_version_compatible_with(target.iOS18): + # It does not have scale, though + if is_current_opset_version_compatible_with(target.iOS18) and scale is None: # ios18 scaled_dot_product_attention only supports rank >= 3 is_rank_2 = q.rank == 2 @@ -6972,7 +7502,7 @@ def _broadcast_tensor_to_same_batch_dims(x: Var, batch_dims: List[int]) -> Var: # For ios18-, scaled_dot_product_attention has to be decomposed else: - res = _utils._decompose_scaled_dot_product_attention(q, k, v, mask, node.name) + res = _utils._decompose_scaled_dot_product_attention(q, k, v, mask, node.name, scale=scale) context.add(res) diff --git a/coremltools/converters/mil/frontend/torch/quantization_ops.py b/coremltools/converters/mil/frontend/torch/quantization_ops.py index e47965850..11aab6f4c 100644 --- a/coremltools/converters/mil/frontend/torch/quantization_ops.py +++ b/coremltools/converters/mil/frontend/torch/quantization_ops.py @@ -3,12 +3,14 @@ # Use of this source code is governed 
by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause - import numpy as _np import torch as _torch +from packaging.version import Version from coremltools import _logger as logger +from coremltools._deps import _HAS_TORCHAO, MSG_TORCHAO_NOT_FOUND from coremltools.converters.mil.frontend import _utils +from coremltools.converters.mil.frontend.torch.ops import NUM_TO_NUMPY_DTYPE from coremltools.converters.mil.mil import Builder as mb from coremltools.converters.mil.mil import Var, types @@ -16,12 +18,17 @@ from .torch_op_registry import register_torch_op from .utils import ( NUM_TO_TORCH_DTYPE, + TORCH_DTYPE_TO_NUM, + TORCH_EXPORT_BASED_FRONTENDS, TORCH_QTYPE_TO_NP_TYPE, TORCH_QTYPE_TO_STR, TYPE_TO_DTYPE_STRING, TorchFrontend, ) +if _HAS_TORCHAO: + from torchao.quantization import quant_primitives as torchao_quant + def _quantize_general( context, @@ -94,17 +101,22 @@ def quantize_per_tensor(context, node): inputs = _get_inputs( context, node, - expected={TorchFrontend.TORCHSCRIPT: 4, TorchFrontend.EXIR: 6}, + expected={ + TorchFrontend.TORCHSCRIPT: 4, + TorchFrontend.TORCHEXPORT: 6, + TorchFrontend.EXECUTORCH: 6, + }, ) - assert context.frontend in (TorchFrontend.TORCHSCRIPT, TorchFrontend.EXIR) if context.frontend == TorchFrontend.TORCHSCRIPT: input, scale, zero_point, torch_dtype = inputs - elif context.frontend == TorchFrontend.EXIR: + elif context.frontend in TORCH_EXPORT_BASED_FRONTENDS: input, scale, zero_point, qmin, qmax, torch_dtype = inputs if qmax.val - qmin.val <= 16: logger.warning( f"Core ML does not support 4-bit activation, so {torch_dtype.val} is used instead" ) + else: + raise ValueError(f"Invalid PyTorch frontend {context.frontend}") _quantize_general(context, node, input, scale, zero_point, torch_dtype) @@ -119,6 +131,17 @@ def quantize_per_channel(context, node): _quantize_general(context, node, input, scale, zero_point, torch_dtype, axis.val) +@register_torch_op( + torch_alias=[ + "quantized_decomposed::choose_qparams_per_token_asymmetric", + "quantized_decomposed.choose_qparams_per_token_asymmetric", + ] +) +def choose_qparams_per_token_asymmetric(context, node): + """PyTorch uses this op to calculate scale and zero_point on-the-fly for input data.""" + raise NotImplementedError("Dynamic activation quantization is not supported in Core ML.") + + def _dequantize_general( context, node, @@ -194,8 +217,10 @@ def _dequantize_general( def dequantize(context, node): if context.frontend == TorchFrontend.TORCHSCRIPT: context.quant_context.get_dequantized_var(node.inputs[0], node.name) - elif context.frontend == TorchFrontend.EXIR: - inputs = _get_inputs(context, node, min_expected={TorchFrontend.EXIR: 6}) + elif context.frontend in TORCH_EXPORT_BASED_FRONTENDS: + inputs = _get_inputs( + context, node, min_expected={TorchFrontend.TORCHEXPORT: 6, TorchFrontend.EXECUTORCH: 6} + ) num_inputs = len(inputs) if num_inputs == 6: input, scale, zero_point, qmin, qmax, _ = inputs @@ -206,10 +231,7 @@ def dequantize(context, node): raise ValueError(f"dequantize should have 6 or 7 inputs, but got {num_inputs}") _dequantize_general(context, node, input, scale, zero_point, axis, qmin, qmax) else: - raise ValueError( - "dequantize is supported only in TorchScript and EXIR frontends, " - f"but got {context.frontend}" - ) + raise ValueError(f"Invalid PyTorch frontend {context.frontend}") def _dequantized_weight(qweight, name: str = None): @@ -457,3 +479,234 @@ def quantized_embedding(context, node): # Changing the axis from 0 
is not an option in torch, so we don't expose it gather = mb.gather(x=dequant_weights, indices=indices, name=node.name) context.add(gather) + + +@register_torch_op( + torch_alias=[ + "quantized_decomposed::embedding_4bit", + "quantized_decomposed::embedding_4bit.dtype", + "quantized_decomposed.embedding_4bit", + "quantized_decomposed.embedding_4bit.dtype", + ] +) +def quantized_embedding_4bit(context, node): + """Lower the 4-bit quantized embedding op used in executorch.""" + inputs = _get_inputs(context, node, expected=[6, 7]) + weight = inputs[0].val + weight_scales = inputs[1].val + weight_zero_points = None + if inputs[2] is not None and inputs[2].val is not None: + weight_zero_points = inputs[2].val + weight_quant_min = inputs[3].val + weight_quant_max = inputs[4].val + indices = inputs[5] + + out_np_dtype = None + if len(inputs) > 6: + if isinstance(inputs[6].val, _torch.dtype): + out_np_dtype = NUM_TO_NUMPY_DTYPE[TORCH_DTYPE_TO_NUM[inputs[6].val]] + elif isinstance(inputs[6].val, (int, _np.generic)): + out_np_dtype = NUM_TO_NUMPY_DTYPE[inputs[6].val] + if out_np_dtype is not None: + weight_scales = weight_scales.astype(out_np_dtype) + + if weight_quant_min == 0 and weight_quant_max == 0: + # Executorch wrongly passes both weight_quant_min and weight_quant_max. We should set it to correct numbers. + signed = True + weight_quant_min = -8 + weight_quant_max = 7 + else: + signed = weight_quant_min < 0 + + quant_low = -8 if signed else 0 + quant_high = 7 if signed else 15 + quant_torch_dtype = _torch.int8 if signed else _torch.uint8 + if weight_quant_min != quant_low: + raise ValueError( + f"The weight_quant_min should be {quant_low} for 4-bit embedding, but got {weight_quant_min}." + ) + if weight_quant_max != quant_high: + raise ValueError( + f"The weight_quant_max should be {quant_high} for 4-bit embedding, but got {weight_quant_max}." + ) + + # Unpack the weight to the normal layout. + with _torch.no_grad(): + weight = _torch.from_numpy(weight) + # The original weight was packed by using 8-bit to represent two numbers, so we need to separate them. + help_move_bits = 2**4 + weight_even = weight.div(help_move_bits, rounding_mode="trunc") + weight_odd = weight.remainder(help_move_bits) + weight_unpacked = _torch.stack((weight_even, weight_odd), dim=-1) + weight = weight_unpacked.view(weight.shape[0], -1) + weight = weight.view(quant_torch_dtype).add(weight_quant_min).numpy() + + if not _np.logical_and(weight >= quant_low, weight <= quant_high).all(): + raise ValueError( + f"All elements in weight should be within 4-bit range ({quant_low} to {quant_high})." + ) + + quantized_np_dtype = types.nptype_from_builtin( + types.string_to_builtin("int4" if signed else "uint4") + ) + dequant_weight = _utils._construct_constexpr_dequant_op( + weight.astype(quantized_np_dtype), + weight_zero_points, + weight_scales, + axis=-1, + name=inputs[0].name, + ) + + gather = mb.gather(x=dequant_weight, indices=indices, name=node.name) + context.add(gather) + + +@register_torch_op +def _convert_weight_to_int4pack(context, node): + """Pack weight to int4pack format which will be fed into `_weight_int4pack_mm` op.""" + inputs = _get_inputs(context, node, expected=2) + x = inputs[0].val + inner_k_tiles = inputs[1].val + + if x is None or inner_k_tiles is None: + raise NotImplementedError( + "For `_convert_weight_to_int4pack` op, we only support static case, where x, " + "and inner_k_tiles are all known during compilation time." 
+ ) + + with _torch.no_grad(): + x_int4packed = _torch._convert_weight_to_int4pack( + _torch.from_numpy(x), inner_k_tiles + ).numpy() + + res = mb.const(val=x_int4packed, name=node.name) + context.add(res) + + +@register_torch_op +def _weight_int4pack_mm(context, node): + """ + The first argument is the same as torch.mm, but the second argument (weight) is packed. + The packed weight has rank=4, because the meta registration in dynamo requires operator has the same output shape + for each device. So it creates a fake shape {N / 8, K / (16 * innerKTiles), 32, innerKTiles / 2} for CPU. + + More specifically: + + # Original torch.mm + torch.mm(a, b) + + # The int4 packed version mm + b_uint8, b_scales_and_zeros = _group_quantize_tensor( + b, n_bit=4, q_group_size=q_group + ) + b_int4pack = torch._convert_weight_to_int4pack( + b_uint8, inner_k_tiles + ) + weight_int4pack_mm(a, b_int4pack, b_scales_and_zeros) + """ + if Version(_torch.__version__) < Version("2.4.0"): + raise AssertionError("To lower _weight_int4pack_mm, requires torch >= 2.4.0") + + logger.warning( + "The current conversion of `_weight_int4pack_mm` op only works with model produced by torchao. " + "If the op is produced by other libs, you may observe large numerical discrepancy." + ) + + if not _HAS_TORCHAO: + raise AssertionError( + f"{MSG_TORCHAO_NOT_FOUND}\n torchao is needed to convert torch blockwise quantized model." + ) + + inputs = _get_inputs(context, node, expected=4) + x = inputs[0] + y_int4pack = inputs[1].val + group_size = inputs[2].val + y_scales_and_zeros = inputs[3].val + + if y_int4pack is None or group_size is None or y_scales_and_zeros is None: + raise NotImplementedError( + "For `_weight_int4pack_mm` op, we only support static case, where y_int4pack, " + "group_size, y_scales_and_zeros are all known during compilation time." + ) + + if not (len(y_scales_and_zeros.shape) == 3 and y_scales_and_zeros.shape[2] == 2): + raise ValueError( + "The scales_and_zeros from torch should have 3 dims and last dim has size 2." + ) + scales = _np.transpose(y_scales_and_zeros[:, :, 0]) + zero_points = _np.transpose(y_scales_and_zeros[:, :, 1]) + + if _np.allclose(zero_points, zero_points.astype("int32")): + zero_points = zero_points.astype("int32") + else: + zero_points = zero_points.astype(_np.float32) + + # Unpack the result of `torch._convert_weight_to_int4pack` back to plain layout. + # TODO: Use `torchao.ops.unpack_tensor_core_tiled_layout` to unpack after it has CPU implementation. + # The current way to unpack by using _weight_int4pack_mm with eye matrix is a workaround on CPU. + if len(y_int4pack.shape) != 4: + raise ValueError( + f"The packed y from torch should have 4 dims, but got {len(y_int4pack.shape)}." 
+ ) + inner_k_tiles = y_int4pack.shape[-1] * 2 + y_unpacked_shape = (y_int4pack.shape[0] * 8, y_int4pack.shape[1] * (inner_k_tiles * 16)) + eye_shape = y_unpacked_shape[1] + quant_min = 0 + quant_max = 2**4 - 1 + with _torch.no_grad(): + y_dequantized = ( + _torch._weight_int4pack_mm( + _torch.eye(eye_shape, device=_torch.device("cpu"), dtype=_torch.float32), + _torch.from_numpy(y_int4pack), + group_size, + _torch.from_numpy(y_scales_and_zeros).float(), + ) + .t() + .contiguous() + ) + zero_point_domain = ( + torchao_quant.ZeroPointDomain.INT + if _np.issubdtype(zero_points.dtype, _np.integer) + else torchao_quant.ZeroPointDomain.FLOAT + ) + y_quantized = torchao_quant.quantize_affine( + y_dequantized, + (1, group_size), + _torch.from_numpy(scales), + _torch.from_numpy(zero_points), + _torch.int32, + quant_min=quant_min, + quant_max=quant_max, + zero_point_domain=zero_point_domain, + ) + y_quantized = y_quantized.numpy().astype(_np.uint8) + if len(y_quantized.shape) != 2: + raise ValueError( + f"The unpacked quantized y should have 2 dims, but got {len(y_quantized.shape)}." + ) + if not _np.logical_and(y_quantized >= 0, y_quantized <= 15).all(): + raise ValueError("All elements should be within 4-bit range (0 to 15).") + + # If zero_point_domain in `quantize_affine` is set to `ZeroPointDomain.INT`, it matches with CoreML implementation: + # quant = torch.clamp(torch.round(input * (1.0 / scale)) + zero_point, quant_min, quant_max) + # However, for `ZeroPointDomain.FLOAT`, torchao did following transformations to make it compatible with `tinygemm`: + # mid_point = (quant_max + quant_min + 1) / 2 + # min_val = zero_point - scale * mid_point + # quant = torch.clamp(torch.round((input - min_val) / scale), quant_min, quant_max)) + # As we want to make sure the quantize matches CoreML dequant op, we have to do following transformations: + # dequant = (quant - mid_point) * scale + zp + # so we can re-write the expression as + # dequant = (quant - (mid_point - zp / scale)) * scale + # which means the zero_point in CoreML is actually `mid_point - zp / scale`. + if not _np.issubdtype(zero_points.dtype, _np.integer): + mid_point = (quant_max + quant_min + 1) / 2 + zero_points = mid_point - zero_points / scales + + # Use MIL constexpr op to represent the quantization. 
+ quantized_np_dtype = types.nptype_from_builtin(types.string_to_builtin("uint4")) + dequant_weights = _utils._construct_constexpr_dequant_op( + y_quantized.astype(quantized_np_dtype), zero_points, scales, axis=-1, name=inputs[1].name + ) + + res = mb.linear(x=x, weight=dequant_weights, name=node.name) + context.add(res) diff --git a/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py b/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py index 9331bacf9..f16663864 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py +++ b/coremltools/converters/mil/frontend/torch/test/test_internal_graph.py @@ -1412,7 +1412,8 @@ def test_max_pool2d( ceil_mode, ], "max_pool2d", - ops.max_pool2d, + # Using ops.max_pool1d because max_pool2d is its alias + ops.max_pool1d, expected_result, ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py index d6845aae1..f76d89734 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_conversion_api.py @@ -12,6 +12,7 @@ import numpy as np import pytest +from packaging.version import Version from PIL import Image import coremltools as ct @@ -19,8 +20,10 @@ _HAS_EXECUTORCH, _HAS_HF, _HAS_TORCH, + _HAS_TORCHAO, MSG_EXECUTORCH_NOT_FOUND, MSG_TORCH_NOT_FOUND, + MSG_TORCHAO_NOT_FOUND, ) from coremltools.converters.mil.frontend.torch.test.testing_utils import _copy_input_data from coremltools.converters.mil.frontend.torch.torch_op_registry import ( @@ -59,6 +62,9 @@ if _HAS_EXECUTORCH: import executorch.exir +if _HAS_TORCHAO: + from torchao.quantization import quant_api + from torchao.utils import unwrap_tensor_subclass @pytest.fixture def torch_model(): @@ -2842,3 +2848,111 @@ def test_iO16_default_fp32_io(self, float32_input_model_add_op): output_dtype="fp32", expected_op_list=["add"], ) + + +@pytest.mark.skipif( + Version(torch.__version__) < Version("2.4.0"), + reason="Most torchao functionalities only work with PyTorch 2.4.0+", +) +@pytest.mark.skipif( + ct.utils._macos_version() < (15, 0), + reason="Torchao block-wise quantization requires MacOS 15+.", +) +@pytest.mark.skipif(not _HAS_TORCHAO, reason=MSG_TORCHAO_NOT_FOUND) +class TestTorchao: + """ + This class tests the torchao quantized model conversion. + """ + + @staticmethod + def _construct_test_model(): + # The old Quantizer method in torchao doesn't work with a single-layer model such as model=nn.Linear(...), + # so we have to create a Module which contains linear layers. + class TestModel(nn.Module): + def __init__(self): + super(TestModel, self).__init__() + # Currently torchao only supports Linear module without bias. 
+ self.linear1 = nn.Linear(32, 64, bias=False) + self.linear2 = nn.Linear(64, 32, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.linear1(x)) + return self.relu(self.linear2(x)) + + return TestModel().to(torch.device("cpu")).eval() + + @pytest.mark.parametrize("use_export", (False, True)) + def test_weight_only_quantization(self, use_export): + model = self._construct_test_model() + quantizer = quant_api.Int4WeightOnlyQuantizer( + precision=torch.float32, groupsize=32, inner_k_tiles=2, device=torch.device("cpu") + ) + model = quantizer.quantize(model) + input_data = torch.randn((2, 32), dtype=torch.float16) + + if use_export: + exported_model = torch.export.export(model, (input_data,)) + inputs = None + else: + exported_model = torch.jit.trace(model, example_inputs=(input_data,)) + inputs = [ct.TensorType(shape=input_data.shape, name="input")] + + converted_model = ct.convert( + exported_model, inputs=inputs, minimum_deployment_target=ct.target.iOS18 + ) + main_func = converted_model._mil_program.functions["main"] + quantize_ops = main_func.find_ops(op_type="constexpr_blockwise_shift_scale") + assert len(quantize_ops) > 0 + + if ct.utils._is_macos(): + result = converted_model.predict( + { + list(converted_model.input_description)[0]: input_data.detach() + .numpy() + .astype(np.float32) + } + ) + expected = model(input_data) + output_name = list(result.keys())[0] + np.testing.assert_allclose(result[output_name], expected.detach().numpy(), atol=1e-3) + + def test_weight_only_quantization_bfloat16_not_support(self): + """ + Torchao quant_api.int4_weight_only only supports bfloat16. + """ + model = self._construct_test_model().bfloat16() + quant_api.quantize_(model, quant_api.int4_weight_only(group_size=32, inner_k_tiles=2)) + model = unwrap_tensor_subclass(model) + input_data = torch.randn((2, 32), dtype=torch.float16) + exported_model = torch.export.export(model, (input_data,)) + # The conversion of bfloat16 hasn't been supported yet. + with pytest.raises(KeyError, match="torch.bfloat16"): + ct.convert(exported_model, minimum_deployment_target=ct.target.iOS17) + + @pytest.mark.parametrize("use_export", (True, False)) + def test_dynamic_activation_quantization_not_support(self, use_export): + """ + Although Int8DynActInt4WeightQuantizer will be deprecated, we still want + to test it because it's used in ExecuTorch to quantize llama models. 
+ """ + model = self._construct_test_model() + quantizer = quant_api.Int8DynActInt4WeightQuantizer( + precision=torch.float16, groupsize=32, device=torch.device("cpu") + ) + model = quantizer.quantize(model) + input_data = torch.randn((2, 32), dtype=torch.float16) + + if use_export: + exported_model = torch.export.export(model, (input_data,)) + inputs = None + err_msg = "Unsupported fx node quantize_per_token" + err_type = ValueError + else: + exported_model = torch.jit.trace(model, example_inputs=(input_data,)) + inputs = [ct.TensorType(shape=input_data.shape)] + err_msg = "Dynamic activation quantization is not supported in Core ML" + err_type = NotImplementedError + + with pytest.raises(err_type, match=err_msg): + ct.convert(exported_model, inputs=inputs, minimum_deployment_target=ct.target.iOS17) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_export_conversion_api.py b/coremltools/converters/mil/frontend/torch/test/test_torch_export_conversion_api.py index 9d9896bfa..eeb05745f 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_export_conversion_api.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_export_conversion_api.py @@ -12,27 +12,155 @@ if not _HAS_TORCH_EXPORT_API: pytest.skip(allow_module_level=True, reason="torch.export is required") -USE_EDGE_DIALECT = [False] +from coremltools.converters.mil.frontend.torch.exir_utils import WRAPPED_SCALAR_INPUT_SUFFIX +from coremltools.converters.mil.frontend.torch.utils import TorchFrontend + +frontends = [TorchFrontend.TORCHEXPORT] + if _HAS_EXECUTORCH: - USE_EDGE_DIALECT.append(True) + import executorch.exir + + frontends.append(TorchFrontend.EXECUTORCH) import torch +import coremltools as ct from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.mil.scope import ScopeSource -from .testing_utils import TorchBaseTest, TorchFrontend +from .testing_utils import TorchBaseTest backends = testing_reqs.backends compute_units = testing_reqs.compute_units +TORCH_EXPORT_DEFAULT_LOWER_BOUND = {TorchFrontend.TORCHEXPORT: 2, TorchFrontend.EXECUTORCH: 2} +if torch.__version__ >= "2.4.0": + TORCH_EXPORT_DEFAULT_LOWER_BOUND[TorchFrontend.TORCHEXPORT] = 0 + class TestTorchExportConversionAPI(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, use_edge_dialect, dynamic", - itertools.product(compute_units, backends, USE_EDGE_DIALECT, (True, False)), + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), + ) + def test_scalar_input(self, compute_unit, backend, frontend): + class Model(torch.nn.Module): + def forward(self, x): + return x + 1 + + model = Model() + model.eval() + + mlmodel = self.run_compare_torch( + (), + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + )[1] + main_function = mlmodel._mil_program.functions["main"] + + assert len(main_function.inputs) == 1 + input_name = list(main_function.inputs.keys())[0] + input_var = main_function.inputs[input_name] + assert input_name.endswith(WRAPPED_SCALAR_INPUT_SUFFIX) + assert input_var.shape == (1,) + + squeeze_op = main_function.find_ops(op_type="squeeze")[0] + if backend[1] == "fp32": + assert squeeze_op.x is input_var + elif backend[1] == "fp16": + cast_op = main_function.find_ops(op_type="cast")[0] + assert cast_op.x is input_var + assert cast_op.dtype.val == "fp16" + assert squeeze_op.x is cast_op.outputs[0] + + @pytest.mark.parametrize( + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, 
frontends), + ) + def test_dynamic_input(self, compute_unit, backend, frontend): + if ct.utils._macos_version() <= (14, 2): + pytest.xfail("rdar://135925921 ([CI] Upgrade External CI Machine OS)") + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 5) + + def forward(self, x): + return self.linear(x) + + model = Model() + model.eval() + + batch_dim = torch.export.Dim("batch_dim") + dynamic_shapes = {"x": {0: batch_dim}} + + coreml_model = self.run_compare_torch( + (2, 3), + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + torch_export_dynamic_shapes=dynamic_shapes, + )[1] + + input_proto = coreml_model.input_description._fd_spec[0] + size_ranges = input_proto.type.multiArrayType.shapeRange.sizeRanges + assert size_ranges[0].lowerBound == TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + assert size_ranges[0].upperBound == 2147483647 + assert size_ranges[1].lowerBound == 3 + assert size_ranges[1].upperBound == 3 + + @pytest.mark.parametrize("frontend, dynamic", itertools.product(frontends, (True, False))) + def test_invalid_inputs(self, frontend, dynamic): + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(3, 5) + + def forward(self, x): + return self.linear(x) + + model = Model() + model.eval() + + example_inputs = (torch.rand(2, 3),) + + dynamic_shapes = None + if dynamic: + batch_dim = torch.export.Dim("batch_dim") + dynamic_shapes = {"x": {0: batch_dim}} + + exported_program = torch.export.export( + model, + example_inputs, + dynamic_shapes=dynamic_shapes, + ) + if frontend == TorchFrontend.EXECUTORCH: + exported_program = executorch.exir.to_edge(exported_program).exported_program() + + with pytest.raises( + AssertionError, match=r"'inputs' argument should be None for ExportedProgram" + ): + inputs = [ct.TensorType(shape=(2, 3))] + if dynamic: + batch_dim = ct.RangeDim(lower_bound=1, upper_bound=128) + shape = (batch_dim, 3) + inputs = [ct.TensorType(shape=shape)] + ct.convert(exported_program, inputs=inputs) + + +class TestExecuTorchExamples(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, frontend, dynamic", + itertools.product(compute_units, backends, frontends, (True, False)), ) - def test_mul(self, compute_unit, backend, use_edge_dialect, dynamic): + def test_mul(self, compute_unit, backend, frontend, dynamic): + if ct.utils._macos_version() <= (14, 2) and dynamic: + pytest.xfail("rdar://135925921 ([CI] Upgrade External CI Machine OS)") + class MulModule(torch.nn.Module): def forward(self, input, other): return input * other @@ -46,15 +174,24 @@ def forward(self, input, other): "other": {0: dim0, 1: dim1}, } - _, coreml_model, _, _, _, _ = self.run_compare_torch( + coreml_model = self.run_compare_torch( [(3, 2), (3, 2)], MulModule(), compute_unit=compute_unit, backend=backend, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, torch_export_dynamic_shapes=dynamic_shapes, - ) + )[1] + + if dynamic: + for input_proto in coreml_model.input_description._fd_spec: + size_ranges = input_proto.type.multiArrayType.shapeRange.sizeRanges + assert size_ranges[0].lowerBound == TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + assert size_ranges[0].upperBound == 2147483647 + assert size_ranges[1].lowerBound == max( + 1, TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + ) + assert size_ranges[1].upperBound == 3 mil_program = coreml_model._mil_program mul = mil_program.functions["main"].find_ops(op_type="mul")[0] 
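
For readers of these tests, a minimal usage sketch of the torch.export conversion path they exercise; the module and dim names here are made up, and iOS18 is just one valid deployment target:

    import torch
    import coremltools as ct

    class MulModule(torch.nn.Module):
        def forward(self, x, y):
            return x * y

    example_args = (torch.rand(3, 2), torch.rand(3, 2))
    dim0 = torch.export.Dim("dim0")
    exported = torch.export.export(
        MulModule().eval(),
        example_args,
        dynamic_shapes={"x": {0: dim0}, "y": {0: dim0}},
    )
    # As test_invalid_inputs above asserts, `inputs` must be left as None when converting
    # an ExportedProgram; shapes and dynamic ranges come from the program itself.
    mlmodel = ct.convert(exported, minimum_deployment_target=ct.target.iOS18)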
@@ -62,7 +199,7 @@ def forward(self, input, other): stack_trace = mul.scopes[ScopeSource.EXIR_STACK_TRACE][0] assert stack_trace.split("\n")[-2].strip() == "return input * other" - if use_edge_dialect: + if frontend == TorchFrontend.EXECUTORCH: debug_handle = mul.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] assert isinstance(debug_handle, int) @@ -101,10 +238,13 @@ def forward(self, input, other): assert ops[index_cast][-1]["Operator"] == "cast" @pytest.mark.parametrize( - "compute_unit, backend, use_edge_dialect, dynamic", - itertools.product(compute_units, backends, USE_EDGE_DIALECT, (True, False)), + "compute_unit, backend, frontend, dynamic", + itertools.product(compute_units, backends, frontends, (True, False)), ) - def test_linear(self, compute_unit, backend, use_edge_dialect, dynamic): + def test_linear(self, compute_unit, backend, frontend, dynamic): + if ct.utils._macos_version() <= (14, 2) and dynamic: + pytest.xfail("rdar://135925921 ([CI] Upgrade External CI Machine OS)") + class LinearModule(torch.nn.Module): def __init__(self): super().__init__() @@ -118,15 +258,22 @@ def forward(self, arg): batch_dim = torch.export.Dim("batch_dim") dynamic_shapes = {"arg": {0: batch_dim}} - _, coreml_model, _, _, _, _ = self.run_compare_torch( + coreml_model = self.run_compare_torch( [(3, 3)], LinearModule(), compute_unit=compute_unit, backend=backend, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, torch_export_dynamic_shapes=dynamic_shapes, - ) + )[1] + + if dynamic: + input_proto = coreml_model.input_description._fd_spec[0] + size_ranges = input_proto.type.multiArrayType.shapeRange.sizeRanges + assert size_ranges[0].lowerBound == TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + assert size_ranges[0].upperBound == 2147483647 + assert size_ranges[1].lowerBound == 3 + assert size_ranges[1].upperBound == 3 mil_program = coreml_model._mil_program linear = mil_program.functions["main"].find_ops(op_type="linear")[0] @@ -134,7 +281,7 @@ def forward(self, arg): stack_trace = linear.scopes[ScopeSource.EXIR_STACK_TRACE][0] assert stack_trace.split("\n")[-2].strip() == "return self.linear(arg)" - if use_edge_dialect: + if frontend == TorchFrontend.EXECUTORCH: debug_handle = linear.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] assert isinstance(debug_handle, int) @@ -174,10 +321,10 @@ def forward(self, arg): assert ops[index_cast][-1]["Operator"] == "cast" @pytest.mark.parametrize( - "compute_unit, backend, use_edge_dialect, dynamic", - itertools.product(compute_units, backends, USE_EDGE_DIALECT, (True, False)), + "compute_unit, backend, frontend, dynamic", + itertools.product(compute_units, backends, frontends, (True, False)), ) - def test_add(self, compute_unit, backend, use_edge_dialect, dynamic): + def test_add(self, compute_unit, backend, frontend, dynamic): if dynamic: pytest.skip( "https://github.com/apple/coremltools/issues/2307 " @@ -197,15 +344,20 @@ def forward(self, x, y): dim0 = torch.export.Dim("dim0", min=1) dynamic_shapes = {"x": {0: dim0}, "y": {0: dim0}} - _, coreml_model, _, _, _, _ = self.run_compare_torch( + coreml_model = self.run_compare_torch( [(1,), (1,)], AddModule(), compute_unit=compute_unit, backend=backend, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, torch_export_dynamic_shapes=dynamic_shapes, - ) + )[1] + + if dynamic: + for input_proto in coreml_model.input_description._fd_spec: + size_ranges = input_proto.type.multiArrayType.shapeRange.sizeRanges + assert size_ranges[0].lowerBound == 1 + assert 
size_ranges[0].upperBound == 2147483647 mil_program = coreml_model._mil_program adds = mil_program.functions["main"].find_ops(op_type="add") @@ -220,7 +372,7 @@ def forward(self, x, y): for i, stack_trace in enumerate(stack_traces): assert stack_trace.split("\n")[-2].strip() == source_codes[i] - if use_edge_dialect: + if frontend == TorchFrontend.EXECUTORCH: debug_handles = [add.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] for add in adds] for debug_handle in debug_handles: assert isinstance(debug_handle, int) @@ -268,10 +420,13 @@ def forward(self, x, y): assert ops[index_cast][-1]["Operator"] == "cast" @pytest.mark.parametrize( - "compute_unit, backend, use_edge_dialect, dynamic", - itertools.product(compute_units, backends, USE_EDGE_DIALECT, (True, False)), + "compute_unit, backend, frontend, dynamic", + itertools.product(compute_units, backends, frontends, (True, False)), ) - def test_add_mul(self, compute_unit, backend, use_edge_dialect, dynamic): + def test_add_mul(self, compute_unit, backend, frontend, dynamic): + if ct.utils._macos_version() <= (14, 2) and dynamic: + pytest.xfail("rdar://135925921 ([CI] Upgrade External CI Machine OS)") + class AddMulModule(torch.nn.Module): def forward(self, a, x, b): y = torch.mm(a, x) @@ -287,15 +442,34 @@ def forward(self, a, x, b): "b": {}, } - _, coreml_model, _, _, _, _ = self.run_compare_torch( + coreml_model = self.run_compare_torch( [(2, 2), (2, 2), (2, 2)], AddMulModule(), compute_unit=compute_unit, backend=backend, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, torch_export_dynamic_shapes=dynamic_shapes, - ) + )[1] + + if dynamic: + for i, input_proto in enumerate(coreml_model.input_description._fd_spec): + multi_array_type = input_proto.type.multiArrayType + shape = multi_array_type.shape + size_ranges = multi_array_type.shapeRange.sizeRanges + if i == 0: + assert size_ranges[0].lowerBound == 2 + assert size_ranges[0].upperBound == 2 + assert size_ranges[1].lowerBound == TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + assert size_ranges[1].upperBound == 2147483647 + elif i == 1: + assert size_ranges[0].lowerBound == TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + assert size_ranges[0].upperBound == 2147483647 + assert size_ranges[1].lowerBound == 2 + assert size_ranges[1].upperBound == 2 + else: + assert i == 2 + assert shape == [2, 2] + assert len(size_ranges) == 0 mil_program = coreml_model._mil_program matmul_or_add = {} @@ -314,7 +488,7 @@ def forward(self, a, x, b): source_code = source_codes[op_type] assert stack_trace.split("\n")[-2].strip() == source_code - if use_edge_dialect: + if frontend == TorchFrontend.EXECUTORCH: debug_handle = { k: v.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] for k, v in matmul_or_add.items() } @@ -364,10 +538,13 @@ def forward(self, a, x, b): assert ops[op_type][index_cast][-1]["Operator"] == "cast" @pytest.mark.parametrize( - "compute_unit, backend, use_edge_dialect, dynamic", - itertools.product(compute_units, backends, USE_EDGE_DIALECT, (True, False)), + "compute_unit, backend, frontend, dynamic", + itertools.product(compute_units, backends, frontends, (True, False)), ) - def test_softmax(self, compute_unit, backend, use_edge_dialect, dynamic): + def test_softmax(self, compute_unit, backend, frontend, dynamic): + if ct.utils._macos_version() <= (14, 2) and dynamic: + pytest.xfail("rdar://135925921 ([CI] Upgrade External CI Machine OS)") + class SoftmaxModule(torch.nn.Module): def __init__(self): super().__init__() @@ -381,15 +558,22 @@ def forward(self, x): vocab_dim = 
torch.export.Dim("vocab_dim") dynamic_shapes = {"x": {0: vocab_dim}} - _, coreml_model, _, _, _, _ = self.run_compare_torch( + coreml_model = self.run_compare_torch( [(2, 2)], SoftmaxModule(), compute_unit=compute_unit, backend=backend, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, torch_export_dynamic_shapes=dynamic_shapes, - ) + )[1] + + if dynamic: + input_proto = coreml_model.input_description._fd_spec[0] + size_ranges = input_proto.type.multiArrayType.shapeRange.sizeRanges + assert size_ranges[0].lowerBound == TORCH_EXPORT_DEFAULT_LOWER_BOUND[frontend] + assert size_ranges[0].upperBound == 2147483647 + assert size_ranges[1].lowerBound == 2 + assert size_ranges[1].upperBound == 2 mil_program = coreml_model._mil_program softmax = mil_program.functions["main"].find_ops(op_type="softmax")[0] @@ -397,7 +581,7 @@ def forward(self, x): stack_trace = softmax.scopes[ScopeSource.EXIR_STACK_TRACE][0] assert stack_trace.split("\n")[-2].strip() == "return self.softmax(x)" - if use_edge_dialect: + if frontend == TorchFrontend.EXECUTORCH: debug_handle = softmax.scopes[ScopeSource.EXIR_DEBUG_HANDLE][0] assert isinstance(debug_handle, int) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_export_quantization.py b/coremltools/converters/mil/frontend/torch/test/test_torch_export_quantization.py index 878bb25dc..031095dd0 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_export_quantization.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_export_quantization.py @@ -13,9 +13,12 @@ if not _HAS_TORCH_EXPORT_API: pytest.skip(allow_module_level=True, reason="torch.export is required") -USE_EDGE_DIALECT = [False] +from coremltools.converters.mil.frontend.torch.utils import TorchFrontend + +frontends = [TorchFrontend.TORCHEXPORT] + if _HAS_EXECUTORCH: - USE_EDGE_DIALECT.append(True) + frontends.append(TorchFrontend.EXECUTORCH) import torch @@ -42,7 +45,7 @@ QuantizationScheme, ) -from .testing_utils import TorchBaseTest, TorchFrontend +from .testing_utils import TorchBaseTest class TestTorchExportQuantization(TorchBaseTest): @@ -108,18 +111,16 @@ def make_torch_quantized_graph( return converted_graph @pytest.mark.parametrize( - "quantizer_name, quantization_type, is_per_channel, nbit, use_edge_dialect", + "quantizer_name, quantization_type, is_per_channel, nbit, frontend", itertools.product( ("XNNPack", "CoreML"), ("PTQ", "QAT"), (True, False), (4, 8), - USE_EDGE_DIALECT, + frontends, ), ) - def test_conv_relu( - self, quantizer_name, quantization_type, is_per_channel, nbit, use_edge_dialect - ): + def test_conv_relu(self, quantizer_name, quantization_type, is_per_channel, nbit, frontend): SHAPE = (1, 3, 256, 256) class Model(torch.nn.Module): @@ -152,8 +153,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: _, mlmodel, _, _, _, _ = self.run_compare_torch( SHAPE, converted_graph, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, backend=("mlprogram", "fp16"), minimum_deployment_target=minimum_deployment_target, ) @@ -173,18 +173,16 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: assert constexpr_affine_dequantize_op.quantized_data.dtype in (types.int8, types.uint8) @pytest.mark.parametrize( - "quantizer_name, quantization_type, is_per_channel, nbit, use_edge_dialect", + "quantizer_name, quantization_type, is_per_channel, nbit, frontend", itertools.product( ("XNNPack", "CoreML"), ("PTQ", "QAT"), (True, False), (4, 8), - USE_EDGE_DIALECT, + frontends, ), ) - 
def test_linear( - self, quantizer_name, quantization_type, is_per_channel, nbit, use_edge_dialect - ): + def test_linear(self, quantizer_name, quantization_type, is_per_channel, nbit, frontend): SHAPE = (1, 5) class Model(torch.nn.Module): @@ -213,8 +211,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: _, mlmodel, _, _, _, _ = self.run_compare_torch( SHAPE, converted_graph, - frontend=TorchFrontend.EXIR, - use_edge_dialect=use_edge_dialect, + frontend=frontend, backend=("mlprogram", "fp16"), minimum_deployment_target=minimum_deployment_target, ) diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py index 673ae47b3..aa9b627cc 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_ops.py @@ -10,20 +10,19 @@ import numpy as np import pytest + +torch = pytest.importorskip("torch") import torch.nn as nn import coremltools as ct from coremltools import RangeDim, Shape, TensorType -from coremltools._deps import ( - _HAS_EXECUTORCH, - _HAS_TORCH_AUDIO, - _HAS_TORCH_VISION, - version_lt, -) +from coremltools._deps import _HAS_TORCH_AUDIO, _HAS_TORCH_VISION, version_lt from coremltools.converters.mil import testing_reqs from coremltools.converters.mil.frontend.torch.utils import ( NUM_TO_TORCH_DTYPE, NUMPY_DTYPE_TO_TORCH_NUM, + TORCH_EXPORT_BASED_FRONTENDS, + TorchFrontend, ) from coremltools.converters.mil.mil import Operation, Program, types from coremltools.converters.mil.mil.var import Var @@ -38,8 +37,9 @@ from .testing_utils import ( ModuleWrapper, TorchBaseTest, - TorchFrontend, contains_op, + export_torch_model_to_frontend, + frontends, generate_input_data, ) @@ -50,13 +50,15 @@ import torchvision -frontends = [TorchFrontend.TORCHSCRIPT] - -if _HAS_EXECUTORCH: - frontends.append(TorchFrontend.EXIR) - backends = testing_reqs.backends compute_units = testing_reqs.compute_units +for frontend in frontends: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + # torch.export limits the number of compilation frames to prevent infinite loop + # However, those frames are not immediately released after torch.export is done, + # so when we have many torch.export calls, we can still hit the frame number limit + torch._dynamo.config.accumulated_cache_size_limit = 1000000 + break torch = pytest.importorskip("torch") torch.manual_seed(30) @@ -69,7 +71,6 @@ class TestScriptedModels(TorchBaseTest): - @staticmethod def get_while_loop_model(): class TestLayer(nn.Module): @@ -107,35 +108,29 @@ def forward(self, x): def test_while_loop(self, compute_unit, backend): model = TestScriptedModels.get_while_loop_model() self.run_compare_torch( - model.input_size, - model, - backend=backend, - compute_unit=compute_unit, - use_scripting=True + model.input_size, model, backend=backend, compute_unit=compute_unit, use_scripting=True ) - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_cond(self, compute_unit, backend): torch_model = TestScriptedModels.get_cond_model() self.run_compare_torch( - torch.tensor([1.]), + torch.tensor([1.0]), torch_model, input_as_shape=False, backend=backend, compute_unit=compute_unit, - use_scripting=True + use_scripting=True, ) self.run_compare_torch( - torch.tensor([11.]), + torch.tensor([11.0]), torch_model, input_as_shape=False, backend=backend, 
compute_unit=compute_unit, - use_scripting=True + use_scripting=True, ) @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) @@ -161,11 +156,7 @@ def forward(self, x): model = TestNet().eval() self.run_compare_torch( - model.input_size, - model, - backend=backend, - compute_unit=compute_unit, - use_scripting=True + model.input_size, model, backend=backend, compute_unit=compute_unit, use_scripting=True ) @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) @@ -195,11 +186,7 @@ def forward(self, x): model = TestNet().eval() self.run_compare_torch( - model.input_size, - model, - backend=backend, - compute_unit=compute_unit, - use_scripting=True + model.input_size, model, backend=backend, compute_unit=compute_unit, use_scripting=True ) @pytest.mark.parametrize( @@ -252,9 +239,30 @@ def test_conv(self, compute_unit, backend): class TestMean(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) + "compute_unit, backend, frontend, keepdim", + itertools.product(compute_units, backends, frontends, (True, False)), + ) + def test_mean(self, compute_unit, backend, frontend, keepdim): + class Model(nn.Module): + def forward(self, x): + return torch.mean(x, dim=(2, 3), keepdim=keepdim) + + model = Model() + shape = (1, 3, 256, 256) + + self.run_compare_torch( + shape, + model, + frontend=frontend, + backend=backend, + compute_unit=compute_unit, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), ) - def test_with_flexible_shape(self, compute_unit, backend): + def test_with_flexible_shape(self, compute_unit, backend, frontend): if backend[0] == "mlprogram" and _macos_version() < (13, 0): pytest.xfail( "Issue fixed in iOS16/macOS13: https://github.com/apple/coremltools/issues/1420" @@ -284,15 +292,14 @@ def forward(self, x): self.run_compare_torch( shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, converter_input_type=converter_input_type, ) @staticmethod - @pytest.mark.skipif( - ct.utils._macos_version() < (13, 0), reason="Bug fixed in macOS13/iOS16" - ) + @pytest.mark.skipif(ct.utils._macos_version() < (13, 0), reason="Bug fixed in macOS13/iOS16") def test_flexible_shape_with_default_value(): # test for bug reported in https://github.com/apple/coremltools/issues/1420 class Network(torch.nn.Module): @@ -443,18 +450,20 @@ def test( class TestFrac(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, frontend, shape", itertools.product( compute_units, backends, + frontends, COMMON_SHAPES, ), ) - def test_frac(self, compute_unit, backend, shape): + def test_frac(self, compute_unit, backend, frontend, shape): model = ModuleWrapper(function=torch.frac) TorchBaseTest.run_compare_torch( shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, rand_range=(-10.0, 10.0), @@ -543,9 +552,7 @@ class TestSort(TorchBaseTest): ), ) def test_sort(self, compute_unit, backend, shape, axis, descending): - model = ModuleWrapper( - function=torch.sort, kwargs={"dim": axis, "descending": descending} - ) + model = ModuleWrapper(function=torch.sort, kwargs={"dim": axis, "descending": descending}) TorchBaseTest.run_compare_torch( shape, model, @@ -655,15 +662,16 @@ def test_dot(self, compute_unit, backend, vector_length): class TestOuter(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x_vector_length, 
y_vector_length", + "compute_unit, backend, frontend, x_vector_length, y_vector_length", itertools.product( compute_units, backends, + frontends, [1, 5], [1, 3], ), ) - def test_outer(self, compute_unit, backend, x_vector_length, y_vector_length): + def test_outer(self, compute_unit, backend, frontend, x_vector_length, y_vector_length): model = ModuleWrapper(function=torch.outer) vector1 = generate_input_data((x_vector_length,)) @@ -672,6 +680,7 @@ def test_outer(self, compute_unit, backend, x_vector_length, y_vector_length): TorchBaseTest.run_compare_torch( (vector1, vector2), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -732,9 +741,7 @@ class TestNorms(TorchBaseTest): def test_frobenius_norm(self, compute_unit, backend, shape, keepdim): num_dims = len(shape) for dim in range(-num_dims, num_dims): - model = ModuleWrapper( - function=torch.norm, kwargs={"keepdim": keepdim, "dim": dim} - ) + model = ModuleWrapper(function=torch.norm, kwargs={"keepdim": keepdim, "dim": dim}) TorchBaseTest.run_compare_torch( shape, model, @@ -768,15 +775,15 @@ def test_number_norm(self, compute_unit, backend, shape, p, keepdim): class TestNarrow(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, frontend, shape", itertools.product( compute_units, backends, + frontends, COMMON_SHAPES, ), ) - def test_narrow(self, compute_unit, backend, shape): - + def test_narrow(self, compute_unit, backend, frontend, shape): class Model(torch.nn.Module): def __init__(self, dim, start, length): super().__init__() @@ -787,9 +794,8 @@ def __init__(self, dim, start, length): def forward(self, x): return torch.narrow(x, self.dim, self.start, self.length) - for cur_dim in range(len(shape)): - for cur_start in range(shape[cur_dim]-1): + for cur_start in range(shape[cur_dim] - 1): for cur_length in range(1, shape[cur_dim] - cur_start): m = Model(cur_dim, cur_start, cur_length) @@ -797,6 +803,7 @@ def forward(self, x): TorchBaseTest.run_compare_torch( shape, m, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -879,11 +886,7 @@ def _is_valid_config(self, shape, order, dim): if order is not None: if len(shape) > 2: return False - elif ( - len(shape) == 2 - and not isinstance(order, str) - and (order == 0 or order > 2) - ): + elif len(shape) == 2 and not isinstance(order, str) and (order == 0 or order > 2): return False elif len(shape) == 1 and isinstance(order, str): return False @@ -993,18 +996,20 @@ def test_longer_range_input_element_values(self): TorchBaseTest.run_compare_torch(x, model, input_as_shape=False) @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, frontend, shape", itertools.product( compute_units, backends, + frontends, COMMON_SHAPES, ), ) - def test_additional_shapes_and_backends(self, compute_unit, backend, shape): + def test_additional_shapes_and_backends(self, compute_unit, backend, frontend, shape): model = TestHardswish.HardswishModel() TorchBaseTest.run_compare_torch( shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -1013,9 +1018,7 @@ def test_additional_shapes_and_backends(self, compute_unit, backend, shape): class TestBatchNorm(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, num_features, eps, affine", - itertools.product( - compute_units, backends, [5, 3, 1], [0.1, 1e-05], [True, False] - ), + itertools.product(compute_units, backends, [5, 3, 1], [0.1, 1e-05], [True, False]), ) def test_batchnorm(self, 
compute_unit, backend, num_features, eps, affine): model = nn.BatchNorm2d(num_features, eps, affine=affine) @@ -1061,9 +1064,7 @@ def forward(self, x): ["None", "Batch", "Height", "Width", "Depth", "All"], ), ) - def test_batchnorm_3d( - self, compute_unit, backend, num_features, eps, affine, dynamic_input - ): + def test_batchnorm_3d(self, compute_unit, backend, num_features, eps, affine, dynamic_input): model = nn.BatchNorm3d(num_features, eps, affine=affine) input_shape = (6, num_features, 2, 3, 4) if dynamic_input == "None": @@ -1076,27 +1077,19 @@ def test_batchnorm_3d( else: if dynamic_input == "Batch": converter_input_type = [ - TensorType( - shape=(RangeDim(1, 10), num_features, 2, 3, 4), dtype=np.float32 - ) + TensorType(shape=(RangeDim(1, 10), num_features, 2, 3, 4), dtype=np.float32) ] elif dynamic_input == "Height": converter_input_type = [ - TensorType( - shape=(6, num_features, RangeDim(1, 10), 3, 4), dtype=np.float32 - ) + TensorType(shape=(6, num_features, RangeDim(1, 10), 3, 4), dtype=np.float32) ] elif dynamic_input == "Width": converter_input_type = [ - TensorType( - shape=(6, num_features, 2, RangeDim(1, 10), 4), dtype=np.float32 - ) + TensorType(shape=(6, num_features, 2, RangeDim(1, 10), 4), dtype=np.float32) ] elif dynamic_input == "Depth": converter_input_type = [ - TensorType( - shape=(6, num_features, 2, 3, RangeDim(1, 10)), dtype=np.float32 - ) + TensorType(shape=(6, num_features, 2, 3, RangeDim(1, 10)), dtype=np.float32) ] elif dynamic_input == "All": converter_input_type = [ @@ -1302,70 +1295,77 @@ def test_instancenorm_1d(self, compute_unit, backend, num_features): class TestGroupNorm(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, group_features, eps, affine", + "compute_unit, backend, frontend, group_features, eps, affine", itertools.product( - compute_units, backends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] + compute_units, backends, frontends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] ), ) - def test_groupnorm(self, compute_unit, backend, group_features, eps, affine): - model = nn.GroupNorm( - group_features[0], group_features[1], eps=eps, affine=affine - ) + def test_groupnorm(self, compute_unit, backend, frontend, group_features, eps, affine): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch uses native_group_norm") + + model = nn.GroupNorm(group_features[0], group_features[1], eps=eps, affine=affine) self.run_compare_torch( (6, group_features[1], 5, 5), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, group_features, eps, affine", + "compute_unit, backend, frontend, group_features, eps, affine", itertools.product( - compute_units, backends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] + compute_units, backends, frontends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] ), ) def test_groupnorm_rank3_input( - self, compute_unit, backend, group_features, eps, affine + self, compute_unit, backend, frontend, group_features, eps, affine ): - model = nn.GroupNorm( - group_features[0], group_features[1], eps=eps, affine=affine - ) + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch uses native_group_norm") + + model = nn.GroupNorm(group_features[0], group_features[1], eps=eps, affine=affine) self.run_compare_torch( (6, group_features[1], 5), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, group_features, eps, affine", + "compute_unit, 
backend, frontend, group_features, eps, affine", itertools.product( - compute_units, backends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] + compute_units, backends, frontends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] ), ) def test_groupnorm_rank2_input( - self, compute_unit, backend, group_features, eps, affine + self, compute_unit, backend, frontend, group_features, eps, affine ): - model = nn.GroupNorm( - group_features[0], group_features[1], eps=eps, affine=affine - ) + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch uses native_group_norm") + + model = nn.GroupNorm(group_features[0], group_features[1], eps=eps, affine=affine) self.run_compare_torch( (4, group_features[1]), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, group_features, eps, affine", + "compute_unit, backend, frontend, group_features, eps, affine", itertools.product( - compute_units, backends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] + compute_units, backends, frontends, [(16, 32), (1, 1)], [0.1, 1e-05], [True, False] ), ) - def test_groupnorm_dynamic(self, compute_unit, backend, group_features, eps, affine): - model = nn.GroupNorm( - group_features[0], group_features[1], eps=eps, affine=affine - ) + def test_groupnorm_dynamic(self, compute_unit, backend, frontend, group_features, eps, affine): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch uses native_group_norm") + + model = nn.GroupNorm(group_features[0], group_features[1], eps=eps, affine=affine) dim_upper_bound = 30 if backend[0] == "mlprogram" else -1 converter_input_type = [ TensorType( @@ -1381,6 +1381,7 @@ def test_groupnorm_dynamic(self, compute_unit, backend, group_features, eps, aff self.run_compare_torch( (6, group_features[1], 10, 10), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, converter_input_type=converter_input_type, @@ -1389,13 +1390,10 @@ def test_groupnorm_dynamic(self, compute_unit, backend, group_features, eps, aff class TestLinear(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product( - compute_units, - backends, - ), + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), ) - def test_linear_fp16(self, compute_unit, backend): + def test_linear_fp16(self, compute_unit, backend, frontend): class Model(nn.Module): def __init__(self): super().__init__() @@ -1408,6 +1406,7 @@ def forward(self, x): self.run_compare_torch( torch.randn(4, 4, dtype=torch.float16), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -1415,65 +1414,75 @@ def forward(self, x): ) @pytest.mark.parametrize( - "compute_unit, backend, in_features, out_features, bias", + "compute_unit, backend, frontend, in_features, out_features, bias", itertools.product( compute_units, backends, + frontends, [5], [10], [True, False], ), ) def test_linear_rank1_input( - self, compute_unit, backend, in_features, out_features, bias + self, compute_unit, backend, frontend, in_features, out_features, bias ): model = nn.Linear(in_features, out_features, bias=bias) self.run_compare_torch( (in_features,), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, in_features, out_features, bias", - itertools.product(compute_units, backends, [10, 25], [3, 6], [True, False]), + "compute_unit, backend, frontend, in_features, out_features, bias", + 
itertools.product(compute_units, backends, frontends, [10, 25], [3, 6], [True, False]), ) def test_linear_rank2_input( - self, compute_unit, backend, in_features, out_features, bias + self, compute_unit, backend, frontend, in_features, out_features, bias ): model = nn.Linear(in_features, out_features, bias=bias) self.run_compare_torch( (1, in_features), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, in_features, out_features, bias", - itertools.product(compute_units, backends, [10], [6], [True, False]), + "compute_unit, backend, frontend, in_features, out_features, bias", + itertools.product(compute_units, backends, frontends, [10], [6], [True, False]), ) def test_linear_rank3_input( - self, compute_unit, backend, in_features, out_features, bias + self, compute_unit, backend, frontend, in_features, out_features, bias ): model = nn.Linear(in_features, out_features, bias=bias) self.run_compare_torch( (1, 3, in_features), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, in_features, out_features, bias", - itertools.product(compute_units, backends, [10], [6], [True, False]), + "compute_unit, backend, frontend, in_features, out_features, bias", + itertools.product(compute_units, backends, frontends, [10], [6], [True, False]), ) def test_linear_rank4_input( - self, compute_unit, backend, in_features, out_features, bias + self, compute_unit, backend, frontend, in_features, out_features, bias ): model = nn.Linear(in_features, out_features, bias=bias) - self.run_compare_torch((1, 5, 3, in_features), model, backend=backend) + self.run_compare_torch( + (1, 5, 3, in_features), + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + ) class TestConv(TorchBaseTest): @@ -1482,6 +1491,7 @@ class TestConv(TorchBaseTest): [ "compute_unit", "backend", + "frontend", "padding", "stride", "length", @@ -1493,10 +1503,11 @@ class TestConv(TorchBaseTest): ] ), [ - (compute_unit, backend, padding, stride, *param) - for compute_unit, backend, padding, stride, param in itertools.product( + (compute_unit, backend, frontend, padding, stride, *param) + for compute_unit, backend, frontend, padding, stride, param in itertools.product( [ct.ComputeUnit.CPU_ONLY], backends, + frontends, ["same", "valid", 0, 1], [1, 2, 3], [ @@ -1516,6 +1527,7 @@ def test_convolution1d( self, compute_unit, backend, + frontend, padding, stride, length, @@ -1524,7 +1536,6 @@ def test_convolution1d( kernel_size, dilation, bias, - groups=1, ): if padding == "same" and stride != 1: # configuration not supported @@ -1541,6 +1552,7 @@ def test_convolution1d( self.run_compare_torch( (1, in_channels, length), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -1550,6 +1562,7 @@ def test_convolution1d( [ "compute_unit", "backend", + "frontend", "padding", "stride", "height", @@ -1562,10 +1575,11 @@ def test_convolution1d( ] ), [ - (compute_unit, backend, padding, stride, *param) - for compute_unit, backend, padding, stride, param in itertools.product( + (compute_unit, backend, frontend, padding, stride, *param) + for compute_unit, backend, frontend, padding, stride, param in itertools.product( [ct.ComputeUnit.CPU_ONLY], backends, + frontends, ["same", "valid", 1, 0], [1, 2, 3], [ @@ -1585,6 +1599,7 @@ def test_convolution2d( self, compute_unit, backend, + frontend, padding, stride, height, @@ -1594,7 +1609,6 @@ def test_convolution2d( kernel_size, 
dilation, bias, - groups=1, ): if padding == "same" and stride != 1: return @@ -1610,6 +1624,7 @@ def test_convolution2d( self.run_compare_torch( (1, in_channels, height, width), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -1619,6 +1634,7 @@ def test_convolution2d( [ "compute_unit", "backend", + "frontend", "padding", "stride", "depth", @@ -1632,10 +1648,11 @@ def test_convolution2d( ] ), [ - (compute_unit, backend, padding, stride, *param) - for compute_unit, backend, padding, stride, param in itertools.product( + (compute_unit, backend, frontend, padding, stride, *param) + for compute_unit, backend, frontend, padding, stride, param in itertools.product( [ct.ComputeUnit.CPU_ONLY], backends, + frontends, ["same", "valid", 1, 0], [1, 2, 3], [ @@ -1655,6 +1672,7 @@ def test_convolution3d( self, compute_unit, backend, + frontend, padding, stride, depth, @@ -1665,7 +1683,6 @@ def test_convolution3d( kernel_size, dilation, bias, - groups=1, ): if padding == "same" and stride != 1: return @@ -1681,6 +1698,7 @@ def test_convolution3d( self.run_compare_torch( (1, in_channels, depth, height, width), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -1692,6 +1710,7 @@ class TestDynamicConv(TorchBaseTest): [ "compute_unit", "backend", + "frontend", "width", "in_channels", "out_channels", @@ -1701,10 +1720,11 @@ class TestDynamicConv(TorchBaseTest): ] ), [ - (compute_unit, backend, *param) - for compute_unit, backend, param in itertools.product( + (compute_unit, backend, frontend, *param) + for compute_unit, backend, frontend, param in itertools.product( compute_units, backends, + frontends, [ (5, 1, 1, 1, 2, 1), (3, 1, 1, 1, 2, 3), @@ -1722,6 +1742,7 @@ def test_convolution1d( self, compute_unit, backend, + frontend, width, in_channels, out_channels, @@ -1732,9 +1753,7 @@ def test_convolution1d( ): class DynamicConv(nn.Module): def forward(self, input_data, weights): - return nn.functional.conv1d( - input_data, weights, stride=stride, padding=padding - ) + return nn.functional.conv1d(input_data, weights, stride=stride, padding=padding) model = DynamicConv() input_shape = [ @@ -1744,6 +1763,7 @@ def forward(self, input_data, weights): self.run_compare_torch( input_shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -1753,6 +1773,7 @@ def forward(self, input_data, weights): [ "compute_unit", "backend", + "frontend", "height", "width", "in_channels", @@ -1763,10 +1784,11 @@ def forward(self, input_data, weights): ] ), [ - (compute_unit, backend, *param) - for compute_unit, backend, param in itertools.product( + (compute_unit, backend, frontend, *param) + for compute_unit, backend, frontend, param in itertools.product( compute_units, backends, + frontends, [ (5, 3, 1, 1, 1, 2, 0), (3, 3, 1, 1, 1, 2, 1), @@ -1784,6 +1806,7 @@ def test_convolution2d( self, compute_unit, backend, + frontend, height, width, in_channels, @@ -1795,9 +1818,7 @@ def test_convolution2d( ): class DynamicConv(nn.Module): def forward(self, input_data, weights): - return nn.functional.conv2d( - input_data, weights, stride=stride, padding=padding - ) + return nn.functional.conv2d(input_data, weights, stride=stride, padding=padding) model = DynamicConv() @@ -1806,7 +1827,7 @@ def forward(self, input_data, weights): (out_channels, int(in_channels / groups), kernel_size, kernel_size), ] self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, 
frontend=frontend ) @@ -1816,6 +1837,7 @@ class TestConvTranspose(TorchBaseTest): [ "compute_unit", "backend", + "frontend", "width", "in_channels", "out_channels", @@ -1826,10 +1848,11 @@ class TestConvTranspose(TorchBaseTest): ] ), [ - (compute_unit, backend, *param) - for compute_unit, backend, param in itertools.product( + (compute_unit, backend, frontend, *param) + for compute_unit, backend, frontend, param in itertools.product( compute_units, backends, + frontends, [ (3, 1, 1, 1, 2, 0, 1), (3, 1, 1, 1, 2, 1, 3), @@ -1847,6 +1870,7 @@ def test_convolution_transpose1d( self, compute_unit, backend, + frontend, width, in_channels, out_channels, @@ -1868,6 +1892,7 @@ def test_convolution_transpose1d( self.run_compare_torch( (1, in_channels, width), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -1877,6 +1902,7 @@ def test_convolution_transpose1d( [ "compute_unit", "backend", + "frontend", "height", "width", "in_channels", @@ -1888,10 +1914,11 @@ def test_convolution_transpose1d( ] ), [ - (compute_unit, backend, *param) - for compute_unit, backend, param in itertools.product( + (compute_unit, backend, frontend, *param) + for compute_unit, backend, frontend, param in itertools.product( compute_units, backends, + frontends, [ (5, 5, 1, 1, 1, 2, 0, 1), (5, 5, 1, 1, 1, 2, 1, 3), @@ -1909,6 +1936,7 @@ def test_convolution_transpose2d( self, compute_unit, backend, + frontend, height, width, in_channels, @@ -1917,7 +1945,6 @@ def test_convolution_transpose2d( stride, padding, dilation, - groups=1, ): model = nn.ConvTranspose2d( in_channels=in_channels, @@ -1930,15 +1957,17 @@ def test_convolution_transpose2d( self.run_compare_torch( (1, in_channels, height, width), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, dynamic_input", + "compute_unit, backend, frontend, dynamic_input", itertools.product( compute_units, backends, + frontends, [True, False], ), ) @@ -1946,6 +1975,7 @@ def test_convolution_transpose2d_dynamic_input( self, compute_unit, backend, + frontend, dynamic_input, ): in_channels = 5 @@ -1961,6 +1991,7 @@ def test_convolution_transpose2d_dynamic_input( in_width = 512 input_shape = (1, in_channels, in_height, in_width) + converter_input_type = None if dynamic_input: upper_bound = 4096 if backend[0] == "mlprogram" else -1 converter_input_type = [ @@ -1969,26 +2000,21 @@ def test_convolution_transpose2d_dynamic_input( dtype=np.float32, ) ] - self.run_compare_torch( - input_shape, - model, - backend=backend, - compute_unit=compute_unit, - converter_input_type=converter_input_type, - ) - else: - self.run_compare_torch( - input_shape, - model, - backend=backend, - compute_unit=compute_unit, - ) + self.run_compare_torch( + input_shape, + model, + converter_input_type=converter_input_type, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + ) @pytest.mark.parametrize( ",".join( [ "compute_unit", "backend", + "frontend", "height", "width", "in_channels", @@ -2001,10 +2027,11 @@ def test_convolution_transpose2d_dynamic_input( ] ), [ - (compute_unit, backend, *param) - for compute_unit, backend, param in itertools.product( + (compute_unit, backend, frontend, *param) + for compute_unit, backend, frontend, param in itertools.product( compute_units, backends, + frontends, [ (5, 5, 1, 1, 1, 2, 1, 1, 1), (5, 5, 1, 1, 1, 2, 2, 3, 2), @@ -2022,6 +2049,7 @@ def test_convolution_transpose2d_output_padding( self, compute_unit, backend, + frontend, height, width, in_channels, @@ 
-2031,7 +2059,6 @@ def test_convolution_transpose2d_output_padding( padding, dilation, output_padding, - groups=1, ): # Output padding must be less than either stride or dilation @@ -2053,13 +2080,20 @@ def test_convolution_transpose2d_output_padding( dilation=dilation, output_padding=output_padding, ) - self.run_compare_torch((1, in_channels, height, width), model, backend=backend) + self.run_compare_torch( + (1, in_channels, height, width), + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + ) @pytest.mark.parametrize( ",".join( [ "compute_unit", "backend", + "frontend", "depth", "height", "width", @@ -2072,10 +2106,11 @@ def test_convolution_transpose2d_output_padding( ] ), [ - (compute_unit, backend, *param) - for compute_unit, backend, param in itertools.product( + (compute_unit, backend, frontend, *param) + for compute_unit, backend, frontend, param in itertools.product( compute_units, backends, + frontends, [ (3, 5, 5, 1, 1, 1, 2, 0, 1), (3, 5, 5, 1, 1, 1, 2, 1, 3), @@ -2086,42 +2121,13 @@ def test_convolution_transpose2d_output_padding( (4, 6, 5, 3, 3, 1, 3, 1, 3), ], ) - ] - + [ - pytest.param( - ct.ComputeUnit.CPU_ONLY, - "neualnetwork", - 5, - 5, - 1, - 1, - 3, - 4, - 1, - 1, - 2, - marks=pytest.mark.xfail, - ), - pytest.param( - ct.ComputeUnit.CPU_ONLY, - "neualnetwork", - 5, - 5, - 1, - 1, - 3, - 2, - 1, - 3, - 2, - marks=pytest.mark.xfail, - ), ], ) def test_convolution_transpose3d( self, compute_unit, backend, + frontend, depth, height, width, @@ -2143,6 +2149,7 @@ def test_convolution_transpose3d( self.run_compare_torch( (1, in_channels, depth, height, width), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -2589,9 +2596,7 @@ def forward(self, x): # Value of y is Nondeterministic, so return length return torch.Tensor([len(y)]) - self.run_compare_torch( - shape, TestModel(), backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(shape, TestModel(), backend=backend, compute_unit=compute_unit) @pytest.mark.parametrize( "compute_unit, backend, shape", @@ -3018,7 +3023,7 @@ def test_max_pool3d( padding, ceil_mode, ): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail("TODO (rdar://115846125): handle multi-output op max_pool3d_with_indices") if padding > kernel_size / 2: @@ -3119,10 +3124,11 @@ def forward(self, x, y): class TestAMaxAMin(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_shapes, mode, reduce_dim, keepdim", + "compute_unit, backend, frontend, input_shapes, mode, reduce_dim, keepdim", itertools.product( compute_units, backends, + frontends, [ [(2, 5, 7, 3)], [(3, 2, 9)], @@ -3133,13 +3139,15 @@ class TestAMaxAMin(TorchBaseTest): [True, False], ), ) - def test_minimum_maximum(self, compute_unit, backend, input_shapes, mode, reduce_dim, keepdim): + def test_minimum_maximum( + self, compute_unit, backend, frontend, input_shapes, mode, reduce_dim, keepdim + ): class TestModel(torch.nn.Module): def forward(self, input): if type(reduce_dim) == int: reduce_dim_clamped = min(input.dim() - 1, reduce_dim) else: - reduce_dim_clamped = reduce_dim[:input.dim()] + reduce_dim_clamped = reduce_dim[: input.dim()] if mode == "minimum": return torch.amin(input, reduce_dim_clamped, keepdim) elif mode == "maximum": @@ -3149,16 +3157,16 @@ def forward(self, input): model = TestModel() self.run_compare_torch( - input_shapes, model, backend=backend, compute_unit=compute_unit + input_shapes, model, compute_unit=compute_unit, backend=backend, 
frontend=frontend ) class TestPoolSymbolicInput(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product(compute_units, backends), + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), ) - def test_max_pool(self, compute_unit, backend): + def test_max_pool(self, compute_unit, backend, frontend): model = nn.MaxPool2d( kernel_size=1, stride=2, @@ -3177,16 +3185,17 @@ def test_max_pool(self, compute_unit, backend): self.run_compare_torch( input_shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, converter_input_type=converter_input_type, ) @pytest.mark.parametrize( - "compute_unit, backend", - itertools.product(compute_units, backends), + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), ) - def test_avg_pool(self, compute_unit, backend): + def test_avg_pool(self, compute_unit, backend, frontend): model = nn.AvgPool2d( kernel_size=2, stride=2, @@ -3205,6 +3214,7 @@ def test_avg_pool(self, compute_unit, backend): self.run_compare_torch( input_shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, converter_input_type=converter_input_type, @@ -3538,17 +3548,15 @@ def forward(self, x): # Check GitHub Issue #810, assume num_layers == 2 and bidirectional == True class TestStackedBLSTM(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_size, hidden_size, num_layers, bias, batch_first, dropout, bidirectional", + "compute_unit, backend, input_size, hidden_size, bias, batch_first, dropout", itertools.product( compute_units, backends, [7], [5], - [2], [True, False], [True, False], [0.3], - [True], ), ) def test_lstm( @@ -3557,11 +3565,9 @@ def test_lstm( backend, input_size, hidden_size, - num_layers, bias, batch_first, dropout, - bidirectional, ): model = nn.Sequential( nn.LSTM( @@ -3609,8 +3615,10 @@ def test_lstm( class TestConcat(TorchBaseTest): - @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) - def test_cat_basic(self, compute_unit, backend): + @pytest.mark.parametrize( + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) + ) + def test_cat_basic(self, compute_unit, backend, frontend): class TestNet(nn.Module): def forward(self, x): x = torch.cat((x, x), axis=1) @@ -3620,12 +3628,15 @@ def forward(self, x): self.run_compare_torch( (1, 2, 3), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) - @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) - def test_cat_with_empty(self, compute_unit, backend): + @pytest.mark.parametrize( + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) + ) + def test_cat_with_empty(self, compute_unit, backend, frontend): class TestNet(nn.Module): def forward(self, x): return torch.cat((x, torch.tensor([])), axis=1) @@ -3634,14 +3645,18 @@ def forward(self, x): self.run_compare_torch( (1, 2, 3), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) ) - def test_cat_input_types_promotion(self, compute_unit, backend): + def test_cat_input_types_promotion(self, compute_unit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("executorch does not allow mixed dtypes") + class 
TestNet(nn.Module): def forward(self, x, y): return torch.cat((x, y), axis=1) @@ -3651,6 +3666,7 @@ def forward(self, x, y): self.run_compare_torch( [input_data_x, input_data_y], TestNet(), + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -3660,9 +3676,9 @@ def forward(self, x, y): # has one item. NN throws an error for this case, hence why we have to # run through the full conversion process to test it. @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) ) - def test_cat_single_input(self, compute_unit, backend): + def test_cat_single_input(self, compute_unit, backend, frontend): class TestNet(nn.Module): def forward(self, x): x = torch.cat((x,), axis=1) @@ -3672,12 +3688,15 @@ def forward(self, x): self.run_compare_torch( (1, 3, 16, 16), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) - @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) - def test_cat_const_fold(self, compute_unit, backend): + @pytest.mark.parametrize( + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) + ) + def test_cat_const_fold(self, compute_unit, backend, frontend): class TestNet(nn.Module): def forward(self, x): x = torch.tensor([[[1, 2], [2, 3], [3, 4]]]) @@ -3687,6 +3706,7 @@ def forward(self, x): mlmodel = self.run_compare_torch( (1, 2, 3), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -3694,23 +3714,22 @@ def forward(self, x): # The `listconstruct` is folded into a single const. assert len(prog.find_ops(op_type="const")) == 1 - with patch.object(Var, '_is_nonreplaceable_var') as mocked_is_nonreplaceable_var: + with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: # Mock that the input with shape [1, 3, 2] const is non-replaceable. mocked_is_nonreplaceable_var.side_effect = ( lambda var: var.op and var.op.op_type == "const" and var.rank == 3 ) mlmodel = self.run_compare_torch( - [(1, 2, 3)], - model, - backend=backend, - compute_unit=compute_unit + [(1, 2, 3)], model, frontend=frontend, backend=backend, compute_unit=compute_unit ) prog = mlmodel[1]._mil_program # The `listconstruct` is not folded so there are 3 const ops. 
assert len(prog.find_ops(op_type="const")) == 3 - @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) - def test_concat_alias(self, compute_unit, backend): + @pytest.mark.parametrize( + "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends) + ) + def test_concat_alias(self, compute_unit, backend, frontend): class Outer(torch.nn.Module): def __init__(self, net): super(Outer, self).__init__() @@ -3730,6 +3749,7 @@ def forward(self, x): self.run_compare_torch( (1, 3, 16, 16), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -3737,14 +3757,15 @@ def forward(self, x): class TestTile(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, dims", + "compute_unit, backend, frontend, dims", itertools.product( compute_units, backends, + frontends, [(1, 2, 4), (3, 2), (2,)], ), ) - def test_tile(self, compute_unit, backend, dims): + def test_tile(self, compute_unit, backend, frontend, dims): class TestModel(nn.Module): def forward(self, x): return torch.tile(x, dims) @@ -3752,6 +3773,7 @@ def forward(self, x): self.run_compare_torch( (2, 3, 5), TestModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -3759,14 +3781,15 @@ def forward(self, x): class TestBitwiseNot(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_type", + "compute_unit, backend, frontend, input_type", itertools.product( compute_units, backends, + frontends, ["int", "bool"], ), ) - def test_bitwise_not(self, compute_unit, backend, input_type): + def test_bitwise_not(self, compute_unit, backend, frontend, input_type): class TestNet(nn.Module): def forward(self, x): return torch.bitwise_not(x) @@ -3779,6 +3802,7 @@ def forward(self, x): self.run_compare_torch( torch_in, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -3799,14 +3823,15 @@ def _get_inputs(self, input_types): return (x, y) @pytest.mark.parametrize( - "compute_unit, backend, input_types", + "compute_unit, backend, frontend, input_types", itertools.product( compute_units, backends, + frontends, [("int", "int"), ("int", "bool"), ("bool", "int"), ("bool", "bool")], ), ) - def test_mul_int_or_bool(self, compute_unit, backend, input_types): + def test_mul_int_or_bool(self, compute_unit, backend, frontend, input_types): class TestMulWithBool(nn.Module): def forward(self, x, y): return x * y @@ -3816,20 +3841,22 @@ def forward(self, x, y): self.run_compare_torch( (x, y), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) @pytest.mark.parametrize( - "compute_unit, backend, input_types", + "compute_unit, backend, frontend, input_types", itertools.product( compute_units, backends, + frontends, [("int", "int"), ("int", "bool"), ("bool", "int"), ("bool", "bool")], ), ) - def test_add_int_or_bool(self, compute_unit, backend, input_types): + def test_add_int_or_bool(self, compute_unit, backend, frontend, input_types): class TestAddWithBool(nn.Module): def forward(self, x, y): return x + y @@ -3839,21 +3866,26 @@ def forward(self, x, y): self.run_compare_torch( (x, y), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) @pytest.mark.parametrize( - "compute_unit, backend, x_complex, y_complex", + "compute_unit, backend, frontend, x_complex, y_complex", itertools.product( compute_units, backends, + frontends, (True, False), (True, False), ), ) - def test_add_complex(self, compute_unit, 
backend, x_complex, y_complex): + def test_add_complex(self, compute_unit, backend, frontend, x_complex, y_complex): + if frontend == TorchFrontend.EXECUTORCH and (x_complex or y_complex): + pytest.skip("Complex is not aten canonical") + class TestAddComplexModel(nn.Module): def forward(self, x, y): if x_complex: @@ -3867,6 +3899,7 @@ def forward(self, x, y): TestAddComplexModel(), compute_unit=compute_unit, backend=backend, + frontend=frontend, ) @@ -3987,10 +4020,11 @@ def forward(self, x): class TestDim(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, frontend, shape", itertools.product( compute_units, backends, + frontends, [ (1,), (2, 3), @@ -3998,13 +4032,13 @@ class TestDim(TorchBaseTest): ], ), ) - def test_dim(self, compute_unit, backend, shape): + def test_dim(self, compute_unit, backend, frontend, shape): class DimModel(nn.Module): def forward(self, x): return torch.tensor([x.dim()]) self.run_compare_torch( - shape, DimModel().eval(), backend=backend, compute_unit=compute_unit + shape, DimModel().eval(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @@ -4144,9 +4178,7 @@ def forward(self, x): input_shape = (3, 3) if eye_type == "single" else (2, 3) model = Model().eval() - self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) class TestOnes(TorchBaseTest): @@ -4202,39 +4234,50 @@ def forward(self, x): class TestRandint(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape, low, high", + "compute_unit, backend, frontend, shape, low, high", itertools.product( compute_units, backends, + frontends, [(1,), (2, 3)], [-1, 2], [3, 5], ), ) - def test_randint(self, compute_unit, backend, shape, low, high): + def test_randint(self, compute_unit, backend, frontend, shape, low, high): class TestModel(nn.Module): def forward(self, x): y = torch.randint(low, high, x.shape) - return torch.Tensor([len(y)]) + if frontend == TorchFrontend.TORCHSCRIPT: + return torch.Tensor([len(y)]) + else: + return torch.tensor(y.shape) self.run_compare_torch( - shape, TestModel(), backend=backend, compute_unit=compute_unit - ) - + shape, + TestModel(), + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + ) + + @pytest.mark.parametrize("frontend", frontends) + def test_tuple_input(self, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.randint.low is not Aten Canonical") - def test_tuple_input(self): class TestModel(nn.Module): def forward(self, x): return torch.randint(0, 3, (10,)) model = TestModel().eval() x = torch.randn((1, 3, 256, 256)) - traced_model = torch.jit.trace(model, example_inputs=x) - ct.convert(traced_model, inputs=[ct.TensorType(shape=x.shape)]) + torch_model = export_torch_model_to_frontend(model, (x,), frontend) + inputs = [ct.TensorType(shape=x.shape)] if frontend == TorchFrontend.TORCHSCRIPT else None + ct.convert(torch_model, inputs=inputs) class TestRand(TorchBaseTest): - @pytest.mark.parametrize( "compute_unit, backend, shape, dtype", itertools.product( @@ -4249,45 +4292,51 @@ class TestModel(nn.Module): def forward(self, x): y = torch.rand(x.shape, dtype=dtype) # can't compare directly (this is random) - return torch.stack([ - torch.ones_like(y, dtype=torch.float32), - (y >= 0).to(torch.float32), - (y < 1).to(torch.float32), - ]) + return torch.stack( + [ + torch.ones_like(y, dtype=torch.float32), + (y >= 
0).to(torch.float32), + (y < 1).to(torch.float32), + ] + ) - self.run_compare_torch( - shape, TestModel(), backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(shape, TestModel(), backend=backend, compute_unit=compute_unit) class TestRandn(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, frontend, shape", itertools.product( compute_units, backends, + frontends, [(1,), (2, 3)], ), ) - def test_randn(self, compute_unit, backend, shape): + def test_randn(self, compute_unit, backend, frontend, shape): class TestModel(nn.Module): def forward(self, x): y = torch.randn(*x.shape) - return torch.Tensor([len(y)]) + if frontend == TorchFrontend.TORCHSCRIPT: + return torch.Tensor([len(y)]) + else: + return torch.tensor(y.shape) self.run_compare_torch( - shape, TestModel(), backend=backend, compute_unit=compute_unit + shape, + TestModel(), + compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) - @pytest.mark.parametrize( - "dtype", - [torch.complex64, torch.cfloat, torch.complex128, torch.cdouble] + "dtype", [torch.complex64, torch.cfloat, torch.complex128, torch.cdouble] ) def test_invalid_complex_dtype(self, dtype): class TestModel(torch.nn.Module): def forward(self, x): - return torch.randn((5, 4), dtype=torch.cfloat) + return torch.randn((5, 4), dtype=dtype) with pytest.raises(AssertionError, match="complex number dtype"): self.run_compare_torch((5, 4), TestModel()) @@ -4295,24 +4344,27 @@ def forward(self, x): class TestRandnLike(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape", + "compute_unit, backend, frontend, shape", itertools.product( compute_units, backends, + frontends, [(1,), (2, 3)], ), ) - def test_randn_like(self, compute_unit, backend, shape): + def test_randn_like(self, compute_unit, backend, frontend, shape): class TestModel(nn.Module): def forward(self, x): y = torch.randn_like(torch.randn(shape)) - return torch.Tensor([len(y)]) + if frontend == TorchFrontend.TORCHSCRIPT: + return torch.Tensor([len(y)]) + else: + return torch.tensor(y.shape) self.run_compare_torch( - shape, TestModel(), backend=backend, compute_unit=compute_unit + shape, TestModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) - @pytest.mark.parametrize( "dtype", [torch.complex64, torch.cfloat, torch.complex128, torch.cdouble] @@ -4320,7 +4372,7 @@ def forward(self, x): def test_invalid_complex_dtype(self, dtype): class TestModel(torch.nn.Module): def forward(self, x): - return torch.randn_like(x, dtype=torch.cfloat) + return torch.randn_like(x, dtype=dtype) with pytest.raises(AssertionError, match="complex number dtype"): self.run_compare_torch((5, 4), TestModel()) @@ -4332,11 +4384,6 @@ class TestTypeAs(TorchBaseTest): itertools.product(compute_units, backends, ["int32", "float32", "bool"]), ) def test_type_as(self, compute_unit, backend, type): - if backend == ("mlprogram", "fp16") and type == "bool": - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) - class TestNet(nn.Module): def forward(self, x, y): return x.type_as(y) @@ -4420,9 +4467,7 @@ def test_min_max_with_no_arguments(self, compute_unit, backend, input_shape, mod @pytest.mark.parametrize( "compute_unit, backend, input_shape, dim, mode", - itertools.product( - compute_units, backends, [(2, 2), (1, 1)], [0, 1], ["min", "max"] - ), + itertools.product(compute_units, backends, [(2, 2), (1, 1)], [0, 1], ["min", "max"]), ) def test_min_max_no_keepdim(self, compute_unit, 
backend, input_shape, dim, mode): input_data = torch.rand(input_shape) @@ -4444,55 +4489,52 @@ def test_min_max_no_keepdim(self, compute_unit, backend, input_shape, dim, mode) ) def test_min_max_two_tensors(self, compute_unit, backend, input_shape, mode): model = self.TestModel(mode) - self.run_compare_torch( - [input_shape] * 2, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch([input_shape] * 2, model, backend=backend, compute_unit=compute_unit) class TestLayerNorm(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_shape, eps", + "compute_unit, backend, frontend, input_shape, eps", itertools.product( [ct.ComputeUnit.CPU_ONLY], backends, + frontends, [(1, 3, 15, 15), (1, 1, 1, 1)], [1e-5, 1e-7], ), ) - def test_layer_norm(self, compute_unit, backend, input_shape, eps): + def test_layer_norm(self, compute_unit, backend, frontend, input_shape, eps): model = nn.LayerNorm(input_shape, eps=eps) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestPixelShuffle(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, batch_size, CHW, r", + "compute_unit, backend, frontend, batch_size, CHW, r", itertools.product( - compute_units, backends, [1, 3], [(1, 4, 4), (3, 2, 3)], [2, 4] + compute_units, backends, frontends, [1, 3], [(1, 4, 4), (3, 2, 3)], [2, 4] ), ) - def test_pixel_shuffle(self, compute_unit, backend, batch_size, CHW, r): + def test_pixel_shuffle(self, compute_unit, backend, frontend, batch_size, CHW, r): C, H, W = CHW input_shape = (batch_size, C * r * r, H, W) model = nn.PixelShuffle(upscale_factor=r) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) -@pytest.mark.skipif( - _macos_version() < (13, 0), reason="New functionality in macOS13/iOS16" -) +@pytest.mark.skipif(_macos_version() < (13, 0), reason="New functionality in macOS13/iOS16") class TestPixelUnshuffle(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, batch_size, CHW, r", + "compute_unit, backend, frontend, batch_size, CHW, r", itertools.product( - compute_units, backends, [1, 3], [(1, 4, 4), (3, 2, 3)], [2, 4] + compute_units, backends, frontends, [1, 3], [(1, 4, 4), (3, 2, 3)], [2, 4] ), ) - def test_pixel_shuffle(self, compute_unit, backend, batch_size, CHW, r): + def test_pixel_shuffle(self, compute_unit, backend, frontend, batch_size, CHW, r): if backend[0] == "neuralnetwork": pytest.skip("pixel_unshuffle only supported in mlprogram backend.") @@ -4502,6 +4544,7 @@ def test_pixel_shuffle(self, compute_unit, backend, batch_size, CHW, r): self.run_compare_torch( input_shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=ct.target.iOS16, @@ -4510,10 +4553,11 @@ def test_pixel_shuffle(self, compute_unit, backend, batch_size, CHW, r): class TestExpand(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [ [(2, 1), (2, 2)], [(3, 1), (-1, 4)], @@ -4523,7 +4567,7 @@ class TestExpand(TorchBaseTest): ], ), ) - def test_expand(self, compute_unit, backend, shapes): + def test_expand(self, compute_unit, backend, frontend, shapes): input_shape, output_shape = shapes class TestModel(torch.nn.Module): @@ -4533,18 +4577,21 @@ 
def forward(self, x): model = TestModel() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target", + "compute_unit, backend, frontend, minimum_deployment_target", itertools.product( compute_units, backends, + frontends, [None, ct.target.iOS17], ), ) - def test_expand_dynamic_shape0(self, compute_unit, backend, minimum_deployment_target): + def test_expand_dynamic_shape0( + self, compute_unit, backend, frontend, minimum_deployment_target + ): class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[1], x.shape[1]) @@ -4558,19 +4605,21 @@ def forward(self, x): shape=[1, ct.RangeDim(upper_bound=20 if backend[0] == "mlprogram" else -1)] ) ], + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_expand_dynamic_shape1(self, compute_unit, backend): + def test_expand_dynamic_shape1(self, compute_unit, backend, frontend): class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[0], 1, x.shape[-1], x.shape[-1]) @@ -4588,18 +4637,20 @@ def forward(self, x): ] ) ], + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_expand_dynamic_shape2(self, compute_unit, backend): + def test_expand_dynamic_shape2(self, compute_unit, backend, frontend): class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[-1], 1, x.shape[-1], x.shape[-1]) @@ -4610,18 +4661,20 @@ def forward(self, x): TestModel(), input_as_shape=False, converter_input_type=[TensorType(shape=[1, ct.RangeDim(upper_bound=upper_bound)])], + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_expand_dynamic_shape3(self, compute_unit, backend): + def test_expand_dynamic_shape3(self, compute_unit, backend, frontend): class TestModel(nn.Module): def forward(self, x): return x.expand(x.shape[0], 10) @@ -4639,18 +4692,27 @@ def forward(self, x): ] ) ], + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_expand_dynamic_shape_from_another_input(self, compute_unit, backend): + def test_expand_dynamic_shape_from_another_input(self, compute_unit, backend, frontend): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip( + "torch._dynamo.exc.UserError: Tried to use data-dependent value in the subsequent " + "computation. This can happen when we encounter unbounded dynamic value that is " + "unknown during tracing time." 
+ ) + class TestModel(nn.Module): def forward(self, x, y): return x.expand(int(y[0]), int(y[1])) @@ -4665,15 +4727,17 @@ def forward(self, x, y): ), TensorType(shape=(2,)), ], + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @pytest.mark.parametrize( - "compute_unit, backend, input_shapes", + "compute_unit, backend, frontend, input_shapes", itertools.product( compute_units, backends, + frontends, [ [(2, 1), (2, 2)], [(3, 1), (3, 4)], @@ -4682,7 +4746,7 @@ def forward(self, x, y): ], ), ) - def test_expand_as(self, compute_unit, backend, input_shapes): + def test_expand_as(self, compute_unit, backend, frontend, input_shapes): class TestModel(torch.nn.Module): def forward(self, x, y): return x.expand_as(y) @@ -4690,31 +4754,119 @@ def forward(self, x, y): model = TestModel() self.run_compare_torch( - input_shapes, model, backend=backend, compute_unit=compute_unit + input_shapes, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestExpandDims(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank_and_axis", + "compute_unit, backend, frontend, rank_and_axis", itertools.product( compute_units, backends, - [ - (rank, axis) - for rank in range(1, 5) - for axis in range(-rank - 1, rank + 1) - ], + frontends, + [(rank, axis) for rank in range(1, 5) for axis in range(-rank - 1, rank + 1)], ), ) - def test_unsqueeze(self, compute_unit, backend, rank_and_axis): + def test_unsqueeze(self, compute_unit, backend, frontend, rank_and_axis): rank, axis = rank_and_axis input_shape = tuple(np.random.randint(low=2, high=10, size=rank)) model = ModuleWrapper(function=torch.unsqueeze, kwargs={"dim": axis}) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) + + +class TestAtLeastND(TorchBaseTest): + @staticmethod + def _generate_input_shape(input_rank): + if input_rank == 0: + # Core ML does not support scalar input, so we use rank-1 size-1 tensor then squeeze + input_shape = (1,) + else: + input_shape = np.random.randint(2, 5, input_rank) + return input_shape + + @pytest.mark.parametrize( + "compute_unit, backend, frontend, rank, input_rank", + itertools.product( + compute_units, + backends, + frontends, + (1, 2, 3), + (0, 1, 2, 3, 4, 5), + ), + ) + def test_atleast_nd(self, compute_unit, backend, frontend, rank, input_rank): + if backend[0] == "neuralnetwork" and rank in (2, 3) and input_rank == 0: + pytest.xfail("rdar://134723147 nn backend additionally expands a dim") + + class Model(torch.nn.Module): + def forward(self, x): + # Core ML does not support scalar input, so we use rank-1 size-1 tensor then squeeze + if input_rank == 0: + x = torch.squeeze(x) + if rank == 1: + result = torch.atleast_1d(x) + elif rank == 2: + result = torch.atleast_2d(x) + else: + assert rank == 3 + result = torch.atleast_3d(x) + return result + + input_shape = self._generate_input_shape(input_rank) + model = Model() + + self.run_compare_torch( + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) + @pytest.mark.parametrize( + "compute_unit, backend, frontend, rank, input_rank", + itertools.product( + compute_units, + backends, + frontends, + (1, 2, 3), + (0, 1, 2, 3, 4, 5), + ), + ) + def test_atleast_nd_sequence(self, compute_unit, backend, frontend, rank, input_rank): + if backend[0] == "neuralnetwork" and rank in (2, 3) and input_rank == 0: + pytest.xfail("rdar://134723147 nn backend additionally expands a 
dim") + + class Model(torch.nn.Module): + def forward(self, x, y): + # Core ML does not support scalar input, so we use rank-1 size-1 tensor then squeeze + if input_rank == 0: + x = torch.squeeze(x) + y = torch.squeeze(y) + + # Lowering "tuple input as output" pymil program gives wrong output, + # so insert add ops to avoid "input as output" + # TODO (rdar://134722912) Fix the "tuple input as output" pymil program lowering + x = x + 1.0 + y = y + 2.0 + + if rank == 1: + result = torch.atleast_1d((x, y)) + elif rank == 2: + result = torch.atleast_2d((x, y)) + else: + assert rank == 3 + result = torch.atleast_3d((x, y)) + return result + + input_shape = [ + self._generate_input_shape(input_rank), + self._generate_input_shape(input_rank), + ] + model = Model() + + self.run_compare_torch( + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) class TestLinspace(TorchBaseTest): @pytest.mark.parametrize( @@ -4735,13 +4887,9 @@ def forward(self, x): return torch.linspace(start, end, steps) model = Model() - self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) - @pytest.mark.parametrize( - "compute_unit, backend", itertools.product(compute_units, backends) - ) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) def test_linspace_static_large(self, compute_unit, backend): input_shape = tuple([1]) @@ -4750,9 +4898,7 @@ def forward(self, x): return torch.linspace(1, 2_000_000, 2_000_000) model = Model() - self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) @pytest.mark.parametrize( "compute_unit, backend, start_end, steps", @@ -4794,29 +4940,20 @@ def forward(self, x): model = Model() mlmodel = self.run_compare_torch( - [(1, 2, 3)], - model, - backend=backend, - compute_unit=compute_unit + [(1, 2, 3)], model, backend=backend, compute_unit=compute_unit ) prog = mlmodel[1]._mil_program # The linspace op is folded to const, so there is no range_1d op. assert len(prog.find_ops(op_type="const")) == 1 assert len(prog.find_ops(op_type="range_1d")) == 0 - with patch.object(Var, '_is_nonreplaceable_var') as mocked_is_nonreplaceable_var: + with patch.object(Var, "_is_nonreplaceable_var") as mocked_is_nonreplaceable_var: # Mock that the first param to linspace is non-replaceable. mocked_is_nonreplaceable_var.side_effect = ( - lambda var: var.op - and var.op.op_type == "const" - and var.rank == 0 - and var.val == 0 + lambda var: var.op and var.op.op_type == "const" and var.rank == 0 and var.val == 0 ) mlmodel = self.run_compare_torch( - [(1, 2, 3)], - model, - backend=backend, - compute_unit=compute_unit + [(1, 2, 3)], model, backend=backend, compute_unit=compute_unit ) prog = mlmodel[1]._mil_program # The linspace op is not folded to const, but translated to range_1d instead. 
@@ -4825,10 +4962,11 @@ def forward(self, x): class TestArange(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, start_end_step", + "compute_unit, backend, frontend, start_end_step", itertools.product( compute_units, backends, + frontends, [ (-0.1, -0.7, -0.07), (3, 10, 0.3), @@ -4838,16 +4976,11 @@ class TestArange(TorchBaseTest): ], ), ) - def test_arange_static(self, compute_unit, backend, start_end_step): + def test_arange_static(self, compute_unit, backend, frontend, start_end_step): if start_end_step == (1, 10, 1e-6): - pytest.xfail( - "rdar://88998831 (range_1d has numerical issue when the step is small)" - ) - input_shape = tuple( - [ - 1, - ] - ) + pytest.xfail("rdar://88998831 (range_1d has numerical issue when the step is small)") + + input_shape = (1,) start, end, step = start_end_step class Model(nn.Module): @@ -4856,14 +4989,15 @@ def forward(self, x): model = Model() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, start_end_step", + "compute_unit, backend, frontend, start_end_step", itertools.product( compute_units, backends, + frontends, [ (-0.1, -0.7, -0.07), (3, 10, 0.3), @@ -4872,7 +5006,14 @@ def forward(self, x): ], ), ) - def test_arange_dynamic(self, compute_unit, backend, start_end_step): + def test_arange_dynamic(self, compute_unit, backend, frontend, start_end_step): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip( + "torch._dynamo.exc.UserError: Tried to use data-dependent value in the subsequent " + "computation. This can happen when we encounter unbounded dynamic value that is " + "unknown during tracing time." + ) + start, end, step = start_end_step class Model(nn.Module): @@ -4884,32 +5025,55 @@ def forward(self, x): self.run_compare_torch( inputs, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) + @pytest.mark.parametrize( + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), + ) + def test_arange_without_start(self, compute_unit, backend, frontend): + class Model(nn.Module): + def forward(self, x): + return torch.arange(10) + + model = Model() + self.run_compare_torch( + (1,), model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) + class TestEinsum(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, equation, reverse_input_order, dynamic", + "compute_unit, backend, frontend, equation, reverse_input_order, dynamic", itertools.product( compute_units, backends, + frontends, einsum_equations, [False, True], [False, True], ), ) - def test_binary_einsum(self, compute_unit, backend, equation, reverse_input_order, dynamic): + def test_binary_einsum( + self, compute_unit, backend, frontend, equation, reverse_input_order, dynamic + ): if dynamic and backend[0] == "mlprogram" and ct.utils._macos_version() > (14, 2): pytest.xfail("rdar://120386990 (Einsum Model Failed)") + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch einsum decomposition issue") + class TestBinaryEinsum(nn.Module): def forward(self, x, y): return torch.einsum(equation, x, y) input_shapes, converter_input_type = gen_input_shapes_einsum(equation, dynamic, backend) + if frontend != TorchFrontend.TORCHSCRIPT: + converter_input_type = None if reverse_input_order: input_output_strings = equation.split("->") @@ -4923,10 +5087,11 @@ def forward(self, x, y): res = 
self.run_compare_torch( input_shapes, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=True, - converter_input_type=converter_input_type + converter_input_type=converter_input_type, ) # Verify the pattern of the hardcode einsum cases @@ -4947,19 +5112,20 @@ def forward(self, x, y): assert "shape" not in ops_in_prog @pytest.mark.parametrize( - "compute_unit, backend, equation, dynamic", + "compute_unit, backend, frontend, equation, dynamic", itertools.product( compute_units, backends, + frontends, ["ab->ba", "aa->a", "ab->b", "iijk->ji"], [False, True], ), ) - def test_unary_einsum(self, compute_unit, backend, equation, dynamic): - if backend == ("mlprogram", "fp16") and equation == "iijk->ji" and dynamic: - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) + def test_unary_einsum(self, compute_unit, backend, frontend, equation, dynamic): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch einsum decomposition issue") + if platform.machine() == "x86_64" and dynamic and equation == "iijk->ji": + pytest.xfail("rdar://135843153 ([Bug] Models failed on x86_64 platform)") class TestUnaryEinsum(nn.Module): def forward(self, x): @@ -4970,22 +5136,27 @@ def forward(self, x): self.run_compare_torch( input_shapes, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=True, - converter_input_type=converter_input_type + converter_input_type=converter_input_type, ) @pytest.mark.parametrize( - "compute_unit, backend, equation, dynamic", + "compute_unit, backend, frontend, equation, dynamic", itertools.product( compute_units, backends, + frontends, ["ab,bc,cd->ba", "abb,abc,a->ab"], [False, True], ), ) - def test_ternary_einsum(self, compute_unit, backend, equation, dynamic): + def test_ternary_einsum(self, compute_unit, backend, frontend, equation, dynamic): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch einsum decomposition issue") + class TestTernaryEinsum(nn.Module): def forward(self, x, y, z): return torch.einsum(equation, x, y, z) @@ -4995,6 +5166,7 @@ def forward(self, x, y, z): self.run_compare_torch( input_shapes, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=True, @@ -5002,13 +5174,14 @@ def forward(self, x, y, z): ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_einsum_with_same_input(self, compute_unit, backend): + def test_einsum_with_same_input(self, compute_unit, backend, frontend): class Einsum(nn.Module): def forward(self, m1, m2, m3): y1 = torch.einsum("bnhd,bdhm->bnhm", m1, m2) @@ -5024,6 +5197,7 @@ def forward(self, m1, m2, m3): self.run_compare_torch( [m1, m2, m3], Einsum(), + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -5108,9 +5282,7 @@ def test_cumsum(self, compute_unit, backend, axis): input_shape = list(np.random.randint(low=2, high=10, size=4)) input_shape = tuple(input_shape) model = ModuleWrapper(function=torch.cumsum, kwargs={"dim": axis}) - self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) class TestReshape(TorchBaseTest): @@ -5165,17 +5337,18 @@ def test_reshape_scalar(self, compute_unit, backend, frontend, minimum_deploymen class TestReshapeAs(TorchBaseTest): @pytest.mark.parametrize( - 
"compute_unit, backend, input_output_shape", + "compute_unit, backend, frontend, input_output_shape", itertools.product( compute_units, backends, + frontends, [ ((6, 1, 1), (3, 2)), ((8,), (2, 1, 1, 2, 2)), ], ), ) - def test_reshape(self, compute_unit, backend, input_output_shape): + def test_reshape(self, compute_unit, backend, frontend, input_output_shape): class Model(nn.Module): def forward(self, x, ref): return x.reshape_as(ref) @@ -5185,6 +5358,7 @@ def forward(self, x, ref): self.run_compare_torch( [input_shape, output_shape], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -5192,10 +5366,10 @@ def forward(self, x, ref): class TestFlatten(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, start_dim, end_dim, is_dynamic", - itertools.product(compute_units, backends, [2, -2, 0], [3, -1], [False, True]), + "compute_unit, backend, frontend, start_dim, end_dim, is_dynamic", + itertools.product(compute_units, backends, frontends, [2, -2, 0], [3, -1], [False, True]), ) - def test_flatten(self, compute_unit, backend, start_dim, end_dim, is_dynamic): + def test_flatten(self, compute_unit, backend, frontend, start_dim, end_dim, is_dynamic): input_shape = (2, 3, 4, 5) converter_input_type = None if is_dynamic: @@ -5217,6 +5391,7 @@ def test_flatten(self, compute_unit, backend, start_dim, end_dim, is_dynamic): self.run_compare_torch( input_shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, converter_input_type=converter_input_type, @@ -5225,16 +5400,17 @@ def test_flatten(self, compute_unit, backend, start_dim, end_dim, is_dynamic): class TestUnflatten(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, dim, auto_infer_idx, dynamic", + "compute_unit, backend, frontend, dim, auto_infer_idx, dynamic", itertools.product( compute_units, backends, + frontends, (0, 1, -1, -2), (0, 1, None), (True, False), ), ) - def test_unflatten(self, compute_unit, backend, dim, auto_infer_idx, dynamic): + def test_unflatten(self, compute_unit, backend, frontend, dim, auto_infer_idx, dynamic): if dynamic and auto_infer_idx is not None: pytest.skip("Auto-inferring shape (-1) not supported for dynamic input.") @@ -5273,6 +5449,7 @@ def forward(self, x): (NHEAD * BATCH_SIZE, NHEAD * INPUT_SIZE), Head(NHEAD, BATCH_SIZE, INPUT_SIZE, OUTPUT_SIZE), converter_input_type=inputs, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -5280,12 +5457,12 @@ def forward(self, x): class TestGather(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank_and_axis", + "compute_unit, backend, frontend, rank_and_axis", itertools.product( - compute_units, backends, [(i, j) for i in range(1, 6) for j in range(0, i)] + compute_units, backends, frontends, [(i, j) for i in range(1, 6) for j in range(0, i)] ), ) - def test_gather_along_axis(self, compute_unit, backend, rank_and_axis): + def test_gather_along_axis(self, compute_unit, backend, frontend, rank_and_axis): rank, axis = rank_and_axis params_shape = np.random.randint(low=2, high=5, size=rank) indices_shape = np.copy(params_shape) @@ -5296,13 +5473,15 @@ def test_gather_along_axis(self, compute_unit, backend, rank_and_axis): function=torch.gather, kwargs={"dim": axis, "index": torch.from_numpy(indices)}, ) - self.run_compare_torch([params_shape], model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + [params_shape], model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) @pytest.mark.parametrize( - "compute_unit, backend, 
input_enumerated_shape", - itertools.product(compute_units, backends, (True, False)), + "compute_unit, backend, frontend, input_enumerated_shape", + itertools.product(compute_units, backends, frontends, (True, False)), ) - def test_gather_enumerated_shape(self, compute_unit, backend, input_enumerated_shape): + def test_gather_enumerated_shape(self, compute_unit, backend, frontend, input_enumerated_shape): axis = 0 params_shape = (2, 3, 4) indices_shape = (3, 3, 4) @@ -5330,6 +5509,7 @@ def forward(self, x, index): Model(), input_as_shape=False, converter_input_type=converter_input_type, + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=ct.target.iOS17, @@ -5345,24 +5525,47 @@ def test_gather_along_axis_invalid_indices(self): torch.gather(data, 1, torch.tensor([[0, 0], [2, 0]])) @pytest.mark.parametrize( - "compute_unit, backend, dynamic", - itertools.product(compute_units, backends, [True, False]), + "compute_unit, backend, frontend, dynamic", + itertools.product(compute_units, backends, frontends, [True, False]), ) - def test_gather_nd_int16_indices(self, compute_unit, backend, dynamic): + def test_gather_nd_int16_indices(self, compute_unit, backend, frontend, dynamic): """Test the indices access in torch model which gets lowered to gather_nd.""" B, C, H, W, T = 1, 24, 64, 64, 32 data = torch.rand(B, C, H, W) time = (torch.rand(1, T) * (C - 1)).to(torch.int) - class DynamicModel(torch.nn.Module): - def forward(self, data, time): - return data[torch.arange(B).unsqueeze(1), time, :, :] + if frontend == TorchFrontend.TORCHSCRIPT: + + class DynamicModel(torch.nn.Module): + def forward(self, data, time): + return data[torch.arange(B).unsqueeze(1), time, :, :] + + class StaticModel(torch.nn.Module): + def forward(self, data): + return data[torch.arange(B).unsqueeze(1), time, :, :] + + torch_model = DynamicModel() if dynamic else StaticModel() + else: + + class DynamicModel(torch.nn.Module): + def __init__(self, B): + super().__init__() + self.slice0 = torch.arange(B).unsqueeze(1) + + def forward(self, data, time): + return data[self.slice0, time, :, :] + + class StaticModel(torch.nn.Module): + def __init__(self, B, time): + super().__init__() + self.slice0 = torch.arange(B).unsqueeze(1) + self.time = time + + def forward(self, data): + return data[self.slice0, self.time, :, :] - class StaticModel(torch.nn.Module): - def forward(self, data): - return data[torch.arange(B).unsqueeze(1), time, :, :] + torch_model = DynamicModel(B) if dynamic else StaticModel(B, time) - torch_model = DynamicModel() if dynamic else StaticModel() input_data = (data, time) if dynamic else data converter_input_type = [ct.TensorType(shape=data.shape)] if dynamic: @@ -5373,6 +5576,7 @@ def forward(self, data): torch_model, input_as_shape=False, converter_input_type=converter_input_type, + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=ct.target.iOS17, @@ -5631,20 +5835,11 @@ def test_softplus(self, compute_unit, backend, beta, threshold, minimum_deployme @pytest.mark.parametrize( "compute_unit, backend, shape", - itertools.product( - compute_units, - backends, - COMMON_SHAPES_ALL - ), + itertools.product(compute_units, backends, COMMON_SHAPES_ALL), ) def test_mish(self, compute_unit, backend, shape): model = nn.Mish().eval() - self.run_compare_torch( - shape, - model, - backend=backend, - compute_unit=compute_unit - ) + self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) @pytest.mark.parametrize( "compute_unit, 
backend, shape", @@ -5668,12 +5863,12 @@ def test_silu(self, compute_unit, backend, shape): @pytest.mark.parametrize( "compute_unit, backend, rounding_mode, x2_type", - itertools.product(compute_units, backends, [None, "floor", "trunc"], [np.float32, np.int32]), + itertools.product( + compute_units, backends, [None, "floor", "trunc"], [np.float32, np.int32] + ), ) def test_div(self, compute_unit, backend, rounding_mode, x2_type): - model = ModuleWrapper( - function=torch.div, kwargs={"rounding_mode": rounding_mode} - ) + model = ModuleWrapper(function=torch.div, kwargs={"rounding_mode": rounding_mode}) x1 = torch.from_numpy(np.array([2.3, 2.6, -3.6, -3.2], dtype=np.float32)) x2 = torch.from_numpy(np.array([1.0, 1.0, 1.0, 1.0], dtype=x2_type)) out = torch.div(x1, x2, rounding_mode=rounding_mode) @@ -5689,10 +5884,11 @@ def test_div(self, compute_unit, backend, rounding_mode, x2_type): class TestElementWiseUnary(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape, op_string", + "compute_unit, backend, frontend, shape, op_string", itertools.product( compute_units, backends, + frontends, [(1, 3, 5, 8)], [ "abs", @@ -5715,7 +5911,7 @@ class TestElementWiseUnary(TorchBaseTest): ], ), ) - def test_elementwise_no_params(self, compute_unit, backend, shape, op_string): + def test_elementwise_no_params(self, compute_unit, backend, frontend, shape, op_string): if not contains_op(torch, op_string): return if op_string == "sqrt" and compute_unit != ct.ComputeUnit.CPU_ONLY: @@ -5723,13 +5919,16 @@ def test_elementwise_no_params(self, compute_unit, backend, shape, op_string): op_func = getattr(torch, op_string) model = ModuleWrapper(function=op_func) - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) @pytest.mark.parametrize( - "compute_unit, backend, shape, clamp_range, minimum_deployment_target", + "compute_unit, backend, frontend, shape, clamp_range, minimum_deployment_target", itertools.product( compute_units, backends, + frontends, [(1, 3, 5, 8)], [ (0.0, 1.0), @@ -5744,7 +5943,9 @@ def test_elementwise_no_params(self, compute_unit, backend, shape, op_string): [None, ct.target.iOS17], ), ) - def test_clamp(self, compute_unit, backend, shape, clamp_range, minimum_deployment_target): + def test_clamp( + self, compute_unit, backend, frontend, shape, clamp_range, minimum_deployment_target + ): params_dict = {} if clamp_range[0] is not None: params_dict["min"] = clamp_range[0] @@ -5755,6 +5956,7 @@ def test_clamp(self, compute_unit, backend, shape, clamp_range, minimum_deployme self.run_compare_torch( shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, rand_range=(-5, 5), @@ -5762,19 +5964,21 @@ def test_clamp(self, compute_unit, backend, shape, clamp_range, minimum_deployme ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_clamp_int_input(self, compute_unit, backend): + def test_clamp_int_input(self, compute_unit, backend, frontend): params_dict = {"min": -2, "max": 2} input_data = torch.randint(low=-5, high=5, size=(2, 3, 4)) model = ModuleWrapper(torch.clamp, params_dict) self.run_compare_torch( input_data, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -5782,19 +5986,21 @@ def test_clamp_int_input(self, compute_unit, backend): ) @pytest.mark.parametrize( 
- "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_clamp_min_int(self, compute_unit, backend): + def test_clamp_min_int(self, compute_unit, backend, frontend): params_dict = {"min": 0} input_data = torch.randint(low=-5, high=5, size=(2, 3, 4)) model = ModuleWrapper(torch.clamp_min, params_dict) self.run_compare_torch( input_data, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -5802,35 +6008,40 @@ def test_clamp_min_int(self, compute_unit, backend): ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_clamp_min_float(self, compute_unit, backend): + def test_clamp_min_float(self, compute_unit, backend, frontend): params_dict = {"min": 0.0} input_data = torch.randn((2, 3, 4)) model = ModuleWrapper(torch.clamp_min, params_dict) self.run_compare_torch( input_data, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) @pytest.mark.parametrize( - "compute_unit, backend, shape, threshold, minimum_deployment_target", + "compute_unit, backend, frontend, shape, threshold, minimum_deployment_target", itertools.product( compute_units, backends, + frontends, [(1, 3, 5, 8)], [(0.0, 0.0), (0.5, 0.5), (0.5, 10), (0.9, 0.0)], [None, ct.target.iOS17], ), ) - def test_threshold(self, compute_unit, backend, shape, threshold, minimum_deployment_target): + def test_threshold( + self, compute_unit, backend, frontend, shape, threshold, minimum_deployment_target + ): model = torch.nn.Threshold(threshold[0], threshold[1]).eval() input_value = torch.rand(np.prod(shape)) # make sure the values are not too close to the threshold @@ -5841,6 +6052,7 @@ def test_threshold(self, compute_unit, backend, shape, threshold, minimum_deploy self.run_compare_torch( input_value, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -5848,10 +6060,11 @@ def test_threshold(self, compute_unit, backend, shape, threshold, minimum_deploy ) @pytest.mark.parametrize( - "compute_unit, backend, shape, op_string", + "compute_unit, backend, frontend, shape, op_string", itertools.product( compute_units, backends, + frontends, [(1, 3, 5, 8)], [ "log", @@ -5861,29 +6074,29 @@ def test_threshold(self, compute_unit, backend, shape, threshold, minimum_deploy ), ) def test_elementwise_numerically_stable( - self, compute_unit, backend, shape, op_string + self, compute_unit, backend, frontend, shape, op_string ): op_func = getattr(torch, op_string) model = ModuleWrapper(function=op_func) self.run_compare_torch( shape, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, rand_range=(20, 100), ) @pytest.mark.parametrize( - "compute_unit, backend, dtype", + "compute_unit, backend, frontend, dtype", itertools.product( compute_units, backends, + frontends, [np.int32, np.float32], ), ) - def test_log_dtype( - self, compute_unit, backend, dtype - ): + def test_log_dtype(self, compute_unit, backend, frontend, dtype): SHAPE = (2, 3) input_data = np.random.randint(1, 100, SHAPE).astype(dtype) @@ -5894,56 +6107,58 @@ def test_log_dtype( self.run_compare_torch( input_data, model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, - converter_input_type=converter_input_type + converter_input_type=converter_input_type, ) class TestAtan2(TorchBaseTest): @pytest.mark.parametrize( - 
"compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_atan2(self, compute_unit, backend, rank): + def test_atan2(self, compute_unit, backend, frontend, rank): model = ModuleWrapper(function=torch.atan2) input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) - TorchBaseTest.run_compare_torch( + self.run_compare_torch( [input_shape, input_shape], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, - input_as_shape=True, ) @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_atan2_x0(self, compute_unit, backend, rank): + def test_atan2_x0(self, compute_unit, backend, frontend, rank): model = ModuleWrapper(function=torch.atan2) input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) y = generate_input_data(input_shape, rand_range=(-1.0, 1.0)) x = torch.zeros(input_shape) - TorchBaseTest.run_compare_torch( + self.run_compare_torch( (y, x), model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_atan2_y0x0(self, compute_unit, backend, rank): + def test_atan2_y0x0(self, compute_unit, backend, frontend, rank): model = ModuleWrapper(function=torch.atan2) input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) y = torch.zeros(input_shape) x = torch.zeros(input_shape) - TorchBaseTest.run_compare_torch( + self.run_compare_torch( (y, x), model, backend=backend, @@ -5952,60 +6167,64 @@ def test_atan2_y0x0(self, compute_unit, backend, rank): ) @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_atan2_broadcast(self, compute_unit, backend, rank): + def test_atan2_broadcast(self, compute_unit, backend, frontend, rank): model = ModuleWrapper(function=torch.atan2) input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) truncated_shape = list(input_shape) while len(truncated_shape) > 1: truncated_shape.pop(0) - TorchBaseTest.run_compare_torch( + self.run_compare_torch( [input_shape, truncated_shape], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, - input_as_shape=True, ) - TorchBaseTest.run_compare_torch( + self.run_compare_torch( [truncated_shape, input_shape], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, - input_as_shape=True, ) class TestTriu(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape, diagonal", + "compute_unit, backend, frontend, shape, diagonal", itertools.product( compute_units, backends, + frontends, [(5, 5), (3, 4), (5, 1)], [None, -1, 0, 2], ), ) - def test_triu(self, compute_unit, backend, shape, diagonal): + def test_triu(self, compute_unit, backend, frontend, shape, diagonal): params_dict = {} if diagonal is not None: params_dict["diagonal"] = diagonal model = ModuleWrapper(torch.triu, params_dict) - self.run_compare_torch(shape, model, 
backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) class TestTril(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape, diagonal", + "compute_unit, backend, frontend, shape, diagonal", itertools.product( compute_units, backends, + frontends, [(5, 5), (3, 4), (5, 1)], [None, -1, 0, 2], ), ) - def test_tril(self, compute_unit, backend, shape, diagonal): + def test_tril(self, compute_unit, backend, frontend, shape, diagonal): params_dict = {} if diagonal is not None: params_dict["diagonal"] = diagonal @@ -6013,38 +6232,39 @@ def test_tril(self, compute_unit, backend, shape, diagonal): self.run_compare_torch( shape, model, - backend=backend, compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) class TestMatMul(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_bmm(self, compute_unit, backend): + def test_bmm(self, compute_unit, backend, frontend): shape_x, shape_y = (3, 4, 5), (3, 5, 6) model = ModuleWrapper(function=torch.bmm) self.run_compare_torch( - [shape_x, shape_y], model, backend=backend, compute_unit=compute_unit + [shape_x, shape_y], model, compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_bmm_with_fp16_inputs(self, compute_unit, backend): - if backend == ("mlprogram", "fp16"): - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) + def test_bmm_with_fp16_inputs(self, compute_unit, backend, frontend): + if platform.machine() == "x86_64" and ct.utils._macos_version() <= (14, 2): + pytest.xfail("rdar://135925921 ([CI] Upgrade External CI Machine OS)") class TestModel(torch.nn.Module): def forward(self, x, y): @@ -6060,6 +6280,7 @@ def forward(self, x, y): self.run_compare_torch( inputs, TestModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=ct.target.iOS16, @@ -6069,14 +6290,15 @@ def forward(self, x, y): class TestNumel(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_shape", + "compute_unit, backend, frontend, input_shape", itertools.product( compute_units, backends, + frontends, [(1,), (2, 3)], ), ) - def test_numel(self, compute_unit, backend, input_shape): + def test_numel(self, compute_unit, backend, frontend, input_shape): class TestModel(torch.nn.Module): def forward(self, x): res = torch.numel(x) @@ -6084,44 +6306,47 @@ def forward(self, x): model = TestModel() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestSplit(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, split_size_or_sections, dim", - itertools.product(compute_units, backends, [1, 2, [1, 4]], [0, -2]), + "compute_unit, backend, frontend, split_size_or_sections, dim", + itertools.product(compute_units, backends, frontends, [1, 2, [1, 4]], [0, -2]), ) - def test_split(self, compute_unit, backend, split_size_or_sections, dim): + def test_split(self, compute_unit, backend, frontend, split_size_or_sections, dim): input_shape = (5, 2) model = ModuleWrapper( function=torch.split, kwargs={"split_size_or_sections": 
split_size_or_sections, "dim": dim}, ) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, frontend=frontend, backend=backend, compute_unit=compute_unit ) @pytest.mark.parametrize( - "compute_unit, backend, split_sizes, dim", - itertools.product(compute_units, backends, [[1, 4], [3, 2]], [-1, -2]), + "compute_unit, backend, frontend, split_sizes, dim", + itertools.product(compute_units, backends, frontends, [[1, 4], [3, 2]], [-1, -2]), ) - def test_split_with_sizes(self, compute_unit, backend, split_sizes, dim): + def test_split_with_sizes(self, compute_unit, backend, frontend, split_sizes, dim): input_shape = (5, 5) model = ModuleWrapper( function=torch.split_with_sizes, kwargs={"split_sizes": split_sizes, "dim": dim}, ) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, frontend=frontend, backend=backend, compute_unit=compute_unit ) @pytest.mark.parametrize( - "compute_unit, backend, dim", - itertools.product(compute_units, backends, [-1]), + "compute_unit, backend, frontend, dim", + itertools.product(compute_units, backends, frontends, [-1]), ) - def test_split_with_dynamic_sizes(self, compute_unit, backend, dim): + def test_split_with_dynamic_sizes(self, compute_unit, backend, frontend, dim): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip("Torch.Export cannot export dynamic sizes") + class TestModel(torch.nn.Module): def forward(self, x): size = x[0] @@ -6136,6 +6361,7 @@ def forward(self, x): model, expected_results=torch_out, input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -6155,6 +6381,7 @@ def forward(self, x): model, expected_results=torch_out, input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -6162,44 +6389,45 @@ def forward(self, x): class TestUnbind(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, dim", - itertools.product(compute_units, backends, [0, 1, 2]), + "compute_unit, backend, frontend, dim", + itertools.product(compute_units, backends, frontends, [0, 1, 2]), ) - def test_unbind(self, compute_unit, backend, dim): + def test_unbind(self, compute_unit, backend, frontend, dim): input_shape = (3, 3, 4) model = ModuleWrapper(function=torch.unbind, kwargs={"dim": dim}) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_unbind_one_dim_shape(self, compute_unit, backend): + def test_unbind_one_dim_shape(self, compute_unit, backend, frontend): input_shape = (1,) dim = 0 model = ModuleWrapper(function=torch.unbind, kwargs={"dim": dim}) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestTranspose(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shape, dims", + "compute_unit, backend, frontend, shape, dims", itertools.product( - compute_units, backends, COMMON_SHAPES, [(0, 1), (-2, -1), (1, 0), (-1, -2)] + compute_units, backends, frontends, COMMON_SHAPES, [(0, 1), (-2, -1), (1, 0), (-1, -2)] ), ) - def test(self, compute_unit, backend, shape, dims): - model = ModuleWrapper( - function=torch.transpose, kwargs={"dim0": 
dims[0], "dim1": dims[1]} + def test(self, compute_unit, backend, frontend, shape, dims): + model = ModuleWrapper(function=torch.transpose, kwargs={"dim0": dims[0], "dim1": dims[1]}) + self.run_compare_torch( + shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) class TestTo(TorchBaseTest): @@ -6289,11 +6517,11 @@ def forward(self, input_data): ) def test_to_no_param(self, compute_unit, backend: Tuple[str], input_type): if input_type == np.float16 and backend[0] == "neuralnetwork": + pytest.skip("Input float16 needs target >= iOS16, which doesn't support neuralnetwork.") + if input_type == np.float16 and _macos_version() < (13, 0): pytest.skip( - "Input float16 needs target >= iOS16, which doesn't support neuralnetwork." + "Input float16 needs target >= iOS16, which is not available until macOS 13." ) - if input_type == np.float16 and _macos_version() < (13, 0): - pytest.skip("Input float16 needs target >= iOS16, which is not available until macOS 13.") class TestModel(torch.nn.Module): def forward(self, input_data): @@ -6342,10 +6570,7 @@ def forward(self, x): lambda var: var.op and "range_1d" in var.op.op_type ) mlmodel = self.run_compare_torch( - [(1, 2, 3)], - model, - backend=backend, - compute_unit=compute_unit + [(1, 2, 3)], model, backend=backend, compute_unit=compute_unit ) prog = mlmodel[1]._mil_program # The range_1d op translated from `torch.arange` shouldn't be folded. @@ -6362,7 +6587,7 @@ class TestSlice(TorchBaseTest): def test_slice(self, compute_unit, backend, frontend, start, end, step): class SliceModel(torch.nn.Module): def forward(self, x): - y = x[start : end : step] + y = x[start:end:step] return y model = SliceModel() @@ -6382,7 +6607,7 @@ def forward(self, x): ), ) def test_dynamic_slice(self, compute_unit, backend, frontend): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2189: " "torch.export Cannot Use Dynamic Index to Slice" @@ -6405,9 +6630,7 @@ def forward(self, tokens, context, context_length): tokens_embeddings = self.tokens_embedding(tokens) context_embeddings = self.context_embedding(context) embeddings = torch.cat((context_embeddings, tokens_embeddings), dim=0) - embeddings = self.dynamic_slicer( - embeddings, torch.squeeze(context_length) - ) + embeddings = self.dynamic_slicer(embeddings, torch.squeeze(context_length)) return embeddings @@ -6430,41 +6653,48 @@ def forward(self, tokens, context, context_length): class TestRepeat(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_repeat(self, compute_unit, backend, rank): + def test_repeat(self, compute_unit, backend, frontend, rank): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip("ectedly found a in the inputs") + input_shape = np.random.randint(low=2, high=6, size=rank) repeats = np.random.randint(low=2, high=4, size=rank) input_shape = tuple(input_shape) model = ModuleWrapper(function=lambda x: x.repeat(*repeats)) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, backend=backend, compute_unit=compute_unit, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, 
backends, (1, 2)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, (1, 2)), ) - def test_repeats_with_extra_dimensions(self, compute_unit, backend, rank): + def test_repeats_with_extra_dimensions(self, compute_unit, backend, frontend, rank): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip("unexpectedly found a in the inputs") + input_shape = np.random.randint(low=2, high=6, size=rank) for num_extra_dims in (1, 2): repeats = np.random.randint(low=2, high=4, size=rank + num_extra_dims) model = ModuleWrapper(function=lambda x: x.repeat(*repeats)) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, backend=backend, compute_unit=compute_unit, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_repeats_with_enumerated_shape_case1(self, compute_unit, backend): + def test_repeats_with_enumerated_shape_case1(self, compute_unit, backend, frontend): class Model(nn.Module): def forward(self, x, y): reps = x.size(0) @@ -6484,16 +6714,18 @@ def forward(self, x, y): ], backend=backend, compute_unit=compute_unit, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_repeats_with_enumerated_shape_case2(self, compute_unit, backend): + def test_repeats_with_enumerated_shape_case2(self, compute_unit, backend, frontend): class Model(nn.Module): def forward(self, x, y): return y.repeat(x.size(0), x.size(1)) @@ -6511,16 +6743,18 @@ def forward(self, x, y): ], backend=backend, compute_unit=compute_unit, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_repeats_with_symbolic_shape(self, compute_unit, backend): + def test_repeats_with_symbolic_shape(self, compute_unit, backend, frontend): class Model(nn.Module): def forward(self, x, y): return y.repeat([x.shape[-1], 1, x.shape[0]]) @@ -6543,29 +6777,69 @@ def forward(self, x, y): ], backend=backend, compute_unit=compute_unit, + frontend=frontend, ) class TestRepeatInterleave(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank, repeat", + "compute_unit, backend, frontend, rank, dim, repeat", itertools.product( compute_units, backends, + frontends, (1, 3, 5), - (2, torch.tensor(3), torch.tensor([4])), + (None, 0, 1, 2, 3, 4), + (1, torch.tensor(1), torch.tensor([1]), 2, torch.tensor(3), torch.tensor([4])), ), ) - def test_scalar_repeat(self, compute_unit, backend, rank, repeat): + def test_scalar_repeat(self, compute_unit, backend, frontend, rank, dim, repeat): + if dim is not None and dim >= rank: + pytest.skip() + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.repeat_interleave.Tensor is not Aten Canonical") + input_shape = tuple(np.random.randint(low=1, high=6, size=rank)) - for dim in [None] + [*range(rank)]: - model = ModuleWrapper(function=lambda x: x.repeat_interleave(repeat, dim=dim)) - self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) + model = ModuleWrapper(function=lambda x: x.repeat_interleave(repeat, dim=dim)) + + mlmodel = self.run_compare_torch( + input_shape, + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + )[1] + # when repeat 
= 1, repeat_interelave is a noop + if repeat in (1, torch.tensor(1), torch.tensor([1])): + assert get_op_types_in_program(mlmodel._mil_program) in ( + ["identity"], + ["identity", "identity"], + ["cast", "cast"], + ["reshape"], + ["cast", "reshape", "cast"], + ) + + @pytest.mark.parametrize( + "compute_unit, backend, frontend", + itertools.product( + compute_units, + backends, + frontends, + ), + ) + def test_single_fill_tensor_repeat(self, compute_unit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.repeat_interleave.Tensor is not Aten Canonical") - def test_single_fill_tensor_repeat(self): input_shape = (3, 2) model = ModuleWrapper(function=lambda x: x.repeat_interleave(torch.tensor([2, 2]), dim=1)) - self.run_compare_torch(input_shape, model) + self.run_compare_torch( + input_shape, + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + ) def test_unsupported_tensor_repeat(self): input_shape = (4, 1, 3) @@ -6578,6 +6852,52 @@ def test_unsupported_tensor_repeat(self): ): self.run_compare_torch(input_shape, model) + @pytest.mark.parametrize( + "compute_unit, backend, frontend, dim", + itertools.product( + compute_units, + backends, + frontends, + (None, -4, -3, -2, -1), + ), + ) + def test_dynamic(self, compute_unit, backend, frontend, dim): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch size op does not work on FakeTensor") + if platform.machine() == "x86_64": + pytest.xfail("rdar://135843153 ([Bug] Models failed on x86_64 platform)") + + input_shape = (2, 3, 5, 7) + + class Model(torch.nn.Module): + def forward(self, x): + return x.repeat_interleave(2, dim=dim) + + model = Model() + + torch_export_dynamic_shapes = None + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + batch_dim = torch.export.Dim(name="batch_dim", max=128) + sequence_length = torch.export.Dim(name="sequence_length", max=256) + torch_export_dynamic_shapes = {"x": {0: batch_dim, 2: sequence_length}} + + converter_input_type = None + if frontend == TorchFrontend.TORCHSCRIPT: + batch_dim = RangeDim(lower_bound=2, upper_bound=128) + sequence_length = RangeDim(lower_bound=2, upper_bound=256) + input_symbolic_shape = (batch_dim, 3, sequence_length, 7) + converter_input_type = [TensorType(shape=input_symbolic_shape)] + + self.run_compare_torch( + input_shape, + model, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, + torch_export_dynamic_shapes=torch_export_dynamic_shapes, + converter_input_type=converter_input_type, + ) + class TestStd(TorchBaseTest): @pytest.mark.parametrize( @@ -6680,30 +7000,54 @@ def forward(self, x): class TestFill(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank, dynamic, fill_scalar, src_dtype", + "compute_unit, backend, frontend, rank, dynamic, fill_scalar, src_dtype", itertools.product( compute_units, backends, + frontends, [1, 3], [False, True], [0.2, torch.tensor(float("-inf")), torch.tensor(2)], [torch.int32, torch.float32], ), ) - def test_fill_(self, compute_unit, backend, rank, dynamic, fill_scalar, src_dtype): + def test_fill_(self, compute_unit, backend, frontend, rank, dynamic, fill_scalar, src_dtype): if src_dtype == torch.int32 and fill_scalar == torch.tensor(float("-inf")): pytest.skip("float(-inf) cannot be casted to int.") + if ( + backend[0] == "neuralnetwork" + and fill_scalar == 0.2 + and src_dtype == torch.int32 + and frontend in TORCH_EXPORT_BASED_FRONTENDS + ): + pytest.xfail("rdar://133816197 Cast mb.fill output dtype to EXIR specification") 
input_shape = np.random.randint(low=2, high=6, size=rank) input_shape = tuple(input_shape) - class FillModel(nn.Module): - def forward(self, x): - y = torch.empty(x.shape, dtype=src_dtype) - y.fill_(fill_scalar) - return y + if frontend == TorchFrontend.TORCHSCRIPT: + + class FillModel(nn.Module): + def forward(self, x): + y = torch.empty(x.shape, dtype=src_dtype) + y.fill_(fill_scalar) + return y + + model = FillModel() + else: + + class FillModel(nn.Module): + def __init__(self, fill_scalar): + super().__init__() + self.fill_scalar = fill_scalar + + def forward(self, x): + y = torch.empty(x.shape, dtype=src_dtype) + y.fill_(self.fill_scalar) + return y + + model = FillModel(fill_scalar) - model = FillModel() if dynamic: upper_bound = 10 if backend[0] == "mlprogram" else -1 if rank == 1: @@ -6731,35 +7075,60 @@ def forward(self, x): input_shape, model, converter_input_type=converter_input_type, + compute_unit=compute_unit, backend=backend, - compute_unit=compute_unit + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend, rank, dynamic, fill_scalar, src_dtype", + "compute_unit, backend, frontend, rank, dynamic, fill_scalar, src_dtype", itertools.product( compute_units, backends, + frontends, [1, 3], [False, True], [0.2, torch.tensor(float("-inf")), torch.tensor(2)], [torch.int32, torch.float32], ), ) - def test_fill__2(self, compute_unit, backend, rank, dynamic, fill_scalar, src_dtype): + def test_fill__2(self, compute_unit, backend, frontend, rank, dynamic, fill_scalar, src_dtype): if src_dtype == torch.int32 and fill_scalar == torch.tensor(float("-inf")): pytest.skip("float(-inf) cannot be casted to int.") + if ( + backend[0] == "neuralnetwork" + and fill_scalar == 0.2 + and src_dtype == torch.int32 + and frontend in TORCH_EXPORT_BASED_FRONTENDS + ): + pytest.xfail("rdar://133816197 Cast mb.fill output dtype to EXIR specification") input_shape = np.random.randint(low=2, high=6, size=rank) input_shape = tuple(input_shape) - class FillModel(nn.Module): - def forward(self, x): - y = torch.empty(x.shape, dtype=src_dtype) - y.fill_(fill_scalar) - return y + 1 + if frontend == TorchFrontend.TORCHSCRIPT: + + class FillModel(nn.Module): + def forward(self, x): + y = torch.empty(x.shape, dtype=src_dtype) + y.fill_(fill_scalar) + return y + 1 + + model = FillModel() + else: + + class FillModel(nn.Module): + def __init__(self, fill_scalar): + super().__init__() + self.fill_scalar = fill_scalar + + def forward(self, x): + y = torch.empty(x.shape, dtype=src_dtype) + y.fill_(self.fill_scalar) + return y + 1 + + model = FillModel(fill_scalar) - model = FillModel() if dynamic: upper_bound = 10 if backend[0] == "mlprogram" else -1 if rank == 1: @@ -6787,8 +7156,9 @@ def forward(self, x): input_shape, model, converter_input_type=converter_input_type, + compute_unit=compute_unit, backend=backend, - compute_unit=compute_unit + frontend=frontend, ) @@ -7136,10 +7506,10 @@ def forward(self, x, y): class TestLog10(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_log10(self, compute_unit, backend, rank): + def test_log10(self, compute_unit, backend, frontend, rank): class Log10Model(nn.Module): def forward(self, x): return torch.log10(x) @@ -7147,16 +7517,16 @@ def forward(self, x): input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) model = Log10Model() 
self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestLog2(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_log2(self, compute_unit, backend, rank): + def test_log2(self, compute_unit, backend, frontend, rank): class Log2Model(nn.Module): def __init__(self): super(Log2Model, self).__init__() @@ -7167,38 +7537,41 @@ def forward(self, x): input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) model = Log2Model() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestUnique(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x, return_inverse, return_counts", + "compute_unit, backend, frontend, x, return_inverse, return_counts", itertools.product( compute_units, backends, + frontends, ( [1, 2, 3, 2, 2, 3, 99, -1, 1], [[1, 2, 3, 100], [3, 2, 99, 1]], ), (True, False), (True, False), - ) + ), ) - def test(self, compute_unit, backend, x, return_inverse, return_counts): + def test(self, compute_unit, backend, frontend, x, return_inverse, return_counts): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip("torch._dynamo.exc.Unsupported: dynamic shape operator: aten._unique2") + class Model(nn.Module): def forward(self, x): - return torch.unique( - x, return_inverse=return_inverse, return_counts=return_counts - ) + return torch.unique(x, return_inverse=return_inverse, return_counts=return_counts) - if backend[0] == 'neuralnetwork': + if backend[0] == "neuralnetwork": pytest.xfail("This op is only supported on mlprogram backend.") self.run_compare_torch( torch.Tensor(x), Model(), input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -7206,14 +7579,15 @@ def forward(self, x): class TestFlip(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank_dim", + "compute_unit, backend, frontend, rank_dim", itertools.product( compute_units, backends, + frontends, [(1, [0]), (2, [0, 1]), (3, [1]), (4, [0, 1, 2, 3])], ), ) - def test_flip(self, compute_unit, backend, rank_dim): + def test_flip(self, compute_unit, backend, frontend, rank_dim): rank, dim = rank_dim class FlipModel(nn.Module): @@ -7223,16 +7597,17 @@ def forward(self, x): input_shape = tuple(np.random.randint(low=1, high=10, size=rank)) model = FlipModel() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestBitWiseLogical(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x_y, op_string", + "compute_unit, backend, frontend, x_y, op_string", itertools.product( compute_units, backends, + frontends, [ ([True, False, True, False], [True, True, False, False]), ([[True, False], [True, False]], [[True, True], [False, False]]), @@ -7246,7 +7621,7 @@ class TestBitWiseLogical(TorchBaseTest): ], ), ) - def test_bitwise_logical(self, compute_unit, backend, x_y, op_string): + def test_bitwise_logical(self, compute_unit, backend, frontend, x_y, op_string): if not contains_op(torch, op_string): return op_func = getattr(torch, op_string) @@ -7256,6 +7631,7 @@ def 
test_bitwise_logical(self, compute_unit, backend, x_y, op_string): self.run_compare_torch( [x, y], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -7264,10 +7640,11 @@ def test_bitwise_logical(self, compute_unit, backend, x_y, op_string): class TestLogicalAnd(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x_y", + "compute_unit, backend, frontend, x_y", itertools.product( compute_units, backends, + frontends, [ ([True, False, True, False], [True, True, False, False]), ([[True, False], [True, False]], [[True, True], [False, False]]), @@ -7276,7 +7653,7 @@ class TestLogicalAnd(TorchBaseTest): ], ), ) - def test_logical_and(self, compute_unit, backend, x_y): + def test_logical_and(self, compute_unit, backend, frontend, x_y): class TestNet(nn.Module): def forward(self, x, y): return torch.logical_and(x, y) @@ -7287,6 +7664,7 @@ def forward(self, x, y): self.run_compare_torch( [x, y], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -7295,10 +7673,11 @@ def forward(self, x, y): class TestLogicalOr(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x_y", + "compute_unit, backend, frontend, x_y", itertools.product( compute_units, backends, + frontends, [ ([True, False, True, False], [True, True, False, False]), ([[True, False], [True, False]], [[True, True], [False, False]]), @@ -7307,7 +7686,7 @@ class TestLogicalOr(TorchBaseTest): ], ), ) - def test_logical_or(self, compute_unit, backend, x_y): + def test_logical_or(self, compute_unit, backend, frontend, x_y): class TestNet(nn.Module): def forward(self, x, y): return torch.logical_or(x, y) @@ -7318,6 +7697,7 @@ def forward(self, x, y): self.run_compare_torch( [x, y], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -7326,10 +7706,11 @@ def forward(self, x, y): class TestLogicalXor(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x_y", + "compute_unit, backend, frontend, x_y", itertools.product( compute_units, backends, + frontends, [ ([True, False, True, False], [True, True, False, False]), ([[True, False], [True, False]], [[True, True], [False, False]]), @@ -7338,7 +7719,7 @@ class TestLogicalXor(TorchBaseTest): ], ), ) - def test_logical_xor(self, compute_unit, backend, x_y): + def test_logical_xor(self, compute_unit, backend, frontend, x_y): class TestNet(nn.Module): def forward(self, x, y): return torch.logical_xor(x, y) @@ -7349,6 +7730,64 @@ def forward(self, x, y): self.run_compare_torch( [x, y], model, + frontend=frontend, + backend=backend, + compute_unit=compute_unit, + input_as_shape=False, + ) + + +class TestLogicalNot(TorchBaseTest): + @pytest.mark.parametrize( + "compute_unit, backend, frontend, input_dtype", + itertools.product( + compute_units, + backends, + frontends, + [torch.int32, torch.float32, torch.bool], + ), + ) + def test_logical_not(self, compute_unit, backend, frontend, input_dtype): + class TestModel(torch.nn.Module): + def forward(self, x): + return torch.logical_not(x) + + input_data = torch.randint( + low=0, high=2 if input_dtype == torch.bool else 4, size=(2, 3, 4), dtype=input_dtype + ) + self.run_compare_torch( + input_data, + TestModel(), + frontend=frontend, + backend=backend, + compute_unit=compute_unit, + input_as_shape=False, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, frontend, input_dtype, output_dtype", + itertools.product( + compute_units, + backends, + frontends, + [torch.int32, 
torch.float32, torch.bool], + [torch.int16, torch.float16, torch.bool], + ), + ) + def test_logical_not_with_out(self, compute_unit, backend, frontend, input_dtype, output_dtype): + class TestModel(torch.nn.Module): + def forward(self, x): + out_tensor = torch.empty((2, 3, 4), dtype=output_dtype) + torch.logical_not(x, out=out_tensor) + return out_tensor + + input_data = torch.randint( + low=0, high=2 if input_dtype == torch.bool else 4, size=(2, 3, 4), dtype=input_dtype + ) + self.run_compare_torch( + input_data, + TestModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -7467,7 +7906,7 @@ def forward(self, cond, x, y): itertools.product(compute_units, backends, frontends, COMMON_SHAPES + [(10,)]), ) def test_where_single_param(self, compute_unit, backend, frontend, shape): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2183: " "Operator torch._ops.aten._assert_async.msg is not Aten Canonical" @@ -7533,7 +7972,7 @@ def forward(self, x): itertools.product(compute_units, backends, frontends) ) def test_dynamic_index(self, compute_unit, backend, frontend): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2189: " "torch.export Cannot Use Dynamic Index to Select" @@ -7545,11 +7984,11 @@ def forward(self, float_arr, int_arr): float_arr[dynamic_index] = 12.95 return float_arr - a = torch.Tensor([1., 2., 4., 5]) + a = torch.Tensor([1.0, 2.0, 4.0, 5]) i = torch.Tensor([0, 1, 2]).long() - inputs_types=[ + inputs_types = [ ct.TensorType(name="a", shape=a.shape), - ct.TensorType(name="i", shape=i.shape, dtype=np.int32) + ct.TensorType(name="i", shape=i.shape, dtype=np.int32), ] self.run_compare_torch( @@ -7559,14 +7998,16 @@ def forward(self, float_arr, int_arr): converter_input_type=inputs_types, frontend=frontend, backend=backend, - compute_unit=compute_unit + compute_unit=compute_unit, ) @pytest.mark.parametrize( "compute_unit, backend, frontend", itertools.product(compute_units, backends, frontends), ) - def test_dynamic_index_with_explicit_slice_on_all_other_dims(self, compute_unit, backend, frontend): + def test_dynamic_index_with_explicit_slice_on_all_other_dims( + self, compute_unit, backend, frontend + ): class SelectModel(torch.nn.Module): def forward(self, x, position): y = x[:, :, position] @@ -7585,15 +8026,18 @@ def forward(self, x, position): class TestNonZero(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank, as_tuple", + "compute_unit, backend, frontend, rank, as_tuple", itertools.product( compute_units, backends, + frontends, [1, 3], [False, True], ), ) - def test_non_zero(self, compute_unit, backend, rank, as_tuple): + def test_non_zero(self, compute_unit, backend, frontend, rank, as_tuple): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip("Cannot support _assert_async") if rank == 1: input_shape = 10 @@ -7616,6 +8060,7 @@ def test_non_zero(self, compute_unit, backend, rank, as_tuple): input, model, input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -7623,24 +8068,86 @@ def test_non_zero(self, compute_unit, backend, rank, as_tuple): class TestTorchTensor(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank", + "compute_unit, backend, frontend, rank", itertools.product( compute_units, backends, + frontends, [0, 1, 2, 3, 4, 5], ), ) - def 
test_torch_tensor(self, compute_unit, backend, rank): - class Model(nn.Module): - def __init__(self, rank): - super(Model, self).__init__() - self.rank = rank + def test_torch_tensor(self, compute_unit, backend, frontend, rank): + if frontend == TorchFrontend.TORCHSCRIPT: - def forward(self, x): - with torch.no_grad(): + class Model(nn.Module): + def __init__(self, rank): + super(Model, self).__init__() + self.rank = rank + + def forward(self, x): + with torch.no_grad(): + if self.rank == 0: + res = self.generate_tensor_rank_0(x) + return torch.unsqueeze(res, 0) + if self.rank == 1: + return self.generate_tensor_rank_1(x) + if self.rank == 2: + return self.generate_tensor_rank_2(x) + if self.rank == 3: + return self.generate_tensor_rank_3(x) + if self.rank == 4: + return self.generate_tensor_rank_4(x) + if self.rank == 5: + return self.generate_tensor_rank_5(x) + + @torch.jit.script + def generate_tensor_rank_0(x): + _, _, _, w = x.shape + return torch.tensor(w, dtype=torch.int32) + + @torch.jit.script + def generate_tensor_rank_1(x): + _, _, h, w = x.shape + return torch.tensor([h, w, 0, 1], dtype=torch.int32) + + @torch.jit.script + def generate_tensor_rank_2(x): + _, _, h, w = x.shape + return torch.tensor([[0, h], [h, w], [w, w]], dtype=torch.float32) + + @torch.jit.script + def generate_tensor_rank_3(x): + _, _, h, w = x.shape + return torch.tensor([[[h, 1]], [[3, w]]], dtype=torch.int32) + + @torch.jit.script + def generate_tensor_rank_4(x): + _, _, h, w = x.shape + return torch.tensor( + [ + [[[h, h], [h, w]], [[w, w], [w, 1]]], + [[[0, 0], [1, 1]], [[0, h], [h, w]]], + ], + dtype=torch.float32, + ) + + @torch.jit.script + def generate_tensor_rank_5(x): + _, _, h, w = x.shape + return torch.tensor( + [[[[[h, w], [w, w]], [[1, 1], [0, h]]]]], dtype=torch.float32 + ) + + else: + + class Model(nn.Module): + def __init__(self, rank): + super(Model, self).__init__() + self.rank = rank + + def forward(self, x): if self.rank == 0: - res = self.generate_tensor_rank_0(x) - return torch.unsqueeze(res, 0) + return self.generate_tensor_rank_0(x) if self.rank == 1: return self.generate_tensor_rank_1(x) if self.rank == 2: @@ -7652,53 +8159,50 @@ def forward(self, x): if self.rank == 5: return self.generate_tensor_rank_5(x) - @torch.jit.script - def generate_tensor_rank_0(x): - _, _, _, w = x.shape - return torch.tensor(w, dtype=torch.int32) - - @torch.jit.script - def generate_tensor_rank_1(x): - _, _, h, w = x.shape - return torch.tensor([h, w, 0, 1], dtype=torch.int32) - - @torch.jit.script - def generate_tensor_rank_2(x): - _, _, h, w = x.shape - return torch.tensor([[0, h], [h, w], [w, w]], dtype=torch.float32) - - @torch.jit.script - def generate_tensor_rank_3(x): - _, _, h, w = x.shape - return torch.tensor([[[h, 1]], [[3, w]]], dtype=torch.int32) - - @torch.jit.script - def generate_tensor_rank_4(x): - _, _, h, w = x.shape - return torch.tensor( - [ - [[[h, h], [h, w]], [[w, w], [w, 1]]], - [[[0, 0], [1, 1]], [[0, h], [h, w]]], - ], - dtype=torch.float32, - ) + def generate_tensor_rank_0(self, x): + _, _, _, w = x.shape + return torch.tensor(w, dtype=torch.int32) + + def generate_tensor_rank_1(self, x): + _, _, h, w = x.shape + return torch.tensor([h, w, 0, 1], dtype=torch.int32) + + def generate_tensor_rank_2(self, x): + _, _, h, w = x.shape + return torch.tensor([[0, h], [h, w], [w, w]], dtype=torch.float32) + + def generate_tensor_rank_3(self, x): + _, _, h, w = x.shape + return torch.tensor([[[h, 1]], [[3, w]]], dtype=torch.int32) + + def generate_tensor_rank_4(self, x): + _, _, h, w = 
x.shape + return torch.tensor( + [ + [[[h, h], [h, w]], [[w, w], [w, 1]]], + [[[0, 0], [1, 1]], [[0, h], [h, w]]], + ], + dtype=torch.float32, + ) - @torch.jit.script - def generate_tensor_rank_5(x): - _, _, h, w = x.shape - return torch.tensor( - [[[[[h, w], [w, w]], [[1, 1], [0, h]]]]], dtype=torch.float32 - ) + def generate_tensor_rank_5(self, x): + _, _, h, w = x.shape + return torch.tensor( + [[[[[h, w], [w, w]], [[1, 1], [0, h]]]]], dtype=torch.float32 + ) shape = (1, 1, 3, 4) model = Model(rank) - self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + shape, model, compute_unit=compute_unit, backend=backend, frontend=frontend + ) @pytest.mark.parametrize( - "compute_unit, backend, torch_op", + "compute_unit, backend, frontend, torch_op", itertools.product( compute_units, backends, + frontends, [ torch.abs, torch.acos, @@ -7725,7 +8229,10 @@ def generate_tensor_rank_5(x): ], ), ) - def test_torch_rank0_tensor(self, compute_unit, backend, torch_op): + def test_torch_rank0_tensor(self, compute_unit, backend, frontend, torch_op): + if frontend == TorchFrontend.EXECUTORCH and torch_op == torch.exp2: + pytest.skip("torch._ops.aten.exp2.default is not Aten Canonical") + class Model(nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: return torch_op(torch.tensor(0.1)) @@ -7735,6 +8242,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: torch.tensor([1.0, 2.0, 3.0]), model, input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -7821,15 +8329,6 @@ def test_tensor_assign_case_broadcast( pytest.xfail( "rdar://128024502 ([Bug][iOS18] slice_update failing test on backends beside CPU_ONLY + Classic CPU)" ) - else: - if ( - backend == "mlprogram" - and shape == (5, 4, 3) - and minimum_deployment_target == ct.target.iOS18 - ): - pytest.xfail( - "rdar://128024502 ([Bug][iOS18] slice_update failing test on backends beside CPU_ONLY + Classic CPU)" - ) class TensorAssignModel(torch.nn.Module): def __init__(self): @@ -8010,16 +8509,16 @@ def test_tensor_assign_dynamic_slice( pytest.xfail( "rdar://128024502 ([Bug][iOS18] slice_update failing test on backends beside CPU_ONLY + Classic CPU)" ) - else: - # On BNNS, some cases are passing, only static cases are failing - if ( - backend[0] == "mlprogram" - and not dynamic - and minimum_deployment_target == ct.target.iOS18 - ): - pytest.xfail( - "rdar://128024502 ([Bug][iOS18] slice_update failing test on backends beside CPU_ONLY + Classic CPU)" - ) + + if ( + backend[0] == "mlprogram" + and not dynamic + and minimum_deployment_target == ct.target.iOS18 + ): + pytest.xfail( + "rdar://133494070 [iOS18] [Slice_Update] " + "Toy iOS18.slice_update Model Passes in BNNS but Dies in Core ML" + ) # general case with dynamic begin and end class TensorAssignModel(torch.nn.Module): @@ -8123,27 +8622,21 @@ def forward(self, x): class TestSelectScatter(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target, input_shape", + "compute_unit, backend, frontend, minimum_deployment_target, input_shape", itertools.product( compute_units, backends, + frontends, [None, ct.target.iOS18], [(1,), (4,), (3, 4), (1, 2, 4)], ), ) - def test_select_scatter(self, compute_unit, backend, minimum_deployment_target, input_shape): + def test_select_scatter( + self, compute_unit, backend, frontend, minimum_deployment_target, input_shape + ): rank = len(input_shape) - if ( - input_shape == (1, 2, 4) - and minimum_deployment_target == ct.target.iOS18 - 
): - pytest.xfail( - "rdar://128024502 ([Bug][iOS18] slice_update failing test on backends beside CPU_ONLY + Classic CPU)" - ) - def test_model(src_shape, dim, index): - class SelectScatterModel(torch.nn.Module): def forward(self, x, y): return torch.select_scatter( @@ -8172,13 +8665,17 @@ def forward(self, x, y): res = self.run_compare_torch( [input_shape, src_shape], model, + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, ) # check slice_update is used - if minimum_deployment_target == ct.target.iOS18: + if ( + minimum_deployment_target == ct.target.iOS18 + and frontend != TorchFrontend.EXECUTORCH + ): prog = res[1]._mil_program assert "slice_update" in get_op_types_in_program(prog) @@ -8192,15 +8689,18 @@ def forward(self, x, y): class TestSliceScatter(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, minimum_deployment_target, input_shape", + "compute_unit, backend, frontend, minimum_deployment_target, input_shape", itertools.product( compute_units, backends, + frontends, [None, ct.target.iOS18], [(1,), (4,), (3, 4), (1, 2, 4)], ), ) - def test_slice_scatter(self, compute_unit, backend, minimum_deployment_target, input_shape): + def test_slice_scatter( + self, compute_unit, backend, frontend, minimum_deployment_target, input_shape + ): rank = len(input_shape) def test_model(src_shape, dim, start, end, step): @@ -8218,6 +8718,7 @@ def forward(self, x, y): res = self.run_compare_torch( [input_shape, src_shape], SliceScatterModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, @@ -8251,7 +8752,7 @@ class TestIndexPut(TorchBaseTest): ), ) def test_index_put_bool_index_case_1(self, compute_unit, backend, frontend, minimum_deployment_target): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2183: " "Operator torch._ops.aten._assert_async.msg is not Aten Canonical" @@ -8287,7 +8788,10 @@ def forward(self, x, y): def test_index_put_bool_index_case_2( self, compute_unit, backend, frontend, rank, minimum_deployment_target ): - if backend[0] == "neuralnetwork" and frontend == TorchFrontend.EXIR: + if backend[0] == "neuralnetwork" and frontend in ( + TorchFrontend.TORCHEXPORT, + TorchFrontend.EXECUTORCH, + ): pytest.xfail( "https://github.com/apple/coremltools/issues/2185: " "EXIR IndexPut Fails on NeuralNetwork Backend" @@ -8296,7 +8800,40 @@ def test_index_put_bool_index_case_2( class IndexPutModel(torch.nn.Module): def forward(self, x): mask = torch.tensor([True, False, False, False, True, True]).view(3, 2) - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + x = x.clone() + if rank == 0: + x[mask] = 0.0 + if rank == 1: + x[mask] = torch.tensor([1.0]) + return x + + self.run_compare_torch( + (3, 2), + IndexPutModel(), + frontend=frontend, + backend=backend, + compute_unit=compute_unit, + minimum_deployment_target=minimum_deployment_target, + ) + + @pytest.mark.parametrize( + "compute_unit, backend, frontend, rank, minimum_deployment_target", + itertools.product( + compute_units, + backends, + frontends, + [0, 1], + [None, ct.target.iOS17], + ), + ) + def test_index_put_bool_index_all_false( + self, compute_unit, backend, frontend, rank, minimum_deployment_target + ): + class IndexPutModel(torch.nn.Module): + def forward(self, x): + mask = torch.tensor([False, False, False, False, False, False]).view(3, 2) + if 
frontend in TORCH_EXPORT_BASED_FRONTENDS: x = x.clone() if rank == 0: x[mask] = 0.0 @@ -8310,7 +8847,7 @@ def forward(self, x): frontend=frontend, backend=backend, compute_unit=compute_unit, - minimum_deployment_target=minimum_deployment_target + minimum_deployment_target=minimum_deployment_target, ) @pytest.mark.parametrize( @@ -8322,8 +8859,13 @@ def forward(self, x): [None, ct.target.iOS17], ), ) - def test_index_put_dynamic_bool_index(self, compute_unit, backend, frontend, minimum_deployment_target): - if backend[0] == "neuralnetwork" and frontend == TorchFrontend.EXIR: + def test_index_put_dynamic_bool_index( + self, compute_unit, backend, frontend, minimum_deployment_target + ): + if backend[0] == "neuralnetwork" and frontend in ( + TorchFrontend.TORCHEXPORT, + TorchFrontend.EXECUTORCH, + ): pytest.xfail( "https://github.com/apple/coremltools/issues/2185: " "EXIR IndexPut Fails on NeuralNetwork Backend" @@ -8334,7 +8876,7 @@ def test_index_put_dynamic_bool_index(self, compute_unit, backend, frontend, min class IndexPutModel(torch.nn.Module): def forward(self, x, y): mask = y > 1 - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: x = x.clone() x[y > 1] = 0.0 return x @@ -8367,7 +8909,10 @@ def forward(self, x, y): def test_index_put_int_index_case_1( self, compute_unit, backend, frontend, rank, accumulate, minimum_deployment_target ): - if backend[0] == "neuralnetwork" and frontend == TorchFrontend.EXIR: + if backend[0] == "neuralnetwork" and frontend in ( + TorchFrontend.TORCHEXPORT, + TorchFrontend.EXECUTORCH, + ): pytest.xfail( "https://github.com/apple/coremltools/issues/2185: " "EXIR IndexPut Fails on NeuralNetwork Backend" @@ -8375,7 +8920,7 @@ def test_index_put_int_index_case_1( class IndexPutModel(torch.nn.Module): def forward(self, x, indices, values): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: x = x.clone() x.index_put_(tuple(indices.t()), values, accumulate=accumulate) return x @@ -8550,15 +9095,15 @@ def forward(self, x, position, val): def test_index_put_negative_indices_case_1( self, compute_unit, backend, frontend, accumulate, minimum_deployment_target ): - if backend[0] == "neuralnetwork" and frontend == TorchFrontend.EXIR: - pytest.xfail( - "https://github.com/apple/coremltools/issues/2185: " - "EXIR IndexPut Fails on NeuralNetwork Backend" + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip( + "https://github.com/pytorch/pytorch/issues/134443 " + "Torch exported program outputs fake tensor" ) class IndexPutModel(torch.nn.Module): def forward(self, x): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: x = x.clone() x.index_put_( indices=(torch.LongTensor([0, -1]), torch.LongTensor([-2, 1])), @@ -8590,7 +9135,10 @@ def forward(self, x): def test_index_put_negative_indices_case_2( self, compute_unit, backend, frontend, rank, accumulate, minimum_deployment_target ): - if backend[0] == "neuralnetwork" and frontend == TorchFrontend.EXIR: + if backend[0] == "neuralnetwork" and frontend in ( + TorchFrontend.TORCHEXPORT, + TorchFrontend.EXECUTORCH, + ): pytest.xfail( "https://github.com/apple/coremltools/issues/2185: " "EXIR IndexPut Fails on NeuralNetwork Backend" @@ -8606,7 +9154,7 @@ def test_index_put_negative_indices_case_2( class IndexPutModel(torch.nn.Module): def forward(self, x, indices, values): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: x = x.clone() x.index_put_(tuple(indices.t()), values, 
accumulate=accumulate) return x @@ -8660,7 +9208,7 @@ class TestIndex(TorchBaseTest): def test_index_bool_indices( self, compute_unit, backend, frontend, input_dtype, shape, minimum_deployment_target ): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2183: " "Operator torch._ops.aten._assert_async.msg is not Aten Canonical" @@ -8721,7 +9269,7 @@ def forward(self, x, y): def test_index_int_index_case_1( self, compute_unit, backend, frontend, input_dtype, shape, minimum_deployment_target ): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2184: " "Cannot Convert Empty EXIR Model" @@ -9057,7 +9605,7 @@ def forward(self, x): def test_index_int_index_case_9( self, compute_unit, backend, frontend, input_dtype, shape, minimum_deployment_target ): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2183: " "Operator torch._ops.aten._assert_async.msg is not Aten Canonical" @@ -9101,7 +9649,7 @@ def forward(self, x): def test_index_int_index_case_10( self, compute_unit, backend, frontend, input_dtype, shape, minimum_deployment_target ): - if frontend == TorchFrontend.EXIR: + if frontend in TORCH_EXPORT_BASED_FRONTENDS: pytest.xfail( "https://github.com/apple/coremltools/issues/2183: " "Operator torch._ops.aten._assert_async.msg is not Aten Canonical" @@ -9289,12 +9837,11 @@ def test_index_select_invalid_indices(self): class TestLoss(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, rank, reduction", - itertools.product( - compute_units, backends, range(1, 4), ["none", "mean", "sum"] - ), + itertools.product(compute_units, backends, range(1, 4), ["none", "mean", "sum"]), ) def test_mse_loss(self, compute_unit, backend, rank: int, reduction: str): input_shape = tuple(np.random.randint(low=1, high=5, size=rank)) + class Model(torch.nn.Module): def __init__(self): super().__init__() @@ -9305,19 +9852,23 @@ def forward(self, x, y): input_shapes = [input_shape, input_shape] - self.run_compare_torch( - input_shapes, Model(), backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(input_shapes, Model(), backend=backend, compute_unit=compute_unit) class TestPad(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, rank, mode", + "compute_unit, backend, frontend, rank, mode", itertools.product( - compute_units, backends, range(3, 5), ["reflect", "replicate"] + compute_units, backends, frontends, range(3, 5), ["reflect", "replicate"] ), ) - def test_pad_reflect_replicate(self, compute_unit, backend, rank: int, mode: str): + def test_pad_reflect_replicate(self, compute_unit, backend, frontend, rank: int, mode: str): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip( + "torch._dynamo.exc.UserError: Tried to use data-dependent value " + "in the subsequent computation" + ) + if rank == 3: pad_len = 2 input_shape = (5, 10, 10) @@ -9330,18 +9881,22 @@ def test_pad_reflect_replicate(self, compute_unit, backend, rank: int, mode: str ) max_pad = min(input_shape[-1], input_shape[-2]) pad = list(np.random.randint(low=0, high=max_pad, size=pad_len)) - model = ModuleWrapper( - function=torch.nn.functional.pad, kwargs={"pad": pad, "mode": mode} - ) + model = ModuleWrapper(function=torch.nn.functional.pad, kwargs={"pad": pad, "mode": mode}) self.run_compare_torch( - input_shape, model, 
backend=backend, compute_unit=compute_unit + input_shape, model, backend=backend, compute_unit=compute_unit, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, rank", - itertools.product(compute_units, backends, range(1, 6)), + "compute_unit, backend, frontend, rank", + itertools.product(compute_units, backends, frontends, range(1, 6)), ) - def test_pad_constant(self, compute_unit, backend, rank: int): + def test_pad_constant(self, compute_unit, backend, frontend, rank: int): + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + pytest.skip( + "torch._dynamo.exc.UserError: Tried to use data-dependent value in the subsequent " + "computation" + ) + if rank > 5: raise NotImplementedError("Only supports < 6D constant padding") val = float(np.random.random(1)) @@ -9353,63 +9908,71 @@ def test_pad_constant(self, compute_unit, backend, rank: int): kwargs={"pad": pad, "mode": "constant", "value": val}, ) self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, + model, + backend=backend, + compute_unit=compute_unit, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_constant_pad_1d(self, compute_unit, backend): + def test_constant_pad_1d(self, compute_unit, backend, frontend): input_shape = (3, 4, 5) model = torch.nn.ConstantPad1d((5, 6), 3.5).eval() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, backend=backend, compute_unit=compute_unit, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_constant_pad_2d(self, compute_unit, backend): + def test_constant_pad_2d(self, compute_unit, backend, frontend): input_shape = (3, 4, 5, 6) model = torch.nn.ConstantPad2d((5, 6, 3, 8), 3.5).eval() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, backend=backend, compute_unit=compute_unit, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_constant_pad_3d(self, compute_unit, backend): + def test_constant_pad_3d(self, compute_unit, backend, frontend): input_shape = (3, 4, 5, 6, 2) model = torch.nn.ConstantPad3d((5, 6, 3, 8, 2, 4), 3.5).eval() self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit + input_shape, model, backend=backend, compute_unit=compute_unit, frontend=frontend ) class TestMaskedFill(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, dtype, value", + "compute_unit, backend, frontend, dtype, value", itertools.product( compute_units, backends, + frontends, [np.int32, np.float32], [10.3, 7, 0], ), ) - def test_masked_fill(self, compute_unit, backend, dtype, value): + def test_masked_fill(self, compute_unit, backend, frontend, dtype, value): SHAPE = (2, 3) MASK = torch.bernoulli(torch.rand(SHAPE[-1])).to(torch.bool) @@ -9418,9 +9981,10 @@ def test_masked_fill(self, compute_unit, backend, dtype, value): model = ModuleWrapper(torch.masked_fill, {"mask": MASK, "value": value}) converter_input_type = [TensorType(shape=SHAPE, dtype=dtype)] - TorchBaseTest.run_compare_torch( + self.run_compare_torch( input_data, model, + frontend=frontend, backend=backend, 
compute_unit=compute_unit, input_as_shape=False, @@ -9430,10 +9994,11 @@ def test_masked_fill(self, compute_unit, backend, dtype, value): class TestMeshgrid(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, x, y, z, dtype, inp_mode, indexing", + "compute_unit, backend, frontend, x, y, z, dtype, inp_mode, indexing", itertools.product( compute_units, backends, + frontends, [1, 2], [3, 4], [5, 6], @@ -9446,6 +10011,7 @@ def test_meshgrid( self, compute_unit, backend, + frontend, x, y, z, @@ -9474,6 +10040,7 @@ def forward(self, x, y, z): model, expected_results, input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -9486,17 +10053,15 @@ class TestAddmm(TorchBaseTest): compute_units, backends, ((2, 2, 2), (4, 5, 9)), - (1., 2.), - (1., 3.), - ) + (1.0, 2.0), + (1.0, 3.0), + ), ) def test_addmm(self, compute_unit, backend, shapes, beta, alpha): - class TestModel(nn.Module): def forward(self, x): return torch.addmm(x, m1, m2, beta=beta, alpha=alpha) - m, n, p = shapes # m1 @ m2 must be legal @@ -9506,7 +10071,10 @@ def forward(self, x): x_shape = (m, p) self.run_compare_torch( - x_shape, TestModel(), backend=backend, compute_unit=compute_unit, + x_shape, + TestModel(), + backend=backend, + compute_unit=compute_unit, ) @@ -9539,7 +10107,10 @@ def forward(self, x): for dim in dims: m = TestModel(dim, shapes) self.run_compare_torch( - shapes, m, backend=backend, compute_unit=compute_unit, + shapes, + m, + backend=backend, + compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, ) @@ -9556,7 +10127,9 @@ def forward(self, x): [None, ct.target.iOS17], ), ) - def test_scatter_with_scalar_source(self, compute_unit, backend, shapes_dims, minimum_deployment_target): + def test_scatter_with_scalar_source( + self, compute_unit, backend, shapes_dims, minimum_deployment_target + ): class TestModel(nn.Module): def __init__(self, dim, shapes): super(TestModel, self).__init__() @@ -9571,7 +10144,10 @@ def forward(self, x): for dim in dims: m = TestModel(dim, shapes) self.run_compare_torch( - shapes, m, backend=backend, compute_unit=compute_unit, + shapes, + m, + backend=backend, + compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, ) @@ -9605,7 +10181,10 @@ def forward(self, x): for dim in dims: m = TestModel(dim, shapes, mode) self.run_compare_torch( - shapes, m, backend=backend, compute_unit=compute_unit, + shapes, + m, + backend=backend, + compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, ) @@ -9685,27 +10264,29 @@ def forward(self, x): class TestBroadcastTensors(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [(1,), (1, 2)], ), ) - def test_one_tensor(self, compute_unit, backend, shapes): + def test_one_tensor(self, compute_unit, backend, frontend, shapes): class TestModel(nn.Module): def forward(self, a): return torch.broadcast_tensors(a) self.run_compare_torch( - shapes, TestModel().eval(), backend=backend, compute_unit=compute_unit + shapes, TestModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [ [(2, 1), (1, 3)], [(5, 1, 4, 1), (3, 1, 1)], @@ -9714,20 +10295,21 @@ def forward(self, a): ], ), ) - def test_two_tensors(self, compute_unit, backend, shapes): + 
def test_two_tensors(self, compute_unit, backend, frontend, shapes): class TestModel(nn.Module): def forward(self, a, b): return torch.broadcast_tensors(a, b) self.run_compare_torch( - shapes, TestModel().eval(), backend=backend, compute_unit=compute_unit + shapes, TestModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [ [(2, 1), (1, 3), (1,), (1, 1)], [(5, 1, 4, 1), (3, 1, 1), (1,), (4, 8)], @@ -9735,13 +10317,13 @@ def forward(self, a, b): ], ), ) - def test_four_tensors(self, compute_unit, backend, shapes): + def test_four_tensors(self, compute_unit, backend, frontend, shapes): class TestModel(nn.Module): def forward(self, a, b, c, d): return torch.broadcast_tensors(a, b, c, d) self.run_compare_torch( - shapes, TestModel().eval(), backend=backend, compute_unit=compute_unit + shapes, TestModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @@ -9800,17 +10382,15 @@ def test_embedding_invalid_indices(self): class TestDuplicateOutputTensors(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_dtype", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, - [np.int32, np.float32], + frontends, ), ) # Test case for rdar://100138064 (Duplicate output tensors trigger ops removal errors). - def test_duplicate_output_not_raise_errors( - self, compute_unit, backend, input_dtype - ): + def test_duplicate_output_not_raise_errors(self, compute_unit, backend, frontend): if backend[0] == "neuralnetwork": pytest.skip( "rdar://100243127 ([PyTorch] Duplicate Output Tensor Doesn't work for neuralnetwork)" @@ -9829,6 +10409,7 @@ def forward(self, x): model, expected_results=expected_results, input_as_shape=False, + frontend=frontend, backend=backend, compute_unit=compute_unit, converter_input_type=converter_input_type, @@ -9864,9 +10445,7 @@ def forward(self, x): model = BaddbmmModel() # Makes it broadcastable to (B, N, P). for input_shape in [(1, N, P), (B, 1, P), (1, P)]: - self.run_compare_torch( - input_shape, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(input_shape, model, backend=backend, compute_unit=compute_unit) class TestGlu(TorchBaseTest): @@ -9883,17 +10462,16 @@ def test_glu(self, compute_unit, backend, shapes): glu_dim_list = [-1] + [i for i in range(len(shapes))] for glu_dim in glu_dim_list: model = torch.nn.GLU(glu_dim) - self.run_compare_torch( - shapes, model, backend=backend, compute_unit=compute_unit - ) + self.run_compare_torch(shapes, model, backend=backend, compute_unit=compute_unit) class TestHstack(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [ [(2, 4, 6), (2, 4, 6)], [(1, 4, 5), (1, 2, 5)], @@ -9901,24 +10479,25 @@ class TestHstack(TorchBaseTest): ], # Test 1-D tensors. 
), ) - def test_hstack(self, compute_unit, backend, shapes): + def test_hstack(self, compute_unit, backend, frontend, shapes): class HstackModel(nn.Module): def forward(self, *tensors): return torch.hstack(tensors) self.run_compare_torch( - shapes, HstackModel(), backend=backend, compute_unit=compute_unit + shapes, HstackModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [[(2, 4, 6), (2, 4, 6)]], ), ) - def test_hstack_with_parameter_out(self, compute_unit, backend, shapes): + def test_hstack_with_parameter_out(self, compute_unit, backend, frontend, shapes): class HstackModel(nn.Module): def forward(self, *tensors): output_tensor = torch.tensor([]) @@ -9926,16 +10505,17 @@ def forward(self, *tensors): return output_tensor self.run_compare_torch( - shapes, HstackModel(), backend=backend, compute_unit=compute_unit + shapes, HstackModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestRemainder(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [ [(2, 4, 6), (2, 4, 6)], [(2, 4, 6), (4, 6)], # broadcastable tensors @@ -9943,24 +10523,25 @@ class TestRemainder(TorchBaseTest): ], ), ) - def test_remainder(self, compute_unit, backend, shapes): + def test_remainder(self, compute_unit, backend, frontend, shapes): class RemainderModel(nn.Module): def forward(self, dividend, divisor): return torch.remainder(dividend, divisor) self.run_compare_torch( - shapes, RemainderModel(), backend=backend, compute_unit=compute_unit + shapes, RemainderModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend, shapes", + "compute_unit, backend, frontend, shapes", itertools.product( compute_units, backends, + frontends, [[(2, 4, 6), (2, 4, 6)]], ), ) - def test_remainder_with_parameter_out(self, compute_unit, backend, shapes): + def test_remainder_with_parameter_out(self, compute_unit, backend, frontend, shapes): class RemainderModel(nn.Module): def forward(self, dividend, divisor): output_tensor = torch.tensor([]) @@ -9968,17 +10549,18 @@ def forward(self, dividend, divisor): return output_tensor self.run_compare_torch( - shapes, RemainderModel(), backend=backend, compute_unit=compute_unit + shapes, RemainderModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_remainder_input_types_promotion(self, compute_unit, backend): + def test_remainder_input_types_promotion(self, compute_unit, backend, frontend): class RemainderModel(nn.Module): def forward(self, dividend, divisor): return torch.remainder(dividend, divisor) @@ -9988,6 +10570,7 @@ def forward(self, dividend, divisor): self.run_compare_torch( [input_dividend, input_divisor], RemainderModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, @@ -9997,9 +10580,7 @@ def forward(self, dividend, divisor): class TestSum(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend, input_dtype", - itertools.product( - compute_units, backends, [torch.int32, torch.float32, torch.bool] - ), + itertools.product(compute_units, backends, [torch.int32, 
torch.float32, torch.bool]), ) def test_sum(self, compute_unit, backend, input_dtype): model = ModuleWrapper(function=torch.sum) @@ -10119,19 +10700,15 @@ def test_roll(self, compute_unit, backend, shape, shifts): # Negative shifts [[-9, -1], [1, 2]], # Duplicate dims - [[8, 10, -8], [0, 1, 0]] + [[8, 10, -8], [0, 1, 0]], ], ), ) def test_roll_with_dims(self, compute_unit, backend, shape, shifts_dims): shifts, dims = shifts_dims model = ModuleWrapper(torch.roll, kwargs={"shifts": shifts, "dims": dims}) - self.run_compare_torch( - shape, - model, - backend=backend, - compute_unit=compute_unit - ) + self.run_compare_torch(shape, model, backend=backend, compute_unit=compute_unit) + class TestArgmax(TorchBaseTest): @pytest.mark.parametrize( @@ -10152,14 +10729,8 @@ def test_argmax( axis: int, input_dtype: np.dtype, ): - input_data = ( - torch.rand(*shape) - if input_dtype == np.float32 - else torch.randint(10, shape) - ) - converter_input_type = [ - ct.TensorType(shape=input_data.shape, dtype=input_dtype) - ] + input_data = torch.rand(*shape) if input_dtype == np.float32 else torch.randint(10, shape) + converter_input_type = [ct.TensorType(shape=input_data.shape, dtype=input_dtype)] model = ModuleWrapper(function=torch.argmax, kwargs={"dim": axis}) expected_results = model(input_data) TorchBaseTest.run_compare_torch( @@ -10205,13 +10776,17 @@ def forward(self, *inputs): class TestComplex(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_complex(self, compute_unit: ct.ComputeUnit, backend): + def test_complex(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class ComplexModel(torch.nn.Module): def forward(self, x): real_part = x + 1 @@ -10220,33 +10795,46 @@ def forward(self, x): return torch.stack([complex_data.real, complex_data.imag], dim=1) TorchBaseTest.run_compare_torch( - (2, 3, 4), ComplexModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), + ComplexModel(), + compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_complex_real_imag_same_input(self, compute_unit: ct.ComputeUnit, backend): + def test_complex_real_imag_same_input(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class ComplexModel(torch.nn.Module): def forward(self, x): return torch.complex(x, x).real TorchBaseTest.run_compare_torch( - (2, 3, 4), ComplexModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), + ComplexModel(), + compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_complex_input_error(self, compute_unit: ct.ComputeUnit, backend): + def test_complex_input_error(self, compute_unit: ct.ComputeUnit, backend, frontend): class ComplexModel(torch.nn.Module): def forward(self, x): return torch.complex(x.real, x.imag) @@ -10256,45 +10844,54 @@ def forward(self, x): TypeError, match="dtype= is unsupported for inputs/outputs of the model", ): - converter_input_type = [ - 
ct.TensorType(shape=input_data.shape, dtype=np.complex64) - ] + converter_input_type = [ct.TensorType(shape=input_data.shape, dtype=np.complex64)] TorchBaseTest.run_compare_torch( input_data, ComplexModel(), - backend=backend, - compute_unit=compute_unit, input_as_shape=False, converter_input_type=converter_input_type, + compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_complex_output_error(self, compute_unit: ct.ComputeUnit, backend): + def test_complex_output_error(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class ComplexModel(torch.nn.Module): def forward(self, x): return torch.complex(x, x) - with pytest.raises( - ValueError, match="MIL doesn't support complex data as model's output" - ): + with pytest.raises(ValueError, match="MIL doesn't support complex data as model's output"): TorchBaseTest.run_compare_torch( - (2, 3, 4), ComplexModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), + ComplexModel(), + compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, - ) + frontends, + ), ) - def test_abs(self, compute_unit, backend): + def test_abs(self, compute_unit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class AbsModel(torch.nn.Module): def forward(self, x): x = torch.complex(x, x) @@ -10303,6 +10900,7 @@ def forward(self, x): TorchBaseTest.run_compare_torch( (1, 16), AbsModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -10310,72 +10908,92 @@ def forward(self, x): class TestReal(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_real_real_input(self, compute_unit: ct.ComputeUnit, backend): + def test_real_real_input(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class RealModel(torch.nn.Module): def forward(self, x): return torch.real(x) TorchBaseTest.run_compare_torch( - (2, 3, 4), RealModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), RealModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_real_complex_input(self, compute_unit: ct.ComputeUnit, backend): + def test_real_complex_input(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class RealModel(torch.nn.Module): def forward(self, x): return torch.real(torch.complex(x, x)) TorchBaseTest.run_compare_torch( - (2, 3, 4), RealModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), RealModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestImag(TorchBaseTest): # torch.imag only support complex input, so we don't need to test real number input. 
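Editorial aside on the complex-number tests above: they all enforce the same boundary rule, namely that complex tensors may exist inside the converted graph but must not cross the model's input/output boundary. A tiny illustrative module (assumed, not taken from the patch) showing the pattern:

# Illustrative sketch, not part of the patch.
import torch

class ComplexMagnitude(torch.nn.Module):
    def forward(self, x):
        z = torch.complex(x, 2 * x)   # complex math stays internal to the graph
        return torch.abs(z)           # only a real tensor leaves the model

# Returning `z` directly would hit the "MIL doesn't support complex data as
# model's output" error exercised by test_complex_output_error.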
@pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_imag_complex_input(self, compute_unit: ct.ComputeUnit, backend): + def test_imag_complex_input(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class ImagModel(torch.nn.Module): def forward(self, x): return torch.imag(torch.complex(x, x)) TorchBaseTest.run_compare_torch( - (2, 3, 4), ImagModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), ImagModel(), compute_unit=compute_unit, backend=backend, frontend=frontend ) class TestViewAsReal(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_view_as_real(self, compute_unit: ct.ComputeUnit, backend): + def test_view_as_real(self, compute_unit: ct.ComputeUnit, backend, frontend): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten.complex.default is not Aten Canonical") + class RealModel(torch.nn.Module): def forward(self, x): return torch.view_as_real(torch.complex(x, 2 * x)) TorchBaseTest.run_compare_torch( - (2, 3, 4), RealModel(), backend=backend, compute_unit=compute_unit + (2, 3, 4), + RealModel(), + compute_unit=compute_unit, + backend=backend, + frontend=frontend, ) @@ -10387,16 +11005,12 @@ class TestFft(TorchBaseTest): backends, ), ) - def test_directly_use_fft_complex_output_error( - self, compute_unit: ct.ComputeUnit, backend - ): + def test_directly_use_fft_complex_output_error(self, compute_unit: ct.ComputeUnit, backend): class FftModel(torch.nn.Module): def forward(self, x): return torch.fft.fft(x) - with pytest.raises( - ValueError, match="MIL doesn't support complex data as model's output" - ): + with pytest.raises(ValueError, match="MIL doesn't support complex data as model's output"): TorchBaseTest.run_compare_torch( (2, 3, 4), FftModel(), backend=backend, compute_unit=compute_unit ) @@ -10446,9 +11060,7 @@ def forward(self, x): [None, "forward", "backward", "ortho"], ), ) - def test_fft_basic( - self, compute_unit: ct.ComputeUnit, backend, fft_variant, n, dim, norm - ): + def test_fft_basic(self, compute_unit: ct.ComputeUnit, backend, fft_variant, n, dim, norm): class FftModel(torch.nn.Module): def forward(self, x): if fft_variant == "fft": @@ -10738,12 +11350,17 @@ class NmsModel(torch.nn.Module): def forward(self, boxes, scores): return torchvision.ops.nms(boxes, scores, iou_threshold=0.2) - input_boxes = torch.tensor([[3., 2., 3., 0.], - [0., 0., 2., 2.], - [1., 3., 2., 1.], - [0., 2., 1., 3.], - [1., 1., 2., 3.]], dtype=torch.float32) - input_scores = torch.tensor([3., 2., 0., 1., 4.], dtype=torch.float32) + input_boxes = torch.tensor( + [ + [3.0, 2.0, 3.0, 0.0], + [0.0, 0.0, 2.0, 2.0], + [1.0, 3.0, 2.0, 1.0], + [0.0, 2.0, 1.0, 3.0], + [1.0, 1.0, 2.0, 3.0], + ], + dtype=torch.float32, + ) + input_scores = torch.tensor([3.0, 2.0, 0.0, 1.0, 4.0], dtype=torch.float32) converter_input_type = [ ct.TensorType(shape=input_boxes.shape), ct.TensorType(shape=input_scores.shape), @@ -10826,43 +11443,63 @@ def forward(self, boxes, scores): class TestTensorSize(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, - ) + frontends, + ), ) - def test_tensor_size(self, compute_unit: 
ct.ComputeUnit.CPU_ONLY, backend: List[Tuple[str]]): + def test_tensor_size( + self, compute_unit: ct.ComputeUnit.CPU_ONLY, backend: List[Tuple[str]], frontend + ): class TestModel(torch.nn.Module): def forward(self, x): - return x.size() + # torch.export cannot deal with + # * non-tensor output (because torch.export will try to call .detach) + # * empty graph (i.e. no tensor operation) + # so we use an op to wrap the output into a tensor + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + return torch.tensor(x.size()) + else: + return x.size() self.run_compare_torch( [(1, 2, 3)], TestModel(), backend=backend, compute_unit=compute_unit, + frontend=frontend, ) @pytest.mark.parametrize( - "compute_unit, backend, dim, minimum_deployment_target", + "compute_unit, backend, frontend, dim, minimum_deployment_target", itertools.product( compute_units, - [('mlprogram', "fp16")], + [("mlprogram", "fp16")], + frontends, [2, -1], [None, ct.target.iOS17], - ) + ), ) def test_tensor_size_with_dim( self, compute_unit: ct.ComputeUnit.CPU_ONLY, backend: List[Tuple[str]], + frontend, dim: int, minimum_deployment_target: ct.target, ): class TestModel(torch.nn.Module): def forward(self, x): - return x.size(dim=dim) + # torch.export cannot deal with + # * non-tensor output (because torch.export will try to call .detach) + # * empty graph (i.e. no tensor operation) + # so we use an op to wrap the output into a tensor + if frontend in TORCH_EXPORT_BASED_FRONTENDS: + return torch.tensor(x.size(dim=dim)) + else: + return x.size(dim=dim) self.run_compare_torch( [(1, 2, 3)], @@ -10870,19 +11507,24 @@ def forward(self, x): backend=backend, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, + frontend=frontend, ) class TestBitwiseAnd(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) def test_bitwise_and( - self, compute_unit: ct.ComputeUnit.CPU_ONLY, backend: List[Tuple[str]] + self, + compute_unit: ct.ComputeUnit.CPU_ONLY, + backend: List[Tuple[str]], + frontend: TorchFrontend, ): class TestModel(torch.nn.Module): def forward(self, x, y): @@ -10894,32 +11536,33 @@ def forward(self, x, y): self.run_compare_torch( [input_data_x, input_data_y], TestModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) def test_bitwise_and_unsupport_input( - self, compute_unit: ct.ComputeUnit.CPU_ONLY, backend: List[Tuple[str]] + self, + compute_unit: ct.ComputeUnit.CPU_ONLY, + backend: List[Tuple[str]], + frontend: TorchFrontend, ): class TestModel(torch.nn.Module): def forward(self, x, y): return torch.bitwise_and(x, y) input_shape = (2, 3) - input_data_x = torch.randint( - low=0, high=10, size=input_shape, dtype=torch.int32 - ) - input_data_y = torch.randint( - low=0, high=10, size=input_shape, dtype=torch.int32 - ) + input_data_x = torch.randint(low=0, high=10, size=input_shape, dtype=torch.int32) + input_data_y = torch.randint(low=0, high=10, size=input_shape, dtype=torch.int32) with pytest.raises( NotImplementedError, match="The `bitwise_and` op only supports boolean input", ): self.run_compare_torch( [input_data_x, input_data_y], TestModel(), + frontend=frontend, backend=backend, compute_unit=compute_unit, input_as_shape=False, ) -class 
TestLogicalNot(TorchBaseTest): - @pytest.mark.parametrize( - "compute_unit, backend, input_dtype", - itertools.product( - compute_units, - backends, - [torch.int32, torch.float32, torch.bool], - ), - ) - def test_logical_not(self, compute_unit, backend, input_dtype): - class TestModel(torch.nn.Module): - def forward(self, x): - return torch.logical_not(x) - - input_data = torch.randint( - low=0, high=2 if input_dtype == torch.bool else 4, size=(2, 3, 4), dtype=input_dtype - ) - self.run_compare_torch( - input_data, - TestModel(), - backend=backend, - compute_unit=compute_unit, - input_as_shape=False, - ) - - @pytest.mark.parametrize( - "compute_unit, backend, input_dtype, output_dtype", - itertools.product( - compute_units, - backends, - [torch.int32, torch.float32, torch.bool], - [torch.int16, torch.float16, torch.bool], - ), - ) - def test_logical_not_with_out(self, compute_unit, backend, input_dtype, output_dtype): - class TestModel(torch.nn.Module): - def forward(self, x): - out_tensor = torch.empty((2, 3, 4), dtype=output_dtype) - torch.logical_not(x, out=out_tensor) - return out_tensor - - input_data = torch.randint( - low=0, high=2 if input_dtype == torch.bool else 4, size=(2, 3, 4), dtype=input_dtype - ) - self.run_compare_torch( - input_data, - TestModel(), - backend=backend, - compute_unit=compute_unit, - input_as_shape=False, - ) - - class TestUnfold(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_shape, kernel_size, padding, stride", + "compute_unit, backend, frontend, input_shape, kernel_size, padding, stride", itertools.product( compute_units, backends, + frontends, [(1, 1, 10, 11), (5, 3, 12, 13)], [(2, 3)], [0, 1, 8, (1, 3), (2, 6), (0, 5)], [1, 2, 7, (2, 3), (5, 4)], ), ) - def test_unfold(self, compute_unit, backend, input_shape, kernel_size, padding, stride): + def test_unfold( + self, compute_unit, backend, frontend, input_shape, kernel_size, padding, stride + ): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("ExecuTorch produces rank > 5 tensor") + self.run_compare_torch( input_shape, ModuleWrapper( @@ -11007,8 +11604,9 @@ def test_unfold(self, compute_unit, backend, input_shape, kernel_size, padding, "kernel_size": kernel_size, "padding": padding, "stride": stride, - } + }, ), + frontend=frontend, backend=backend, compute_unit=compute_unit, ) @@ -11035,24 +11633,28 @@ def construct_block_count( block_count = 1 for i in range(dim): block_count *= np.floor( - (output_size[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + (output_size[i] + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) + / stride[i] + 1 ).astype(np.int32) return block_count - @pytest.mark.parametrize( - "compute_unit, backend, N, C, output_size, kernel_size", + "compute_unit, backend, frontend, N, C, output_size, kernel_size", itertools.product( compute_units, backends, + frontends, [1, 2], [1, 3], [(12, 12), (12, 24)], [(2, 2), (2, 3)], ), ) - def test_unfold(self, compute_unit, backend, N, C, output_size, kernel_size): + def test_unfold(self, compute_unit, backend, frontend, N, C, output_size, kernel_size): + if frontend == TorchFrontend.EXECUTORCH: + pytest.skip("torch._ops.aten._unsafe_index_put.default is not Aten Canonical") + block_count = self.construct_block_count( output_size, kernel_size, @@ -11066,8 +11668,9 @@ def test_unfold(self, compute_unit, backend, N, C, output_size, kernel_size): "output_size": output_size, "kernel_size": kernel_size, "stride": kernel_size, - } + }, ), + frontend=frontend, backend=backend, 
compute_unit=compute_unit, ) @@ -11075,13 +11678,14 @@ def test_unfold(self, compute_unit, backend, N, C, output_size, kernel_size): class TestTupleUnpack(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend", + "compute_unit, backend, frontend", itertools.product( compute_units, backends, + frontends, ), ) - def test_tuple_unpack(self, compute_unit, backend): + def test_tuple_unpack(self, compute_unit, backend, frontend): class ReturnTupleModel(nn.Module): def forward(self, x): return x * 3, x * 4, x * 5 @@ -11095,17 +11699,22 @@ def forward(self, x): out1, out2, out3 = self.return_tuple_layer(x) return out1.relu(), out2.sigmoid(), out3.softmax(1) - self.run_compare_torch((1, 2, 3), TestModel(), backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + (1, 2, 3), TestModel(), compute_unit=compute_unit, backend=backend, frontend=frontend + ) class TestTupleIndex(TorchBaseTest): @pytest.mark.parametrize( "compute_unit, backend", - itertools.product(compute_units, backends,), + itertools.product( + compute_units, + backends, + ), ) def test_tuple_index(self, compute_unit, backend): class InnerModel(nn.Module): - def forward(self,x): + def forward(self, x): return (torch.tensor([0]), torch.tensor([1])) class OuterModel(nn.Module): @@ -11118,11 +11727,20 @@ def forward(self, x): return inner[0] x = torch.rand(1, 3, 640, 640) - self.run_compare_torch(x, OuterModel(), - input_as_shape=False, use_scripting=True, - backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + x, + OuterModel(), + input_as_shape=False, + use_scripting=True, + backend=backend, + compute_unit=compute_unit, + ) +@pytest.mark.skipif( + platform.machine() == "x86_64", + reason="The x86_64 has outdated PyTorch, which doesn't have _scaled_dot_product_flash_attention in fx node.", +) class TestScaledDotProductAttention(TorchBaseTest): """ Tests for torch.nn.functional.scaled_dot_product_attention op @@ -11169,12 +11787,15 @@ def test_different_batch_dims(self, compute_unit, backend, frontend, minimum_dep minimum_deployment_target=minimum_deployment_target, ) - # Only iOS 18 with torch script can have mb.sdpa, because - # 1. mb.sdpa is introduced in iOS 18, so before iOS 18 we would decompose sdpa - # 2. 
torch.sdpa is not a core aten op, so EXIR would decompose sdpa - if minimum_deployment_target == ct.target.iOS18 and frontend == TorchFrontend.TORCHSCRIPT: - if backend == ("mlprogram", "fp16"): - assert get_op_types_in_program(res[1]._mil_program) == [ + # mb.sdpa is introduced in iOS 18, so before iOS 18 we would decompose sdpa + # torch.sdpa is not a core aten op, so executorch would decompose sdpa + if ( + backend[0] == "mlprogram" + and minimum_deployment_target == ct.target.iOS18 + and frontend != TorchFrontend.EXECUTORCH + ): + if backend[1] == "fp16": + expected_ops = [ "cast", "tile", "cast", @@ -11182,6 +11803,9 @@ def test_different_batch_dims(self, compute_unit, backend, frontend, minimum_dep "cast", "scaled_dot_product_attention", ] + else: + expected_ops = ["tile", "tile", "scaled_dot_product_attention"] + assert get_op_types_in_program(res[1]._mil_program) == expected_ops @pytest.mark.parametrize( "compute_unit, backend, frontend, minimum_deployment_target, rank, dynamic", @@ -11208,7 +11832,7 @@ def test_different_input_ranks_no_mask( elif rank == 4: input_shape = (batch_size, n_heads_1, seq_len, d) elif rank == 5: - input_shape = (batch_size, n_heads_1, n_heads_1, seq_len, d) + input_shape = (batch_size, n_heads_1, n_heads_2, seq_len, d) else: raise ValueError("invalid rank") @@ -11241,46 +11865,42 @@ def test_different_input_ranks_no_mask( minimum_deployment_target=minimum_deployment_target, ) - # Only iOS 18 with torch script can have mb.sdpa, because - # 1. mb.sdpa is introduced in iOS 18, so before iOS 18 we would decompose sdpa - # 2. torch.sdpa is not a core aten op, so EXIR would decompose sdpa - if minimum_deployment_target == ct.target.iOS18 and frontend == TorchFrontend.TORCHSCRIPT: - if backend == ("mlprogram", "fp16"): - if rank == 2: - if dynamic: - expected_ops = [ - "expand_dims", - "expand_dims", - "expand_dims", - "scaled_dot_product_attention", - "squeeze", - ] - else: - expected_ops = [ - "cast", - "expand_dims", - "cast", - "expand_dims", - "cast", - "expand_dims", - "scaled_dot_product_attention", - "squeeze", - ] - assert get_op_types_in_program(coreml_model._mil_program) == expected_ops - + # mb.sdpa is introduced in iOS 18, so before iOS 18 we would decompose sdpa + # torch.sdpa is not a core aten op, so executorch would decompose sdpa + if ( + backend[0] == "mlprogram" + and minimum_deployment_target == ct.target.iOS18 + and frontend != TorchFrontend.EXECUTORCH + ): + pymil_inputs = list(coreml_model._mil_program.functions["main"].inputs.values()) + is_io_fp16 = pymil_inputs[0].dtype == types.fp16 + is_io_precision_same_as_compute_precision = is_io_fp16 == (backend[1] == "fp16") + if rank == 2: + if is_io_precision_same_as_compute_precision: + expected_ops = [ + "expand_dims", + "expand_dims", + "expand_dims", + "scaled_dot_product_attention", + "squeeze", + ] else: - if dynamic: - expected_ops = [ - "scaled_dot_product_attention", - ] - else: - expected_ops = [ - "cast", - "cast", - "cast", - "scaled_dot_product_attention", - ] - assert get_op_types_in_program(coreml_model._mil_program) == expected_ops + expected_ops = [ + "cast", + "expand_dims", + "cast", + "expand_dims", + "cast", + "expand_dims", + "scaled_dot_product_attention", + "squeeze", + ] + else: + if is_io_precision_same_as_compute_precision: + expected_ops = ["scaled_dot_product_attention"] + else: + expected_ops = ["cast", "cast", "cast", "scaled_dot_product_attention"] + assert get_op_types_in_program(coreml_model._mil_program) == expected_ops @pytest.mark.parametrize( 
"compute_unit, backend, frontend, minimum_deployment_target, seq_lengths, include_heads, dynamic", @@ -11304,7 +11924,7 @@ def test_is_causal_flag( include_heads, dynamic, ): - if frontend == TorchFrontend.EXIR: + if frontend == TorchFrontend.EXECUTORCH: pytest.xfail( "https://github.com/apple/coremltools/issues/2199: placeholder assertion error" ) @@ -11369,7 +11989,7 @@ def test_attn_mask( bool_mask, dynamic, ): - if frontend == TorchFrontend.TORCHSCRIPT and bool_mask: + if frontend != TorchFrontend.EXECUTORCH and bool_mask: pytest.xfail( "rdar://110499660 ([CI][Bug] test_attn_mask is occasionally failing when bool_mask = True)" ) @@ -11413,6 +12033,30 @@ def test_attn_mask( input_as_shape=False, ) + @pytest.mark.parametrize( + "compute_unit, backend, frontend", + itertools.product(compute_units, backends, frontends), + ) + def test_scale(self, compute_unit, backend, frontend): + batch_size, seq_len, n_heads, d = 2, 10, 3, 7 + input_shape = (batch_size, n_heads, seq_len, d) + model = ModuleWrapper( + function=nn.functional.scaled_dot_product_attention, + kwargs={ + "attn_mask": None, + "dropout_p": 0.0, + "is_causal": False, + "scale": 1.5, + }, + ) + self.run_compare_torch( + [input_shape] * 3, + model, + frontend=frontend, + backend=backend, + compute_unit=compute_unit, + ) + @pytest.mark.parametrize( "compute_unit, backend, frontend, minimum_deployment_target, mask_as_input, dynamic", itertools.product( @@ -11433,7 +12077,7 @@ def test_toy_xformer_with_sdpa( mask_as_input, dynamic, ): - if frontend == TorchFrontend.EXIR and not mask_as_input: + if frontend == TorchFrontend.EXECUTORCH and not mask_as_input: pytest.xfail( "https://github.com/apple/coremltools/issues/2199: placeholder assertion error" ) @@ -11558,8 +12202,7 @@ def test_dropout_early_error_out(self): value = generate_input_data(value_shape) model = ModuleWrapper( - function=nn.functional.scaled_dot_product_attention, - kwargs={"dropout_p": 0.0} + function=nn.functional.scaled_dot_product_attention, kwargs={"dropout_p": 0.0} ) self.run_compare_torch( (query, key, value), @@ -11575,8 +12218,7 @@ def test_dropout_early_error_out(self): ), ): model = ModuleWrapper( - function=nn.functional.scaled_dot_product_attention, - kwargs={"dropout_p": 0.1} + function=nn.functional.scaled_dot_product_attention, kwargs={"dropout_p": 0.1} ) self.run_compare_torch( (query, key, value), @@ -11600,7 +12242,9 @@ def __init__(self, input_size, hidden_size, nhead=1, num_layers=1, dropout_rate= dim_feedforward=hidden_size, dropout=dropout_rate, ) - self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_layers) + self.transformer_encoder = nn.TransformerEncoder( + encoder_layers, num_layers=num_layers + ) def forward(self, x): y = self.transformer_encoder(x) @@ -11651,15 +12295,17 @@ def test_transformer(self, compute_unit, backend, dynamic): class TestFliplr(TorchBaseTest): @pytest.mark.parametrize( - "compute_unit, backend, input_shape", - itertools.product(compute_units, backends, [(2, 3), (3, 4, 5), (8, 2, 6, 4)]), + "compute_unit, backend, frontend, input_shape", + itertools.product(compute_units, backends, frontends, [(2, 3), (3, 4, 5), (8, 2, 6, 4)]), ) - def test_fliplr(self, compute_unit, backend, input_shape): + def test_fliplr(self, compute_unit, backend, frontend, input_shape): class TestModel(nn.Module): def forward(self, x): return torch.fliplr(x) - self.run_compare_torch(input_shape, TestModel(), backend=backend, compute_unit=compute_unit) + self.run_compare_torch( + input_shape, TestModel(), 
compute_unit=compute_unit, backend=backend, frontend=frontend + ) class TestMultinomial(TorchBaseTest): diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py index a600faa06..9d429283e 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_quantization_ops.py @@ -8,9 +8,11 @@ from typing import Optional import numpy as np +import numpy.testing import pytest import torch import torchvision +from packaging.version import Version import coremltools as ct import coremltools.optimize as cto @@ -18,10 +20,13 @@ from coremltools._deps import ( _HAS_TORCH, _HAS_TORCH_VISION, + _HAS_TORCHAO, MSG_TORCH_NOT_FOUND, MSG_TORCH_VISION_NOT_FOUND, + MSG_TORCHAO_NOT_FOUND, ) from coremltools.converters.mil import testing_reqs +from coremltools.converters.mil.frontend.torch.utils import TorchFrontend from coremltools.converters.mil.mil import types from coremltools.converters.mil.testing_utils import get_op_types_in_program from coremltools.optimize.coreml import _quantization_passes @@ -32,7 +37,11 @@ create_unique_weight, ) -from .testing_utils import TorchBaseTest +from .testing_utils import TorchBaseTest, frontends + +if _HAS_TORCHAO: + import torchao + from torchao.quantization import quant_primitives as torchao_quant pytest.mark.skipif(not _HAS_TORCH, reason=MSG_TORCH_NOT_FOUND) @@ -103,6 +112,7 @@ def run_compare_torch( input_as_shape=True, minimum_deployment_target=ct.target.iOS17, compute_unit=ct.ComputeUnit.CPU_ONLY, + frontend=TorchFrontend.TORCHSCRIPT, converter=ct.convert, ): # TODO(rdar://108472419): properly design a random input @@ -119,6 +129,7 @@ def run_compare_torch( use_scripting=False, compute_unit=compute_unit, minimum_deployment_target=minimum_deployment_target, + frontend=frontend, converter=converter, ) @@ -431,6 +442,185 @@ def forward(self, x): else: assert get_op_types_in_program(prog) == ["constexpr_blockwise_shift_scale", "matmul"] + @pytest.mark.skipif(not _HAS_TORCHAO, reason=MSG_TORCHAO_NOT_FOUND) + @pytest.mark.parametrize( + "use_numpy, inner_k_tiles, group_size", + itertools.product([True, False], [2, 4, 8], [32, 64]), + ) + def test_unpack_int4packed_by_mm_with_eye_matrix(self, use_numpy, inner_k_tiles, group_size): + """ + Check if the packed weight could be restored by _weight_int4pack_mm with eye matrix on CPU. + + As there is no kernel implemented for CPU to unpack the data packed by `torch._convert_weight_to_int4pack`, + we use `torch._weight_int4pack_mm` to do matrix multiplication with an eye matrix to get unpacked data. + """ + if use_numpy: + y_np = numpy.random.rand(128, 128).astype(np.float32) + y = torch.from_numpy(y_np).to(torch.device("cpu")) + else: + y = torch.rand(128, 128, dtype=torch.float32, device=torch.device("cpu")) + + ( + y_quantized, + y_scales_and_zeros, + ) = torchao.quantization.utils.groupwise_affine_quantize_tensor( + y, n_bit=4, groupsize=group_size, dtype=torch.float32 + ) + y_int4packed = torch._convert_weight_to_int4pack(y_quantized, inner_k_tiles) + y_unpacked_shape = (y_int4packed.shape[0] * 8, y_int4packed.shape[1] * (inner_k_tiles * 16)) + eye_shape = y_unpacked_shape[1] + eye_matrix = torch.eye(eye_shape, device=torch.device("cpu"), dtype=torch.float32) + if Version(torch.__version__) < Version("2.4.0"): + # The `torch._weight_int4pack_mm` op requires bfloat16 before PyTorch 2.4.0. 
+ eye_matrix = eye_matrix.to(torch.bfloat16) + y_scales_and_zeros = y_scales_and_zeros.to(torch.bfloat16) + y_dequant = torch._weight_int4pack_mm( + eye_matrix, + y_int4packed, + group_size, + y_scales_and_zeros, + ) + y_dequant = y_dequant.t().contiguous().float() + + # Makes sure this `_weight_int4pack_mm` with eye matrix fully restores the original y. + np.testing.assert_allclose(y_dequant.numpy(), y.numpy(), atol=0.035, rtol=0.05) + + # Also verifies that the quantized y could be accurately reproduced by torchao utils. + scales = torch.transpose(y_scales_and_zeros[:, :, 0], 0, 1) + zero_points = torch.transpose(y_scales_and_zeros[:, :, 1], 0, 1) + block_size = (1, group_size) + y_dequant_quantized = torchao_quant.quantize_affine( + y_dequant, + block_size, + scales, + zero_points, + torch.int32, + quant_min=0, + quant_max=2**4 - 1, + zero_point_domain=torchao_quant.ZeroPointDomain.FLOAT, + ) + assert torch.equal(y_quantized, y_dequant_quantized) + + # The torchao dequantization utils should be able to recover the original y. + y_dequantized_by_torchao = torchao_quant.dequantize_affine( + y_quantized, + (1, group_size), + scales, + zero_points, + torch.int32, + quant_min=0, + quant_max=2**4 - 1, + zero_point_domain=torchao_quant.ZeroPointDomain.FLOAT, + ) + np.testing.assert_allclose(y_dequant.numpy(), y_dequantized_by_torchao.numpy(), rtol=4e-3) + + @pytest.mark.skipif( + Version(torch.__version__) < Version("2.4.0"), + reason="_weight_int4pack_mm requires bfloat16 before PyTorch 2.4.0", + ) + @pytest.mark.skipif(not _HAS_TORCHAO, reason=MSG_TORCHAO_NOT_FOUND) + @pytest.mark.parametrize( + "compute_unit, inner_k_tiles, group_size", + itertools.product(compute_units, [2, 4, 8], [32, 64]), + ) + def test_weight_int4pack_mm(self, compute_unit, inner_k_tiles, group_size): + y = torch.rand(128, 128, dtype=torch.float32, device=torch.device("cpu")) + + class Model(torch.nn.Module): + def forward(self, x): + ( + y_quantized, + y_scales_and_zeros, + ) = torchao.quantization.utils.groupwise_affine_quantize_tensor( + y, n_bit=4, groupsize=group_size, dtype=torch.float32 + ) + y_int4packed = torch._convert_weight_to_int4pack(y_quantized, inner_k_tiles) + return torch._weight_int4pack_mm(x, y_int4packed, group_size, y_scales_and_zeros) + + model = Model().to(torch.device("cpu")) + input_shape = [(2, 128)] + res = self.run_compare_torch( + input_shape, + model, + minimum_deployment_target=ct.target.iOS18, + compute_unit=compute_unit, + rtol=0.1, + ) + prog = res[1]._mil_program + assert get_op_types_in_program(prog) == ["constexpr_blockwise_shift_scale", "linear"] + + @pytest.mark.skipif( + not hasattr(torch.ops.quantized_decomposed, "embedding_4bit"), + reason="The `embedding_4bit` op doesn't exist in quantized_decomposed custom opset.", + ) + @pytest.mark.parametrize( + "compute_unit, group_size, dtype, signed", + itertools.product( + compute_units, [32, 64], [None, torch.float16, torch.float32], [False, True] + ), + ) + def test_quantized_decomposed_embedding_4bit_dtype( + self, compute_unit, group_size, dtype, signed + ): + if not signed: + # To reproduce this executorch bug, use following settings + # scales = torch.ones(size=scales_shape, dtype=torch.float32, device=torch.device("cpu")) + # input_data = torch.zeros(size=(1, 1), dtype=torch.int32) + # Then you will find coreml outputs is the expected (consistent with `unpacked_weight`). 
+ pytest.skip( + "rdar://135216194 (Executorch embedding_4bit implementation bug for unsigned quantization)" + ) + + quant_low = -8 if signed else 0 + quant_high = 7 if signed else 15 + quant_dtype = torch.int8 if signed else torch.uint8 + + weight_shape = (128, 128) + unpacked_weight = torch.randint( + low=quant_low, + high=quant_high + 1, + size=weight_shape, + dtype=quant_dtype, + ) + # Pack the weight to embedding_4bit's usable format. + weight_range_shifted = unpacked_weight.add(-quant_low).view(torch.uint8) + weight_view = weight_range_shifted.view( + unpacked_weight.shape[0], unpacked_weight.shape[1] // 2, 2 + ) + weight_even = weight_view[:, :, 0] * 16 # left shift 4 + weight_odd = weight_view[:, :, 1] + weight = weight_even + weight_odd + + scales_shape = list(weight_shape) + scales_shape[-1] = weight_shape[-1] // group_size + scales = torch.rand(*scales_shape, dtype=torch.float32) + + class Model(torch.nn.Module): + def forward(self, indices: torch.Tensor): + if dtype is not None: + return torch.ops.quantized_decomposed.embedding_4bit( + weight, scales, None, quant_low, quant_high, indices, dtype=dtype + ) + else: + return torch.ops.quantized_decomposed.embedding_4bit( + weight, scales, None, quant_low, quant_high, indices + ) + + # The 4-bit packing-unpacking in torch could be messed up when transferring between devices, so it's safer + # to specify device at the beginning. + model = Model().to(torch.device("cpu")) + input_data = torch.randint(low=0, high=weight_shape[-1], size=(2, 128), dtype=torch.int32) + res = self.run_compare_torch( + input_data, + model, + input_as_shape=False, + minimum_deployment_target=ct.target.iOS18, + compute_unit=compute_unit, + rtol=1e-3, + ) + prog = res[1]._mil_program + assert get_op_types_in_program(prog) == ["constexpr_blockwise_shift_scale", "gather"] + @pytest.mark.skipif(not _HAS_TORCH_VISION, reason=MSG_TORCH_VISION_NOT_FOUND) class TestTorchvisionQuantizedModels(TorchQuantizationBaseTest): @@ -451,30 +641,36 @@ class TestPytorchCarryCompressionInfo(TorchQuantizationBaseTest): """Test compressed PyTorch models which use register_buffer to carry compression info.""" @pytest.mark.parametrize( - "compute_unit, n_bits, signed, minimum_deployment_target", + "compute_unit, n_bits, signed, use_linear, minimum_deployment_target, frontend", itertools.product( compute_units, [4, 8], [True, False], + [True, False], [ct.target.iOS16, ct.target.iOS18], + frontends, ), ) - def test_quantization(self, compute_unit, n_bits, signed, minimum_deployment_target): + def test_quantization( + self, compute_unit, n_bits, signed, use_linear, minimum_deployment_target, frontend + ): if n_bits == 4 and minimum_deployment_target < ct.target.iOS18: pytest.skip("Sub-byte quantization is only supported since iOS18.") - model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data( + model, inputs, _, _ = get_test_model_and_data( quantize_config=cto.coreml.OpLinearQuantizerConfig( mode="linear_symmetric", dtype=types.get_nbits_int_builtin_type(n_bits, signed), granularity="per_tensor", - ) + ), + use_linear=use_linear, ) - scale = np.array([2.0], dtype=np.float32).reshape(1, 1, 1, 1) + target_scale_shape = (1, 1) if use_linear else (1, 1, 1, 1) + scale = np.array([2.0], dtype=np.float32).reshape(*target_scale_shape) zero_point = np.array( [0 if signed else 2 ** (n_bits - 1)], dtype=np.int8 if signed else np.uint8 - ).reshape(1, 1, 1, 1) + ).reshape(*target_scale_shape) model.register_buffer("_COREML_/metadata_version", torch.tensor(2)) 
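        # The "_COREML_/..." buffers are how these tests carry compression metadata to the
        # converter; the compression_type codes used throughout this file are 1 = pruning,
        # 2 = palettization, 3 = quantization, and joint schemes stack codes, e.g. torch.tensor([1, 3]).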
model.register_buffer("_COREML_/weight/compression_type", torch.tensor([3])) @@ -482,13 +678,13 @@ def test_quantization(self, compute_unit, n_bits, signed, minimum_deployment_tar model.register_buffer("_COREML_/weight/quantization_scale", torch.from_numpy(scale)) model.register_buffer("_COREML_/weight/zero_point", torch.from_numpy(zero_point)) - traced_model = torch.jit.trace(model, torch_input_values) input_shape = [input.shape.to_list() for input in inputs] res = self.run_compare_torch( input_shape, - traced_model, + model, minimum_deployment_target=minimum_deployment_target, compute_unit=compute_unit, + frontend=frontend, converter=ct.convert, rtol=1e-04, atol=1e-03, @@ -511,11 +707,11 @@ def test_quantization(self, compute_unit, n_bits, signed, minimum_deployment_tar assert types.builtin_to_string(quantize_op.zero_point.dtype) == target_dtype_str @pytest.mark.parametrize( - "compute_unit, n_bits, minimum_deployment_target", - itertools.product(compute_units, [4, 8], [ct.target.iOS16, ct.target.iOS18]), + "compute_unit, n_bits, minimum_deployment_target, frontend", + itertools.product(compute_units, [4, 8], [ct.target.iOS16, ct.target.iOS18], frontends), ) def test_multiple_parameters_in_same_layer( - self, compute_unit, n_bits, minimum_deployment_target + self, compute_unit, n_bits, minimum_deployment_target, frontend ): """Test one layer has multiple parameters (such as weight and bias in a linear layer)""" if n_bits == 4 and minimum_deployment_target < ct.target.iOS18: @@ -559,13 +755,12 @@ def forward(self, x): ) model.register_buffer("_COREML_/metadata_version", torch.tensor(2)) - torch_input_values = torch.rand((8, 16)) - traced_model = torch.jit.trace(model, torch_input_values) res = self.run_compare_torch( [(8, 16)], - traced_model, + model, minimum_deployment_target=minimum_deployment_target, compute_unit=compute_unit, + frontend=frontend, converter=ct.convert, ) main_func = res[1]._mil_program.functions["main"] @@ -579,8 +774,15 @@ def forward(self, x): linear_ops = main_func.find_ops(op_type="linear") assert linear_ops[0].weight.op.op_type == "const" assert linear_ops[0].bias.op.op_type == "const" - assert linear_ops[1].weight.op.op_type == quantize_op_type - assert linear_ops[1].bias.op.op_type == quantize_op_type + if frontend == TorchFrontend.EXECUTORCH: + # In EXECUTORCH, the second linear layer is represented by `matmul` and `add` op. 
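            # This is expected because the exported edge program decomposes aten.linear, leaving
            # matmul + add in the converted MIL program instead of a single linear op.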
+ matmul_op = main_func.find_ops(op_type="matmul")[0] + add_op = main_func.find_ops(op_type="add")[0] + assert matmul_op.y.op.op_type == quantize_op_type + assert add_op.x.op.op_type == quantize_op_type + else: + assert linear_ops[1].weight.op.op_type == quantize_op_type + assert linear_ops[1].bias.op.op_type == quantize_op_type quantize_ops = main_func.find_ops(op_type=quantize_op_type) assert len(quantize_ops) == 2 @@ -634,18 +836,28 @@ def test_invalid_compression_info(self): ) @pytest.mark.parametrize( - "compute_unit, n_bits, group_size, channel_axis, cluster_dim, minimum_deployment_target", + "compute_unit, n_bits, group_size, channel_axis, cluster_dim, use_linear, minimum_deployment_target, frontend", itertools.product( compute_units, [4, 8], [0, 1, 2], [0, 1], [1, 2], + [True, False], [ct.target.iOS16, ct.target.iOS18], + frontends, ), ) def test_palettization( - self, compute_unit, n_bits, group_size, channel_axis, cluster_dim, minimum_deployment_target + self, + compute_unit, + n_bits, + group_size, + channel_axis, + cluster_dim, + use_linear, + minimum_deployment_target, + frontend, ): if ( group_size in (0, 2) @@ -661,21 +873,35 @@ def test_palettization( pytest.skip("Cluster dim must <= group size.") model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data( - multi_layer=True + multi_layer=True, + use_linear=use_linear, ) - # per-channel scales for the [32, 64, 2, 2] and [64, 32, 2, 2] weight. - scale_1 = np.array([2.0] * 32, dtype=np.float32).reshape(32, 1, 1, 1) - scale_2 = np.array([3.0] * 64, dtype=np.float32).reshape(64, 1, 1, 1) + if use_linear: + # per-channel scales for the [32, 64] and [16, 32] weight. + scale_1 = np.array([2.0] * 32, dtype=np.float32).reshape(32, 1) + scale_2 = np.array([3.0] * 16, dtype=np.float32).reshape(16, 1) + else: + # per-channel scales for the [32, 64, 2, 2] and [64, 32, 2, 2] weight. + scale_1 = np.array([2.0] * 32, dtype=np.float32).reshape(32, 1, 1, 1) + scale_2 = np.array([3.0] * 64, dtype=np.float32).reshape(64, 1, 1, 1) + layername_1 = "linear_1" if use_linear else "conv_1" + layername_2 = "linear_2" if use_linear else "conv_2" unique_weight_1 = create_unique_weight( - model.conv_1.weight, nbits=n_bits, vector_size=cluster_dim, vector_axis=channel_axis + getattr(model, layername_1).weight, + nbits=n_bits, + vector_size=cluster_dim, + vector_axis=channel_axis, ) unique_weight_2 = create_unique_weight( - model.conv_2.weight, nbits=n_bits, vector_size=cluster_dim, vector_axis=channel_axis + getattr(model, layername_2).weight, + nbits=n_bits, + vector_size=cluster_dim, + vector_axis=channel_axis, ) - # Use grouped-channel-wise lut for conv1 for iOS18+. + # Use grouped-channel-wise lut for layer1 for iOS18+. block_sizes = [0] * len(unique_weight_1.shape) if minimum_deployment_target >= ct.target.iOS18: block_sizes[channel_axis] = group_size @@ -688,7 +914,7 @@ def test_palettization( channel_axis=channel_axis, ) - # Use per-tensor lut for conv2. + # Use per-tensor lut for layer2. 
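        # (In blockwise_compress, a block size of 0 along an axis means that whole axis forms a
        # single block, so leaving every entry at 0 yields one LUT shared by the entire tensor.)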
lut_2_params = _quantization_passes.palettize_weights.blockwise_compress( unique_weight_2, "UNIQUE", @@ -704,30 +930,38 @@ def test_palettization( unique_weight_2 *= scale_2 with torch.no_grad(): - model.conv_1.weight = torch.nn.Parameter(torch.Tensor(unique_weight_1)) - model.conv_2.weight = torch.nn.Parameter(torch.Tensor(unique_weight_2)) + getattr(model, layername_1).weight = torch.nn.Parameter(torch.Tensor(unique_weight_1)) + getattr(model, layername_2).weight = torch.nn.Parameter(torch.Tensor(unique_weight_2)) model.register_buffer("_COREML_/metadata_version", torch.tensor(1)) if minimum_deployment_target >= ct.target.iOS18: - model.conv_1.register_buffer("_COREML_/weight/compression_type", torch.tensor([2])) - model.conv_1.register_buffer("_COREML_/weight/lut", torch.tensor(lut_1_params.lut)) - model.conv_1.register_buffer( + getattr(model, layername_1).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([2]) + ) + getattr(model, layername_1).register_buffer( + "_COREML_/weight/lut", torch.tensor(lut_1_params.lut) + ) + getattr(model, layername_1).register_buffer( "_COREML_/weight/palettization_scale", torch.from_numpy(scale_1) ) - model.conv_2.register_buffer("_COREML_/weight/compression_type", torch.tensor([2])) - model.conv_2.register_buffer("_COREML_/weight/lut", torch.tensor(lut_2_params.lut)) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([2]) + ) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/lut", torch.tensor(lut_2_params.lut) + ) if minimum_deployment_target >= ct.target.iOS18: - model.conv_2.register_buffer( + getattr(model, layername_2).register_buffer( "_COREML_/weight/palettization_scale", torch.from_numpy(scale_2) ) - traced_model = torch.jit.trace(model, torch_input_values) input_shape = [input.shape.to_list() for input in inputs] res = self.run_compare_torch( input_shape, - traced_model, + model, minimum_deployment_target=minimum_deployment_target, compute_unit=compute_unit, + frontend=frontend, converter=ct.convert, rtol=0.2 if cluster_dim > 1 else 1e-5, # Vector palettization has larger info loss. ) @@ -737,16 +971,19 @@ def test_palettization( expected_dtype = f"uint{n_bits}" expected_quantize_ops_num = 2 expected_palettize_ops_num = 2 - palettize_op_child_op_type = "constexpr_blockwise_shift_scale" + # The lut with pcs op order is determined by canonicalize_quantized_lut_pattern graph pass. + palettize_op_child_op_type = "linear" if use_linear else "conv" else: expected_dtype = "uint8" expected_quantize_ops_num = 0 expected_palettize_ops_num = 1 - # The iOS16 doesn't have per-channel-scale, so lut output is directly fed into conv. - palettize_op_child_op_type = "conv" + # The iOS16 doesn't have per-channel-scale, so lut output is directly fed into next op. 
+ palettize_op_child_op_type = "linear" if use_linear else "conv" quantize_ops = main_func.find_ops(op_type="constexpr_blockwise_shift_scale") assert len(quantize_ops) == expected_quantize_ops_num + for quantize_op in quantize_ops: + assert quantize_op.outputs[0].child_ops[0].op_type == "constexpr_lut_to_dense" palettize_ops = main_func.find_ops(op_type="constexpr_lut_to_dense") assert len(palettize_ops) == expected_palettize_ops_num for palettize_op in palettize_ops: @@ -756,10 +993,10 @@ def test_palettization( assert palettize_op.lut.shape[-1] == cluster_dim @pytest.mark.parametrize( - "compute_unit, minimum_deployment_target", - itertools.product(compute_units, [ct.target.iOS16, ct.target.iOS18]), + "compute_unit, minimum_deployment_target, frontend", + itertools.product(compute_units, [ct.target.iOS16, ct.target.iOS18], frontends), ) - def test_palettization_8bit_lut(self, compute_unit, minimum_deployment_target): + def test_palettization_8bit_lut(self, compute_unit, minimum_deployment_target, frontend): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data( multi_layer=True ) @@ -857,36 +1094,54 @@ def test_palettization_8bit_lut(self, compute_unit, minimum_deployment_target): assert len(palettize_ops) == 2 assert types.builtin_to_string(palettize_ops[0].indices.dtype) == "uint4" assert types.builtin_to_string(palettize_ops[1].indices.dtype) == "uint6" + # The op order is adjusted by common::canonicalize_quantized_lut_pattern graph pass. + for quantize_op in quantize_ops: + assert quantize_op.outputs[0].child_ops[0].op_type == "constexpr_lut_to_dense" for palettize_op in palettize_ops: - assert palettize_op.outputs[0].child_ops[0].op_type == "constexpr_blockwise_shift_scale" + assert palettize_op.outputs[0].child_ops[0].op_type == "conv" @pytest.mark.parametrize( - "compute_unit, sparse_ratio, minimum_deployment_target", + "compute_unit, sparse_ratio, use_linear, minimum_deployment_target, frontend", itertools.product( compute_units, [0.01, 0.5, 0.99], + [True, False], [ct.target.iOS16, ct.target.iOS18], + frontends, ), ) - def test_pruning(self, compute_unit, sparse_ratio, minimum_deployment_target): + def test_pruning( + self, compute_unit, sparse_ratio, use_linear, minimum_deployment_target, frontend + ): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data( - multi_layer=True + multi_layer=True, use_linear=use_linear ) + layername_1 = "linear_1" if use_linear else "conv_1" + layername_2 = "linear_2" if use_linear else "conv_2" + with torch.no_grad(): - model.conv_1.weight = torch.nn.Parameter( + getattr(model, layername_1).weight = torch.nn.Parameter( torch.Tensor( - create_sparse_weight(model.conv_1.weight, target_sparsity=sparse_ratio) + create_sparse_weight( + getattr(model, layername_1).weight, target_sparsity=sparse_ratio + ) ) ) - model.conv_2.weight = torch.nn.Parameter( + getattr(model, layername_2).weight = torch.nn.Parameter( torch.Tensor( - create_sparse_weight(model.conv_2.weight, target_sparsity=sparse_ratio) + create_sparse_weight( + getattr(model, layername_2).weight, target_sparsity=sparse_ratio + ) ) ) model.register_buffer("_COREML_/metadata_version", torch.tensor(1)) - model.conv_1.register_buffer("_COREML_/weight/compression_type", torch.tensor([1])) - model.conv_2.register_buffer("_COREML_/weight/compression_type", torch.tensor([1])) + getattr(model, layername_1).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([1]) + ) + getattr(model, layername_2).register_buffer( + 
"_COREML_/weight/compression_type", torch.tensor([1]) + ) traced_model = torch.jit.trace(model, torch_input_values) input_shape = [input.shape.to_list() for input in inputs] @@ -902,7 +1157,7 @@ def test_pruning(self, compute_unit, sparse_ratio, minimum_deployment_target): assert len(sparse_ops) == 2 for sparse_op in sparse_ops: - assert sparse_op.outputs[0].child_ops[0].op_type == "conv" + assert sparse_op.outputs[0].child_ops[0].op_type == "linear" if use_linear else "conv" assert types.builtin_to_string(sparse_op.nonzero_data.dtype) == "fp32" if minimum_deployment_target >= ct.target.iOS18: assert types.builtin_to_string(sparse_op.mask.dtype) == "uint1" @@ -911,52 +1166,69 @@ def test_pruning(self, compute_unit, sparse_ratio, minimum_deployment_target): assert types.builtin_to_string(sparse_op.shape.dtype) == "uint32" @pytest.mark.parametrize( - "compute_unit, n_bits, signed", + "compute_unit, n_bits, signed, use_linear, frontend", itertools.product( compute_units, [4, 8], [True, False], + [True, False], + frontends, ), ) - def test_joint_pruning_quantization(self, compute_unit, n_bits, signed): + def test_joint_pruning_quantization(self, compute_unit, n_bits, signed, use_linear, frontend): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data( multi_layer=True, + use_linear=use_linear, ) # Make the weight sparse and also quantization-friendly. + layername_1 = "linear_1" if use_linear else "conv_1" + layername_2 = "linear_2" if use_linear else "conv_2" weight_1, scale_1, zero_point_1 = create_quantize_friendly_weight( - model.conv_1.weight.detach().numpy(), nbits=n_bits, signed=signed + getattr(model, layername_1).weight.detach().numpy(), nbits=n_bits, signed=signed ) - weight_1 *= np.random.randint(low=0, high=2, size=model.conv_1.weight.shape) + weight_1 *= np.random.randint(low=0, high=2, size=weight_1.shape) weight_2, scale_2, zero_point_2 = create_quantize_friendly_weight( - model.conv_2.weight.detach().numpy(), nbits=n_bits, signed=signed + getattr(model, layername_2).weight.detach().numpy(), nbits=n_bits, signed=signed ) - weight_2 *= np.random.randint(low=0, high=2, size=model.conv_2.weight.shape) + weight_2 *= np.random.randint(low=0, high=2, size=weight_2.shape) with torch.no_grad(): - model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1)) - model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2)) + getattr(model, layername_1).weight = torch.nn.Parameter(torch.Tensor(weight_1)) + getattr(model, layername_2).weight = torch.nn.Parameter(torch.Tensor(weight_2)) model.register_buffer("_COREML_/metadata_version", torch.tensor(2)) - model.conv_1.register_buffer("_COREML_/weight/compression_type", torch.tensor([1, 3])) - model.conv_1.register_buffer("_COREML_/weight/quantization_n_bits", torch.tensor(n_bits)) - model.conv_1.register_buffer( + getattr(model, layername_1).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([1, 3]) + ) + getattr(model, layername_1).register_buffer( + "_COREML_/weight/quantization_n_bits", torch.tensor(n_bits) + ) + getattr(model, layername_1).register_buffer( "_COREML_/weight/quantization_scale", torch.from_numpy(scale_1) ) - model.conv_1.register_buffer("_COREML_/weight/zero_point", torch.from_numpy(zero_point_1)) - model.conv_2.register_buffer("_COREML_/weight/compression_type", torch.tensor([1, 3])) - model.conv_2.register_buffer("_COREML_/weight/quantization_n_bits", torch.tensor(n_bits)) - model.conv_2.register_buffer( + getattr(model, layername_1).register_buffer( + 
"_COREML_/weight/zero_point", torch.from_numpy(zero_point_1) + ) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([1, 3]) + ) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/quantization_n_bits", torch.tensor(n_bits) + ) + getattr(model, layername_2).register_buffer( "_COREML_/weight/quantization_scale", torch.from_numpy(scale_2) ) - model.conv_2.register_buffer("_COREML_/weight/zero_point", torch.from_numpy(zero_point_2)) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/zero_point", torch.from_numpy(zero_point_2) + ) - traced_model = torch.jit.trace(model, torch_input_values) input_shape = [input.shape.to_list() for input in inputs] res = self.run_compare_torch( input_shape, - traced_model, + model, minimum_deployment_target=ct.target.iOS18, compute_unit=compute_unit, + frontend=frontend, converter=ct.convert, atol=1e-2, ) @@ -976,32 +1248,39 @@ def test_joint_pruning_quantization(self, compute_unit, n_bits, signed): for sparse_op in sparse_ops: assert types.builtin_to_string(sparse_op.mask.dtype) == "uint1" assert types.builtin_to_string(sparse_op.nonzero_data.dtype) == "fp32" - assert sparse_op.outputs[0].child_ops[0].op_type == "conv" + assert sparse_op.outputs[0].child_ops[0].op_type == "linear" if use_linear else "conv" @pytest.mark.parametrize( - "compute_unit, n_bits, group_size", + "compute_unit, n_bits, group_size, use_linear, frontend", itertools.product( compute_units, [4, 8], [0, 1, 2], + [True, False], + frontends, ), ) - def test_joint_pruning_palettization(self, compute_unit, n_bits, group_size): + def test_joint_pruning_palettization( + self, compute_unit, n_bits, group_size, use_linear, frontend + ): model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data( - multi_layer=True + multi_layer=True, + use_linear=use_linear, ) # Make the weight sparse and also can be represented by lut. 
- weight_1 = create_unique_weight(model.conv_1.weight, nbits=n_bits) * np.random.randint( - low=0, high=2, size=model.conv_1.weight.shape - ) - weight_2 = create_unique_weight(model.conv_2.weight, nbits=n_bits) * np.random.randint( - low=0, high=2, size=model.conv_2.weight.shape - ) + layername_1 = "linear_1" if use_linear else "conv_1" + layername_2 = "linear_2" if use_linear else "conv_2" + weight_1 = create_unique_weight( + getattr(model, layername_1).weight, nbits=n_bits + ) * np.random.randint(low=0, high=2, size=getattr(model, layername_1).weight.shape) + weight_2 = create_unique_weight( + getattr(model, layername_2).weight, nbits=n_bits + ) * np.random.randint(low=0, high=2, size=getattr(model, layername_2).weight.shape) with torch.no_grad(): - model.conv_1.weight = torch.nn.Parameter(torch.Tensor(weight_1)) - model.conv_2.weight = torch.nn.Parameter(torch.Tensor(weight_2)) + getattr(model, layername_1).weight = torch.nn.Parameter(torch.Tensor(weight_1)) + getattr(model, layername_2).weight = torch.nn.Parameter(torch.Tensor(weight_2)) lut_1_params = _quantization_passes.palettize_weights.blockwise_compress( weight_1, @@ -1017,10 +1296,18 @@ def test_joint_pruning_palettization(self, compute_unit, n_bits, group_size): ) model.register_buffer("_COREML_/metadata_version", torch.tensor(1)) - model.conv_1.register_buffer("_COREML_/weight/compression_type", torch.tensor([1, 2])) - model.conv_1.register_buffer("_COREML_/weight/lut", torch.tensor(lut_1_params.lut)) - model.conv_2.register_buffer("_COREML_/weight/compression_type", torch.tensor([1, 2])) - model.conv_2.register_buffer("_COREML_/weight/lut", torch.tensor(lut_2_params.lut)) + getattr(model, layername_1).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([1, 2]) + ) + getattr(model, layername_1).register_buffer( + "_COREML_/weight/lut", torch.tensor(lut_1_params.lut) + ) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/compression_type", torch.tensor([1, 2]) + ) + getattr(model, layername_2).register_buffer( + "_COREML_/weight/lut", torch.tensor(lut_2_params.lut) + ) traced_model = torch.jit.trace(model, torch_input_values) input_shape = [input.shape.to_list() for input in inputs] @@ -1058,4 +1345,4 @@ def test_joint_pruning_palettization(self, compute_unit, n_bits, group_size): for sparse_op in sparse_ops: assert types.builtin_to_string(sparse_op.mask.dtype) == "uint1" assert types.builtin_to_string(sparse_op.nonzero_data.dtype) == "fp32" - assert sparse_op.outputs[0].child_ops[0].op_type == "conv" + assert sparse_op.outputs[0].child_ops[0].op_type == "linear" if use_linear else "conv" diff --git a/coremltools/converters/mil/frontend/torch/test/test_torch_stateful_model.py b/coremltools/converters/mil/frontend/torch/test/test_torch_stateful_model.py index a9999fbe1..01828cdef 100644 --- a/coremltools/converters/mil/frontend/torch/test/test_torch_stateful_model.py +++ b/coremltools/converters/mil/frontend/torch/test/test_torch_stateful_model.py @@ -9,7 +9,7 @@ import pytest import coremltools as ct -from coremltools._deps import _HAS_EXECUTORCH, _HAS_TORCH_EXPORT_API +from coremltools.converters.mil.frontend.torch.utils import TorchFrontend from coremltools.converters.mil.mil import types from coremltools.converters.mil.mil.types.symbolic import any_symbolic from coremltools.converters.mil.testing_reqs import compute_units @@ -25,15 +25,7 @@ torch = pytest.importorskip("torch") -from .testing_utils import TorchFrontend, export_torch_model_to_frontend - -frontends = [TorchFrontend.TORCHSCRIPT] 
-if _HAS_TORCH_EXPORT_API or _HAS_EXECUTORCH: - frontends.append(TorchFrontend.EXIR) - -ALTER_FRONTEND = [False] -if _HAS_EXECUTORCH: - ALTER_FRONTEND.append(True) +from .testing_utils import export_torch_model_to_frontend, frontends @pytest.fixture @@ -239,16 +231,13 @@ def forward(self, x): ) class TestStateConversionAPI: @pytest.mark.parametrize( - "compute_unit, frontend, alter_frontend", - itertools.product(compute_units, frontends, ALTER_FRONTEND), + "compute_unit, frontend", + itertools.product(compute_units, frontends), ) - def test_state_model_api_example(self, compute_unit, frontend, alter_frontend): + def test_state_model_api_example(self, compute_unit, frontend): """ Test the public API example. """ - if frontend == TorchFrontend.TORCHSCRIPT and alter_frontend: - pytest.skip("Stateful conversion from torch.jit.script is not supported") - class UpdateBufferModel(torch.nn.Module): def __init__(self): super(UpdateBufferModel, self).__init__() @@ -265,18 +254,18 @@ def forward(self, x): source_model, (torch.tensor([1, 2, 3], dtype=torch.float16),), frontend, - use_scripting=alter_frontend, - use_edge_dialect=alter_frontend, ) + inputs = [ct.TensorType(shape=(3,))] if frontend == TorchFrontend.TORCHSCRIPT else None + states = ( + [ct.StateType(wrapped_type=ct.TensorType(shape=(3,)), name="state_1")] + if frontend == TorchFrontend.TORCHSCRIPT + else None + ) mlmodel = ct.convert( torch_model, - inputs=(None if frontend == TorchFrontend.EXIR else [ct.TensorType(shape=(3,))]), - states=( - None - if frontend == TorchFrontend.EXIR - else [ct.StateType(wrapped_type=ct.TensorType(shape=(3,)), name="state_1")] - ), + inputs=inputs, + states=states, minimum_deployment_target=ct.target.iOS18, convert_to="mlprogram", compute_units=compute_unit, diff --git a/coremltools/converters/mil/frontend/torch/test/testing_utils.py b/coremltools/converters/mil/frontend/torch/test/testing_utils.py index c52d84881..624bca002 100644 --- a/coremltools/converters/mil/frontend/torch/test/testing_utils.py +++ b/coremltools/converters/mil/frontend/torch/test/testing_utils.py @@ -3,6 +3,9 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import os +import platform +from pathlib import Path from typing import List, Union import numpy as np @@ -12,12 +15,18 @@ import coremltools as ct import coremltools.models.utils as coremltoolsutils -from coremltools import RangeDim, TensorType, _logger as logger +from coremltools import RangeDim, TensorType +from coremltools import _logger as logger from coremltools._deps import _HAS_EXECUTORCH, _HAS_TORCH_EXPORT_API, _IS_MACOS from coremltools.converters.mil.mil.types.type_mapping import nptype_from_builtin -from coremltools.converters.mil.testing_utils import ct_convert, validate_minimum_deployment_target +from coremltools.converters.mil.testing_utils import ( + _create_current_pytest_serialization_path, + ct_convert, + debug_save_mlmodels, + validate_minimum_deployment_target, +) -from ..utils import TORCH_DTYPE_TO_MIL_DTYPE, TorchFrontend +from ..utils import TORCH_DTYPE_TO_MIL_DTYPE, TORCH_EXPORT_BASED_FRONTENDS, TorchFrontend if _HAS_TORCH_EXPORT_API: from torch.export import ExportedProgram @@ -25,6 +34,34 @@ if _HAS_EXECUTORCH: import executorch.exir +if "TORCH_FRONTENDS" in os.environ: + frontends = [] + for frontend_str in os.environ["TORCH_FRONTENDS"].split(","): + frontend = TorchFrontend[frontend_str] + if platform.machine() == "x86_64" and frontend in 
TORCH_EXPORT_BASED_FRONTENDS: + logger.warning( + f"{frontend_str} is not supported well on x86_64, skipped this frontend test" + ) + continue + if frontend == TorchFrontend.TORCHEXPORT and not _HAS_TORCH_EXPORT_API: + logger.warning( + "Must have torch.export API to test TORCHEXPORT frontend. Skipped this frontend test." + ) + continue + if frontend == TorchFrontend.EXECUTORCH and not _HAS_EXECUTORCH: + logger.warning( + "Must have executorch to test EXECUTORCH frontend. Skipped this frontend test." + ) + continue + frontends.append(frontend) +else: + frontends = [TorchFrontend.TORCHSCRIPT] + if platform.machine() != "x86_64": + if _HAS_TORCH_EXPORT_API: + frontends.append(TorchFrontend.TORCHEXPORT) + if _HAS_EXECUTORCH: + frontends.append(TorchFrontend.EXECUTORCH) + class ModuleWrapper(nn.Module): """ @@ -157,7 +194,6 @@ def export_torch_model_to_frontend( input_data, frontend, use_scripting=False, - use_edge_dialect=True, torch_export_dynamic_shapes=None, ): input_data_clone = _copy_input_data(input_data) @@ -173,7 +209,7 @@ def export_torch_model_to_frontend( else: model_spec = torch.jit.trace(model, input_data_clone) - elif frontend == TorchFrontend.EXIR: + elif frontend in TORCH_EXPORT_BASED_FRONTENDS: try: model.eval() except NotImplementedError: @@ -182,13 +218,13 @@ def export_torch_model_to_frontend( model_spec = torch.export.export( model, input_data_clone, dynamic_shapes=torch_export_dynamic_shapes ) - if use_edge_dialect: + if frontend == TorchFrontend.EXECUTORCH: model_spec = executorch.exir.to_edge(model_spec).exported_program() else: raise ValueError( "Unknown value of frontend. Needs to be either TorchFrontend.TORCHSCRIPT " - f"or TorchFrontend.EXIR. Provided: {frontend}" + f"or TorchFrontend.TORCHEXPORT or TorchFrontend.EXECUTORCH. Provided: {frontend}" ) return model_spec @@ -240,6 +276,15 @@ def convert_and_compare( torch_input = _copy_input_data(input_data) expected_results = torch_model(*torch_input) expected_results = flatten_and_detach_torch_results(expected_results) + + PYTEST_CURRENT_TEST = os.environ.get("PYTEST_CURRENT_TEST").split("(call)")[0].strip() + if PYTEST_CURRENT_TEST in debug_save_mlmodels: + serialization_path = _create_current_pytest_serialization_path() + Path(serialization_path).mkdir(parents=True, exist_ok=True) + flat_inputs = flatten_and_detach_torch_results(input_data) + np.savez(serialization_path + "ref_inputs.npz", *flat_inputs) + np.savez(serialization_path + "ref_outputs.npz", *expected_results) + mlmodel = convert_to_mlmodel( model_spec, input_data, @@ -294,9 +339,6 @@ def run_compare_torch( backend=("neuralnetwork", "fp32"), rand_range=(-1.0, 1.0), use_scripting=False, - # TODO (rdar://128768037): Once we fully figure out torch.export converter, - # we may default the tests to ATen dialect - use_edge_dialect=True, converter_input_type=None, compute_unit=ct.ComputeUnit.CPU_ONLY, minimum_deployment_target=None, @@ -312,7 +354,7 @@ def run_compare_torch( expected_results : Expected result from running pytorch model. converter_input_type: If not None, then pass it to the "inputs" argument to the ct.convert() call. 
- frontend: Either TorchFrontend.TORCHSCRIPT or TorchFrontend.EXIR + frontend: TorchFrontend enum """ if minimum_deployment_target is not None: validate_minimum_deployment_target(minimum_deployment_target, backend) @@ -325,7 +367,6 @@ def run_compare_torch( input_data, frontend, use_scripting=use_scripting, - use_edge_dialect=use_edge_dialect, torch_export_dynamic_shapes=torch_export_dynamic_shapes, ) diff --git a/coremltools/converters/mil/frontend/torch/utils.py b/coremltools/converters/mil/frontend/torch/utils.py index e3b4da127..3507bf24e 100644 --- a/coremltools/converters/mil/frontend/torch/utils.py +++ b/coremltools/converters/mil/frontend/torch/utils.py @@ -118,7 +118,11 @@ def dtype_to_32bit(dtype): class TorchFrontend(Enum): TORCHSCRIPT = 1 - EXIR = 2 + TORCHEXPORT = 2 + EXECUTORCH = 3 + + +TORCH_EXPORT_BASED_FRONTENDS = (TorchFrontend.TORCHEXPORT, TorchFrontend.EXECUTORCH) def sanitize_op_kind(op_kind: str) -> str: @@ -141,14 +145,21 @@ def skip_default_prefix_and_suffix_with_deliminator( ) -> str: split = op_kind.split(deliminator) start = 1 if split[0] in {"aten", "prim"} and len(split) > 1 else 0 - stop = -1 if split[-1] in { - "default", - "tensor", - "tensor_mode", - "scalar", - "tensor_scalar", - } and len(split) - start > 1 else len(split) - op_kind = deliminator.join(split[start : stop]) + stop = ( + -1 + if split[-1] + in { + "default", + "int", + "tensor", + "tensor_mode", + "scalar", + "tensor_scalar", + } + and len(split) - start > 1 + else len(split) + ) + op_kind = deliminator.join(split[start:stop]) return op_kind # 1. Lower case diff --git a/coremltools/converters/mil/mil/operation.py b/coremltools/converters/mil/mil/operation.py index f7fc9cee9..624b10737 100644 --- a/coremltools/converters/mil/mil/operation.py +++ b/coremltools/converters/mil/mil/operation.py @@ -498,8 +498,8 @@ def check_and_detach(v_new, v_old, op, no_check_var_types): and not no_check_var_types ): raise ValueError( - f"New var type `{v_new.sym_type}` not a " - f"subtype of existing var type `{v_old.sym_type}`." + f"New var {v_new} doesn't have compatible " + f"subtype of existing var `{v_old}`." ) v_old.remove_child_op(op, no_check_var_types) diff --git a/coremltools/converters/mil/mil/ops/defs/iOS18/compression.py b/coremltools/converters/mil/mil/ops/defs/iOS18/compression.py index c89b2be90..2f5f59f62 100644 --- a/coremltools/converters/mil/mil/ops/defs/iOS18/compression.py +++ b/coremltools/converters/mil/mil/ops/defs/iOS18/compression.py @@ -97,8 +97,8 @@ def _validate_shift_scale_inputs( scale_dim = scale.shape[rank_idx] if data_dim % scale_dim != 0: raise ValueError( - f"Number of scales along each dimension should be a factor of " - f"corresponding dimension size of 'data'. However, at dim " + "Number of scales along each dimension should be a factor of " + "corresponding dimension size of 'data'. However, at dim " f"{rank_idx}, the 'data' has {data_dim} while 'scale' has {scale_dim}." 
) diff --git a/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py b/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py index 58e21c3c9..11c1b5167 100644 --- a/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py +++ b/coremltools/converters/mil/mil/ops/tests/iOS16/test_constexpr_ops.py @@ -63,6 +63,37 @@ def build(x): prog = mlmodel._mil_program assert "constexpr_affine_dequantize" in get_op_types_in_program(prog) + @pytest.mark.parametrize("compute_unit, backend", itertools.product(compute_units, backends)) + def test_builder_to_backend_linear(self, compute_unit, backend): + input_data = np.ones((4, 64), dtype=np.float32) + input_placeholders = { + "x": mb.placeholder(shape=input_data.shape), + } + input_values = {"x": input_data} + + def build(x): + weight = mb.constexpr_affine_dequantize( + quantized_data=np.ones((32, 64), dtype=np.uint8), + zero_point=np.uint8(0), + scale=np.float32(2.0), + axis=0, + ) + return mb.linear(x=x, weight=weight, bias=np.zeros((32,), dtype=np.float32)) + + expected_output_types = (4, 32, types.fp32) + expected_outputs = np.ones((4, 32), dtype=np.float32) * 128 + + mlmodel = run_compare_builder( + build, + input_placeholders, + input_values, + expected_output_types, + expected_outputs, + compute_unit=compute_unit, + backend=backend, + ) + assert "constexpr_affine_dequantize" in get_op_types_in_program(mlmodel._mil_program) + def test_is_all_zeros(self): @mb.program(opset_version=ct.target.iOS16) def prog_0_scalar(): diff --git a/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py b/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py index 5267e8e9e..545c768cb 100644 --- a/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py +++ b/coremltools/converters/mil/mil/passes/defs/cleanup/const_deduplication.py @@ -4,7 +4,7 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause import hashlib -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import numpy as np @@ -46,14 +46,32 @@ class const_deduplication(AbstractGraphPass): (2) Deduplication of ``constexpr_*`` op: We consider a ``constexpr_*`` as duplicated if there exists such a previous ``constexpr_*`` that has the same ``op_type`` and input attributes. + + Support options: + + - ``const_threshold``: Skip deduplicating ``const`` ops that have smaller number of elements than a threshold. Defaults to ``100``. i.e. the constants with ``size < 100`` will not be deduplicated. """ - NUMEL_THRESH = 100 + # const with size < _const_threshold will not be deduplicated + _const_threshold = 100 + + # length of the number value hashkey + LENGTH_OF_HASHKEY = 100 DTYPE2ATOL = { types.fp16: 6e-8, types.fp32: 1e-12, } + @property + def const_threshold(self) -> int: + return const_deduplication._const_threshold + + @const_threshold.setter + def const_threshold(self, val: int) -> None: + if not isinstance(val, int): + raise ValueError(f"Expect option 'const_threshold' to be type of int. 
Got {type(val)}.") + const_deduplication._const_threshold = val + def apply(self, prog) -> None: for f in prog.functions.values(): self._constant_deduplication_block(f) @@ -140,10 +158,10 @@ def find_constexprs(blocks: List[Block]) -> Dict[Var, List[Var]]: hash_key = [op.op_type] for v in op.inputs.values(): hash_key.append(v.dtype) - if np.prod(v.shape) < const_deduplication.NUMEL_THRESH: - hash_key.append(str(v.val)) - else: + if v.val is None or const_deduplication.should_be_deduplicated(v.val): hash_key.append(v) + else: + hash_key.append(str(v.val)) hash_key = tuple(hash_key) if hash_key not in hashkey_2_duplicates: hashkey_2_duplicates[hash_key] = [op.outputs[0]] @@ -152,6 +170,15 @@ def find_constexprs(blocks: List[Block]) -> Dict[Var, List[Var]]: return {v[0]: v[1:] for v in hashkey_2_duplicates.values()} + @staticmethod + def should_be_deduplicated(val: Union[str, bool, np.ndarray]) -> bool: + assert val is not None, "val should only be type of (str, bool, np.ndarray)" + if isinstance(val, (str, bool)): + return False + if np.prod(val.shape) < const_deduplication._const_threshold: + return False + return True + @staticmethod def find_constants(blocks: List[Block]) -> Dict[Var, List[Var]]: """ @@ -173,16 +200,16 @@ def find_constants(blocks: List[Block]) -> Dict[Var, List[Var]]: constant_var = op.outputs[0] if isinstance(constant_var, ListVar): continue - shape = constant_var.shape - numel = np.prod(shape) - if numel < const_deduplication.NUMEL_THRESH: + if not const_deduplication.should_be_deduplicated(constant_var.val): continue + shape = constant_var.shape dtype = constant_var.dtype value = constant_var.val + hash = hashlib.sha1( - np.ascontiguousarray(value.reshape(-1)[: const_deduplication.NUMEL_THRESH]) + np.ascontiguousarray(value.reshape(-1)[: const_deduplication.LENGTH_OF_HASHKEY]) ).hexdigest() if hasattr(op, "weight_key"): key = (op.weight_key, dtype, shape, hash) diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py b/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py index 216ae8937..ddd147248 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_elementwise_binary.py @@ -427,7 +427,10 @@ def _try_to_transform(op, block): # check the expand_dim op has axes = [0] expand_dims_op = expand_dims_ops[0] - if expand_dims_op.axes.val != [0]: + expand_dims_op_axes_val = expand_dims_op.axes.val + if isinstance(expand_dims_op_axes_val, np.ndarray): + expand_dims_op_axes_val = expand_dims_op_axes_val.tolist() + if expand_dims_op_axes_val != [0]: return False ops_to_remove.append(expand_dims_op) ops_to_remove += other_ops diff --git a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py index 220b5ea8f..a0f177c61 100644 --- a/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/optimize_quantization.py @@ -1120,3 +1120,115 @@ def _help_move_scale( new_var=scaled_output, force_replace=True, # Need to force replace because it involves replacing constexpr op. ) + + +@register_pass(namespace="common") +class canonicalize_quantized_lut_pattern(AbstractGraphPass): + """ + The quantized lut (e.g. 
each entry in the LUT is int8) could be represented by two patterns: + Pattern 1: + lut(int8) -> constexpr_blockwise_shift_scale -> lut(fp16) -> constexpr_lut_to_dense -> dense(fp16) + Pattern 2: + lut(int8) -> constexpr_lut_to_dense -> dense(int8) -> constexpr_blockwise_shift_scale -> dense(fp16) + Those two patterns are mathematically equivalent when the quantization is per-tensor or per-channel. + + This graph pass makes sure we always use one specific pattern by re-ordering the ops. + """ + + _DEQUANT_FIRST = True # First dequantize and then depalettize (use pattern 1). + + def apply(self, prog): + wrong_order_op1 = ( + "constexpr_lut_to_dense" if self._DEQUANT_FIRST else "constexpr_blockwise_shift_scale" + ) + wrong_order_op2 = ( + "constexpr_blockwise_shift_scale" if self._DEQUANT_FIRST else "constexpr_lut_to_dense" + ) + + @block_context_manager + def apply_block(block: Block): + for op in list(block.operations): + for b in op.blocks: + apply_block(b) + if op.op_type == wrong_order_op1 and len(op.outputs[0].child_ops) == 1: + if op.outputs[0].child_ops[0].op_type == wrong_order_op2: + self._reorder_quant_lut(block, op) + + for f in prog.functions.values(): + apply_block(f) + + def _reorder_quant_lut(self, block: Block, old_op1: Operation): + """ + Original order is op1 -> op2 -> output_op, and after reorder it becomes op2 -> op1 -> output_op. + Here op1 and op2 corresponds to either lut op or quant op, depending on `_DEQUANT_FIRST`. + """ + old_op2 = old_op1.outputs[0].child_ops[0] + # If the old op has some meaningful info in the name (such as "conv1.weight"), we need to keep it. + new_op1_name = None if old_op1.op_type in old_op1.name else old_op1.name + new_op2_name = None if old_op2.op_type in old_op2.name else old_op2.name + + if old_op1.op_type == "constexpr_blockwise_shift_scale": + # The old_op1 is dequant op and old_op2 is a lut op. + # The scale and offset from old_op1 is for lut, so the rank need to be adjusted. + if old_op1.scale.shape[-2:] != (1, 1): + raise AssertionError( + "The quantization on lut must be per-tensor, so last two dims in `scale` should " + f"both be 1, but got scale with shape {old_op1.scale.shape}." + ) + new_scale_shape = old_op1.scale.shape[-2:] + scale = old_op1.scale.val.reshape(new_scale_shape) + offset = old_op1.offset + if offset is not None and offset.val is not None: + offset = old_op1.offset.val.reshape(new_scale_shape) + + new_op1_args = {"indices": old_op2.indices, "lut": old_op1.data, "before_op": old_op2} + if new_op1_name is not None: + new_op1_args["name"] = new_op1_name + new_op1 = mb.constexpr_lut_to_dense(**new_op1_args) + + new_op2_args = {"data": new_op1, "scale": scale, "offset": offset, "before_op": old_op2} + if new_op2_name is not None: + new_op2_args["name"] = new_op2_name + new_op2 = mb.constexpr_blockwise_shift_scale(**new_op2_args) + else: + # The old_op1 is lut op and old_op2 is a dequant op. + # The scale and offset from old_op2 is for depalettized weight, so the rank need to be adjusted to match + # the lut's rank. + new_scale_shape = old_op2.scale.shape + (1, 1) + scale = old_op2.scale.val.reshape(new_scale_shape) + offset = old_op2.offset + if offset is not None and offset.val is not None: + offset = old_op2.offset.val.reshape(new_scale_shape) + + lut = old_op1.lut + if any(shape != 1 for shape in new_scale_shape): + # The lut need to be repeated when necessary. For example, in per-channel-scale, the lut has shape + # [16, 1, 16, 1], indices has shape [32, 1], and scale has shape [32, 1]. 
It means every two rows in + # the weight share a lut, and it's impossible to apply 32 scales to 16 lut tables. So we need to repeat + # the lut to become [32, 1, 16, 1], and then apply those 32 scales to each row. + lut = old_op1.lut.val + if lut is None: + return # Cannot handle the reording when the lut is not const. + for axis, (scale_shape, lut_shape) in enumerate(zip(new_scale_shape, lut.shape)): + if scale_shape > lut_shape: + if scale_shape % lut_shape != 0: + return # Skip when lut's shape cannot be repeated to match scale's shape. + lut = np.repeat(lut, scale_shape // lut_shape, axis=axis) + + new_op1_args = {"data": lut, "scale": scale, "offset": offset, "before_op": old_op1} + if new_op1_name is not None: + new_op1_args["name"] = new_op1_name + new_op1 = mb.constexpr_blockwise_shift_scale(**new_op1_args) + + new_op2_args = {"indices": old_op1.indices, "lut": new_op1, "before_op": old_op1} + if new_op2_name is not None: + new_op2_args["name"] = new_op2_name + new_op2 = mb.constexpr_lut_to_dense(**new_op2_args) + + block.replace_uses_of_var_after_op( + anchor_op=old_op2, + old_var=old_op2.outputs[0], + new_var=new_op2, + force_replace=True, # Need to force replace because it involves replacing constexpr op. + ) + block.remove_ops([old_op1, old_op2]) diff --git a/coremltools/converters/mil/mil/passes/defs/quantization.py b/coremltools/converters/mil/mil/passes/defs/quantization.py index 6fa20e947..a91edbde4 100644 --- a/coremltools/converters/mil/mil/passes/defs/quantization.py +++ b/coremltools/converters/mil/mil/passes/defs/quantization.py @@ -9,6 +9,7 @@ import numpy as np +from coremltools import _logger as logger from coremltools.converters.mil._deployment_compatibility import AvailableTarget from coremltools.converters.mil.input_types import TensorType from coremltools.converters.mil.mil import Block @@ -293,6 +294,11 @@ def transform_op(self, op) -> None: len(var._child_ops) > 1 and casted_var_name in self.current_cache_vars() ): + if self.current_cache_vars()[casted_var_name].op.x != var: + logger.warning( + "The cached cast Var doesn't match the original Var. It's due to duplicated Var " + f"names in the graph for {casted_var_name}." + ) casted_inputs[param][i] = self.current_cache_vars()[casted_var_name] else: x = mb.cast( diff --git a/coremltools/converters/mil/mil/passes/pass_pipeline.py b/coremltools/converters/mil/mil/passes/pass_pipeline.py index 2ffcedc48..2446ba07d 100644 --- a/coremltools/converters/mil/mil/passes/pass_pipeline.py +++ b/coremltools/converters/mil/mil/passes/pass_pipeline.py @@ -34,6 +34,7 @@ # after all quantization passes, since constexpr will not be further optimized # before const elimination, otherwise const dequantize would get bloated "common::dequantize_to_constexpr", + "common::canonicalize_quantized_lut_pattern", "common::const_elimination", "common::sanitize_input_output_names", "common::divide_to_multiply", @@ -93,6 +94,7 @@ # in the network (while reducing the total number of transposes), and after passes such as "fuse_layernorm_or_instancenorm" # which detects patterns that involve redundant ops ("sub") etc. "common::remove_redundant_ops", + "common::dedup_op_and_var_names", # Must be applied before "add_fp16_cast" because "add_fp16_cast" use unique name cache. "common::add_fp16_cast", # Will be removed if compute precision is not FP16. "common::add_int16_cast", # Will be removed if compute precision is not FP16. "common::update_output_dtypes", # Must run again after `add_fp16_cast` and `add_int16_cast`. 
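To make the equivalence described in the `canonicalize_quantized_lut_pattern` docstring above concrete, here is a minimal NumPy sketch of the two orderings for a per-tensor quantized LUT. It is an illustration only, not the MIL `constexpr_*` ops themselves: the toy shapes and values are made up, and it assumes a dequantization of the form `scale * (data - offset)`.

```python
import numpy as np

# Toy per-tensor quantized LUT: 16 int8 palette entries, 4-bit indices (assumed sizes).
nbits = 4
lut_int8 = np.random.randint(-6, 6, size=(2**nbits,)).astype(np.int8)
indices = np.random.randint(0, 2**nbits, size=(8, 4))
scale, offset = np.float16(2.0), np.int8(1)

# Pattern 1: dequantize the LUT first, then palettize (look up) into a dense fp16 weight.
lut_fp16 = (lut_int8.astype(np.float16) - offset) * scale
dense_1 = lut_fp16[indices]

# Pattern 2: palettize first into a dense int8 weight, then dequantize.
dense_int8 = lut_int8[indices]
dense_2 = (dense_int8.astype(np.float16) - offset) * scale

# The two orderings agree exactly for per-tensor (and per-channel) quantization,
# which is what lets the graph pass re-order the ops into one canonical form.
assert np.array_equal(dense_1, dense_2)
```

Once the scale becomes blockwise and its blocks no longer line up with whole LUT tables, the two forms stop being interchangeable as-is, which is why the pass repeats the LUT before moving a per-channel scale across it.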
diff --git a/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py b/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py index 87f54ceed..a1a7d7785 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_cleanup_passes.py @@ -150,6 +150,138 @@ def func(x): assert const_ops[1].weight_id == 1 assert const_ops[2].weight_id == 2 + @staticmethod + def test_const_deduplication_with_threshold(): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(2,)), + ] + ) + def prog(x): + # const_1 and const_2 will not be deduplicated + const_1 = [0.0] + const_2 = [0.0] + const_3 = [0.0, 1.0] + const_4 = [0.0, 1.0] + + # 4 add ops + x = mb.add(x=x, y=const_1) + x = mb.add(x=x, y=const_2) + x = mb.add(x=x, y=const_3) + return mb.add(x=x, y=const_4) + + graph_pass = PASS_REGISTRY["common::const_deduplication"] + graph_pass.const_threshold = 2 + apply_pass_and_basic_check(prog, graph_pass) + + # check the graph pass + assert_op_count_match(prog, expect=3, op="const") + const_ops = prog.functions["main"].find_ops(op_type="const") + assert const_ops[0].outputs[0].val.tolist() == [0.0] + assert const_ops[1].outputs[0].val.tolist() == [0.0] + assert const_ops[2].outputs[0].val.tolist() == [0.0, 1.0] + + @staticmethod + def test_const_deduplication_with_threshold_for_pad(): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(100,)), + ] + ) + def prog(x): + # both constant_val and pad inputs for two pad ops are deduplicaed + c_zero_scalar = np.float32(0.0) + x = mb.pad(x=x, pad=[1, 0], mode="constant", constant_val=c_zero_scalar) + return mb.pad(x=x, pad=[1, 0], mode="constant", constant_val=c_zero_scalar) + + graph_pass = PASS_REGISTRY["common::const_deduplication"] + graph_pass.const_threshold = -1 + apply_pass_and_basic_check(prog, graph_pass) + + # check the graph pass + assert_op_count_match(prog, expect=4, op="const") + const_ops = prog.functions["main"].find_ops(op_type="const") + assert const_ops[0].outputs[0].val.tolist() == [1, 0] + assert const_ops[1].outputs[0].val == "constant" + assert const_ops[2].outputs[0].val == 0.0 + assert const_ops[3].outputs[0].val == "constant" + + @staticmethod + @pytest.mark.parametrize( + "constexpr_op", + CONSTEXPR_OPS, + ) + def test_constexpr_deduplication_with_threshold(constexpr_op): + BATCH_DIM = 1 + SEQUENCE_LENGTH = 1 + ENCODING_DIM = 1 + EMBEDDING_DIM = 2 + + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + mb.TensorSpec(shape=(BATCH_DIM, SEQUENCE_LENGTH, ENCODING_DIM)), + ] + ) + def prog(q, k): + weight_q = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM, ENCODING_DIM), seed=19) + weight_k = CONSTEXPR_FUNCS[constexpr_op]((EMBEDDING_DIM, ENCODING_DIM), seed=19) + q_e = mb.linear(x=q, weight=weight_q) + k_e = mb.linear(x=k, weight=weight_k) + return mb.matmul(x=q_e, y=k_e, transpose_y=True) + + graph_pass = PASS_REGISTRY["common::const_deduplication"] + graph_pass.const_threshold = -1 + apply_pass_and_basic_check(prog, graph_pass) + + # check the graph pass + assert_op_count_match(prog, expect=1, op=constexpr_op) + + @staticmethod + def test_str_should_not_be_deduplicated(): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(1,)), + ] + ) + def prog(x): + x = mb.cast(x=x, dtype="int32") + return mb.cast(x=x, dtype="int32") + + graph_pass = PASS_REGISTRY["common::const_deduplication"] + graph_pass.const_threshold = -1 + apply_pass_and_basic_check(prog, graph_pass) + + # check the graph pass + 
assert_op_count_match(prog, expect=2, op="const") + const_ops = prog.functions["main"].find_ops(op_type="const") + assert const_ops[0].outputs[0].val == "int32" + assert const_ops[1].outputs[0].val == "int32" + + @staticmethod + def test_bool_should_not_be_deduplicated(): + @mb.program( + input_specs=[ + mb.TensorSpec(shape=(2,)), + mb.TensorSpec(shape=(2,)), + ] + ) + def prog(x, y): + return mb.argsort(x=x, axis=-1, ascending=False), mb.argsort( + x=y, axis=-1, ascending=False + ) + + graph_pass = PASS_REGISTRY["common::const_deduplication"] + graph_pass.const_threshold = -1 + apply_pass_and_basic_check(prog, graph_pass) + + # check the graph pass + assert_op_count_match(prog, expect=3, op="const") + const_ops = prog.functions["main"].find_ops(op_type="const") + assert const_ops[0].outputs[0].val == -1 + assert const_ops[1].outputs[0].val == False + assert const_ops[2].outputs[0].val == False + @pytest.mark.parametrize( "q_weight_key, k_weight_key", itertools.product( diff --git a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py index b03c8e46e..8ec2d271a 100644 --- a/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py +++ b/coremltools/converters/mil/mil/passes/tests/test_quantization_passes.py @@ -2017,6 +2017,118 @@ def prog(x): assert get_op_types_in_program(prog) == get_op_types_in_program(prev_prog) +@pytest.mark.skipif(ct.utils._macos_version() < (15, 0), reason="Only supported on macOS 15+") +class TestReorderQuantizedLut: + @staticmethod + def _verify_numerical(prev_prog, prog, block, input_shape, rtol=1e-7, atol=0.0): + # Verify the numerical output matches between `prev_prog` and `prog`. + prev_model = ct.convert( + prev_prog, + pass_pipeline=ct.PassPipeline.EMPTY, + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS18, + ) + model = ct.convert( + prog, + pass_pipeline=ct.PassPipeline.EMPTY, + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS18, + ) + output_name = block.outputs[0].name + x_val = np.random.rand(*input_shape).astype(np.float16) + input_dict = {"x": x_val} + prev_output = prev_model.predict(input_dict)[output_name] + output = model.predict(input_dict)[output_name] + np.testing.assert_allclose(prev_output, output, rtol=rtol, atol=atol) + + @staticmethod + def _construct_weights_with_two_orders(weight_shape: Tuple[int, ...]): + """Construct two quantized lut weights, represented in different quant/lut orders.""" + nbits = 4 + num_palette = 2**nbits + indices_np_dtype = types.nptype_from_builtin(types.string_to_builtin(f"uint{nbits}")) + indices = np.random.randint(low=0, high=num_palette, size=weight_shape).astype( + indices_np_dtype + ) + lut_shape = weight_shape + (num_palette, 1) + int8_lut = np.random.randint(low=0, high=6, size=lut_shape, dtype=np.int8) + scale = np.float16(2.0).reshape([1] * len(weight_shape)) + offset = np.int8(1).reshape([1] * len(weight_shape)) + + lut_weight1 = mb.constexpr_lut_to_dense(indices=indices, lut=int8_lut) + quantized_lut_weight1 = mb.constexpr_blockwise_shift_scale( + data=lut_weight1, scale=scale, offset=offset + ) + quantized_weight2 = mb.constexpr_blockwise_shift_scale( + data=int8_lut, + scale=scale.reshape([1] * len(int8_lut.shape)), + offset=offset.reshape([1] * len(int8_lut.shape)), + ) + quantized_lut_weight2 = mb.constexpr_lut_to_dense(indices=indices, lut=quantized_weight2) + + return quantized_lut_weight1, quantized_lut_weight2 + + @pytest.mark.parametrize( + 
"input_shape, dequant_first", itertools.product([(4, 3), (2, 3, 4)], [True, False]) + ) + def test_dequant_first(self, input_shape, dequant_first): + """ + When dequant_first is True, the quantized lut ops representation will be reordered to follow + lut(int8) -> constexpr_blockwise_shift_scale -> lut(fp16) -> constexpr_lut_to_dense -> dense(fp16). + When dequant_first is False, the quantized lut ops representation will be reordered to follow + lut(int8) -> constexpr_lut_to_dense -> dense(int8) -> constexpr_blockwise_shift_scale -> dense(fp16) + """ + + @mb.program( + input_specs=[mb.TensorSpec(shape=input_shape, dtype=types.fp16)], + opset_version=ct.target.iOS18, + ) + def prog(x): + quantized_lut_weight1, quantized_lut_weight2 = self._construct_weights_with_two_orders( + weight_shape=(8, input_shape[-1]) + ) + output1 = mb.linear(x=x, weight=quantized_lut_weight1) + output2 = mb.linear(x=x, weight=quantized_lut_weight2) + return mb.add(x=output1, y=output2) + + from unittest import mock + + from coremltools.converters.mil.mil.passes.defs.optimize_quantization import ( + canonicalize_quantized_lut_pattern, + ) + + with mock.patch.object(canonicalize_quantized_lut_pattern, "_DEQUANT_FIRST", dequant_first): + prev_prog, _, block = apply_pass_and_basic_check( + prog, "common::canonicalize_quantized_lut_pattern", skip_essential_scope_check=True + ) + + assert get_op_types_in_program(prev_prog) == [ + "constexpr_lut_to_dense", + "constexpr_blockwise_shift_scale", + "constexpr_blockwise_shift_scale", + "constexpr_lut_to_dense", + "linear", + "linear", + "add", + ] + dequant_ops = prog.functions["main"].find_ops(op_type="constexpr_blockwise_shift_scale") + lut_ops = prog.functions["main"].find_ops(op_type="constexpr_lut_to_dense") + assert len(dequant_ops) == 2 + assert len(lut_ops) == 2 + if dequant_first: + for dequant_op in dequant_ops: + assert dequant_op.outputs[0].child_ops[0].op_type == "constexpr_lut_to_dense" + for lut_op in lut_ops: + assert lut_op.outputs[0].child_ops[0].op_type == "linear" + else: + for lut_op in lut_ops: + assert lut_op.outputs[0].child_ops[0].op_type == "constexpr_blockwise_shift_scale" + for dequant_op in dequant_ops: + assert dequant_op.outputs[0].child_ops[0].op_type == "linear" + + self._verify_numerical(prev_prog, prog, block, input_shape) + + class TestFP16CastTransform: def assertEqual(self, first, second): """A convenience method to migrate from unittest (self.assertEqual) to pytest.""" diff --git a/coremltools/converters/mil/testing_utils.py b/coremltools/converters/mil/testing_utils.py index bc23279dc..6261f8fe9 100644 --- a/coremltools/converters/mil/testing_utils.py +++ b/coremltools/converters/mil/testing_utils.py @@ -115,20 +115,15 @@ def macos_compatible_with_deployment_target(minimum_deployment_target): return False return True -def _serialize_current_pytest(mlmodel): - """ - Usually pytest test name is of format file::class::test_function[param0-param1] (call)... 
- Assume each test produces only one Core ML model, - then file::class::test_function[param0-param1] is enough to determine unique name - {_COREMLTOOLS_DEBUG_SAVE_MLMODEL_DIRECTORY}/file/class/test_function/param0/param1/model.mlpackage - """ - mlpackage_path = _COREMLTOOLS_DEBUG_SAVE_MLMODEL_DIRECTORY + "/" + +def _create_current_pytest_serialization_path() -> str: + serialization_path = _COREMLTOOLS_DEBUG_SAVE_MLMODEL_DIRECTORY + "/" PYTEST_CURRENT_TEST = os.environ.get("PYTEST_CURRENT_TEST").split("(call)")[0].strip() test_name_fragments = PYTEST_CURRENT_TEST.split("::") for test_name_fragment in test_name_fragments[:-1]: - mlpackage_path += f"{test_name_fragment.strip()}/" + serialization_path += f"{test_name_fragment.strip()}/" test_name = test_name_fragments[-1] # For a parameterized test, further decompose parameters into directories @@ -138,17 +133,27 @@ def _serialize_current_pytest(mlmodel): test_function_name = test_name[:bra_index] parameters = test_name[bra_index + 1 : -1].split("-") # Append test function name and parameter to mlpackage path - mlpackage_path += f"{test_function_name}/" + serialization_path += f"{test_function_name}/" for parameter in parameters: - mlpackage_path += f"{parameter}/" + serialization_path += f"{parameter}/" else: - mlpackage_path += f"{test_name}/" + serialization_path += f"{test_name}/" - mlpackage_path += "model.mlpackage" + return serialization_path + +def _serialize_current_pytest_mlmodel(mlmodel) -> None: + """ + Usually pytest test name is of format file::class::test_function[param0-param1] (call)... + Assume each test produces only one Core ML model, + then file::class::test_function[param0-param1] is enough to determine unique name + {_COREMLTOOLS_DEBUG_SAVE_MLMODEL_DIRECTORY}/file/class/test_function/param0/param1/model.mlpackage + """ + mlpackage_path = _create_current_pytest_serialization_path() + "model.mlpackage" Path(mlpackage_path).mkdir(parents=True, exist_ok=True) mlmodel.save(mlpackage_path) + def assert_op_count_match(program, expect, op=None, verbose=False): """ Assert number of ops match expected number. 
If op is not specified, @@ -531,20 +536,20 @@ def ct_convert( skip_model_load = True mlmodel = converter( - program, - source=source, - inputs=inputs, - outputs=outputs, - classifier_config=classifier_config, - minimum_deployment_target=minimum_deployment_target, - convert_to=target, - compute_precision=compute_precision, - skip_model_load=skip_model_load, - **kwargs + program, + source=source, + inputs=inputs, + outputs=outputs, + classifier_config=classifier_config, + minimum_deployment_target=minimum_deployment_target, + convert_to=target, + compute_precision=compute_precision, + skip_model_load=skip_model_load, + **kwargs, ) if is_current_test_to_be_debugged: - _serialize_current_pytest(mlmodel) + _serialize_current_pytest_mlmodel(mlmodel) pytest.xfail("This test is to be debugged") return mlmodel diff --git a/coremltools/models/_compiled_model.py b/coremltools/models/_compiled_model.py index 0539f4433..2f95b407e 100644 --- a/coremltools/models/_compiled_model.py +++ b/coremltools/models/_compiled_model.py @@ -9,7 +9,10 @@ from coremltools import ComputeUnit as _ComputeUnit from coremltools.models.model import MLState as _MLState -from .model import MLModel as _MLModel +from .model import ( + _verify_optimization_hint_input, + MLModel as _MLModel, +) from .utils import _macos_version try: @@ -21,7 +24,12 @@ class CompiledMLModel: @staticmethod - def _init_check(path: str, compute_units: _ComputeUnit, function_name: str): + def _init_check( + path: str, + compute_units: _ComputeUnit, + function_name: str, + optimization_hints: _Optional[dict] = None, + ): if _macos_version() < (10, 13): raise Exception("Loading compiled Core ML models is only support on macOS 10.13 or higher.") if _MLModelProxy is None: @@ -35,11 +43,15 @@ def _init_check(path: str, compute_units: _ComputeUnit, function_name: str): if not isinstance(function_name, str): raise TypeError('The "function_name" parameter must be of type "str".') + _verify_optimization_hint_input(optimization_hints) + + def __init__( self, path: str, compute_units: _ComputeUnit = _ComputeUnit.ALL, function_name: _Optional[str] = None, + optimization_hints: _Optional[dict] = None, ): """ Loads a compiled Core ML model. @@ -59,6 +71,10 @@ def __init__( - ``coremltools.ComputeUnit.CPU_AND_NE``: Use both the CPU and neural engine, but not the GPU. Available only for macOS >= 13.0. + optimization_hints : dict or None + Keys are the names of the optimization hint, either 'reshapeFrequency' or 'specializationStrategy'. + Values are enumeration values of type ``coremltools.ReshapeFrequency`` or ``coremltools.SpecializationStrategy``. + Examples -------- .. 
sourcecode:: python @@ -73,10 +89,24 @@ def __init__( if function_name is None: function_name = "" - self._init_check(path, compute_units, function_name) + self._init_check(path, compute_units, function_name, optimization_hints) + + self.compute_unit = compute_units + self.function_name = function_name + if optimization_hints is not None: + self.optimization_hints = optimization_hints.copy() + else: + self.optimization_hints = None path = _expanduser(path) - self._proxy = _MLModelProxy(path, compute_units.name, function_name) + + if self.optimization_hints is not None: + optimization_hints_str_vals = {k: v.name for k, v in self.optimization_hints.items()} + else: + optimization_hints_str_vals = {} + + self._proxy = _MLModelProxy(path, compute_units.name, function_name, optimization_hints_str_vals) + def predict(self, data, state: _Optional[_MLState] = None): """ @@ -119,6 +149,7 @@ def predict(self, data, state: _Optional[_MLState] = None): self._proxy, _MLModel._update_float16_multiarray_input_to_float32, data, state ) + def make_state(self) -> _MLState: """ Returns a new state object, which can be passed to the ``predict`` method. diff --git a/coremltools/models/model.py b/coremltools/models/model.py index 96c1d2bfa..830d95a9b 100644 --- a/coremltools/models/model.py +++ b/coremltools/models/model.py @@ -15,9 +15,13 @@ import numpy as _np import numpy as _numpy -from coremltools import ComputeUnit as _ComputeUnit -from coremltools import _logger as logger -from coremltools import proto as _proto +from coremltools import ( + ComputeUnit as _ComputeUnit, + _logger as logger, + proto as _proto, + SpecializationStrategy as _SpecializationStrategy, + ReshapeFrequency as _ReshapeFrequency, +) from coremltools._deps import _HAS_TF_1, _HAS_TF_2, _HAS_TORCH from coremltools.converters.mil.mil.program import Program as _Program from coremltools.converters.mil.mil.scope import ScopeSource as _ScopeSource @@ -104,6 +108,30 @@ _METADATA_SOURCE_DIALECT = "com.github.apple.coremltools.source_dialect" +def _verify_optimization_hint_input(optimization_hint_input: _Optional[dict] = None) -> None: + """ + Throws an exception if ``optimization_hint_input`` is not valid. + """ + if optimization_hint_input is None: + return + if not isinstance(optimization_hint_input, dict): + raise TypeError('"optimization_hint_input" must be a dictionary or None') + + if optimization_hint_input != {} and _macos_version() < (15, 0): + raise ValueError('Optimization hints are only available on macOS >= 15.0') + + for k in optimization_hint_input.keys(): + if k not in ('reshapeFrequency', 'specializationStrategy'): + raise ValueError(f"Unrecognized key in optimization_hint dictionary: {k}") + + if "specializationStrategy" in optimization_hint_input and not isinstance(optimization_hint_input["specializationStrategy"], _SpecializationStrategy): + raise TypeError('"specializationStrategy" value of "optimization_hint_input" dictionary must be of type coremltools.SpecializationStrategy') + + if "reshapeFrequency" in optimization_hint_input and not isinstance(optimization_hint_input["reshapeFrequency"], _ReshapeFrequency): + raise TypeError('"reshapeFrequency" value of "optimization_hint_input" dictionary must be of type coremltools.ReshapeFrequency') + + + class _FeatureDescription: def __init__(self, fd_spec): self._fd_spec = fd_spec @@ -222,6 +250,7 @@ def __init__( compute_units=_ComputeUnit.ALL, weights_dir=None, function_name=None, + optimization_hints: _Optional[dict] = None, ): """ Construct an MLModel from an ``.mlmodel``. 
@@ -282,6 +311,10 @@ def __init__( The name of the function from ``model`` to load. If not provided, ``function_name`` will be set to the ``defaultFunctionName`` in the proto. + optimization_hints : dict or None + Keys are the names of the optimization hint, either 'reshapeFrequency' or 'specializationStrategy'. + Values are enumeration values of type ``coremltools.ReshapeFrequency`` or ``coremltools.SpecializationStrategy``. + Notes ----- Internally this maintains the following: @@ -342,8 +375,15 @@ def does_model_contain_mlprogram(model) -> bool: raise ValueError( 'coremltools.ComputeUnit.CPU_AND_NE is only available on macOS >= 13.0' ) + + _verify_optimization_hint_input(optimization_hints) + self.compute_unit = compute_units self.function_name = function_name + if optimization_hints is not None: + self.optimization_hints = optimization_hints.copy() + else: + self.optimization_hints = None self.is_package = False self.is_temp_package = False @@ -361,7 +401,7 @@ def does_model_contain_mlprogram(model) -> bool: self.is_temp_package = is_temp_package self._weights_dir = _try_get_weights_dir_path(model) self.__proxy__, self._spec, self._framework_error = self._get_proxy_and_spec( - model, compute_units, skip_model_load=skip_model_load, + model, compute_units, skip_model_load=skip_model_load, optimization_hints=optimization_hints, ) elif isinstance(model, _proto.Model_pb2.Model): if does_model_contain_mlprogram(model): @@ -381,7 +421,7 @@ def does_model_contain_mlprogram(model) -> bool: _save_spec(model, filename) self.__proxy__, self._spec, self._framework_error = self._get_proxy_and_spec( - filename, compute_units, skip_model_load=skip_model_load, + filename, compute_units, skip_model_load=skip_model_load, optimization_hints=optimization_hints ) try: _os.remove(filename) @@ -415,7 +455,11 @@ def does_model_contain_mlprogram(model) -> bool: self._model_input_names_set = set([i.name for i in f.input]) def _get_proxy_and_spec( - self, filename: str, compute_units: _ComputeUnit, skip_model_load: _Optional[bool] = False + self, + filename: str, + compute_units: _ComputeUnit, + skip_model_load: _Optional[bool] = False, + optimization_hints: _Optional[dict] = None, ): filename = _os.path.expanduser(filename) specification = _load_spec(filename) @@ -430,10 +474,14 @@ def _get_proxy_and_spec( return None, specification, None function_name = "" if self.function_name is None else self.function_name + if optimization_hints is not None: + optimization_hints_str_vals = {k: v.name for k, v in optimization_hints.items()} + else: + optimization_hints_str_vals = {} try: return ( - _MLModelProxy(filename, compute_units.name, function_name), + _MLModelProxy(filename, compute_units.name, function_name, optimization_hints_str_vals), specification, None, ) diff --git a/coremltools/optimize/__init__.py b/coremltools/optimize/__init__.py index ad15d7c90..fb8aba0a3 100644 --- a/coremltools/optimize/__init__.py +++ b/coremltools/optimize/__init__.py @@ -3,9 +3,9 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -from coremltools._deps import _HAS_TORCH +from coremltools._deps import _IMPORT_CT_OPTIMIZE_TORCH from . import coreml -if _HAS_TORCH: +if _IMPORT_CT_OPTIMIZE_TORCH: from . 
import torch diff --git a/coremltools/optimize/coreml/_quantization_passes.py b/coremltools/optimize/coreml/_quantization_passes.py index a6b1ec857..5c3b14774 100644 --- a/coremltools/optimize/coreml/_quantization_passes.py +++ b/coremltools/optimize/coreml/_quantization_passes.py @@ -1307,6 +1307,8 @@ def blockwise_compress( """ Compress original_data into n-bit representation by quantization. + mode: "LINEAR_SYMMETRIC" or "LINEAR". + block_sizes: Each element is the block size on corresponding axis for original_data. Returns None if the weight cannot be compressed (for example, the dim size on an axis is not diff --git a/coremltools/optimize/coreml/experimental/_post_training_quantization.py b/coremltools/optimize/coreml/experimental/_post_training_quantization.py index c8925d46e..c48330d09 100644 --- a/coremltools/optimize/coreml/experimental/_post_training_quantization.py +++ b/coremltools/optimize/coreml/experimental/_post_training_quantization.py @@ -4,7 +4,7 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause from collections import defaultdict -from typing import Dict, List, Union +from typing import Dict, List, Optional, Union import numpy as np @@ -24,13 +24,17 @@ ) -def linear_quantize_activations(mlmodel: _MLModel, config: _OptimizationConfig, sample_data: List): +def linear_quantize_activations( + mlmodel: _MLModel, + config: _OptimizationConfig, + sample_data: List[Dict[Optional[str], np.ndarray]], +): """ Utility function to convert a float precision MLModel of type ``mlprogram``, which uses float-precision activations, into a compressed MLModel that uses n-bit activations. Currently, only n=8 is suppported. - This is achieved by feeding real sample data into the input MLModel, calibrating the resulting float activation values, + This is achieved by feeding real sample data into the input MLModel, calibrating the resulting float activation values, converting the calibrated values into ``quantize`` and ``dequantize`` op pairs, and inserting those op pairs into the new MLModel instance where activations get quantized. @@ -47,7 +51,9 @@ def linear_quantize_activations(mlmodel: _MLModel, config: _OptimizationConfig, sample_data: List Data used to characterize statistics of the activation values of the original float precision model. - Expects a list of sample input dictionaries. + Expects a list of sample input dictionaries, which should have the same format as the data used in `.predict` + method for the mlmodel. More specifically, the input name need to be specified in the data, unless it's a single + input model where the name will be auto inferred. Returns ------- @@ -77,6 +83,17 @@ def linear_quantize_activations(mlmodel: _MLModel, config: _OptimizationConfig, ) compressed_model_w8a8 = cto.linear_quantize_weights(compressed_model_a8, weight_config) """ + # Validate Sample data. If the sample data name is not provided, try to infer it. + for sample in sample_data: + if None in sample.keys(): + input_spec = mlmodel.get_spec().description.input + if len(sample.keys()) > 1 or len(input_spec) > 1: + raise ValueError( + "When the model has multiple inputs, please provide the name for each data in `sample_data`" + ) + inferred_input_name = input_spec[0].name + sample[inferred_input_name] = sample[None] + del sample[None] ### Apply four major graph passes in order. 
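To illustrate the `sample_data` format that `linear_quantize_activations` expects (the same format as `MLModel.predict` inputs), here is a minimal sketch; the input name `"data"`, the shape, the number of samples, and the pre-existing `mlmodel` and `config` objects are assumptions made purely for illustration.

```python
import numpy as np

# Each list element is one calibration sample, keyed by input name, exactly as it
# would be passed to mlmodel.predict(...).
sample_data = [
    {"data": np.random.rand(1, 64, 10, 10).astype(np.float32)} for _ in range(16)
]

# `mlmodel` (a float mlprogram MLModel) and `config` (an OptimizationConfig) are
# assumed to exist. For a single-input model the input name can be inferred; for a
# multi-input model every input must be named, otherwise a ValueError is raised.
compressed_model_a8 = linear_quantize_activations(mlmodel, config, sample_data)
```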
@@ -220,17 +237,25 @@ def _adjust_concat_surrounding_activation_stats( group_rmin_list, group_rmax_list = [], [] for tensor_name in concat_group: - group_rmin_list.append(activation_stats_dict[tensor_name]["rmin"]) - group_rmax_list.append(activation_stats_dict[tensor_name]["rmax"]) + # Some tensor_name may not have rmin/rmax if the calibration failed before. + if tensor_name in activation_stats_dict: + group_rmin_list.append(activation_stats_dict[tensor_name]["rmin"]) + group_rmax_list.append(activation_stats_dict[tensor_name]["rmax"]) + + if len(group_rmin_list) == 0: + raise ValueError( + "None of the calibration run succeeded. Please check logs about calibrating sample failures." + ) group_rmin, group_rmax = min(group_rmin_list), max(group_rmax_list) for tensor_name in concat_group: - activation_stats_dict[tensor_name]["rmin"] = group_rmin - activation_stats_dict[tensor_name]["rmax"] = group_rmax + if tensor_name in activation_stats_dict: + activation_stats_dict[tensor_name]["rmin"] = group_rmin + activation_stats_dict[tensor_name]["rmax"] = group_rmax def _get_activation_calibration_stats( - fpmodel: _MLModel, sample_data: List + fpmodel: _MLModel, sample_data: List[Dict[str, np.ndarray]] ) -> Dict[str, Dict[str, float]]: """ Calibration and store a dict of intermediate tensor stats. @@ -246,7 +271,6 @@ def _get_activation_calibration_stats( ------- activation_calibration_stats: dict """ - logger.warning( "Running compression pass linear_quantize_activations: start calibrating {} samples".format( len(sample_data) diff --git a/coremltools/optimize/torch/quantization/_backend_config.py b/coremltools/optimize/torch/quantization/_backend_config.py index 80f7b4624..52d3c57fa 100644 --- a/coremltools/optimize/torch/quantization/_backend_config.py +++ b/coremltools/optimize/torch/quantization/_backend_config.py @@ -29,7 +29,7 @@ activation_configs as _activation_configs, ) from coremltools.optimize.torch.quantization._backend_config_utils import ( - binary_op_act_configs as _binary_op_relu_configs, + binary_op_act_configs as _binary_op_act_configs, ) from coremltools.optimize.torch.quantization._backend_config_utils import ( binary_op_configs as _binary_op_configs, @@ -724,7 +724,7 @@ def _add_act() -> _List[_BackendPatternConfig]: FakeQuant -> """ acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) - return _binary_op_relu_configs(ops=[_operator.add, _torch.add], acts=list(acts)) + return _binary_op_act_configs(ops=[_operator.add, _torch.add], acts=list(acts)) @_BackendConfigRegistry.register() @@ -741,7 +741,7 @@ def _mul_act() -> _List[_BackendPatternConfig]: FakeQuant -> """ acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) - return _binary_op_relu_configs(ops=[_operator.mul, _torch.mul], acts=list(acts)) + return _binary_op_act_configs(ops=[_operator.mul, _torch.mul], acts=list(acts)) @_BackendConfigRegistry.register() @@ -758,7 +758,23 @@ def _matmul_act() -> _List[_BackendPatternConfig]: FakeQuant -> """ acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) - return _binary_op_relu_configs(ops=[_torch.matmul], acts=list(acts)) + return _binary_op_act_configs(ops=[_torch.matmul], acts=list(acts)) + + +@_BackendConfigRegistry.register() +def _einsum_act() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + einsum -> Act -> output + input_2 -> + qat: + FakeQuant -> + einsum -> Act -> FakeQuant + FakeQuant -> + """ + acts = _mod_activations + _func_activations + (_nn.ReLU, _F.relu, _torch.relu) + 
return _binary_op_act_configs(ops=[_torch.einsum], acts=list(acts)) @_BackendConfigRegistry.register() @@ -809,6 +825,21 @@ def _matmul() -> _List[_BackendPatternConfig]: return _binary_op_configs(ops=[_torch.matmul]) +@_BackendConfigRegistry.register() +def _einsum() -> _List[_BackendPatternConfig]: + """ + float: + input_1 -> + einsum -> output + input_2 -> + qat: + FakeQuant -> + einsum -> FakeQuant + FakeQuant -> + """ + return _binary_op_configs(ops=[_torch.einsum]) + + @_BackendConfigRegistry.register() def _cat() -> _List[_BackendPatternConfig]: """ diff --git a/coremltools/optimize/torch/quantization/_utils.py b/coremltools/optimize/torch/quantization/_utils.py index f283026e9..f0042b448 100644 --- a/coremltools/optimize/torch/quantization/_utils.py +++ b/coremltools/optimize/torch/quantization/_utils.py @@ -3,6 +3,7 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause +import math import operator as _operator from collections import defaultdict from enum import Enum as _Enum @@ -205,11 +206,23 @@ def get_quant_range(n_bits: int, dtype: _torch.dtype) -> _Tuple[int, int]: quant_max = max_q / 2 - 1 return int(quant_min), int(quant_max) +def get_n_bits_from_range(quant_min: int, quant_max: int) -> int: + """ + Returns quantization n_bits for given quantization range + """ + n_bits = int(math.log2(quant_max + 1)) + if quant_min < 0: + n_bits += 1 + + return n_bits -def register_compression_metadata(submodule, config): + +def register_compression_metadata(submodule): metadata = _CompressionMetadata("weight") metadata.compression_type = ["quantization"] - metadata.quantization_n_bits = config.weight_n_bits + metadata.quantization_n_bits = get_n_bits_from_range( + submodule.weight_quant_min, submodule.weight_quant_max + ) metadata.quantization_scale = ( submodule.weight_scale.detach().clone().unsqueeze(-1) if submodule.weight_axis == 0 diff --git a/coremltools/optimize/torch/quantization/quantizer.py b/coremltools/optimize/torch/quantization/quantizer.py index 30eab3a8c..726ed5152 100644 --- a/coremltools/optimize/torch/quantization/quantizer.py +++ b/coremltools/optimize/torch/quantization/quantizer.py @@ -283,8 +283,7 @@ def finalize( _register_metadata_version(finalized_model) for name, submodule in finalized_model.named_modules(remove_duplicate=True): if hasattr(submodule, "weight_scale"): - submod_config = self._config.get_module_config(name, submodule) - _register_compression_metadata(submodule, submod_config) + _register_compression_metadata(submodule) if model is None: self._model = finalized_model diff --git a/coremltools/test/api/test_api_visibilities.py b/coremltools/test/api/test_api_visibilities.py index ea8fe22aa..7a3ce26bc 100644 --- a/coremltools/test/api/test_api_visibilities.py +++ b/coremltools/test/api/test_api_visibilities.py @@ -45,6 +45,8 @@ def _check_visible_modules(actual, expected): "libmilstoragepython", "optimize", "StateType", + "ReshapeFrequency", + "SpecializationStrategy", ] diff --git a/coremltools/test/ml_program/test_compression.py b/coremltools/test/ml_program/test_compression.py index 7452c1710..b3989d469 100644 --- a/coremltools/test/ml_program/test_compression.py +++ b/coremltools/test/ml_program/test_compression.py @@ -20,7 +20,9 @@ def get_test_model_and_data( - multi_layer: bool = False, quantize_config: Optional[OpCompressorConfig] = None + multi_layer: bool = False, + quantize_config: Optional[OpCompressorConfig] = None, + use_linear: 
bool = False, ): """ Prepare test model and data. @@ -29,19 +31,24 @@ def get_test_model_and_data( :param quantize_config: If set, the weights in the test model will be nbits quantization-friendly, which means it will be first quantized according to the config, and then dequantized, so the numerical error introduced during the quantization test will be minimum. + :param use_linear: If set, use linear instead of conv in the model. """ if quantize_config is not None and multi_layer: raise AssertionError("Multi-layer model doesn't support pre_quantize_nbits.") inputs = [ct.TensorType(name="data", shape=(1, 64, 10, 10))] + if use_linear: + inputs = [ct.TensorType(name="data", shape=(1, 64))] + torch_input_values = [torch.rand(*i.shape.to_list()) for i in inputs] coreml_input_values = { i.name: val.detach().numpy() for i, val in zip(inputs, torch_input_values) } if multi_layer: - class Model(torch.nn.Module): + + class ConvModel(torch.nn.Module): def __init__(self): - super(Model, self).__init__() + super(ConvModel, self).__init__() self.conv_1 = torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=2) self.conv_2 = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=2) @@ -50,9 +57,22 @@ def forward(self, x): conv_2 = self.conv_2(conv_1) return conv_2 - model = Model().eval() + class LinearModel(torch.nn.Module): + def __init__(self): + super(LinearModel, self).__init__() + self.linear_1 = torch.nn.Linear(in_features=64, out_features=32, bias=False) + self.linear_2 = torch.nn.Linear(in_features=32, out_features=16, bias=False) + + def forward(self, x): + linear_1 = self.linear_1(x) + return self.linear_2(linear_1) + + model = LinearModel().eval() if use_linear else ConvModel().eval() else: model = torch.nn.Conv2d(in_channels=64, out_channels=32, kernel_size=2) + if use_linear: + model = torch.nn.Linear(in_features=64, out_features=32, bias=False) + if quantize_config is not None: # Manually change weight to make it quantization friendly. 
nbits_range_max = 2 ** (quantize_config.nbits - 1) - 1 diff --git a/coremltools/test/modelpackage/test_modelpackage.py b/coremltools/test/modelpackage/test_modelpackage.py index 1618194e8..9d2cab934 100644 --- a/coremltools/test/modelpackage/test_modelpackage.py +++ b/coremltools/test/modelpackage/test_modelpackage.py @@ -3,6 +3,8 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause + +import itertools import json import os import platform @@ -41,7 +43,6 @@ def _remove_path(path): class TestMLModel: def setup_class(self): - spec = Model_pb2.Model() spec.specificationVersion = coremltools.SPECIFICATION_VERSION @@ -459,6 +460,47 @@ def forward(self, x): shutil.rmtree(package_path) + @pytest.mark.skipif(utils._macos_version() < (15, 0), + reason="optimization hints available only on macOS15+") + @pytest.mark.parametrize("reshapeFrequency, specializationStrategy", + itertools.product( + (ct.ReshapeFrequency.Frequent, ct.ReshapeFrequency.Infrequent, None), + (ct.SpecializationStrategy.FastPrediction, ct.SpecializationStrategy.Default, None), + )) + def test_optimization_hints(self, reshapeFrequency, specializationStrategy): + optimization_hints={} + if reshapeFrequency is not None: + optimization_hints['reshapeFrequency'] = reshapeFrequency + if specializationStrategy is not None: + optimization_hints['specializationStrategy'] = specializationStrategy + if len(optimization_hints) == 0: + optimization_hints = None + + m = MLModel(self.spec, optimization_hints=optimization_hints) + assert isinstance(m, MLModel) + assert(m.optimization_hints == optimization_hints) + + + @pytest.mark.skipif(utils._macos_version() < (15, 0), + reason="optimization hints available only on macOS15+") + def test_optimization_hint_error_cases(self): + with pytest.raises(TypeError, match='"optimization_hint_input" must be a dictionary'): + MLModel(self.spec, optimization_hints=12) + + with pytest.raises(ValueError, match='Unrecognized key in optimization_hint dictionary: bad key'): + MLModel(self.spec, optimization_hints={'bad key': ct.ReshapeFrequency.Frequent}) + + with pytest.raises(TypeError, match='"specializationStrategy" value of "optimization_hint_input" dictionary must be of type coremltools.SpecializationStrategy'): + MLModel(self.spec, optimization_hints={"specializationStrategy": 12}) + + with pytest.raises(TypeError, match='"reshapeFrequency" value of "optimization_hint_input" dictionary must be of type coremltools.ReshapeFrequency'): + MLModel(self.spec, optimization_hints={"reshapeFrequency": 12}) + + with pytest.raises(TypeError, match='"reshapeFrequency" value of "optimization_hint_input" dictionary must be of type coremltools.ReshapeFrequency'): + # SpecializationStrategy value for ReshapeFrequency key + MLModel(self.spec, optimization_hints={"reshapeFrequency": ct.SpecializationStrategy.Default}) + + class TestCompiledMLModel: @pytest.mark.skipif(ct.utils._macos_version() < (15, 0), reason="State only supported on macOS 15+") def test_state(self): diff --git a/coremltools/test/neural_network/test_compiled_model.py b/coremltools/test/neural_network/test_compiled_model.py index d2595ed4e..0e53acf3d 100644 --- a/coremltools/test/neural_network/test_compiled_model.py +++ b/coremltools/test/neural_network/test_compiled_model.py @@ -3,12 +3,14 @@ # Use of this source code is governed by a BSD-3-clause license that can be # found in the LICENSE.txt file or at 
https://opensource.org/licenses/BSD-3-Clause + +import itertools from shutil import copytree, rmtree from tempfile import TemporaryDirectory import pytest -from coremltools import ComputeUnit +from coremltools import ComputeUnit, ReshapeFrequency, SpecializationStrategy, utils from coremltools.models import CompiledMLModel, MLModel from coremltools.models.utils import compile_model, load_spec, save_spec from coremltools.proto import Model_pb2 @@ -36,6 +38,12 @@ def setup(self): spec.description.predictedFeatureName = 'y' self.spec = spec + self.compiled_model_path = compile_model(self.spec) + + + def teardown_class(self): + rmtree(self.compiled_model_path) + def _test_compile_model_path(self, compiled_model_path, compute_units=ComputeUnit.ALL): try: @@ -114,3 +122,24 @@ def test_save_load_spec(self): my_spec = load_spec(file_path) compiled_model_path = compile_model(my_spec) self._test_compile_model_path(compiled_model_path) + + + @pytest.mark.skipif(utils._macos_version() < (15, 0), + reason="optimization hints available only on macOS15+") + @pytest.mark.parametrize("reshapeFrequency, specializationStrategy", + itertools.product( + (ReshapeFrequency.Frequent, ReshapeFrequency.Infrequent, None), + (SpecializationStrategy.FastPrediction, SpecializationStrategy.Default, None), + )) + def test_optimization_hints(self, reshapeFrequency, specializationStrategy): + optimization_hints={} + if reshapeFrequency is not None: + optimization_hints['reshapeFrequency'] = reshapeFrequency + if specializationStrategy is not None: + optimization_hints["specializationStrategy"] = specializationStrategy + if len(optimization_hints) == 0: + optimization_hints = None + + m = CompiledMLModel(self.compiled_model_path, optimization_hints=optimization_hints) + assert isinstance(m, CompiledMLModel) + assert(m.optimization_hints == optimization_hints) diff --git a/coremltools/test/neural_network/test_tf_numeric.py b/coremltools/test/neural_network/test_tf_numeric.py index e248a98c2..c952cea6f 100644 --- a/coremltools/test/neural_network/test_tf_numeric.py +++ b/coremltools/test/neural_network/test_tf_numeric.py @@ -165,11 +165,6 @@ def test_data_reorganize_cpu_only(self): self.test_data_reorganize(cpu_only=True) def test_depthwise_conv(self, cpu_only=False): - if not cpu_only: - pytest.xfail( - "rdar://116060011: re-activate coremltools tests blocked by Core ML regressions" - ) - def get_coreml_model_depthwise(X, params, w): eval = True mlmodel = None diff --git a/coremltools/test/optimize/api/test_optimize_api.py b/coremltools/test/optimize/api/test_optimize_api.py index cb2e1d8b8..1827ab01d 100644 --- a/coremltools/test/optimize/api/test_optimize_api.py +++ b/coremltools/test/optimize/api/test_optimize_api.py @@ -332,6 +332,46 @@ def test_programmatic_example_2(self): output_file = tempfile.NamedTemporaryFile(suffix=".mlpackage").name coreml_model.save(output_file) + def test_quantize_submodule(self): + import torch + from torchvision.models import mobilenet_v3_small + + import coremltools as ct + from coremltools.optimize.torch.quantization import LinearQuantizer + + class Model(torch.nn.Module): + def __init__(self): + super().__init__() + self.model1 = mobilenet_v3_small() + self.model2 = mobilenet_v3_small() + + def forward(self, x): + return self.model1(x), self.model2(x) + + model = Model() + data = torch.randn(1, 3, 224, 224) + example_inputs = (data,) + + quantizer = LinearQuantizer(model.model1) + model.model1 = quantizer.prepare(example_inputs=example_inputs) + model(data) + model.model1 = 
quantizer.finalize() + + model = model.eval() + traced_model = torch.jit.trace(model, example_inputs=example_inputs) + coreml_model = ct.convert( + traced_model, + convert_to="mlprogram", + inputs=[ct.TensorType(shape=data.shape)], + minimum_deployment_target=ct.target.iOS18, + skip_model_load=True, + ) + assert coreml_model is not None + quant_ops = coreml_model._mil_program.functions["main"].find_ops( + op_type="constexpr_blockwise_shift_scale" + ) + assert len(quant_ops) > 0 + class TestConvertingCompressedSourceModels: """ diff --git a/coremltools/test/optimize/coreml/test_post_training_quantization.py b/coremltools/test/optimize/coreml/test_post_training_quantization.py index 7b40e66fc..3090106b5 100644 --- a/coremltools/test/optimize/coreml/test_post_training_quantization.py +++ b/coremltools/test/optimize/coreml/test_post_training_quantization.py @@ -1379,7 +1379,12 @@ def test_palettization_pcs(self, compute_unit, backend): )[0] assert types.builtin_to_string(palettize_op.indices.dtype) == "uint4" # The per-channel-scale is represented by a quant op to do scaling. - assert palettize_op.outputs[0].child_ops[0].op_type == "constexpr_blockwise_shift_scale" + quantize_ops = mlmodel_palettized._mil_program.functions["main"].find_ops( + op_type="constexpr_blockwise_shift_scale" + ) + assert len(quantize_ops) > 0 + # Order of quant and lut op is determined by canonicalize_quantized_lut_pattern graph pass. + assert quantize_ops[0].outputs[0].child_ops[0].op_type == "constexpr_lut_to_dense" if _macos_version() >= (15, 0): verify_model_outputs(mlmodel, mlmodel_palettized, coreml_input_values) @@ -1698,7 +1703,7 @@ def test_default_prune_pipeline_ios18(self, compute_unit, backend): assert types.builtin_to_string(sparse_op.shape.dtype) == "uint32" if _macos_version() >= (15, 0): - verify_model_outputs(mlmodel, mlmodel_pruned, coreml_input_values, rtol=2e-3, atol=2e-3) + verify_model_outputs(mlmodel, mlmodel_pruned, coreml_input_values, rtol=3e-3, atol=2e-3) class TestJointCompressWeights: @@ -1938,18 +1943,27 @@ def test_joint_prune_palettize_weights( ) @pytest.mark.parametrize( - "compute_unit, backend, nbits, channel_group_size", + "compute_unit, backend, nbits, channel_group_size, quantize_first", itertools.product( compute_units, backends, (3, 4, 8), (0, 1, 2), + (True, False), ), ) def test_joint_palettize_quantize_weights( - self, compute_unit, backend, nbits, channel_group_size + self, compute_unit, backend, nbits, channel_group_size, quantize_first ): - """First palettize to get fp16 lut, and then quantize the lut to make int8 lut.""" + """ + If quantize_first is True: + First quantize to get int8 weight, and then palettize to n-bit lut with int8 entries. + If quantize_first is False: + First palettize to get fp16 lut, and then quantize the lut to make int8 lut. + + Notice no matter applies which one first, the final output model's op order is guaranteed to be consistent + by the common::canonicalize_quantized_lut_pattern graph pass. 
+ """ model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() torchmodel = torch.jit.trace(model, torch_input_values) mlmodel = ct.convert( @@ -1982,10 +1996,17 @@ def test_joint_palettize_quantize_weights( ) ) - mlmodel_palettized = cto.coreml.palettize_weights(mlmodel, palettize_config) - mlmodel_joint_palettized_quantized = cto.coreml.linear_quantize_weights( - mlmodel_palettized, quant_config, joint_compression=True - ) + if quantize_first: + mlmodel_quantized = cto.coreml.linear_quantize_weights(mlmodel, quant_config) + mlmodel_joint_palettized_quantized = cto.coreml.palettize_weights( + mlmodel_quantized, palettize_config, joint_compression=True + ) + else: + mlmodel_palettized = cto.coreml.palettize_weights(mlmodel, palettize_config) + mlmodel_joint_palettized_quantized = cto.coreml.linear_quantize_weights( + mlmodel_palettized, quant_config, joint_compression=True + ) + expected_ops = ( ["constexpr_blockwise_shift_scale", "constexpr_lut_to_dense", "conv"] * 2 + ["reshape"] @@ -1995,13 +2016,13 @@ def test_joint_palettize_quantize_weights( ) prog = mlmodel_joint_palettized_quantized._mil_program if channel_group_size == 0: - # When use per-tensor lut, the lut size is too small, so it's stored as ImmediateValue + # When doing lut first with per-tensor lut, the lut size is too small, so it's stored as ImmediateValue # which won't be quantized. ops_in_prog = get_op_types_in_program(prog) - if nbits >= 4: - assert ops_in_prog.count("constexpr_blockwise_shift_scale") >= 6 - else: + if nbits < 4 and not quantize_first: assert ops_in_prog.count("constexpr_blockwise_shift_scale") == 0 + else: + assert ops_in_prog.count("constexpr_blockwise_shift_scale") >= 6 else: assert get_op_types_in_program(prog) == expected_ops @@ -2069,81 +2090,6 @@ def test_joint_palettize_quantize_weights_invalid(self, compute_unit, backend): mlmodel_palettized, quant_config, joint_compression=True ) - @pytest.mark.parametrize( - "compute_unit, backend, nbits, channel_group_size", - itertools.product( - compute_units, - backends, - (3, 4, 8), - (0, 1, 2), - ), - ) - def test_joint_quantize_palettize_weights( - self, compute_unit, backend, nbits, channel_group_size - ): - """First quantize to get int8 weight, and then palettize to n-bit lut with int8 entries.""" - model, inputs, torch_input_values, coreml_input_values = get_test_model_and_data_complex() - torchmodel = torch.jit.trace(model, torch_input_values) - mlmodel = ct.convert( - torchmodel, - inputs=inputs, - convert_to="mlprogram", - minimum_deployment_target=backend.opset_version, - compute_precision=ct.precision.FLOAT16 - if backend.precision == "fp16" - else ct.precision.FLOAT32, - compute_units=compute_unit, - ) - - quant_config = cto.coreml.OptimizationConfig( - global_config=cto.coreml.OpLinearQuantizerConfig( - mode="linear", - dtype="int8", - granularity="per_tensor", - weight_threshold=500, - ) - ) - palettize_config = cto.coreml.OptimizationConfig( - global_config=cto.coreml.OpPalettizerConfig( - mode="uniform", - nbits=nbits, - granularity="per_grouped_channel", - group_size=channel_group_size, - weight_threshold=500, - ) - ) - - mlmodel_quantized = cto.coreml.linear_quantize_weights(mlmodel, quant_config) - mlmodel_joint_quantized_palettized = cto.coreml.palettize_weights( - mlmodel_quantized, palettize_config, joint_compression=True - ) - expected_ops = ( - ["constexpr_lut_to_dense", "constexpr_blockwise_shift_scale", "conv"] * 2 - + ["reshape"] - + ["constexpr_lut_to_dense", "constexpr_blockwise_shift_scale", 
"linear"] * 2 - + ["constexpr_lut_to_dense", "constexpr_blockwise_shift_scale"] * 3 - + ["lstm", "expand_dims", "expand_dims"] - ) - prog = mlmodel_joint_quantized_palettized._mil_program - assert get_op_types_in_program(prog) == expected_ops - - for linear_op in prog.find_ops(op_type="linear"): - assert linear_op.weight.op.op_type == "constexpr_blockwise_shift_scale" - for conv_op in prog.find_ops(op_type="conv"): - assert conv_op.weight.op.op_type == "constexpr_blockwise_shift_scale" - - for palettize_op in prog.find_ops(op_type="constexpr_lut_to_dense"): - assert palettize_op.lut.dtype == types.int8 - assert palettize_op.indices.dtype == types.string_to_builtin(f"uint{nbits}") - assert palettize_op.outputs[0].child_ops[0].op_type == "constexpr_blockwise_shift_scale" - for quantize_op in prog.find_ops(op_type="constexpr_blockwise_shift_scale"): - assert quantize_op.data.dtype == types.int8 - assert quantize_op.scale.dtype == types.fp16 - assert quantize_op.offset.dtype == types.int8 - - if _macos_version() >= (15, 0): - verify_model_outputs(mlmodel, mlmodel_joint_quantized_palettized, coreml_input_values) - @pytest.mark.xfail( reason="rdar://131511244 Investigate Why Joint Prune x Anything are Failing on BNNS" ) diff --git a/coremltools/test/optimize/torch/quantization/test_configure.py b/coremltools/test/optimize/torch/quantization/test_configure.py index 024c89d53..1807dc978 100644 --- a/coremltools/test/optimize/torch/quantization/test_configure.py +++ b/coremltools/test/optimize/torch/quantization/test_configure.py @@ -999,7 +999,10 @@ def test_embedding_layer_quantization(activation_dtype): @pytest.mark.parametrize("config", get_configs_for_qscheme()) @pytest.mark.parametrize("activation_fn", list(_mod_activations) + [nn.ReLU]) -@pytest.mark.parametrize("elementwise_op", [operator.add, torch.add, operator.mul, torch.mul]) +@pytest.mark.parametrize( + "elementwise_op", + [operator.add, torch.add, operator.mul, torch.mul, torch.matmul, torch.einsum], +) @pytest.mark.parametrize("conv_transpose", [False, True]) def test_elementwise_op_act_fusion(config, activation_fn, elementwise_op, conv_transpose): class ElementWiseActModule(torch.nn.Module): @@ -1012,6 +1015,11 @@ def __init__(self, conv_transpose): self.act = activation_fn() def forward(self, x): + if elementwise_op == torch.einsum: + return self.act( + elementwise_op("bkhq,bchk->bchq", x.transpose(1, 3), self.conv1(x)) + ) + return self.act(elementwise_op(x, self.conv1(x))) model = ElementWiseActModule(conv_transpose) diff --git a/coremltools/test/optimize/torch/quantization/test_quantizer.py b/coremltools/test/optimize/torch/quantization/test_quantizer.py index ef67d23a7..b9dcb645c 100644 --- a/coremltools/test/optimize/torch/quantization/test_quantizer.py +++ b/coremltools/test/optimize/torch/quantization/test_quantizer.py @@ -418,7 +418,13 @@ def test_linear_quantizer_report( print("\nREPORT\n" + str(report)) -@pytest.mark.parametrize("dtype", ["qint4", "qint8"]) +@pytest.mark.parametrize( + "dtype", + [ + pytest.param("qint4", marks=pytest.mark.xfail(reason="rdar://134169158")), + "qint8", + ], +) @pytest.mark.parametrize("scheme", ["symmetric", "affine"]) @pytest.mark.parametrize("conv_transpose", [False, True]) def test_compression_metadata(dtype, scheme, conv_transpose): @@ -432,6 +438,7 @@ def test_compression_metadata(dtype, scheme, conv_transpose): "conv1", (nn.Conv2d(1, 20, 3) if not conv_transpose else nn.ConvTranspose2d(1, 20, 3)), ), + ("relu", nn.ReLU()), ("fc1", nn.Linear(20, 100)), ] ) @@ -444,7 +451,7 @@ def 
test_compression_metadata(dtype, scheme, conv_transpose): "quantization_scheme": scheme, }, "fc1": None, - } + }, } ) quantizer = LinearQuantizer(model, config) @@ -457,7 +464,7 @@ def test_compression_metadata(dtype, scheme, conv_transpose): assert "_COREML_/metadata_version" in model.state_dict() # Verify compression metadata is added for conv1 - metadata_dict = CompressionMetadata.from_state_dict(model.conv1.state_dict()) + metadata_dict = CompressionMetadata.from_state_dict(model.conv1[0].state_dict()) assert len(metadata_dict) == 1 assert "weight" in metadata_dict @@ -470,6 +477,6 @@ def test_compression_metadata(dtype, scheme, conv_transpose): if scheme == "symmetric": assert torch.all(metadata.zero_point == 0) - # # Verify no compression metadata is added for fc1 + # Verify no compression metadata is added for fc1 metadata_dict = CompressionMetadata.from_state_dict(model.fc1.state_dict()) assert len(metadata_dict) == 0 diff --git a/coremltools/test/optimize/torch/quantization/test_utils.py b/coremltools/test/optimize/torch/quantization/test_utils.py index 3df9fe42e..45c321d0f 100644 --- a/coremltools/test/optimize/torch/quantization/test_utils.py +++ b/coremltools/test/optimize/torch/quantization/test_utils.py @@ -6,7 +6,7 @@ import pytest import torch -from coremltools.optimize.torch.quantization._utils import get_quant_range +from coremltools.optimize.torch.quantization._utils import get_n_bits_from_range, get_quant_range @pytest.mark.parametrize("n_bits", list(range(2, 8))) @@ -37,3 +37,11 @@ def test_quant_range(dtype, n_bits): else: assert quant_min == signed_expected_values[n_bits][0] assert quant_max == signed_expected_values[n_bits][1] + + +@pytest.mark.parametrize("n_bits", list(range(2, 8))) +@pytest.mark.parametrize("dtype", [torch.quint8, torch.uint8, torch.qint8, torch.int8]) +def test_n_bits_from_range(dtype, n_bits): + quant_min, quant_max = get_quant_range(n_bits, dtype) + output_n_bits = get_n_bits_from_range(quant_min, quant_max) + assert output_n_bits == n_bits diff --git a/coremltools/version.py b/coremltools/version.py index 8ef6b5a8a..9896fb832 100644 --- a/coremltools/version.py +++ b/coremltools/version.py @@ -4,4 +4,4 @@ # found in the LICENSE.txt file or at https://opensource.org/licenses/BSD-3-Clause -__version__ = "8.0b2" # VERSION_STRING +__version__ = "8.0" # VERSION_STRING diff --git a/docs-guides/source/flexible-inputs.md b/docs-guides/source/flexible-inputs.md index 55eacacf5..3412ee77c 100644 --- a/docs-guides/source/flexible-inputs.md +++ b/docs-guides/source/flexible-inputs.md @@ -162,6 +162,18 @@ You can open the saved ML package in Xcode and click the **Predictions** tab to ![Range shape](images/range_shape.png) +## Reshape Frequency Optimization Hint + +Setting the Reshape Frequency Optimization Hint to `Frequent` can allow flexible shaped models to run on the Neural Engine. 
This option can be set when loading your model:
+
+```python
+model = ct.models.MLModel(
+    'path/to/the/saved/model.mlmodel',
+    optimization_hints={ 'reshapeFrequency': ct.ReshapeFrequency.Frequent }
+)
+```
+
+
 ## Enable Unbounded Ranges
 
 ```{warning}
diff --git a/docs-guides/source/model-prediction.md b/docs-guides/source/model-prediction.md
index fdc2606b6..0127beadf 100644
--- a/docs-guides/source/model-prediction.md
+++ b/docs-guides/source/model-prediction.md
@@ -67,6 +67,17 @@ In previous versions of coremltools, you would restrict execution to the CPU by
 
 For more information and values for this parameter, see [Set the Compute Units](load-and-convert-model.md#set-the-compute-units).
 
+## Fast Predictions
+
+A model can be loaded using the Fast Prediction Optimization Hint. This prioritizes prediction latency at the potential cost of longer specialization time, a larger memory footprint, and increased disk space usage.
+
+```python
+model = ct.models.MLModel(
+    'path/to/the/saved/model.mlmodel',
+    optimization_hints={ 'specializationStrategy': ct.SpecializationStrategy.FastPrediction }
+)
+```
+
 ## Multi-array Prediction
 
 A model that takes a `MultiArray` input requires a NumPy array as an input with the `predict()` call. For example:
diff --git a/reqs/test.pip b/reqs/test.pip
index 4d2b15a56..43b3ebb1e 100644
--- a/reqs/test.pip
+++ b/reqs/test.pip
@@ -31,15 +31,16 @@ gast==0.4.0
 
 # torch 2.3 dropped support for x86 macOS
 torch==2.2.0; platform_machine != "arm64"
-torch==2.3.0; platform_machine == "arm64"
-executorch==0.2.0; platform_machine == "arm64" and python_version >= '3.10' and python_version <= '3.11'
+torch==2.4.0; platform_machine == "arm64"
+executorch==0.3.0; platform_machine == "arm64" and python_version >= '3.10' and python_version <= '3.11'
 torchaudio==2.2.0; platform_machine != "arm64"
-torchaudio==2.3.0; platform_machine == "arm64"
+torchaudio==2.4.0; platform_machine == "arm64"
 torchvision==0.17.0; platform_machine != "arm64"
-torchvision==0.18.0; platform_machine == "arm64"
+torchvision==0.19.0; platform_machine == "arm64"
+torchao==0.4.0; platform_machine == "arm64" and python_version == '3.10'
 
-torchsr==1.0.4; platform_machine == "arm64" and python_version >= '3.10' and python_version <= '3.11'
-timm==0.6.13; platform_machine == "arm64" and python_version >= '3.10' and python_version <= '3.11'
+torchsr==1.0.4; platform_machine == "arm64"
+timm==0.6.13; platform_machine == "arm64"
 xgboost==1.4.2; platform_machine != "arm64"
 mock
 wrapt
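As a companion to the documentation updates above, the following sketch shows that the same `optimization_hints` dictionary is also accepted when loading an already-compiled model through `CompiledMLModel`, mirroring the tests added in this patch; the path is a placeholder and combining both hints is done purely for illustration.

```python
import coremltools as ct

# Placeholder path to a compiled model directory.
compiled_model_path = "path/to/the/compiled/model.mlmodelc"

model = ct.models.CompiledMLModel(
    compiled_model_path,
    compute_units=ct.ComputeUnit.ALL,
    optimization_hints={
        "reshapeFrequency": ct.ReshapeFrequency.Infrequent,
        "specializationStrategy": ct.SpecializationStrategy.FastPrediction,
    },
)
```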