From af41dad6e8e7edd3c11e9b4dec268fba514685b2 Mon Sep 17 00:00:00 2001 From: Jing Xu Date: Wed, 15 Jan 2025 17:08:40 +0900 Subject: [PATCH] content update for 2.5.10 (#3447) --- .../_sources/tutorials/features/DDP.md.txt | 4 +- .../features/torch_compile_gpu.md.txt | 8 ++ xpu/2.5.10+xpu/_static/pygments.css | 36 +++--- .../design_doc/cpu/isa_dyndisp.html | 4 +- xpu/2.5.10+xpu/genindex.html | 4 +- xpu/2.5.10+xpu/index.html | 4 +- xpu/2.5.10+xpu/search.html | 4 +- xpu/2.5.10+xpu/searchindex.js | 2 +- xpu/2.5.10+xpu/tutorials/api_doc.html | 12 +- xpu/2.5.10+xpu/tutorials/contribution.html | 4 +- xpu/2.5.10+xpu/tutorials/examples.html | 112 +++++++++--------- xpu/2.5.10+xpu/tutorials/features.html | 4 +- xpu/2.5.10+xpu/tutorials/features/DDP.html | 36 +++--- xpu/2.5.10+xpu/tutorials/features/DLPack.html | 12 +- .../tutorials/features/DPC++_Extension.html | 24 ++-- xpu/2.5.10+xpu/tutorials/features/FSDP.html | 68 +++++------ .../features/advanced_configuration.html | 4 +- .../tutorials/features/amp_gpu.html | 10 +- .../features/auto_channels_last.html | 4 +- .../tutorials/features/compute_engine.html | 4 +- .../tutorials/features/deepspeed_kernels.html | 4 +- xpu/2.5.10+xpu/tutorials/features/float8.html | 6 +- .../tutorials/features/horovod.html | 16 +-- .../tutorials/features/int8_overview_xpu.html | 14 +-- .../tutorials/features/ipex_log.html | 4 +- xpu/2.5.10+xpu/tutorials/features/nhwc.html | 26 ++-- .../tutorials/features/profiler_kineto.html | 4 +- .../tutorials/features/profiler_legacy.html | 4 +- .../tutorials/features/torch_compile_gpu.html | 26 ++-- xpu/2.5.10+xpu/tutorials/getting_started.html | 8 +- xpu/2.5.10+xpu/tutorials/introduction.html | 4 +- xpu/2.5.10+xpu/tutorials/known_issues.html | 4 +- xpu/2.5.10+xpu/tutorials/license.html | 4 +- xpu/2.5.10+xpu/tutorials/llm.html | 4 +- .../llm/int4_weight_only_quantization.html | 12 +- .../llm/llm_optimize_transformers.html | 20 ++-- xpu/2.5.10+xpu/tutorials/performance.html | 4 +- 
xpu/2.5.10+xpu/tutorials/releases.html | 14 +-- .../tutorials/technical_details.html | 4 +- .../tutorials/technical_details/AOT.html | 4 +- .../technical_details/ipex_optimize.html | 4 +- .../technical_details/memory_management.html | 4 +- .../optimizer_fusion_gpu.html | 4 +- 43 files changed, 286 insertions(+), 272 deletions(-) diff --git a/xpu/2.5.10+xpu/_sources/tutorials/features/DDP.md.txt b/xpu/2.5.10+xpu/_sources/tutorials/features/DDP.md.txt index ea3617443..c8361867f 100644 --- a/xpu/2.5.10+xpu/_sources/tutorials/features/DDP.md.txt +++ b/xpu/2.5.10+xpu/_sources/tutorials/features/DDP.md.txt @@ -25,9 +25,9 @@ For more detailed information, check [Installation Guide](https://intel.github.i ``` # Generic Python* for CPU -REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/cpu/us/ +REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/ # Generic Python* for GPU -REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/ ``` Installation from either repository shares the command below. Replace the place holder `` with a real URL mentioned above. diff --git a/xpu/2.5.10+xpu/_sources/tutorials/features/torch_compile_gpu.md.txt b/xpu/2.5.10+xpu/_sources/tutorials/features/torch_compile_gpu.md.txt index ed4a4d9e0..8f5314df4 100644 --- a/xpu/2.5.10+xpu/_sources/tutorials/features/torch_compile_gpu.md.txt +++ b/xpu/2.5.10+xpu/_sources/tutorials/features/torch_compile_gpu.md.txt @@ -24,6 +24,14 @@ Triton could be directly installed using the following command: pip install --pre pytorch-triton-xpu==3.1.0+91b14bf559 --index-url https://download.pytorch.org/whl/nightly/xpu ``` +The cached files would be generated if you had run `torch.compile` with a previous version of triton, but they are generally conflicting with the new version. 
+So, if the folder `~/.triton` exists before your first running of the `torch.compile` script in the current environment, please delete it. + +```bash +# delete the cache files generated by previous version of triton, if exists +rm -rf ~/.triton +``` + Remember to activate the oneAPI DPC++/C++ Compiler by following commands. ```bash diff --git a/xpu/2.5.10+xpu/_static/pygments.css b/xpu/2.5.10+xpu/_static/pygments.css index 84ab3030a..6f8b210a1 100644 --- a/xpu/2.5.10+xpu/_static/pygments.css +++ b/xpu/2.5.10+xpu/_static/pygments.css @@ -6,9 +6,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: .highlight .hll { background-color: #ffffcc } .highlight { background: #f8f8f8; } .highlight .c { color: #3D7B7B; font-style: italic } /* Comment */ -.highlight .err { border: 1px solid #FF0000 } /* Error */ +.highlight .err { border: 1px solid #F00 } /* Error */ .highlight .k { color: #008000; font-weight: bold } /* Keyword */ -.highlight .o { color: #666666 } /* Operator */ +.highlight .o { color: #666 } /* Operator */ .highlight .ch { color: #3D7B7B; font-style: italic } /* Comment.Hashbang */ .highlight .cm { color: #3D7B7B; font-style: italic } /* Comment.Multiline */ .highlight .cp { color: #9C6500 } /* Comment.Preproc */ @@ -25,34 +25,34 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: .highlight .gp { color: #000080; font-weight: bold } /* Generic.Prompt */ .highlight .gs { font-weight: bold } /* Generic.Strong */ .highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ -.highlight .gt { color: #0044DD } /* Generic.Traceback */ +.highlight .gt { color: #04D } /* Generic.Traceback */ .highlight .kc { color: #008000; font-weight: bold } /* Keyword.Constant */ .highlight .kd { color: #008000; font-weight: bold } /* Keyword.Declaration */ .highlight .kn { color: #008000; font-weight: bold } /* Keyword.Namespace */ .highlight .kp { color: #008000 } /* Keyword.Pseudo */ .highlight .kr 
{ color: #008000; font-weight: bold } /* Keyword.Reserved */ .highlight .kt { color: #B00040 } /* Keyword.Type */ -.highlight .m { color: #666666 } /* Literal.Number */ +.highlight .m { color: #666 } /* Literal.Number */ .highlight .s { color: #BA2121 } /* Literal.String */ .highlight .na { color: #687822 } /* Name.Attribute */ .highlight .nb { color: #008000 } /* Name.Builtin */ -.highlight .nc { color: #0000FF; font-weight: bold } /* Name.Class */ -.highlight .no { color: #880000 } /* Name.Constant */ -.highlight .nd { color: #AA22FF } /* Name.Decorator */ +.highlight .nc { color: #00F; font-weight: bold } /* Name.Class */ +.highlight .no { color: #800 } /* Name.Constant */ +.highlight .nd { color: #A2F } /* Name.Decorator */ .highlight .ni { color: #717171; font-weight: bold } /* Name.Entity */ .highlight .ne { color: #CB3F38; font-weight: bold } /* Name.Exception */ -.highlight .nf { color: #0000FF } /* Name.Function */ +.highlight .nf { color: #00F } /* Name.Function */ .highlight .nl { color: #767600 } /* Name.Label */ -.highlight .nn { color: #0000FF; font-weight: bold } /* Name.Namespace */ +.highlight .nn { color: #00F; font-weight: bold } /* Name.Namespace */ .highlight .nt { color: #008000; font-weight: bold } /* Name.Tag */ .highlight .nv { color: #19177C } /* Name.Variable */ -.highlight .ow { color: #AA22FF; font-weight: bold } /* Operator.Word */ -.highlight .w { color: #bbbbbb } /* Text.Whitespace */ -.highlight .mb { color: #666666 } /* Literal.Number.Bin */ -.highlight .mf { color: #666666 } /* Literal.Number.Float */ -.highlight .mh { color: #666666 } /* Literal.Number.Hex */ -.highlight .mi { color: #666666 } /* Literal.Number.Integer */ -.highlight .mo { color: #666666 } /* Literal.Number.Oct */ +.highlight .ow { color: #A2F; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #BBB } /* Text.Whitespace */ +.highlight .mb { color: #666 } /* Literal.Number.Bin */ +.highlight .mf { color: #666 } /* Literal.Number.Float */ +.highlight 
.mh { color: #666 } /* Literal.Number.Hex */ +.highlight .mi { color: #666 } /* Literal.Number.Integer */ +.highlight .mo { color: #666 } /* Literal.Number.Oct */ .highlight .sa { color: #BA2121 } /* Literal.String.Affix */ .highlight .sb { color: #BA2121 } /* Literal.String.Backtick */ .highlight .sc { color: #BA2121 } /* Literal.String.Char */ @@ -67,9 +67,9 @@ span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: .highlight .s1 { color: #BA2121 } /* Literal.String.Single */ .highlight .ss { color: #19177C } /* Literal.String.Symbol */ .highlight .bp { color: #008000 } /* Name.Builtin.Pseudo */ -.highlight .fm { color: #0000FF } /* Name.Function.Magic */ +.highlight .fm { color: #00F } /* Name.Function.Magic */ .highlight .vc { color: #19177C } /* Name.Variable.Class */ .highlight .vg { color: #19177C } /* Name.Variable.Global */ .highlight .vi { color: #19177C } /* Name.Variable.Instance */ .highlight .vm { color: #19177C } /* Name.Variable.Magic */ -.highlight .il { color: #666666 } /* Literal.Number.Integer.Long */ \ No newline at end of file +.highlight .il { color: #666 } /* Literal.Number.Integer.Long */ \ No newline at end of file diff --git a/xpu/2.5.10+xpu/design_doc/cpu/isa_dyndisp.html b/xpu/2.5.10+xpu/design_doc/cpu/isa_dyndisp.html index 8afaf860f..f56986bff 100644 --- a/xpu/2.5.10+xpu/design_doc/cpu/isa_dyndisp.html +++ b/xpu/2.5.10+xpu/design_doc/cpu/isa_dyndisp.html @@ -7,7 +7,7 @@ Intel® Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -673,7 +673,7 @@

CPU feature checkSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/genindex.html b/xpu/2.5.10+xpu/genindex.html index d8fa701a1..983f97429 100644 --- a/xpu/2.5.10+xpu/genindex.html +++ b/xpu/2.5.10+xpu/genindex.html @@ -6,7 +6,7 @@ Index — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -235,7 +235,7 @@

T

Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/index.html b/xpu/2.5.10+xpu/index.html index 3a514cd98..671b850b7 100644 --- a/xpu/2.5.10+xpu/index.html +++ b/xpu/2.5.10+xpu/index.html @@ -9,7 +9,7 @@ Intel® Extension for PyTorch* — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -180,7 +180,7 @@

Support Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/search.html b/xpu/2.5.10+xpu/search.html index 102bceff1..3355db7f7 100644 --- a/xpu/2.5.10+xpu/search.html +++ b/xpu/2.5.10+xpu/search.html @@ -6,7 +6,7 @@ Search — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -138,7 +138,7 @@ Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/searchindex.js b/xpu/2.5.10+xpu/searchindex.js index 19d475a44..a8ab45501 100644 --- a/xpu/2.5.10+xpu/searchindex.js +++ b/xpu/2.5.10+xpu/searchindex.js @@ -1 +1 @@ -Search.setIndex({"alltitles": {"1.10.200+gpu": [[31, "gpu"]], "1.13.10+xpu": [[31, "id23"]], "1.13.120+xpu": [[31, "id20"]], "2.0.110+xpu": [[31, "id17"]], "2.1.10+xpu": [[31, "id14"]], "2.1.20+xpu": [[31, "id11"]], "2.1.30+xpu": [[31, "id8"]], "2.1.40+xpu": [[31, "id5"]], "2.3.110+xpu": [[31, "id1"]], "2.5.10+xpu": [[31, "xpu"]], "API Documentation": [[2, null], [24, "api-documentation"]], "Add Custom Kernel": [[0, "add-custom-kernel"]], "Advanced Configuration": [[5, "advanced-configuration"], [10, null]], "Ahead of Time (AOT) Compilation": [[33, null]], "Ahead of Time Compilation (AOT) [GPU]": [[32, "ahead-of-time-compilation-aot-gpu"]], "Architecture": [[1, "architecture"]], "Asynchronous Programming": [[7, "asynchronous-programming"]], "Auto Channels Last": [[12, null]], "Auto Mixed Precision (AMP)": [[5, "auto-mixed-precision-amp"]], "Auto Mixed Precision (AMP) on GPU": [[11, null]], "Autocast Op Reference": [[11, "autocast-op-reference"]], "Automatic Channels Last": [[34, "automatic-channels-last"]], "BERT": [[4, "bert"], [4, "id3"], [4, "id7"], [4, "id10"], [4, "id13"], [4, "id16"]], "BFloat16": [[4, "bfloat16"], [4, "id4"]], "Basic Usage": [[4, "basic-usage"]], "Better local unit tests with pytest": [[3, "better-local-unit-tests-with-pytest"]], "Breaking Changes": [[31, "breaking-changes"], [31, "id3"]], "Build Time Configuration": [[10, "build-time-configuration"]], "Building documentation": [[3, "building-documentation"]], "Building with CMake": [[8, "building-with-cmake"]], "Building with setuptools": [[8, "building-with-setuptools"]], "C++": [[4, "c"]], "C++ API": [[2, "c-api"]], "CPU ISA build compiler requirement": [[0, "cpu-isa-build-compiler-requirement"]], "CPU feature check": [[0, "cpu-feature-check"]], "Channels Last": [[5, "channels-last"], [19, null]], 
"Channels Last 1D support on XPU": [[19, "channels-last-1d-support-on-xpu"]], "Channels Last Memory Format APIs": [[19, "channels-last-memory-format-apis"]], "Code Folder Struct": [[0, "code-folder-struct"]], "CodeGen Process": [[0, "codegen-process"]], "Compute Engine (Experimental feature for debug)": [[13, null]], "Compute Engine (Prototype feature for debug)": [[5, "compute-engine-prototype-feature-for-debug"]], "Configuration": [[30, "configuration"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[3, "contributing-to-intel-extension-for-pytorch"]], "Contribution": [[3, null]], "Customize DPC++ kernels": [[4, "customize-dpc-kernels"]], "DDP Usage": [[6, "ddp-usage"]], "DDP scaling API (GPU Only)": [[6, "ddp-scaling-api-gpu-only"]], "DLDevice and data pointer": [[7, "dldevice-and-data-pointer"]], "DLPack Solution": [[5, "dlpack-solution"], [7, null]], "DPC++ Extension": [[5, "dpc-extension"], [8, null]], "Deep Fusion Policy": [[27, "deep-fusion-policy"]], "Default Precision": [[11, "default-precision"]], "Design": [[7, "design"]], "Developing Intel\u00ae Extension for PyTorch* on XPU": [[3, "developing-intel-extension-for-pytorch-on-xpu"]], "Dispatch Stub implementation: csrc/cpu/dyndisp/DispatchStub.cpp and csrc/cpu/dyndisp/DispatchStub.h": [[0, "dispatch-stub-implementation-csrc-cpu-dyndisp-dispatchstub-cpp-and-csrc-cpu-dyndisp-dispatchstub-h"]], "Distributed Inference": [[27, "distributed-inference"]], "Distributed Inference with DeepSpeed": [[29, "distributed-inference-with-deepspeed"]], "Distributed Training": [[5, "distributed-training"]], "DistributedDataParallel (DDP)": [[6, null]], "Dynamic Dispatch Design": [[0, "dynamic-dispatch-design"]], "Ease-of-use auto channels last API": [[12, "ease-of-use-auto-channels-last-api"]], "Easy-to-use Python API": [[5, "easy-to-use-python-api"]], "Engine Selection Policy": [[13, "engine-selection-policy"]], "Enviornment settings": [[18, "enviornment-settings"]], "Environment Setup": [[28, 
"environment-setup"]], "Event Log": [[18, "event-log"]], "Example": [[9, "example"]], "Example Case": [[7, "example-case"]], "Example Usage": [[22, "example-usage"]], "Example Usage (MPI launch for single node):": [[6, "example-usage-mpi-launch-for-single-node"]], "Example:": [[0, "example"], [0, "id1"]], "Examples": [[4, null]], "Execute WOQ benchmark script": [[28, "execute-woq-benchmark-script"]], "Execution": [[23, "execution"]], "Export DLPack Capsule": [[7, "export-dlpack-capsule"]], "FP16": [[29, "fp16"]], "FP8 Quantization": [[15, "fp8-quantization"]], "FP8 usage example": [[15, "fp8-usage-example"]], "FSDP Usage (GPU only)": [[9, "fsdp-usage-gpu-only"]], "Features": [[5, null]], "Fetching the corresponding sycl::queue": [[8, "fetching-the-corresponding-sycl-queue"]], "Float16": [[4, "float16"]], "Float32": [[4, "float32"], [4, "id1"]], "Float8 Data Type": [[15, "float8-data-type"]], "Float8 Data Type Support (Prototype)": [[15, null]], "Fully Sharded Data Parallel (FSDP)": [[5, "fully-sharded-data-parallel-fsdp"], [9, null]], "General": [[2, "general"]], "General Usage": [[25, "general-usage"]], "Get Started": [[24, "get-started"]], "Hardware Configuration": [[30, "hardware-configuration"]], "Highlights": [[31, "highlights"], [31, "id2"], [31, "id6"], [31, "id9"], [31, "id12"], [31, "id15"], [31, "id18"], [31, "id21"], [31, "id24"], [31, "id26"]], "Horovod with PyTorch (Prototype)": [[16, null]], "Horovod with PyTorch Usage": [[16, "horovod-with-pytorch-usage"]], "INT8": [[4, "int8"]], "IPEX_LOG (Prototype feature for debug)": [[5, "ipex-log-prototype-feature-for-debug"]], "IPEX_LOG (Prototype)": [[18, null]], "IPEX_LOG Definition": [[18, "ipex-log-definition"]], "ISA intrinics specific kernel example:": [[0, "isa-intrinics-specific-kernel-example"]], "Imperative Mode": [[4, "imperative-mode"], [4, "id5"], [4, "id11"], [17, "imperative-mode"]], "Imperative mode": [[29, "imperative-mode"]], "Import DLPack Capsule": [[7, "import-dlpack-capsule"]], 
"Inference": [[4, "inference"]], "Inference with Imperative Path": [[11, "inference-with-imperative-path"]], "Inference with TorchScript Path": [[11, "inference-with-torchscript-path"]], "Inferenece with torch.compile": [[22, "inferenece-with-torch-compile"]], "Install Horovod with PyTorch": [[16, "install-horovod-with-pytorch"]], "Install Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "install-intel-oneccl-bindings-for-pytorch"]], "Install Neural-compressor": [[28, "install-neural-compressor"]], "Install PyTorch and Intel\u00ae Extension for PyTorch*": [[6, "install-pytorch-and-intel-extension-for-pytorch"]], "Install from source": [[6, "install-from-source"]], "Installation of Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "installation-of-intel-oneccl-bindings-for-pytorch"]], "Intel\u00ae AI Reference Models": [[4, "intel-ai-reference-models"]], "Intel\u00ae Extension for PyTorch*": [[1, null]], "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels": [[14, null]], "Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc": [[0, null]], "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]": [[17, null]], "Introduction": [[6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"], [11, "introduction"], [13, "introduction"], [14, "introduction"], [18, "introduction"], [20, "introduction"], [21, "introduction"], [22, "introduction"], [24, null], [28, "introduction"], [33, "introduction"], [36, "introduction"]], "JIT Compiling Extensions": [[8, "jit-compiling-extensions"]], "Kernel Stub: csrc/cpu/aten/xyz.cpp and csrc/cpu/aten/xyz.h": [[0, "kernel-stub-csrc-cpu-aten-xyz-cpp-and-csrc-cpu-aten-xyz-h"]], "Kernel implementation: csrc/cpu/aten/kernels/xyzKrnl.cpp": [[0, "kernel-implementation-csrc-cpu-aten-kernels-xyzkrnl-cpp"]], "Kineto Profiler": [[20, null]], "Kineto Supported Profiler Tool (Prototype)": [[5, "kineto-supported-profiler-tool-prototype"]], "Known Issues": [[31, "known-issues"], [31, "id4"], 
[31, "id7"], [31, "id10"], [31, "id13"], [31, "id16"], [31, "id19"], [31, "id22"], [31, "id25"], [31, "id27"]], "Known issue": [[12, "known-issue"]], "LLM Inference": [[27, "llm-inference"]], "LLM Performance v2.1.10": [[30, "llm-performance-v2-1-10"]], "LLM fine-tuning on Intel\u00ae Data Center Max 1550 GPU": [[27, "llm-fine-tuning-on-intel-data-center-max-1550-gpu"]], "Large Language Models (LLM) Optimizations Overview": [[27, null]], "Legacy Profiler Tool (Deprecated)": [[21, null]], "Library Dependencies": [[25, "library-dependencies"]], "License": [[26, null]], "Linear Operator Optimization": [[27, "linear-operator-optimization"]], "Log Component": [[18, "log-component"]], "Log Level": [[18, "log-level"]], "Low Precision Data Types": [[27, "low-precision-data-types"]], "Memory Format Is All That Matters": [[19, "memory-format-is-all-that-matters"]], "Memory Management": [[35, null]], "Memory Management [GPU]": [[32, "memory-management-gpu"]], "Memory management": [[2, "memory-management"]], "Motivation and Example": [[8, "motivation-and-example"]], "Multiple Implementations Operators and Engines": [[13, "multiple-implementations-operators-and-engines"]], "Op Eligibility": [[11, "op-eligibility"]], "Op-Specific Behavior": [[11, "op-specific-behavior"]], "Operation Fusion": [[36, "operation-fusion"]], "Ops that can autocast to bfloat16": [[11, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float16": [[11, "ops-that-can-autocast-to-float16"]], "Ops that can autocast to float32": [[11, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[11, "ops-that-promote-to-the-widest-input-type"]], "Optimization Methodologies": [[27, "optimization-methodologies"]], "Optimizer Fusion on GPU": [[36, null]], "Optimizer Optimization [GPU]": [[32, "optimizer-optimization-gpu"]], "Overview": [[0, "overview"], [30, "overview"]], "Performance": [[30, null]], "Performance Data for Intel\u00ae AI Data Center Products": [[30, 
"performance-data-for-intel-ai-data-center-products"]], "Performance Issue": [[25, "performance-issue"]], "Platforms": [[27, "platforms"]], "Private Debug APIs": [[0, "private-debug-apis"]], "Pseudocode of Common Usage Scenarios": [[29, "pseudocode-of-common-usage-scenarios"]], "PyTorch Strided Layout": [[19, "pytorch-strided-layout"]], "Python": [[4, "python"]], "Quantization": [[2, "quantization"], [5, "quantization"]], "Quantize Model and Inference": [[28, "quantize-model-and-inference"]], "Quick Start": [[23, null]], "References": [[28, "references"]], "Releases": [[31, null]], "Replace IPEX_SIMPLE_TRACE": [[18, "replace-ipex-simple-trace"]], "Replace IPEX_VERBOSE": [[18, "replace-ipex-verbose"]], "Requesting the current c10::xpu::XPUStream": [[8, "requesting-the-current-c10-xpu-xpustream"]], "Required Dependencies": [[22, "required-dependencies"]], "Requirement": [[33, "requirement"]], "Resnet50": [[4, "resnet50"], [4, "id2"], [4, "id6"], [4, "id9"], [4, "id12"], [4, "id15"]], "Run Weight-Only Quantization LLM on Intel\u00ae GPU": [[28, "run-weight-only-quantization-llm-on-intel-gpu"]], "Runtime Configuration": [[10, "runtime-configuration"]], "Runtime Dynamic Linking": [[6, "runtime-dynamic-linking"]], "Save and Load Quantized Model (Optional)": [[28, "save-and-load-quantized-model-optional"]], "Segment KV Cache": [[27, "segment-kv-cache"]], "Select ISA level manually.": [[0, "select-isa-level-manually"]], "Simple Log": [[18, "simple-log"]], "Single-Instance Training": [[4, "single-instance-training"]], "SmoothQuant": [[29, "smoothquant"]], "Software Version": [[30, "software-version"]], "Support": [[1, "support"]], "Supported Framework Model Matrix": [[28, "supported-framework-model-matrix"]], "Supported Platform": [[14, "supported-platform"]], "Supported operators": [[15, "supported-operators"]], "Supported running mode": [[15, "supported-running-mode"]], "Technical Details": [[32, null]], "Tips": [[3, "tips"]], "Tips and Debugging": [[3, 
"tips-and-debugging"]], "TorchScript Mode": [[4, "torchscript-mode"], [4, "id8"], [4, "id14"], [17, "torchscript-mode"], [29, "torchscript-mode"]], "Training": [[4, "training"]], "Training Support": [[11, "training-support"]], "Training with torch.compile": [[22, "training-with-torch-compile"]], "Transformers Optimization Frontend API": [[29, null]], "Troubleshooting": [[22, "troubleshooting"], [25, null]], "Unit testing": [[3, "unit-testing"]], "Usage in C++": [[18, "usage-in-c"]], "Usage in python": [[18, "usage-in-python"]], "Usage of DDP scaling API": [[6, "usage-of-ddp-scaling-api"]], "Usage of running Weight-Only Quantization LLM For Intel\u00ae GPU": [[28, "usage-of-running-weight-only-quantization-llm-for-intel-gpu"]], "Use Case": [[7, "use-case"], [11, "use-case"], [13, "use-case"]], "Use SYCL code": [[4, "use-sycl-code"]], "Use case": [[33, "use-case"]], "Using accessors": [[8, "using-accessors"]], "Validated Models List": [[27, "validated-models-list"]], "Vec specific kernel example:": [[0, "vec-specific-kernel-example"]], "Weight Only Quantization INT4": [[27, "weight-only-quantization-int4"]], "Weight-Only Quantization (Prototype)": [[28, null]], "Weight-Only Quantization Initialization": [[28, "weight-only-quantization-initialization"]], "Weight-Only Quantization LLM features in Intel\u00ae Extension for PyTorch*": [[28, "weight-only-quantization-llm-features-in-intel-extension-for-pytorch"]], "Weight-Only Quantization Linear Dispatch": [[28, "weight-only-quantization-linear-dispatch"]], "Weight-Only Quantization Runtime": [[28, "weight-only-quantization-runtime"]], "What is Channels Last": [[19, "what-is-channels-last"]], "Writing Channels Last Kernels on CPU": [[19, "writing-channels-last-kernels-on-cpu"]], "Writing a DPC++ Extension": [[8, "writing-a-dpc-extension"]], "Writing documentation": [[3, "writing-documentation"]], "Writing the DPC++ Op": [[8, "writing-the-dpc-op"]], "[Recommended] Install from prebuilt wheels": [[6, 
"recommended-install-from-prebuilt-wheels"]], "a. Create NHWC Memory": [[19, "a-create-nhwc-memory"]], "a. NCHW (default)": [[19, "a-nchw-default"]], "a. Register Channels Last Kernel in ATen Native Manner": [[19, "a-register-channels-last-kernel-in-aten-native-manner"]], "a. tensor conversion with Channels Last 1D": [[19, "a-tensor-conversion-with-channels-last-1d"]], "a. tensor creation": [[19, "a-tensor-creation"]], "b. Create Convolution Primitive": [[19, "b-create-convolution-primitive"]], "b. NHWC": [[19, "b-nhwc"]], "b. Register oneDNN Kernel on Channels Last": [[19, "b-register-onednn-kernel-on-channels-last"]], "b. model conversion with Channels Last 1D": [[19, "b-model-conversion-with-channels-last-1d"]], "b. tensor conversion": [[19, "b-tensor-conversion"]], "c. Blocked (nChw16c, on CPU)": [[19, "c-blocked-nchw16c-on-cpu"]], "c. determine if in Channels Last 1D memory format": [[19, "c-determine-if-in-channels-last-1d-memory-format"]], "c. model conversion": [[19, "c-model-conversion"]], "conv_bn_folding": [[34, "conv-bn-folding"]], "d. 
operator coverage in PyTorch": [[19, "d-operator-coverage-in-pytorch"]], "default": [[12, "default"]], "disable": [[12, "disable"]], "enable": [[12, "enable"]], "fuse_update_step": [[34, "fuse-update-step"]], "ipex.optimize Frontend API": [[34, null]], "ipex.optimize [GPU]": [[32, "ipex-optimize-gpu"]], "linear_bn_folding": [[34, "linear-bn-folding"]], "oneDNN NHWC APIs": [[19, "onednn-nhwc-apis"]], "replace_dropout_with_identity": [[34, "replace-dropout-with-identity"]], "split_master_weight_for_bf16": [[34, "split-master-weight-for-bf16"]], "torch.compile for GPU (Beta)": [[5, "torch-compile-for-gpu-beta"], [22, null]], "torch.xpu.optimize": [[4, "torch-xpu-optimize"]]}, "docnames": ["design_doc/cpu/isa_dyndisp", "index", "tutorials/api_doc", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/DDP", "tutorials/features/DLPack", "tutorials/features/DPC++_Extension", "tutorials/features/FSDP", "tutorials/features/advanced_configuration", "tutorials/features/amp_gpu", "tutorials/features/auto_channels_last", "tutorials/features/compute_engine", "tutorials/features/deepspeed_kernels", "tutorials/features/float8", "tutorials/features/horovod", "tutorials/features/int8_overview_xpu", "tutorials/features/ipex_log", "tutorials/features/nhwc", "tutorials/features/profiler_kineto", "tutorials/features/profiler_legacy", "tutorials/features/torch_compile_gpu", "tutorials/getting_started", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/int4_weight_only_quantization", "tutorials/llm/llm_optimize_transformers", "tutorials/performance", "tutorials/releases", "tutorials/technical_details", "tutorials/technical_details/AOT", "tutorials/technical_details/ipex_optimize", "tutorials/technical_details/memory_management", "tutorials/technical_details/optimizer_fusion_gpu"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, 
"sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["design_doc/cpu/isa_dyndisp.md", "index.rst", "tutorials/api_doc.rst", "tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/DDP.md", "tutorials/features/DLPack.md", "tutorials/features/DPC++_Extension.md", "tutorials/features/FSDP.md", "tutorials/features/advanced_configuration.md", "tutorials/features/amp_gpu.md", "tutorials/features/auto_channels_last.md", "tutorials/features/compute_engine.md", "tutorials/features/deepspeed_kernels.md", "tutorials/features/float8.md", "tutorials/features/horovod.md", "tutorials/features/int8_overview_xpu.md", "tutorials/features/ipex_log.md", "tutorials/features/nhwc.md", "tutorials/features/profiler_kineto.md", "tutorials/features/profiler_legacy.md", "tutorials/features/torch_compile_gpu.md", "tutorials/getting_started.md", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/int4_weight_only_quantization.md", "tutorials/llm/llm_optimize_transformers.md", "tutorials/performance.md", "tutorials/releases.md", "tutorials/technical_details.rst", "tutorials/technical_details/AOT.md", "tutorials/technical_details/ipex_optimize.md", "tutorials/technical_details/memory_management.rst", "tutorials/technical_details/optimizer_fusion_gpu.md"], "indexentries": {"empty_cache() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.empty_cache", false]], "fp8_autocast() (in module intel_extension_for_pytorch.quantization.fp8)": [[2, "intel_extension_for_pytorch.quantization.fp8.fp8_autocast", false]], "get_fp32_math_mode() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.get_fp32_math_mode", false]], "max_memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[2, 
"intel_extension_for_pytorch.xpu.max_memory_allocated", false]], "max_memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.max_memory_reserved", false]], "mem_get_info() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.mem_get_info", false]], "memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_allocated", false]], "memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_reserved", false]], "memory_snapshot() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_snapshot", false]], "memory_stats() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_stats", false]], "memory_stats_as_nested_dict() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_stats_as_nested_dict", false]], "memory_summary() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_summary", false]], "optimize() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.optimize", false]], "optimize() (in module intel_extension_for_pytorch.llm)": [[2, "intel_extension_for_pytorch.llm.optimize", false]], "reset_accumulated_memory_stats() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.reset_accumulated_memory_stats", false]], "reset_peak_memory_stats() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.reset_peak_memory_stats", false]], "set_fp32_math_mode() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.set_fp32_math_mode", false]], "torch_ipex::xpu::fp32_math_mode (c++ enum)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODEE", false]], "torch_ipex::xpu::fp32_math_mode::bf32 (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4BF32E", 
false]], "torch_ipex::xpu::fp32_math_mode::fp32 (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4FP32E", false]], "torch_ipex::xpu::fp32_math_mode::fp32_math_mode_max (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", false]], "torch_ipex::xpu::fp32_math_mode::fp32_math_mode_min (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", false]], "torch_ipex::xpu::fp32_math_mode::tf32 (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4TF32E", false]], "torch_ipex::xpu::set_fp32_math_mode (c++ function)": [[2, "_CPPv4N10torch_ipex3xpu18set_fp32_math_modeE14FP32_MATH_MODE", false]]}, "objects": {"": [[2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4BF32E", "torch_ipex::xpu::BF32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4FP32E", "torch_ipex::xpu::FP32"], [2, 1, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODEE", "torch_ipex::xpu::FP32_MATH_MODE"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4BF32E", "torch_ipex::xpu::FP32_MATH_MODE::BF32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4FP32E", "torch_ipex::xpu::FP32_MATH_MODE::FP32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "torch_ipex::xpu::FP32_MATH_MODE::FP32_MATH_MODE_MAX"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "torch_ipex::xpu::FP32_MATH_MODE::FP32_MATH_MODE_MIN"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4TF32E", "torch_ipex::xpu::FP32_MATH_MODE::TF32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "torch_ipex::xpu::FP32_MATH_MODE_MAX"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "torch_ipex::xpu::FP32_MATH_MODE_MIN"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4TF32E", "torch_ipex::xpu::TF32"], [2, 2, 1, "_CPPv4N10torch_ipex3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "torch_ipex::xpu::set_fp32_math_mode"], [2, 3, 1, 
"_CPPv4N10torch_ipex3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "torch_ipex::xpu::set_fp32_math_mode::mode"]], "intel_extension_for_pytorch": [[2, 4, 1, "", "get_fp32_math_mode"], [2, 4, 1, "", "optimize"], [2, 4, 1, "", "set_fp32_math_mode"]], "intel_extension_for_pytorch.llm": [[2, 4, 1, "", "optimize"]], "intel_extension_for_pytorch.quantization.fp8": [[2, 4, 1, "", "fp8_autocast"]], "intel_extension_for_pytorch.xpu": [[2, 4, 1, "", "empty_cache"], [2, 4, 1, "", "max_memory_allocated"], [2, 4, 1, "", "max_memory_reserved"], [2, 4, 1, "", "mem_get_info"], [2, 4, 1, "", "memory_allocated"], [2, 4, 1, "", "memory_reserved"], [2, 4, 1, "", "memory_snapshot"], [2, 4, 1, "", "memory_stats"], [2, 4, 1, "", "memory_stats_as_nested_dict"], [2, 4, 1, "", "memory_summary"], [2, 4, 1, "", "reset_accumulated_memory_stats"], [2, 4, 1, "", "reset_peak_memory_stats"]]}, "objnames": {"0": ["cpp", "enumerator", "C++ enumerator"], "1": ["cpp", "enum", "C++ enum"], "2": ["cpp", "function", "C++ function"], "3": ["cpp", "functionParam", "C++ function parameter"], "4": ["py", "function", "Python function"]}, "objtypes": {"0": "cpp:enumerator", "1": "cpp:enum", "2": "cpp:function", "3": "cpp:functionParam", "4": "py:function"}, "terms": {"": [2, 6, 8, 9, 11, 18, 19, 20, 25, 28, 36], "0": [0, 1, 3, 4, 6, 8, 9, 10, 11, 16, 18, 22, 25, 26, 28, 30, 36], "00000000000602e7": 0, "001": [4, 6, 11], "00978": 28, "04": [25, 30, 31], "05516": 28, "09": 0, "09557": 28, "0f": 8, "0x00007fd552f36000": 4, "0x00007fd55321b000": 4, "0x00007fd553600000": 4, "0x00007fd553a9c000": 4, "0x00007fd553b11000": 4, "0x00007fd55511d000": 4, "0x00007fd55512c000": 4, "0x00007fd57eb1d000": 4, "0x00007fd5806cc000": 4, "0x00007fd584ab0000": 4, "0x00007fd5862b0000": 4, "0x00007fd5a1a1b000": 4, "0x00007fd5a44d8000": 4, "0x00007fd5bb895000": 4, "0x00007fd5bb927000": 4, "0x1": 0, "0x2b0004b1": 30, "0x7fff": 0, "0xffff": 0, "1": [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 13, 18, 19, 22, 25, 27, 28, 34, 36], "10": [0, 3, 7, 9, 18, 
19, 22], "100": [0, 9, 16, 30, 31], "1000": 9, "10004": 28, "1020": 30, "1024": [8, 30], "1024gb": 30, "10944": 28, "11": 0, "12": [0, 31], "123": 6, "12355": 9, "127": 6, "128": [4, 9, 11, 22, 28], "128k": 27, "13": 0, "1307": 9, "13b": [27, 30, 31], "14": 9, "15": 0, "1550": [30, 31], "16": [0, 22, 30], "17": [4, 8, 30], "170": [25, 31, 33], "17323": 28, "18": [8, 31], "1d": 10, "1mb": 2, "2": [0, 1, 4, 6, 7, 8, 9, 11, 18, 19, 20, 22, 25, 26, 27, 28, 30, 32, 33, 34], "20": [13, 19, 25], "200": 28, "2019": 2, "2020": 7, "2021": [0, 31], "2022": [28, 31], "2023": [28, 30, 31], "202306": 28, "2024": [30, 31], "2025": [22, 31], "202x": 4, "2048": 8, "21": 31, "22": [25, 30], "2206": 28, "2210": 28, "224": [4, 11, 22], "23": 25, "2304190630": 30, "2306": 28, "2309": 28, "2310": 28, "25": [9, 30], "29500": 6, "2d": [19, 34], "2f": 9, "2gb": 31, "3": [0, 3, 4, 6, 8, 9, 11, 13, 18, 19, 22, 25, 27, 28, 30, 34], "3081": 9, "30b": 27, "31": 30, "32": [9, 19, 22], "33": 0, "3696": 31, "3706": 31, "3788": 31, "3796": 31, "3808": 31, "3829": 31, "3841": 31, "3882": 31, "3887": 31, "3970": 31, "3_25mhzi_quad_dameni_oam600w_ifrv2332i_pscnull_ifwi": 30, "3d": 19, "4": [6, 8, 18, 19, 22, 27, 28, 31], "40b": 27, "4280": 31, "4317": 31, "4354": 31, "4358": 31, "4361": 31, "4407": 31, "4429": 31, "4439": 31, "4450": 31, "4463": 31, "4468": 31, "4480": 31, "4495": 31, "4504": 31, "4527": 31, "4557": 31, "4558": 31, "4800": 30, "4bit": 28, "4f": 9, "4fc181b0": 30, "4k": 27, "4x": 30, "5": [0, 4, 6, 9, 13, 17, 18, 19, 20, 22, 28, 29, 30], "50": 19, "500mb": [32, 33], "512": [1, 4, 31], "56": [22, 30], "58": 0, "5b": 27, "5gb": [32, 33], "5x": 31, "6": [3, 4, 27, 30, 31], "64": [9, 11, 22, 28], "64gb": 30, "66": 0, "6b": [27, 28, 30, 31], "6f": 9, "7": [0, 7, 9], "70b": [27, 31], "736": 30, "7b": [27, 28, 30, 31], "7b1": 27, "8": [8, 15], "80": 3, "8480": 30, "86b": 30, "8b": [27, 28, 31], "9": [0, 4, 31], "91b14bf559": [22, 25], "9216": 9, "9525": 30, "997": 25, "9b": [27, 28], "A": [0, 
3, 4, 5, 17, 19, 25, 28, 31, 33], "And": [8, 31], "As": [8, 17, 25, 28, 36], "At": [0, 8, 27], "Be": 17, "But": [0, 19], "By": [0, 2, 5, 10, 34], "FOR": 15, "For": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 18, 19, 20, 23, 24, 25, 27, 31, 32, 33, 34, 35, 36], "If": [0, 2, 3, 4, 6, 7, 8, 11, 12, 13, 17, 18, 19, 22, 25, 28, 31, 32, 33, 34], "In": [0, 1, 2, 4, 5, 8, 11, 13, 19, 27, 28, 31], "It": [0, 2, 5, 6, 7, 8, 9, 10, 14, 18, 19, 23, 28, 29, 31, 32, 33], "Its": [32, 34], "NOT": [14, 19], "No": [2, 19, 25, 31], "Not": 25, "ON": [10, 30], "On": [1, 5, 15, 19, 27, 28], "One": [7, 15, 19, 36], "Or": 6, "Such": [0, 31], "The": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 36], "Then": [3, 6, 17, 25, 28, 31], "There": [18, 23, 27], "These": [1, 4, 11, 15, 25, 27, 28, 31], "To": [0, 2, 3, 4, 5, 6, 9, 16, 19, 23, 27, 28, 31], "Will": 19, "With": [1, 2, 5, 6, 8, 16, 17, 31], "_": [0, 4, 6, 7, 8, 14, 17, 18, 19, 25, 28, 31, 34], "__init__": [3, 6, 9, 11, 19, 22], "__m256i": 0, "__m512": 0, "__m512i": 0, "__main__": [6, 9], "__name__": [6, 9], "__release_lnx": 31, "_appli": 19, "_build": 3, "_c": [0, 2], "_cmp_ord_q": 0, "_convolut": 11, "_cvt_fp32_to_bf16": 0, "_distributed_c10d": 2, "_get_current_isa_level": 0, "_get_highest_binary_support_isa_level": 0, "_get_highest_cpu_support_isa_level": 0, "_glibcxx_use_cxx11_abi": 25, "_mm256_mask_storeu_epi16": 0, "_mm256_storeu_si256": 0, "_mm512_add_epi32": 0, "_mm512_and_si512": 0, "_mm512_castps_si512": 0, "_mm512_cmp_ps_mask": 0, "_mm512_cvtneps_pbh": 0, "_mm512_cvtusepi32_epi16": 0, "_mm512_loadu_p": 0, "_mm512_mask_blend_epi32": 0, "_mm512_maskz_loadu_p": 0, "_mm512_set1_epi32": 0, "_mm512_srli_epi32": 0, "_optimizer_util": 34, "_original_step": 34, "_parameter_wrapp": 34, "_recurs": 4, "_thnn_fused_gru_cel": 11, "_xpu": 8, "_znk5torch8autograd4node4nameb5cxx11ev": [25, 31], "a770": 31, "ab": [18, 31], "abbrevi": 2, "abi": [0, 4, 31], "about": [1, 2, 3, 6, 
8, 9, 31], "abov": [3, 6, 7, 9, 10, 18, 19, 27, 36], "absenc": 25, "absolut": 4, "abstract": 8, "acceler": [1, 2, 5, 15, 22, 24, 28, 31], "accept": [10, 18], "access": [7, 8, 19, 27, 31, 36], "accommod": 19, "accomplish": 16, "accord": [2, 27, 28], "accumul": 2, "accur": [11, 27, 28], "accuraci": [2, 9, 11, 27, 28, 31], "achiev": [1, 2, 4, 31], "across": [2, 5, 6, 9], "action": [9, 18], "activ": [2, 4, 15, 16, 17, 22, 25, 27, 28, 29], "active_byt": 2, "actual": [7, 8, 19, 25, 31], "ad": [2, 4, 22, 31], "adadelta": 9, "adam": 36, "adamw": [31, 36], "adaptiveaveragepoolingkrnl": 0, "add": [10, 11, 14, 16, 18, 19, 25, 31, 36], "add_": 36, "add_argu": 9, "add_execut": 4, "add_librari": 8, "addbmm": 11, "addcdiv": 11, "addcmul": 11, "addit": [0, 2, 4, 28, 31, 32, 33, 34], "addition": [22, 28], "addmm": [8, 11, 31], "addmm_": 11, "addmv": 11, "addr": 11, "address": [6, 10, 19, 25], "addtion": 0, "adjust": [6, 28], "adopt": [7, 27, 28, 31], "advanc": [1, 8, 23, 28, 31, 35], "advantag": [1, 2, 12, 19, 24, 31], "aes_ni": 0, "after": [2, 3, 4, 8, 17, 23, 25, 28, 31, 36], "afterward": 8, "again": [3, 36], "against": 4, "agnost": 8, "agre": 3, "ahead": [3, 8], "ai": [1, 10, 24, 25, 27, 31], "aka": 19, "al": 28, "algebra": [13, 27], "algorithm": [10, 15, 19, 28, 31], "alia": [2, 4, 8], "align": [0, 9, 18, 31], "align_corn": 13, "all": [0, 2, 3, 4, 6, 8, 9, 10, 11, 13, 16, 18, 25, 27, 28, 29, 30, 31, 34, 35, 36], "all_reduc": 9, "allgath": [6, 9, 16, 31], "alloc": [2, 7, 16, 18, 27, 32, 35], "allocated_byt": 2, "allow": [2, 4, 10, 11, 25, 28, 31, 32, 33], "allreduc": [2, 6, 16, 31], "alltoal": [6, 16], "almost": 19, "along": 3, "alpha": [8, 36], "alreadi": [1, 3, 4, 16, 19, 27, 32], "also": [1, 2, 4, 5, 7, 8, 10, 17, 18, 19, 25, 27, 28, 29, 31, 32, 33, 35, 36], "altern": [2, 4, 5, 6, 19], "although": 2, "alwai": [3, 11, 13, 19, 31], "amax": 2, "amc": 30, "among": [5, 7, 16], "amount": [2, 35], "amp": [4, 22, 23, 31], "amp_dtyp": 29, "amplifi": 1, "amx": [0, 1, 31], "amx_bf16": 
0, "amx_int8": 0, "amx_til": 0, "an": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 16, 17, 18, 19, 25, 27, 28, 31, 35, 36], "anaconda": 0, "analysi": 17, "ani": [0, 2, 3, 11, 18, 19, 22, 25, 31, 32], "annot": 10, "anonym": 0, "anoth": 28, "answer": 19, "anymor": 5, "aot": [8, 10, 31], "apach": [16, 26], "api": [1, 4, 8, 9, 10, 17, 22, 27, 28, 31, 32, 35], "app": 4, "appear": [25, 31], "append": 4, "appli": [2, 4, 5, 11, 16, 19, 23, 27, 28, 29, 31, 36], "applic": [1, 2, 4, 27, 28, 32, 33, 34, 35], "approach": [8, 25, 27], "appropri": 32, "ar": [0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 23, 25, 27, 28, 29, 31, 32, 33, 34, 36], "arc": [25, 27, 28, 31, 33], "architectur": [10, 27, 28], "arg": [6, 9, 16, 18, 36], "argc": 4, "argmax": 9, "argpars": 9, "argument": [6, 9, 13, 28], "argumentpars": 9, "argv": 4, "around": 8, "arrai": [19, 34], "arxiv": 28, "ask": 3, "assert": 10, "assign": 19, "associ": [8, 32], "assum": [11, 23], "asymmetr": [17, 31], "asynchron": 10, "at_dispatch_all_typ": 8, "at_dispatch_floating_typ": 8, "atan2": 11, "aten": [2, 4, 7, 8, 10, 31], "aten_cpu_cap": 0, "atendlmtensor": 7, "atenipextypexpu": 28, "ats": 33, "attempt": [18, 32], "attent": [1, 27, 28, 31], "attention_mask": 15, "attn": 31, "attribut": [2, 19], "auto": [0, 2, 4, 8, 19, 27, 31], "auto_kernel_select": 2, "autocast": [4, 22, 23], "autoclass": 3, "autofunct": 3, "autom": 11, "automat": [1, 2, 4, 5, 12, 17, 19, 25, 27, 28, 31, 32, 33], "automaticlli": 2, "automodelforcausallm": [28, 29, 31], "autoround": 28, "autotoken": 28, "avail": [0, 1, 2, 4, 5, 6, 8, 10, 13, 23, 28, 29, 31, 35], "aval": 8, "averag": [9, 16], "averagepool2d": 13, "avg_pool": 18, "avoid": [2, 3, 8, 25], "avx": [0, 1, 31], "avx2": 0, "avx256": 0, "avx2_vnni": 0, "avx512": [0, 19, 31], "avx512_4fmap": 0, "avx512_4vnniw": 0, "avx512_bf16": 0, "avx512_bitalg": 0, "avx512_bw": 0, "avx512_cd": 0, "avx512_dq": 0, "avx512_er": 0, "avx512_f": 0, "avx512_fp16": 0, "avx512_ifma": 0, "avx512_pf": 0, "avx512_vbmi": 0, 
"avx512_vbmi2": 0, "avx512_vl": 0, "avx512_vnni": 0, "avx512_vp2intersect": 0, "avx512_vpclmul": 0, "avx512_vpopcntdq": 0, "avx_vnni": 0, "awar": [19, 28], "awq": [2, 28], "b": [3, 11, 31], "b4": 30, "back": [0, 13, 19, 25, 31], "backend": [0, 1, 2, 5, 6, 7, 8, 9, 13, 16, 22, 25, 27, 28, 31, 33], "background": 8, "backpropag": 28, "backward": [4, 6, 8, 9, 11, 16, 22, 31, 34], "backwardprefetch": 9, "baddbmm": 11, "baeseong": 28, "bag": 31, "baichuan": 27, "baichuan2": [27, 31], "bandwidth": [27, 28], "barrier": 9, "base": [0, 1, 2, 3, 4, 8, 9, 10, 13, 16, 27, 28, 29, 30, 31], "basekit": [6, 16], "bash": [25, 28], "basic": [13, 31], "batch": [2, 4, 8, 9, 16, 19, 31, 32, 34], "batch_idx": [4, 9, 16], "batch_siz": [4, 6, 8, 9, 16, 19], "batchnorm": [0, 34], "batchnorm1d": 19, "beam": [27, 31], "becaus": [0, 4, 11, 19, 27, 33], "becom": 27, "been": [0, 1, 4, 5, 8, 19, 20, 22, 25, 31], "befor": [0, 1, 2, 3, 4, 5, 10, 17, 19, 25, 28, 32, 33, 34], "beforehand": [28, 32, 33], "begin": [2, 3, 8], "behavior": [2, 10, 13], "being": [14, 31], "believ": [11, 19], "belong": 18, "below": [4, 6, 8, 11, 13, 14, 18, 19, 23, 25, 28, 31, 33, 36], "benchmark": [4, 30, 35], "benefici": 19, "benefit": [4, 5, 11, 17], "benifit": [32, 33], "bert": 15, "bertmodel": 4, "besid": [8, 27, 28, 31], "best": [0, 2, 11, 17, 27], "beta": 31, "better": [1, 2, 10, 13, 17, 19, 27, 28, 31, 34, 36], "between": [0, 7, 11, 27, 28], "bf16": [0, 2, 23, 27, 31, 36], "bf32": [2, 10], "bfloat16": [0, 2, 5, 13, 22, 23, 31, 34], "bia": [8, 11, 14, 19, 28, 31], "big": 19, "bigscienc": 27, "bilinear": 11, "bin": [0, 4, 25, 30, 31], "binari": [0, 3, 4, 11, 19, 32, 33], "binary_cross_entropi": 11, "binary_cross_entropy_with_logit": 11, "binaryop": 31, "bind": [5, 8, 9, 31], "bio": 30, "bit": 15, "bla": 10, "block": [2, 3, 10, 28, 31, 32, 34], "blocksiz": 28, "bloom": [27, 30], "bmm": 11, "bmp": 19, "bn": 2, "boast": 28, "bodi": 0, "bool": 2, "boost": [4, 12, 30, 31], "both": [1, 2, 4, 5, 7, 15, 17, 19, 27, 28, 29, 
31, 33, 34, 36], "bottl": 36, "bottleneck": [27, 28], "bottom": 34, "bound": [27, 31, 36], "box": 4, "brain": 28, "branch": 1, "bridg": 8, "brief": 27, "bring": [12, 17, 27, 31, 32], "broad": [12, 31], "broadcast": 16, "broadcast_optimizer_st": 16, "broadcast_paramet": 16, "broader": [28, 31], "broken": 2, "buf": 36, "buffer": 27, "bug": [1, 3, 31, 32, 33], "build": [4, 5, 16, 25, 31, 33], "build_by_per_kernel": 10, "build_conv_contigu": 10, "build_ext": 8, "build_internal_debug": 10, "build_opt_level": 10, "build_separate_op": 10, "build_with_sanit": 10, "built": [0, 8, 25, 31, 33], "bwd": 31, "byeongwook": 28, "byte": 2, "c": [0, 1, 5, 6, 8, 11, 22, 25], "c10": [0, 4], "c10d": [6, 9], "cach": [2, 3, 8, 10, 14, 18, 31, 32, 35, 36], "cache_en": 23, "cache_weight_for_large_batch": 2, "cai": 28, "calcul": [1, 2, 8, 11, 18, 28, 31], "calib_dataload": 28, "calib_dataset": [17, 29], "calib_func": 28, "calibr": [2, 4, 17, 29], "calibration_data_load": 4, "call": [0, 2, 4, 6, 8, 11, 18, 19, 28, 34, 35], "can": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 16, 17, 18, 19, 23, 25, 27, 28, 29, 31, 32, 33, 34, 35, 36], "can_cast_train": 34, "candidate_cel": 8, "cannot": [11, 19, 25, 31], "canon": 19, "capabl": [0, 5, 17, 18, 22, 28, 31, 35], "capac": [5, 13, 30], "capsule2": 7, "captur": 35, "card": [6, 10, 19, 27, 31], "care": 8, "case": [0, 2, 4, 5, 8, 9, 10, 12, 19, 28, 31], "cast": [2, 8, 11], "cat": [8, 11, 13], "catch": [4, 17, 18], "categori": [11, 14], "caus": [2, 25, 27, 31, 33], "cc": [0, 4], "ccl": [6, 9, 10, 16, 30], "ccl_blocking_wait": 10, "ccl_same_stream": 10, "cd": [3, 4], "cdist": 11, "center": [5, 14, 25, 28, 31, 33], "cerr": 4, "certain": [1, 2, 25, 28, 29], "cgf": 8, "cgh": 8, "chain_matmul": 11, "challeng": [27, 28, 31], "chang": [0, 3, 4, 6, 8, 9, 11, 16, 19, 23, 25, 28, 29], "channel": [10, 17, 31, 32], "channels_last": [4, 5, 19], "char": 4, "charact": 3, "chat": [27, 28], "chatglm3": 31, "check": [4, 5, 6, 8, 10, 13, 19, 23, 27, 29, 31, 32, 34], 
"check_contigu": 8, "check_input": 8, "check_xpu": 8, "checkpoint": [2, 4, 16], "cheng": 28, "chines": 31, "choos": [4, 5, 11, 13, 31], "chosen": [0, 11, 13], "chw": 19, "chwn": 19, "cifar10": 4, "circumst": 11, "cl": 4, "cl_device_not_found": 25, "class": [6, 9, 11, 19, 22], "classif": 9, "claus": 36, "clean": [3, 25, 31], "cleanup": 9, "client": 31, "clone": [3, 36], "close": 19, "cmake": [0, 4, 5, 31], "cmake_cxx_flag": 4, "cmake_have_libc_pthread": 4, "cmake_minimum_requir": [4, 8], "cmake_prefix_path": 8, "cmakefil": 0, "cmakelist": [4, 8], "cmdclass": 8, "cnn": 19, "co": 31, "code": [1, 2, 3, 5, 8, 9, 10, 13, 16, 19, 23, 25, 26, 29, 31, 32, 33, 35, 36], "codegen": 22, "collect": [2, 4, 5, 6, 9, 10, 17, 31], "column": [4, 8], "com": [3, 6, 9, 25], "combin": [2, 17], "come": 8, "comma": [10, 33], "command": [3, 4, 6, 8, 16, 22, 25, 31], "comment": [0, 3], "commit": [3, 30], "common": [0, 14], "commun": [5, 6, 7, 9, 10, 31], "compar": [1, 2, 5, 19, 28, 31, 36], "compat": [0, 28, 31], "compens": 16, "competit": 31, "compil": [1, 3, 4, 10, 25, 31], "compile_flag": 8, "compiled_model": 22, "complet": [3, 4, 18, 19, 29, 35], "complex": [0, 27, 28], "complexdoubl": 0, "complexfloat": 0, "complic": 8, "complier": 0, "compon": [8, 10, 26, 27], "compos": [4, 9, 15], "comprehens": [1, 31, 35], "compress": [15, 28], "compression_dim": 28, "compression_dtyp": 28, "compris": 19, "comput": [2, 4, 10, 15, 16, 19, 22, 27, 28, 31, 32, 33, 34], "compute_dtyp": 28, "compute_eng": 13, "concat": [13, 19, 27], "concat_linear": 2, "concaten": [13, 27], "concept": [16, 19], "conclus": 19, "concret": 31, "conda": [25, 31], "condit": [26, 31, 34], "conf": 28, "config": [2, 4, 17], "configur": [0, 2, 4, 8, 17, 18, 23, 25, 28, 31, 33], "conflict": [0, 25], "connect": 34, "conserv": 25, "consid": 8, "consider": 17, "consist": [16, 27, 31], "consol": 18, "const": [0, 4, 8], "constrain": 28, "consum": 7, "contain": [0, 3, 6, 7, 10, 13, 28], "content": [28, 29, 31], "context": [2, 3, 7, 8, 
11, 27], "contigu": [8, 10, 19, 27, 31], "contiguous_format": 19, "continu": [5, 13, 18, 25, 31], "contribut": 22, "control": [1, 10], "conv": [2, 10, 11, 18, 31, 34], "conv1": [9, 22], "conv1d": [11, 19], "conv2": [9, 22], "conv2d": [2, 9, 11, 19, 22, 31], "conv3d": [11, 31], "conv_binari": 17, "conv_bn": 2, "conv_bn_fold": 2, "conv_relu": 17, "conv_sum_relu": 17, "conv_tbc": 11, "conv_transpose1d": 11, "conv_transpose3d": 11, "conv_unari": 17, "conveni": [8, 11], "convent": 9, "convers": [2, 4, 11, 17, 28, 31], "convert": [0, 1, 2, 4, 5, 7, 11, 12, 15, 17, 19, 28, 29, 31, 34], "convert_dtype_str2torch": 28, "convert_jit": [4, 17, 29], "convolut": [2, 5, 11, 22, 32, 34], "convolutuon": 2, "convtranspos": 34, "convtranspose2d": 2, "coo": 19, "copi": [0, 3, 5, 7, 19], "copyright": [0, 26], "core": [0, 2, 16, 22, 25, 27, 28, 30, 31], "correct": [8, 9, 19, 25], "correctli": 25, "correspond": [2, 6, 25, 28, 31], "correspondingli": 28, "corrspond": 19, "corrupt": 16, "cosine_embedding_loss": 11, "cosine_similar": 11, "cost": [2, 5, 8, 21], "could": [6, 10, 17, 18, 19, 22, 28, 29, 34], "counter": 2, "counterpart": [2, 31], "cout": 4, "cover": [19, 27, 31], "cpp": [3, 4, 8], "cpp_extens": 8, "cppsdk": 4, "cpu": [1, 2, 4, 5, 6, 23, 25, 30, 31, 34], "cpu_capability_avx512": 0, "cpu_capability_avx512_bf16": 0, "cpu_featur": 0, "cpu_feature_main": 0, "cpuid": 0, "cpuinfo": 0, "cpuoffload": 9, "creat": [2, 3, 4, 7, 8, 14, 17, 22, 29, 31, 34], "credit": 0, "criterion": [4, 11], "cross": [10, 11, 31], "cross_entropy_loss": 11, "crossentropyloss": 4, "crucial": 25, "cuda": [8, 9, 31], "cudnn": 19, "curdevid": 28, "current": [0, 1, 2, 3, 5, 7, 9, 13, 17, 18, 25, 27, 28, 29, 31, 33, 34, 36], "current_devic": 2, "current_stream": 8, "custom": [1, 5, 8, 10, 13, 14, 27, 31], "cvt_fp32_to_bf16": 0, "cvt_fp32_to_bf16_kernel_fn": 0, "cvt_fp32_to_bf16_kernel_impl": 0, "cvt_fp32_to_bf16_kernel_stub": 0, "cvtfp32tobf16": 0, "cvtfp32tobf16krnl": 0, "cwd": 6, "cxx": [0, 4, 25], "cxx11": 31, 
"cxx_standard": [4, 8], "d": [3, 4, 8, 11, 34], "d25": 30, "d2h": 25, "d__avx512f__": 0, "d__avx__": 0, "d_bia": 8, "d_candidate_cel": 8, "d_elu": 8, "d_gate": 8, "d_gate_weight": 8, "d_gates_": 8, "d_input": 8, "d_input_g": 8, "d_new_cel": 8, "d_old_cel": 8, "d_old_cell_": 8, "d_old_h": 8, "d_output_g": 8, "d_relu": 8, "d_sigmoid": 8, "d_tanh": 8, "d_tanh_new_cel": 8, "d_weight": 8, "d_x": 8, "dampen": 36, "data": [0, 2, 4, 6, 8, 11, 12, 14, 16, 17, 19, 23, 25, 28, 29, 31, 33, 36], "data_prepare_finish": 18, "data_ptr": 7, "data_typ": 19, "dataload": [4, 6, 9, 16], "dataset": [4, 6, 9, 16, 28], "dataset1": 9, "dataset2": 9, "datatyp": [28, 30, 31], "date": 31, "dcmake_c_compil": 8, "dcmake_cxx_compil": 8, "dcmake_prefix_path": [4, 8], "dcpmm": 30, "dcpu_cap": 0, "dcpu_capability_amx": 0, "dcpu_capability_avx2": 0, "dcpu_capability_avx512": 0, "dcpu_capability_avx512_bf16": 0, "dcpu_capability_avx512_fp16": 0, "dcpu_capability_avx512_vnni": 0, "dcpu_capability_default": 0, "ddp": [2, 5, 9, 31], "ddp_loss": 9, "ddr": 30, "deactiv": 25, "dealloc": 32, "debug": [10, 17, 18, 29], "decid": 27, "declar": [0, 8], "decltyp": 0, "decod": [27, 31], "decompress": 15, "decreas": [2, 25], "dedic": [2, 31], "deep": [6, 7, 9, 11, 13, 15, 16], "deepcopi": 2, "deepspe": [2, 10, 27, 30, 31], "def": [6, 8, 9, 11, 19, 22, 25, 31], "default": [0, 2, 4, 5, 6, 7, 9, 10, 18, 22, 25, 31, 34], "default_weight_observ": [4, 17, 29], "defaultvalu": 10, "defin": [0, 2, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 22, 29, 31, 34], "definit": [0, 2, 8], "deinit": 3, "delai": 15, "delayedsc": [2, 15], "deleg": [16, 32], "delimit": 33, "deliv": 17, "deliveri": [32, 33], "demand": 5, "demonstr": [4, 7, 13, 19, 28], "demostr": 23, "dens": 19, "depend": [0, 4, 19, 31], "deploi": [27, 28, 31], "deploy": [2, 4, 28], "deployment_mod": 2, "deprec": [10, 12, 31], "dequant": [14, 31], "desc": 19, "descent": 28, "describ": [5, 6, 11, 19, 28], "descript": [2, 5, 9, 10, 18, 19, 24, 34], "descriptor": 31, "design": [3, 
9, 11, 13, 14, 19, 28, 29, 31, 32, 34], "desir": [4, 17], "destroy_process_group": 9, "detach": 36, "detail": [0, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 17, 19, 23, 24, 27, 28, 30, 31, 34], "detect": [0, 1, 4], "determin": [0, 28], "develop": [1, 4, 8, 32, 33], "devic": [1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 18, 19, 21, 22, 24, 25, 27, 28, 29, 31, 32, 33, 34], "device_count": 9, "device_id": [6, 7, 9], "device_map": 28, "devid": 16, "dgpu": 28, "diagram": 19, "dict": 2, "dictionari": 2, "differ": [0, 1, 2, 4, 6, 7, 10, 19, 27, 28], "difficult": 19, "digit": 9, "dim": [4, 7, 8, 9, 13, 19], "dimens": [2, 8, 13, 19, 34], "dimension": 8, "dir": [0, 10], "direct": 3, "directli": [2, 8, 22, 28], "directori": [1, 3, 8, 10, 25, 29, 31], "disabl": [2, 9, 18, 34], "disable_auto_channels_last": [12, 34], "disadvantag": [32, 33], "discret": [1, 24, 31], "discrete gpu": 1, "discuss": [3, 8, 19], "dispatch": [1, 8, 31], "displai": 2, "dist": [6, 9, 11], "distribut": [2, 6, 9, 10, 16, 25, 31, 32, 33], "distributeddataparallel": [9, 31], "distributedoptim": 16, "distributedsampl": [9, 16], "div": [19, 31], "divid": 34, "divis": 13, "dkm": 25, "dldevicetyp": 7, "dlmanagedtensor": 7, "dmesg": 25, "dmlc": 7, "dnn": 15, "do": [2, 3, 5, 8, 11, 19, 25, 27, 28], "doc": [1, 3, 17, 29, 31, 34], "dockerfil": 31, "docstr": 3, "document": [0, 4, 5, 8, 10, 29, 31], "doe": [7, 8, 18, 19, 25, 31, 34], "doesn": [2, 19], "domain": [7, 15], "domin": [1, 27], "don": [0, 3, 8, 11], "done": [0, 4, 9, 22], "dongsoo": 28, "dot": [11, 19, 27], "doubl": [0, 8], "down": [2, 25], "download": [4, 9, 22, 25], "downstream": 11, "dpc": [1, 7, 10, 22, 25, 31], "dpccp": 2, "dpcpp": [8, 22, 25, 31], "dpcppbuildextens": 8, "dpcppextens": 8, "dpcpproot": [22, 25], "drawback": 2, "drive": [1, 27], "driver": [5, 25, 30, 33], "dropout": [2, 9, 31, 32, 34], "dropout1": 9, "dropout2": 9, "dspevd": 31, "dst": 0, "dtype": [0, 2, 4, 11, 13, 17, 22, 23, 28, 29, 31, 34], "due": [0, 1, 11, 17, 25, 27, 28, 31], "dummi": 10, 
"dump": [17, 31], "duplic": 10, "durat": 25, "dure": [2, 3, 4, 27, 28, 33, 34], "dynam": [1, 4, 15, 25, 27, 31], "e": [0, 1, 2, 4, 8, 10, 11, 17, 19, 24, 25, 27, 31, 32, 33], "e4m3": 15, "e5m2": 15, "each": [0, 2, 5, 6, 8, 9, 10, 11, 13, 16, 18, 31, 34], "eager": [1, 17, 31], "earlier": 25, "eas": [8, 19], "easi": [1, 16, 24, 31], "easier": [19, 22], "easiest": 28, "easili": 28, "ec33277": 30, "ecc": 30, "ecolog": 14, "edit": 3, "educ": 31, "effect": [0, 16, 17, 28], "effici": [1, 6, 8, 9, 15, 22, 27, 28, 31, 36], "effort": 28, "either": [2, 5, 6, 25], "elabor": 8, "elaps": 9, "elapsed_tim": 9, "element": [10, 19, 36], "eleutherai": 27, "elia": 28, "elimin": 27, "els": [0, 6, 19, 36], "elu": [8, 31], "embed": [2, 27, 31], "emerg": [1, 27], "emit": 8, "empir": 13, "empow": [5, 22], "empti": [7, 18, 19], "empty_cach": [2, 35], "emul": 31, "enabl": [1, 2, 4, 5, 6, 10, 11, 15, 17, 19, 23, 27, 28, 31, 32, 33, 34], "enable_auto_channels_last": [12, 34], "enable_tim": 9, "enable_wrap": 9, "encount": [22, 25, 32, 33], "end": [2, 4, 18, 25, 28, 31, 32, 33, 34], "endif": 0, "endl": 4, "engin": [1, 4, 19, 24, 28, 31], "enhanc": [1, 22, 27, 28, 31], "enough": [2, 25], "ensur": [4, 16, 25], "entir": [8, 27], "enum": 2, "enumer": [2, 4, 7, 9, 16], "env": [6, 16, 22, 25], "env_key1": 3, "env_key2": 3, "env_val1": 3, "env_val2": 3, "environ": [0, 3, 5, 6, 9, 10, 16, 18, 23, 25, 27], "epoch": [6, 9, 16], "eq": [9, 31], "equal": [10, 25], "equival": [8, 28, 31, 36], "err": 18, "error": [2, 3, 4, 8, 9, 18, 19, 25, 28, 31], "especi": [2, 3, 8], "essenti": 8, "estim": [2, 35], "et": 28, "etc": [0, 3, 5, 14, 18], "eval": [2, 4, 9, 11, 17, 23, 29, 31], "evalu": [2, 23, 31], "even": [2, 31], "event": [2, 9], "event_id": 18, "ever": 22, "everi": [6, 27], "exactli": [6, 8], "examin": 36, "exampl": [2, 3, 5, 10, 11, 13, 16, 18, 19, 23, 24, 27, 28, 29, 31, 36], "example_ddp": 6, "example_input": [17, 29], "except": [2, 6, 27], "exchang": 31, "exclus": [6, 9, 10], "execut": [0, 2, 4, 5, 7, 8, 
10, 11, 13, 18, 21, 25, 31, 32, 33, 34, 36], "exist": [1, 3, 17, 25, 28, 31], "exit": [25, 31], "exp": [8, 31], "expand": 27, "expect": [19, 25], "expens": 19, "experi": [3, 13, 19, 27, 28, 31], "experiment": 3, "explain": [0, 19, 28], "explicit": 6, "explicitli": [2, 4, 8, 10, 11], "explor": 28, "expon": 15, "export": [10, 18, 25, 31], "export_compressed_model": 28, "expos": [5, 11], "express": 19, "ext": 4, "ext_modul": 8, "extend": [1, 5, 7, 22, 24, 25, 27, 28, 31], "extens": [2, 4, 7, 9, 10, 12, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 36], "extern": 7, "extra": [2, 6, 31], "extract": 7, "f": [3, 4, 9, 16], "f16c": 0, "f32": [0, 19], "face": 31, "facebook": 27, "facilit": 22, "fact": [8, 19], "fail": [2, 25, 31, 34], "failur": [25, 31], "falcon": [2, 27], "fall": 13, "fallback": 31, "fals": [0, 2, 4, 9, 11, 17, 19, 23, 28, 29], "famili": [2, 25, 27, 31], "familiar": [2, 8], "far": 7, "fast": [2, 8, 16, 28, 32], "faster": 11, "fastest": 0, "fatal": [18, 25, 31], "fatal_error": [4, 8], "fault": 31, "fc1": [9, 22], "fc2": [9, 22], "feasibl": [5, 9], "featur": [1, 3, 4, 10, 11, 14, 19, 22, 24, 25, 31, 32, 33, 34], "feb": 31, "fed": 6, "feed": [2, 12, 19], "feedforward": 27, "few": [3, 12, 19], "fft": 31, "fft_fft": 11, "fft_fft2": 11, "fft_fftn": 11, "fft_hfft": 11, "fft_ifft": 11, "fft_ifft2": 11, "fft_ifftn": 11, "fft_ihfft": 11, "fft_irfft": 11, "fft_irfft2": 11, "fft_irfftn": 11, "fft_rfft": 11, "fft_rfft2": 11, "fft_rfftn": 11, "field": 5, "figur": [1, 7, 27], "file": [0, 2, 3, 4, 8, 10, 11, 18, 19, 25, 31, 33], "filenam": 3, "fill": 8, "filter": 7, "final": [9, 32, 33], "find": [1, 4, 7, 8, 25, 30, 31, 32], "find_packag": [4, 8], "findavx": 0, "fine": [8, 28, 31], "finer": 1, "finetun": 31, "finish": [4, 13, 18], "first": [3, 4, 8, 12, 16, 17, 28, 31], "firstli": [22, 27, 28], "fit": [3, 28, 31, 32], "five": 18, "fix": [2, 3, 31], "flag": [0, 34], "flagship": [5, 22], "flash": 31, "flatten": 9, "flavor": 8, "flex": [5, 25, 31, 33], 
"float": [0, 2, 4, 5, 8, 9, 11, 15, 34], "float16": [2, 5, 22, 23, 28, 29, 31], "float32": [2, 23], "float64": 11, "flow": 7, "flush": 2, "fly": 8, "fma": 0, "fmax": 8, "fmin": 8, "fmt": 18, "fn_type": 0, "focu": [2, 19, 27, 28, 29, 31], "focus": 31, "fold": 2, "folder": 3, "follow": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 22, 23, 25, 26, 27, 29, 31, 34], "footprint": [5, 9, 15, 27, 31], "forc": 31, "fork": 0, "format": [2, 3, 5, 6, 7, 9, 12, 13, 15, 16, 18, 28, 31, 34], "format_tag": 19, "former": [4, 8], "formerli": [5, 6, 9], "forth": [16, 31], "forward": [2, 4, 6, 8, 9, 11, 13, 19, 22, 34], "found": [1, 4, 17, 19, 25, 29, 31], "foundat": 19, "four": 18, "fp16": [0, 13, 14, 23, 27, 28, 30, 31], "fp32": [0, 2, 10, 13, 14, 17, 23, 27, 31, 36], "fp32_math_mod": 2, "fp32_math_mode_max": 2, "fp32_math_mode_min": 2, "fp32mathmod": 2, "fp64": [10, 25, 31], "fp8": [2, 5, 31], "fp8_autocast": [2, 15], "fp8_group": 2, "fp8_model": 15, "fp8_recip": [2, 15], "fp8linear": 15, "fpmath": 2, "fpmath_mod": 2, "fragment": 2, "framework": [5, 7, 10, 16, 31], "frantar": 28, "free": [2, 17, 18, 35], "freed": [2, 35], "freez": [4, 11, 22, 23], "frequenc": 30, "frobenius_norm": 11, "from": [0, 1, 2, 3, 4, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 20, 21, 25, 27, 28, 29, 31, 32, 33, 34, 35], "from_blob": 4, "from_dlpack": 7, "from_pretrain": [4, 28, 31], "front": 28, "frontend": [1, 2, 5, 27, 28, 31], "fsdp": 31, "fsdp_main": 9, "fsdp_mnist_xpu": 9, "fsycl": [4, 8, 33], "full": [3, 8, 22, 27, 31], "fulli": [0, 3, 17, 29, 31], "fully_sharded_data_parallel": 9, "fullyshardeddataparallel": 9, "function": [0, 2, 3, 4, 5, 8, 9, 11, 17, 18, 23, 25, 27, 28, 29, 31, 36], "functool": 9, "further": [1, 2, 4, 5, 18, 19, 27, 28], "fuse": [2, 27, 28, 31, 32, 34, 36], "fuse_update_step": 2, "fusion": [1, 2, 4, 17, 22, 29, 31, 32, 34], "futur": [3, 12, 31], "fw": 30, "fwd": 31, "g": [0, 2, 10, 11, 17, 19, 25, 27, 31, 32, 33], "gain": [1, 27], "gamma": 9, "gate": 8, "gate_weight": 8, 
"gates_row": 8, "gcc": [0, 31], "ge": 31, "geglu": 14, "gelu": 31, "gemm": [19, 27, 31], "genai": [1, 27], "gener": [0, 1, 3, 4, 6, 7, 8, 13, 17, 18, 19, 27, 28, 29, 31, 33, 34], "get": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 18, 27, 31], "get_cpp_typesize_and_vecs": 0, "get_cpp_typesize_and_vecsize_kernel_fn": 0, "get_cpp_typesize_and_vecsize_kernel_impl": 0, "get_cpp_typesize_and_vecsize_kernel_stub": 0, "get_devic": 7, "get_fp32_math_mod": 2, "get_group": 8, "get_group_rang": 8, "get_local_id": 8, "get_log_compon": 18, "get_log_level": 18, "get_log_output_file_path": 18, "get_log_rotate_file_s": 18, "get_log_split_file_s": 18, "get_rank": 6, "get_world_s": 6, "getcurrentxpustream": [4, 8], "getveclength": 0, "getveclengthkrnl": 0, "girl": 28, "git": 3, "github": [1, 3, 6, 7, 9, 11, 25], "given": [2, 27], "glibcxx": 31, "glibcxx_use_cxx11_abi": 25, "glm": [27, 28], "global": [2, 6], "gnu": 0, "go": [2, 3, 4, 8, 11, 32, 33], "goal": 16, "goe": 8, "good": [1, 2, 3, 19, 27, 36], "googl": 3, "gpt": [2, 27, 28, 30, 31], "gptq": [2, 28, 31], "gpu": [1, 2, 3, 4, 8, 10, 13, 14, 15, 16, 21, 23, 24, 25, 30, 33, 34, 35], "grad": [31, 36], "grad_cel": 8, "grad_h": 8, "grade": [5, 9, 31], "gradient": [5, 9, 15, 16, 28], "grain": 1, "graph": [1, 2, 5, 11, 17, 22, 31, 34], "graph_for": [17, 29], "graph_mod": 2, "graphic": [25, 27, 28, 31, 33], "greater": [13, 25], "grep": 25, "grid": 8, "grid_sampl": 11, "group": [2, 6, 8, 9], "group_siz": 28, "gru_cel": 11, "gt": 31, "guarante": 13, "guard": 16, "guess": 31, "guid": [0, 6], "guidanc": 5, "guidelin": 19, "gunho": 28, "h": [3, 4, 8, 18, 19, 28, 31], "h2d": 25, "ha": [0, 1, 2, 4, 5, 7, 8, 13, 19, 20, 22, 25, 28, 31, 33], "had": 25, "half": [0, 2], "hand": 8, "handcraft": 28, "handl": [2, 4, 7, 19, 31], "handler": 8, "handwritten": 9, "har": [5, 22], "hard": [7, 19], "hardsigmoid": 31, "hardswish": 31, "hardtanh": 31, "hardwar": [0, 1, 5, 24, 27, 31], "has_2d_block_arrai": 28, "hav": 0, "have": [0, 1, 2, 3, 4, 6, 7, 8, 12, 13, 17, 19, 23, 
25, 26, 27, 28, 33], "hbm": 31, "he": 28, "header": [0, 8], "heavier": 27, "height": 19, "held": 2, "help": [0, 2, 3, 4, 8, 9, 13, 27, 31, 32, 33, 35], "helper": 8, "henc": [17, 31], "here": [0, 3, 6, 7, 8, 9, 11, 18, 19], "heurist": 2, "hf": 27, "hgemm_bias_wint4_arc": 28, "hgemm_int4_common_dispatch": 28, "hgemmxetla_int4": 28, "hidden": [19, 27], "hierarchi": 5, "high": [8, 27, 28, 31, 36], "higher": [0, 19, 27], "highest": 13, "highli": [8, 13, 17, 23, 27, 28, 31], "hinge_embedding_loss": 11, "histor": 2, "histori": 27, "hold": [6, 9, 19], "holder": [6, 18], "home": [8, 16, 25], "hook": 28, "horovod": [5, 25, 31], "host": [7, 10, 25, 28, 30, 31], "how": [0, 1, 4, 5, 6, 7, 8, 18, 19, 23], "howev": [2, 3, 5, 8, 10, 11, 12, 19, 27, 28, 31, 35], "hpp": 4, "html": [3, 7, 8], "http": [3, 6, 7, 8, 9, 22, 25], "hub": [27, 28], "huber_loss": 11, "hug": 31, "huggingfac": [27, 28], "human": [2, 31], "hvd": [16, 25], "hw": [19, 33], "hwc": 19, "hwio": 19, "hwn": 19, "hybrid": 31, "hypeparamet": 28, "hyper": 30, "hyperparamet": 28, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], "icpx": [4, 8], "icx": [4, 8], "id": [6, 7, 9], "ideal": 13, "ideep": [0, 19], "ident": [2, 4, 19, 32, 34], "identif": [0, 4], "identifi": 18, "idx": 27, "ifwi": 30, "ignor": 34, "igpu": 28, "illustr": [5, 6, 9, 17, 19], "imag": [11, 19, 31], "immintrin": 0, "impact": 2, "imper": 5, "impl": [0, 8, 31], "implement": [1, 3, 4, 5, 6, 7, 8, 9, 19, 27, 28, 31, 36], "implicit": [2, 31], "import": [0, 1, 2, 3, 4, 6, 8, 9, 15, 16, 17, 19, 22, 23, 25, 27, 28, 29, 31], "importerror": [6, 25, 31], "impos": 28, "impress": 28, "improp": 25, "improv": [1, 11, 15, 27, 28, 31, 34], "inact": 2, "inactive_split": 2, "inactive_split_byt": 2, "inc": [0, 27], "inc_model": 28, "includ": [0, 1, 2, 4, 8, 10, 14, 23, 25, 26, 27, 28, 30, 31, 32, 34], "include_dir": 8, "include_path": 8, "incorrectli": [25, 31], "increas": [1, 2, 16, 25, 27, 28, 
31, 32, 33, 35], "increment": 8, "inde": [8, 25], "index": [2, 3, 6, 7, 8, 9, 19, 22, 25, 27, 31], "index_put": 11, "indic": 19, "indirect": 27, "individu": [2, 3], "inductor": [5, 22, 31], "industri": [5, 9, 31], "ineffici": 8, "infer": [2, 5, 14, 15, 17, 19, 22, 23, 31, 34], "inferenc": 2, "inference_data": [17, 29], "inference_dta": [17, 29], "influenc": 27, "info": [0, 4, 17, 18], "inform": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 17, 18, 19, 23, 25, 27, 28, 31, 32], "ingredi": 19, "init": [3, 16], "init_end_ev": 9, "init_process_group": [6, 9], "init_start_ev": 9, "initi": [6, 9, 16], "inlin": 0, "inp": 2, "inplac": [2, 17, 19, 28, 29], "input": [0, 2, 4, 6, 8, 9, 12, 13, 17, 18, 19, 22, 28, 30, 31], "input_g": 8, "input_id": [15, 28], "input_mask": 15, "input_ptr": 4, "input_xpu": 19, "insert": [4, 17, 29], "insid": [3, 8, 18, 32, 34], "instal": [3, 4, 9, 10, 22, 23, 24, 25, 27, 31, 33], "instanc": 34, "instanti": 8, "instead": [20, 21, 29, 31, 36], "instruct": [0, 1, 3, 4, 23, 24, 25, 27, 28, 31], "int": [0, 2, 4, 6, 8, 9], "int32": 31, "int4": [2, 28, 31], "int4_fullrang": 28, "int8": [0, 1, 2, 5, 17, 28, 29, 31], "integ": [2, 28], "integar": 18, "integr": [14, 25, 31, 33], "intel": [2, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 31, 32, 33, 34, 36], "intel discrete gpu": 1, "intel optim": 1, "intel64": [4, 25, 31], "intel64_lin": 4, "intel_extension_for_pytorch": [0, 1, 2, 3, 4, 6, 7, 8, 9, 15, 16, 17, 19, 22, 23, 25, 28, 29, 31], "intelllvm": 4, "intel\u00ae extension for pytorch*": 1, "intend": 3, "intens": [22, 31], "intent": 3, "interest": 3, "interfac": [3, 4, 8, 19, 27, 28], "intern": [0, 2, 10, 19], "interopar": 5, "interoper": 7, "interpret": [2, 8], "intervent": 11, "intrins": 0, "introduc": [1, 8, 19, 31], "introduct": 27, "introductori": 8, "intuit": 28, "invalid": [19, 25, 31], "invoc": [1, 25, 31], "invok": [2, 4, 8, 11, 23, 25, 29, 31], "io": 7, "iostream": 4, "ipex": [0, 1, 2, 4, 5, 8, 10, 12, 18, 23, 27, 28, 29, 31], 
"ipex_declare_dispatch": 0, "ipex_define_dispatch": 0, "ipex_event_log": 18, "ipex_fp32_math_mod": 10, "ipex_fused_optimizer_list_xpu": 34, "ipex_gpu_root_dir": 10, "ipex_info_event_end": 18, "ipex_info_log": 18, "ipex_log": 10, "ipex_log_compon": [10, 18], "ipex_log_level": [10, 18], "ipex_log_output": [10, 18], "ipex_log_rotate_s": [10, 18], "ipex_log_split_s": [10, 18], "ipex_op_regist": 28, "ipex_register_dispatch": 0, "ipex_weight_convert_module_xpu": 34, "ipex_xxx_event_end": 18, "ipex_xxx_log": 18, "ipextransformerattnoptimizedint4": 28, "ipextransformerlinear": 28, "ipextransformermlpoptimizedint4": 28, "irfft": 31, "is_contigu": [8, 19], "is_contiguous_channels_last_1d": 19, "is_xpu": 8, "isa": 1, "isa_codegen": 0, "isa_nam": 0, "isacodegen": 0, "issu": [1, 3, 11, 22, 27, 28], "item": [8, 9, 16], "iter": [2, 6, 27], "its": [0, 2, 4, 6, 7, 10, 11, 13, 14, 28], "itself": [2, 19], "itt": 10, "ivalu": 4, "j": [0, 2, 27, 28, 30, 31], "ji": 28, "jit": [1, 2, 4, 11, 17, 19, 23, 29, 31, 32, 33], "job": [3, 25], "join": 9, "json": 2, "jung": 28, "jupyt": 3, "just": [2, 8, 28, 29, 31, 32, 33], "k": [2, 28], "kcpu": 0, "kdloneapi": 7, "kdlsycl": 7, "keep": [3, 19], "keepdim": [8, 9], "kei": [2, 5, 27, 31, 32], "kept": 27, "kera": 16, "kernel": [1, 2, 5, 8, 10, 13, 18, 22, 25, 27, 28, 30, 31, 34], "kernel_s": [19, 22], "kfloat": 4, "kfn": 8, "kill": 25, "killer": 25, "kim": 28, "kineto": [21, 31], "kl_div": 11, "kmd": 25, "knob": 2, "know": [3, 7, 8, 32, 33], "knowledg": 28, "known": [5, 6, 9, 27], "known_issu": 22, "kv": 31, "kv_cach": 27, "kwon": 28, "kxpu": 4, "l": 6, "l1_loss": 11, "label": [6, 11, 15, 25], "lake": 27, "lamb": 36, "lambda": 8, "landscap": [1, 27], "languag": [1, 8, 28, 29, 31], "lapack": 31, "lar": 36, "larg": [1, 2, 5, 9, 18, 25, 28, 29, 31, 36], "large_pool": 2, "last": [8, 10, 31, 32], "latenc": [27, 31], "later": 28, "latest": [1, 6, 7, 16, 22, 24, 25, 27, 30], "latter": 8, "launch": [8, 10, 18, 27, 31], "launcher": 6, "layer": [27, 31, 32, 
34], "layernorm": [13, 14, 31], "layout": [2, 5], "ld": [25, 31], "ld_preload": [25, 31], "ldd": 4, "le": 31, "lead": 27, "leak": [10, 31], "leaki": 31, "leaky_relu": 31, "learn": [6, 7, 9, 11, 15, 16], "lee": 28, "left": 27, "len": [0, 4, 9, 16], "length": [3, 30], "less": [2, 5, 10, 11, 25, 31], "let": [8, 19, 36], "level": [2, 5, 8, 10, 19, 27, 28, 31, 33], "leverag": [1, 22], "lib": [4, 10, 18, 25, 31], "libc10": 4, "libimf": 4, "libintel": 4, "libintlc": 4, "libirng": 4, "libmkl": [25, 31], "libmkl_cor": [4, 25, 31], "libmkl_gnu_thread": [4, 25], "libmkl_intel_ilp64": [25, 31], "libmkl_intel_lp64": [4, 25, 31], "libmkl_sequenti": 31, "libmkl_sycl": [4, 25, 31], "libmkl_vml_avx512": 25, "libopencl": 4, "libpytorch": 4, "libpytorch_path": 4, "librari": [0, 1, 4, 5, 6, 7, 8, 9, 10, 13, 14, 18, 22, 31], "libstdc": 25, "libsvml": 4, "libsycl": 4, "libtorch": [4, 31], "libtorch_cpu": 4, "libtriton": 25, "licens": 0, "lifecycl": [32, 33], "lighter": 11, "like": [1, 2, 3, 8, 17, 18, 25, 27, 28, 31], "limit": [3, 11, 19, 27, 28, 31], "lin": 28, "linalg_multi_dot": 11, "line": [3, 8, 19], "linear": [2, 6, 9, 11, 13, 15, 19, 22, 31, 32, 34], "linear_bn": 2, "linear_bn_fold": 2, "linear_unari": 17, "link": [0, 1, 4], "linker": [25, 31], "linux": [0, 4, 8, 25, 31], "list": [3, 4, 10, 11, 19, 24, 25, 29, 31, 33], "littl": 28, "live": 3, "ll": [3, 8], "llama": [2, 27, 31], "llama2": [27, 30], "llama3": [27, 28], "llm": [1, 2, 29, 31], "lltm": 8, "lltm_backward": 8, "lltm_backward_xpu": 8, "lltm_forward": 8, "lltm_forward_xpu": 8, "lltm_xpu": 8, "lltm_xpu_backward": 8, "lltm_xpu_backward_kernel": 8, "lltm_xpu_forward": 8, "lltm_xpu_forward_kernel": 8, "lltm_xpu_kernel": 8, "lmkl_core": [25, 31], "lmkl_intel_ilp64": [25, 31], "lmkl_sycl": [25, 31], "lmkl_tbb_thread": [25, 31], "load": [0, 1, 2, 4, 8, 25, 31, 34], "load_in_4bit": 28, "load_state_dict": 2, "loaded_model": 28, "loader": 6, "local": [6, 9, 16], "local_rank": [6, 9, 16], "localhost": 9, "locat": [0, 3, 22, 25], 
"log": [4, 5, 9, 10, 31], "log_compon": 18, "log_interv": 16, "log_level": 18, "log_path": 18, "log_sigmoid": 31, "log_softmax": [9, 11], "logic": [9, 19, 25], "logutil": 18, "long": [8, 19, 27, 31], "longer": 31, "look": [3, 4, 8, 19, 28], "loop": [2, 3, 10], "lora": [27, 31], "loss": [2, 4, 6, 9, 11, 16, 19, 22, 27, 28], "loss_fn": 6, "loss_funct": 22, "lot": [25, 27, 31, 32, 33], "low": [5, 6, 8, 23, 31, 36], "low_precision_checkpoint": 2, "lower": [0, 11, 17, 27, 28, 31], "loweringexcept": 25, "lowp": 2, "lp64": 31, "lr": [4, 6, 9, 11, 22, 36], "lr_schedul": 9, "lsb": 0, "lstm": [2, 13], "lt": [30, 31], "lunar": 27, "lv": 28, "m": [6, 8, 9, 16], "m150": 33, "machin": [0, 3, 6, 28], "macro": [0, 5, 18], "made": [3, 7, 31], "mai": [0, 1, 2, 3, 7, 8, 11, 12, 13, 17, 19, 25, 28, 31], "main": [1, 3, 4, 9, 22, 27, 28], "main_work": 6, "maintain": [5, 6, 8, 9, 11, 20], "major": 28, "make": [0, 3, 4, 5, 6, 8, 9, 16, 22, 23, 27, 28, 31, 33], "make_tupl": 0, "malloc_devic": 4, "mamx": 0, "manag": [6, 8, 11, 27, 31], "mani": [3, 5, 8, 31], "manipul": 19, "mantissa": 15, "manual": [2, 5, 13, 19, 34], "manual_se": [6, 9], "map": 19, "margin_ranking_loss": 11, "mark": [4, 18, 28], "mask": [0, 31], "mask_valu": 0, "masked_lm_label": 15, "master": [2, 6, 32, 34], "master_addr": [6, 9], "master_port": [6, 9], "match": [0, 2, 11, 34], "math": [2, 5, 8, 10, 13], "matmul": [11, 14, 28, 31], "matric": [8, 28], "matrix": [1, 22, 24, 31], "mavx2": 0, "mavx512bf16": 0, "mavx512bw": 0, "mavx512dq": 0, "mavx512f": 0, "mavx512fp16": 0, "mavx512vl": 0, "mavx512vnni": 0, "max": [0, 9, 14, 25, 28, 30, 31, 33], "max_job": 25, "max_memory_alloc": [2, 35], "max_memory_reserv": [2, 35], "max_pool2d": 9, "maximum": [0, 2], "maxpool1d": 19, "maxpool2d": [13, 22], "maxpool3d": 13, "mb": [10, 18], "md": [3, 10, 19], "me": 19, "mean": [0, 2, 10, 19, 25, 27, 28, 31], "measur": 2, "mechan": [0, 1, 8, 31], "meet": [5, 15, 34], "meltdown": 30, "mem_get_info": [2, 35], "memori": [4, 5, 7, 8, 9, 11, 12, 
15, 18, 25, 27, 28, 30, 31, 34, 36], "memory_alloc": [2, 35], "memory_format": [4, 5, 19], "memory_reserv": [2, 35], "memory_snapshot": [2, 35], "memory_stat": [2, 35], "memory_stats_as_nested_dict": 2, "memory_summari": 2, "mention": [6, 8, 18], "merg": 31, "merit": 19, "mermori": 2, "messag": [8, 10, 18, 19, 25, 31], "met": 34, "meta": [19, 27], "metavar": 9, "method": [2, 7, 8, 11, 27, 28, 31], "methodologi": [2, 4, 5, 8, 36], "metric": 2, "mfma": 0, "microsoft": 27, "might": [2, 19, 25, 36], "min": [28, 31], "min_num_param": 9, "mind": 19, "mini": [27, 28, 31], "minim": [0, 10, 27, 28], "minimum": 19, "minmax": 28, "minmaxobserv": [4, 17, 29], "minor": [6, 31], "mish": 31, "miss": [3, 25], "mitig": [28, 30], "mix": [2, 4, 8, 27, 31], "mixtur": 11, "mkdir": 4, "mkl": [4, 25, 31], "mkl_dpcpp_root": [25, 31], "mkl_lapack_dspevd": 25, "mkldnn": 19, "mkldnn_util": 19, "mllama": 2, "mlp": [14, 28], "mm": [8, 11], "mm_bias_int4": 28, "mm_qkv_out_int4": 28, "mm_silu_mul_int4": 28, "mmx": 0, "mnist": 9, "mnist_cnn": 9, "mno": 0, "mode": [1, 2, 3, 5, 10, 19, 23, 25, 31], "model": [1, 2, 5, 6, 9, 10, 11, 12, 13, 15, 16, 17, 22, 23, 25, 29, 30, 31, 34], "model_nam": 28, "model_name_or_path": [29, 31], "model_state_dict": 4, "modelimp": [17, 29], "modeljit": [4, 17, 29], "modif": [6, 9, 16, 17], "modifi": [2, 3, 16], "modul": [0, 1, 2, 4, 5, 6, 8, 9, 11, 14, 17, 19, 22, 25, 28, 29, 31, 32, 34], "modular": [2, 4], "moe": 14, "momentum": [4, 22, 36], "momentum_buffer_list": 36, "monitor": [7, 35], "more": [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 17, 23, 25, 27, 28, 31, 32, 33, 35, 36], "moreov": [1, 27, 31], "most": [5, 10, 13, 17, 25, 27, 31], "motiv": [2, 4], "move": [4, 17, 19, 23], "mp": 9, "mpi": [16, 25], "mpi_rank": 6, "mpi_world_s": 6, "mpirun": 6, "mse_loss": 11, "mseloss": 6, "mtl": 31, "much": [8, 19, 31, 36], "mul": 31, "mul_": 36, "multi": [6, 31, 33], "multi_margin_loss": 11, "multi_process_spawn": 6, "multidimension": 19, "multiheadattent": 27, 
"multilabel_margin_loss": 11, "multipl": [0, 2, 3, 5, 6, 11, 19, 22, 25, 27, 31, 33], "multiprocess": [6, 9], "must": [0, 3, 8, 33, 34, 36], "mv": 11, "mxnet": 16, "my": 19, "my_auto_wrap_polici": 9, "mykernel": 0, "n": [4, 6, 8, 9, 19], "n1": 19, "n2": 19, "name": [0, 8, 13, 25, 28], "named_paramet": 16, "namespac": [0, 4, 11], "nan": 0, "nativ": [0, 1, 5, 11, 25, 31, 36], "natur": [8, 19, 27, 28, 31], "nb": 19, "nchw": 5, "nd": 19, "nd_item": 8, "nd_rang": 8, "ndim": 7, "ne": 31, "nearest": [19, 28], "necessari": [6, 9, 16, 19, 25], "necessarili": 14, "necessit": 28, "neck": 36, "need": [0, 2, 3, 4, 5, 6, 8, 9, 16, 17, 19, 23, 29, 31, 32, 33, 34, 36], "neg": 2, "neglig": 19, "neox": 2, "nest": 2, "nesterov": 36, "net": 9, "network": [1, 5, 11, 13, 15, 31], "neural": [1, 5, 13, 15, 31], "neural_compressor": 28, "new": [0, 3, 15, 19, 28, 29, 31, 32], "new_cel": 8, "new_h": 8, "newer": 1, "newkernel": 0, "newkernelkrnl": 0, "next": [3, 8, 13], "next_sentence_label": 15, "nf4": 28, "nhwc": [5, 31], "nightli": [22, 25], "ninja": 8, "nll_loss": [9, 11, 16], "nll_loss2d": 11, "nll_loss_nd": 11, "nlp": 4, "nn": [2, 4, 6, 9, 11, 13, 19, 22, 32, 34], "no_grad": [4, 9, 22, 23], "node": [30, 31], "non": [2, 3, 10, 11, 19, 28], "noncontigu": 19, "none": [2, 6, 9, 36], "norm": [14, 31], "normal": [1, 4, 9, 16, 27, 28, 32, 34], "note": [0, 2, 3, 6, 7, 8, 9, 12, 14, 16, 19, 25, 27, 28, 31, 33, 35], "noth": 2, "notic": 26, "nov": 31, "now": [2, 5, 8, 19, 22], "nproc": 9, "nuclear_norm": 11, "null": [10, 18], "num_alloc_retri": 2, "num_oom": 2, "num_replica": [9, 16], "num_work": [6, 9], "number": [1, 2, 3, 4, 6, 8, 9, 16, 18, 25, 30, 31, 36], "numer": [2, 11], "numpi": 7, "nuqmm": 28, "o": [0, 4, 6, 9, 25, 30, 31], "o0": 2, "o1": 2, "o3": 0, "oam": 30, "object": [0, 2, 4, 25, 31], "observ": [4, 12, 17, 29], "obtain": [17, 27, 28, 29], "occasion": 28, "occupi": [2, 35], "occur": [25, 28, 31], "oct": 31, "octob": 2, "oem": 30, "off": [10, 11, 25, 27, 28, 31], "offer": [1, 3, 28, 
35], "offici": [3, 17, 20, 31], "offset": [19, 27], "old": 25, "old_cel": 8, "old_h": 8, "onc": [0, 2, 3, 13, 18, 19, 28, 33, 34], "one": [2, 3, 6, 7, 8, 10, 13, 16, 17, 18, 19, 25, 29, 31, 36], "oneapi": [4, 5, 6, 7, 9, 13, 16, 22, 25, 30, 31, 33], "oneapi_root": 6, "oneccl": [5, 9, 10, 31], "oneccl_bind_pt": 6, "oneccl_bindings_for_pytorch": [6, 9], "onednn": [0, 2, 5, 10, 13, 18, 22, 27, 31], "onednn_layout": 13, "onemkl": [10, 13, 18, 25, 31], "ones": [0, 8, 17, 28], "onli": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 23, 31, 34], "onlin": [15, 31], "onnx": 28, "oob": 31, "oom": 25, "op": [3, 9, 10, 13, 18, 22, 27, 28], "open": [1, 25, 31], "openai": 25, "opencl": 33, "oper": [1, 2, 4, 5, 7, 8, 10, 11, 17, 18, 21, 22, 28, 31, 32], "opportun": 34, "opportunit": 2, "opt": [0, 2, 6, 22, 25, 27, 30, 31], "optim": [1, 2, 5, 6, 9, 10, 11, 12, 13, 14, 16, 19, 22, 23, 24, 25, 28, 31], "optimize_lstm": 2, "optimize_transform": 31, "optimized_model": 2, "optimized_optim": 2, "optimizer_state_dict": 4, "option": [1, 2, 4, 10, 22, 25, 32, 33, 34], "optioncpu": 10, "optionexperiment": 10, "optiongpu": 10, "order": [0, 2, 7, 13, 15, 19, 25, 31], "ordered_gemm_wint4_config_set_arc": 28, "ordered_gemm_wint4_config_set_pvc": 28, "org": [8, 22, 25], "organ": 19, "origin": [0, 2, 7, 15, 16, 28, 29, 34, 36], "oserror": [25, 31], "other": [0, 2, 5, 7, 9, 11, 13, 14, 15, 16, 17, 18, 19, 23, 25, 27, 28, 31, 35, 36], "otherwis": [2, 9, 10, 28], "our": [3, 4, 8, 13, 17, 27, 28], "out": [2, 4, 7, 8, 11, 25, 28, 31, 36], "outplac": 19, "output": [2, 4, 9, 10, 11, 13, 15, 16, 18, 19, 22, 28], "output_g": 8, "output_tensor": 4, "outstand": 3, "over": [2, 3, 5, 8, 10, 11, 12, 19, 31], "overal": 17, "overcom": [27, 28], "overestim": 35, "overhead": [1, 2, 8, 10, 27, 31, 32, 36], "overrid": [10, 31], "overridden": [0, 2], "overview": [29, 31], "overwrit": 2, "own": [4, 8], "ox": 10, "p": 30, "pack": [2, 7], "packag": [1, 4, 6, 8, 9, 23, 25, 31], "packed_accessor32": 8, 
"packedtensoraccessor32": 8, "packet": 2, "pad": [11, 19, 22, 31], "page": [4, 6, 9, 20, 29, 30, 31, 34], "parallel": [2, 6, 25, 27, 31], "parallel_for": 8, "param": [2, 36], "paramet": [0, 2, 4, 5, 6, 9, 11, 16, 18, 22, 27, 28, 31, 34, 36], "parameterwrapp": 34, "parent": 7, "park": 28, "pars": [7, 9, 31], "parse_arg": 9, "parser": 9, "part": [4, 8, 11, 18, 19, 32, 34], "partial": [9, 27], "particular": [3, 11, 25, 28, 29, 31], "particularli": [5, 7], "partit": 16, "pass": [0, 1, 3, 4, 8, 25, 28], "patch": 31, "path": [2, 4, 8, 9, 10, 18, 19, 22, 25, 34], "path_to_your_onemkl": 31, "pattern": [8, 17, 19, 29, 31, 35], "pdist": [11, 31], "peak": 2, "peer": 28, "peft": 31, "per": [2, 7, 8, 16, 17, 30], "per_kernel": 10, "per_tensor_symmetr": [4, 17, 29], "perchannel": [17, 29], "perf": 19, "perform": [1, 2, 4, 5, 8, 10, 11, 12, 13, 14, 17, 19, 22, 24, 27, 28, 29, 31, 34, 36], "period": 2, "permut": 31, "permutecontigu": 13, "persist": 10, "perspect": [2, 19], "pertain": 0, "phase": [2, 27], "phi": [27, 31], "phi3": 28, "pick": 3, "pin": 16, "pin_memori": [6, 9], "pip": [3, 6, 16, 22, 25, 28], "pixelshuffl": 31, "pl": 10, "place": [2, 6, 11, 18, 27], "plan": 3, "platform": [4, 7, 10, 19, 22, 25, 28, 31, 33, 34], "platinum": 30, "pleas": [2, 3, 5, 6, 9, 18, 20, 21, 22, 23, 25, 28, 31, 33, 34, 35], "plug": 8, "pmi_rank": 6, "pmi_siz": 6, "point": [2, 5, 7, 8, 11, 15, 28], "pointer": [0, 8, 25], "poisson_nll_loss": 11, "polici": [28, 31], "polymorph": 0, "pool": [2, 22, 32], "popular": [1, 7, 27, 28, 30, 31], "popup": 3, "port": [3, 6], "posit": [27, 28], "possibl": [2, 4, 7, 13, 31], "post": [3, 5, 17, 22, 27, 28, 31], "potenti": [22, 34], "pow": [11, 31], "power": [8, 15], "pr": 19, "practic": [5, 8, 27], "pragma": 0, "pre": [14, 22, 25, 27, 28, 33], "prebuilt": [25, 31, 33], "precis": [4, 6, 15, 23, 28, 30, 31], "pred": 9, "predefin": 2, "prefer": [1, 13], "prefetchw": 0, "prefetchwt1": 0, "prefil": [2, 27], "prefix": 8, "preload": [25, 31], "prelu": 11, "prepack": 
[2, 19, 27], "prepar": [17, 29], "prepare_data": 18, "prepare_fp8": 15, "prepare_jit": [4, 17, 29], "preprint": 28, "prerequisit": [3, 4], "present": 28, "preserv": [27, 28], "prevent": [16, 36], "previou": [18, 19, 31], "previous": 8, "primari": 31, "primit": [2, 10, 31], "principl": [16, 19], "print": [0, 4, 6, 9, 16, 17, 19, 22, 29], "printout": 2, "prior": [2, 23], "prioriti": 13, "probabl": [7, 9, 25], "problem": [25, 36], "procedur": [25, 28], "process": [4, 5, 6, 8, 9, 13, 15, 16, 25, 27, 28, 31, 34], "processgroup": [2, 6, 9], "processor": [31, 36], "produc": [3, 7, 8, 11, 35], "product": [1, 27, 31, 32, 33], "profil": 31, "program": [1, 2], "progress": [25, 27], "project": [1, 4, 8], "prompt": [27, 28], "properti": [4, 8], "propos": [3, 19, 27, 28], "prototyp": [2, 10, 31, 33], "provid": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 18, 24, 25, 27, 28, 29, 31, 33, 34, 35, 36], "pseudo": 36, "pt": [4, 9, 28], "pth": 4, "pthread": 4, "pti": 31, "ptr": 31, "public": [8, 31], "publicli": 31, "pull": 3, "purpos": [0, 8], "push_back": 4, "put": [6, 7, 9], "pvc": [30, 33], "py": [3, 6, 8, 9, 10], "pybind11": 8, "pybind11_modul": 8, "pyi": 3, "python": [0, 1, 2, 3, 6, 7, 8, 9, 10, 16, 27, 28, 29, 31, 32, 34], "python_include_dir": 8, "pytorch": [2, 4, 5, 7, 8, 9, 10, 11, 12, 18, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36], "qconfig": [4, 17, 29], "qconfig_summary_fil": 2, "qkv": 28, "qmodel": 28, "qscheme": [4, 17, 29], "qualiti": [27, 28, 31, 32, 33], "quant": 2, "quant_method": 2, "quantiz": [1, 4, 14, 29, 31], "quantizaiton": 15, "quantizat": 2, "quantization_config": [2, 28], "quantize_jit": [4, 17, 29], "quantwrapp": [17, 29], "queri": [0, 19], "question": 19, "queue": [5, 10, 18], "quick": [1, 24], "quint8": 4, "quit": [0, 28, 31], "qwen": [27, 28, 31], "qwen2": [27, 28], "r": [3, 30], "rais": [2, 6], "ram": 25, "rand": [4, 11, 19, 22], "randint": 4, "randn": [6, 13, 19], "random": [9, 16], "rang": [1, 6, 8, 9, 15, 16, 17, 29], "rank": [5, 6, 
9, 16, 31], "ransform": 28, "rate": [9, 16, 31], "rather": [2, 18, 19], "re": [3, 6, 8, 11], "reach": 31, "read": [3, 8, 28, 36], "readabl": [2, 8], "readi": 7, "readm": 3, "real": [2, 4, 6], "realli": [3, 8], "realloc": 18, "reason": 19, "rebas": 3, "reboot": 25, "receiv": [2, 25, 31], "recent": [19, 28], "recip": [2, 31], "reciproc": 11, "recogn": 7, "recommend": [1, 4, 12, 13, 17, 23, 25, 28, 31], "record": [9, 18], "record_avg_pool": 18, "recurs": 3, "reduc": [1, 2, 5, 9, 15, 27, 28, 31, 36], "reduce_rang": [4, 17, 29], "reduceop": 9, "reducescatt": [9, 31], "reduct": 9, "redund": 31, "refer": [0, 1, 2, 6, 8, 9, 10, 12, 13, 19, 20, 22, 23, 24, 27, 31, 33, 34], "referenc": 34, "regard": [6, 19], "regardless": 11, "region": [0, 11], "regist": [0, 1, 28, 31], "registri": 25, "regress": [3, 12], "regular": 4, "reinstal": [3, 25], "reinterpret": 19, "reinterpret_cast": 0, "relat": [0, 7, 9, 17, 18, 22, 25], "releas": [0, 1, 2, 6, 12, 14, 19, 20, 25, 33, 35], "reli": [7, 19], "relianc": 28, "reload": 8, "relu": [9, 19, 22, 31], "remain": [28, 36], "remark": [27, 28, 31], "rememb": 22, "remov": [2, 3, 25, 31], "renorm": 11, "reorder": [2, 19, 27], "reorder_cach": 27, "repeat": 19, "repeatedli": 3, "replac": [2, 3, 6, 9, 17, 28, 31, 32, 34], "replace_dropout_with_ident": 2, "replic": 6, "replica": [5, 6, 9], "repo": [3, 6], "repo_url": 6, "report": [0, 1, 18, 25], "repositori": 6, "repres": [3, 6, 18, 31], "represent": 19, "request": [1, 2, 3, 32], "requir": [2, 3, 4, 6, 7, 8, 10, 11, 13, 15, 17, 19, 23, 25, 27, 28, 29, 31, 34], "reserv": [2, 18, 32], "reserved_byt": 2, "reset": [2, 14], "reset_accumulated_memory_stat": 2, "reset_peak_memory_stat": 2, "reset_peak_stat": 2, "reshap": 8, "resid": 6, "residu": 14, "resiz": 4, "resnet50": 10, "resnet50_weight": 4, "resolv": [25, 31], "resourc": [27, 28, 31], "respons": [7, 17, 27, 31], "restor": 16, "result": [1, 2, 8, 19], "retak": 14, "retri": 2, "retriev": 8, "return": [0, 2, 4, 6, 8, 9, 11, 19, 22], "return_tensor": 
28, "reus": 2, "review": 28, "rf": 3, "rfc": 19, "rh": 0, "right": [8, 23, 27], "rm": [3, 14, 31], "rmsnorm": 27, "root": [0, 4, 6, 22, 25, 27, 31], "root_rank": 16, "rope": 27, "rotari": 27, "rotat": [10, 18], "roughli": 19, "round": [28, 31], "rounding_bia": 0, "rst": 3, "rtn": 28, "run": [2, 3, 4, 5, 6, 8, 9, 10, 11, 23, 25, 31, 32, 33, 34], "run_benchmark_woq": 28, "rune": 6, "runtim": [0, 1, 2, 5, 7, 8, 9, 11, 18, 23, 25, 31, 33], "runtimeerror": 25, "sacrif": 11, "safe": 7, "same": [0, 6, 8, 9, 18, 19, 27, 31], "sampl": [0, 2, 6, 12], "sample_input": [2, 12], "sampler": [6, 9, 16], "sampler1": 9, "sampler2": 9, "sanit": 10, "save": [2, 3, 4, 9, 15, 16, 19, 31, 34], "save_model": 9, "save_pretrain": 28, "saved_dir": 28, "scalar": 31, "scalar_t": 8, "scalartyp": 0, "scalartypetocpptyp": 0, "scale": [2, 5, 9, 10, 15, 16, 27, 28, 31], "scale_dtyp": 28, "scaled_dot_product_attent": 11, "scatter_add": 11, "scenario": [2, 7, 14, 17, 28, 31], "schedul": [1, 9], "scope": 11, "scratchpad": 10, "script": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 16, 17, 27, 29, 34], "scriptmodul": [17, 29], "sdp": 31, "sdpa": [27, 31], "se": 28, "se5c7411": 30, "seamlessli": [5, 22], "search": [1, 3, 27, 31], "sec": 9, "second": [8, 16, 18, 25], "secret": 19, "section": [1, 4, 5, 11, 17, 22, 24, 27, 28, 29, 34], "see": [1, 2, 3, 8, 11, 15, 19, 25, 31, 33], "seed": [6, 9], "seed_numb": 6, "segment": [2, 5, 31], "segment_id": 15, "select": [2, 4, 5, 28, 31], "selector": 7, "self": [6, 9, 11, 19, 22], "semant": 19, "semidefinit": 28, "sep": [0, 10], "separ": [4, 8, 10, 18, 26], "seper": 33, "seq_length": 4, "sequenc": [19, 27], "sequenti": 19, "seri": [5, 14, 25, 27, 28, 31, 33], "serv": 31, "server": [16, 25], "servic": 4, "set": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 13, 16, 23, 25, 28, 30, 31, 33, 34], "set_devic": [6, 9, 16], "set_epoch": [6, 9], "set_fp32_math_mod": 2, "set_log_compon": 18, "set_log_level": 18, "set_log_output_file_path": 18, "set_log_rotate_file_s": 18, "set_log_split_file_s": 18, 
"set_properti": [4, 8], "set_source_files_properti": 8, "setup": [3, 8, 9, 16, 27], "setuptool": 5, "setvar": 6, "sever": [2, 18, 30, 36], "sgd": [2, 4, 6, 11, 16, 22, 31, 34, 36], "sgd_fused_step": 36, "sh": [6, 16, 22, 25, 28], "sha": 0, "shall": [3, 19], "shape": [2, 7, 13, 22, 27], "shard": [27, 31], "share": [1, 3, 5, 6, 7, 8, 25, 31], "shen": 28, "ship": 25, "shorten": 3, "should": [2, 3, 8, 9, 11, 18, 25, 27, 28], "show": [0, 4, 6, 7, 8, 11, 27, 28, 29, 30, 31], "showcas": [15, 28], "shown": [1, 2, 4, 6, 19, 27, 28], "shuffl": [6, 9], "si": 30, "side": [5, 7], "sigmoid": [8, 31], "sign": [15, 28], "significantli": [27, 28, 31], "silu": 31, "similar": [0, 9, 25, 31], "simpl": [2, 5, 8, 11, 19, 22, 28, 31], "simplenet": [11, 22], "simpli": [4, 8], "simplic": 28, "simultan": 34, "sinc": [2, 7, 8, 19, 20, 28, 31, 32, 33], "singl": [9, 16, 18, 27, 30, 31, 36], "single_card": 6, "single_card_dist": 6, "situat": 7, "size": [0, 2, 4, 6, 7, 8, 9, 10, 16, 18, 19, 25, 27, 28, 31, 32, 33], "size_based_auto_wrap_polici": 9, "size_t": 8, "sizeof": 0, "skip": [0, 3, 4, 19, 28, 32, 33], "sleef": 0, "slice": [4, 8, 19], "slide": 31, "slot": 30, "slow": 25, "slower": 11, "small": [2, 17], "small_pool": 2, "smaller": [0, 11, 31], "smallest": 32, "smooth_l1_loss": 11, "smoothquant": 27, "snapshot": [2, 35], "snippet": [9, 13, 29], "so": [0, 2, 4, 5, 7, 11, 16, 19, 25, 31, 35, 36], "socket": 30, "soft_margin_loss": 11, "softmax": [13, 31], "softplu": 31, "softwar": 26, "solut": [2, 9, 25, 27, 31], "solv": 36, "some": [0, 2, 3, 5, 8, 9, 11, 13, 19, 23, 25], "someth": 19, "soon": 21, "sourc": [0, 1, 3, 8, 10, 16, 22, 25, 26, 33], "space": [19, 28], "span": 31, "spars": 19, "spawn": [6, 9], "spec": 7, "special": [0, 13, 27], "specif": [1, 2, 4, 5, 7, 10, 13, 14, 16, 18, 19, 27, 31, 32, 34], "specifi": [2, 8, 9, 10, 18, 25, 33, 34], "specifii": 0, "spectr": 30, "specul": 31, "speed": [2, 5, 8, 27, 31, 32, 36], "speedup": [11, 13, 27, 31], "sphinx": 3, "spir64_gen": 33, "split": [0, 
2, 8, 10, 18, 32, 34], "split_master_weight_for_bf16": 2, "spontan": 19, "sqrt": 31, "squar": [27, 31], "src": 0, "src_data_ptr": 19, "src_md": 19, "src_mem": 19, "sse": 0, "sse2": 0, "sse3": 0, "sse4_1": 0, "sse4_2": 0, "ssse3": 0, "stabil": 11, "stabl": [5, 6, 7, 11], "stack": [11, 18], "stage": [17, 36], "stai": 28, "standard": [1, 8, 27], "start": [1, 2, 3, 4, 6, 16, 18, 31], "stat": 2, "state": [2, 5, 8, 9, 16, 27, 35], "state_dict": [2, 4, 9, 16], "state_s": 8, "statement": [0, 5], "static": [2, 5, 17, 27, 28, 31, 34], "statist": [2, 4, 17, 29], "statu": 0, "std": [0, 4, 8], "stead": 0, "step": [2, 3, 4, 6, 8, 9, 11, 13, 16, 18, 22, 25, 28, 32, 34], "step2": 13, "step3": 13, "step4": 13, "step_id": 18, "step_siz": 9, "steplr": 9, "still": [3, 5, 8, 11, 19, 31, 33], "stock": [3, 7, 10, 19, 31], "stop": 25, "storag": 36, "store": [0, 14, 19, 27, 36], "store_tru": 9, "str": [2, 6], "strategi": 8, "stream": [4, 8, 10], "strict": 4, "stride": [7, 11, 22], "stride_c": 19, "stride_h": 19, "stride_n": 19, "stride_w": 19, "string": [2, 9, 18], "structur": [1, 5, 7], "style": [2, 3, 4, 8], "sub": [18, 31], "sub_fold": 3, "subcompon": 10, "subfold": 0, "subgraph": 2, "subject": [0, 26], "submit": [1, 3, 8], "submit_barri": 10, "submodul": 3, "subsequ": [8, 19, 25], "substitut": 28, "success": 4, "successfulli": 6, "suffici": [5, 10, 28], "suffix": [0, 25, 31], "suggest": [1, 19], "suit": 3, "suitabl": 28, "sum": [8, 9, 19, 31], "summari": 2, "summit": 28, "super": [6, 9, 11, 19, 22], "suppli": [11, 19], "support": [0, 2, 3, 4, 6, 7, 8, 10, 13, 17, 18, 21, 22, 24, 25, 27, 29, 31, 33, 34, 36], "sure": [3, 6, 9, 16], "surgeon": 28, "sw": 30, "swap": [17, 25], "switch": [0, 9], "sycl": [1, 5, 7, 10, 13, 18, 31, 32, 34], "sycl_devic": 19, "sycl_queu": 8, "symbol": [25, 31], "symlink": 3, "symmetr": 17, "sync": 3, "synchron": [7, 8, 10, 16], "syngraph": 18, "sysman": 2, "system": [0, 8, 25, 31], "t": [0, 2, 3, 7, 8, 11, 19, 31], "t2": 7, "t_valu": 0, "tabl": [0, 33], "take": 
[1, 2, 8, 11, 19, 24, 28, 31], "tanh": [8, 31], "target": [0, 4, 8, 9, 16, 31, 32, 33], "target_include_directori": 8, "target_link_librari": [4, 8], "task": [2, 25, 27, 28, 31], "tdr": 25, "tdrdelai": 25, "team": [1, 3], "techniqu": [1, 2, 8], "technolog": [1, 27], "tell": 19, "templat": [8, 13, 18, 27, 31], "temporari": 8, "tenor": 19, "tensor": [0, 2, 4, 5, 7, 8, 11, 13, 17, 27, 31, 35], "tensordot": 11, "tensorflow": [16, 19], "tensoropt": 4, "teq": 28, "term": [8, 26], "termin": 25, "test": [0, 4, 9, 30, 31, 32, 33], "test_": 3, "test_batch_s": 9, "test_input": 19, "test_input_xpu": 19, "test_kwarg": 9, "test_load": 9, "test_loss": 9, "text": 33, "tf32": [2, 10], "tgi": 31, "than": [0, 2, 10, 13, 14, 17, 18, 19, 22, 25, 28, 31], "thank": 3, "thei": [8, 11, 19, 27, 28, 31], "them": [1, 3, 10, 16, 19, 25, 27, 28, 31, 34, 36], "therefor": [14, 17], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36], "thing": 25, "those": [2, 8, 16, 25, 27, 35], "thread": [1, 4, 8, 10, 30], "three": [0, 7, 27], "through": [1, 2, 4, 5, 8, 11, 22, 24, 28, 31], "throughput": [27, 31], "throw": 9, "thrown": [2, 25], "thu": [2, 8, 11, 13, 19, 31], "thudm": 27, "tiiuae": 27, "tile": [0, 5, 6, 27, 30], "time": [0, 2, 3, 5, 8, 9, 18, 19, 21, 25, 27, 28, 31, 36], "timeout": 3, "timestamp": 27, "tip": 0, "tloss": [9, 16], "tmp": 8, "to_channels_last_1d": 19, "to_dens": 19, "to_dlpack": 7, "to_mkldnn": 19, "togeth": 33, "toi": 9, "token": [28, 30, 31], "token_type_id": 15, "too": 25, "tool": 0, "toolkit": [2, 30, 31], "toolset": 0, "top": [31, 34], "toplevel": 3, "topologi": [30, 36], "torch": [1, 2, 6, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, 23, 25, 28, 29, 30, 31, 34, 36], "torch_ccl": [5, 6], "torch_check": [0, 5, 8, 18], "torch_error": [5, 18], "torch_extens": 8, "torch_extension_nam": 8, "torch_ipex": [0, 2, 28], "torch_ipex_include_dir": 8, "torch_ipex_librari": [4, 8], "torch_librari": 8, "torch_llm_allreduc": [10, 31], 
"torchdynamo": 1, "torchinductor": [5, 22], "torchscirpt": 2, "torchscript": [1, 2, 5, 23, 36], "torchvis": [4, 9], "total": [2, 8, 30, 31, 35], "totensor": [4, 9], "tpp": 27, "trace": [1, 4, 5, 10, 11, 17, 18, 23, 29], "track": [1, 2], "trade": [11, 27, 28, 31], "train": [2, 6, 9, 15, 16, 17, 19, 23, 28, 29, 31, 32, 34], "train_dataset": [4, 6, 16], "train_kwarg": 9, "train_load": [4, 6, 9, 11, 16], "train_sampl": [6, 16], "trainabl": 28, "transfer": 25, "transform": [2, 4, 9, 14, 19, 28, 30, 31], "transpar": [29, 31], "transpos": [8, 31], "tree": 3, "tri": 34, "trigger": [9, 17, 25, 29, 31, 33], "triplet_margin_loss": 11, "triton": [22, 25, 31], "true": [0, 2, 4, 6, 8, 9, 13, 15, 17, 22, 23, 28, 29, 34], "trust_remote_cod": 28, "try": [2, 3, 4, 6, 18, 25], "tune": [11, 28, 31], "tupl": [0, 2, 6], "turboboost": 30, "turn": 31, "tutori": [3, 4, 6, 8, 9, 20], "two": [2, 5, 7, 8, 15, 18, 27, 32], "txt": [3, 4, 8], "type": [0, 2, 3, 4, 5, 7, 8, 9, 17, 19, 23, 25, 28, 31, 32, 33, 34], "typenam": 8, "types": 0, "typic": [5, 7, 16, 28, 31], "u": [6, 8], "ubuntu": [25, 30, 31], "ucod": 30, "uint32_t": 0, "uint8": 17, "ultim": 8, "ultra": [27, 28, 31], "unabl": 32, "unalign": 0, "unaryop": 31, "uncas": 4, "undefin": [25, 31], "under": [2, 7, 11, 14, 19, 25, 26, 31], "undergo": 29, "underlai": 8, "underli": [0, 1, 5, 27, 35], "underneath": 31, "understand": 35, "unifi": [2, 4], "uniform": 28, "uninstal": [3, 25], "uniqu": 18, "unit": [1, 8, 31], "unlik": [4, 5, 9, 27, 28], "unlist": 11, "unload": 25, "unlock": 22, "unoccupi": 2, "unpredict": 2, "unquant": 28, "unstabl": 11, "unsupport": [25, 31], "until": 3, "untrack": 3, "unus": [2, 10, 35], "up": [2, 5, 7, 8, 9, 10, 27, 31, 32], "updat": [2, 3, 6, 9, 31, 32, 34, 36], "upgrad": 31, "uplift": 31, "upon": 28, "upper": 19, "upsampl": [13, 19], "upsampleblinear2d": 13, "upsamplenearest": 13, "upstream": 19, "url": [6, 22, 25], "us": [0, 1, 2, 3, 6, 9, 10, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 31, 32, 34, 35], 
"usag": [2, 5, 7, 11, 13, 17, 19, 20, 23, 24, 31], "use_aot_devlist": [10, 33], "use_channels_last_1d": 10, "use_ds_kernel": 10, "use_itt_annot": 10, "use_llm_runtim": 28, "use_onednn_dir": 10, "use_onemkl": [10, 25, 31], "use_optimum_format": 28, "use_override_op": 10, "use_persist_stream": 10, "use_primitive_cach": 10, "use_queue_barri": 10, "use_scratchpad_mod": 10, "use_split_fp64_loop": 10, "use_sycl_assert": 10, "use_xetla": 10, "use_xetla_src": 10, "user": [1, 2, 4, 5, 10, 12, 13, 19, 22, 31, 32, 33, 34, 35], "user_nam": 8, "usm": [4, 7], "usr": [0, 25, 31], "usual": [19, 22, 25, 27, 28], "util": [1, 4, 5, 6, 7, 8, 9, 13, 16, 18, 19, 25, 28, 33, 34], "v": [25, 31], "v0": [7, 31], "v1": 31, "v2": [22, 31], "v3": [22, 31], "v4": 30, "valid": [2, 7, 9, 10, 14, 28], "valu": [0, 2, 4, 10, 15, 25, 27, 28], "vanilla": 8, "var": [6, 16, 22, 25], "variabl": [0, 3, 5, 10, 16, 18, 23, 25], "variant": 11, "variou": [7, 22, 27, 28, 31], "vec256": 0, "vec512": 0, "vec_bia": 0, "vector": [0, 1, 2, 4, 8, 14, 19, 31], "vectors": 0, "vehicl": 31, "ver": 8, "verbos": [5, 8, 18], "veri": [3, 8, 19, 21, 25, 27, 31], "verif": 31, "verifi": [4, 22, 25, 27, 31], "version": [0, 4, 7, 8, 22, 25, 26, 31, 34, 36], "via": [3, 5, 7, 8, 10, 17, 22, 25, 28, 33, 35], "view": [17, 19, 22, 31], "view_a": 9, "virtual": 0, "virtualguardimpl": 8, "visibl": [2, 31], "vision": 4, "vl": 27, "vllm": 31, "vml": [25, 31], "vnni": [0, 1, 31], "vocab_s": 4, "void": [0, 8], "w": [19, 28], "w4g32": 28, "w8": 28, "w8a8": [27, 28], "wa": [4, 7, 8, 25, 31], "wai": [3, 8, 19, 27, 28, 36], "wait": [7, 28], "walk": 8, "want": [0, 3, 8, 10, 19, 23], "warmup": [17, 29], "warmup_data": [17, 29], "warn": [3, 18], "warp": 6, "wast": 27, "wc": 19, "we": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 17, 19, 25, 27, 28, 30, 31, 35, 36], "webpag": 31, "weight": [1, 2, 4, 6, 8, 15, 16, 17, 19, 29, 31, 32, 34], "weight_decai": [22, 36], "weight_dtyp": 28, "weightonlyquantconfig": 28, "weightonlyquantizedlinear": 28, 
"weights_prepack": 2, "well": [1, 2, 3, 4, 27, 28, 31], "were": 8, "what": [11, 31, 32, 33], "wheel": [25, 31, 33], "when": [2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 25, 27, 28, 31, 32, 33, 34, 36], "where": [2, 3, 5, 6, 7, 9, 22, 25, 31], "whether": [2, 10, 11, 19, 34], "which": [0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 15, 18, 19, 25, 27, 28, 31, 33, 35], "while": [10, 11, 17, 19, 27, 28, 31], "whl": [6, 22, 25], "who": [10, 31, 33], "whole": [17, 18, 31], "wide": [7, 28], "wider": [1, 4], "widespread": [1, 27], "width": [0, 19, 27], "window": [25, 31], "wise": [10, 28, 29, 31, 36], "wish": [3, 8, 19], "with_arg": [4, 17, 29], "within": [3, 18, 28, 29, 31, 32, 34], "without": [2, 5, 7, 11, 25, 28, 31, 32], "wn": 19, "won": [0, 2, 11], "woq": 27, "woq_quantization_config": 28, "work": [0, 2, 3, 4, 6, 8, 9, 19, 23, 25, 27, 29, 31], "work_group": 8, "workabl": 2, "workaround": 31, "worker": [5, 6, 9, 16], "workflow": [5, 17], "workgroup": 31, "workload": [1, 4, 5, 11, 17, 25, 27, 29, 31, 32], "workspac": [4, 14], "world": 6, "world_siz": [6, 9], "wors": 28, "worth": 14, "would": [0, 3, 4, 8, 13, 17, 18, 19, 25, 31], "wrap": [6, 9, 16, 34], "wrap_cpp_modul": 4, "wrapper": [6, 8], "write": [0, 5], "written": [0, 4, 31, 34], "wrong": [25, 31], "wsl2": [25, 31], "ww42": 30, "x": [0, 1, 4, 8, 9, 11, 19, 22, 24, 28, 33], "x1": 13, "x2": 13, "xcr0": 0, "xdf": 3, "xe": [13, 27, 31], "xe_hpg": 10, "xe_lpg": 10, "xelink": [10, 31], "xelta": 27, "xeon": 30, "xetla": [10, 13, 31], "xmx": [1, 24, 31], "xpu": [1, 2, 5, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 28, 29, 30, 34], "xpu_kwarg": 9, "xpu_r": 19, "xpucomputeeng": 13, "xpumalloc": 2, "xpustream": 4, "xsave": 0, "xxx": [18, 31], "y": [4, 11, 28], "ye": 3, "yield": 1, "you": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 16, 18, 19, 22, 23, 25, 27, 28, 29, 31, 32, 33, 35], "youngjoo": 28, "your": [1, 3, 4, 6, 8, 9, 11, 16, 22, 23, 25, 26, 28, 32, 33, 35], "your_generation_param": 31, "yourself": 8, "z": [4, 8], 
"ze_flat_device_hierarchi": 5, "zero": [5, 9, 25, 28, 33], "zero_grad": [4, 9, 16, 22], "zero_point": 17, "zeros_lik": 8, "zhang": 28, "zoo": 4}, "titles": ["Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc", "Intel\u00ae Extension for PyTorch*", "API Documentation", "Contribution", "Examples", "Features", "DistributedDataParallel (DDP)", "DLPack Solution", "DPC++ Extension", "Fully Sharded Data Parallel (FSDP)", "Advanced Configuration", "Auto Mixed Precision (AMP) on GPU", "Auto Channels Last", "Compute Engine (Experimental feature for debug)", "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels", "Float8 Data Type Support (Prototype)", "Horovod with PyTorch (Prototype)", "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]", "IPEX_LOG (Prototype)", "Channels Last", "Kineto Profiler", "Legacy Profiler Tool (Deprecated)", "torch.compile for GPU (Beta)", "Quick Start", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimizations Overview", "Weight-Only Quantization (Prototype)", "Transformers Optimization Frontend API", "Performance", "Releases", "Technical Details", "Ahead of Time (AOT) Compilation", "ipex.optimize Frontend API", "Memory Management", "Optimizer Fusion on GPU"], "titleterms": {"0": 31, "1": [30, 31], "10": [30, 31], "110": 31, "120": 31, "13": 31, "1550": 27, "1d": 19, "2": 31, "20": 31, "200": 31, "3": 31, "30": 31, "40": 31, "5": 31, "For": 28, "That": 19, "accessor": 8, "add": 0, "advanc": [5, 10], "ahead": [32, 33], "ai": [4, 30], "all": 19, "amp": [5, 11], "aot": [32, 33], "api": [0, 2, 5, 6, 12, 19, 24, 29, 34], "architectur": 1, "asynchron": 7, "aten": [0, 19], "auto": [5, 11, 12], "autocast": 11, "automat": 34, "b": 19, "basic": 4, "behavior": 11, "benchmark": 28, "bert": 4, "beta": [5, 22], "better": 3, "bfloat16": [4, 11], "bind": 6, "block": 19, "break": 31, "build": [0, 3, 8, 10], "c": [2, 4, 18, 19], "c10": 8, "cach": 27, "can": 11, "capsul": 7, "case": [7, 11, 
13, 33], "center": [27, 30], "chang": 31, "channel": [5, 12, 19, 34], "check": 0, "cmake": 8, "code": [0, 4], "codegen": 0, "common": 29, "compil": [0, 5, 8, 22, 32, 33], "compon": 18, "compressor": 28, "comput": [5, 13], "configur": [5, 10, 30], "contribut": 3, "conv_bn_fold": 34, "convers": 19, "convolut": 19, "correspond": 8, "coverag": 19, "cpp": 0, "cpu": [0, 19], "creat": 19, "creation": 19, "csrc": 0, "current": 8, "custom": [0, 4], "d": 19, "data": [5, 7, 9, 15, 27, 30], "ddp": 6, "debug": [0, 3, 5, 13], "deep": 27, "deepspe": [14, 29], "default": [11, 12, 19], "definit": 18, "depend": [22, 25], "deprec": 21, "design": [0, 7], "detail": 32, "determin": 19, "develop": 3, "disabl": 12, "dispatch": [0, 28], "dispatchstub": 0, "distribut": [5, 27, 29], "distributeddataparallel": 6, "dldevic": 7, "dlpack": [5, 7], "doc": 0, "document": [2, 3, 24], "dpc": [4, 5, 8], "dynam": [0, 6], "dyndisp": 0, "eas": 12, "easi": 5, "elig": 11, "enabl": 12, "engin": [5, 13], "enviorn": 18, "environ": 28, "event": 18, "exampl": [0, 4, 6, 7, 8, 9, 15, 22], "execut": [23, 28], "experiment": 13, "export": 7, "extens": [0, 1, 3, 5, 6, 8, 14, 17, 28], "featur": [0, 5, 13, 28], "fetch": 8, "fine": 27, "float16": [4, 11], "float32": [4, 11], "float8": 15, "folder": 0, "format": 19, "fp16": 29, "fp8": 15, "framework": 28, "from": 6, "frontend": [29, 34], "fsdp": [5, 9], "fulli": [5, 9], "fuse_update_step": 34, "fusion": [27, 36], "gener": [2, 25], "get": 24, "gpu": [5, 6, 9, 11, 17, 22, 27, 28, 31, 32, 36], "h": 0, "hardwar": 30, "highlight": 31, "horovod": 16, "i": 19, "imper": [4, 11, 17, 29], "implement": [0, 13], "import": 7, "infer": [4, 11, 27, 28, 29], "inferenec": 22, "initi": 28, "input": 11, "instal": [6, 16, 28], "instanc": 4, "int4": 27, "int8": 4, "intel": [0, 1, 3, 4, 6, 14, 17, 27, 28, 30], "intrin": 0, "introduct": [6, 7, 8, 9, 11, 13, 14, 18, 20, 21, 22, 24, 28, 33, 36], "ipex": [32, 34], "ipex_log": [5, 18], "ipex_simple_trac": 18, "ipex_verbos": 18, "isa": 0, "issu": 
[12, 25, 31], "jit": 8, "kernel": [0, 4, 14, 19], "kineto": [5, 20], "known": [12, 31], "kv": 27, "languag": 27, "larg": 27, "last": [5, 12, 19, 34], "launch": 6, "layout": 19, "legaci": 21, "level": [0, 18], "librari": 25, "licens": 26, "linear": [27, 28], "linear_bn_fold": 34, "link": 6, "list": 27, "llm": [27, 28, 30], "load": 28, "local": 3, "log": 18, "low": 27, "manag": [2, 32, 35], "manner": 19, "manual": 0, "matrix": 28, "matter": 19, "max": 27, "memori": [2, 19, 32, 35], "methodologi": 27, "mix": [5, 11], "mode": [4, 15, 17, 29], "model": [4, 19, 27, 28], "motiv": 8, "mpi": 6, "multipl": 13, "nativ": 19, "nchw": 19, "nchw16c": 19, "neural": 28, "nhwc": 19, "node": 6, "oneccl": 6, "onednn": 19, "onli": [6, 9, 27, 28], "op": [8, 11], "oper": [13, 15, 19, 27, 36], "optim": [4, 17, 27, 29, 32, 34, 36], "option": 28, "overview": [0, 27, 30], "parallel": [5, 9], "path": 11, "perform": [25, 30], "platform": [14, 27], "pointer": 7, "polici": [13, 27], "prebuilt": 6, "precis": [5, 11, 27], "primit": 19, "privat": 0, "process": 0, "product": 30, "profil": [5, 20, 21], "program": 7, "promot": 11, "prototyp": [5, 15, 16, 18, 28], "pseudocod": 29, "pytest": 3, "python": [4, 5, 18], "pytorch": [0, 1, 3, 6, 14, 16, 17, 19, 28], "quantiz": [2, 5, 15, 17, 27, 28], "queue": 8, "quick": 23, "recommend": 6, "refer": [4, 11, 28], "regist": 19, "releas": 31, "replac": 18, "replace_dropout_with_ident": 34, "request": 8, "requir": [0, 22, 33], "resnet50": 4, "run": [15, 28], "runtim": [6, 10, 28], "save": 28, "scale": 6, "scenario": 29, "script": 28, "segment": 27, "select": [0, 13], "set": 18, "setup": 28, "setuptool": 8, "shard": [5, 9], "simpl": 18, "singl": [4, 6], "smoothquant": 29, "softwar": 30, "solut": [5, 7], "sourc": 6, "specif": [0, 11], "split_master_weight_for_bf16": 34, "start": [23, 24], "stride": 19, "struct": 0, "stub": 0, "support": [1, 5, 11, 14, 15, 19, 28], "sycl": [4, 8], "technic": 32, "tensor": 19, "test": 3, "time": [10, 32, 33], "tip": 3, "tool": [5, 
21], "torch": [4, 5, 22], "torchscript": [4, 11, 17, 29], "train": [4, 5, 11, 22], "transform": 29, "troubleshoot": [22, 25], "tune": 27, "type": [11, 15, 27], "unit": 3, "us": [4, 5, 7, 8, 11, 12, 13, 33], "usag": [4, 6, 9, 15, 16, 18, 22, 25, 28, 29], "v2": 30, "valid": 27, "vec": 0, "version": 30, "weight": [27, 28], "what": 19, "wheel": 6, "widest": 11, "woq": 28, "write": [3, 8, 19], "xpu": [3, 4, 8, 19, 31], "xpustream": 8, "xyz": 0, "xyzkrnl": 0}}) \ No newline at end of file +Search.setIndex({"alltitles": {"1.10.200+gpu": [[31, "gpu"]], "1.13.10+xpu": [[31, "id23"]], "1.13.120+xpu": [[31, "id20"]], "2.0.110+xpu": [[31, "id17"]], "2.1.10+xpu": [[31, "id14"]], "2.1.20+xpu": [[31, "id11"]], "2.1.30+xpu": [[31, "id8"]], "2.1.40+xpu": [[31, "id5"]], "2.3.110+xpu": [[31, "id1"]], "2.5.10+xpu": [[31, "xpu"]], "API Documentation": [[2, null], [24, "api-documentation"]], "Add Custom Kernel": [[0, "add-custom-kernel"]], "Advanced Configuration": [[5, "advanced-configuration"], [10, null]], "Ahead of Time (AOT) Compilation": [[33, null]], "Ahead of Time Compilation (AOT) [GPU]": [[32, "ahead-of-time-compilation-aot-gpu"]], "Architecture": [[1, "architecture"]], "Asynchronous Programming": [[7, "asynchronous-programming"]], "Auto Channels Last": [[12, null]], "Auto Mixed Precision (AMP)": [[5, "auto-mixed-precision-amp"]], "Auto Mixed Precision (AMP) on GPU": [[11, null]], "Autocast Op Reference": [[11, "autocast-op-reference"]], "Automatic Channels Last": [[34, "automatic-channels-last"]], "BERT": [[4, "bert"], [4, "id3"], [4, "id7"], [4, "id10"], [4, "id13"], [4, "id16"]], "BFloat16": [[4, "bfloat16"], [4, "id4"]], "Basic Usage": [[4, "basic-usage"]], "Better local unit tests with pytest": [[3, "better-local-unit-tests-with-pytest"]], "Breaking Changes": [[31, "breaking-changes"], [31, "id3"]], "Build Time Configuration": [[10, "build-time-configuration"]], "Building documentation": [[3, "building-documentation"]], "Building with CMake": [[8, "building-with-cmake"]], 
"Building with setuptools": [[8, "building-with-setuptools"]], "C++": [[4, "c"]], "C++ API": [[2, "c-api"]], "CPU ISA build compiler requirement": [[0, "cpu-isa-build-compiler-requirement"]], "CPU feature check": [[0, "cpu-feature-check"]], "Channels Last": [[5, "channels-last"], [19, null]], "Channels Last 1D support on XPU": [[19, "channels-last-1d-support-on-xpu"]], "Channels Last Memory Format APIs": [[19, "channels-last-memory-format-apis"]], "Code Folder Struct": [[0, "code-folder-struct"]], "CodeGen Process": [[0, "codegen-process"]], "Compute Engine (Experimental feature for debug)": [[13, null]], "Compute Engine (Prototype feature for debug)": [[5, "compute-engine-prototype-feature-for-debug"]], "Configuration": [[30, "configuration"]], "Contributing to Intel\u00ae Extension for PyTorch*": [[3, "contributing-to-intel-extension-for-pytorch"]], "Contribution": [[3, null]], "Customize DPC++ kernels": [[4, "customize-dpc-kernels"]], "DDP Usage": [[6, "ddp-usage"]], "DDP scaling API (GPU Only)": [[6, "ddp-scaling-api-gpu-only"]], "DLDevice and data pointer": [[7, "dldevice-and-data-pointer"]], "DLPack Solution": [[5, "dlpack-solution"], [7, null]], "DPC++ Extension": [[5, "dpc-extension"], [8, null]], "Deep Fusion Policy": [[27, "deep-fusion-policy"]], "Default Precision": [[11, "default-precision"]], "Design": [[7, "design"]], "Developing Intel\u00ae Extension for PyTorch* on XPU": [[3, "developing-intel-extension-for-pytorch-on-xpu"]], "Dispatch Stub implementation: csrc/cpu/dyndisp/DispatchStub.cpp and csrc/cpu/dyndisp/DispatchStub.h": [[0, "dispatch-stub-implementation-csrc-cpu-dyndisp-dispatchstub-cpp-and-csrc-cpu-dyndisp-dispatchstub-h"]], "Distributed Inference": [[27, "distributed-inference"]], "Distributed Inference with DeepSpeed": [[29, "distributed-inference-with-deepspeed"]], "Distributed Training": [[5, "distributed-training"]], "DistributedDataParallel (DDP)": [[6, null]], "Dynamic Dispatch Design": [[0, "dynamic-dispatch-design"]], "Ease-of-use 
auto channels last API": [[12, "ease-of-use-auto-channels-last-api"]], "Easy-to-use Python API": [[5, "easy-to-use-python-api"]], "Engine Selection Policy": [[13, "engine-selection-policy"]], "Enviornment settings": [[18, "enviornment-settings"]], "Environment Setup": [[28, "environment-setup"]], "Event Log": [[18, "event-log"]], "Example": [[9, "example"]], "Example Case": [[7, "example-case"]], "Example Usage": [[22, "example-usage"]], "Example Usage (MPI launch for single node):": [[6, "example-usage-mpi-launch-for-single-node"]], "Example:": [[0, "example"], [0, "id1"]], "Examples": [[4, null]], "Execute WOQ benchmark script": [[28, "execute-woq-benchmark-script"]], "Execution": [[23, "execution"]], "Export DLPack Capsule": [[7, "export-dlpack-capsule"]], "FP16": [[29, "fp16"]], "FP8 Quantization": [[15, "fp8-quantization"]], "FP8 usage example": [[15, "fp8-usage-example"]], "FSDP Usage (GPU only)": [[9, "fsdp-usage-gpu-only"]], "Features": [[5, null]], "Fetching the corresponding sycl::queue": [[8, "fetching-the-corresponding-sycl-queue"]], "Float16": [[4, "float16"]], "Float32": [[4, "float32"], [4, "id1"]], "Float8 Data Type": [[15, "float8-data-type"]], "Float8 Data Type Support (Prototype)": [[15, null]], "Fully Sharded Data Parallel (FSDP)": [[5, "fully-sharded-data-parallel-fsdp"], [9, null]], "General": [[2, "general"]], "General Usage": [[25, "general-usage"]], "Get Started": [[24, "get-started"]], "Hardware Configuration": [[30, "hardware-configuration"]], "Highlights": [[31, "highlights"], [31, "id2"], [31, "id6"], [31, "id9"], [31, "id12"], [31, "id15"], [31, "id18"], [31, "id21"], [31, "id24"], [31, "id26"]], "Horovod with PyTorch (Prototype)": [[16, null]], "Horovod with PyTorch Usage": [[16, "horovod-with-pytorch-usage"]], "INT8": [[4, "int8"]], "IPEX_LOG (Prototype feature for debug)": [[5, "ipex-log-prototype-feature-for-debug"]], "IPEX_LOG (Prototype)": [[18, null]], "IPEX_LOG Definition": [[18, "ipex-log-definition"]], "ISA intrinics specific 
kernel example:": [[0, "isa-intrinics-specific-kernel-example"]], "Imperative Mode": [[4, "imperative-mode"], [4, "id5"], [4, "id11"], [17, "imperative-mode"]], "Imperative mode": [[29, "imperative-mode"]], "Import DLPack Capsule": [[7, "import-dlpack-capsule"]], "Inference": [[4, "inference"]], "Inference with Imperative Path": [[11, "inference-with-imperative-path"]], "Inference with TorchScript Path": [[11, "inference-with-torchscript-path"]], "Inferenece with torch.compile": [[22, "inferenece-with-torch-compile"]], "Install Horovod with PyTorch": [[16, "install-horovod-with-pytorch"]], "Install Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "install-intel-oneccl-bindings-for-pytorch"]], "Install Neural-compressor": [[28, "install-neural-compressor"]], "Install PyTorch and Intel\u00ae Extension for PyTorch*": [[6, "install-pytorch-and-intel-extension-for-pytorch"]], "Install from source": [[6, "install-from-source"]], "Installation of Intel\u00ae oneCCL Bindings for Pytorch*": [[6, "installation-of-intel-oneccl-bindings-for-pytorch"]], "Intel\u00ae AI Reference Models": [[4, "intel-ai-reference-models"]], "Intel\u00ae Extension for PyTorch*": [[1, null]], "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels": [[14, null]], "Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc": [[0, null]], "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]": [[17, null]], "Introduction": [[6, "introduction"], [7, "introduction"], [8, "introduction"], [9, "introduction"], [11, "introduction"], [13, "introduction"], [14, "introduction"], [18, "introduction"], [20, "introduction"], [21, "introduction"], [22, "introduction"], [24, null], [28, "introduction"], [33, "introduction"], [36, "introduction"]], "JIT Compiling Extensions": [[8, "jit-compiling-extensions"]], "Kernel Stub: csrc/cpu/aten/xyz.cpp and csrc/cpu/aten/xyz.h": [[0, "kernel-stub-csrc-cpu-aten-xyz-cpp-and-csrc-cpu-aten-xyz-h"]], "Kernel implementation: 
csrc/cpu/aten/kernels/xyzKrnl.cpp": [[0, "kernel-implementation-csrc-cpu-aten-kernels-xyzkrnl-cpp"]], "Kineto Profiler": [[20, null]], "Kineto Supported Profiler Tool (Prototype)": [[5, "kineto-supported-profiler-tool-prototype"]], "Known Issues": [[31, "known-issues"], [31, "id4"], [31, "id7"], [31, "id10"], [31, "id13"], [31, "id16"], [31, "id19"], [31, "id22"], [31, "id25"], [31, "id27"]], "Known issue": [[12, "known-issue"]], "LLM Inference": [[27, "llm-inference"]], "LLM Performance v2.1.10": [[30, "llm-performance-v2-1-10"]], "LLM fine-tuning on Intel\u00ae Data Center Max 1550 GPU": [[27, "llm-fine-tuning-on-intel-data-center-max-1550-gpu"]], "Large Language Models (LLM) Optimizations Overview": [[27, null]], "Legacy Profiler Tool (Deprecated)": [[21, null]], "Library Dependencies": [[25, "library-dependencies"]], "License": [[26, null]], "Linear Operator Optimization": [[27, "linear-operator-optimization"]], "Log Component": [[18, "log-component"]], "Log Level": [[18, "log-level"]], "Low Precision Data Types": [[27, "low-precision-data-types"]], "Memory Format Is All That Matters": [[19, "memory-format-is-all-that-matters"]], "Memory Management": [[35, null]], "Memory Management [GPU]": [[32, "memory-management-gpu"]], "Memory management": [[2, "memory-management"]], "Motivation and Example": [[8, "motivation-and-example"]], "Multiple Implementations Operators and Engines": [[13, "multiple-implementations-operators-and-engines"]], "Op Eligibility": [[11, "op-eligibility"]], "Op-Specific Behavior": [[11, "op-specific-behavior"]], "Operation Fusion": [[36, "operation-fusion"]], "Ops that can autocast to bfloat16": [[11, "ops-that-can-autocast-to-bfloat16"]], "Ops that can autocast to float16": [[11, "ops-that-can-autocast-to-float16"]], "Ops that can autocast to float32": [[11, "ops-that-can-autocast-to-float32"]], "Ops that promote to the widest input type": [[11, "ops-that-promote-to-the-widest-input-type"]], "Optimization Methodologies": [[27, 
"optimization-methodologies"]], "Optimizer Fusion on GPU": [[36, null]], "Optimizer Optimization [GPU]": [[32, "optimizer-optimization-gpu"]], "Overview": [[0, "overview"], [30, "overview"]], "Performance": [[30, null]], "Performance Data for Intel\u00ae AI Data Center Products": [[30, "performance-data-for-intel-ai-data-center-products"]], "Performance Issue": [[25, "performance-issue"]], "Platforms": [[27, "platforms"]], "Private Debug APIs": [[0, "private-debug-apis"]], "Pseudocode of Common Usage Scenarios": [[29, "pseudocode-of-common-usage-scenarios"]], "PyTorch Strided Layout": [[19, "pytorch-strided-layout"]], "Python": [[4, "python"]], "Quantization": [[2, "quantization"], [5, "quantization"]], "Quantize Model and Inference": [[28, "quantize-model-and-inference"]], "Quick Start": [[23, null]], "References": [[28, "references"]], "Releases": [[31, null]], "Replace IPEX_SIMPLE_TRACE": [[18, "replace-ipex-simple-trace"]], "Replace IPEX_VERBOSE": [[18, "replace-ipex-verbose"]], "Requesting the current c10::xpu::XPUStream": [[8, "requesting-the-current-c10-xpu-xpustream"]], "Required Dependencies": [[22, "required-dependencies"]], "Requirement": [[33, "requirement"]], "Resnet50": [[4, "resnet50"], [4, "id2"], [4, "id6"], [4, "id9"], [4, "id12"], [4, "id15"]], "Run Weight-Only Quantization LLM on Intel\u00ae GPU": [[28, "run-weight-only-quantization-llm-on-intel-gpu"]], "Runtime Configuration": [[10, "runtime-configuration"]], "Runtime Dynamic Linking": [[6, "runtime-dynamic-linking"]], "Save and Load Quantized Model (Optional)": [[28, "save-and-load-quantized-model-optional"]], "Segment KV Cache": [[27, "segment-kv-cache"]], "Select ISA level manually.": [[0, "select-isa-level-manually"]], "Simple Log": [[18, "simple-log"]], "Single-Instance Training": [[4, "single-instance-training"]], "SmoothQuant": [[29, "smoothquant"]], "Software Version": [[30, "software-version"]], "Support": [[1, "support"]], "Supported Framework Model Matrix": [[28, 
"supported-framework-model-matrix"]], "Supported Platform": [[14, "supported-platform"]], "Supported operators": [[15, "supported-operators"]], "Supported running mode": [[15, "supported-running-mode"]], "Technical Details": [[32, null]], "Tips": [[3, "tips"]], "Tips and Debugging": [[3, "tips-and-debugging"]], "TorchScript Mode": [[4, "torchscript-mode"], [4, "id8"], [4, "id14"], [17, "torchscript-mode"], [29, "torchscript-mode"]], "Training": [[4, "training"]], "Training Support": [[11, "training-support"]], "Training with torch.compile": [[22, "training-with-torch-compile"]], "Transformers Optimization Frontend API": [[29, null]], "Troubleshooting": [[22, "troubleshooting"], [25, null]], "Unit testing": [[3, "unit-testing"]], "Usage in C++": [[18, "usage-in-c"]], "Usage in python": [[18, "usage-in-python"]], "Usage of DDP scaling API": [[6, "usage-of-ddp-scaling-api"]], "Usage of running Weight-Only Quantization LLM For Intel\u00ae GPU": [[28, "usage-of-running-weight-only-quantization-llm-for-intel-gpu"]], "Use Case": [[7, "use-case"], [11, "use-case"], [13, "use-case"]], "Use SYCL code": [[4, "use-sycl-code"]], "Use case": [[33, "use-case"]], "Using accessors": [[8, "using-accessors"]], "Validated Models List": [[27, "validated-models-list"]], "Vec specific kernel example:": [[0, "vec-specific-kernel-example"]], "Weight Only Quantization INT4": [[27, "weight-only-quantization-int4"]], "Weight-Only Quantization (Prototype)": [[28, null]], "Weight-Only Quantization Initialization": [[28, "weight-only-quantization-initialization"]], "Weight-Only Quantization LLM features in Intel\u00ae Extension for PyTorch*": [[28, "weight-only-quantization-llm-features-in-intel-extension-for-pytorch"]], "Weight-Only Quantization Linear Dispatch": [[28, "weight-only-quantization-linear-dispatch"]], "Weight-Only Quantization Runtime": [[28, "weight-only-quantization-runtime"]], "What is Channels Last": [[19, "what-is-channels-last"]], "Writing Channels Last Kernels on CPU": [[19, 
"writing-channels-last-kernels-on-cpu"]], "Writing a DPC++ Extension": [[8, "writing-a-dpc-extension"]], "Writing documentation": [[3, "writing-documentation"]], "Writing the DPC++ Op": [[8, "writing-the-dpc-op"]], "[Recommended] Install from prebuilt wheels": [[6, "recommended-install-from-prebuilt-wheels"]], "a. Create NHWC Memory": [[19, "a-create-nhwc-memory"]], "a. NCHW (default)": [[19, "a-nchw-default"]], "a. Register Channels Last Kernel in ATen Native Manner": [[19, "a-register-channels-last-kernel-in-aten-native-manner"]], "a. tensor conversion with Channels Last 1D": [[19, "a-tensor-conversion-with-channels-last-1d"]], "a. tensor creation": [[19, "a-tensor-creation"]], "b. Create Convolution Primitive": [[19, "b-create-convolution-primitive"]], "b. NHWC": [[19, "b-nhwc"]], "b. Register oneDNN Kernel on Channels Last": [[19, "b-register-onednn-kernel-on-channels-last"]], "b. model conversion with Channels Last 1D": [[19, "b-model-conversion-with-channels-last-1d"]], "b. tensor conversion": [[19, "b-tensor-conversion"]], "c. Blocked (nChw16c, on CPU)": [[19, "c-blocked-nchw16c-on-cpu"]], "c. determine if in Channels Last 1D memory format": [[19, "c-determine-if-in-channels-last-1d-memory-format"]], "c. model conversion": [[19, "c-model-conversion"]], "conv_bn_folding": [[34, "conv-bn-folding"]], "d. 
operator coverage in PyTorch": [[19, "d-operator-coverage-in-pytorch"]], "default": [[12, "default"]], "disable": [[12, "disable"]], "enable": [[12, "enable"]], "fuse_update_step": [[34, "fuse-update-step"]], "ipex.optimize Frontend API": [[34, null]], "ipex.optimize [GPU]": [[32, "ipex-optimize-gpu"]], "linear_bn_folding": [[34, "linear-bn-folding"]], "oneDNN NHWC APIs": [[19, "onednn-nhwc-apis"]], "replace_dropout_with_identity": [[34, "replace-dropout-with-identity"]], "split_master_weight_for_bf16": [[34, "split-master-weight-for-bf16"]], "torch.compile for GPU (Beta)": [[5, "torch-compile-for-gpu-beta"], [22, null]], "torch.xpu.optimize": [[4, "torch-xpu-optimize"]]}, "docnames": ["design_doc/cpu/isa_dyndisp", "index", "tutorials/api_doc", "tutorials/contribution", "tutorials/examples", "tutorials/features", "tutorials/features/DDP", "tutorials/features/DLPack", "tutorials/features/DPC++_Extension", "tutorials/features/FSDP", "tutorials/features/advanced_configuration", "tutorials/features/amp_gpu", "tutorials/features/auto_channels_last", "tutorials/features/compute_engine", "tutorials/features/deepspeed_kernels", "tutorials/features/float8", "tutorials/features/horovod", "tutorials/features/int8_overview_xpu", "tutorials/features/ipex_log", "tutorials/features/nhwc", "tutorials/features/profiler_kineto", "tutorials/features/profiler_legacy", "tutorials/features/torch_compile_gpu", "tutorials/getting_started", "tutorials/introduction", "tutorials/known_issues", "tutorials/license", "tutorials/llm", "tutorials/llm/int4_weight_only_quantization", "tutorials/llm/llm_optimize_transformers", "tutorials/performance", "tutorials/releases", "tutorials/technical_details", "tutorials/technical_details/AOT", "tutorials/technical_details/ipex_optimize", "tutorials/technical_details/memory_management", "tutorials/technical_details/optimizer_fusion_gpu"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, 
"sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2}, "filenames": ["design_doc/cpu/isa_dyndisp.md", "index.rst", "tutorials/api_doc.rst", "tutorials/contribution.md", "tutorials/examples.md", "tutorials/features.rst", "tutorials/features/DDP.md", "tutorials/features/DLPack.md", "tutorials/features/DPC++_Extension.md", "tutorials/features/FSDP.md", "tutorials/features/advanced_configuration.md", "tutorials/features/amp_gpu.md", "tutorials/features/auto_channels_last.md", "tutorials/features/compute_engine.md", "tutorials/features/deepspeed_kernels.md", "tutorials/features/float8.md", "tutorials/features/horovod.md", "tutorials/features/int8_overview_xpu.md", "tutorials/features/ipex_log.md", "tutorials/features/nhwc.md", "tutorials/features/profiler_kineto.md", "tutorials/features/profiler_legacy.md", "tutorials/features/torch_compile_gpu.md", "tutorials/getting_started.md", "tutorials/introduction.rst", "tutorials/known_issues.md", "tutorials/license.md", "tutorials/llm.rst", "tutorials/llm/int4_weight_only_quantization.md", "tutorials/llm/llm_optimize_transformers.md", "tutorials/performance.md", "tutorials/releases.md", "tutorials/technical_details.rst", "tutorials/technical_details/AOT.md", "tutorials/technical_details/ipex_optimize.md", "tutorials/technical_details/memory_management.rst", "tutorials/technical_details/optimizer_fusion_gpu.md"], "indexentries": {"empty_cache() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.empty_cache", false]], "fp8_autocast() (in module intel_extension_for_pytorch.quantization.fp8)": [[2, "intel_extension_for_pytorch.quantization.fp8.fp8_autocast", false]], "get_fp32_math_mode() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.get_fp32_math_mode", false]], "max_memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[2, 
"intel_extension_for_pytorch.xpu.max_memory_allocated", false]], "max_memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.max_memory_reserved", false]], "mem_get_info() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.mem_get_info", false]], "memory_allocated() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_allocated", false]], "memory_reserved() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_reserved", false]], "memory_snapshot() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_snapshot", false]], "memory_stats() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_stats", false]], "memory_stats_as_nested_dict() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_stats_as_nested_dict", false]], "memory_summary() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.memory_summary", false]], "optimize() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.optimize", false]], "optimize() (in module intel_extension_for_pytorch.llm)": [[2, "intel_extension_for_pytorch.llm.optimize", false]], "reset_accumulated_memory_stats() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.reset_accumulated_memory_stats", false]], "reset_peak_memory_stats() (in module intel_extension_for_pytorch.xpu)": [[2, "intel_extension_for_pytorch.xpu.reset_peak_memory_stats", false]], "set_fp32_math_mode() (in module intel_extension_for_pytorch)": [[2, "intel_extension_for_pytorch.set_fp32_math_mode", false]], "torch_ipex::xpu::fp32_math_mode (c++ enum)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODEE", false]], "torch_ipex::xpu::fp32_math_mode::bf32 (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4BF32E", 
false]], "torch_ipex::xpu::fp32_math_mode::fp32 (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4FP32E", false]], "torch_ipex::xpu::fp32_math_mode::fp32_math_mode_max (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", false]], "torch_ipex::xpu::fp32_math_mode::fp32_math_mode_min (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", false]], "torch_ipex::xpu::fp32_math_mode::tf32 (c++ enumerator)": [[2, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4TF32E", false]], "torch_ipex::xpu::set_fp32_math_mode (c++ function)": [[2, "_CPPv4N10torch_ipex3xpu18set_fp32_math_modeE14FP32_MATH_MODE", false]]}, "objects": {"": [[2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4BF32E", "torch_ipex::xpu::BF32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4FP32E", "torch_ipex::xpu::FP32"], [2, 1, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODEE", "torch_ipex::xpu::FP32_MATH_MODE"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4BF32E", "torch_ipex::xpu::FP32_MATH_MODE::BF32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4FP32E", "torch_ipex::xpu::FP32_MATH_MODE::FP32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "torch_ipex::xpu::FP32_MATH_MODE::FP32_MATH_MODE_MAX"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "torch_ipex::xpu::FP32_MATH_MODE::FP32_MATH_MODE_MIN"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4TF32E", "torch_ipex::xpu::FP32_MATH_MODE::TF32"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MAXE", "torch_ipex::xpu::FP32_MATH_MODE_MAX"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE18FP32_MATH_MODE_MINE", "torch_ipex::xpu::FP32_MATH_MODE_MIN"], [2, 0, 1, "_CPPv4N10torch_ipex3xpu14FP32_MATH_MODE4TF32E", "torch_ipex::xpu::TF32"], [2, 2, 1, "_CPPv4N10torch_ipex3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "torch_ipex::xpu::set_fp32_math_mode"], [2, 3, 1, 
"_CPPv4N10torch_ipex3xpu18set_fp32_math_modeE14FP32_MATH_MODE", "torch_ipex::xpu::set_fp32_math_mode::mode"]], "intel_extension_for_pytorch": [[2, 4, 1, "", "get_fp32_math_mode"], [2, 4, 1, "", "optimize"], [2, 4, 1, "", "set_fp32_math_mode"]], "intel_extension_for_pytorch.llm": [[2, 4, 1, "", "optimize"]], "intel_extension_for_pytorch.quantization.fp8": [[2, 4, 1, "", "fp8_autocast"]], "intel_extension_for_pytorch.xpu": [[2, 4, 1, "", "empty_cache"], [2, 4, 1, "", "max_memory_allocated"], [2, 4, 1, "", "max_memory_reserved"], [2, 4, 1, "", "mem_get_info"], [2, 4, 1, "", "memory_allocated"], [2, 4, 1, "", "memory_reserved"], [2, 4, 1, "", "memory_snapshot"], [2, 4, 1, "", "memory_stats"], [2, 4, 1, "", "memory_stats_as_nested_dict"], [2, 4, 1, "", "memory_summary"], [2, 4, 1, "", "reset_accumulated_memory_stats"], [2, 4, 1, "", "reset_peak_memory_stats"]]}, "objnames": {"0": ["cpp", "enumerator", "C++ enumerator"], "1": ["cpp", "enum", "C++ enum"], "2": ["cpp", "function", "C++ function"], "3": ["cpp", "functionParam", "C++ function parameter"], "4": ["py", "function", "Python function"]}, "objtypes": {"0": "cpp:enumerator", "1": "cpp:enum", "2": "cpp:function", "3": "cpp:functionParam", "4": "py:function"}, "terms": {"": [2, 6, 8, 9, 11, 18, 19, 20, 25, 28, 36], "0": [0, 1, 3, 4, 6, 8, 9, 10, 11, 16, 18, 22, 25, 26, 28, 30, 36], "00000000000602e7": 0, "001": [4, 6, 11], "00978": 28, "04": [25, 30, 31], "05516": 28, "09": 0, "09557": 28, "0f": 8, "0x00007fd552f36000": 4, "0x00007fd55321b000": 4, "0x00007fd553600000": 4, "0x00007fd553a9c000": 4, "0x00007fd553b11000": 4, "0x00007fd55511d000": 4, "0x00007fd55512c000": 4, "0x00007fd57eb1d000": 4, "0x00007fd5806cc000": 4, "0x00007fd584ab0000": 4, "0x00007fd5862b0000": 4, "0x00007fd5a1a1b000": 4, "0x00007fd5a44d8000": 4, "0x00007fd5bb895000": 4, "0x00007fd5bb927000": 4, "0x1": 0, "0x2b0004b1": 30, "0x7fff": 0, "0xffff": 0, "1": [0, 1, 3, 4, 6, 7, 8, 9, 10, 11, 13, 18, 19, 22, 25, 27, 28, 34, 36], "10": [0, 3, 7, 9, 18, 
19, 22], "100": [0, 9, 16, 30, 31], "1000": 9, "10004": 28, "1020": 30, "1024": [8, 30], "1024gb": 30, "10944": 28, "11": 0, "12": [0, 31], "123": 6, "12355": 9, "127": 6, "128": [4, 9, 11, 22, 28], "128k": 27, "13": 0, "1307": 9, "13b": [27, 30, 31], "14": 9, "15": 0, "1550": [30, 31], "16": [0, 22, 30], "17": [4, 8, 30], "170": [25, 31, 33], "17323": 28, "18": [8, 31], "1d": 10, "1mb": 2, "2": [0, 1, 4, 6, 7, 8, 9, 11, 18, 19, 20, 22, 25, 26, 27, 28, 30, 32, 33, 34], "20": [13, 19, 25], "200": 28, "2019": 2, "2020": 7, "2021": [0, 31], "2022": [28, 31], "2023": [28, 30, 31], "202306": 28, "2024": [30, 31], "2025": [22, 31], "202x": 4, "2048": 8, "21": 31, "22": [25, 30], "2206": 28, "2210": 28, "224": [4, 11, 22], "23": 25, "2304190630": 30, "2306": 28, "2309": 28, "2310": 28, "25": [9, 30], "29500": 6, "2d": [19, 34], "2f": 9, "2gb": 31, "3": [0, 3, 4, 6, 8, 9, 11, 13, 18, 19, 22, 25, 27, 28, 30, 34], "3081": 9, "30b": 27, "31": 30, "32": [9, 19, 22], "33": 0, "3696": 31, "3706": 31, "3788": 31, "3796": 31, "3808": 31, "3829": 31, "3841": 31, "3882": 31, "3887": 31, "3970": 31, "3_25mhzi_quad_dameni_oam600w_ifrv2332i_pscnull_ifwi": 30, "3d": 19, "4": [6, 8, 18, 19, 22, 27, 28, 31], "40b": 27, "4280": 31, "4317": 31, "4354": 31, "4358": 31, "4361": 31, "4407": 31, "4429": 31, "4439": 31, "4450": 31, "4463": 31, "4468": 31, "4480": 31, "4495": 31, "4504": 31, "4527": 31, "4557": 31, "4558": 31, "4800": 30, "4bit": 28, "4f": 9, "4fc181b0": 30, "4k": 27, "4x": 30, "5": [0, 4, 6, 9, 13, 17, 18, 19, 20, 22, 28, 29, 30], "50": 19, "500mb": [32, 33], "512": [1, 4, 31], "56": [22, 30], "58": 0, "5b": 27, "5gb": [32, 33], "5x": 31, "6": [3, 4, 27, 30, 31], "64": [9, 11, 22, 28], "64gb": 30, "66": 0, "6b": [27, 28, 30, 31], "6f": 9, "7": [0, 7, 9], "70b": [27, 31], "736": 30, "7b": [27, 28, 30, 31], "7b1": 27, "8": [8, 15], "80": 3, "8480": 30, "86b": 30, "8b": [27, 28, 31], "9": [0, 4, 31], "91b14bf559": [22, 25], "9216": 9, "9525": 30, "997": 25, "9b": [27, 28], "A": [0, 
3, 4, 5, 17, 19, 25, 28, 31, 33], "And": [8, 31], "As": [8, 17, 25, 28, 36], "At": [0, 8, 27], "Be": 17, "But": [0, 19], "By": [0, 2, 5, 10, 34], "FOR": 15, "For": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 18, 19, 20, 23, 24, 25, 27, 31, 32, 33, 34, 35, 36], "If": [0, 2, 3, 4, 6, 7, 8, 11, 12, 13, 17, 18, 19, 22, 25, 28, 31, 32, 33, 34], "In": [0, 1, 2, 4, 5, 8, 11, 13, 19, 27, 28, 31], "It": [0, 2, 5, 6, 7, 8, 9, 10, 14, 18, 19, 23, 28, 29, 31, 32, 33], "Its": [32, 34], "NOT": [14, 19], "No": [2, 19, 25, 31], "Not": 25, "ON": [10, 30], "On": [1, 5, 15, 19, 27, 28], "One": [7, 15, 19, 36], "Or": 6, "Such": [0, 31], "The": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 36], "Then": [3, 6, 17, 25, 28, 31], "There": [18, 23, 27], "These": [1, 4, 11, 15, 25, 27, 28, 31], "To": [0, 2, 3, 4, 5, 6, 9, 16, 19, 23, 27, 28, 31], "Will": 19, "With": [1, 2, 5, 6, 8, 16, 17, 31], "_": [0, 4, 6, 7, 8, 14, 17, 18, 19, 25, 28, 31, 34], "__init__": [3, 6, 9, 11, 19, 22], "__m256i": 0, "__m512": 0, "__m512i": 0, "__main__": [6, 9], "__name__": [6, 9], "__release_lnx": 31, "_appli": 19, "_build": 3, "_c": [0, 2], "_cmp_ord_q": 0, "_convolut": 11, "_cvt_fp32_to_bf16": 0, "_distributed_c10d": 2, "_get_current_isa_level": 0, "_get_highest_binary_support_isa_level": 0, "_get_highest_cpu_support_isa_level": 0, "_glibcxx_use_cxx11_abi": 25, "_mm256_mask_storeu_epi16": 0, "_mm256_storeu_si256": 0, "_mm512_add_epi32": 0, "_mm512_and_si512": 0, "_mm512_castps_si512": 0, "_mm512_cmp_ps_mask": 0, "_mm512_cvtneps_pbh": 0, "_mm512_cvtusepi32_epi16": 0, "_mm512_loadu_p": 0, "_mm512_mask_blend_epi32": 0, "_mm512_maskz_loadu_p": 0, "_mm512_set1_epi32": 0, "_mm512_srli_epi32": 0, "_optimizer_util": 34, "_original_step": 34, "_parameter_wrapp": 34, "_recurs": 4, "_thnn_fused_gru_cel": 11, "_xpu": 8, "_znk5torch8autograd4node4nameb5cxx11ev": [25, 31], "a770": 31, "ab": [18, 31], "abbrevi": 2, "abi": [0, 4, 31], "about": [1, 2, 3, 6, 
8, 9, 31], "abov": [3, 6, 7, 9, 10, 18, 19, 27, 36], "absenc": 25, "absolut": 4, "abstract": 8, "acceler": [1, 2, 5, 15, 22, 24, 28, 31], "accept": [10, 18], "access": [7, 8, 19, 27, 31, 36], "accommod": 19, "accomplish": 16, "accord": [2, 27, 28], "accumul": 2, "accur": [11, 27, 28], "accuraci": [2, 9, 11, 27, 28, 31], "achiev": [1, 2, 4, 31], "across": [2, 5, 6, 9], "action": [9, 18], "activ": [2, 4, 15, 16, 17, 22, 25, 27, 28, 29], "active_byt": 2, "actual": [7, 8, 19, 25, 31], "ad": [2, 4, 22, 31], "adadelta": 9, "adam": 36, "adamw": [31, 36], "adaptiveaveragepoolingkrnl": 0, "add": [10, 11, 14, 16, 18, 19, 25, 31, 36], "add_": 36, "add_argu": 9, "add_execut": 4, "add_librari": 8, "addbmm": 11, "addcdiv": 11, "addcmul": 11, "addit": [0, 2, 4, 28, 31, 32, 33, 34], "addition": [22, 28], "addmm": [8, 11, 31], "addmm_": 11, "addmv": 11, "addr": 11, "address": [6, 10, 19, 25], "addtion": 0, "adjust": [6, 28], "adopt": [7, 27, 28, 31], "advanc": [1, 8, 23, 28, 31, 35], "advantag": [1, 2, 12, 19, 24, 31], "aes_ni": 0, "after": [2, 3, 4, 8, 17, 23, 25, 28, 31, 36], "afterward": 8, "again": [3, 36], "against": 4, "agnost": 8, "agre": 3, "ahead": [3, 8], "ai": [1, 10, 24, 25, 27, 31], "aka": 19, "al": 28, "algebra": [13, 27], "algorithm": [10, 15, 19, 28, 31], "alia": [2, 4, 8], "align": [0, 9, 18, 31], "align_corn": 13, "all": [0, 2, 3, 4, 6, 8, 9, 10, 11, 13, 16, 18, 25, 27, 28, 29, 30, 31, 34, 35, 36], "all_reduc": 9, "allgath": [6, 9, 16, 31], "alloc": [2, 7, 16, 18, 27, 32, 35], "allocated_byt": 2, "allow": [2, 4, 10, 11, 25, 28, 31, 32, 33], "allreduc": [2, 6, 16, 31], "alltoal": [6, 16], "almost": 19, "along": 3, "alpha": [8, 36], "alreadi": [1, 3, 4, 16, 19, 27, 32], "also": [1, 2, 4, 5, 7, 8, 10, 17, 18, 19, 25, 27, 28, 29, 31, 32, 33, 35, 36], "altern": [2, 4, 5, 6, 19], "although": 2, "alwai": [3, 11, 13, 19, 31], "amax": 2, "amc": 30, "among": [5, 7, 16], "amount": [2, 35], "amp": [4, 22, 23, 31], "amp_dtyp": 29, "amplifi": 1, "amx": [0, 1, 31], "amx_bf16": 
0, "amx_int8": 0, "amx_til": 0, "an": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 16, 17, 18, 19, 25, 27, 28, 31, 35, 36], "anaconda": 0, "analysi": 17, "ani": [0, 2, 3, 11, 18, 19, 22, 25, 31, 32], "annot": 10, "anonym": 0, "anoth": 28, "answer": 19, "anymor": 5, "aot": [8, 10, 31], "apach": [16, 26], "api": [1, 4, 8, 9, 10, 17, 22, 27, 28, 31, 32, 35], "app": 4, "appear": [25, 31], "append": 4, "appli": [2, 4, 5, 11, 16, 19, 23, 27, 28, 29, 31, 36], "applic": [1, 2, 4, 27, 28, 32, 33, 34, 35], "approach": [8, 25, 27], "appropri": 32, "ar": [0, 1, 2, 3, 4, 5, 7, 8, 10, 11, 14, 15, 16, 17, 18, 19, 22, 23, 25, 27, 28, 29, 31, 32, 33, 34, 36], "arc": [25, 27, 28, 31, 33], "architectur": [10, 27, 28], "arg": [6, 9, 16, 18, 36], "argc": 4, "argmax": 9, "argpars": 9, "argument": [6, 9, 13, 28], "argumentpars": 9, "argv": 4, "around": 8, "arrai": [19, 34], "arxiv": 28, "ask": 3, "assert": 10, "assign": 19, "associ": [8, 32], "assum": [11, 23], "asymmetr": [17, 31], "asynchron": 10, "at_dispatch_all_typ": 8, "at_dispatch_floating_typ": 8, "atan2": 11, "aten": [2, 4, 7, 8, 10, 31], "aten_cpu_cap": 0, "atendlmtensor": 7, "atenipextypexpu": 28, "ats": 33, "attempt": [18, 32], "attent": [1, 27, 28, 31], "attention_mask": 15, "attn": 31, "attribut": [2, 19], "auto": [0, 2, 4, 8, 19, 27, 31], "auto_kernel_select": 2, "autocast": [4, 22, 23], "autoclass": 3, "autofunct": 3, "autom": 11, "automat": [1, 2, 4, 5, 12, 17, 19, 25, 27, 28, 31, 32, 33], "automaticlli": 2, "automodelforcausallm": [28, 29, 31], "autoround": 28, "autotoken": 28, "avail": [0, 1, 2, 4, 5, 6, 8, 10, 13, 23, 28, 29, 31, 35], "aval": 8, "averag": [9, 16], "averagepool2d": 13, "avg_pool": 18, "avoid": [2, 3, 8, 25], "avx": [0, 1, 31], "avx2": 0, "avx256": 0, "avx2_vnni": 0, "avx512": [0, 19, 31], "avx512_4fmap": 0, "avx512_4vnniw": 0, "avx512_bf16": 0, "avx512_bitalg": 0, "avx512_bw": 0, "avx512_cd": 0, "avx512_dq": 0, "avx512_er": 0, "avx512_f": 0, "avx512_fp16": 0, "avx512_ifma": 0, "avx512_pf": 0, "avx512_vbmi": 0, 
"avx512_vbmi2": 0, "avx512_vl": 0, "avx512_vnni": 0, "avx512_vp2intersect": 0, "avx512_vpclmul": 0, "avx512_vpopcntdq": 0, "avx_vnni": 0, "awar": [19, 28], "awq": [2, 28], "b": [3, 11, 31], "b4": 30, "back": [0, 13, 19, 25, 31], "backend": [0, 1, 2, 5, 6, 7, 8, 9, 13, 16, 22, 25, 27, 28, 31, 33], "background": 8, "backpropag": 28, "backward": [4, 6, 8, 9, 11, 16, 22, 31, 34], "backwardprefetch": 9, "baddbmm": 11, "baeseong": 28, "bag": 31, "baichuan": 27, "baichuan2": [27, 31], "bandwidth": [27, 28], "barrier": 9, "base": [0, 1, 2, 3, 4, 8, 9, 10, 13, 16, 27, 28, 29, 30, 31], "basekit": [6, 16], "bash": [25, 28], "basic": [13, 31], "batch": [2, 4, 8, 9, 16, 19, 31, 32, 34], "batch_idx": [4, 9, 16], "batch_siz": [4, 6, 8, 9, 16, 19], "batchnorm": [0, 34], "batchnorm1d": 19, "beam": [27, 31], "becaus": [0, 4, 11, 19, 27, 33], "becom": 27, "been": [0, 1, 4, 5, 8, 19, 20, 22, 25, 31], "befor": [0, 1, 2, 3, 4, 5, 10, 17, 19, 22, 25, 28, 32, 33, 34], "beforehand": [28, 32, 33], "begin": [2, 3, 8], "behavior": [2, 10, 13], "being": [14, 31], "believ": [11, 19], "belong": 18, "below": [4, 6, 8, 11, 13, 14, 18, 19, 23, 25, 28, 31, 33, 36], "benchmark": [4, 30, 35], "benefici": 19, "benefit": [4, 5, 11, 17], "benifit": [32, 33], "bert": 15, "bertmodel": 4, "besid": [8, 27, 28, 31], "best": [0, 2, 11, 17, 27], "beta": 31, "better": [1, 2, 10, 13, 17, 19, 27, 28, 31, 34, 36], "between": [0, 7, 11, 27, 28], "bf16": [0, 2, 23, 27, 31, 36], "bf32": [2, 10], "bfloat16": [0, 2, 5, 13, 22, 23, 31, 34], "bia": [8, 11, 14, 19, 28, 31], "big": 19, "bigscienc": 27, "bilinear": 11, "bin": [0, 4, 25, 30, 31], "binari": [0, 3, 4, 11, 19, 32, 33], "binary_cross_entropi": 11, "binary_cross_entropy_with_logit": 11, "binaryop": 31, "bind": [5, 8, 9, 31], "bio": 30, "bit": 15, "bla": 10, "block": [2, 3, 10, 28, 31, 32, 34], "blocksiz": 28, "bloom": [27, 30], "bmm": 11, "bmp": 19, "bn": 2, "boast": 28, "bodi": 0, "bool": 2, "boost": [4, 12, 30, 31], "both": [1, 2, 4, 5, 7, 15, 17, 19, 27, 28, 
29, 31, 33, 34, 36], "bottl": 36, "bottleneck": [27, 28], "bottom": 34, "bound": [27, 31, 36], "box": 4, "brain": 28, "branch": 1, "bridg": 8, "brief": 27, "bring": [12, 17, 27, 31, 32], "broad": [12, 31], "broadcast": 16, "broadcast_optimizer_st": 16, "broadcast_paramet": 16, "broader": [28, 31], "broken": 2, "buf": 36, "buffer": 27, "bug": [1, 3, 31, 32, 33], "build": [4, 5, 16, 25, 31, 33], "build_by_per_kernel": 10, "build_conv_contigu": 10, "build_ext": 8, "build_internal_debug": 10, "build_opt_level": 10, "build_separate_op": 10, "build_with_sanit": 10, "built": [0, 8, 25, 31, 33], "bwd": 31, "byeongwook": 28, "byte": 2, "c": [0, 1, 5, 6, 8, 11, 22, 25], "c10": [0, 4], "c10d": [6, 9], "cach": [2, 3, 8, 10, 14, 18, 22, 31, 32, 35, 36], "cache_en": 23, "cache_weight_for_large_batch": 2, "cai": 28, "calcul": [1, 2, 8, 11, 18, 28, 31], "calib_dataload": 28, "calib_dataset": [17, 29], "calib_func": 28, "calibr": [2, 4, 17, 29], "calibration_data_load": 4, "call": [0, 2, 4, 6, 8, 11, 18, 19, 28, 34, 35], "can": [0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 13, 16, 17, 18, 19, 23, 25, 27, 28, 29, 31, 32, 33, 34, 35, 36], "can_cast_train": 34, "candidate_cel": 8, "cannot": [11, 19, 25, 31], "canon": 19, "capabl": [0, 5, 17, 18, 22, 28, 31, 35], "capac": [5, 13, 30], "capsule2": 7, "captur": 35, "card": [6, 10, 19, 27, 31], "care": 8, "case": [0, 2, 4, 5, 8, 9, 10, 12, 19, 28, 31], "cast": [2, 8, 11], "cat": [8, 11, 13], "catch": [4, 17, 18], "categori": [11, 14], "caus": [2, 25, 27, 31, 33], "cc": [0, 4], "ccl": [6, 9, 10, 16, 30], "ccl_blocking_wait": 10, "ccl_same_stream": 10, "cd": [3, 4], "cdist": 11, "center": [5, 14, 25, 28, 31, 33], "cerr": 4, "certain": [1, 2, 25, 28, 29], "cgf": 8, "cgh": 8, "chain_matmul": 11, "challeng": [27, 28, 31], "chang": [0, 3, 4, 6, 8, 9, 11, 16, 19, 23, 25, 28, 29], "channel": [10, 17, 31, 32], "channels_last": [4, 5, 19], "char": 4, "charact": 3, "chat": [27, 28], "chatglm3": 31, "check": [4, 5, 6, 8, 10, 13, 19, 23, 27, 29, 31, 32, 34], 
"check_contigu": 8, "check_input": 8, "check_xpu": 8, "checkpoint": [2, 4, 16], "cheng": 28, "chines": 31, "choos": [4, 5, 11, 13, 31], "chosen": [0, 11, 13], "chw": 19, "chwn": 19, "cifar10": 4, "circumst": 11, "cl": 4, "cl_device_not_found": 25, "class": [6, 9, 11, 19, 22], "classif": 9, "claus": 36, "clean": [3, 25, 31], "cleanup": 9, "client": 31, "clone": [3, 36], "close": 19, "cmake": [0, 4, 5, 31], "cmake_cxx_flag": 4, "cmake_have_libc_pthread": 4, "cmake_minimum_requir": [4, 8], "cmake_prefix_path": 8, "cmakefil": 0, "cmakelist": [4, 8], "cmdclass": 8, "cn": 6, "cnn": 19, "co": 31, "code": [1, 2, 3, 5, 8, 9, 10, 13, 16, 19, 23, 25, 26, 29, 31, 32, 33, 35, 36], "codegen": 22, "collect": [2, 4, 5, 6, 9, 10, 17, 31], "column": [4, 8], "com": [3, 6, 9, 25], "combin": [2, 17], "come": 8, "comma": [10, 33], "command": [3, 4, 6, 8, 16, 22, 25, 31], "comment": [0, 3], "commit": [3, 30], "common": [0, 14], "commun": [5, 6, 7, 9, 10, 31], "compar": [1, 2, 5, 19, 28, 31, 36], "compat": [0, 28, 31], "compens": 16, "competit": 31, "compil": [1, 3, 4, 10, 25, 31], "compile_flag": 8, "compiled_model": 22, "complet": [3, 4, 18, 19, 29, 35], "complex": [0, 27, 28], "complexdoubl": 0, "complexfloat": 0, "complic": 8, "complier": 0, "compon": [8, 10, 26, 27], "compos": [4, 9, 15], "comprehens": [1, 31, 35], "compress": [15, 28], "compression_dim": 28, "compression_dtyp": 28, "compris": 19, "comput": [2, 4, 10, 15, 16, 19, 22, 27, 28, 31, 32, 33, 34], "compute_dtyp": 28, "compute_eng": 13, "concat": [13, 19, 27], "concat_linear": 2, "concaten": [13, 27], "concept": [16, 19], "conclus": 19, "concret": 31, "conda": [25, 31], "condit": [26, 31, 34], "conf": 28, "config": [2, 4, 17], "configur": [0, 2, 4, 8, 17, 18, 23, 25, 28, 31, 33], "conflict": [0, 22, 25], "connect": 34, "conserv": 25, "consid": 8, "consider": 17, "consist": [16, 27, 31], "consol": 18, "const": [0, 4, 8], "constrain": 28, "consum": 7, "contain": [0, 3, 6, 7, 10, 13, 28], "content": [28, 29, 31], "context": 
[2, 3, 7, 8, 11, 27], "contigu": [8, 10, 19, 27, 31], "contiguous_format": 19, "continu": [5, 13, 18, 25, 31], "contribut": 22, "control": [1, 10], "conv": [2, 10, 11, 18, 31, 34], "conv1": [9, 22], "conv1d": [11, 19], "conv2": [9, 22], "conv2d": [2, 9, 11, 19, 22, 31], "conv3d": [11, 31], "conv_binari": 17, "conv_bn": 2, "conv_bn_fold": 2, "conv_relu": 17, "conv_sum_relu": 17, "conv_tbc": 11, "conv_transpose1d": 11, "conv_transpose3d": 11, "conv_unari": 17, "conveni": [8, 11], "convent": 9, "convers": [2, 4, 11, 17, 28, 31], "convert": [0, 1, 2, 4, 5, 7, 11, 12, 15, 17, 19, 28, 29, 31, 34], "convert_dtype_str2torch": 28, "convert_jit": [4, 17, 29], "convolut": [2, 5, 11, 22, 32, 34], "convolutuon": 2, "convtranspos": 34, "convtranspose2d": 2, "coo": 19, "copi": [0, 3, 5, 7, 19], "copyright": [0, 26], "core": [0, 2, 16, 22, 25, 27, 28, 30, 31], "correct": [8, 9, 19, 25], "correctli": 25, "correspond": [2, 6, 25, 28, 31], "correspondingli": 28, "corrspond": 19, "corrupt": 16, "cosine_embedding_loss": 11, "cosine_similar": 11, "cost": [2, 5, 8, 21], "could": [6, 10, 17, 18, 19, 22, 28, 29, 34], "counter": 2, "counterpart": [2, 31], "cout": 4, "cover": [19, 27, 31], "cpp": [3, 4, 8], "cpp_extens": 8, "cppsdk": 4, "cpu": [1, 2, 4, 5, 6, 23, 25, 30, 31, 34], "cpu_capability_avx512": 0, "cpu_capability_avx512_bf16": 0, "cpu_featur": 0, "cpu_feature_main": 0, "cpuid": 0, "cpuinfo": 0, "cpuoffload": 9, "creat": [2, 3, 4, 7, 8, 14, 17, 22, 29, 31, 34], "credit": 0, "criterion": [4, 11], "cross": [10, 11, 31], "cross_entropy_loss": 11, "crossentropyloss": 4, "crucial": 25, "cuda": [8, 9, 31], "cudnn": 19, "curdevid": 28, "current": [0, 1, 2, 3, 5, 7, 9, 13, 17, 18, 22, 25, 27, 28, 29, 31, 33, 34, 36], "current_devic": 2, "current_stream": 8, "custom": [1, 5, 8, 10, 13, 14, 27, 31], "cvt_fp32_to_bf16": 0, "cvt_fp32_to_bf16_kernel_fn": 0, "cvt_fp32_to_bf16_kernel_impl": 0, "cvt_fp32_to_bf16_kernel_stub": 0, "cvtfp32tobf16": 0, "cvtfp32tobf16krnl": 0, "cwd": 6, "cxx": [0, 4, 
25], "cxx11": 31, "cxx_standard": [4, 8], "d": [3, 4, 8, 11, 34], "d25": 30, "d2h": 25, "d__avx512f__": 0, "d__avx__": 0, "d_bia": 8, "d_candidate_cel": 8, "d_elu": 8, "d_gate": 8, "d_gate_weight": 8, "d_gates_": 8, "d_input": 8, "d_input_g": 8, "d_new_cel": 8, "d_old_cel": 8, "d_old_cell_": 8, "d_old_h": 8, "d_output_g": 8, "d_relu": 8, "d_sigmoid": 8, "d_tanh": 8, "d_tanh_new_cel": 8, "d_weight": 8, "d_x": 8, "dampen": 36, "data": [0, 2, 4, 6, 8, 11, 12, 14, 16, 17, 19, 23, 25, 28, 29, 31, 33, 36], "data_prepare_finish": 18, "data_ptr": 7, "data_typ": 19, "dataload": [4, 6, 9, 16], "dataset": [4, 6, 9, 16, 28], "dataset1": 9, "dataset2": 9, "datatyp": [28, 30, 31], "date": 31, "dcmake_c_compil": 8, "dcmake_cxx_compil": 8, "dcmake_prefix_path": [4, 8], "dcpmm": 30, "dcpu_cap": 0, "dcpu_capability_amx": 0, "dcpu_capability_avx2": 0, "dcpu_capability_avx512": 0, "dcpu_capability_avx512_bf16": 0, "dcpu_capability_avx512_fp16": 0, "dcpu_capability_avx512_vnni": 0, "dcpu_capability_default": 0, "ddp": [2, 5, 9, 31], "ddp_loss": 9, "ddr": 30, "deactiv": 25, "dealloc": 32, "debug": [10, 17, 18, 29], "decid": 27, "declar": [0, 8], "decltyp": 0, "decod": [27, 31], "decompress": 15, "decreas": [2, 25], "dedic": [2, 31], "deep": [6, 7, 9, 11, 13, 15, 16], "deepcopi": 2, "deepspe": [2, 10, 27, 30, 31], "def": [6, 8, 9, 11, 19, 22, 25, 31], "default": [0, 2, 4, 5, 6, 7, 9, 10, 18, 22, 25, 31, 34], "default_weight_observ": [4, 17, 29], "defaultvalu": 10, "defin": [0, 2, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 22, 29, 31, 34], "definit": [0, 2, 8], "deinit": 3, "delai": 15, "delayedsc": [2, 15], "deleg": [16, 32], "delet": 22, "delimit": 33, "deliv": 17, "deliveri": [32, 33], "demand": 5, "demonstr": [4, 7, 13, 19, 28], "demostr": 23, "dens": 19, "depend": [0, 4, 19, 31], "deploi": [27, 28, 31], "deploy": [2, 4, 28], "deployment_mod": 2, "deprec": [10, 12, 31], "dequant": [14, 31], "desc": 19, "descent": 28, "describ": [5, 6, 11, 19, 28], "descript": [2, 5, 9, 10, 18, 19, 24, 34], 
"descriptor": 31, "design": [3, 9, 11, 13, 14, 19, 28, 29, 31, 32, 34], "desir": [4, 17], "destroy_process_group": 9, "detach": 36, "detail": [0, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 17, 19, 23, 24, 27, 28, 30, 31, 34], "detect": [0, 1, 4], "determin": [0, 28], "develop": [1, 4, 8, 32, 33], "devic": [1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 16, 18, 19, 21, 22, 24, 25, 27, 28, 29, 31, 32, 33, 34], "device_count": 9, "device_id": [6, 7, 9], "device_map": 28, "devid": 16, "dgpu": 28, "diagram": 19, "dict": 2, "dictionari": 2, "differ": [0, 1, 2, 4, 6, 7, 10, 19, 27, 28], "difficult": 19, "digit": 9, "dim": [4, 7, 8, 9, 13, 19], "dimens": [2, 8, 13, 19, 34], "dimension": 8, "dir": [0, 10], "direct": 3, "directli": [2, 8, 22, 28], "directori": [1, 3, 8, 10, 25, 29, 31], "disabl": [2, 9, 18, 34], "disable_auto_channels_last": [12, 34], "disadvantag": [32, 33], "discret": [1, 24, 31], "discrete gpu": 1, "discuss": [3, 8, 19], "dispatch": [1, 8, 31], "displai": 2, "dist": [6, 9, 11], "distribut": [2, 6, 9, 10, 16, 25, 31, 32, 33], "distributeddataparallel": [9, 31], "distributedoptim": 16, "distributedsampl": [9, 16], "div": [19, 31], "divid": 34, "divis": 13, "dkm": 25, "dldevicetyp": 7, "dlmanagedtensor": 7, "dmesg": 25, "dmlc": 7, "dnn": 15, "do": [2, 3, 5, 8, 11, 19, 25, 27, 28], "doc": [1, 3, 17, 29, 31, 34], "dockerfil": 31, "docstr": 3, "document": [0, 4, 5, 8, 10, 29, 31], "doe": [7, 8, 18, 19, 25, 31, 34], "doesn": [2, 19], "domain": [7, 15], "domin": [1, 27], "don": [0, 3, 8, 11], "done": [0, 4, 9, 22], "dongsoo": 28, "dot": [11, 19, 27], "doubl": [0, 8], "down": [2, 25], "download": [4, 9, 22, 25], "downstream": 11, "dpc": [1, 7, 10, 22, 25, 31], "dpccp": 2, "dpcpp": [8, 22, 25, 31], "dpcppbuildextens": 8, "dpcppextens": 8, "dpcpproot": [22, 25], "drawback": 2, "drive": [1, 27], "driver": [5, 25, 30, 33], "dropout": [2, 9, 31, 32, 34], "dropout1": 9, "dropout2": 9, "dspevd": 31, "dst": 0, "dtype": [0, 2, 4, 11, 13, 17, 22, 23, 28, 29, 31, 34], "due": [0, 1, 11, 17, 25, 
27, 28, 31], "dummi": 10, "dump": [17, 31], "duplic": 10, "durat": 25, "dure": [2, 3, 4, 27, 28, 33, 34], "dynam": [1, 4, 15, 25, 27, 31], "e": [0, 1, 2, 4, 8, 10, 11, 17, 19, 24, 25, 27, 31, 32, 33], "e4m3": 15, "e5m2": 15, "each": [0, 2, 5, 6, 8, 9, 10, 11, 13, 16, 18, 31, 34], "eager": [1, 17, 31], "earlier": 25, "eas": [8, 19], "easi": [1, 16, 24, 31], "easier": [19, 22], "easiest": 28, "easili": 28, "ec33277": 30, "ecc": 30, "ecolog": 14, "edit": 3, "educ": 31, "effect": [0, 16, 17, 28], "effici": [1, 6, 8, 9, 15, 22, 27, 28, 31, 36], "effort": 28, "either": [2, 5, 6, 25], "elabor": 8, "elaps": 9, "elapsed_tim": 9, "element": [10, 19, 36], "eleutherai": 27, "elia": 28, "elimin": 27, "els": [0, 6, 19, 36], "elu": [8, 31], "embed": [2, 27, 31], "emerg": [1, 27], "emit": 8, "empir": 13, "empow": [5, 22], "empti": [7, 18, 19], "empty_cach": [2, 35], "emul": 31, "enabl": [1, 2, 4, 5, 6, 10, 11, 15, 17, 19, 23, 27, 28, 31, 32, 33, 34], "enable_auto_channels_last": [12, 34], "enable_tim": 9, "enable_wrap": 9, "encount": [22, 25, 32, 33], "end": [2, 4, 18, 25, 28, 31, 32, 33, 34], "endif": 0, "endl": 4, "engin": [1, 4, 19, 24, 28, 31], "enhanc": [1, 22, 27, 28, 31], "enough": [2, 25], "ensur": [4, 16, 25], "entir": [8, 27], "enum": 2, "enumer": [2, 4, 7, 9, 16], "env": [6, 16, 22, 25], "env_key1": 3, "env_key2": 3, "env_val1": 3, "env_val2": 3, "environ": [0, 3, 5, 6, 9, 10, 16, 18, 22, 23, 25, 27], "epoch": [6, 9, 16], "eq": [9, 31], "equal": [10, 25], "equival": [8, 28, 31, 36], "err": 18, "error": [2, 3, 4, 8, 9, 18, 19, 25, 28, 31], "especi": [2, 3, 8], "essenti": 8, "estim": [2, 35], "et": 28, "etc": [0, 3, 5, 14, 18], "eval": [2, 4, 9, 11, 17, 23, 29, 31], "evalu": [2, 23, 31], "even": [2, 31], "event": [2, 9], "event_id": 18, "ever": 22, "everi": [6, 27], "exactli": [6, 8], "examin": 36, "exampl": [2, 3, 5, 10, 11, 13, 16, 18, 19, 23, 24, 27, 28, 29, 31, 36], "example_ddp": 6, "example_input": [17, 29], "except": [2, 6, 27], "exchang": 31, "exclus": [6, 9, 10], 
"execut": [0, 2, 4, 5, 7, 8, 10, 11, 13, 18, 21, 25, 31, 32, 33, 34, 36], "exist": [1, 3, 17, 22, 25, 28, 31], "exit": [25, 31], "exp": [8, 31], "expand": 27, "expect": [19, 25], "expens": 19, "experi": [3, 13, 19, 27, 28, 31], "experiment": 3, "explain": [0, 19, 28], "explicit": 6, "explicitli": [2, 4, 8, 10, 11], "explor": 28, "expon": 15, "export": [10, 18, 25, 31], "export_compressed_model": 28, "expos": [5, 11], "express": 19, "ext": 4, "ext_modul": 8, "extend": [1, 5, 7, 22, 24, 25, 27, 28, 31], "extens": [2, 4, 7, 9, 10, 12, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 36], "extern": 7, "extra": [2, 6, 31], "extract": 7, "f": [3, 4, 9, 16], "f16c": 0, "f32": [0, 19], "face": 31, "facebook": 27, "facilit": 22, "fact": [8, 19], "fail": [2, 25, 31, 34], "failur": [25, 31], "falcon": [2, 27], "fall": 13, "fallback": 31, "fals": [0, 2, 4, 9, 11, 17, 19, 23, 28, 29], "famili": [2, 25, 27, 31], "familiar": [2, 8], "far": 7, "fast": [2, 8, 16, 28, 32], "faster": 11, "fastest": 0, "fatal": [18, 25, 31], "fatal_error": [4, 8], "fault": 31, "fc1": [9, 22], "fc2": [9, 22], "feasibl": [5, 9], "featur": [1, 3, 4, 10, 11, 14, 19, 22, 24, 25, 31, 32, 33, 34], "feb": 31, "fed": 6, "feed": [2, 12, 19], "feedforward": 27, "few": [3, 12, 19], "fft": 31, "fft_fft": 11, "fft_fft2": 11, "fft_fftn": 11, "fft_hfft": 11, "fft_ifft": 11, "fft_ifft2": 11, "fft_ifftn": 11, "fft_ihfft": 11, "fft_irfft": 11, "fft_irfft2": 11, "fft_irfftn": 11, "fft_rfft": 11, "fft_rfft2": 11, "fft_rfftn": 11, "field": 5, "figur": [1, 7, 27], "file": [0, 2, 3, 4, 8, 10, 11, 18, 19, 22, 25, 31, 33], "filenam": 3, "fill": 8, "filter": 7, "final": [9, 32, 33], "find": [1, 4, 7, 8, 25, 30, 31, 32], "find_packag": [4, 8], "findavx": 0, "fine": [8, 28, 31], "finer": 1, "finetun": 31, "finish": [4, 13, 18], "first": [3, 4, 8, 12, 16, 17, 22, 28, 31], "firstli": [22, 27, 28], "fit": [3, 28, 31, 32], "five": 18, "fix": [2, 3, 31], "flag": [0, 34], "flagship": [5, 22], "flash": 31, "flatten": 9, 
"flavor": 8, "flex": [5, 25, 31, 33], "float": [0, 2, 4, 5, 8, 9, 11, 15, 34], "float16": [2, 5, 22, 23, 28, 29, 31], "float32": [2, 23], "float64": 11, "flow": 7, "flush": 2, "fly": 8, "fma": 0, "fmax": 8, "fmin": 8, "fmt": 18, "fn_type": 0, "focu": [2, 19, 27, 28, 29, 31], "focus": 31, "fold": 2, "folder": [3, 22], "follow": [0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 22, 23, 25, 26, 27, 29, 31, 34], "footprint": [5, 9, 15, 27, 31], "forc": 31, "fork": 0, "format": [2, 3, 5, 6, 7, 9, 12, 13, 15, 16, 18, 28, 31, 34], "format_tag": 19, "former": [4, 8], "formerli": [5, 6, 9], "forth": [16, 31], "forward": [2, 4, 6, 8, 9, 11, 13, 19, 22, 34], "found": [1, 4, 17, 19, 25, 29, 31], "foundat": 19, "four": 18, "fp16": [0, 13, 14, 23, 27, 28, 30, 31], "fp32": [0, 2, 10, 13, 14, 17, 23, 27, 31, 36], "fp32_math_mod": 2, "fp32_math_mode_max": 2, "fp32_math_mode_min": 2, "fp32mathmod": 2, "fp64": [10, 25, 31], "fp8": [2, 5, 31], "fp8_autocast": [2, 15], "fp8_group": 2, "fp8_model": 15, "fp8_recip": [2, 15], "fp8linear": 15, "fpmath": 2, "fpmath_mod": 2, "fragment": 2, "framework": [5, 7, 10, 16, 31], "frantar": 28, "free": [2, 17, 18, 35], "freed": [2, 35], "freez": [4, 11, 22, 23], "frequenc": 30, "frobenius_norm": 11, "from": [0, 1, 2, 3, 4, 5, 7, 8, 9, 11, 15, 16, 17, 18, 19, 20, 21, 25, 27, 28, 29, 31, 32, 33, 34, 35], "from_blob": 4, "from_dlpack": 7, "from_pretrain": [4, 28, 31], "front": 28, "frontend": [1, 2, 5, 27, 28, 31], "fsdp": 31, "fsdp_main": 9, "fsdp_mnist_xpu": 9, "fsycl": [4, 8, 33], "full": [3, 8, 22, 27, 31], "fulli": [0, 3, 17, 29, 31], "fully_sharded_data_parallel": 9, "fullyshardeddataparallel": 9, "function": [0, 2, 3, 4, 5, 8, 9, 11, 17, 18, 23, 25, 27, 28, 29, 31, 36], "functool": 9, "further": [1, 2, 4, 5, 18, 19, 27, 28], "fuse": [2, 27, 28, 31, 32, 34, 36], "fuse_update_step": 2, "fusion": [1, 2, 4, 17, 22, 29, 31, 32, 34], "futur": [3, 12, 31], "fw": 30, "fwd": 31, "g": [0, 2, 10, 11, 17, 19, 25, 27, 31, 32, 33], "gain": [1, 27], 
"gamma": 9, "gate": 8, "gate_weight": 8, "gates_row": 8, "gcc": [0, 31], "ge": 31, "geglu": 14, "gelu": 31, "gemm": [19, 27, 31], "genai": [1, 27], "gener": [0, 1, 3, 4, 6, 7, 8, 13, 17, 18, 19, 22, 27, 28, 29, 31, 33, 34], "get": [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 18, 27, 31], "get_cpp_typesize_and_vecs": 0, "get_cpp_typesize_and_vecsize_kernel_fn": 0, "get_cpp_typesize_and_vecsize_kernel_impl": 0, "get_cpp_typesize_and_vecsize_kernel_stub": 0, "get_devic": 7, "get_fp32_math_mod": 2, "get_group": 8, "get_group_rang": 8, "get_local_id": 8, "get_log_compon": 18, "get_log_level": 18, "get_log_output_file_path": 18, "get_log_rotate_file_s": 18, "get_log_split_file_s": 18, "get_rank": 6, "get_world_s": 6, "getcurrentxpustream": [4, 8], "getveclength": 0, "getveclengthkrnl": 0, "girl": 28, "git": 3, "github": [1, 3, 6, 7, 9, 11, 25], "given": [2, 27], "glibcxx": 31, "glibcxx_use_cxx11_abi": 25, "glm": [27, 28], "global": [2, 6], "gnu": 0, "go": [2, 3, 4, 8, 11, 32, 33], "goal": 16, "goe": 8, "good": [1, 2, 3, 19, 27, 36], "googl": 3, "gpt": [2, 27, 28, 30, 31], "gptq": [2, 28, 31], "gpu": [1, 2, 3, 4, 8, 10, 13, 14, 15, 16, 21, 23, 24, 25, 30, 33, 34, 35], "grad": [31, 36], "grad_cel": 8, "grad_h": 8, "grade": [5, 9, 31], "gradient": [5, 9, 15, 16, 28], "grain": 1, "graph": [1, 2, 5, 11, 17, 22, 31, 34], "graph_for": [17, 29], "graph_mod": 2, "graphic": [25, 27, 28, 31, 33], "greater": [13, 25], "grep": 25, "grid": 8, "grid_sampl": 11, "group": [2, 6, 8, 9], "group_siz": 28, "gru_cel": 11, "gt": 31, "guarante": 13, "guard": 16, "guess": 31, "guid": [0, 6], "guidanc": 5, "guidelin": 19, "gunho": 28, "h": [3, 4, 8, 18, 19, 28, 31], "h2d": 25, "ha": [0, 1, 2, 4, 5, 7, 8, 13, 19, 20, 22, 25, 28, 31, 33], "had": [22, 25], "half": [0, 2], "hand": 8, "handcraft": 28, "handl": [2, 4, 7, 19, 31], "handler": 8, "handwritten": 9, "har": [5, 22], "hard": [7, 19], "hardsigmoid": 31, "hardswish": 31, "hardtanh": 31, "hardwar": [0, 1, 5, 24, 27, 31], "has_2d_block_arrai": 28, "hav": 0, 
"have": [0, 1, 2, 3, 4, 6, 7, 8, 12, 13, 17, 19, 23, 25, 26, 27, 28, 33], "hbm": 31, "he": 28, "header": [0, 8], "heavier": 27, "height": 19, "held": 2, "help": [0, 2, 3, 4, 8, 9, 13, 27, 31, 32, 33, 35], "helper": 8, "henc": [17, 31], "here": [0, 3, 6, 7, 8, 9, 11, 18, 19], "heurist": 2, "hf": 27, "hgemm_bias_wint4_arc": 28, "hgemm_int4_common_dispatch": 28, "hgemmxetla_int4": 28, "hidden": [19, 27], "hierarchi": 5, "high": [8, 27, 28, 31, 36], "higher": [0, 19, 27], "highest": 13, "highli": [8, 13, 17, 23, 27, 28, 31], "hinge_embedding_loss": 11, "histor": 2, "histori": 27, "hold": [6, 9, 19], "holder": [6, 18], "home": [8, 16, 25], "hook": 28, "horovod": [5, 25, 31], "host": [7, 10, 25, 28, 30, 31], "how": [0, 1, 4, 5, 6, 7, 8, 18, 19, 23], "howev": [2, 3, 5, 8, 10, 11, 12, 19, 27, 28, 31, 35], "hpp": 4, "html": [3, 7, 8], "http": [3, 6, 7, 8, 9, 22, 25], "hub": [27, 28], "huber_loss": 11, "hug": 31, "huggingfac": [27, 28], "human": [2, 31], "hvd": [16, 25], "hw": [19, 33], "hwc": 19, "hwio": 19, "hwn": 19, "hybrid": 31, "hypeparamet": 28, "hyper": 30, "hyperparamet": 28, "i": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 22, 23, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36], "icpx": [4, 8], "icx": [4, 8], "id": [6, 7, 9], "ideal": 13, "ideep": [0, 19], "ident": [2, 4, 19, 32, 34], "identif": [0, 4], "identifi": 18, "idx": 27, "ifwi": 30, "ignor": 34, "igpu": 28, "illustr": [5, 6, 9, 17, 19], "imag": [11, 19, 31], "immintrin": 0, "impact": 2, "imper": 5, "impl": [0, 8, 31], "implement": [1, 3, 4, 5, 6, 7, 8, 9, 19, 27, 28, 31, 36], "implicit": [2, 31], "import": [0, 1, 2, 3, 4, 6, 8, 9, 15, 16, 17, 19, 22, 23, 25, 27, 28, 29, 31], "importerror": [6, 25, 31], "impos": 28, "impress": 28, "improp": 25, "improv": [1, 11, 15, 27, 28, 31, 34], "inact": 2, "inactive_split": 2, "inactive_split_byt": 2, "inc": [0, 27], "inc_model": 28, "includ": [0, 1, 2, 4, 8, 10, 14, 23, 25, 26, 27, 28, 30, 31, 32, 34], "include_dir": 8, "include_path": 8, 
"incorrectli": [25, 31], "increas": [1, 2, 16, 25, 27, 28, 31, 32, 33, 35], "increment": 8, "inde": [8, 25], "index": [2, 3, 6, 7, 8, 9, 19, 22, 25, 27, 31], "index_put": 11, "indic": 19, "indirect": 27, "individu": [2, 3], "inductor": [5, 22, 31], "industri": [5, 9, 31], "ineffici": 8, "infer": [2, 5, 14, 15, 17, 19, 22, 23, 31, 34], "inferenc": 2, "inference_data": [17, 29], "inference_dta": [17, 29], "influenc": 27, "info": [0, 4, 17, 18], "inform": [0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 17, 18, 19, 23, 25, 27, 28, 31, 32], "ingredi": 19, "init": [3, 16], "init_end_ev": 9, "init_process_group": [6, 9], "init_start_ev": 9, "initi": [6, 9, 16], "inlin": 0, "inp": 2, "inplac": [2, 17, 19, 28, 29], "input": [0, 2, 4, 6, 8, 9, 12, 13, 17, 18, 19, 22, 28, 30, 31], "input_g": 8, "input_id": [15, 28], "input_mask": 15, "input_ptr": 4, "input_xpu": 19, "insert": [4, 17, 29], "insid": [3, 8, 18, 32, 34], "instal": [3, 4, 9, 10, 22, 23, 24, 25, 27, 31, 33], "instanc": 34, "instanti": 8, "instead": [20, 21, 29, 31, 36], "instruct": [0, 1, 3, 4, 23, 24, 25, 27, 28, 31], "int": [0, 2, 4, 6, 8, 9], "int32": 31, "int4": [2, 28, 31], "int4_fullrang": 28, "int8": [0, 1, 2, 5, 17, 28, 29, 31], "integ": [2, 28], "integar": 18, "integr": [14, 25, 31, 33], "intel": [2, 5, 7, 8, 9, 10, 12, 13, 16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 29, 31, 32, 33, 34, 36], "intel discrete gpu": 1, "intel optim": 1, "intel64": [4, 25, 31], "intel64_lin": 4, "intel_extension_for_pytorch": [0, 1, 2, 3, 4, 6, 7, 8, 9, 15, 16, 17, 19, 22, 23, 25, 28, 29, 31], "intelllvm": 4, "intel\u00ae extension for pytorch*": 1, "intend": 3, "intens": [22, 31], "intent": 3, "interest": 3, "interfac": [3, 4, 8, 19, 27, 28], "intern": [0, 2, 10, 19], "interopar": 5, "interoper": 7, "interpret": [2, 8], "intervent": 11, "intrins": 0, "introduc": [1, 8, 19, 31], "introduct": 27, "introductori": 8, "intuit": 28, "invalid": [19, 25, 31], "invoc": [1, 25, 31], "invok": [2, 4, 8, 11, 23, 25, 29, 31], "io": 7, "iostream": 4, "ipex": 
[0, 1, 2, 4, 5, 8, 10, 12, 18, 23, 27, 28, 29, 31], "ipex_declare_dispatch": 0, "ipex_define_dispatch": 0, "ipex_event_log": 18, "ipex_fp32_math_mod": 10, "ipex_fused_optimizer_list_xpu": 34, "ipex_gpu_root_dir": 10, "ipex_info_event_end": 18, "ipex_info_log": 18, "ipex_log": 10, "ipex_log_compon": [10, 18], "ipex_log_level": [10, 18], "ipex_log_output": [10, 18], "ipex_log_rotate_s": [10, 18], "ipex_log_split_s": [10, 18], "ipex_op_regist": 28, "ipex_register_dispatch": 0, "ipex_weight_convert_module_xpu": 34, "ipex_xxx_event_end": 18, "ipex_xxx_log": 18, "ipextransformerattnoptimizedint4": 28, "ipextransformerlinear": 28, "ipextransformermlpoptimizedint4": 28, "irfft": 31, "is_contigu": [8, 19], "is_contiguous_channels_last_1d": 19, "is_xpu": 8, "isa": 1, "isa_codegen": 0, "isa_nam": 0, "isacodegen": 0, "issu": [1, 3, 11, 22, 27, 28], "item": [8, 9, 16], "iter": [2, 6, 27], "its": [0, 2, 4, 6, 7, 10, 11, 13, 14, 28], "itself": [2, 19], "itt": 10, "ivalu": 4, "j": [0, 2, 27, 28, 30, 31], "ji": 28, "jit": [1, 2, 4, 11, 17, 19, 23, 29, 31, 32, 33], "job": [3, 25], "join": 9, "json": 2, "jung": 28, "jupyt": 3, "just": [2, 8, 28, 29, 31, 32, 33], "k": [2, 28], "kcpu": 0, "kdloneapi": 7, "kdlsycl": 7, "keep": [3, 19], "keepdim": [8, 9], "kei": [2, 5, 27, 31, 32], "kept": 27, "kera": 16, "kernel": [1, 2, 5, 8, 10, 13, 18, 22, 25, 27, 28, 30, 31, 34], "kernel_s": [19, 22], "kfloat": 4, "kfn": 8, "kill": 25, "killer": 25, "kim": 28, "kineto": [21, 31], "kl_div": 11, "kmd": 25, "knob": 2, "know": [3, 7, 8, 32, 33], "knowledg": 28, "known": [5, 6, 9, 27], "known_issu": 22, "kv": 31, "kv_cach": 27, "kwon": 28, "kxpu": 4, "l": 6, "l1_loss": 11, "label": [6, 11, 15, 25], "lake": 27, "lamb": 36, "lambda": 8, "landscap": [1, 27], "languag": [1, 8, 28, 29, 31], "lapack": 31, "lar": 36, "larg": [1, 2, 5, 9, 18, 25, 28, 29, 31, 36], "large_pool": 2, "last": [8, 10, 31, 32], "latenc": [27, 31], "later": 28, "latest": [1, 6, 7, 16, 22, 24, 25, 27, 30], "latter": 8, "launch": [8, 10, 
18, 27, 31], "launcher": 6, "layer": [27, 31, 32, 34], "layernorm": [13, 14, 31], "layout": [2, 5], "ld": [25, 31], "ld_preload": [25, 31], "ldd": 4, "le": 31, "lead": 27, "leak": [10, 31], "leaki": 31, "leaky_relu": 31, "learn": [6, 7, 9, 11, 15, 16], "lee": 28, "left": 27, "len": [0, 4, 9, 16], "length": [3, 30], "less": [2, 5, 10, 11, 25, 31], "let": [8, 19, 36], "level": [2, 5, 8, 10, 19, 27, 28, 31, 33], "leverag": [1, 22], "lib": [4, 10, 18, 25, 31], "libc10": 4, "libimf": 4, "libintel": 4, "libintlc": 4, "libirng": 4, "libmkl": [25, 31], "libmkl_cor": [4, 25, 31], "libmkl_gnu_thread": [4, 25], "libmkl_intel_ilp64": [25, 31], "libmkl_intel_lp64": [4, 25, 31], "libmkl_sequenti": 31, "libmkl_sycl": [4, 25, 31], "libmkl_vml_avx512": 25, "libopencl": 4, "libpytorch": 4, "libpytorch_path": 4, "librari": [0, 1, 4, 5, 6, 7, 8, 9, 10, 13, 14, 18, 22, 31], "libstdc": 25, "libsvml": 4, "libsycl": 4, "libtorch": [4, 31], "libtorch_cpu": 4, "libtriton": 25, "licens": 0, "lifecycl": [32, 33], "lighter": 11, "like": [1, 2, 3, 8, 17, 18, 25, 27, 28, 31], "limit": [3, 11, 19, 27, 28, 31], "lin": 28, "linalg_multi_dot": 11, "line": [3, 8, 19], "linear": [2, 6, 9, 11, 13, 15, 19, 22, 31, 32, 34], "linear_bn": 2, "linear_bn_fold": 2, "linear_unari": 17, "link": [0, 1, 4], "linker": [25, 31], "linux": [0, 4, 8, 25, 31], "list": [3, 4, 10, 11, 19, 24, 25, 29, 31, 33], "littl": 28, "live": 3, "ll": [3, 8], "llama": [2, 27, 31], "llama2": [27, 30], "llama3": [27, 28], "llm": [1, 2, 29, 31], "lltm": 8, "lltm_backward": 8, "lltm_backward_xpu": 8, "lltm_forward": 8, "lltm_forward_xpu": 8, "lltm_xpu": 8, "lltm_xpu_backward": 8, "lltm_xpu_backward_kernel": 8, "lltm_xpu_forward": 8, "lltm_xpu_forward_kernel": 8, "lltm_xpu_kernel": 8, "lmkl_core": [25, 31], "lmkl_intel_ilp64": [25, 31], "lmkl_sycl": [25, 31], "lmkl_tbb_thread": [25, 31], "load": [0, 1, 2, 4, 8, 25, 31, 34], "load_in_4bit": 28, "load_state_dict": 2, "loaded_model": 28, "loader": 6, "local": [6, 9, 16], "local_rank": [6, 9, 
16], "localhost": 9, "locat": [0, 3, 22, 25], "log": [4, 5, 9, 10, 31], "log_compon": 18, "log_interv": 16, "log_level": 18, "log_path": 18, "log_sigmoid": 31, "log_softmax": [9, 11], "logic": [9, 19, 25], "logutil": 18, "long": [8, 19, 27, 31], "longer": 31, "look": [3, 4, 8, 19, 28], "loop": [2, 3, 10], "lora": [27, 31], "loss": [2, 4, 6, 9, 11, 16, 19, 22, 27, 28], "loss_fn": 6, "loss_funct": 22, "lot": [25, 27, 31, 32, 33], "low": [5, 6, 8, 23, 31, 36], "low_precision_checkpoint": 2, "lower": [0, 11, 17, 27, 28, 31], "loweringexcept": 25, "lowp": 2, "lp64": 31, "lr": [4, 6, 9, 11, 22, 36], "lr_schedul": 9, "lsb": 0, "lstm": [2, 13], "lt": [30, 31], "lunar": 27, "lv": 28, "m": [6, 8, 9, 16], "m150": 33, "machin": [0, 3, 6, 28], "macro": [0, 5, 18], "made": [3, 7, 31], "mai": [0, 1, 2, 3, 7, 8, 11, 12, 13, 17, 19, 25, 28, 31], "main": [1, 3, 4, 9, 22, 27, 28], "main_work": 6, "maintain": [5, 6, 8, 9, 11, 20], "major": 28, "make": [0, 3, 4, 5, 6, 8, 9, 16, 22, 23, 27, 28, 31, 33], "make_tupl": 0, "malloc_devic": 4, "mamx": 0, "manag": [6, 8, 11, 27, 31], "mani": [3, 5, 8, 31], "manipul": 19, "mantissa": 15, "manual": [2, 5, 13, 19, 34], "manual_se": [6, 9], "map": 19, "margin_ranking_loss": 11, "mark": [4, 18, 28], "mask": [0, 31], "mask_valu": 0, "masked_lm_label": 15, "master": [2, 6, 32, 34], "master_addr": [6, 9], "master_port": [6, 9], "match": [0, 2, 11, 34], "math": [2, 5, 8, 10, 13], "matmul": [11, 14, 28, 31], "matric": [8, 28], "matrix": [1, 22, 24, 31], "mavx2": 0, "mavx512bf16": 0, "mavx512bw": 0, "mavx512dq": 0, "mavx512f": 0, "mavx512fp16": 0, "mavx512vl": 0, "mavx512vnni": 0, "max": [0, 9, 14, 25, 28, 30, 31, 33], "max_job": 25, "max_memory_alloc": [2, 35], "max_memory_reserv": [2, 35], "max_pool2d": 9, "maximum": [0, 2], "maxpool1d": 19, "maxpool2d": [13, 22], "maxpool3d": 13, "mb": [10, 18], "md": [3, 10, 19], "me": 19, "mean": [0, 2, 10, 19, 25, 27, 28, 31], "measur": 2, "mechan": [0, 1, 8, 31], "meet": [5, 15, 34], "meltdown": 30, 
"mem_get_info": [2, 35], "memori": [4, 5, 7, 8, 9, 11, 12, 15, 18, 25, 27, 28, 30, 31, 34, 36], "memory_alloc": [2, 35], "memory_format": [4, 5, 19], "memory_reserv": [2, 35], "memory_snapshot": [2, 35], "memory_stat": [2, 35], "memory_stats_as_nested_dict": 2, "memory_summari": 2, "mention": [6, 8, 18], "merg": 31, "merit": 19, "mermori": 2, "messag": [8, 10, 18, 19, 25, 31], "met": 34, "meta": [19, 27], "metavar": 9, "method": [2, 7, 8, 11, 27, 28, 31], "methodologi": [2, 4, 5, 8, 36], "metric": 2, "mfma": 0, "microsoft": 27, "might": [2, 19, 25, 36], "min": [28, 31], "min_num_param": 9, "mind": 19, "mini": [27, 28, 31], "minim": [0, 10, 27, 28], "minimum": 19, "minmax": 28, "minmaxobserv": [4, 17, 29], "minor": [6, 31], "mish": 31, "miss": [3, 25], "mitig": [28, 30], "mix": [2, 4, 8, 27, 31], "mixtur": 11, "mkdir": 4, "mkl": [4, 25, 31], "mkl_dpcpp_root": [25, 31], "mkl_lapack_dspevd": 25, "mkldnn": 19, "mkldnn_util": 19, "mllama": 2, "mlp": [14, 28], "mm": [8, 11], "mm_bias_int4": 28, "mm_qkv_out_int4": 28, "mm_silu_mul_int4": 28, "mmx": 0, "mnist": 9, "mnist_cnn": 9, "mno": 0, "mode": [1, 2, 3, 5, 10, 19, 23, 25, 31], "model": [1, 2, 5, 6, 9, 10, 11, 12, 13, 15, 16, 17, 22, 23, 25, 29, 30, 31, 34], "model_nam": 28, "model_name_or_path": [29, 31], "model_state_dict": 4, "modelimp": [17, 29], "modeljit": [4, 17, 29], "modif": [6, 9, 16, 17], "modifi": [2, 3, 16], "modul": [0, 1, 2, 4, 5, 6, 8, 9, 11, 14, 17, 19, 22, 25, 28, 29, 31, 32, 34], "modular": [2, 4], "moe": 14, "momentum": [4, 22, 36], "momentum_buffer_list": 36, "monitor": [7, 35], "more": [0, 1, 2, 3, 5, 6, 8, 9, 10, 11, 17, 23, 25, 27, 28, 31, 32, 33, 35, 36], "moreov": [1, 27, 31], "most": [5, 10, 13, 17, 25, 27, 31], "motiv": [2, 4], "move": [4, 17, 19, 23], "mp": 9, "mpi": [16, 25], "mpi_rank": 6, "mpi_world_s": 6, "mpirun": 6, "mse_loss": 11, "mseloss": 6, "mtl": 31, "much": [8, 19, 31, 36], "mul": 31, "mul_": 36, "multi": [6, 31, 33], "multi_margin_loss": 11, "multi_process_spawn": 6, 
"multidimension": 19, "multiheadattent": 27, "multilabel_margin_loss": 11, "multipl": [0, 2, 3, 5, 6, 11, 19, 22, 25, 27, 31, 33], "multiprocess": [6, 9], "must": [0, 3, 8, 33, 34, 36], "mv": 11, "mxnet": 16, "my": 19, "my_auto_wrap_polici": 9, "mykernel": 0, "n": [4, 6, 8, 9, 19], "n1": 19, "n2": 19, "name": [0, 8, 13, 25, 28], "named_paramet": 16, "namespac": [0, 4, 11], "nan": 0, "nativ": [0, 1, 5, 11, 25, 31, 36], "natur": [8, 19, 27, 28, 31], "nb": 19, "nchw": 5, "nd": 19, "nd_item": 8, "nd_rang": 8, "ndim": 7, "ne": 31, "nearest": [19, 28], "necessari": [6, 9, 16, 19, 25], "necessarili": 14, "necessit": 28, "neck": 36, "need": [0, 2, 3, 4, 5, 6, 8, 9, 16, 17, 19, 23, 29, 31, 32, 33, 34, 36], "neg": 2, "neglig": 19, "neox": 2, "nest": 2, "nesterov": 36, "net": 9, "network": [1, 5, 11, 13, 15, 31], "neural": [1, 5, 13, 15, 31], "neural_compressor": 28, "new": [0, 3, 15, 19, 22, 28, 29, 31, 32], "new_cel": 8, "new_h": 8, "newer": 1, "newkernel": 0, "newkernelkrnl": 0, "next": [3, 8, 13], "next_sentence_label": 15, "nf4": 28, "nhwc": [5, 31], "nightli": [22, 25], "ninja": 8, "nll_loss": [9, 11, 16], "nll_loss2d": 11, "nll_loss_nd": 11, "nlp": 4, "nn": [2, 4, 6, 9, 11, 13, 19, 22, 32, 34], "no_grad": [4, 9, 22, 23], "node": [30, 31], "non": [2, 3, 10, 11, 19, 28], "noncontigu": 19, "none": [2, 6, 9, 36], "norm": [14, 31], "normal": [1, 4, 9, 16, 27, 28, 32, 34], "note": [0, 2, 3, 6, 7, 8, 9, 12, 14, 16, 19, 25, 27, 28, 31, 33, 35], "noth": 2, "notic": 26, "nov": 31, "now": [2, 5, 8, 19, 22], "nproc": 9, "nuclear_norm": 11, "null": [10, 18], "num_alloc_retri": 2, "num_oom": 2, "num_replica": [9, 16], "num_work": [6, 9], "number": [1, 2, 3, 4, 6, 8, 9, 16, 18, 25, 30, 31, 36], "numer": [2, 11], "numpi": 7, "nuqmm": 28, "o": [0, 4, 6, 9, 25, 30, 31], "o0": 2, "o1": 2, "o3": 0, "oam": 30, "object": [0, 2, 4, 25, 31], "observ": [4, 12, 17, 29], "obtain": [17, 27, 28, 29], "occasion": 28, "occupi": [2, 35], "occur": [25, 28, 31], "oct": 31, "octob": 2, "oem": 30, "off": 
[10, 11, 25, 27, 28, 31], "offer": [1, 3, 28, 35], "offici": [3, 17, 20, 31], "offset": [19, 27], "old": 25, "old_cel": 8, "old_h": 8, "onc": [0, 2, 3, 13, 18, 19, 28, 33, 34], "one": [2, 3, 6, 7, 8, 10, 13, 16, 17, 18, 19, 25, 29, 31, 36], "oneapi": [4, 5, 6, 7, 9, 13, 16, 22, 25, 30, 31, 33], "oneapi_root": 6, "oneccl": [5, 9, 10, 31], "oneccl_bind_pt": 6, "oneccl_bindings_for_pytorch": [6, 9], "onednn": [0, 2, 5, 10, 13, 18, 22, 27, 31], "onednn_layout": 13, "onemkl": [10, 13, 18, 25, 31], "ones": [0, 8, 17, 28], "onli": [0, 1, 2, 3, 4, 7, 8, 10, 11, 13, 14, 16, 17, 18, 19, 23, 31, 34], "onlin": [15, 31], "onnx": 28, "oob": 31, "oom": 25, "op": [3, 9, 10, 13, 18, 22, 27, 28], "open": [1, 25, 31], "openai": 25, "opencl": 33, "oper": [1, 2, 4, 5, 7, 8, 10, 11, 17, 18, 21, 22, 28, 31, 32], "opportun": 34, "opportunit": 2, "opt": [0, 2, 6, 22, 25, 27, 30, 31], "optim": [1, 2, 5, 6, 9, 10, 11, 12, 13, 14, 16, 19, 22, 23, 24, 25, 28, 31], "optimize_lstm": 2, "optimize_transform": 31, "optimized_model": 2, "optimized_optim": 2, "optimizer_state_dict": 4, "option": [1, 2, 4, 10, 22, 25, 32, 33, 34], "optioncpu": 10, "optionexperiment": 10, "optiongpu": 10, "order": [0, 2, 7, 13, 15, 19, 25, 31], "ordered_gemm_wint4_config_set_arc": 28, "ordered_gemm_wint4_config_set_pvc": 28, "org": [8, 22, 25], "organ": 19, "origin": [0, 2, 7, 15, 16, 28, 29, 34, 36], "oserror": [25, 31], "other": [0, 2, 5, 7, 9, 11, 13, 14, 15, 16, 17, 18, 19, 23, 25, 27, 28, 31, 35, 36], "otherwis": [2, 9, 10, 28], "our": [3, 4, 8, 13, 17, 27, 28], "out": [2, 4, 7, 8, 11, 25, 28, 31, 36], "outplac": 19, "output": [2, 4, 9, 10, 11, 13, 15, 16, 18, 19, 22, 28], "output_g": 8, "output_tensor": 4, "outstand": 3, "over": [2, 3, 5, 8, 10, 11, 12, 19, 31], "overal": 17, "overcom": [27, 28], "overestim": 35, "overhead": [1, 2, 8, 10, 27, 31, 32, 36], "overrid": [10, 31], "overridden": [0, 2], "overview": [29, 31], "overwrit": 2, "own": [4, 8], "ox": 10, "p": 30, "pack": [2, 7], "packag": [1, 4, 6, 8, 9, 23, 
25, 31], "packed_accessor32": 8, "packedtensoraccessor32": 8, "packet": 2, "pad": [11, 19, 22, 31], "page": [4, 6, 9, 20, 29, 30, 31, 34], "parallel": [2, 6, 25, 27, 31], "parallel_for": 8, "param": [2, 36], "paramet": [0, 2, 4, 5, 6, 9, 11, 16, 18, 22, 27, 28, 31, 34, 36], "parameterwrapp": 34, "parent": 7, "park": 28, "pars": [7, 9, 31], "parse_arg": 9, "parser": 9, "part": [4, 8, 11, 18, 19, 32, 34], "partial": [9, 27], "particular": [3, 11, 25, 28, 29, 31], "particularli": [5, 7], "partit": 16, "pass": [0, 1, 3, 4, 8, 25, 28], "patch": 31, "path": [2, 4, 8, 9, 10, 18, 19, 22, 25, 34], "path_to_your_onemkl": 31, "pattern": [8, 17, 19, 29, 31, 35], "pdist": [11, 31], "peak": 2, "peer": 28, "peft": 31, "per": [2, 7, 8, 16, 17, 30], "per_kernel": 10, "per_tensor_symmetr": [4, 17, 29], "perchannel": [17, 29], "perf": 19, "perform": [1, 2, 4, 5, 8, 10, 11, 12, 13, 14, 17, 19, 22, 24, 27, 28, 29, 31, 34, 36], "period": 2, "permut": 31, "permutecontigu": 13, "persist": 10, "perspect": [2, 19], "pertain": 0, "phase": [2, 27], "phi": [27, 31], "phi3": 28, "pick": 3, "pin": 16, "pin_memori": [6, 9], "pip": [3, 6, 16, 22, 25, 28], "pixelshuffl": 31, "pl": 10, "place": [2, 6, 11, 18, 27], "plan": 3, "platform": [4, 7, 10, 19, 22, 25, 28, 31, 33, 34], "platinum": 30, "pleas": [2, 3, 5, 6, 9, 18, 20, 21, 22, 23, 25, 28, 31, 33, 34, 35], "plug": 8, "pmi_rank": 6, "pmi_siz": 6, "point": [2, 5, 7, 8, 11, 15, 28], "pointer": [0, 8, 25], "poisson_nll_loss": 11, "polici": [28, 31], "polymorph": 0, "pool": [2, 22, 32], "popular": [1, 7, 27, 28, 30, 31], "popup": 3, "port": [3, 6], "posit": [27, 28], "possibl": [2, 4, 7, 13, 31], "post": [3, 5, 17, 22, 27, 28, 31], "potenti": [22, 34], "pow": [11, 31], "power": [8, 15], "pr": 19, "practic": [5, 8, 27], "pragma": 0, "pre": [14, 22, 25, 27, 28, 33], "prebuilt": [25, 31, 33], "precis": [4, 6, 15, 23, 28, 30, 31], "pred": 9, "predefin": 2, "prefer": [1, 13], "prefetchw": 0, "prefetchwt1": 0, "prefil": [2, 27], "prefix": 8, "preload": 
[25, 31], "prelu": 11, "prepack": [2, 19, 27], "prepar": [17, 29], "prepare_data": 18, "prepare_fp8": 15, "prepare_jit": [4, 17, 29], "preprint": 28, "prerequisit": [3, 4], "present": 28, "preserv": [27, 28], "prevent": [16, 36], "previou": [18, 19, 22, 31], "previous": 8, "primari": 31, "primit": [2, 10, 31], "principl": [16, 19], "print": [0, 4, 6, 9, 16, 17, 19, 22, 29], "printout": 2, "prior": [2, 23], "prioriti": 13, "probabl": [7, 9, 25], "problem": [25, 36], "procedur": [25, 28], "process": [4, 5, 6, 8, 9, 13, 15, 16, 25, 27, 28, 31, 34], "processgroup": [2, 6, 9], "processor": [31, 36], "produc": [3, 7, 8, 11, 35], "product": [1, 27, 31, 32, 33], "profil": 31, "program": [1, 2], "progress": [25, 27], "project": [1, 4, 8], "prompt": [27, 28], "properti": [4, 8], "propos": [3, 19, 27, 28], "prototyp": [2, 10, 31, 33], "provid": [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 13, 14, 18, 24, 25, 27, 28, 29, 31, 33, 34, 35, 36], "pseudo": 36, "pt": [4, 9, 28], "pth": 4, "pthread": 4, "pti": 31, "ptr": 31, "public": [8, 31], "publicli": 31, "pull": 3, "purpos": [0, 8], "push_back": 4, "put": [6, 7, 9], "pvc": [30, 33], "py": [3, 6, 8, 9, 10], "pybind11": 8, "pybind11_modul": 8, "pyi": 3, "python": [0, 1, 2, 3, 6, 7, 8, 9, 10, 16, 27, 28, 29, 31, 32, 34], "python_include_dir": 8, "pytorch": [2, 4, 5, 7, 8, 9, 10, 11, 12, 18, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 35, 36], "qconfig": [4, 17, 29], "qconfig_summary_fil": 2, "qkv": 28, "qmodel": 28, "qscheme": [4, 17, 29], "qualiti": [27, 28, 31, 32, 33], "quant": 2, "quant_method": 2, "quantiz": [1, 4, 14, 29, 31], "quantizaiton": 15, "quantizat": 2, "quantization_config": [2, 28], "quantize_jit": [4, 17, 29], "quantwrapp": [17, 29], "queri": [0, 19], "question": 19, "queue": [5, 10, 18], "quick": [1, 24], "quint8": 4, "quit": [0, 28, 31], "qwen": [27, 28, 31], "qwen2": [27, 28], "r": [3, 30], "rais": [2, 6], "ram": 25, "rand": [4, 11, 19, 22], "randint": 4, "randn": [6, 13, 19], "random": [9, 16], "rang": [1, 6, 
8, 9, 15, 16, 17, 29], "rank": [5, 6, 9, 16, 31], "ransform": 28, "rate": [9, 16, 31], "rather": [2, 18, 19], "re": [3, 6, 8, 11], "reach": 31, "read": [3, 8, 28, 36], "readabl": [2, 8], "readi": 7, "readm": 3, "real": [2, 4, 6], "realli": [3, 8], "realloc": 18, "reason": 19, "rebas": 3, "reboot": 25, "receiv": [2, 25, 31], "recent": [19, 28], "recip": [2, 31], "reciproc": 11, "recogn": 7, "recommend": [1, 4, 12, 13, 17, 23, 25, 28, 31], "record": [9, 18], "record_avg_pool": 18, "recurs": 3, "reduc": [1, 2, 5, 9, 15, 27, 28, 31, 36], "reduce_rang": [4, 17, 29], "reduceop": 9, "reducescatt": [9, 31], "reduct": 9, "redund": 31, "refer": [0, 1, 2, 6, 8, 9, 10, 12, 13, 19, 20, 22, 23, 24, 27, 31, 33, 34], "referenc": 34, "regard": [6, 19], "regardless": 11, "region": [0, 11], "regist": [0, 1, 28, 31], "registri": 25, "regress": [3, 12], "regular": 4, "reinstal": [3, 25], "reinterpret": 19, "reinterpret_cast": 0, "relat": [0, 7, 9, 17, 18, 22, 25], "releas": [0, 1, 2, 6, 12, 14, 19, 20, 25, 33, 35], "reli": [7, 19], "relianc": 28, "reload": 8, "relu": [9, 19, 22, 31], "remain": [28, 36], "remark": [27, 28, 31], "rememb": 22, "remov": [2, 3, 25, 31], "renorm": 11, "reorder": [2, 19, 27], "reorder_cach": 27, "repeat": 19, "repeatedli": 3, "replac": [2, 3, 6, 9, 17, 28, 31, 32, 34], "replace_dropout_with_ident": 2, "replic": 6, "replica": [5, 6, 9], "repo": [3, 6], "repo_url": 6, "report": [0, 1, 18, 25], "repositori": 6, "repres": [3, 6, 18, 31], "represent": 19, "request": [1, 2, 3, 32], "requir": [2, 3, 4, 6, 7, 8, 10, 11, 13, 15, 17, 19, 23, 25, 27, 28, 29, 31, 34], "reserv": [2, 18, 32], "reserved_byt": 2, "reset": [2, 14], "reset_accumulated_memory_stat": 2, "reset_peak_memory_stat": 2, "reset_peak_stat": 2, "reshap": 8, "resid": 6, "residu": 14, "resiz": 4, "resnet50": 10, "resnet50_weight": 4, "resolv": [25, 31], "resourc": [27, 28, 31], "respons": [7, 17, 27, 31], "restor": 16, "result": [1, 2, 8, 19], "retak": 14, "retri": 2, "retriev": 8, "return": [0, 2, 4, 6, 
8, 9, 11, 19, 22], "return_tensor": 28, "reus": 2, "review": 28, "rf": [3, 22], "rfc": 19, "rh": 0, "right": [8, 23, 27], "rm": [3, 14, 22, 31], "rmsnorm": 27, "root": [0, 4, 6, 22, 25, 27, 31], "root_rank": 16, "rope": 27, "rotari": 27, "rotat": [10, 18], "roughli": 19, "round": [28, 31], "rounding_bia": 0, "rst": 3, "rtn": 28, "run": [2, 3, 4, 5, 6, 8, 9, 10, 11, 22, 23, 25, 31, 32, 33, 34], "run_benchmark_woq": 28, "rune": 6, "runtim": [0, 1, 2, 5, 7, 8, 9, 11, 18, 23, 25, 31, 33], "runtimeerror": 25, "sacrif": 11, "safe": 7, "same": [0, 6, 8, 9, 18, 19, 27, 31], "sampl": [0, 2, 6, 12], "sample_input": [2, 12], "sampler": [6, 9, 16], "sampler1": 9, "sampler2": 9, "sanit": 10, "save": [2, 3, 4, 9, 15, 16, 19, 31, 34], "save_model": 9, "save_pretrain": 28, "saved_dir": 28, "scalar": 31, "scalar_t": 8, "scalartyp": 0, "scalartypetocpptyp": 0, "scale": [2, 5, 9, 10, 15, 16, 27, 28, 31], "scale_dtyp": 28, "scaled_dot_product_attent": 11, "scatter_add": 11, "scenario": [2, 7, 14, 17, 28, 31], "schedul": [1, 9], "scope": 11, "scratchpad": 10, "script": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 16, 17, 22, 27, 29, 34], "scriptmodul": [17, 29], "sdp": 31, "sdpa": [27, 31], "se": 28, "se5c7411": 30, "seamlessli": [5, 22], "search": [1, 3, 27, 31], "sec": 9, "second": [8, 16, 18, 25], "secret": 19, "section": [1, 4, 5, 11, 17, 22, 24, 27, 28, 29, 34], "see": [1, 2, 3, 8, 11, 15, 19, 25, 31, 33], "seed": [6, 9], "seed_numb": 6, "segment": [2, 5, 31], "segment_id": 15, "select": [2, 4, 5, 28, 31], "selector": 7, "self": [6, 9, 11, 19, 22], "semant": 19, "semidefinit": 28, "sep": [0, 10], "separ": [4, 8, 10, 18, 26], "seper": 33, "seq_length": 4, "sequenc": [19, 27], "sequenti": 19, "seri": [5, 14, 25, 27, 28, 31, 33], "serv": 31, "server": [16, 25], "servic": 4, "set": [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 13, 16, 23, 25, 28, 30, 31, 33, 34], "set_devic": [6, 9, 16], "set_epoch": [6, 9], "set_fp32_math_mod": 2, "set_log_compon": 18, "set_log_level": 18, "set_log_output_file_path": 18, 
"set_log_rotate_file_s": 18, "set_log_split_file_s": 18, "set_properti": [4, 8], "set_source_files_properti": 8, "setup": [3, 8, 9, 16, 27], "setuptool": 5, "setvar": 6, "sever": [2, 18, 30, 36], "sgd": [2, 4, 6, 11, 16, 22, 31, 34, 36], "sgd_fused_step": 36, "sh": [6, 16, 22, 25, 28], "sha": 0, "shall": [3, 19], "shape": [2, 7, 13, 22, 27], "shard": [27, 31], "share": [1, 3, 5, 6, 7, 8, 25, 31], "shen": 28, "ship": 25, "shorten": 3, "should": [2, 3, 8, 9, 11, 18, 25, 27, 28], "show": [0, 4, 6, 7, 8, 11, 27, 28, 29, 30, 31], "showcas": [15, 28], "shown": [1, 2, 4, 6, 19, 27, 28], "shuffl": [6, 9], "si": 30, "side": [5, 7], "sigmoid": [8, 31], "sign": [15, 28], "significantli": [27, 28, 31], "silu": 31, "similar": [0, 9, 25, 31], "simpl": [2, 5, 8, 11, 19, 22, 28, 31], "simplenet": [11, 22], "simpli": [4, 8], "simplic": 28, "simultan": 34, "sinc": [2, 7, 8, 19, 20, 28, 31, 32, 33], "singl": [9, 16, 18, 27, 30, 31, 36], "single_card": 6, "single_card_dist": 6, "situat": 7, "size": [0, 2, 4, 6, 7, 8, 9, 10, 16, 18, 19, 25, 27, 28, 31, 32, 33], "size_based_auto_wrap_polici": 9, "size_t": 8, "sizeof": 0, "skip": [0, 3, 4, 19, 28, 32, 33], "sleef": 0, "slice": [4, 8, 19], "slide": 31, "slot": 30, "slow": 25, "slower": 11, "small": [2, 17], "small_pool": 2, "smaller": [0, 11, 31], "smallest": 32, "smooth_l1_loss": 11, "smoothquant": 27, "snapshot": [2, 35], "snippet": [9, 13, 29], "so": [0, 2, 4, 5, 7, 11, 16, 19, 22, 25, 31, 35, 36], "socket": 30, "soft_margin_loss": 11, "softmax": [13, 31], "softplu": 31, "softwar": 26, "solut": [2, 9, 25, 27, 31], "solv": 36, "some": [0, 2, 3, 5, 8, 9, 11, 13, 19, 23, 25], "someth": 19, "soon": 21, "sourc": [0, 1, 3, 8, 10, 16, 22, 25, 26, 33], "space": [19, 28], "span": 31, "spars": 19, "spawn": [6, 9], "spec": 7, "special": [0, 13, 27], "specif": [1, 2, 4, 5, 7, 10, 13, 14, 16, 18, 19, 27, 31, 32, 34], "specifi": [2, 8, 9, 10, 18, 25, 33, 34], "specifii": 0, "spectr": 30, "specul": 31, "speed": [2, 5, 8, 27, 31, 32, 36], "speedup": 
[11, 13, 27, 31], "sphinx": 3, "spir64_gen": 33, "split": [0, 2, 8, 10, 18, 32, 34], "split_master_weight_for_bf16": 2, "spontan": 19, "sqrt": 31, "squar": [27, 31], "src": 0, "src_data_ptr": 19, "src_md": 19, "src_mem": 19, "sse": 0, "sse2": 0, "sse3": 0, "sse4_1": 0, "sse4_2": 0, "ssse3": 0, "stabil": 11, "stabl": [5, 6, 7, 11], "stack": [11, 18], "stage": [17, 36], "stai": 28, "standard": [1, 8, 27], "start": [1, 2, 3, 4, 6, 16, 18, 31], "stat": 2, "state": [2, 5, 8, 9, 16, 27, 35], "state_dict": [2, 4, 9, 16], "state_s": 8, "statement": [0, 5], "static": [2, 5, 17, 27, 28, 31, 34], "statist": [2, 4, 17, 29], "statu": 0, "std": [0, 4, 8], "stead": 0, "step": [2, 3, 4, 6, 8, 9, 11, 13, 16, 18, 22, 25, 28, 32, 34], "step2": 13, "step3": 13, "step4": 13, "step_id": 18, "step_siz": 9, "steplr": 9, "still": [3, 5, 8, 11, 19, 31, 33], "stock": [3, 7, 10, 19, 31], "stop": 25, "storag": 36, "store": [0, 14, 19, 27, 36], "store_tru": 9, "str": [2, 6], "strategi": 8, "stream": [4, 8, 10], "strict": 4, "stride": [7, 11, 22], "stride_c": 19, "stride_h": 19, "stride_n": 19, "stride_w": 19, "string": [2, 9, 18], "structur": [1, 5, 7], "style": [2, 3, 4, 8], "sub": [18, 31], "sub_fold": 3, "subcompon": 10, "subfold": 0, "subgraph": 2, "subject": [0, 26], "submit": [1, 3, 8], "submit_barri": 10, "submodul": 3, "subsequ": [8, 19, 25], "substitut": 28, "success": 4, "successfulli": 6, "suffici": [5, 10, 28], "suffix": [0, 25, 31], "suggest": [1, 19], "suit": 3, "suitabl": 28, "sum": [8, 9, 19, 31], "summari": 2, "summit": 28, "super": [6, 9, 11, 19, 22], "suppli": [11, 19], "support": [0, 2, 3, 4, 6, 7, 8, 10, 13, 17, 18, 21, 22, 24, 25, 27, 29, 31, 33, 34, 36], "sure": [3, 6, 9, 16], "surgeon": 28, "sw": 30, "swap": [17, 25], "switch": [0, 9], "sycl": [1, 5, 7, 10, 13, 18, 31, 32, 34], "sycl_devic": 19, "sycl_queu": 8, "symbol": [25, 31], "symlink": 3, "symmetr": 17, "sync": 3, "synchron": [7, 8, 10, 16], "syngraph": 18, "sysman": 2, "system": [0, 8, 25, 31], "t": [0, 2, 3, 7, 
8, 11, 19, 31], "t2": 7, "t_valu": 0, "tabl": [0, 33], "take": [1, 2, 8, 11, 19, 24, 28, 31], "tanh": [8, 31], "target": [0, 4, 8, 9, 16, 31, 32, 33], "target_include_directori": 8, "target_link_librari": [4, 8], "task": [2, 25, 27, 28, 31], "tdr": 25, "tdrdelai": 25, "team": [1, 3], "techniqu": [1, 2, 8], "technolog": [1, 27], "tell": 19, "templat": [8, 13, 18, 27, 31], "temporari": 8, "tenor": 19, "tensor": [0, 2, 4, 5, 7, 8, 11, 13, 17, 27, 31, 35], "tensordot": 11, "tensorflow": [16, 19], "tensoropt": 4, "teq": 28, "term": [8, 26], "termin": 25, "test": [0, 4, 9, 30, 31, 32, 33], "test_": 3, "test_batch_s": 9, "test_input": 19, "test_input_xpu": 19, "test_kwarg": 9, "test_load": 9, "test_loss": 9, "text": 33, "tf32": [2, 10], "tgi": 31, "than": [0, 2, 10, 13, 14, 17, 18, 19, 22, 25, 28, 31], "thank": 3, "thei": [8, 11, 19, 22, 27, 28, 31], "them": [1, 3, 10, 16, 19, 25, 27, 28, 31, 34, 36], "therefor": [14, 17], "thi": [0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 16, 17, 18, 19, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 36], "thing": 25, "those": [2, 8, 16, 25, 27, 35], "thread": [1, 4, 8, 10, 30], "three": [0, 7, 27], "through": [1, 2, 4, 5, 8, 11, 22, 24, 28, 31], "throughput": [27, 31], "throw": 9, "thrown": [2, 25], "thu": [2, 8, 11, 13, 19, 31], "thudm": 27, "tiiuae": 27, "tile": [0, 5, 6, 27, 30], "time": [0, 2, 3, 5, 8, 9, 18, 19, 21, 25, 27, 28, 31, 36], "timeout": 3, "timestamp": 27, "tip": 0, "tloss": [9, 16], "tmp": 8, "to_channels_last_1d": 19, "to_dens": 19, "to_dlpack": 7, "to_mkldnn": 19, "togeth": 33, "toi": 9, "token": [28, 30, 31], "token_type_id": 15, "too": 25, "tool": 0, "toolkit": [2, 30, 31], "toolset": 0, "top": [31, 34], "toplevel": 3, "topologi": [30, 36], "torch": [1, 2, 6, 7, 8, 9, 10, 11, 13, 16, 17, 18, 19, 20, 23, 25, 28, 29, 30, 31, 34, 36], "torch_ccl": [5, 6], "torch_check": [0, 5, 8, 18], "torch_error": [5, 18], "torch_extens": 8, "torch_extension_nam": 8, "torch_ipex": [0, 2, 28], "torch_ipex_include_dir": 8, 
"torch_ipex_librari": [4, 8], "torch_librari": 8, "torch_llm_allreduc": [10, 31], "torchdynamo": 1, "torchinductor": [5, 22], "torchscirpt": 2, "torchscript": [1, 2, 5, 23, 36], "torchvis": [4, 9], "total": [2, 8, 30, 31, 35], "totensor": [4, 9], "tpp": 27, "trace": [1, 4, 5, 10, 11, 17, 18, 23, 29], "track": [1, 2], "trade": [11, 27, 28, 31], "train": [2, 6, 9, 15, 16, 17, 19, 23, 28, 29, 31, 32, 34], "train_dataset": [4, 6, 16], "train_kwarg": 9, "train_load": [4, 6, 9, 11, 16], "train_sampl": [6, 16], "trainabl": 28, "transfer": 25, "transform": [2, 4, 9, 14, 19, 28, 30, 31], "transpar": [29, 31], "transpos": [8, 31], "tree": 3, "tri": 34, "trigger": [9, 17, 25, 29, 31, 33], "triplet_margin_loss": 11, "triton": [22, 25, 31], "true": [0, 2, 4, 6, 8, 9, 13, 15, 17, 22, 23, 28, 29, 34], "trust_remote_cod": 28, "try": [2, 3, 4, 6, 18, 25], "tune": [11, 28, 31], "tupl": [0, 2, 6], "turboboost": 30, "turn": 31, "tutori": [3, 4, 6, 8, 9, 20], "two": [2, 5, 7, 8, 15, 18, 27, 32], "txt": [3, 4, 8], "type": [0, 2, 3, 4, 5, 7, 8, 9, 17, 19, 23, 25, 28, 31, 32, 33, 34], "typenam": 8, "types": 0, "typic": [5, 7, 16, 28, 31], "u": 8, "ubuntu": [25, 30, 31], "ucod": 30, "uint32_t": 0, "uint8": 17, "ultim": 8, "ultra": [27, 28, 31], "unabl": 32, "unalign": 0, "unaryop": 31, "uncas": 4, "undefin": [25, 31], "under": [2, 7, 11, 14, 19, 25, 26, 31], "undergo": 29, "underlai": 8, "underli": [0, 1, 5, 27, 35], "underneath": 31, "understand": 35, "unifi": [2, 4], "uniform": 28, "uninstal": [3, 25], "uniqu": 18, "unit": [1, 8, 31], "unlik": [4, 5, 9, 27, 28], "unlist": 11, "unload": 25, "unlock": 22, "unoccupi": 2, "unpredict": 2, "unquant": 28, "unstabl": 11, "unsupport": [25, 31], "until": 3, "untrack": 3, "unus": [2, 10, 35], "up": [2, 5, 7, 8, 9, 10, 27, 31, 32], "updat": [2, 3, 6, 9, 31, 32, 34, 36], "upgrad": 31, "uplift": 31, "upon": 28, "upper": 19, "upsampl": [13, 19], "upsampleblinear2d": 13, "upsamplenearest": 13, "upstream": 19, "url": [6, 22, 25], "us": [0, 1, 2, 3, 6, 9, 
10, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 25, 26, 27, 28, 31, 32, 34, 35], "usag": [2, 5, 7, 11, 13, 17, 19, 20, 23, 24, 31], "use_aot_devlist": [10, 33], "use_channels_last_1d": 10, "use_ds_kernel": 10, "use_itt_annot": 10, "use_llm_runtim": 28, "use_onednn_dir": 10, "use_onemkl": [10, 25, 31], "use_optimum_format": 28, "use_override_op": 10, "use_persist_stream": 10, "use_primitive_cach": 10, "use_queue_barri": 10, "use_scratchpad_mod": 10, "use_split_fp64_loop": 10, "use_sycl_assert": 10, "use_xetla": 10, "use_xetla_src": 10, "user": [1, 2, 4, 5, 10, 12, 13, 19, 22, 31, 32, 33, 34, 35], "user_nam": 8, "usm": [4, 7], "usr": [0, 25, 31], "usual": [19, 22, 25, 27, 28], "util": [1, 4, 5, 6, 7, 8, 9, 13, 16, 18, 19, 25, 28, 33, 34], "v": [25, 31], "v0": [7, 31], "v1": 31, "v2": [22, 31], "v3": [22, 31], "v4": 30, "valid": [2, 7, 9, 10, 14, 28], "valu": [0, 2, 4, 10, 15, 25, 27, 28], "vanilla": 8, "var": [6, 16, 22, 25], "variabl": [0, 3, 5, 10, 16, 18, 23, 25], "variant": 11, "variou": [7, 22, 27, 28, 31], "vec256": 0, "vec512": 0, "vec_bia": 0, "vector": [0, 1, 2, 4, 8, 14, 19, 31], "vectors": 0, "vehicl": 31, "ver": 8, "verbos": [5, 8, 18], "veri": [3, 8, 19, 21, 25, 27, 31], "verif": 31, "verifi": [4, 22, 25, 27, 31], "version": [0, 4, 7, 8, 22, 25, 26, 31, 34, 36], "via": [3, 5, 7, 8, 10, 17, 22, 25, 28, 33, 35], "view": [17, 19, 22, 31], "view_a": 9, "virtual": 0, "virtualguardimpl": 8, "visibl": [2, 31], "vision": 4, "vl": 27, "vllm": 31, "vml": [25, 31], "vnni": [0, 1, 31], "vocab_s": 4, "void": [0, 8], "w": [19, 28], "w4g32": 28, "w8": 28, "w8a8": [27, 28], "wa": [4, 7, 8, 25, 31], "wai": [3, 8, 19, 27, 28, 36], "wait": [7, 28], "walk": 8, "want": [0, 3, 8, 10, 19, 23], "warmup": [17, 29], "warmup_data": [17, 29], "warn": [3, 18], "warp": 6, "wast": 27, "wc": 19, "we": [0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 17, 19, 25, 27, 28, 30, 31, 35, 36], "webpag": 31, "weight": [1, 2, 4, 6, 8, 15, 16, 17, 19, 29, 31, 32, 34], "weight_decai": [22, 36], "weight_dtyp": 
28, "weightonlyquantconfig": 28, "weightonlyquantizedlinear": 28, "weights_prepack": 2, "well": [1, 2, 3, 4, 27, 28, 31], "were": 8, "what": [11, 31, 32, 33], "wheel": [25, 31, 33], "when": [2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 16, 18, 19, 25, 27, 28, 31, 32, 33, 34, 36], "where": [2, 3, 5, 6, 7, 9, 22, 25, 31], "whether": [2, 10, 11, 19, 34], "which": [0, 1, 2, 4, 5, 6, 7, 8, 10, 11, 15, 18, 19, 25, 27, 28, 31, 33, 35], "while": [10, 11, 17, 19, 27, 28, 31], "whl": [6, 22, 25], "who": [10, 31, 33], "whole": [17, 18, 31], "wide": [7, 28], "wider": [1, 4], "widespread": [1, 27], "width": [0, 19, 27], "window": [25, 31], "wise": [10, 28, 29, 31, 36], "wish": [3, 8, 19], "with_arg": [4, 17, 29], "within": [3, 18, 28, 29, 31, 32, 34], "without": [2, 5, 7, 11, 25, 28, 31, 32], "wn": 19, "won": [0, 2, 11], "woq": 27, "woq_quantization_config": 28, "work": [0, 2, 3, 4, 6, 8, 9, 19, 23, 25, 27, 29, 31], "work_group": 8, "workabl": 2, "workaround": 31, "worker": [5, 6, 9, 16], "workflow": [5, 17], "workgroup": 31, "workload": [1, 4, 5, 11, 17, 25, 27, 29, 31, 32], "workspac": [4, 14], "world": 6, "world_siz": [6, 9], "wors": 28, "worth": 14, "would": [0, 3, 4, 8, 13, 17, 18, 19, 22, 25, 31], "wrap": [6, 9, 16, 34], "wrap_cpp_modul": 4, "wrapper": [6, 8], "write": [0, 5], "written": [0, 4, 31, 34], "wrong": [25, 31], "wsl2": [25, 31], "ww42": 30, "x": [0, 1, 4, 8, 9, 11, 19, 22, 24, 28, 33], "x1": 13, "x2": 13, "xcr0": 0, "xdf": 3, "xe": [13, 27, 31], "xe_hpg": 10, "xe_lpg": 10, "xelink": [10, 31], "xelta": 27, "xeon": 30, "xetla": [10, 13, 31], "xmx": [1, 24, 31], "xpu": [1, 2, 5, 6, 7, 9, 10, 11, 13, 14, 16, 17, 18, 20, 22, 23, 24, 25, 28, 29, 30, 34], "xpu_kwarg": 9, "xpu_r": 19, "xpucomputeeng": 13, "xpumalloc": 2, "xpustream": 4, "xsave": 0, "xxx": [18, 31], "y": [4, 11, 28], "ye": 3, "yield": 1, "you": [0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 16, 18, 19, 22, 23, 25, 27, 28, 29, 31, 32, 33, 35], "youngjoo": 28, "your": [1, 3, 4, 6, 8, 9, 11, 16, 22, 23, 25, 26, 28, 32, 33, 35], 
"your_generation_param": 31, "yourself": 8, "z": [4, 8], "ze_flat_device_hierarchi": 5, "zero": [5, 9, 25, 28, 33], "zero_grad": [4, 9, 16, 22], "zero_point": 17, "zeros_lik": 8, "zhang": 28, "zoo": 4}, "titles": ["Intel\u00ae Extension for PyTorch* CPU ISA Dynamic Dispatch Design Doc", "Intel\u00ae Extension for PyTorch*", "API Documentation", "Contribution", "Examples", "Features", "DistributedDataParallel (DDP)", "DLPack Solution", "DPC++ Extension", "Fully Sharded Data Parallel (FSDP)", "Advanced Configuration", "Auto Mixed Precision (AMP) on GPU", "Auto Channels Last", "Compute Engine (Experimental feature for debug)", "Intel\u00ae Extension for PyTorch* - DeepSpeed* Kernels", "Float8 Data Type Support (Prototype)", "Horovod with PyTorch (Prototype)", "Intel\u00ae Extension for PyTorch* Optimizations for Quantization [GPU]", "IPEX_LOG (Prototype)", "Channels Last", "Kineto Profiler", "Legacy Profiler Tool (Deprecated)", "torch.compile for GPU (Beta)", "Quick Start", "Introduction", "Troubleshooting", "License", "Large Language Models (LLM) Optimizations Overview", "Weight-Only Quantization (Prototype)", "Transformers Optimization Frontend API", "Performance", "Releases", "Technical Details", "Ahead of Time (AOT) Compilation", "ipex.optimize Frontend API", "Memory Management", "Optimizer Fusion on GPU"], "titleterms": {"0": 31, "1": [30, 31], "10": [30, 31], "110": 31, "120": 31, "13": 31, "1550": 27, "1d": 19, "2": 31, "20": 31, "200": 31, "3": 31, "30": 31, "40": 31, "5": 31, "For": 28, "That": 19, "accessor": 8, "add": 0, "advanc": [5, 10], "ahead": [32, 33], "ai": [4, 30], "all": 19, "amp": [5, 11], "aot": [32, 33], "api": [0, 2, 5, 6, 12, 19, 24, 29, 34], "architectur": 1, "asynchron": 7, "aten": [0, 19], "auto": [5, 11, 12], "autocast": 11, "automat": 34, "b": 19, "basic": 4, "behavior": 11, "benchmark": 28, "bert": 4, "beta": [5, 22], "better": 3, "bfloat16": [4, 11], "bind": 6, "block": 19, "break": 31, "build": [0, 3, 8, 10], "c": [2, 4, 18, 19], 
"c10": 8, "cach": 27, "can": 11, "capsul": 7, "case": [7, 11, 13, 33], "center": [27, 30], "chang": 31, "channel": [5, 12, 19, 34], "check": 0, "cmake": 8, "code": [0, 4], "codegen": 0, "common": 29, "compil": [0, 5, 8, 22, 32, 33], "compon": 18, "compressor": 28, "comput": [5, 13], "configur": [5, 10, 30], "contribut": 3, "conv_bn_fold": 34, "convers": 19, "convolut": 19, "correspond": 8, "coverag": 19, "cpp": 0, "cpu": [0, 19], "creat": 19, "creation": 19, "csrc": 0, "current": 8, "custom": [0, 4], "d": 19, "data": [5, 7, 9, 15, 27, 30], "ddp": 6, "debug": [0, 3, 5, 13], "deep": 27, "deepspe": [14, 29], "default": [11, 12, 19], "definit": 18, "depend": [22, 25], "deprec": 21, "design": [0, 7], "detail": 32, "determin": 19, "develop": 3, "disabl": 12, "dispatch": [0, 28], "dispatchstub": 0, "distribut": [5, 27, 29], "distributeddataparallel": 6, "dldevic": 7, "dlpack": [5, 7], "doc": 0, "document": [2, 3, 24], "dpc": [4, 5, 8], "dynam": [0, 6], "dyndisp": 0, "eas": 12, "easi": 5, "elig": 11, "enabl": 12, "engin": [5, 13], "enviorn": 18, "environ": 28, "event": 18, "exampl": [0, 4, 6, 7, 8, 9, 15, 22], "execut": [23, 28], "experiment": 13, "export": 7, "extens": [0, 1, 3, 5, 6, 8, 14, 17, 28], "featur": [0, 5, 13, 28], "fetch": 8, "fine": 27, "float16": [4, 11], "float32": [4, 11], "float8": 15, "folder": 0, "format": 19, "fp16": 29, "fp8": 15, "framework": 28, "from": 6, "frontend": [29, 34], "fsdp": [5, 9], "fulli": [5, 9], "fuse_update_step": 34, "fusion": [27, 36], "gener": [2, 25], "get": 24, "gpu": [5, 6, 9, 11, 17, 22, 27, 28, 31, 32, 36], "h": 0, "hardwar": 30, "highlight": 31, "horovod": 16, "i": 19, "imper": [4, 11, 17, 29], "implement": [0, 13], "import": 7, "infer": [4, 11, 27, 28, 29], "inferenec": 22, "initi": 28, "input": 11, "instal": [6, 16, 28], "instanc": 4, "int4": 27, "int8": 4, "intel": [0, 1, 3, 4, 6, 14, 17, 27, 28, 30], "intrin": 0, "introduct": [6, 7, 8, 9, 11, 13, 14, 18, 20, 21, 22, 24, 28, 33, 36], "ipex": [32, 34], "ipex_log": [5, 18], 
"ipex_simple_trac": 18, "ipex_verbos": 18, "isa": 0, "issu": [12, 25, 31], "jit": 8, "kernel": [0, 4, 14, 19], "kineto": [5, 20], "known": [12, 31], "kv": 27, "languag": 27, "larg": 27, "last": [5, 12, 19, 34], "launch": 6, "layout": 19, "legaci": 21, "level": [0, 18], "librari": 25, "licens": 26, "linear": [27, 28], "linear_bn_fold": 34, "link": 6, "list": 27, "llm": [27, 28, 30], "load": 28, "local": 3, "log": 18, "low": 27, "manag": [2, 32, 35], "manner": 19, "manual": 0, "matrix": 28, "matter": 19, "max": 27, "memori": [2, 19, 32, 35], "methodologi": 27, "mix": [5, 11], "mode": [4, 15, 17, 29], "model": [4, 19, 27, 28], "motiv": 8, "mpi": 6, "multipl": 13, "nativ": 19, "nchw": 19, "nchw16c": 19, "neural": 28, "nhwc": 19, "node": 6, "oneccl": 6, "onednn": 19, "onli": [6, 9, 27, 28], "op": [8, 11], "oper": [13, 15, 19, 27, 36], "optim": [4, 17, 27, 29, 32, 34, 36], "option": 28, "overview": [0, 27, 30], "parallel": [5, 9], "path": 11, "perform": [25, 30], "platform": [14, 27], "pointer": 7, "polici": [13, 27], "prebuilt": 6, "precis": [5, 11, 27], "primit": 19, "privat": 0, "process": 0, "product": 30, "profil": [5, 20, 21], "program": 7, "promot": 11, "prototyp": [5, 15, 16, 18, 28], "pseudocod": 29, "pytest": 3, "python": [4, 5, 18], "pytorch": [0, 1, 3, 6, 14, 16, 17, 19, 28], "quantiz": [2, 5, 15, 17, 27, 28], "queue": 8, "quick": 23, "recommend": 6, "refer": [4, 11, 28], "regist": 19, "releas": 31, "replac": 18, "replace_dropout_with_ident": 34, "request": 8, "requir": [0, 22, 33], "resnet50": 4, "run": [15, 28], "runtim": [6, 10, 28], "save": 28, "scale": 6, "scenario": 29, "script": 28, "segment": 27, "select": [0, 13], "set": 18, "setup": 28, "setuptool": 8, "shard": [5, 9], "simpl": 18, "singl": [4, 6], "smoothquant": 29, "softwar": 30, "solut": [5, 7], "sourc": 6, "specif": [0, 11], "split_master_weight_for_bf16": 34, "start": [23, 24], "stride": 19, "struct": 0, "stub": 0, "support": [1, 5, 11, 14, 15, 19, 28], "sycl": [4, 8], "technic": 32, "tensor": 
19, "test": 3, "time": [10, 32, 33], "tip": 3, "tool": [5, 21], "torch": [4, 5, 22], "torchscript": [4, 11, 17, 29], "train": [4, 5, 11, 22], "transform": 29, "troubleshoot": [22, 25], "tune": 27, "type": [11, 15, 27], "unit": 3, "us": [4, 5, 7, 8, 11, 12, 13, 33], "usag": [4, 6, 9, 15, 16, 18, 22, 25, 28, 29], "v2": 30, "valid": 27, "vec": 0, "version": 30, "weight": [27, 28], "what": 19, "wheel": 6, "widest": 11, "woq": 28, "write": [3, 8, 19], "xpu": [3, 4, 8, 19, 31], "xpustream": 8, "xyz": 0, "xyzkrnl": 0}}) \ No newline at end of file diff --git a/xpu/2.5.10+xpu/tutorials/api_doc.html b/xpu/2.5.10+xpu/tutorials/api_doc.html index fe9aa95f6..832d01191 100644 --- a/xpu/2.5.10+xpu/tutorials/api_doc.html +++ b/xpu/2.5.10+xpu/tutorials/api_doc.html @@ -7,7 +7,7 @@ API Documentation — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -371,7 +371,7 @@

General

Examples

-
>>> import intel_extension_for_pytorch as ipex
+
>>> import intel_extension_for_pytorch as ipex
 >>> # to get the current fpmath mode
 >>> ipex.get_fp32_math_mode(device="xpu")
 
@@ -380,7 +380,7 @@

General to provide identical usage for XPU device only. The motivation of adding this alias is to unify the coding style in user scripts base on torch.xpu modular.

Examples

-
>>> import intel_extension_for_pytorch as ipex
+
>>> import intel_extension_for_pytorch as ipex
 >>> # to get the current fpmath mode
 >>> torch.xpu.get_fp32_math_mode(device="xpu")
 
@@ -404,7 +404,7 @@

General

Examples

-
>>> import intel_extension_for_pytorch as ipex
+
>>> import intel_extension_for_pytorch as ipex
 >>> # to enable the implicit data type conversion
 >>> ipex.set_fp32_math_mode(device="xpu", mode=ipex.FP32MathMode.BF32)
 >>> # to disable the implicit data type conversion
@@ -415,7 +415,7 @@ 

General to provide identical usage for XPU device only. The motivation of adding this alias is to unify the coding style in user scripts base on torch.xpu modular.

Examples

-
>>> import intel_extension_for_pytorch as ipex
+
>>> import intel_extension_for_pytorch as ipex
 >>> # to enable the implicit data type conversion
 >>> torch.xpu.set_fp32_math_mode(device="xpu", mode=ipex.FP32MathMode.BF32)
 >>> # to disable the implicit data type conversion
@@ -791,7 +791,7 @@ 

C++ API< Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/tutorials/contribution.html b/xpu/2.5.10+xpu/tutorials/contribution.html index 29df82d3a..9b3a2540a 100644 --- a/xpu/2.5.10+xpu/tutorials/contribution.html +++ b/xpu/2.5.10+xpu/tutorials/contribution.html @@ -7,7 +7,7 @@ Contribution — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -272,7 +272,7 @@

Tips< Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/tutorials/examples.html b/xpu/2.5.10+xpu/tutorials/examples.html index cfca9d8a4..b731eac43 100644 --- a/xpu/2.5.10+xpu/tutorials/examples.html +++ b/xpu/2.5.10+xpu/tutorials/examples.html @@ -7,7 +7,7 @@ Examples — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -200,8 +200,8 @@

Single-Instance Training
...
-import torch
-import intel_extension_for_pytorch as ipex
+import torch
+import intel_extension_for_pytorch as ipex
 ...
 model = Model()
 criterion = ...
@@ -237,11 +237,11 @@ 

Single-Instance Training
Float32
-
import torch
-import torchvision
+
import torch
+import torchvision
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -299,11 +299,11 @@ 
Float32
BFloat16
-
import torch
-import torchvision
+
import torch
+import torchvision
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -373,11 +373,11 @@ 

Float32Imperative Mode

Resnet50
-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -400,11 +400,11 @@ 
Resnet50
BERT
-
import torch
-from transformers import BertModel
+
import torch
+from transformers import BertModel
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -435,11 +435,11 @@ 
TorchScript ModeTorchScript for further optimizations.

Resnet50
-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -469,11 +469,11 @@ 
Resnet50
BERT
-
import torch
-from transformers import BertModel
+
import torch
+from transformers import BertModel
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -515,11 +515,11 @@ 

BFloat16Imperative Mode

Resnet50
-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -545,11 +545,11 @@ 
Resnet50
BERT
-
import torch
-from transformers import BertModel
+
import torch
+from transformers import BertModel
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -583,11 +583,11 @@ 
TorchScript ModeTorchScript for further optimizations.

Resnet50
-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -617,11 +617,11 @@ 
Resnet50
BERT
-
import torch
-from transformers import BertModel
+
import torch
+from transformers import BertModel
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -664,11 +664,11 @@ 

Float16
Imperative Mode
Resnet50
-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -694,11 +694,11 @@ 
Resnet50<
BERT
-
import torch
-from transformers import BertModel
+
import torch
+from transformers import BertModel
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -732,11 +732,11 @@ 
TorchScript ModeTorchScript for further optimizations.

Resnet50
-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -766,11 +766,11 @@ 
Resnet50<
BERT
-
import torch
-from transformers import BertModel
+
import torch
+from transformers import BertModel
 
 ############# code changes ###############
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ############# code changes ###############
 
@@ -808,21 +808,21 @@ 
BERT

INT8

We recommend using TorchScript for INT8 models because it has wider support for models. TorchScript mode also auto-enables our optimizations. For a TorchScript INT8 model, inserting observers and quantizing the model are achieved through prepare_jit and convert_jit, respectively. A calibration process is required to collect statistics from real data. After conversion, optimizations such as operator fusion are auto-enabled.

-
import os
-import torch
-from torch.jit._recursive import wrap_cpp_module
-from torch.quantization.quantize_jit import (
+
import os
+import torch
+from torch.jit._recursive import wrap_cpp_module
+from torch.quantization.quantize_jit import (
     convert_jit,
     prepare_jit,
 )
 
 #################### code changes ####################
-import intel_extension_for_pytorch as ipex
+import intel_extension_for_pytorch as ipex
 
 ######################################################
 
 ##### Example Model #####
-import torchvision.models as models
+import torchvision.models as models
 
 model = models.resnet50(weights="ResNet50_Weights.DEFAULT")
 model.eval()
@@ -843,7 +843,7 @@ 

INT8< modelJit = prepare_jit(modelJit, {"": qconfig}, True) ##### Example Dataloader ##### -import torchvision +import torchvision DOWNLOAD = True DATA = "datasets/cifar10/" @@ -884,11 +884,11 @@

INT8

torch.xpu.optimize

The torch.xpu.optimize function is an alternative to ipex.optimize in Intel® Extension for PyTorch*, and provides identical usage for XPU devices only. The motivation for adding this alias is to unify the coding style in user scripts based on the torch.xpu module. Refer to the example below for usage.

-
import torch
-import torchvision.models as models
+
import torch
+import torchvision.models as models
 
 ############# code changes #########
-import intel_extension_for_pytorch
+import intel_extension_for_pytorch
 
 ############# code changes #########
 
@@ -1116,7 +1116,7 @@ 

Intel® AI Reference ModelsSphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/tutorials/features.html b/xpu/2.5.10+xpu/tutorials/features.html index f2753dd31..c382229cb 100644 --- a/xpu/2.5.10+xpu/tutorials/features.html +++ b/xpu/2.5.10+xpu/tutorials/features.html @@ -7,7 +7,7 @@ Features — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -276,7 +276,7 @@

IPEX_LOG Built with Sphinx using a theme provided by Read the Docs. - +

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
diff --git a/xpu/2.5.10+xpu/tutorials/features/DDP.html b/xpu/2.5.10+xpu/tutorials/features/DDP.html index 0c8bdd8f4..33dee9dcc 100644 --- a/xpu/2.5.10+xpu/tutorials/features/DDP.html +++ b/xpu/2.5.10+xpu/tutorials/features/DDP.html @@ -7,7 +7,7 @@ DistributedDataParallel (DDP) — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -168,9 +168,9 @@

[Recommended] Install from prebuilt wheelsoneccl_bindings_for_pytorch

# Generic Python* for CPU
-REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/cpu/us/
+REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/cpu/cn/
 # Generic Python* for GPU
-REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
+REPO_URL: https://pytorch-extension.intel.com/release-whl/stable/xpu/cn/
 

Installation from either repository shares the command below. Replace the placeholder <REPO_URL> with one of the real URLs mentioned above.

@@ -201,9 +201,9 @@

DDP Usage
  • Import the necessary packages.

  • -
    import torch
    -import intel_extension_for_pytorch 
    -import oneccl_bindings_for_pytorch
    +
    import torch
    +import intel_extension_for_pytorch 
    +import oneccl_bindings_for_pytorch
     

    JIT Compiling Extensions

    Previously, we mentioned that there are two ways of building DPC++ extensions: using setuptools (AOT) or compiling with JIT. Having introduced the former, let’s elaborate on the latter. The JIT compilation mechanism provides a way to compile and load your extensions on the fly by invoking a simple torch API function, torch.xpu.cpp_extension.load(). For the LLTM, this looks as simple as this:

    -
    import torch
    -import intel_extension_for_pytorch
    -from torch.xpu.cpp_extension import load
    +
    import torch
    +import intel_extension_for_pytorch
    +from torch.xpu.cpp_extension import load
     
     lltm_xpu = load(name="lltm_xpu", sources=['lltm_xpu.cpp', 'lltm_xpu_kernel.cpp',])
     
    @@ -282,8 +282,8 @@

    Fetching the corresponding sycl::queuevoid*, which can cast to a sycl::queue pointer.

    -
    import torch
    -import intel_extension_for_pytorch
    +
    import torch
    +import intel_extension_for_pytorch
     stream = torch.xpu.current_stream()
     queue = stream.sycl_queue # queue is a ``void*`` which can cast to a sycl::queue pointer
     
    @@ -636,7 +636,7 @@

    Using accessorsSphinx using a theme provided by Read the Docs. - +

    © Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others. No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (OBSD), http://opensource.org/licenses/0BSD.
    diff --git a/xpu/2.5.10+xpu/tutorials/features/FSDP.html b/xpu/2.5.10+xpu/tutorials/features/FSDP.html index 861d38bfc..d74544766 100644 --- a/xpu/2.5.10+xpu/tutorials/features/FSDP.html +++ b/xpu/2.5.10+xpu/tutorials/features/FSDP.html @@ -7,7 +7,7 @@ Fully Sharded Data Parallel (FSDP) — Intel&#174 Extension for PyTorch* 2.5.10+xpu documentation - + @@ -144,10 +144,10 @@

    FSDP Usage (GPU only)
  • Import the necessary packages.

  • -
    import torch
    -import intel_extension_for_pytorch 
    -import oneccl_bindings_for_pytorch
    -from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    +
    import torch
    +import intel_extension_for_pytorch 
    +import oneccl_bindings_for_pytorch
    +from torch.distributed.fsdp import FullyShardedDataParallel as FSDP