diff --git a/.github/workflows/cli-scripts-deploy-custom-container-torchserve-huggingface-textgen.yml b/.github/workflows/cli-scripts-deploy-custom-container-torchserve-huggingface-textgen.yml deleted file mode 100644 index b83a3fc723..0000000000 --- a/.github/workflows/cli-scripts-deploy-custom-container-torchserve-huggingface-textgen.yml +++ /dev/null @@ -1,67 +0,0 @@ -# This code is autogenerated. -# Code is generated by running custom script: python3 readme.py -# Any manual changes to this file may cause incorrect behavior. -# Any manual changes will be overwritten if the code is regenerated. - -name: cli-scripts-deploy-custom-container-torchserve-huggingface-textgen -on: - workflow_dispatch: - schedule: - - cron: "54 0/12 * * *" - pull_request: - branches: - - main - paths: - - cli/deploy-custom-container-torchserve-huggingface-textgen.sh - - infra/bootstrapping/** - - .github/workflows/cli-scripts-deploy-custom-container-torchserve-huggingface-textgen.yml - - cli/setup.sh -permissions: - id-token: write -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true -jobs: - build: - runs-on: ubuntu-latest - steps: - - name: check out repo - uses: actions/checkout@v2 - - name: azure login - uses: azure/login@v1 - with: - client-id: ${{ secrets.OIDC_AZURE_CLIENT_ID }} - tenant-id: ${{ secrets.OIDC_AZURE_TENANT_ID }} - subscription-id: ${{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }} - - name: bootstrap resources - run: | - bash bootstrap.sh - working-directory: infra/bootstrapping - continue-on-error: false - - name: setup-cli - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - bash setup.sh - working-directory: cli - continue-on-error: true - - name: Eagerly cache access tokens for required scopes - run: | - # Workaround for azure-cli's lack of support for ID token refresh - # Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617 - - # Management - az account get-access-token --scope https://management.azure.com/.default --output none - # ML - az account get-access-token --scope https://ml.azure.com/.default --output none - - name: validate readme - run: | - python check-readme.py "${{ github.workspace }}/cli/" - working-directory: infra/bootstrapping - continue-on-error: false - - name: test script script - run: | - source "${{ github.workspace }}/infra/bootstrapping/sdk_helpers.sh"; - source "${{ github.workspace }}/infra/bootstrapping/init_environment.sh"; - set -e; bash -x deploy-custom-container-torchserve-huggingface-textgen.sh - working-directory: cli diff --git a/cli/README.md b/cli/README.md index e8072f3bb1..71e8242048 100644 --- a/cli/README.md +++ b/cli/README.md @@ -45,7 +45,6 @@ path|status| [deploy-custom-container-tfserving-half-plus-two-integrated.sh](deploy-custom-container-tfserving-half-plus-two-integrated.sh)|[![deploy-custom-container-tfserving-half-plus-two-integrated](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-tfserving-half-plus-two-integrated/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-tfserving-half-plus-two-integrated.yml) 
[deploy-custom-container-tfserving-half-plus-two.sh](deploy-custom-container-tfserving-half-plus-two.sh)|[![deploy-custom-container-tfserving-half-plus-two](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-tfserving-half-plus-two/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-tfserving-half-plus-two.yml) [deploy-custom-container-torchserve-densenet.sh](deploy-custom-container-torchserve-densenet.sh)|[![deploy-custom-container-torchserve-densenet](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-torchserve-densenet/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-torchserve-densenet.yml) -[deploy-custom-container-torchserve-huggingface-textgen.sh](deploy-custom-container-torchserve-huggingface-textgen.sh)|[![deploy-custom-container-torchserve-huggingface-textgen](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-torchserve-huggingface-textgen/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-torchserve-huggingface-textgen.yml) [deploy-custom-container-triton-single-model.sh](deploy-custom-container-triton-single-model.sh)|[![deploy-custom-container-triton-single-model](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-custom-container-triton-single-model/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-custom-container-triton-single-model.yml) [deploy-local-endpoint.sh](deploy-local-endpoint.sh)|[![deploy-local-endpoint](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-local-endpoint/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-local-endpoint.yml) [deploy-managed-online-endpoint-access-resource-sai.sh](deploy-managed-online-endpoint-access-resource-sai.sh)|[![deploy-managed-online-endpoint-access-resource-sai](https://github.com/Azure/azureml-examples/workflows/cli-scripts-deploy-managed-online-endpoint-access-resource-sai/badge.svg?branch=main)](https://github.com/Azure/azureml-examples/actions/workflows/cli-scripts-deploy-managed-online-endpoint-access-resource-sai.yml) diff --git a/cli/endpoints/online/custom-container/README.md b/cli/endpoints/online/custom-container/README.md index ca8d67f857..7efcb62ef4 100644 --- a/cli/endpoints/online/custom-container/README.md +++ b/cli/endpoints/online/custom-container/README.md @@ -15,6 +15,5 @@ Each example consists of a script located in the [CLI](../../..) 
directory as we |[r/multimodel-plumber](r/multimodel-plumber)|[deploy-custom-container-r-multimodel-plumber](../../../deploy-custom-container-r-multimodel-plumber.sh)|Deploy three regression models to one endpoint using the Plumber R package| |[tfserving/half-plus-two](tfserving/half-plus-two)|[deploy-custom-container-tfserving-half-plus-two](../../../deploy-custom-container-tfserving-half-plus-two.sh)|Deploy a simple Half Plus Two model using a TFServing custom container using the standard model registration process.| |[tfserving/half-plus-two-integrated](tfserving/half-plus-two-integrated)|[deploy-custom-container-tfserving-half-plus-two-integrated](../../../deploy-custom-container-tfserving-half-plus-two-integrated.sh)|Deploy a simple Half Plus Two model using a TFServing custom container with the model integrated into the image.| -|[torchserve/densenet](torchserve/densenet)|[deploy-custom-container-torchserve-densenet](../../../deploy-custom-container-torchserve-densenet.sh)|Deploy a single model using a Torchserve custom container.| -|[torchserve/huggingface-textgen](torchserve/huggingface-textgen)|[deploy-custom-container-torchserve-huggingface-textgen](../../../deploy-custom-container-torchserve-huggingface-textgen.sh)|Deploy Huggingface models to an online endpoint and follow along with the Huggingface Transformers Torchserve example.| +|[torchserve/densenet](torchserve/densenet)|[deploy-custom-container-torchserve-densenet](../../../deploy-custom-container-torchserve-densenet.sh)|Deploy a single model using a Torchserve custom container.| |[triton/single-model](triton/single-model)|[deploy-custom-container-triton-single-model](../../../deploy-custom-container-triton-single-model.sh)|Deploy a Triton model using a custom container| \ No newline at end of file diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb b/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb index 066d752fd9..10df13243c 100644 --- a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb +++ b/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_chat_completion.ipynb @@ -39,6 +39,8 @@ "source": [ "%pip install azure-ai-ml\n", "%pip install azure-identity\n", + "%pip install azure-core\n", + "%pip install azure-ai-inference\n", "\n", "%pip install mlflow\n", "%pip install azureml-mlflow\n", @@ -62,13 +64,18 @@ "\n", "import base64\n", "import json\n", + "import os\n", + "import uuid\n", "\n", - "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", - "\n", - "from azure.ai.ml import MLClient, Input\n", + "from azure.ai.inference import ChatCompletionsClient\n", + "from azure.ai.inference.models import SystemMessage, UserMessage\n", + "from azure.ai.ml import Input, MLClient\n", "from azure.ai.ml.constants import AssetTypes\n", "from azure.ai.ml.dsl import pipeline\n", - "from azure.ai.ml.entities import Data" + "from azure.ai.ml.entities import Data, ServerlessEndpoint\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", + "from azure.core.exceptions import ResourceNotFoundError" ] }, { @@ -110,6 +117,19 @@ "`DefaultAzureCredential` should be capable of handling most Azure SDK authentication scenarios. 
" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "try:\n", + " credential = DefaultAzureCredential()\n", + " # Check if given credential can get token successfully.\n", + " credential.get_token(\"https://management.azure.com/.default\")\n", + "except Exception as ex:\n", + " # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work\n", + " credential = InteractiveBrowserCredential()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -150,14 +170,7 @@ "source": [ "## Pick a teacher model\n", "\n", - "We support **Meta-Llama-3.1-405B-Instruct** as the teacher model. \n", - "### First deploy the teacher model in Azure AI Studio\n", - "* Go to Azure AI Studio (ai.azure.com)\n", - "* Select Meta-Llama-3.1-405B-Instruct model from Model catalog.\n", - "* Deploy with \"Pay-as-you-go\"\n", - "* Once deployed successfully, you should be assigned for an API endpoint and a security key for inference.\n", - "\n", - "Update the following cell with the information of the deployment you just created." + "We support **Meta-Llama-3.1-405B-Instruct** as the teacher model. " ] }, { @@ -166,13 +179,23 @@ "metadata": {}, "outputs": [], "source": [ - "# Llama-3-405B Teacher model endpoint name\n", - "# The serverless model name is the name found in ML Studio > Endpoints > Serverless endpoints > Model column\n", + "# We will reuse or create a serverless endpoint\n", "TEACHER_MODEL_NAME = \"Meta-Llama-3.1-405B-Instruct\"\n", + "TEACHER_MODEL_ENDPOINT_NAME = \"Meta-Llama-3-1-405B-Instruct-vum\"\n", "\n", - "# The serverless model endpoint name is the name found in ML Studio > Endpoints > Serverless endpoints > Name column\n", - "# The endpoint URL will be resolved from this name by the MLFlow component\n", - "TEACHER_MODEL_ENDPOINT_NAME = \"Meta-Llama-3-1-405B-Instruct-vum\"" + "mlclient_azureml_meta = MLClient(credential, registry_name=\"azureml-meta\")\n", + "try:\n", + " ml_client.serverless_endpoints.get(TEACHER_MODEL_ENDPOINT_NAME)\n", + "except ResourceNotFoundError:\n", + " # create the endpoint\n", + " teacher_model_id = (\n", + " \"azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct\"\n", + " )\n", + " teacher_endpoint = ServerlessEndpoint(\n", + " name=TEACHER_MODEL_ENDPOINT_NAME,\n", + " model_id=teacher_model_id,\n", + " )\n", + " ml_client.begin_create_or_update(teacher_endpoint).result()" ] }, { @@ -194,7 +217,6 @@ "STUDENT_MODEL_VERSION = 1\n", "\n", "# retrieve student model from model registry\n", - "mlclient_azureml_meta = MLClient(credential, registry_name=\"azureml-meta\")\n", "student_model = mlclient_azureml_meta.models.get(\n", " STUDENT_MODEL_NAME, version=STUDENT_MODEL_VERSION\n", ")\n", @@ -307,7 +329,7 @@ "metadata": {}, "outputs": [], "source": [ - "! mkdir -p data" + "!mkdir data" ] }, { @@ -319,44 +341,44 @@ "train_data_path = \"data/train_conjnli_512.jsonl\"\n", "valid_data_path = \"data/valid_conjnli_256.jsonl\"\n", "\n", - "for row in train:\n", - " data = {\"messages\": []}\n", - " data[\"messages\"].append(\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": \"You are a helpful assistant. Your output should only be one of the three labels: 'entailment', 'contradiction', or 'neutral'.\",\n", - " }\n", - " )\n", - " data[\"messages\"].append(\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"Given the following two texts, your task is to determine the logical relationship between them. The first text is the 'premise' and the second text is the 'hypothesis'. 
The relationship should be labeled as one of the following: 'entailment' if the premise entails the hypothesis, 'contradiction' if the premise contradicts the hypothesis, or 'neutral' if the premise neither entails nor contradicts the hypothesis.\\n\\nPremise: \"\n", - " + row[\"premise\"]\n", - " + \"\\nHypothesis: \"\n", - " + row[\"hypothesis\"],\n", - " }\n", - " )\n", - " with open(train_data_path, \"a\") as f:\n", + "with open(train_data_path, \"w+\") as f:\n", + " for row in train:\n", + " data = {\"messages\": []}\n", + " data[\"messages\"].append(\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a helpful assistant. Your output should only be one of the three labels: 'entailment', 'contradiction', or 'neutral'.\",\n", + " }\n", + " )\n", + " data[\"messages\"].append(\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Given the following two texts, your task is to determine the logical relationship between them. The first text is the 'premise' and the second text is the 'hypothesis'. The relationship should be labeled as one of the following: 'entailment' if the premise entails the hypothesis, 'contradiction' if the premise contradicts the hypothesis, or 'neutral' if the premise neither entails nor contradicts the hypothesis.\\n\\nPremise: \"\n", + " + row[\"premise\"]\n", + " + \"\\nHypothesis: \"\n", + " + row[\"hypothesis\"],\n", + " }\n", + " )\n", " f.write(json.dumps(data) + \"\\n\")\n", "\n", - "for row in val:\n", - " data = {\"messages\": []}\n", - " data[\"messages\"].append(\n", - " {\n", - " \"role\": \"system\",\n", - " \"content\": \"You are a helpful assistant. Your output should only be one of the three labels: 'entailment', 'contradiction', or 'neutral'.\",\n", - " }\n", - " )\n", - " data[\"messages\"].append(\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"Given the following two texts, your task is to determine the logical relationship between them. The first text is the 'premise' and the second text is the 'hypothesis'. The relationship should be labeled as one of the following: 'entailment' if the premise entails the hypothesis, 'contradiction' if the premise contradicts the hypothesis, or 'neutral' if the premise neither entails nor contradicts the hypothesis.\\n\\nPremise: \"\n", - " + row[\"premise\"]\n", - " + \"\\nHypothesis: \"\n", - " + row[\"hypothesis\"],\n", - " }\n", - " )\n", - " with open(valid_data_path, \"a\") as f:\n", + "with open(valid_data_path, \"w+\") as f:\n", + " for row in val:\n", + " data = {\"messages\": []}\n", + " data[\"messages\"].append(\n", + " {\n", + " \"role\": \"system\",\n", + " \"content\": \"You are a helpful assistant. Your output should only be one of the three labels: 'entailment', 'contradiction', or 'neutral'.\",\n", + " }\n", + " )\n", + " data[\"messages\"].append(\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Given the following two texts, your task is to determine the logical relationship between them. The first text is the 'premise' and the second text is the 'hypothesis'. 
The relationship should be labeled as one of the following: 'entailment' if the premise entails the hypothesis, 'contradiction' if the premise contradicts the hypothesis, or 'neutral' if the premise neither entails nor contradicts the hypothesis.\\n\\nPremise: \"\n", + " + row[\"premise\"]\n", + " + \"\\nHypothesis: \"\n", + " + row[\"hypothesis\"],\n", + " }\n", + " )\n", f.write(json.dumps(data) + \"\\n\")" ] }, @@ -375,7 +397,7 @@ "outputs": [], "source": [ "train_data = None\n", - "train_data_name = \"nli_train_70-70\"\n", + "train_data_name = \"nli_train_70\"\n", "\n", "train_data = ml_client.data.create_or_update(\n", " Data(\n", @@ -427,7 +449,7 @@ "metadata": {}, "outputs": [], "source": [ - "ENABLE_CHAIN_OF_THOUGHT = \"true\"" + "ENABLE_CHAIN_OF_THOUGHT = \"True\"" ] }, { @@ -569,16 +591,114 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Consuming the distilled model\n", + "## Create a serverless endpoint to consume the model (optional)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Wait for the job to complete\n", + "ml_client.jobs.stream(ft_job.name)\n", + "registered_model_name = ml_client.jobs.get(ft_job.name).properties[\n", + " \"registered_ft_model_name\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build the model asset ID for the registered model\n", + "rg_model_vs = ml_client.models.get(registered_model_name, label=\"latest\")._version\n", + "\n", + "rg_model_asset_id = (\n", + " \"azureml://locations/\"\n", + " f\"{ai_project.location}\"\n", + " \"/workspaces/\"\n", + " f\"{ai_project._workspace_id}\"\n", + " \"/models/\"\n", + " f\"{registered_model_name}\"\n", + " \"/versions/\"\n", + " f\"{rg_model_vs}\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a serverless endpoint - names must be unique; we use a suffix of the registered model name\n", + "short_id = registered_model_name[-9:]\n", + "serverless_endpoint_name = \"my-endpoint-\" + short_id\n", + "\n", + "serverless_endpoint = ServerlessEndpoint(\n", + " name=serverless_endpoint_name,\n", + " model_id=rg_model_asset_id,\n", + ")\n", + "\n", + "created_endpoint = ml_client.serverless_endpoints.begin_create_or_update(\n", + " serverless_endpoint\n", + ").result()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sample inference against the deployed endpoint (optional)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = created_endpoint.scoring_uri\n", + "key = ml_client.serverless_endpoints.get_keys(created_endpoint.name).primary_key\n", + "model = ChatCompletionsClient(\n", + " endpoint=url,\n", + " credential=AzureKeyCredential(key),\n", + ")\n", "\n", + "response = model.complete(\n", + " messages=[\n", + " SystemMessage(\n", + " content=\"You are a helpful assistant. 
Your output should only be one of the five choices: 'A', 'B', 'C', 'D', or 'E'.\"\n", + " ),\n", + " UserMessage(\n", + " content=\"Answer the following multiple-choice question by selecting the correct option.\\n\\nQuestion: Can you name a good reason for attending school?\\nAnswer Choices:\\n(A) get smart\\n(B) boredom\\n(C) colds and flu\\n(D) taking tests\\n(E) spend time\"\n", + " ),\n", + " ],\n", + ")\n", "\n", - "Once the above job completes, you should be able to deploy the model and use it for inferencing. To deploy this model, do the following:\n", "\n", - "* Go to AI Studio\n", - "* Navigate to the Fine-tuning tab on the left menu\n", - "* In the list of models you see, click on the model which got created from the distillation\n", - "* This should take you to the details page where you can see the model attributes and other details\n", - "* Click on the Deploy button on top of the page\n", - "* Follow the steps to deploy the model" + "print(response.choices[0].message.content)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Clean up the created endpoints (optional)\n", + "\n", + "Endpoint deployments are chargeable and incur costs on the subscription. Optionally, clean up the endpoints after finishing your experiments." ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "_ = ml_client.serverless_endpoints.begin_delete(TEACHER_MODEL_ENDPOINT_NAME)\n", + "_ = ml_client.serverless_endpoints.begin_delete(serverless_endpoint_name)" ] } ], diff --git a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_nlu_qa_task.ipynb b/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_nlu_qa_task.ipynb index f2258e1736..858501fe6a 100644 --- a/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_nlu_qa_task.ipynb +++ b/sdk/python/foundation-models/system/finetune/Llama-notebooks/distillation/distillation_nlu_qa_task.ipynb @@ -38,6 +38,8 @@ "source": [ "%pip install azure-ai-ml\n", "%pip install azure-identity\n", + "%pip install azure-core\n", + "%pip install azure-ai-inference\n", "\n", "%pip install mlflow\n", "%pip install azureml-mlflow\n", @@ -62,11 +64,16 @@ "import base64\n", "import json\n", "import os\n", - "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential\n", - "from azure.ai.ml import MLClient, Input\n", + "import uuid\n", + "\n", + "from azure.ai.inference import ChatCompletionsClient\n", + "from azure.ai.inference.models import SystemMessage, UserMessage\n", + "from azure.ai.ml import Input, MLClient\n", "from azure.ai.ml.constants import AssetTypes\n", "from azure.ai.ml.dsl import pipeline\n", - "from azure.ai.ml.entities import Data" + "from azure.ai.ml.entities import Data, ServerlessEndpoint\n", + "from azure.core.credentials import AzureKeyCredential\n", + "from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential" ] }, { @@ -471,6 +478,7 @@ " system_properties: str,\n", " input_finetune_model: Input,\n", " train_file_path: Input,\n", + " registered_model_name: str,\n", " validation_file_path: Input = None,\n", "):\n", " oss_distillation = distillation_pipeline_component(\n", @@ -487,6 +495,7 @@ " per_device_train_batch_size=1,\n", " num_train_epochs=3,\n", " data_generation_task_type=\"NLU_QA\",\n", + " registered_model_name=registered_model_name,\n", " )\n", "\n", " return {\"output_model\": oss_distillation.outputs.output_model}" @@ -522,10 +531,13 @@ "metadata": {}, "outputs": [], "source": [ + "short_id = str(uuid.uuid4())[:8]\n", "train_file_path_input = Input(type=\"uri_file\",
path=train_data.path)\n", "validation_file_path_input = Input(type=\"uri_file\", path=valid_data.path)\n", "input_finetune_model = Input(type=\"mlflow_model\", path=student_model.id)\n", "experiment_name = f\"distillation-{TEACHER_MODEL_NAME}\".replace(\".\", \"-\")\n", + "# do not use underscores in the name; they are not supported\n", + "registered_model_name = \"my-model-\" + short_id\n", "\n", "finetuning_job = distillation_pipeline(\n", " teacher_model_endpoint_name=TEACHER_MODEL_ENDPOINT_NAME,\n", @@ -533,6 +545,7 @@ " system_properties=system_properties_b64_encoded,\n", " input_finetune_model=input_finetune_model,\n", " train_file_path=train_file_path_input,\n", + " registered_model_name=registered_model_name,\n", " validation_file_path=validation_file_path_input,\n", ")\n", @@ -573,16 +586,75 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Consuming the distilled model\n", + "## Create a serverless endpoint to consume the model" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Build the model asset ID for the registered model\n", + "rg_model_vs = ml_client.models.get(registered_model_name, label=\"latest\")._version\n", + "\n", + "rg_model_asset_id = (\n", + " \"azureml://locations/\"\n", + " f\"{ai_project.location}\"\n", + " \"/workspaces/\"\n", + " f\"{ai_project._workspace_id}\"\n", + " \"/models/\"\n", + " f\"{registered_model_name}\"\n", + " \"/versions/\"\n", + " f\"{rg_model_vs}\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a serverless endpoint - names must be unique\n", + "serverless_endpoint_name = \"my-endpoint-\" + short_id\n", + "\n", + "# create the serverless endpoint\n", + "serverless_endpoint = ServerlessEndpoint(\n", + " name=serverless_endpoint_name,\n", + " model_id=rg_model_asset_id,\n", + ")\n", "\n", + "created_endpoint = ml_client.serverless_endpoints.begin_create_or_update(\n", + " serverless_endpoint\n", + ").result()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "url = created_endpoint.scoring_uri\n", + "key = ml_client.serverless_endpoints.get_keys(created_endpoint.name).primary_key\n", + "model = ChatCompletionsClient(\n", + " endpoint=url,\n", + " credential=AzureKeyCredential(key),\n", + ")\n", + "\n", + "response = model.complete(\n", + " messages=[\n", + " SystemMessage(\n", + " content=\"You are a helpful assistant. 
Your output should only be one of the five choices: 'A', 'B', 'C', 'D', or 'E'.\"\n", + " ),\n", + " UserMessage(\n", + " content=\"Answer the following multiple-choice question by selecting the correct option.\\n\\nQuestion: Can you name a good reason for attending school?\\nAnswer Choices:\\n(A) get smart\\n(B) boredom\\n(C) colds and flu\\n(D) taking tests\\n(E) spend time\"\n", + " ),\n", + " ],\n", + ")\n", "\n", - "* Go to AI Studio\n", - "* Navigate to the Fine-tuning tab on the left menu\n", - "* In the list of models you see, click on the model which got created from the distillation\n", - "* This should take you to the details page where you can see the model attributes and other details\n", - "* Click on the Deploy button on top of the page\n", - "* Follow the steps to deploy the model" + "print(response.choices[0].message.content)" ] } ], diff --git a/setup/setup-ci/install-pip-package.sh b/setup/setup-ci/install-pip-package.sh index d417ad7e2c..ced2f8ceb3 100644 --- a/setup/setup-ci/install-pip-package.sh +++ b/setup/setup-ci/install-pip-package.sh @@ -8,6 +8,7 @@ sudo -u azureuser -i <<'EOF' PACKAGE=numpy ENVIRONMENT=azureml_py38 +source /anaconda/etc/profile.d/conda.sh conda activate "$ENVIRONMENT" pip install "$PACKAGE" conda deactivate diff --git a/setup/setup-ci/setup-custom-conda-env.sh b/setup/setup-ci/setup-custom-conda-env.sh index e7a0059e33..204d0799c7 100644 --- a/setup/setup-ci/setup-custom-conda-env.sh +++ b/setup/setup-ci/setup-custom-conda-env.sh @@ -4,12 +4,14 @@ set -e # This script creates a custom conda environment and kernel based on a sample yml file. +source /anaconda/etc/profile.d/conda.sh conda env create -f env.yml echo "Activating new conda environment" conda activate envname conda install -y ipykernel -echo "Installing kernel" sudo -u azureuser -i <<'EOF' +echo "Installing kernel" +source /anaconda/etc/profile.d/conda.sh conda activate envname python -m ipykernel install --user --name envname --display-name "mykernel" echo "Conda environment setup successfully."
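Editor's note for reviewers: the serverless-endpoint changes in both notebooks reduce to one pattern — get-or-create the endpoint, authenticate against its scoring URI with a key, run a chat completion, and delete the endpoint when done. Below is a minimal, self-contained sketch of that pattern; the subscription, resource group, workspace, and endpoint names are placeholders, not values from this diff.

```python
# Sketch of the get-or-create / key-auth / cleanup pattern used by the updated
# notebooks. All identifiers marked "placeholder" are illustrative only.
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.ai.ml import MLClient
from azure.ai.ml.entities import ServerlessEndpoint
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceNotFoundError
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<SUBSCRIPTION_ID>",      # placeholder
    resource_group_name="<RESOURCE_GROUP>",   # placeholder
    workspace_name="<AI_PROJECT_NAME>",       # placeholder
)

endpoint_name = "my-endpoint-example"  # placeholder; endpoint names must be unique
model_id = "azureml://registries/azureml-meta/models/Meta-Llama-3.1-405B-Instruct"

# Reuse the endpoint if it already exists; otherwise deploy the model behind a new one.
try:
    endpoint = ml_client.serverless_endpoints.get(endpoint_name)
except ResourceNotFoundError:
    endpoint = ml_client.serverless_endpoints.begin_create_or_update(
        ServerlessEndpoint(name=endpoint_name, model_id=model_id)
    ).result()

# Key-based auth against the scoring URI, then a single chat completion.
key = ml_client.serverless_endpoints.get_keys(endpoint_name).primary_key
chat = ChatCompletionsClient(
    endpoint=endpoint.scoring_uri, credential=AzureKeyCredential(key)
)
response = chat.complete(
    messages=[
        SystemMessage(content="You are a helpful assistant."),
        UserMessage(content="Say hello."),
    ]
)
print(response.choices[0].message.content)

# Serverless endpoints bill while deployed; delete them when experiments finish.
ml_client.serverless_endpoints.begin_delete(endpoint_name)
```

Note that `begin_delete` returns a poller; call `.result()` on it if the script must block until the endpoint is actually removed.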