diff --git a/python/samples/demos/README.md b/python/samples/demos/README.md
index 1387b06b0d5a..5db161e72ca0 100644
--- a/python/samples/demos/README.md
+++ b/python/samples/demos/README.md
@@ -6,4 +6,5 @@ Demonstration applications that leverage the usage of one or many SK features
 | ----------------- | ----------------------------------------------- |
 | assistants_group_chat | A sample Agent demo that shows a chat functionality with an OpenAI Assistant agent. |
 | booking_restaurant | A sample chat bot that leverages the Microsoft Graph and Bookings API as a Semantic Kernel plugin to make a fake booking at a restaurant. |
-| telemetry_with_application_insights | A sample project that shows how a Python application can be configured to send Semantic Kernel telemetry to Application Insights. |
\ No newline at end of file
+| telemetry_with_application_insights | A sample project that shows how a Python application can be configured to send Semantic Kernel telemetry to Application Insights. |
+| nvidia-nim | A sample demo that shows how to use NVIDIA NIM as a Semantic Kernel plugin. |
\ No newline at end of file
diff --git a/python/samples/demos/nvidia-nim/README.md b/python/samples/demos/nvidia-nim/README.md
new file mode 100644
index 000000000000..46ec9c72a285
--- /dev/null
+++ b/python/samples/demos/nvidia-nim/README.md
@@ -0,0 +1,35 @@
+# NVIDIA NIM Plugin
+
+NVIDIA Inference Microservices (NIM) are fully optimized and available for a wide variety of models, making them perfect tools to power copilots built with Semantic Kernel.
+
+This sample shows how to incorporate NIM into Semantic Kernel.
+It is based on llama-3.1-8b-instruct:latest, which is version 1.1.2 at the time of writing. Please check the documentation of the NIM you plan to use to see whether any additional changes are needed.
+
+## Deploy NIM to Azure
+
+NIM can be deployed almost anywhere, including (but not limited to) Azure ML, AKS, and an Azure VM. Complete one of the following to prepare a NIM endpoint for the next step.
+
+1. **Azure ML Deployment**
+
+   - Detailed instructions can be found [here](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/azureml)
+
+2. **Azure Kubernetes Service Deployment**
+
+   - Detailed instructions can be found [here](https://github.com/NVIDIA/nim-deploy/tree/main/cloud-service-providers/azure/aks)
+
+3. **Azure VM Deployment**
+
+   - Create an Azure VM with 1x A100 GPU and the NVIDIA AI Enterprise image
+   - Follow the instructions [here](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) to continue
+   - Expose the endpoint so it is publicly accessible
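+
+Once the endpoint is up, it is worth a quick sanity check before wiring it into the plugin. The snippet below is a minimal sketch that mirrors the call made by `nvidia_nim_plugin.py`; it assumes the NIM exposes its OpenAI-compatible API at `http://0.0.0.0:8000/v1` and serves `meta/llama-3.1-8b-instruct`. Adjust both values to match your deployment.
+
+```python
+from openai import OpenAI
+
+# Assumed endpoint and model name; replace them with the values from your deployment.
+client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="not-used")
+response = client.completions.create(
+    model="meta/llama-3.1-8b-instruct",
+    prompt="What do you think about global warming?",
+    max_tokens=64,
+)
+print(response.choices[0].text)
+```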
+
+## NVIDIA NIM Plugin
+
+We use llama-3.1-8b-instruct as the example and treat it as an expert called nllama3. The sample registers a plugin called nllama3 and uses nllama3 as a magic word: any question that mentions nllama3 is routed to this plugin, while other questions are answered by the default LLM.
+
+- Update nim_url with the endpoint created in the previous step.
+- Run nvidia_nim_plugin.py and see how it works.
+
+## Additional Resources
+
+- Refer to the [NVIDIA NIM documentation](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html) for guidance on deploying the service.
\ No newline at end of file
diff --git a/python/samples/demos/nvidia-nim/nvidia_nim_plugin.py b/python/samples/demos/nvidia-nim/nvidia_nim_plugin.py
new file mode 100644
index 000000000000..acda0a0ac7e0
--- /dev/null
+++ b/python/samples/demos/nvidia-nim/nvidia_nim_plugin.py
@@ -0,0 +1,140 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+from typing import Annotated
+
+from openai import OpenAI
+from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
+from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion, OpenAIChatCompletion
+from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
+    OpenAIChatPromptExecutionSettings,
+)
+from semantic_kernel.contents.chat_history import ChatHistory
+from semantic_kernel.contents.function_call_content import FunctionCallContent
+from semantic_kernel.functions.kernel_arguments import KernelArguments
+from semantic_kernel.functions.kernel_function_decorator import kernel_function
+from semantic_kernel.kernel import Kernel
+
+# Replace this URL with your real NIM endpoint.
+nim_url = "http://0.0.0.0:8000/v1"
+
+
+class NLlama3Plugin:
+    """A sample plugin that returns responses from NIM."""
+
+    @kernel_function(name="get_nllama3_opinion", description="Get the opinion of nllama3")
+    def get_nllama3_opinion(
+        self, question: Annotated[str, "The input question"]
+    ) -> Annotated[str, "The output is a string"]:
+        prompt = question.replace("nllama3", "you")
+        # Make sure the model name matches the NIM model you deployed.
+        client = OpenAI(base_url=nim_url, api_key="not-used")
+        response = client.completions.create(
+            model="meta/llama-3.1-8b-instruct",
+            prompt=prompt,
+            max_tokens=64,
+            stream=False,
+        )
+        return response.choices[0].text
+
+
+async def main():
+    kernel = Kernel()
+
+    use_azure_openai = True
+    service_id = "function_calling"
+    if use_azure_openai:
+        # Please make sure your Azure OpenAI deployment allows for function calling.
+        ai_service = AzureChatCompletion(
+            service_id=service_id,
+        )
+    else:
+        ai_service = OpenAIChatCompletion(
+            service_id=service_id,
+            ai_model_id="gpt-3.5-turbo-1106",
+        )
+    kernel.add_service(ai_service)
+
+    kernel.add_plugin(NLlama3Plugin(), plugin_name="nllama3")
+
+    # Example 1: Use automated function calling with a non-streaming prompt
+    print("========== Example 1: Use automated function calling with a non-streaming prompt ==========")
+    settings: OpenAIChatPromptExecutionSettings = kernel.get_prompt_execution_settings_from_service_id(
+        service_id=service_id
+    )
+    settings.function_call_behavior = FunctionCallBehavior.EnableFunctions(
+        auto_invoke=True, filters={"included_plugins": ["nllama3"]}
+    )
+
+    print(
+        await kernel.invoke_prompt(
+            function_name="prompt_test",
+            plugin_name="nllama3_test",
+            prompt="What does nllama3 think about global warming?",
+            settings=settings,
+        )
+    )
+
+    # Example 2: Use automated function calling with a streaming prompt
+    print("========== Example 2: Use automated function calling with a streaming prompt ==========")
+    settings: OpenAIChatPromptExecutionSettings = kernel.get_prompt_execution_settings_from_service_id(
+        service_id=service_id
+    )
+    settings.function_call_behavior = FunctionCallBehavior.EnableFunctions(
+        auto_invoke=True, filters={"included_plugins": ["nllama3"]}
+    )
+
+    result = kernel.invoke_prompt_stream(
+        function_name="prompt_test",
+        plugin_name="nllama3_test",
+        prompt="What does nllama3 think about global warming?",
+        settings=settings,
+    )
+
+    async for message in result:
+        print(str(message[0]), end="")
+    print("")
+
+    # Example 3: Use manual function calling with a non-streaming prompt
+    print("========== Example 3: Use manual function calling with a non-streaming prompt ==========")
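+
+    # With auto_invoke=False the model only proposes function calls: each response may contain
+    # FunctionCallContent items instead of (or in addition to) plain text. The loop below executes
+    # those calls ourselves, appends the results to the chat history, and asks the model again
+    # until it replies with content and no further function calls.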
+    chat: OpenAIChatCompletion | AzureChatCompletion = kernel.get_service(service_id)
+    chat_history = ChatHistory()
+    settings: OpenAIChatPromptExecutionSettings = kernel.get_prompt_execution_settings_from_service_id(
+        service_id=service_id
+    )
+    settings.function_call_behavior = FunctionCallBehavior.EnableFunctions(
+        auto_invoke=False, filters={"included_plugins": ["nllama3"]}
+    )
+    chat_history.add_user_message("What does nllama3 think about global warming?")
+
+    while True:
+        # The result is a list of ChatMessageContent objects, grab the first one
+        result = await chat.get_chat_message_contents(chat_history=chat_history, settings=settings, kernel=kernel)
+        result = result[0]
+
+        if result.content:
+            print(result.content)
+
+        # Stop once the model replies without proposing any function calls.
+        if not result.items or not any(isinstance(item, FunctionCallContent) for item in result.items):
+            break
+
+        chat_history.add_message(result)
+        for item in result.items:
+            # _process_function_call is an internal helper of the chat completion service; it invokes
+            # the proposed plugin function and appends the result to the chat history.
+            await chat._process_function_call(
+                function_call=item,
+                kernel=kernel,
+                chat_history=chat_history,
+                arguments=KernelArguments(),
+                function_call_count=1,
+                request_index=0,
+                function_call_behavior=settings.function_call_behavior,
+            )
+
+
+if __name__ == "__main__":
+    asyncio.run(main())