diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 62f532dd7473..1aa84ac81c6e 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -209,6 +209,10 @@ # ServiceLabel: %Image Analysis %Service Attention /sdk/vision/azure-ai-vision-imageanalysis/ @dargilco @rhurey +# PRLabel: %AI Model Inference +# ServiceLabel: %AI Model Inference %Service Attention +/sdk/ai/azure-ai-inference/ @dargilco + # PRLabel: %HDInsight /sdk/hdinsight/ @idear1203 diff --git a/.vscode/cspell.json b/.vscode/cspell.json index 669fabcf0879..f324f4cc38f1 100644 --- a/.vscode/cspell.json +++ b/.vscode/cspell.json @@ -1280,6 +1280,16 @@ "smirnov" ] }, + { + "filename": "sdk/ai/azure-ai-inference/**", + "words": [ + "ubinary", + "mros", + "Nify", + "ctxt", + "wday" + ] + }, { "filename": "sdk/ai/azure-ai-generative/**", "words": [ diff --git a/eng/.docsettings.yml b/eng/.docsettings.yml index fdca190818e5..202d26e7640a 100644 --- a/eng/.docsettings.yml +++ b/eng/.docsettings.yml @@ -14,6 +14,7 @@ omitted_paths: - sdk/**/swagger/* - sdk/ml/azure-ai-ml/tests/* - sdk/vision/azure-ai-vision-imageanalysis/tests/* + - sdk/ai/azure-ai-inference/tests/* - sdk/storage/azure-storage-extensions/* language: python diff --git a/eng/pipelines/templates/stages/platform-matrix-ai.json b/eng/pipelines/templates/stages/platform-matrix-ai.json deleted file mode 100644 index 6a056ce2280b..000000000000 --- a/eng/pipelines/templates/stages/platform-matrix-ai.json +++ /dev/null @@ -1,31 +0,0 @@ -{ - "displayNames": { - "--disablecov": "", - "false": "", - "true": "" - }, - "matrix": { - "Agent": { - "windows-2022": { "OSVmImage": "env:WINDOWSVMIMAGE", "Pool": "env:WINDOWSPOOL" }, - "ubuntu-20.04": { "OSVmImage": "env:LINUXVMIMAGE", "Pool": "env:LINUXPOOL" }, - "macos-11": { "OSVmImage": "env:MACVMIMAGE", "Pool": "env:MACPOOL" } - }, - "PythonVersion": ["3.8", "3.10", "3.11" ], - "CoverageArg": "--disablecov", - "TestSamples": "false" - }, - "include": [ - { - "CoverageConfig": { - "ubuntu2004_39_coverage": { - "OSVmImage": "env:LINUXVMIMAGE", - "Pool": "env:LINUXPOOL", - "PythonVersion": "3.9", - "CoverageArg": "", - "TestSamples": "false" - } - } - } - ] -} - diff --git a/sdk/ai/azure-ai-generative/setup.py b/sdk/ai/azure-ai-generative/setup.py index b767590d29c9..696e84a33431 100644 --- a/sdk/ai/azure-ai-generative/setup.py +++ b/sdk/ai/azure-ai-generative/setup.py @@ -42,7 +42,7 @@ url="https://github.com/Azure/azure-sdk-for-python", keywords="azure, azuresdk, azure sdk", classifiers=[ - "Development Status :: 4 - Beta", + "Development Status :: 7 - Inactive", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", diff --git a/sdk/ai/azure-ai-inference/CHANGELOG.md b/sdk/ai/azure-ai-inference/CHANGELOG.md new file mode 100644 index 000000000000..402c31bca81e --- /dev/null +++ b/sdk/ai/azure-ai-inference/CHANGELOG.md @@ -0,0 +1,5 @@ +# Release History + +## 1.0.0b1 (2024-06-11) + +- Initial beta version diff --git a/sdk/ai/azure-ai-inference/LICENSE b/sdk/ai/azure-ai-inference/LICENSE new file mode 100644 index 000000000000..63447fd8bbbf --- /dev/null +++ b/sdk/ai/azure-ai-inference/LICENSE @@ -0,0 +1,21 @@ +Copyright (c) Microsoft Corporation. 
+ +MIT License + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/MANIFEST.in b/sdk/ai/azure-ai-inference/MANIFEST.in new file mode 100644 index 000000000000..6af49607c8e6 --- /dev/null +++ b/sdk/ai/azure-ai-inference/MANIFEST.in @@ -0,0 +1,7 @@ +include *.md +include LICENSE +include azure/ai/inference/py.typed +recursive-include tests *.py +recursive-include samples *.py *.md +include azure/__init__.py +include azure/ai/__init__.py \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/README.md b/sdk/ai/azure-ai-inference/README.md new file mode 100644 index 000000000000..108c3cc23438 --- /dev/null +++ b/sdk/ai/azure-ai-inference/README.md @@ -0,0 +1,482 @@ +# Azure AI Inference client library for Python + +The client Library (in preview) does inference, including chat completions, for AI models deployed by [Azure AI Studio](https://ai.azure.com) and [Azure Machine Learning Studio](https://ml.azure.com/). It supports Serverless API endpoints and Managed Compute Endpoints (formerly known as Managed Online Endpoints). The client library makes services calls using REST API version `2024-05-01-preview`, as documented in [Azure AI Model Inference API](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-api). For more information see [Overview: Deploy models, flows, and web apps with Azure AI Studio](https://learn.microsoft.com/azure/ai-studio/concepts/deployments-overview). + +Use the model inference client library to: + +* Authenticate against the service +* Get information about the model +* Do chat completions +* Get text embeddings + + +With some minor adjustments, this client library can also be configured to do inference for Azure OpenAI endpoints. See samples with `azure_openai` in their name, in the [samples folder](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples). 
+
+[Product documentation](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-api)
+| [Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples)
+| [API reference documentation](https://aka.ms/azsdk/azure-ai-inference/python/reference)
+| [Package (PyPI)](https://aka.ms/azsdk/azure-ai-inference/python/package)
+| [SDK source code](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/azure/ai/inference)
+
+## Getting started
+
+### Prerequisites
+
+* [Python 3.8](https://www.python.org/) or later installed, including [pip](https://pip.pypa.io/en/stable/).
+* An [Azure subscription](https://azure.microsoft.com/free).
+* An [AI Model from the catalog](https://ai.azure.com/explore/models) deployed through Azure AI Studio.
+* To construct the client library, you will need to pass in the endpoint URL. The endpoint URL has the form `https://your-host-name.your-azure-region.inference.ai.azure.com`, where `your-host-name` is your unique model deployment host name and `your-azure-region` is the Azure region where the model is deployed (e.g. `eastus2`).
+* Depending on your model deployment and authentication preference, you either need a key to authenticate against the service, or Entra ID credentials. The key is a 32-character string.
+
+### Install the package
+
+To install the Azure AI Inference package, use the following command:
+
+```bash
+pip install azure-ai-inference
+```
+
+To update an existing installation of the package, use:
+
+```bash
+pip install --upgrade azure-ai-inference
+```
+
+## Key concepts
+
+### Create and authenticate a client directly, using key
+
+The package includes two clients: `ChatCompletionsClient` and `EmbeddingsClient`. Both can be created in a similar manner. For example, assuming `endpoint` and `key` are strings holding your endpoint URL and key, this Python code will create and authenticate a synchronous `ChatCompletionsClient`:
+
+```python
+from azure.ai.inference import ChatCompletionsClient
+from azure.core.credentials import AzureKeyCredential
+
+client = ChatCompletionsClient(
+    endpoint=endpoint,
+    credential=AzureKeyCredential(key)
+)
+```
+
+A synchronous client supports synchronous inference methods, meaning they block until the service responds with inference results. For simplicity, the code snippets below all use synchronous methods. The client offers equivalent asynchronous methods, which are more commonly used in production.
+
+To create an asynchronous client, install the additional package [aiohttp](https://pypi.org/project/aiohttp/):
+
+```bash
+pip install aiohttp
+```
+
+and update the code above to import `asyncio`, and import `ChatCompletionsClient` from the `azure.ai.inference.aio` namespace instead of `azure.ai.inference`:
+
+```python
+import asyncio
+from azure.ai.inference.aio import ChatCompletionsClient
+from azure.core.credentials import AzureKeyCredential
+
+client = ChatCompletionsClient(
+    endpoint=endpoint,
+    credential=AzureKeyCredential(key)
+)
+```
+
+### Create and authenticate a client directly, using Entra ID
+
+_Note: At the time of this package release, not all deployments support Entra ID authentication. For those that do, follow the instructions below._
+
+To use an Entra ID token credential, first install the [azure-identity](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity) package:
+
+```bash
+pip install azure-identity
+```
+
+You will need to provide the desired credential type obtained from that package. A common selection is [DefaultAzureCredential](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/identity/azure-identity#defaultazurecredential), and it can be used as follows:
+
+```python
+from azure.ai.inference import ChatCompletionsClient
+from azure.identity import DefaultAzureCredential
+
+client = ChatCompletionsClient(
+    endpoint=endpoint,
+    credential=DefaultAzureCredential(exclude_interactive_browser_credential=False)
+)
+```
+
+During application development, you would typically set up the environment for authentication using Entra ID by first [installing the Azure CLI](https://learn.microsoft.com/cli/azure/install-azure-cli), running `az login` in your console window, and then entering your credentials in the browser window that opens. The call to `DefaultAzureCredential()` will then succeed. Setting `exclude_interactive_browser_credential=False` in that call will enable launching a browser window if the user isn't already logged in.
+
+### Create and authenticate clients using `load_client`
+
+As an alternative to creating a specific client directly, you can use the function `load_client` to return the relevant client (of type `ChatCompletionsClient` or `EmbeddingsClient`) based on the provided endpoint:
+
+```python
+from azure.ai.inference import load_client
+from azure.core.credentials import AzureKeyCredential
+
+client = load_client(
+    endpoint=endpoint,
+    credential=AzureKeyCredential(key)
+)
+
+print(f"Created client of type `{type(client).__name__}`.")
+```
+
+To load an asynchronous client, import the `load_client` function from `azure.ai.inference.aio` instead.
+
+Entra ID authentication is also supported by the `load_client` function. Replace the key authentication above with `credential=DefaultAzureCredential()`, for example.
+
+### Getting AI model information
+
+All clients provide a `get_model_info` method to retrieve AI model information. This makes a REST call to the `/info` route on the provided endpoint, as documented in [the REST API reference](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-info).
+
+```python
+model_info = client.get_model_info()
+
+print(f"Model name: {model_info.model_name}")
+print(f"Model provider name: {model_info.model_provider_name}")
+print(f"Model type: {model_info.model_type}")
+```
+
+AI model information is cached in the client, and further calls to `get_model_info` will access the cached value and will not result in a REST API call. Note that if you created the client using the `load_client` function, model information will already be cached in the client.
+
+AI model information is displayed (if available) when you `print(client)`.
+
+### Chat Completions
+
+The `ChatCompletionsClient` has a method named `complete`. The method makes a REST API call to the `/chat/completions` route on the provided endpoint, as documented in [the REST API reference](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-chat-completions).
+
+See simple chat completion examples below. More can be found in the [samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples) folder.
+
+### Text Embeddings
+
+The `EmbeddingsClient` has a method named `embed`. The method makes a REST API call to the `/embeddings` route on the provided endpoint, as documented in [the REST API reference](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-embeddings).
+
+See the simple text embedding example below. More can be found in the [samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples) folder.
+
+### Sending proprietary model parameters
+
+The REST API defines common model parameters for chat completions, text embeddings, etc. If the model you are targeting has additional parameters you would like to set, the client library allows you to easily do so. See [Chat completions with additional model-specific parameters](#chat-completions-with-additional-model-specific-parameters). The same applies to the other clients.
+
+### Inference using Azure OpenAI endpoints
+
+The request and response payloads of the [Azure AI Model Inference API](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-api) are mostly compatible with the OpenAI REST APIs for chat completions and text embeddings. Therefore, with some minor adjustments, this client library can be configured to do inference using Azure OpenAI endpoints. See samples with `azure_openai` in their name, in the [samples folder](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples), and the comments there.
+
+## Examples
+
+In the following sections you will find simple examples of:
+
+* [Chat completions](#chat-completions-example)
+* [Streaming chat completions](#streaming-chat-completions-example)
+* [Chat completions with additional model-specific parameters](#chat-completions-with-additional-model-specific-parameters)
+* [Text Embeddings](#text-embeddings-example)
+
+The examples create a synchronous client as described in [Create and authenticate a client directly, using key](#create-and-authenticate-a-client-directly-using-key). Only mandatory input settings are shown for simplicity.
+
+See the [Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples) folder for full working samples for synchronous and asynchronous clients.
+
+### Chat completions example
+
+This example demonstrates how to generate a single chat completion, with key authentication, assuming `endpoint` and `key` are already defined.
+
+```python
+from azure.ai.inference import ChatCompletionsClient
+from azure.ai.inference.models import SystemMessage, UserMessage
+from azure.core.credentials import AzureKeyCredential
+
+client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+response = client.complete(
+    messages=[
+        SystemMessage(content="You are a helpful assistant."),
+        UserMessage(content="How many feet are in a mile?"),
+    ]
+)
+
+print(response.choices[0].message.content)
+```
+
+The following types of messages are supported: `SystemMessage`, `UserMessage`, `AssistantMessage`, `ToolMessage` (see sample [sample_chat_completions_with_tools.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py) for usage of `ToolMessage`). A short multi-turn sketch using these typed classes is shown below.
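+
+As a minimal illustrative sketch (not one of the package samples), a multi-turn conversation can be expressed with the same typed classes by including the model's earlier reply as an `AssistantMessage` in the message history. The follow-up question here is hypothetical, and `client` is assumed to be the `ChatCompletionsClient` created above:
+
+```python
+from azure.ai.inference.models import SystemMessage, UserMessage, AssistantMessage
+
+# First turn: ask a question.
+response = client.complete(
+    messages=[
+        SystemMessage(content="You are a helpful assistant."),
+        UserMessage(content="How many feet are in a mile?"),
+    ]
+)
+
+# Second turn: replay the history, including the assistant's reply, then ask a follow-up.
+response = client.complete(
+    messages=[
+        SystemMessage(content="You are a helpful assistant."),
+        UserMessage(content="How many feet are in a mile?"),
+        AssistantMessage(content=response.choices[0].message.content),
+        UserMessage(content="And how many meters is that?"),
+    ]
+)
+
+print(response.choices[0].message.content)
+```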
+
+Alternatively, you can provide the messages as a dictionary instead of using the strongly typed classes like `SystemMessage` and `UserMessage`:
+
+```python
+response = client.complete(
+    {
+        "messages": [
+            {
+                "role": "system",
+                "content": "You are an AI assistant that helps people find information. Your replies are short, no more than two sentences.",
+            },
+            {
+                "role": "user",
+                "content": "What year was construction of the International Space Station mostly done?",
+            },
+            {
+                "role": "assistant",
+                "content": "The main construction of the International Space Station (ISS) was completed between 1998 and 2011. During this period, more than 30 flights by US space shuttles and 40 by Russian rockets were conducted to transport components and modules to the station.",
+            },
+            {"role": "user", "content": "And what was the estimated cost to build it?"},
+        ]
+    }
+)
+```
+
+To generate completions for additional messages, simply call `client.complete` multiple times using the same `client`.
+
+### Streaming chat completions example
+
+This example demonstrates how to generate a single chat completion with a streaming response, with key authentication, assuming `endpoint` and `key` are already defined. You need to add `stream=True` to the `complete` call to enable streaming.
+
+```python
+from azure.ai.inference import ChatCompletionsClient
+from azure.ai.inference.models import SystemMessage, UserMessage
+from azure.core.credentials import AzureKeyCredential
+
+client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+response = client.complete(
+    stream=True,
+    messages=[
+        SystemMessage(content="You are a helpful assistant."),
+        UserMessage(content="Give me 5 good reasons why I should exercise every day."),
+    ],
+)
+
+for update in response:
+    print(update.choices[0].delta.content or "", end="")
+
+client.close()
+```
+
+In the `for` loop above that prints the results, you should see the answer progressively get longer as updates are streamed to the client.
+
+To generate completions for additional messages, simply call `client.complete` multiple times using the same `client`.
+
+### Chat completions with additional model-specific parameters
+
+In this example, extra JSON elements are inserted at the root of the request body by setting `model_extras` when calling the `complete` method. These are intended for AI models that require extra parameters beyond what is defined in the REST API.
+
+Note that by default, the service will reject any request payload that includes unknown parameters (ones that are not defined in the REST API [Request Body table](https://learn.microsoft.com/azure/ai-studio/reference/reference-model-inference-chat-completions#request-body)). In order to change the default service behavior, when the `complete` method includes `model_extras`, the client library will automatically add the HTTP request header `"unknown_params": "pass_through"`.
+
+The input argument `model_extras` is not restricted to chat completions. It is supported on other client methods as well.
+
+```python
+response = client.complete(
+    messages=[
+        SystemMessage(content="You are a helpful assistant."),
+        UserMessage(content="How many feet are in a mile?"),
+    ],
+    model_extras={"key1": "value1", "key2": "value2"},  # Optional. Additional parameters to pass to the model.
+) +``` + + +In the above example, this will be the JSON payload in the HTTP request: + +```json +{ + "messages": + [ + {"role":"system","content":"You are a helpful assistant."}, + {"role":"user","content":"How many feet are in a mile?"} + ], + "key1": "value1", + "key2": "value2" +} +``` + +### Text Embeddings example + +This example demonstrates how to get text embeddings, with key authentication, assuming `endpoint` and `key` are already defined. + + + +```python +from azure.ai.inference import EmbeddingsClient +from azure.core.credentials import AzureKeyCredential + +client = EmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + +response = client.embed(input=["first phrase", "second phrase", "third phrase"]) + +for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, " + f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) +``` + + + +The length of the embedding vector depends on the model, but you should see something like this: + +```txt +data[0]: length=1024, [0.0013399124, -0.01576233, ..., 0.007843018, 0.000238657] +data[1]: length=1024, [0.036590576, -0.0059547424, ..., 0.011405945, 0.004863739] +data[2]: length=1024, [0.04196167, 0.029083252, ..., -0.0027484894, 0.0073127747] +``` + +To generate embeddings for additional phrases, simply call `client.embed` multiple times using the same `client`. + + + +## Troubleshooting + +### Exceptions + +The `complete`, `embed` and `get_model_info` methods on the clients raise an [HttpResponseError](https://learn.microsoft.com/python/api/azure-core/azure.core.exceptions.httpresponseerror) exception for a non-success HTTP status code response from the service. The exception's `status_code` will be the HTTP response status code. The exception's `error.message` contains a detailed message that will allow you to diagnose the issue: + +```python +from azure.core.exceptions import HttpResponseError + +... + +try: + result = client.complete( ... ) +except HttpResponseError as e: + print(f"Status code: {e.status_code} ({e.reason})") +``` + +For example, when you provide a wrong authentication key: + +```text +Status code: 401 (Unauthorized) +Operation returned an invalid status 'Unauthorized' +``` + +Or for example when you created an `EmbeddingsClient` and called `embed` on the client, but the endpoint does not +support the `/embeddings` route: + +```text +Status code: 405 (Method Not Allowed) +Operation returned an invalid status 'Method Not Allowed' +``` + +### Logging + +The client uses the standard [Python logging library](https://docs.python.org/3/library/logging.html). The SDK logs HTTP request and response details, which may be useful in troubleshooting. To log to stdout, add the following: + +```python +import sys +import logging + +# Acquire the logger for this client library. Use 'azure' to affect both +# 'azure.core` and `azure.ai.inference' libraries. +logger = logging.getLogger("azure") + +# Set the desired logging level. logging.INFO or logging.DEBUG are good options. +logger.setLevel(logging.DEBUG) + +# Direct logging output to stdout: +handler = logging.StreamHandler(stream=sys.stdout) +# Or direct logging output to a file: +# handler = logging.FileHandler(filename="sample.log") +logger.addHandler(handler) + +# Optional: change the default logging format. Here we add a timestamp. 
+formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(name)s:%(message)s") +handler.setFormatter(formatter) +``` + +By default logs redact the values of URL query strings, the values of some HTTP request and response headers (including `Authorization` which holds the key or token), and the request and response payloads. To create logs without redaction, set the method argument `logging_enable = True` when you construct the client library, or when you call any of the client's `create` methods. + +```python +# Create a chat completions client with non redacted logs +client = ChatCompletionsClient( + endpoint=endpoint, + credential=AzureKeyCredential(key), + logging_enable=True +) +``` + +None redacted logs are generated for log level `logging.DEBUG` only. Be sure to protect non redacted logs to avoid compromising security. For more information see [Configure logging in the Azure libraries for Python](https://aka.ms/azsdk/python/logging) + +## Next steps + +* Have a look at the [Samples](https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference/samples) folder, containing fully runnable Python code for doing inference using synchronous and asynchronous clients. + +## Contributing + +This project welcomes contributions and suggestions. Most contributions require +you to agree to a Contributor License Agreement (CLA) declaring that you have +the right to, and actually do, grant us the rights to use your contribution. +For details, visit [https://cla.microsoft.com](https://cla.microsoft.com). + +When you submit a pull request, a CLA-bot will automatically determine whether +you need to provide a CLA and decorate the PR appropriately (e.g., label, +comment). Simply follow the instructions provided by the bot. You will only +need to do this once across all repos using our CLA. + +This project has adopted the +[Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct). For more information, +see the Code of Conduct FAQ or contact opencode@microsoft.com with any +additional questions or comments. + + + diff --git a/sdk/ai/azure-ai-inference/assets.json b/sdk/ai/azure-ai-inference/assets.json new file mode 100644 index 000000000000..ce5bacc60905 --- /dev/null +++ b/sdk/ai/azure-ai-inference/assets.json @@ -0,0 +1,6 @@ +{ + "AssetsRepo": "Azure/azure-sdk-assets", + "AssetsRepoPrefixPath": "python", + "TagPrefix": "python/ai/azure-ai-inference", + "Tag": "python/ai/azure-ai-inference_a50981dab0" +} diff --git a/sdk/ai/azure-ai-inference/azure/__init__.py b/sdk/ai/azure-ai-inference/azure/__init__.py new file mode 100644 index 000000000000..d55ccad1f573 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/__init__.py @@ -0,0 +1 @@ +__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore diff --git a/sdk/ai/azure-ai-inference/azure/ai/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/__init__.py new file mode 100644 index 000000000000..d55ccad1f573 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/__init__.py @@ -0,0 +1 @@ +__path__ = __import__("pkgutil").extend_path(__path__, __name__) # type: ignore diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py new file mode 100644 index 000000000000..ff62b276a309 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py @@ -0,0 +1,28 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. 
All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from ._patch import ChatCompletionsClient +from ._patch import EmbeddingsClient +from ._patch import ImageEmbeddingsClient +from ._version import VERSION + +__version__ = VERSION + + +from ._patch import load_client +from ._patch import patch_sdk as _patch_sdk + +__all__ = [ + "load_client", + "ChatCompletionsClient", + "EmbeddingsClient", + "ImageEmbeddingsClient", +] + + +_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py new file mode 100644 index 000000000000..f717136114ce --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py @@ -0,0 +1,265 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from copy import deepcopy +from typing import Any, TYPE_CHECKING, Union + +from azure.core import PipelineClient +from azure.core.credentials import AzureKeyCredential +from azure.core.pipeline import policies +from azure.core.rest import HttpRequest, HttpResponse + +from ._configuration import ( + ChatCompletionsClientConfiguration, + EmbeddingsClientConfiguration, + ImageEmbeddingsClientConfiguration, +) +from ._operations import ( + ChatCompletionsClientOperationsMixin, + EmbeddingsClientOperationsMixin, + ImageEmbeddingsClientOperationsMixin, +) +from ._serialization import Deserializer, Serializer + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core.credentials import TokenCredential + + +class ChatCompletionsClient(ChatCompletionsClientOperationsMixin): # pylint: disable=client-accepts-api-version-keyword + """ChatCompletionsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + _endpoint = "{endpoint}" + self._config = ChatCompletionsClientConfiguration(endpoint=endpoint, credential=credential, **kwargs) + _policies = kwargs.pop("policies", None) + if _policies is None: + _policies = [ + policies.RequestIdPolicy(**kwargs), + self._config.headers_policy, + self._config.user_agent_policy, + self._config.proxy_policy, + policies.ContentDecodePolicy(**kwargs), + self._config.redirect_policy, + self._config.retry_policy, + self._config.authentication_policy, + self._config.custom_hook_policy, + self._config.logging_policy, + policies.DistributedTracingPolicy(**kwargs), + policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None, + self._config.http_logging_policy, + ] + self._client: PipelineClient = PipelineClient(base_url=_endpoint, policies=_policies, **kwargs) + + self._serialize = Serializer() + self._deserialize = Deserializer() + self._serialize.client_side_validation = False + + def send_request(self, request: HttpRequest, *, stream: bool = False, **kwargs: Any) -> HttpResponse: + """Runs the network request through the client's chained policies. + + >>> from azure.core.rest import HttpRequest + >>> request = HttpRequest("GET", "https://www.example.org/") + + >>> response = client.send_request(request) + + + For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request + + :param request: The network request you want to make. Required. + :type request: ~azure.core.rest.HttpRequest + :keyword bool stream: Whether the response payload will be streamed. Defaults to False. + :return: The response of your network call. Does not do error handling on your response. + :rtype: ~azure.core.rest.HttpResponse + """ + + request_copy = deepcopy(request) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + + request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments) + return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore + + def close(self) -> None: + self._client.close() + + def __enter__(self) -> "ChatCompletionsClient": + self._client.__enter__() + return self + + def __exit__(self, *exc_details: Any) -> None: + self._client.__exit__(*exc_details) + + +class EmbeddingsClient(EmbeddingsClientOperationsMixin): # pylint: disable=client-accepts-api-version-keyword + """EmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + _endpoint = "{endpoint}" + self._config = EmbeddingsClientConfiguration(endpoint=endpoint, credential=credential, **kwargs) + _policies = kwargs.pop("policies", None) + if _policies is None: + _policies = [ + policies.RequestIdPolicy(**kwargs), + self._config.headers_policy, + self._config.user_agent_policy, + self._config.proxy_policy, + policies.ContentDecodePolicy(**kwargs), + self._config.redirect_policy, + self._config.retry_policy, + self._config.authentication_policy, + self._config.custom_hook_policy, + self._config.logging_policy, + policies.DistributedTracingPolicy(**kwargs), + policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None, + self._config.http_logging_policy, + ] + self._client: PipelineClient = PipelineClient(base_url=_endpoint, policies=_policies, **kwargs) + + self._serialize = Serializer() + self._deserialize = Deserializer() + self._serialize.client_side_validation = False + + def send_request(self, request: HttpRequest, *, stream: bool = False, **kwargs: Any) -> HttpResponse: + """Runs the network request through the client's chained policies. + + >>> from azure.core.rest import HttpRequest + >>> request = HttpRequest("GET", "https://www.example.org/") + + >>> response = client.send_request(request) + + + For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request + + :param request: The network request you want to make. Required. + :type request: ~azure.core.rest.HttpRequest + :keyword bool stream: Whether the response payload will be streamed. Defaults to False. + :return: The response of your network call. Does not do error handling on your response. + :rtype: ~azure.core.rest.HttpResponse + """ + + request_copy = deepcopy(request) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + + request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments) + return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore + + def close(self) -> None: + self._client.close() + + def __enter__(self) -> "EmbeddingsClient": + self._client.__enter__() + return self + + def __exit__(self, *exc_details: Any) -> None: + self._client.__exit__(*exc_details) + + +class ImageEmbeddingsClient(ImageEmbeddingsClientOperationsMixin): # pylint: disable=client-accepts-api-version-keyword + """ImageEmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + _endpoint = "{endpoint}" + self._config = ImageEmbeddingsClientConfiguration(endpoint=endpoint, credential=credential, **kwargs) + _policies = kwargs.pop("policies", None) + if _policies is None: + _policies = [ + policies.RequestIdPolicy(**kwargs), + self._config.headers_policy, + self._config.user_agent_policy, + self._config.proxy_policy, + policies.ContentDecodePolicy(**kwargs), + self._config.redirect_policy, + self._config.retry_policy, + self._config.authentication_policy, + self._config.custom_hook_policy, + self._config.logging_policy, + policies.DistributedTracingPolicy(**kwargs), + policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None, + self._config.http_logging_policy, + ] + self._client: PipelineClient = PipelineClient(base_url=_endpoint, policies=_policies, **kwargs) + + self._serialize = Serializer() + self._deserialize = Deserializer() + self._serialize.client_side_validation = False + + def send_request(self, request: HttpRequest, *, stream: bool = False, **kwargs: Any) -> HttpResponse: + """Runs the network request through the client's chained policies. + + >>> from azure.core.rest import HttpRequest + >>> request = HttpRequest("GET", "https://www.example.org/") + + >>> response = client.send_request(request) + + + For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request + + :param request: The network request you want to make. Required. + :type request: ~azure.core.rest.HttpRequest + :keyword bool stream: Whether the response payload will be streamed. Defaults to False. + :return: The response of your network call. Does not do error handling on your response. + :rtype: ~azure.core.rest.HttpResponse + """ + + request_copy = deepcopy(request) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + + request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments) + return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore + + def close(self) -> None: + self._client.close() + + def __enter__(self) -> "ImageEmbeddingsClient": + self._client.__enter__() + return self + + def __exit__(self, *exc_details: Any) -> None: + self._client.__exit__(*exc_details) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py new file mode 100644 index 000000000000..65f27adc9ec9 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py @@ -0,0 +1,183 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. 
+# -------------------------------------------------------------------------- + +from typing import Any, TYPE_CHECKING, Union + +from azure.core.credentials import AzureKeyCredential +from azure.core.pipeline import policies + +from ._version import VERSION + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core.credentials import TokenCredential + + +class ChatCompletionsClientConfiguration: # pylint: disable=too-many-instance-attributes,name-too-long + """Configuration for ChatCompletionsClient. + + Note that all parameters used to create this instance are saved as instance + attributes. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + api_version: str = kwargs.pop("api_version", "2024-05-01-preview") + + if endpoint is None: + raise ValueError("Parameter 'endpoint' must not be None.") + if credential is None: + raise ValueError("Parameter 'credential' must not be None.") + + self.endpoint = endpoint + self.credential = credential + self.api_version = api_version + self.credential_scopes = kwargs.pop("credential_scopes", ["https://ml.azure.com/.default"]) + kwargs.setdefault("sdk_moniker", "ai-inference/{}".format(VERSION)) + self.polling_interval = kwargs.get("polling_interval", 30) + self._configure(**kwargs) + + def _infer_policy(self, **kwargs): + if isinstance(self.credential, AzureKeyCredential): + return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) + if hasattr(self.credential, "get_token"): + return policies.BearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) + raise TypeError(f"Unsupported credential: {self.credential}") + + def _configure(self, **kwargs: Any) -> None: + self.user_agent_policy = kwargs.get("user_agent_policy") or policies.UserAgentPolicy(**kwargs) + self.headers_policy = kwargs.get("headers_policy") or policies.HeadersPolicy(**kwargs) + self.proxy_policy = kwargs.get("proxy_policy") or policies.ProxyPolicy(**kwargs) + self.logging_policy = kwargs.get("logging_policy") or policies.NetworkTraceLoggingPolicy(**kwargs) + self.http_logging_policy = kwargs.get("http_logging_policy") or policies.HttpLoggingPolicy(**kwargs) + self.custom_hook_policy = kwargs.get("custom_hook_policy") or policies.CustomHookPolicy(**kwargs) + self.redirect_policy = kwargs.get("redirect_policy") or policies.RedirectPolicy(**kwargs) + self.retry_policy = kwargs.get("retry_policy") or policies.RetryPolicy(**kwargs) + self.authentication_policy = kwargs.get("authentication_policy") + if self.credential and not self.authentication_policy: + self.authentication_policy = self._infer_policy(**kwargs) + + +class EmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attributes,name-too-long + """Configuration for EmbeddingsClient. + + Note that all parameters used to create this instance are saved as instance + attributes. + + :param endpoint: Service host. 
Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + api_version: str = kwargs.pop("api_version", "2024-05-01-preview") + + if endpoint is None: + raise ValueError("Parameter 'endpoint' must not be None.") + if credential is None: + raise ValueError("Parameter 'credential' must not be None.") + + self.endpoint = endpoint + self.credential = credential + self.api_version = api_version + self.credential_scopes = kwargs.pop("credential_scopes", ["https://ml.azure.com/.default"]) + kwargs.setdefault("sdk_moniker", "ai-inference/{}".format(VERSION)) + self.polling_interval = kwargs.get("polling_interval", 30) + self._configure(**kwargs) + + def _infer_policy(self, **kwargs): + if isinstance(self.credential, AzureKeyCredential): + return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) + if hasattr(self.credential, "get_token"): + return policies.BearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) + raise TypeError(f"Unsupported credential: {self.credential}") + + def _configure(self, **kwargs: Any) -> None: + self.user_agent_policy = kwargs.get("user_agent_policy") or policies.UserAgentPolicy(**kwargs) + self.headers_policy = kwargs.get("headers_policy") or policies.HeadersPolicy(**kwargs) + self.proxy_policy = kwargs.get("proxy_policy") or policies.ProxyPolicy(**kwargs) + self.logging_policy = kwargs.get("logging_policy") or policies.NetworkTraceLoggingPolicy(**kwargs) + self.http_logging_policy = kwargs.get("http_logging_policy") or policies.HttpLoggingPolicy(**kwargs) + self.custom_hook_policy = kwargs.get("custom_hook_policy") or policies.CustomHookPolicy(**kwargs) + self.redirect_policy = kwargs.get("redirect_policy") or policies.RedirectPolicy(**kwargs) + self.retry_policy = kwargs.get("retry_policy") or policies.RetryPolicy(**kwargs) + self.authentication_policy = kwargs.get("authentication_policy") + if self.credential and not self.authentication_policy: + self.authentication_policy = self._infer_policy(**kwargs) + + +class ImageEmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attributes,name-too-long + """Configuration for ImageEmbeddingsClient. + + Note that all parameters used to create this instance are saved as instance + attributes. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + api_version: str = kwargs.pop("api_version", "2024-05-01-preview") + + if endpoint is None: + raise ValueError("Parameter 'endpoint' must not be None.") + if credential is None: + raise ValueError("Parameter 'credential' must not be None.") + + self.endpoint = endpoint + self.credential = credential + self.api_version = api_version + self.credential_scopes = kwargs.pop("credential_scopes", ["https://ml.azure.com/.default"]) + kwargs.setdefault("sdk_moniker", "ai-inference/{}".format(VERSION)) + self.polling_interval = kwargs.get("polling_interval", 30) + self._configure(**kwargs) + + def _infer_policy(self, **kwargs): + if isinstance(self.credential, AzureKeyCredential): + return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) + if hasattr(self.credential, "get_token"): + return policies.BearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) + raise TypeError(f"Unsupported credential: {self.credential}") + + def _configure(self, **kwargs: Any) -> None: + self.user_agent_policy = kwargs.get("user_agent_policy") or policies.UserAgentPolicy(**kwargs) + self.headers_policy = kwargs.get("headers_policy") or policies.HeadersPolicy(**kwargs) + self.proxy_policy = kwargs.get("proxy_policy") or policies.ProxyPolicy(**kwargs) + self.logging_policy = kwargs.get("logging_policy") or policies.NetworkTraceLoggingPolicy(**kwargs) + self.http_logging_policy = kwargs.get("http_logging_policy") or policies.HttpLoggingPolicy(**kwargs) + self.custom_hook_policy = kwargs.get("custom_hook_policy") or policies.CustomHookPolicy(**kwargs) + self.redirect_policy = kwargs.get("redirect_policy") or policies.RedirectPolicy(**kwargs) + self.retry_policy = kwargs.get("retry_policy") or policies.RetryPolicy(**kwargs) + self.authentication_policy = kwargs.get("authentication_policy") + if self.credential and not self.authentication_policy: + self.authentication_policy = self._infer_policy(**kwargs) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py new file mode 100644 index 000000000000..5cf70733404d --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py @@ -0,0 +1,887 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +# pylint: disable=protected-access, arguments-differ, signature-differs, broad-except + +import copy +import calendar +import decimal +import functools +import sys +import logging +import base64 +import re +import typing +import enum +import email.utils +from datetime import datetime, date, time, timedelta, timezone +from json import JSONEncoder +from typing_extensions import Self +import isodate +from azure.core.exceptions import DeserializationError +from azure.core import CaseInsensitiveEnumMeta +from azure.core.pipeline import PipelineResponse +from azure.core.serialization import _Null + +if sys.version_info >= (3, 9): + from collections.abc import MutableMapping +else: + from typing import MutableMapping + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["SdkJSONEncoder", "Model", "rest_field", "rest_discriminator"] + +TZ_UTC = timezone.utc +_T = typing.TypeVar("_T") + + +def _timedelta_as_isostr(td: timedelta) -> str: + """Converts a datetime.timedelta object into an ISO 8601 formatted string, e.g. 'P4DT12H30M05S' + + Function adapted from the Tin Can Python project: https://github.com/RusticiSoftware/TinCanPython + + :param timedelta td: The timedelta to convert + :rtype: str + :return: ISO8601 version of this timedelta + """ + + # Split seconds to larger units + seconds = td.total_seconds() + minutes, seconds = divmod(seconds, 60) + hours, minutes = divmod(minutes, 60) + days, hours = divmod(hours, 24) + + days, hours, minutes = list(map(int, (days, hours, minutes))) + seconds = round(seconds, 6) + + # Build date + date_str = "" + if days: + date_str = "%sD" % days + + if hours or minutes or seconds: + # Build time + time_str = "T" + + # Hours + bigger_exists = date_str or hours + if bigger_exists: + time_str += "{:02}H".format(hours) + + # Minutes + bigger_exists = bigger_exists or minutes + if bigger_exists: + time_str += "{:02}M".format(minutes) + + # Seconds + try: + if seconds.is_integer(): + seconds_string = "{:02}".format(int(seconds)) + else: + # 9 chars long w/ leading 0, 6 digits after decimal + seconds_string = "%09.6f" % seconds + # Remove trailing zeros + seconds_string = seconds_string.rstrip("0") + except AttributeError: # int.is_integer() raises + seconds_string = "{:02}".format(seconds) + + time_str += "{}S".format(seconds_string) + else: + time_str = "" + + return "P" + date_str + time_str + + +def _serialize_bytes(o, format: typing.Optional[str] = None) -> str: + encoded = base64.b64encode(o).decode() + if format == "base64url": + return encoded.strip("=").replace("+", "-").replace("/", "_") + return encoded + + +def _serialize_datetime(o, format: typing.Optional[str] = None): + if hasattr(o, "year") and hasattr(o, "hour"): + if format == "rfc7231": + return email.utils.format_datetime(o, usegmt=True) + if format == "unix-timestamp": + return int(calendar.timegm(o.utctimetuple())) + + # astimezone() fails for naive times in Python 2.7, so make make sure o is aware (tzinfo is set) + if not o.tzinfo: + iso_formatted = o.replace(tzinfo=TZ_UTC).isoformat() + else: + iso_formatted = o.astimezone(TZ_UTC).isoformat() + # Replace the trailing "+00:00" UTC offset with "Z" (RFC 3339: https://www.ietf.org/rfc/rfc3339.txt) + return iso_formatted.replace("+00:00", "Z") + # Next try datetime.date or datetime.time + return o.isoformat() + + +def _is_readonly(p): + try: + return p._visibility == ["read"] # pylint: disable=protected-access + except AttributeError: + return False + + 
+class SdkJSONEncoder(JSONEncoder): + """A JSON encoder that's capable of serializing datetime objects and bytes.""" + + def __init__(self, *args, exclude_readonly: bool = False, format: typing.Optional[str] = None, **kwargs): + super().__init__(*args, **kwargs) + self.exclude_readonly = exclude_readonly + self.format = format + + def default(self, o): # pylint: disable=too-many-return-statements + if _is_model(o): + if self.exclude_readonly: + readonly_props = [p._rest_name for p in o._attr_to_rest_field.values() if _is_readonly(p)] + return {k: v for k, v in o.items() if k not in readonly_props} + return dict(o.items()) + try: + return super(SdkJSONEncoder, self).default(o) + except TypeError: + if isinstance(o, _Null): + return None + if isinstance(o, decimal.Decimal): + return float(o) + if isinstance(o, (bytes, bytearray)): + return _serialize_bytes(o, self.format) + try: + # First try datetime.datetime + return _serialize_datetime(o, self.format) + except AttributeError: + pass + # Last, try datetime.timedelta + try: + return _timedelta_as_isostr(o) + except AttributeError: + # This will be raised when it hits value.total_seconds in the method above + pass + return super(SdkJSONEncoder, self).default(o) + + +_VALID_DATE = re.compile(r"\d{4}[-]\d{2}[-]\d{2}T\d{2}:\d{2}:\d{2}" + r"\.?\d*Z?[-+]?[\d{2}]?:?[\d{2}]?") +_VALID_RFC7231 = re.compile( + r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun),\s\d{2}\s" + r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{4}\s\d{2}:\d{2}:\d{2}\sGMT" +) + + +def _deserialize_datetime(attr: typing.Union[str, datetime]) -> datetime: + """Deserialize ISO-8601 formatted string into Datetime object. + + :param str attr: response string to be deserialized. + :rtype: ~datetime.datetime + :returns: The datetime object from that input + """ + if isinstance(attr, datetime): + # i'm already deserialized + return attr + attr = attr.upper() + match = _VALID_DATE.match(attr) + if not match: + raise ValueError("Invalid datetime string: " + attr) + + check_decimal = attr.split(".") + if len(check_decimal) > 1: + decimal_str = "" + for digit in check_decimal[1]: + if digit.isdigit(): + decimal_str += digit + else: + break + if len(decimal_str) > 6: + attr = attr.replace(decimal_str, decimal_str[0:6]) + + date_obj = isodate.parse_datetime(attr) + test_utc = date_obj.utctimetuple() + if test_utc.tm_year > 9999 or test_utc.tm_year < 1: + raise OverflowError("Hit max or min date") + return date_obj + + +def _deserialize_datetime_rfc7231(attr: typing.Union[str, datetime]) -> datetime: + """Deserialize RFC7231 formatted string into Datetime object. + + :param str attr: response string to be deserialized. + :rtype: ~datetime.datetime + :returns: The datetime object from that input + """ + if isinstance(attr, datetime): + # i'm already deserialized + return attr + match = _VALID_RFC7231.match(attr) + if not match: + raise ValueError("Invalid datetime string: " + attr) + + return email.utils.parsedate_to_datetime(attr) + + +def _deserialize_datetime_unix_timestamp(attr: typing.Union[float, datetime]) -> datetime: + """Deserialize unix timestamp into Datetime object. + + :param str attr: response string to be deserialized. + :rtype: ~datetime.datetime + :returns: The datetime object from that input + """ + if isinstance(attr, datetime): + # i'm already deserialized + return attr + return datetime.fromtimestamp(attr, TZ_UTC) + + +def _deserialize_date(attr: typing.Union[str, date]) -> date: + """Deserialize ISO-8601 formatted string into Date object. 
+ :param str attr: response string to be deserialized. + :rtype: date + :returns: The date object from that input + """ + # This must NOT use defaultmonth/defaultday. Using None ensure this raises an exception. + if isinstance(attr, date): + return attr + return isodate.parse_date(attr, defaultmonth=None, defaultday=None) # type: ignore + + +def _deserialize_time(attr: typing.Union[str, time]) -> time: + """Deserialize ISO-8601 formatted string into time object. + + :param str attr: response string to be deserialized. + :rtype: datetime.time + :returns: The time object from that input + """ + if isinstance(attr, time): + return attr + return isodate.parse_time(attr) + + +def _deserialize_bytes(attr): + if isinstance(attr, (bytes, bytearray)): + return attr + return bytes(base64.b64decode(attr)) + + +def _deserialize_bytes_base64(attr): + if isinstance(attr, (bytes, bytearray)): + return attr + padding = "=" * (3 - (len(attr) + 3) % 4) # type: ignore + attr = attr + padding # type: ignore + encoded = attr.replace("-", "+").replace("_", "/") + return bytes(base64.b64decode(encoded)) + + +def _deserialize_duration(attr): + if isinstance(attr, timedelta): + return attr + return isodate.parse_duration(attr) + + +def _deserialize_decimal(attr): + if isinstance(attr, decimal.Decimal): + return attr + return decimal.Decimal(str(attr)) + + +_DESERIALIZE_MAPPING = { + datetime: _deserialize_datetime, + date: _deserialize_date, + time: _deserialize_time, + bytes: _deserialize_bytes, + bytearray: _deserialize_bytes, + timedelta: _deserialize_duration, + typing.Any: lambda x: x, + decimal.Decimal: _deserialize_decimal, +} + +_DESERIALIZE_MAPPING_WITHFORMAT = { + "rfc3339": _deserialize_datetime, + "rfc7231": _deserialize_datetime_rfc7231, + "unix-timestamp": _deserialize_datetime_unix_timestamp, + "base64": _deserialize_bytes, + "base64url": _deserialize_bytes_base64, +} + + +def get_deserializer(annotation: typing.Any, rf: typing.Optional["_RestField"] = None): + if rf and rf._format: + return _DESERIALIZE_MAPPING_WITHFORMAT.get(rf._format) + return _DESERIALIZE_MAPPING.get(annotation) + + +def _get_type_alias_type(module_name: str, alias_name: str): + types = { + k: v + for k, v in sys.modules[module_name].__dict__.items() + if isinstance(v, typing._GenericAlias) # type: ignore + } + if alias_name not in types: + return alias_name + return types[alias_name] + + +def _get_model(module_name: str, model_name: str): + models = {k: v for k, v in sys.modules[module_name].__dict__.items() if isinstance(v, type)} + module_end = module_name.rsplit(".", 1)[0] + models.update({k: v for k, v in sys.modules[module_end].__dict__.items() if isinstance(v, type)}) + if isinstance(model_name, str): + model_name = model_name.split(".")[-1] + if model_name not in models: + return model_name + return models[model_name] + + +_UNSET = object() + + +class _MyMutableMapping(MutableMapping[str, typing.Any]): # pylint: disable=unsubscriptable-object + def __init__(self, data: typing.Dict[str, typing.Any]) -> None: + self._data = data + + def __contains__(self, key: typing.Any) -> bool: + return key in self._data + + def __getitem__(self, key: str) -> typing.Any: + return self._data.__getitem__(key) + + def __setitem__(self, key: str, value: typing.Any) -> None: + self._data.__setitem__(key, value) + + def __delitem__(self, key: str) -> None: + self._data.__delitem__(key) + + def __iter__(self) -> typing.Iterator[typing.Any]: + return self._data.__iter__() + + def __len__(self) -> int: + return self._data.__len__() + + def 
__ne__(self, other: typing.Any) -> bool: + return not self.__eq__(other) + + def keys(self) -> typing.KeysView[str]: + return self._data.keys() + + def values(self) -> typing.ValuesView[typing.Any]: + return self._data.values() + + def items(self) -> typing.ItemsView[str, typing.Any]: + return self._data.items() + + def get(self, key: str, default: typing.Any = None) -> typing.Any: + try: + return self[key] + except KeyError: + return default + + @typing.overload + def pop(self, key: str) -> typing.Any: ... + + @typing.overload + def pop(self, key: str, default: _T) -> _T: ... + + @typing.overload + def pop(self, key: str, default: typing.Any) -> typing.Any: ... + + def pop(self, key: str, default: typing.Any = _UNSET) -> typing.Any: + if default is _UNSET: + return self._data.pop(key) + return self._data.pop(key, default) + + def popitem(self) -> typing.Tuple[str, typing.Any]: + return self._data.popitem() + + def clear(self) -> None: + self._data.clear() + + def update(self, *args: typing.Any, **kwargs: typing.Any) -> None: + self._data.update(*args, **kwargs) + + @typing.overload + def setdefault(self, key: str, default: None = None) -> None: ... + + @typing.overload + def setdefault(self, key: str, default: typing.Any) -> typing.Any: ... + + def setdefault(self, key: str, default: typing.Any = _UNSET) -> typing.Any: + if default is _UNSET: + return self._data.setdefault(key) + return self._data.setdefault(key, default) + + def __eq__(self, other: typing.Any) -> bool: + try: + other_model = self.__class__(other) + except Exception: + return False + return self._data == other_model._data + + def __repr__(self) -> str: + return str(self._data) + + +def _is_model(obj: typing.Any) -> bool: + return getattr(obj, "_is_model", False) + + +def _serialize(o, format: typing.Optional[str] = None): # pylint: disable=too-many-return-statements + if isinstance(o, list): + return [_serialize(x, format) for x in o] + if isinstance(o, dict): + return {k: _serialize(v, format) for k, v in o.items()} + if isinstance(o, set): + return {_serialize(x, format) for x in o} + if isinstance(o, tuple): + return tuple(_serialize(x, format) for x in o) + if isinstance(o, (bytes, bytearray)): + return _serialize_bytes(o, format) + if isinstance(o, decimal.Decimal): + return float(o) + if isinstance(o, enum.Enum): + return o.value + try: + # First try datetime.datetime + return _serialize_datetime(o, format) + except AttributeError: + pass + # Last, try datetime.timedelta + try: + return _timedelta_as_isostr(o) + except AttributeError: + # This will be raised when it hits value.total_seconds in the method above + pass + return o + + +def _get_rest_field( + attr_to_rest_field: typing.Dict[str, "_RestField"], rest_name: str +) -> typing.Optional["_RestField"]: + try: + return next(rf for rf in attr_to_rest_field.values() if rf._rest_name == rest_name) + except StopIteration: + return None + + +def _create_value(rf: typing.Optional["_RestField"], value: typing.Any) -> typing.Any: + if not rf: + return _serialize(value, None) + if rf._is_multipart_file_input: + return value + if rf._is_model: + return _deserialize(rf._type, value) + return _serialize(value, rf._format) + + +class Model(_MyMutableMapping): + _is_model = True + + def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None: + class_name = self.__class__.__name__ + if len(args) > 1: + raise TypeError(f"{class_name}.__init__() takes 2 positional arguments but {len(args) + 1} were given") + dict_to_pass = { + rest_field._rest_name: rest_field._default 
+ for rest_field in self._attr_to_rest_field.values() + if rest_field._default is not _UNSET + } + if args: + dict_to_pass.update( + {k: _create_value(_get_rest_field(self._attr_to_rest_field, k), v) for k, v in args[0].items()} + ) + else: + non_attr_kwargs = [k for k in kwargs if k not in self._attr_to_rest_field] + if non_attr_kwargs: + # actual type errors only throw the first wrong keyword arg they see, so following that. + raise TypeError(f"{class_name}.__init__() got an unexpected keyword argument '{non_attr_kwargs[0]}'") + dict_to_pass.update( + { + self._attr_to_rest_field[k]._rest_name: _create_value(self._attr_to_rest_field[k], v) + for k, v in kwargs.items() + if v is not None + } + ) + super().__init__(dict_to_pass) + + def copy(self) -> "Model": + return Model(self.__dict__) + + def __new__(cls, *args: typing.Any, **kwargs: typing.Any) -> Self: # pylint: disable=unused-argument + # we know the last three classes in mro are going to be 'Model', 'dict', and 'object' + mros = cls.__mro__[:-3][::-1] # ignore model, dict, and object parents, and reverse the mro order + attr_to_rest_field: typing.Dict[str, _RestField] = { # map attribute name to rest_field property + k: v for mro_class in mros for k, v in mro_class.__dict__.items() if k[0] != "_" and hasattr(v, "_type") + } + annotations = { + k: v + for mro_class in mros + if hasattr(mro_class, "__annotations__") # pylint: disable=no-member + for k, v in mro_class.__annotations__.items() # pylint: disable=no-member + } + for attr, rf in attr_to_rest_field.items(): + rf._module = cls.__module__ + if not rf._type: + rf._type = rf._get_deserialize_callable_from_annotation(annotations.get(attr, None)) + if not rf._rest_name_input: + rf._rest_name_input = attr + cls._attr_to_rest_field: typing.Dict[str, _RestField] = dict(attr_to_rest_field.items()) + + return super().__new__(cls) # pylint: disable=no-value-for-parameter + + def __init_subclass__(cls, discriminator: typing.Optional[str] = None) -> None: + for base in cls.__bases__: + if hasattr(base, "__mapping__"): # pylint: disable=no-member + base.__mapping__[discriminator or cls.__name__] = cls # type: ignore # pylint: disable=no-member + + @classmethod + def _get_discriminator(cls, exist_discriminators) -> typing.Optional[str]: + for v in cls.__dict__.values(): + if ( + isinstance(v, _RestField) and v._is_discriminator and v._rest_name not in exist_discriminators + ): # pylint: disable=protected-access + return v._rest_name # pylint: disable=protected-access + return None + + @classmethod + def _deserialize(cls, data, exist_discriminators): + if not hasattr(cls, "__mapping__"): # pylint: disable=no-member + return cls(data) + discriminator = cls._get_discriminator(exist_discriminators) + exist_discriminators.append(discriminator) + mapped_cls = cls.__mapping__.get(data.get(discriminator), cls) # pyright: ignore # pylint: disable=no-member + if mapped_cls == cls: + return cls(data) + return mapped_cls._deserialize(data, exist_discriminators) # pylint: disable=protected-access + + def as_dict(self, *, exclude_readonly: bool = False) -> typing.Dict[str, typing.Any]: + """Return a dict that can be JSONify using json.dump. + + :keyword bool exclude_readonly: Whether to remove the readonly properties. 
+ :returns: A dict JSON compatible object + :rtype: dict + """ + + result = {} + if exclude_readonly: + readonly_props = [p._rest_name for p in self._attr_to_rest_field.values() if _is_readonly(p)] + for k, v in self.items(): + if exclude_readonly and k in readonly_props: # pyright: ignore + continue + is_multipart_file_input = False + try: + is_multipart_file_input = next( + rf for rf in self._attr_to_rest_field.values() if rf._rest_name == k + )._is_multipart_file_input + except StopIteration: + pass + result[k] = v if is_multipart_file_input else Model._as_dict_value(v, exclude_readonly=exclude_readonly) + return result + + @staticmethod + def _as_dict_value(v: typing.Any, exclude_readonly: bool = False) -> typing.Any: + if v is None or isinstance(v, _Null): + return None + if isinstance(v, (list, tuple, set)): + return type(v)(Model._as_dict_value(x, exclude_readonly=exclude_readonly) for x in v) + if isinstance(v, dict): + return {dk: Model._as_dict_value(dv, exclude_readonly=exclude_readonly) for dk, dv in v.items()} + return v.as_dict(exclude_readonly=exclude_readonly) if hasattr(v, "as_dict") else v + + +def _deserialize_model(model_deserializer: typing.Optional[typing.Callable], obj): + if _is_model(obj): + return obj + return _deserialize(model_deserializer, obj) + + +def _deserialize_with_optional(if_obj_deserializer: typing.Optional[typing.Callable], obj): + if obj is None: + return obj + return _deserialize_with_callable(if_obj_deserializer, obj) + + +def _deserialize_with_union(deserializers, obj): + for deserializer in deserializers: + try: + return _deserialize(deserializer, obj) + except DeserializationError: + pass + raise DeserializationError() + + +def _deserialize_dict( + value_deserializer: typing.Optional[typing.Callable], + module: typing.Optional[str], + obj: typing.Dict[typing.Any, typing.Any], +): + if obj is None: + return obj + return {k: _deserialize(value_deserializer, v, module) for k, v in obj.items()} + + +def _deserialize_multiple_sequence( + entry_deserializers: typing.List[typing.Optional[typing.Callable]], + module: typing.Optional[str], + obj, +): + if obj is None: + return obj + return type(obj)(_deserialize(deserializer, entry, module) for entry, deserializer in zip(obj, entry_deserializers)) + + +def _deserialize_sequence( + deserializer: typing.Optional[typing.Callable], + module: typing.Optional[str], + obj, +): + if obj is None: + return obj + return type(obj)(_deserialize(deserializer, entry, module) for entry in obj) + + +def _sorted_annotations(types: typing.List[typing.Any]) -> typing.List[typing.Any]: + return sorted( + types, + key=lambda x: hasattr(x, "__name__") and x.__name__.lower() in ("str", "float", "int", "bool"), + ) + + +def _get_deserialize_callable_from_annotation( # pylint: disable=R0911, R0915, R0912 + annotation: typing.Any, + module: typing.Optional[str], + rf: typing.Optional["_RestField"] = None, +) -> typing.Optional[typing.Callable[[typing.Any], typing.Any]]: + if not annotation or annotation in [int, float]: + return None + + # is it a type alias? + if isinstance(annotation, str): + if module is not None: + annotation = _get_type_alias_type(module, annotation) + + # is it a forward ref / in quotes? 
+ if isinstance(annotation, (str, typing.ForwardRef)): + try: + model_name = annotation.__forward_arg__ # type: ignore + except AttributeError: + model_name = annotation + if module is not None: + annotation = _get_model(module, model_name) + + try: + if module and _is_model(annotation): + if rf: + rf._is_model = True + + return functools.partial(_deserialize_model, annotation) # pyright: ignore + except Exception: + pass + + # is it a literal? + try: + if annotation.__origin__ is typing.Literal: # pyright: ignore + return None + except AttributeError: + pass + + # is it optional? + try: + if any(a for a in annotation.__args__ if a == type(None)): # pyright: ignore + if len(annotation.__args__) <= 2: # pyright: ignore + if_obj_deserializer = _get_deserialize_callable_from_annotation( + next(a for a in annotation.__args__ if a != type(None)), module, rf # pyright: ignore + ) + + return functools.partial(_deserialize_with_optional, if_obj_deserializer) + # the type is Optional[Union[...]], we need to remove the None type from the Union + annotation_copy = copy.copy(annotation) + annotation_copy.__args__ = [a for a in annotation_copy.__args__ if a != type(None)] # pyright: ignore + return _get_deserialize_callable_from_annotation(annotation_copy, module, rf) + except AttributeError: + pass + + # is it union? + if getattr(annotation, "__origin__", None) is typing.Union: + # initial ordering is we make `string` the last deserialization option, because it is often them most generic + deserializers = [ + _get_deserialize_callable_from_annotation(arg, module, rf) + for arg in _sorted_annotations(annotation.__args__) # pyright: ignore + ] + + return functools.partial(_deserialize_with_union, deserializers) + + try: + if annotation._name == "Dict": # pyright: ignore + value_deserializer = _get_deserialize_callable_from_annotation( + annotation.__args__[1], module, rf # pyright: ignore + ) + + return functools.partial( + _deserialize_dict, + value_deserializer, + module, + ) + except (AttributeError, IndexError): + pass + try: + if annotation._name in ["List", "Set", "Tuple", "Sequence"]: # pyright: ignore + if len(annotation.__args__) > 1: # pyright: ignore + + entry_deserializers = [ + _get_deserialize_callable_from_annotation(dt, module, rf) + for dt in annotation.__args__ # pyright: ignore + ] + return functools.partial(_deserialize_multiple_sequence, entry_deserializers, module) + deserializer = _get_deserialize_callable_from_annotation( + annotation.__args__[0], module, rf # pyright: ignore + ) + + return functools.partial(_deserialize_sequence, deserializer, module) + except (TypeError, IndexError, AttributeError, SyntaxError): + pass + + def _deserialize_default( + deserializer, + obj, + ): + if obj is None: + return obj + try: + return _deserialize_with_callable(deserializer, obj) + except Exception: + pass + return obj + + if get_deserializer(annotation, rf): + return functools.partial(_deserialize_default, get_deserializer(annotation, rf)) + + return functools.partial(_deserialize_default, annotation) + + +def _deserialize_with_callable( + deserializer: typing.Optional[typing.Callable[[typing.Any], typing.Any]], + value: typing.Any, +): + try: + if value is None or isinstance(value, _Null): + return None + if deserializer is None: + return value + if isinstance(deserializer, CaseInsensitiveEnumMeta): + try: + return deserializer(value) + except ValueError: + # for unknown value, return raw value + return value + if isinstance(deserializer, type) and issubclass(deserializer, Model): + 
return deserializer._deserialize(value, []) + return typing.cast(typing.Callable[[typing.Any], typing.Any], deserializer)(value) + except Exception as e: + raise DeserializationError() from e + + +def _deserialize( + deserializer: typing.Any, + value: typing.Any, + module: typing.Optional[str] = None, + rf: typing.Optional["_RestField"] = None, + format: typing.Optional[str] = None, +) -> typing.Any: + if isinstance(value, PipelineResponse): + value = value.http_response.json() + if rf is None and format: + rf = _RestField(format=format) + if not isinstance(deserializer, functools.partial): + deserializer = _get_deserialize_callable_from_annotation(deserializer, module, rf) + return _deserialize_with_callable(deserializer, value) + + +class _RestField: + def __init__( + self, + *, + name: typing.Optional[str] = None, + type: typing.Optional[typing.Callable] = None, # pylint: disable=redefined-builtin + is_discriminator: bool = False, + visibility: typing.Optional[typing.List[str]] = None, + default: typing.Any = _UNSET, + format: typing.Optional[str] = None, + is_multipart_file_input: bool = False, + ): + self._type = type + self._rest_name_input = name + self._module: typing.Optional[str] = None + self._is_discriminator = is_discriminator + self._visibility = visibility + self._is_model = False + self._default = default + self._format = format + self._is_multipart_file_input = is_multipart_file_input + + @property + def _class_type(self) -> typing.Any: + return getattr(self._type, "args", [None])[0] + + @property + def _rest_name(self) -> str: + if self._rest_name_input is None: + raise ValueError("Rest name was never set") + return self._rest_name_input + + def __get__(self, obj: Model, type=None): # pylint: disable=redefined-builtin + # by this point, type and rest_name will have a value bc we default + # them in __new__ of the Model class + item = obj.get(self._rest_name) + if item is None: + return item + if self._is_model: + return item + return _deserialize(self._type, _serialize(item, self._format), rf=self) + + def __set__(self, obj: Model, value) -> None: + if value is None: + # we want to wipe out entries if users set attr to None + try: + obj.__delitem__(self._rest_name) + except KeyError: + pass + return + if self._is_model: + if not _is_model(value): + value = _deserialize(self._type, value) + obj.__setitem__(self._rest_name, value) + return + obj.__setitem__(self._rest_name, _serialize(value, self._format)) + + def _get_deserialize_callable_from_annotation( + self, annotation: typing.Any + ) -> typing.Optional[typing.Callable[[typing.Any], typing.Any]]: + return _get_deserialize_callable_from_annotation(annotation, self._module, self) + + +def rest_field( + *, + name: typing.Optional[str] = None, + type: typing.Optional[typing.Callable] = None, # pylint: disable=redefined-builtin + visibility: typing.Optional[typing.List[str]] = None, + default: typing.Any = _UNSET, + format: typing.Optional[str] = None, + is_multipart_file_input: bool = False, +) -> typing.Any: + return _RestField( + name=name, + type=type, + visibility=visibility, + default=default, + format=format, + is_multipart_file_input=is_multipart_file_input, + ) + + +def rest_discriminator( + *, + name: typing.Optional[str] = None, + type: typing.Optional[typing.Callable] = None, # pylint: disable=redefined-builtin +) -> typing.Any: + return _RestField(name=name, type=type, is_discriminator=True) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/__init__.py 
b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/__init__.py new file mode 100644 index 000000000000..d3ebd561f739 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/__init__.py @@ -0,0 +1,23 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from ._operations import ChatCompletionsClientOperationsMixin +from ._operations import EmbeddingsClientOperationsMixin +from ._operations import ImageEmbeddingsClientOperationsMixin + +from ._patch import __all__ as _patch_all +from ._patch import * # pylint: disable=unused-wildcard-import +from ._patch import patch_sdk as _patch_sdk + +__all__ = [ + "ChatCompletionsClientOperationsMixin", + "EmbeddingsClientOperationsMixin", + "ImageEmbeddingsClientOperationsMixin", +] +__all__.extend([p for p in _patch_all if p not in __all__]) +_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py new file mode 100644 index 000000000000..48a52c3b763f --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py @@ -0,0 +1,1105 @@ +# pylint: disable=too-many-lines,too-many-statements +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- +from io import IOBase +import json +import sys +from typing import Any, Callable, Dict, IO, List, Optional, Type, TypeVar, Union, overload + +from azure.core.exceptions import ( + ClientAuthenticationError, + HttpResponseError, + ResourceExistsError, + ResourceNotFoundError, + ResourceNotModifiedError, + map_error, +) +from azure.core.pipeline import PipelineResponse +from azure.core.rest import HttpRequest, HttpResponse +from azure.core.tracing.decorator import distributed_trace +from azure.core.utils import case_insensitive_dict + +from .. 
import models as _models +from .._model_base import SdkJSONEncoder, _deserialize +from .._serialization import Serializer +from .._vendor import ChatCompletionsClientMixinABC, EmbeddingsClientMixinABC, ImageEmbeddingsClientMixinABC + +if sys.version_info >= (3, 9): + from collections.abc import MutableMapping +else: + from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports +JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object +_Unset: Any = object() +T = TypeVar("T") +ClsType = Optional[Callable[[PipelineResponse[HttpRequest, HttpResponse], T, Dict[str, Any]], Any]] + +_SERIALIZER = Serializer() +_SERIALIZER.client_side_validation = False + + +def build_chat_completions_complete_request( + *, unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + api_version: str = kwargs.pop("api_version", _params.pop("api-version", "2024-05-01-preview")) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/chat/completions" + + # Construct parameters + _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str") + + # Construct headers + if unknown_params is not None: + _headers["unknown-parameters"] = _SERIALIZER.header("unknown_params", unknown_params, "str") + if content_type is not None: + _headers["Content-Type"] = _SERIALIZER.header("content_type", content_type, "str") + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="POST", url=_url, params=_params, headers=_headers, **kwargs) + + +def build_chat_completions_get_model_info_request(**kwargs: Any) -> HttpRequest: # pylint: disable=name-too-long + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + api_version: str = kwargs.pop("api_version", _params.pop("api-version", "2024-05-01-preview")) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/info" + + # Construct parameters + _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str") + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, params=_params, headers=_headers, **kwargs) + + +def build_embeddings_embed_request( + *, unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + api_version: str = kwargs.pop("api_version", _params.pop("api-version", "2024-05-01-preview")) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/embeddings" + + # Construct parameters + _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str") + + # Construct headers + if unknown_params is not None: + _headers["unknown-parameters"] = _SERIALIZER.header("unknown_params", unknown_params, "str") + if content_type is not None: + _headers["Content-Type"] = _SERIALIZER.header("content_type", content_type, "str") + _headers["Accept"] = 
_SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="POST", url=_url, params=_params, headers=_headers, **kwargs) + + +def build_embeddings_get_model_info_request(**kwargs: Any) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + api_version: str = kwargs.pop("api_version", _params.pop("api-version", "2024-05-01-preview")) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/info" + + # Construct parameters + _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str") + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, params=_params, headers=_headers, **kwargs) + + +def build_image_embeddings_embed_request( + *, unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, **kwargs: Any +) -> HttpRequest: + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + api_version: str = kwargs.pop("api_version", _params.pop("api-version", "2024-05-01-preview")) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/images/embeddings" + + # Construct parameters + _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str") + + # Construct headers + if unknown_params is not None: + _headers["unknown-parameters"] = _SERIALIZER.header("unknown_params", unknown_params, "str") + if content_type is not None: + _headers["Content-Type"] = _SERIALIZER.header("content_type", content_type, "str") + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="POST", url=_url, params=_params, headers=_headers, **kwargs) + + +def build_image_embeddings_get_model_info_request(**kwargs: Any) -> HttpRequest: # pylint: disable=name-too-long + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = case_insensitive_dict(kwargs.pop("params", {}) or {}) + + api_version: str = kwargs.pop("api_version", _params.pop("api-version", "2024-05-01-preview")) + accept = _headers.pop("Accept", "application/json") + + # Construct URL + _url = "/info" + + # Construct parameters + _params["api-version"] = _SERIALIZER.query("api_version", api_version, "str") + + # Construct headers + _headers["Accept"] = _SERIALIZER.header("accept", accept, "str") + + return HttpRequest(method="GET", url=_url, params=_params, headers=_headers, **kwargs) + + +class ChatCompletionsClientOperationsMixin(ChatCompletionsClientMixinABC): + + @overload + def _complete( + self, + body: JSON, + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.ChatCompletions: ... 
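The request builders above only assemble an `HttpRequest`; the mixin methods that follow serialize the payload and run it through the client pipeline. As a minimal illustrative sketch (not part of the generated code in this diff; the message contents and parameter values are placeholders), this is roughly how a chat-completions body is serialized with `SdkJSONEncoder` and handed to `build_chat_completions_complete_request`:

```python
import json

# Placeholder payload; in the real client the messages come from the caller.
body = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "How many feet are in a mile?"},
    ],
    "temperature": 0.7,
}

# Mirrors the call site in _complete below: read-only model properties are dropped.
content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)

request = build_chat_completions_complete_request(
    content_type="application/json",
    api_version="2024-05-01-preview",
    content=content,
)
# request.url is still the relative "/chat/completions?api-version=..." path here;
# the mixin later applies format_url() with the configured endpoint.
```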
+ @overload + def _complete( + self, + *, + messages: List[_models.ChatRequestMessage], + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + frequency_penalty: Optional[float] = None, + stream_parameter: Optional[bool] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any + ) -> _models.ChatCompletions: ... + @overload + def _complete( + self, + body: IO[bytes], + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.ChatCompletions: ... + + @distributed_trace + def _complete( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + messages: List[_models.ChatRequestMessage] = _Unset, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + frequency_penalty: Optional[float] = None, + stream_parameter: Optional[bool] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any + ) -> _models.ChatCompletions: + # pylint: disable=line-too-long + # pylint: disable=too-many-locals + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" + provided prompt data. The method makes a REST API call to the ``/chat/completions`` route + on the given endpoint. + + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :keyword messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] + :keyword unknown_params: Controls what happens if unknown parameters are passed in the JSON + request payload. + This sets the HTTP request header ``unknown-parameters``. Known values are: "error", "drop", + and "pass_through". Default value is None. + :paramtype unknown_params: str or ~azure.ai.inference.models.UnknownParams + :keyword frequency_penalty: A value that influences the probability of generated tokens + appearing based on their cumulative + frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. Default value is None. 
+ :paramtype frequency_penalty: float + :keyword stream_parameter: A value indicating whether chat completions should be streamed for + this request. Default value is None. + :paramtype stream_parameter: bool + :keyword presence_penalty: A value that influences the probability of generated tokens + appearing based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase + the + model's likelihood to output new topics. + Supported range is [-2, 2]. Default value is None. + :paramtype presence_penalty: float + :keyword temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. Default value is None. + :paramtype temperature: float + :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. Default value is None. + :paramtype top_p: float + :keyword max_tokens: The maximum number of tokens to generate. Default value is None. + :paramtype max_tokens: int + :keyword response_format: An object specifying the format that the model must output. Used to + enable JSON mode. Known values are: "text" and "json_object". Default value is None. + :paramtype response_format: str or ~azure.ai.inference.models.ChatCompletionsResponseFormat + :keyword stop: A collection of textual sequences that will end completions generation. Default + value is None. + :paramtype stop: list[str] + :keyword tools: The available tool definitions that the chat completions request can use, + including caller-defined functions. Default value is None. + :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :keyword tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolSelectionPreset"] type or a ChatCompletionsNamedToolSelection type. + Default value is None. + :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolSelectionPreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolSelection + :keyword seed: If specified, the system will make a best effort to sample deterministically + such that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed.". + Default value is None. + :paramtype seed: int + :return: ChatCompletions. The ChatCompletions is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ChatCompletions + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "messages": [ + chat_request_message + ], + "frequency_penalty": 0.0, # Optional. 
A value that influences the + probability of generated tokens appearing based on their cumulative frequency in + generated text. Positive values will make tokens less likely to appear as their + frequency increases and decrease the likelihood of the model repeating the same + statements verbatim. Supported range is [-2, 2]. + "max_tokens": 0, # Optional. The maximum number of tokens to generate. + "presence_penalty": 0.0, # Optional. A value that influences the probability + of generated tokens appearing based on their existing presence in generated text. + Positive values will make tokens less likely to appear when they already exist + and increase the model's likelihood to output new topics. Supported range is [-2, + 2]. + "response_format": "str", # Optional. An object specifying the format that + the model must output. Used to enable JSON mode. Known values are: "text" and + "json_object". + "seed": 0, # Optional. If specified, the system will make a best effort to + sample deterministically such that repeated requests with the same seed and + parameters should return the same result. Determinism is not guaranteed.". + "stop": [ + "str" # Optional. A collection of textual sequences that will end + completions generation. + ], + "stream": bool, # Optional. A value indicating whether chat completions + should be streamed for this request. + "temperature": 0.0, # Optional. The sampling temperature to use that + controls the apparent creativity of generated completions. Higher values will + make output more random while lower values will make results more focused and + deterministic. It is not recommended to modify temperature and top_p for the same + completions request as the interaction of these two settings is difficult to + predict. Supported range is [0, 1]. + "tool_choice": "str", # Optional. If specified, the model will configure + which of the provided tools it can use for the chat completions response. Is + either a Union[str, "_models.ChatCompletionsToolSelectionPreset"] type or a + ChatCompletionsNamedToolSelection type. + "tools": [ + chat_completions_tool_definition + ], + "top_p": 0.0 # Optional. An alternative to sampling with temperature called + nucleus sampling. This value causes the model to consider the results of tokens + with the provided probability mass. As an example, a value of 0.15 will cause + only the tokens comprising the top 15% of probability mass to be considered. It + is not recommended to modify temperature and top_p for the same completions + request as the interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + } + + # response body for status code(s): 200 + response == { + "choices": [ + { + "finish_reason": "str", # The reason that this chat + completions choice completed its generated. Required. Known values are: + "stop", "length", "content_filter", and "tool_calls". + "index": 0, # The ordered index associated with this chat + completions choice. Required. + "message": { + "content": "str", # The content of the message. + Required. + "role": "str", # The chat role associated with the + message. Required. Known values are: "system", "user", "assistant", + and "tool". + "tool_calls": [ + chat_completions_tool_call + ] + } + } + ], + "created": "2020-02-20 00:00:00", # The first timestamp associated with + generation activity for this completions response, represented as seconds since + the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required. 
+ "id": "str", # A unique identifier associated with this chat completions + response. Required. + "model": "str", # The model used for the chat completion. Required. + "usage": { + "capacity_type": "str", # Indicates whether your capacity has been + affected by the usage amount (token count) reported here. Required. Known + values are: "usage" and "fixed". + "completion_tokens": 0, # The number of tokens generated across all + completions emissions. Required. + "prompt_tokens": 0, # The number of tokens in the provided prompts + for the completions request. Required. + "total_tokens": 0 # The total number of tokens processed for the + completions request and response. Required. + } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + cls: ClsType[_models.ChatCompletions] = kwargs.pop("cls", None) + + if body is _Unset: + if messages is _Unset: + raise TypeError("missing required argument: messages") + body = { + "frequency_penalty": frequency_penalty, + "max_tokens": max_tokens, + "messages": messages, + "presence_penalty": presence_penalty, + "response_format": response_format, + "seed": seed, + "stop": stop, + "stream": stream_parameter, + "temperature": temperature, + "tool_choice": tool_choice, + "tools": tools, + "top_p": top_p, + } + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_chat_completions_complete_request( + unknown_params=unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ChatCompletions, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + @distributed_trace + def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + The method makes a REST API call to the ``/info`` route on the given endpoint. + + :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "model_name": "str", # The name of the AI model. For example: ``Phi21``. + Required. + "model_provider_name": "str", # The model provider name. For example: + ``Microsoft Research``. Required. + "model_type": "str" # The type of the AI model. A Unique identifier for the + profile. Required. Known values are: "embeddings", "image_generation", + "text_generation", "image_embeddings", "audio_generation", and "chat". + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[_models.ModelInfo] = kwargs.pop("cls", None) + + _request = build_chat_completions_get_model_info_request( + api_version=self._config.api_version, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ModelInfo, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + +class EmbeddingsClientOperationsMixin(EmbeddingsClientMixinABC): + + @overload + def _embed( + self, + body: JSON, + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + def _embed( + self, + *, + input: List[str], + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + def _embed( + self, + body: IO[bytes], + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... 
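Before the implementation, a small sketch (illustrative only, with placeholder input strings) of how the keyword-argument overload of `_embed` assembles its body: optional fields left as `None` are filtered out before serialization, exactly as the `body = {k: v for k, v in body.items() if v is not None}` step in the code below does:

```python
import json

body = {
    "input": ["first phrase", "second phrase"],
    "dimensions": None,            # not supplied -> dropped from the wire payload
    "encoding_format": "float",
    "input_type": None,            # not supplied -> dropped from the wire payload
}
body = {k: v for k, v in body.items() if v is not None}

content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)
# content == '{"input": ["first phrase", "second phrase"], "encoding_format": "float"}'
```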
+ + @distributed_trace + def _embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + input: List[str] = _Unset, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the ``/embeddings`` route on the given endpoint. + + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :keyword input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :paramtype input: list[str] + :keyword unknown_params: Controls what happens if unknown parameters are passed in the JSON + request payload. + This sets the HTTP request header ``unknown-parameters``. Known values are: "error", "drop", + and "pass_through". Default value is None. + :paramtype unknown_params: str or ~azure.ai.inference.models.UnknownParams + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. Known + values are: "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "input": [ + "str" # Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array of strings or + array of token arrays. Required. + ], + "dimensions": 0, # Optional. Optional. The number of dimensions the + resulting output embeddings should have. Passing null causes the model to use its + default value. Returns a 422 error if the model doesn't support the value or + parameter. + "encoding_format": "str", # Optional. Optional. The desired format for the + returned embeddings. Known values are: "base64", "binary", "float", "int8", + "ubinary", and "uint8". + "input_type": "str" # Optional. Optional. The type of the input. Returns a + 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". + } + + # response body for status code(s): 200 + response == { + "data": [ + { + "embedding": [ + 0.0 # List of embeddings value for the input prompt. + These represent a measurement of the vector-based relatedness of the + provided input. Required. 
+ ], + "index": 0 # Index of the prompt to which the EmbeddingItem + corresponds. Required. + } + ], + "id": "str", # Unique identifier for the embeddings result. Required. + "model": "str", # The model ID used to generate this result. Required. + "usage": { + "capacity_type": "str", # Indicates whether your capacity has been + affected by the usage amount (token count) reported here. Required. Known + values are: "usage" and "fixed". + "input_tokens": 0, # Number of tokens in the request prompt. + Required. + "prompt_tokens": 0, # Number of tokens used for the prompt sent to + the AI model. Typically identical to ``input_tokens``. However, certain AI + models may add extra tokens to the input hence the number can be higher. (for + example when input_type="query"). Required. + "total_tokens": 0 # Total number of tokens transacted in this + request/response. Required. + } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_embeddings_embed_request( + unknown_params=unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + @distributed_trace + def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + The method makes a REST API call to the ``/info`` route on the given endpoint. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. 
code-block:: python + + # response body for status code(s): 200 + response == { + "model_name": "str", # The name of the AI model. For example: ``Phi21``. + Required. + "model_provider_name": "str", # The model provider name. For example: + ``Microsoft Research``. Required. + "model_type": "str" # The type of the AI model. A Unique identifier for the + profile. Required. Known values are: "embeddings", "image_generation", + "text_generation", "image_embeddings", "audio_generation", and "chat". + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[_models.ModelInfo] = kwargs.pop("cls", None) + + _request = build_embeddings_get_model_info_request( + api_version=self._config.api_version, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ModelInfo, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + +class ImageEmbeddingsClientOperationsMixin(ImageEmbeddingsClientMixinABC): + + @overload + def _embed( + self, + body: JSON, + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + def _embed( + self, + *, + input: List[_models.EmbeddingInput], + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + def _embed( + self, + body: IO[bytes], + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + + @distributed_trace + def _embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + input: List[_models.EmbeddingInput] = _Unset, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given images. 
+ The method makes a REST API call to the ``/images/embeddings`` route on the given endpoint. + + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an + array. + The input must not exceed the max input tokens for the model. Required. + :paramtype input: list[~azure.ai.inference.models.EmbeddingInput] + :keyword unknown_params: Controls what happens if unknown parameters are passed in the JSON + request payload. + This sets the HTTP request header ``unknown-parameters``. Known values are: "error", "drop", + and "pass_through". Default value is None. + :paramtype unknown_params: str or ~azure.ai.inference.models.UnknownParams + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The number of dimensions the resulting output embeddings + should have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "input": [ + { + "image": "str", # The input image, in PNG format. Required. + "text": "str" # Optional. Optional. The text input to feed + into the model (like DINO, CLIP). Returns a 422 error if the model + doesn't support the value or parameter. + } + ], + "dimensions": 0, # Optional. Optional. The number of dimensions the + resulting output embeddings should have. Passing null causes the model to use its + default value. Returns a 422 error if the model doesn't support the value or + parameter. + "encoding_format": "str", # Optional. Optional. The number of dimensions the + resulting output embeddings should have. Passing null causes the model to use its + default value. Returns a 422 error if the model doesn't support the value or + parameter. Known values are: "base64", "binary", "float", "int8", "ubinary", and + "uint8". + "input_type": "str" # Optional. Optional. The type of the input. Returns a + 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". + } + + # response body for status code(s): 200 + response == { + "data": [ + { + "embedding": [ + 0.0 # List of embeddings value for the input prompt. + These represent a measurement of the vector-based relatedness of the + provided input. Required. + ], + "index": 0 # Index of the prompt to which the EmbeddingItem + corresponds. Required. + } + ], + "id": "str", # Unique identifier for the embeddings result. Required. 
+ "model": "str", # The model ID used to generate this result. Required. + "usage": { + "capacity_type": "str", # Indicates whether your capacity has been + affected by the usage amount (token count) reported here. Required. Known + values are: "usage" and "fixed". + "input_tokens": 0, # Number of tokens in the request prompt. + Required. + "prompt_tokens": 0, # Number of tokens used for the prompt sent to + the AI model. Typically identical to ``input_tokens``. However, certain AI + models may add extra tokens to the input hence the number can be higher. (for + example when input_type="query"). Required. + "total_tokens": 0 # Total number of tokens transacted in this + request/response. Required. + } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_image_embeddings_embed_request( + unknown_params=unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + @distributed_trace + def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + The method makes a REST API call to the ``/info`` route on the given endpoint. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "model_name": "str", # The name of the AI model. For example: ``Phi21``. + Required. 
+ "model_provider_name": "str", # The model provider name. For example: + ``Microsoft Research``. Required. + "model_type": "str" # The type of the AI model. A Unique identifier for the + profile. Required. Known values are: "embeddings", "image_generation", + "text_generation", "image_embeddings", "audio_generation", and "chat". + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[_models.ModelInfo] = kwargs.pop("cls", None) + + _request = build_image_embeddings_get_model_info_request( + api_version=self._config.api_version, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ModelInfo, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_patch.py new file mode 100644 index 000000000000..f7dd32510333 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_patch.py @@ -0,0 +1,20 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +"""Customize generated code here. + +Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize +""" +from typing import List + +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level + + +def patch_sdk(): + """Do not remove from this file. + + `patch_sdk` is a last resort escape hatch that allows you to do customizations + you can't accomplish using the techniques described in + https://aka.ms/azsdk/python/dpcodegen/python/customize + """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py new file mode 100644 index 000000000000..9dbeb1ffee6d --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py @@ -0,0 +1,1090 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +# pylint: disable=too-many-lines) +"""Customize generated code here. + +Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize + +Why do we patch auto-generated code? +1. Add support for input argument `model_extras` (all clients) +2. 
Add support for function load_client +3. Add support for get_model_info, while caching the result (all clients) +4. Add support for chat completion streaming (ChatCompletionsClient client only) +5. __enter__ (and __aenter__) method had to be overridden due to + https://github.com/Azure/autorest.python/issues/2619 (all clients). + Otherwise intellisense did not show the patched public methods on the client object, + when the client is defined using context manager ("with" statement). + +""" +import json +import logging +import sys + +from io import IOBase +from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING + +from azure.core.pipeline import PipelineResponse +from azure.core.credentials import AzureKeyCredential +from azure.core.tracing.decorator import distributed_trace +from azure.core.utils import case_insensitive_dict +from azure.core.exceptions import ( + ClientAuthenticationError, + HttpResponseError, + map_error, + ResourceExistsError, + ResourceNotFoundError, + ResourceNotModifiedError, +) +from . import models as _models +from ._model_base import SdkJSONEncoder, _deserialize +from ._serialization import Serializer +from ._operations._operations import ( + build_chat_completions_complete_request, + build_embeddings_embed_request, + build_image_embeddings_embed_request, +) +from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated +from ._client import EmbeddingsClient as EmbeddingsClientGenerated +from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated + +if sys.version_info >= (3, 9): + from collections.abc import MutableMapping +else: + from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core.credentials import TokenCredential + +JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object +_Unset: Any = object() + +_SERIALIZER = Serializer() +_SERIALIZER.client_side_validation = False + +_LOGGER = logging.getLogger(__name__) + + +def load_client( + endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any +) -> Union[ChatCompletionsClientGenerated, EmbeddingsClientGenerated, ImageEmbeddingsClientGenerated]: + """ + Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route + on the given endpoint, to determine the model type and therefore which client to instantiate. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + :return: The appropriate client associated with the given endpoint + :rtype: ~azure.ai.inference.ChatCompletionsClient or ~azure.ai.inference.EmbeddingsClient + or ~azure.ai.inference.ImageEmbeddingsClient + :raises ~azure.core.exceptions.HttpResponseError + """ + + with ChatCompletionsClient(endpoint, credential, **kwargs) as client: # Pick any of the clients, it does not matter. 
+ model_info = client.get_model_info() # type: ignore + + _LOGGER.info("model_info=%s", model_info) + if not model_info.model_type: + raise ValueError( + "The AI model information is missing a value for `model type`. Cannot create an appropriate client." + ) + + # TODO: Remove "completions" and "embedding" once Mistral Large and Cohere fixes their model type + if model_info.model_type in (_models.ModelType.CHAT, "completion"): + chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs) + chat_completion_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init + return chat_completion_client + + if model_info.model_type in (_models.ModelType.EMBEDDINGS, "embedding"): + embedding_client = EmbeddingsClient(endpoint, credential, **kwargs) + embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init + return embedding_client + + if model_info.model_type == _models.ModelType.IMAGE_EMBEDDINGS: + image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs) + image_embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init + return image_embedding_client + + raise ValueError(f"No client available to support AI model type `{model_info.model_type}`") + + +class ChatCompletionsClient(ChatCompletionsClientGenerated): + """ChatCompletionsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + self._model_info: Optional[_models.ModelInfo] = None + super().__init__(endpoint, credential, **kwargs) + + + @overload + def complete( + self, + *, + messages: List[_models.ChatRequestMessage], + content_type: str = "application/json", + model_extras: Optional[Dict[str, Any]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Literal[False] = False, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> _models.ChatCompletions: + ... 
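For orientation (this note and sketch are not part of the diff itself): a minimal usage sketch of the `load_client` helper and the non-streaming `complete` overload defined above. The environment variable names, endpoint, and key are placeholders, not values defined by this package.

```python
import os

from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import load_client, ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage

# Placeholder environment variable names; adjust to your own configuration.
endpoint = os.environ["AZUREAI_ENDPOINT_URL"]
key = os.environ["AZUREAI_ENDPOINT_KEY"]

# load_client calls the `/info` route and returns the client that matches the
# reported model type (chat, embeddings, or image embeddings).
client = load_client(endpoint, AzureKeyCredential(key))

if isinstance(client, ChatCompletionsClient):
    # Non-streaming call: returns a ChatCompletions object once the service
    # has produced the full response.
    response = client.complete(
        messages=[
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content="How many feet are in a mile?"),
        ]
    )
    print(response.choices[0].message.content)
```

If the endpoint does not implement the `/info` route, `load_client` raises, so constructing the specific client (for example `ChatCompletionsClient`) directly is the alternative.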
+ + + @overload + def complete( + self, + *, + messages: List[_models.ChatRequestMessage], + content_type: str = "application/json", + model_extras: Optional[Dict[str, Any]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Literal[True], + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> _models.StreamingChatCompletions: + ... + + + @overload + def complete( + self, + *, + messages: List[_models.ChatRequestMessage], + content_type: str = "application/json", + model_extras: Optional[Dict[str, Any]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Optional[bool] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> Union[_models.StreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route + on the given endpoint. + When using this method with `stream=True`, the response is streamed + back to the client. Iterate over the resulting StreamingChatCompletions + object to get content updates as they arrive. By default, the response is a ChatCompletions object + (non-streaming). + + :keyword messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword frequency_penalty: A value that influences the probability of generated tokens + appearing based on their cumulative frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. 
+ Default value is None. + :paramtype frequency_penalty: float + :keyword presence_penalty: A value that influences the probability of generated tokens + appearing based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase + the model's likelihood to output new topics. + Supported range is [-2, 2]. + Default value is None. + :paramtype presence_penalty: float + :keyword temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype temperature: float + :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype top_p: float + :keyword max_tokens: The maximum number of tokens to generate. Default value is None. + :paramtype max_tokens: int + :keyword response_format: An object specifying the format that the model must output. Used to + enable JSON mode. Known values are: "text" and "json_object". Default value is None. + :paramtype response_format: str or ~azure.ai.inference.models.ChatCompletionsResponseFormat + :keyword stop: A collection of textual sequences that will end completions generation. Default + value is None. + :paramtype stop: list[str] + :keyword stream: A value indicating whether chat completions should be streamed for this request. + Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. + Otherwise the response will be a ChatCompletions. + :paramtype stream: bool + :keyword tools: The available tool definitions that the chat completions request can use, + including caller-defined functions. Default value is None. + :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :keyword tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolSelectionPreset"] type or a ChatCompletionsNamedToolSelection type. + Default value is None. + :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolSelectionPreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolSelection + :keyword seed: If specified, the system will make a best effort to sample deterministically + such that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed.". + Default value is None. + :paramtype seed: int + :return: ChatCompletions for non-streaming, or StreamingChatCompletions for streaming. 
+ :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + def complete( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> Union[_models.StreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. + + :param body: An object of type MutableMapping[str, Any], such as a dictionary, that + specifies the full request payload. Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: ChatCompletions for non-streaming, or StreamingChatCompletions for streaming. + :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + def complete( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> Union[_models.StreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + # pylint: disable=too-many-locals + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. + + :param body: Specifies the full request payload. Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: ChatCompletions for non-streaming, or StreamingChatCompletions for streaming. + :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + + @distributed_trace + def complete( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + messages: List[_models.ChatRequestMessage] = _Unset, + model_extras: Optional[Dict[str, Any]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Optional[bool] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> Union[_models.StreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + # pylint: disable=too-many-locals + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. When using this method with `stream=True`, the response is streamed + back to the client. Iterate over the resulting ~azure.ai.inference.models.StreamingChatCompletions + object to get content updates as they arrive. 
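As an illustration of the streaming path just described (again, not part of the diff; the endpoint URL and key below are placeholders): with `stream=True` the call returns a `StreamingChatCompletions` object, and iterating it yields incremental updates as the service generates tokens.

```python
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage

# Placeholder endpoint and key; substitute your own deployment values.
with ChatCompletionsClient("https://<your-endpoint>", AzureKeyCredential("<your-key>")) as client:
    response = client.complete(
        stream=True,
        messages=[
            SystemMessage(content="You are a helpful assistant."),
            UserMessage(content="Write a short poem about the sea."),
        ],
    )
    for update in response:
        # Some updates (for example the final one) may carry no choices.
        if update.choices:
            print(update.choices[0].delta.content or "", end="", flush=True)
```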
+ + :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type + that specifies the full request payload. Required. + :type body: JSON or IO[bytes] + :keyword messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword frequency_penalty: A value that influences the probability of generated tokens + appearing based on their cumulative frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. + Default value is None. + :paramtype frequency_penalty: float + :keyword presence_penalty: A value that influences the probability of generated tokens + appearing based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase + the model's likelihood to output new topics. + Supported range is [-2, 2]. + Default value is None. + :paramtype presence_penalty: float + :keyword temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype temperature: float + :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype top_p: float + :keyword max_tokens: The maximum number of tokens to generate. Default value is None. + :paramtype max_tokens: int + :keyword response_format: An object specifying the format that the model must output. Used to + enable JSON mode. Known values are: "text" and "json_object". Default value is None. + :paramtype response_format: str or ~azure.ai.inference.models.ChatCompletionsResponseFormat + :keyword stop: A collection of textual sequences that will end completions generation. Default + value is None. + :paramtype stop: list[str] + :keyword stream: A value indicating whether chat completions should be streamed for this request. + Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. 
+ Otherwise the response will be a ChatCompletions. + :paramtype stream: bool + :keyword tools: The available tool definitions that the chat completions request can use, + including caller-defined functions. Default value is None. + :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :keyword tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolSelectionPreset"] type or a ChatCompletionsNamedToolSelection type. + Default value is None. + :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolSelectionPreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolSelection + :keyword seed: If specified, the system will make a best effort to sample deterministically + such that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed.". + Default value is None. + :paramtype seed: int + :return: ChatCompletions for non-streaming, or StreamingChatCompletions for streaming. + :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + error_map = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + _unknown_params: Union[_models._enums.UnknownParams, None] = None + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + + if body is _Unset: + if messages is _Unset: + raise TypeError("missing required argument: messages") + body = { + "frequency_penalty": frequency_penalty, + "max_tokens": max_tokens, + "messages": messages, + "presence_penalty": presence_penalty, + "response_format": response_format, + "seed": seed, + "stop": stop, + "stream": stream, + "temperature": temperature, + "tool_choice": tool_choice, + "tools": tools, + "top_p": top_p, + } + if model_extras is not None and bool(model_extras): + body.update(model_extras) + _unknown_params = _models._enums.UnknownParams.PASS_THROUGH # pylint: disable=protected-access + body = {k: v for k, v in body.items() if v is not None} + elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool): + stream = body["stream"] + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_chat_completions_complete_request( + unknown_params=_unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = stream or False + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the 
socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + return _models.StreamingChatCompletions(response) + + return _deserialize(_models._models.ChatCompletions, response.json()) # pylint: disable=protected-access + + + @distributed_trace + def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError + """ + if not self._model_info: + self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init + return self._model_info + + + def __str__(self) -> str: + # pylint: disable=client-method-name-no-double-underscore + return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() + + + # Remove this once https://github.com/Azure/autorest.python/issues/2619 is fixed, + # and you see the equivalent auto-generated method in _client.py return "Self" + def __enter__(self) -> Self: + self._client.__enter__() + return self + + +class EmbeddingsClient(EmbeddingsClientGenerated): + """EmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + self._model_info: Optional[_models.ModelInfo] = None + super().__init__(endpoint, credential, **kwargs) + + + @overload + def embed( + self, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[str], + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :paramtype input: list[str] + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :keyword extras: Extra parameters (in the form of string key-value pairs) that are not in the + standard request payload. 
+ They will be passed to the service as-is in the root of the JSON request payload. + How the service handles these extra parameters depends on the value of the + ``extra-parameters`` + HTTP request header. Default value is None. + :paramtype extras: dict[str, str] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + def embed( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :param body: An object of type MutableMapping[str, Any], such as a dictionary, that + specifies the full request payload. Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + def embed( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :param body: Specifies the full request payload. Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @distributed_trace + def embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[str] = _Unset, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. 
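A minimal sketch of this text-embeddings call (not part of the diff; endpoint and key are placeholders, and the optional `dimensions`, `encoding_format`, `input_type`, and `model_extras` keywords are omitted):

```python
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import EmbeddingsClient

# Placeholder endpoint and key; substitute your own deployment values.
with EmbeddingsClient("https://<your-endpoint>", AzureKeyCredential("<your-key>")) as client:
    result = client.embed(input=["first phrase", "second phrase", "third phrase"])
    for item in result.data:
        # Each EmbeddingItem pairs the vector with the index of the input
        # string it was computed for.
        print(f"input[{item.index}] -> {len(item.embedding)}-dimensional embedding")
```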
+ + :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type + that specifies the full request payload. Required. + :type body: JSON or IO[bytes] + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :paramtype input: list[str] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + _unknown_params: Union[_models._enums.UnknownParams, None] = None + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + if model_extras is not None and bool(model_extras): + body.update(model_extras) + _unknown_params = _models._enums.UnknownParams.PASS_THROUGH # pylint: disable=protected-access + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_embeddings_embed_request( + unknown_params=_unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = 
self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + return deserialized # type: ignore + + + @distributed_trace + def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError + """ + if not self._model_info: + self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init + return self._model_info + + + def __str__(self) -> str: + # pylint: disable=client-method-name-no-double-underscore + return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() + + + # Remove this once https://github.com/Azure/autorest.python/issues/2619 is fixed, + # and you see the equivalent auto-generated method in _client.py return "Self" + def __enter__(self) -> Self: + self._client.__enter__() + return self + + +class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated): + """ImageEmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any) -> None: + self._model_info: Optional[_models.ModelInfo] = None + super().__init__(endpoint, credential, **kwargs) + + @overload + def embed( + self, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[_models.EmbeddingInput], + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an + array. + The input must not exceed the max input tokens for the model. Required. 
+ :paramtype input: list[~azure.ai.inference.models.EmbeddingInput] + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :keyword extras: Extra parameters (in the form of string key-value pairs) that are not in the + standard request payload. + They will be passed to the service as-is in the root of the JSON request payload. + How the service handles these extra parameters depends on the value of the + ``extra-parameters`` + HTTP request header. Default value is None. + :paramtype extras: dict[str, str] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + def embed( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :param body: An object of type MutableMapping[str, Any], such as a dictionary, that + specifies the full request payload. Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + def embed( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :param body: Specifies the full request payload. Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @distributed_trace + def embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[_models.EmbeddingInput] = _Unset, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type + that specifies the full request payload. Required. + :type body: JSON or IO[bytes] + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an + array. + The input must not exceed the max input tokens for the model. Required. + :paramtype input: list[~azure.ai.inference.models.EmbeddingInput] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + _unknown_params: Union[_models._enums.UnknownParams, None] = None + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + if model_extras is not None and bool(model_extras): + body.update(model_extras) + _unknown_params = _models._enums.UnknownParams.PASS_THROUGH # pylint: disable=protected-access + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_image_embeddings_embed_request( + unknown_params=_unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + return deserialized # type: ignore + + + @distributed_trace + def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + + :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError + """ + if not self._model_info: + self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init + return self._model_info + + + def __str__(self) -> str: + # pylint: disable=client-method-name-no-double-underscore + return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() + + + # Remove this once https://github.com/Azure/autorest.python/issues/2619 is fixed, + # and you see the equivalent auto-generated method in _client.py return "Self" + def __enter__(self) -> Self: + self._client.__enter__() + return self + + +__all__: List[str] = [ + "load_client", + "ChatCompletionsClient", + "EmbeddingsClient", + "ImageEmbeddingsClient", +] # Add all objects you want publicly available to users at this package level + + +def patch_sdk(): + """Do not remove from this file. + + `patch_sdk` is a last resort escape hatch that allows you to do customizations + you can't accomplish using the techniques described in + https://aka.ms/azsdk/python/dpcodegen/python/customize + """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py new file mode 100644 index 000000000000..2f781d740827 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py @@ -0,0 +1,1998 @@ +# -------------------------------------------------------------------------- +# +# Copyright (c) Microsoft Corporation. All rights reserved. +# +# The MIT License (MIT) +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the ""Software""), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
+# +# -------------------------------------------------------------------------- + +# pylint: skip-file +# pyright: reportUnnecessaryTypeIgnoreComment=false + +from base64 import b64decode, b64encode +import calendar +import datetime +import decimal +import email +from enum import Enum +import json +import logging +import re +import sys +import codecs +from typing import ( + Dict, + Any, + cast, + Optional, + Union, + AnyStr, + IO, + Mapping, + Callable, + TypeVar, + MutableMapping, + Type, + List, + Mapping, +) + +try: + from urllib import quote # type: ignore +except ImportError: + from urllib.parse import quote +import xml.etree.ElementTree as ET + +import isodate # type: ignore + +from azure.core.exceptions import DeserializationError, SerializationError +from azure.core.serialization import NULL as CoreNull + +_BOM = codecs.BOM_UTF8.decode(encoding="utf-8") + +ModelType = TypeVar("ModelType", bound="Model") +JSON = MutableMapping[str, Any] + + +class RawDeserializer: + + # Accept "text" because we're open minded people... + JSON_REGEXP = re.compile(r"^(application|text)/([a-z+.]+\+)?json$") + + # Name used in context + CONTEXT_NAME = "deserialized_data" + + @classmethod + def deserialize_from_text(cls, data: Optional[Union[AnyStr, IO]], content_type: Optional[str] = None) -> Any: + """Decode data according to content-type. + + Accept a stream of data as well, but will be load at once in memory for now. + + If no content-type, will return the string version (not bytes, not stream) + + :param data: Input, could be bytes or stream (will be decoded with UTF8) or text + :type data: str or bytes or IO + :param str content_type: The content type. + """ + if hasattr(data, "read"): + # Assume a stream + data = cast(IO, data).read() + + if isinstance(data, bytes): + data_as_str = data.decode(encoding="utf-8-sig") + else: + # Explain to mypy the correct type. + data_as_str = cast(str, data) + + # Remove Byte Order Mark if present in string + data_as_str = data_as_str.lstrip(_BOM) + + if content_type is None: + return data + + if cls.JSON_REGEXP.match(content_type): + try: + return json.loads(data_as_str) + except ValueError as err: + raise DeserializationError("JSON is invalid: {}".format(err), err) + elif "xml" in (content_type or []): + try: + + try: + if isinstance(data, unicode): # type: ignore + # If I'm Python 2.7 and unicode XML will scream if I try a "fromstring" on unicode string + data_as_str = data_as_str.encode(encoding="utf-8") # type: ignore + except NameError: + pass + + return ET.fromstring(data_as_str) # nosec + except ET.ParseError as err: + # It might be because the server has an issue, and returned JSON with + # content-type XML.... + # So let's try a JSON load, and if it's still broken + # let's flow the initial exception + def _json_attemp(data): + try: + return True, json.loads(data) + except ValueError: + return False, None # Don't care about this one + + success, json_result = _json_attemp(data) + if success: + return json_result + # If i'm here, it's not JSON, it's not XML, let's scream + # and raise the last context in this block (the XML exception) + # The function hack is because Py2.7 messes up with exception + # context otherwise. 
+ _LOGGER.critical("Wasn't XML not JSON, failing") + raise DeserializationError("XML is invalid") from err + raise DeserializationError("Cannot deserialize content-type: {}".format(content_type)) + + @classmethod + def deserialize_from_http_generics(cls, body_bytes: Optional[Union[AnyStr, IO]], headers: Mapping) -> Any: + """Deserialize from HTTP response. + + Use bytes and headers to NOT use any requests/aiohttp or whatever + specific implementation. + Headers will tested for "content-type" + """ + # Try to use content-type from headers if available + content_type = None + if "content-type" in headers: + content_type = headers["content-type"].split(";")[0].strip().lower() + # Ouch, this server did not declare what it sent... + # Let's guess it's JSON... + # Also, since Autorest was considering that an empty body was a valid JSON, + # need that test as well.... + else: + content_type = "application/json" + + if body_bytes: + return cls.deserialize_from_text(body_bytes, content_type) + return None + + +_LOGGER = logging.getLogger(__name__) + +try: + _long_type = long # type: ignore +except NameError: + _long_type = int + + +class UTC(datetime.tzinfo): + """Time Zone info for handling UTC""" + + def utcoffset(self, dt): + """UTF offset for UTC is 0.""" + return datetime.timedelta(0) + + def tzname(self, dt): + """Timestamp representation.""" + return "Z" + + def dst(self, dt): + """No daylight saving for UTC.""" + return datetime.timedelta(hours=1) + + +try: + from datetime import timezone as _FixedOffset # type: ignore +except ImportError: # Python 2.7 + + class _FixedOffset(datetime.tzinfo): # type: ignore + """Fixed offset in minutes east from UTC. + Copy/pasted from Python doc + :param datetime.timedelta offset: offset in timedelta format + """ + + def __init__(self, offset): + self.__offset = offset + + def utcoffset(self, dt): + return self.__offset + + def tzname(self, dt): + return str(self.__offset.total_seconds() / 3600) + + def __repr__(self): + return "".format(self.tzname(None)) + + def dst(self, dt): + return datetime.timedelta(0) + + def __getinitargs__(self): + return (self.__offset,) + + +try: + from datetime import timezone + + TZ_UTC = timezone.utc +except ImportError: + TZ_UTC = UTC() # type: ignore + +_FLATTEN = re.compile(r"(? 
None: + self.additional_properties: Optional[Dict[str, Any]] = {} + for k in kwargs: + if k not in self._attribute_map: + _LOGGER.warning("%s is not a known attribute of class %s and will be ignored", k, self.__class__) + elif k in self._validation and self._validation[k].get("readonly", False): + _LOGGER.warning("Readonly attribute %s will be ignored in class %s", k, self.__class__) + else: + setattr(self, k, kwargs[k]) + + def __eq__(self, other: Any) -> bool: + """Compare objects by comparing all attributes.""" + if isinstance(other, self.__class__): + return self.__dict__ == other.__dict__ + return False + + def __ne__(self, other: Any) -> bool: + """Compare objects by comparing all attributes.""" + return not self.__eq__(other) + + def __str__(self) -> str: + return str(self.__dict__) + + @classmethod + def enable_additional_properties_sending(cls) -> None: + cls._attribute_map["additional_properties"] = {"key": "", "type": "{object}"} + + @classmethod + def is_xml_model(cls) -> bool: + try: + cls._xml_map # type: ignore + except AttributeError: + return False + return True + + @classmethod + def _create_xml_node(cls): + """Create XML node.""" + try: + xml_map = cls._xml_map # type: ignore + except AttributeError: + xml_map = {} + + return _create_xml_node(xml_map.get("name", cls.__name__), xml_map.get("prefix", None), xml_map.get("ns", None)) + + def serialize(self, keep_readonly: bool = False, **kwargs: Any) -> JSON: + """Return the JSON that would be sent to server from this model. + + This is an alias to `as_dict(full_restapi_key_transformer, keep_readonly=False)`. + + If you want XML serialization, you can pass the kwargs is_xml=True. + + :param bool keep_readonly: If you want to serialize the readonly attributes + :returns: A dict JSON compatible object + :rtype: dict + """ + serializer = Serializer(self._infer_class_models()) + return serializer._serialize(self, keep_readonly=keep_readonly, **kwargs) # type: ignore + + def as_dict( + self, + keep_readonly: bool = True, + key_transformer: Callable[[str, Dict[str, Any], Any], Any] = attribute_transformer, + **kwargs: Any + ) -> JSON: + """Return a dict that can be serialized using json.dump. + + Advanced usage might optionally use a callback as parameter: + + .. code::python + + def my_key_transformer(key, attr_desc, value): + return key + + Key is the attribute name used in Python. Attr_desc + is a dict of metadata. Currently contains 'type' with the + msrest type and 'key' with the RestAPI encoded key. + Value is the current value in this object. + + The string returned will be used to serialize the key. + If the return type is a list, this is considered hierarchical + result dict. + + See the three examples in this file: + + - attribute_transformer + - full_restapi_key_transformer + - last_restapi_key_transformer + + If you want XML serialization, you can pass the kwargs is_xml=True. + + :param function key_transformer: A key transformer function. 
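# Illustrative sketch (editor's example, not part of the vendored file): a
# custom key transformer passed to as_dict(). The serializer unpacks the
# callback's return value as (key, value), so both must be returned.
# "my_model" below is a hypothetical model instance.
def upper_key_transformer(key, attr_desc, value):
    # attr_desc carries the msrest 'type' and RestAPI 'key' metadata.
    return key.upper(), value

# my_model.as_dict(key_transformer=upper_key_transformer)
# would emit upper-cased attribute names instead of the defaults.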
+ :returns: A dict JSON compatible object + :rtype: dict + """ + serializer = Serializer(self._infer_class_models()) + return serializer._serialize(self, key_transformer=key_transformer, keep_readonly=keep_readonly, **kwargs) # type: ignore + + @classmethod + def _infer_class_models(cls): + try: + str_models = cls.__module__.rsplit(".", 1)[0] + models = sys.modules[str_models] + client_models = {k: v for k, v in models.__dict__.items() if isinstance(v, type)} + if cls.__name__ not in client_models: + raise ValueError("Not Autorest generated code") + except Exception: + # Assume it's not Autorest generated (tests?). Add ourselves as dependencies. + client_models = {cls.__name__: cls} + return client_models + + @classmethod + def deserialize(cls: Type[ModelType], data: Any, content_type: Optional[str] = None) -> ModelType: + """Parse a str using the RestAPI syntax and return a model. + + :param str data: A str using RestAPI structure. JSON by default. + :param str content_type: JSON by default, set application/xml if XML. + :returns: An instance of this model + :raises: DeserializationError if something went wrong + """ + deserializer = Deserializer(cls._infer_class_models()) + return deserializer(cls.__name__, data, content_type=content_type) # type: ignore + + @classmethod + def from_dict( + cls: Type[ModelType], + data: Any, + key_extractors: Optional[Callable[[str, Dict[str, Any], Any], Any]] = None, + content_type: Optional[str] = None, + ) -> ModelType: + """Parse a dict using given key extractor return a model. + + By default consider key + extractors (rest_key_case_insensitive_extractor, attribute_key_case_insensitive_extractor + and last_rest_key_case_insensitive_extractor) + + :param dict data: A dict using RestAPI structure + :param str content_type: JSON by default, set application/xml if XML. + :returns: An instance of this model + :raises: DeserializationError if something went wrong + """ + deserializer = Deserializer(cls._infer_class_models()) + deserializer.key_extractors = ( # type: ignore + [ # type: ignore + attribute_key_case_insensitive_extractor, + rest_key_case_insensitive_extractor, + last_rest_key_case_insensitive_extractor, + ] + if key_extractors is None + else key_extractors + ) + return deserializer(cls.__name__, data, content_type=content_type) # type: ignore + + @classmethod + def _flatten_subtype(cls, key, objects): + if "_subtype_map" not in cls.__dict__: + return {} + result = dict(cls._subtype_map[key]) + for valuetype in cls._subtype_map[key].values(): + result.update(objects[valuetype]._flatten_subtype(key, objects)) + return result + + @classmethod + def _classify(cls, response, objects): + """Check the class _subtype_map for any child classes. + We want to ignore any inherited _subtype_maps. + Remove the polymorphic key from the initial data. + """ + for subtype_key in cls.__dict__.get("_subtype_map", {}).keys(): + subtype_value = None + + if not isinstance(response, ET.Element): + rest_api_response_key = cls._get_rest_key_parts(subtype_key)[-1] + subtype_value = response.pop(rest_api_response_key, None) or response.pop(subtype_key, None) + else: + subtype_value = xml_key_extractor(subtype_key, cls._attribute_map[subtype_key], response) + if subtype_value: + # Try to match base class. 
Can be class name only + # (bug to fix in Autorest to support x-ms-discriminator-name) + if cls.__name__ == subtype_value: + return cls + flatten_mapping_type = cls._flatten_subtype(subtype_key, objects) + try: + return objects[flatten_mapping_type[subtype_value]] # type: ignore + except KeyError: + _LOGGER.warning( + "Subtype value %s has no mapping, use base class %s.", + subtype_value, + cls.__name__, + ) + break + else: + _LOGGER.warning("Discriminator %s is absent or null, use base class %s.", subtype_key, cls.__name__) + break + return cls + + @classmethod + def _get_rest_key_parts(cls, attr_key): + """Get the RestAPI key of this attr, split it and decode part + :param str attr_key: Attribute key must be in attribute_map. + :returns: A list of RestAPI part + :rtype: list + """ + rest_split_key = _FLATTEN.split(cls._attribute_map[attr_key]["key"]) + return [_decode_attribute_map_key(key_part) for key_part in rest_split_key] + + +def _decode_attribute_map_key(key): + """This decode a key in an _attribute_map to the actual key we want to look at + inside the received data. + + :param str key: A key string from the generated code + """ + return key.replace("\\.", ".") + + +class Serializer(object): + """Request object model serializer.""" + + basic_types = {str: "str", int: "int", bool: "bool", float: "float"} + + _xml_basic_types_serializers = {"bool": lambda x: str(x).lower()} + days = {0: "Mon", 1: "Tue", 2: "Wed", 3: "Thu", 4: "Fri", 5: "Sat", 6: "Sun"} + months = { + 1: "Jan", + 2: "Feb", + 3: "Mar", + 4: "Apr", + 5: "May", + 6: "Jun", + 7: "Jul", + 8: "Aug", + 9: "Sep", + 10: "Oct", + 11: "Nov", + 12: "Dec", + } + validation = { + "min_length": lambda x, y: len(x) < y, + "max_length": lambda x, y: len(x) > y, + "minimum": lambda x, y: x < y, + "maximum": lambda x, y: x > y, + "minimum_ex": lambda x, y: x <= y, + "maximum_ex": lambda x, y: x >= y, + "min_items": lambda x, y: len(x) < y, + "max_items": lambda x, y: len(x) > y, + "pattern": lambda x, y: not re.match(y, x, re.UNICODE), + "unique": lambda x, y: len(x) != len(set(x)), + "multiple": lambda x, y: x % y != 0, + } + + def __init__(self, classes: Optional[Mapping[str, type]] = None): + self.serialize_type = { + "iso-8601": Serializer.serialize_iso, + "rfc-1123": Serializer.serialize_rfc, + "unix-time": Serializer.serialize_unix, + "duration": Serializer.serialize_duration, + "date": Serializer.serialize_date, + "time": Serializer.serialize_time, + "decimal": Serializer.serialize_decimal, + "long": Serializer.serialize_long, + "bytearray": Serializer.serialize_bytearray, + "base64": Serializer.serialize_base64, + "object": self.serialize_object, + "[]": self.serialize_iter, + "{}": self.serialize_dict, + } + self.dependencies: Dict[str, type] = dict(classes) if classes else {} + self.key_transformer = full_restapi_key_transformer + self.client_side_validation = True + + def _serialize(self, target_obj, data_type=None, **kwargs): + """Serialize data into a string according to type. + + :param target_obj: The data to be serialized. + :param str data_type: The type to be serialized from. + :rtype: str, dict + :raises: SerializationError if serialization fails. 
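# Illustrative sketch (editor's example, not part of the vendored file):
# serializing a minimal msrest-style model with the Serializer above.
# "Thermostat" is a made-up model; note how the flattened RestAPI key
# "properties.target" becomes a nested dict in the output.
class Thermostat(Model):
    _attribute_map = {
        "target": {"key": "properties.target", "type": "float"},
        "unit": {"key": "unit", "type": "str"},
    }

serializer = Serializer({"Thermostat": Thermostat})
body = serializer.body(Thermostat(target=21.5, unit="celsius"), "Thermostat")
# body == {"properties": {"target": 21.5}, "unit": "celsius"}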
+ """ + key_transformer = kwargs.get("key_transformer", self.key_transformer) + keep_readonly = kwargs.get("keep_readonly", False) + if target_obj is None: + return None + + attr_name = None + class_name = target_obj.__class__.__name__ + + if data_type: + return self.serialize_data(target_obj, data_type, **kwargs) + + if not hasattr(target_obj, "_attribute_map"): + data_type = type(target_obj).__name__ + if data_type in self.basic_types.values(): + return self.serialize_data(target_obj, data_type, **kwargs) + + # Force "is_xml" kwargs if we detect a XML model + try: + is_xml_model_serialization = kwargs["is_xml"] + except KeyError: + is_xml_model_serialization = kwargs.setdefault("is_xml", target_obj.is_xml_model()) + + serialized = {} + if is_xml_model_serialization: + serialized = target_obj._create_xml_node() + try: + attributes = target_obj._attribute_map + for attr, attr_desc in attributes.items(): + attr_name = attr + if not keep_readonly and target_obj._validation.get(attr_name, {}).get("readonly", False): + continue + + if attr_name == "additional_properties" and attr_desc["key"] == "": + if target_obj.additional_properties is not None: + serialized.update(target_obj.additional_properties) + continue + try: + + orig_attr = getattr(target_obj, attr) + if is_xml_model_serialization: + pass # Don't provide "transformer" for XML for now. Keep "orig_attr" + else: # JSON + keys, orig_attr = key_transformer(attr, attr_desc.copy(), orig_attr) + keys = keys if isinstance(keys, list) else [keys] + + kwargs["serialization_ctxt"] = attr_desc + new_attr = self.serialize_data(orig_attr, attr_desc["type"], **kwargs) + + if is_xml_model_serialization: + xml_desc = attr_desc.get("xml", {}) + xml_name = xml_desc.get("name", attr_desc["key"]) + xml_prefix = xml_desc.get("prefix", None) + xml_ns = xml_desc.get("ns", None) + if xml_desc.get("attr", False): + if xml_ns: + ET.register_namespace(xml_prefix, xml_ns) + xml_name = "{{{}}}{}".format(xml_ns, xml_name) + serialized.set(xml_name, new_attr) # type: ignore + continue + if xml_desc.get("text", False): + serialized.text = new_attr # type: ignore + continue + if isinstance(new_attr, list): + serialized.extend(new_attr) # type: ignore + elif isinstance(new_attr, ET.Element): + # If the down XML has no XML/Name, we MUST replace the tag with the local tag. But keeping the namespaces. 
+ if "name" not in getattr(orig_attr, "_xml_map", {}): + splitted_tag = new_attr.tag.split("}") + if len(splitted_tag) == 2: # Namespace + new_attr.tag = "}".join([splitted_tag[0], xml_name]) + else: + new_attr.tag = xml_name + serialized.append(new_attr) # type: ignore + else: # That's a basic type + # Integrate namespace if necessary + local_node = _create_xml_node(xml_name, xml_prefix, xml_ns) + local_node.text = str(new_attr) + serialized.append(local_node) # type: ignore + else: # JSON + for k in reversed(keys): # type: ignore + new_attr = {k: new_attr} + + _new_attr = new_attr + _serialized = serialized + for k in keys: # type: ignore + if k not in _serialized: + _serialized.update(_new_attr) # type: ignore + _new_attr = _new_attr[k] # type: ignore + _serialized = _serialized[k] + except ValueError as err: + if isinstance(err, SerializationError): + raise + + except (AttributeError, KeyError, TypeError) as err: + msg = "Attribute {} in object {} cannot be serialized.\n{}".format(attr_name, class_name, str(target_obj)) + raise SerializationError(msg) from err + else: + return serialized + + def body(self, data, data_type, **kwargs): + """Serialize data intended for a request body. + + :param data: The data to be serialized. + :param str data_type: The type to be serialized from. + :rtype: dict + :raises: SerializationError if serialization fails. + :raises: ValueError if data is None + """ + + # Just in case this is a dict + internal_data_type_str = data_type.strip("[]{}") + internal_data_type = self.dependencies.get(internal_data_type_str, None) + try: + is_xml_model_serialization = kwargs["is_xml"] + except KeyError: + if internal_data_type and issubclass(internal_data_type, Model): + is_xml_model_serialization = kwargs.setdefault("is_xml", internal_data_type.is_xml_model()) + else: + is_xml_model_serialization = False + if internal_data_type and not isinstance(internal_data_type, Enum): + try: + deserializer = Deserializer(self.dependencies) + # Since it's on serialization, it's almost sure that format is not JSON REST + # We're not able to deal with additional properties for now. + deserializer.additional_properties_detection = False + if is_xml_model_serialization: + deserializer.key_extractors = [ # type: ignore + attribute_key_case_insensitive_extractor, + ] + else: + deserializer.key_extractors = [ + rest_key_case_insensitive_extractor, + attribute_key_case_insensitive_extractor, + last_rest_key_case_insensitive_extractor, + ] + data = deserializer._deserialize(data_type, data) + except DeserializationError as err: + raise SerializationError("Unable to build a model: " + str(err)) from err + + return self._serialize(data, data_type, **kwargs) + + def url(self, name, data, data_type, **kwargs): + """Serialize data intended for a URL path. + + :param data: The data to be serialized. + :param str data_type: The type to be serialized from. + :rtype: str + :raises: TypeError if serialization fails. + :raises: ValueError if data is None + """ + try: + output = self.serialize_data(data, data_type, **kwargs) + if data_type == "bool": + output = json.dumps(output) + + if kwargs.get("skip_quote") is True: + output = str(output) + output = output.replace("{", quote("{")).replace("}", quote("}")) + else: + output = quote(str(output), safe="") + except SerializationError: + raise TypeError("{} must be type {}.".format(name, data_type)) + else: + return output + + def query(self, name, data, data_type, **kwargs): + """Serialize data intended for a URL query. 
+ + :param data: The data to be serialized. + :param str data_type: The type to be serialized from. + :keyword bool skip_quote: Whether to skip quote the serialized result. + Defaults to False. + :rtype: str, list + :raises: TypeError if serialization fails. + :raises: ValueError if data is None + """ + try: + # Treat the list aside, since we don't want to encode the div separator + if data_type.startswith("["): + internal_data_type = data_type[1:-1] + do_quote = not kwargs.get("skip_quote", False) + return self.serialize_iter(data, internal_data_type, do_quote=do_quote, **kwargs) + + # Not a list, regular serialization + output = self.serialize_data(data, data_type, **kwargs) + if data_type == "bool": + output = json.dumps(output) + if kwargs.get("skip_quote") is True: + output = str(output) + else: + output = quote(str(output), safe="") + except SerializationError: + raise TypeError("{} must be type {}.".format(name, data_type)) + else: + return str(output) + + def header(self, name, data, data_type, **kwargs): + """Serialize data intended for a request header. + + :param data: The data to be serialized. + :param str data_type: The type to be serialized from. + :rtype: str + :raises: TypeError if serialization fails. + :raises: ValueError if data is None + """ + try: + if data_type in ["[str]"]: + data = ["" if d is None else d for d in data] + + output = self.serialize_data(data, data_type, **kwargs) + if data_type == "bool": + output = json.dumps(output) + except SerializationError: + raise TypeError("{} must be type {}.".format(name, data_type)) + else: + return str(output) + + def serialize_data(self, data, data_type, **kwargs): + """Serialize generic data according to supplied data type. + + :param data: The data to be serialized. + :param str data_type: The type to be serialized from. + :param bool required: Whether it's essential that the data not be + empty or None + :raises: AttributeError if required data is None. + :raises: ValueError if data is None + :raises: SerializationError if serialization fails. + """ + if data is None: + raise ValueError("No value for given attribute") + + try: + if data is CoreNull: + return None + if data_type in self.basic_types.values(): + return self.serialize_basic(data, data_type, **kwargs) + + elif data_type in self.serialize_type: + return self.serialize_type[data_type](data, **kwargs) + + # If dependencies is empty, try with current data class + # It has to be a subclass of Enum anyway + enum_type = self.dependencies.get(data_type, data.__class__) + if issubclass(enum_type, Enum): + return Serializer.serialize_enum(data, enum_obj=enum_type) + + iter_type = data_type[0] + data_type[-1] + if iter_type in self.serialize_type: + return self.serialize_type[iter_type](data, data_type[1:-1], **kwargs) + + except (ValueError, TypeError) as err: + msg = "Unable to serialize value: {!r} as type: {!r}." + raise SerializationError(msg.format(data, data_type)) from err + else: + return self._serialize(data, **kwargs) + + @classmethod + def _get_custom_serializers(cls, data_type, **kwargs): + custom_serializer = kwargs.get("basic_types_serializers", {}).get(data_type) + if custom_serializer: + return custom_serializer + if kwargs.get("is_xml", False): + return cls._xml_basic_types_serializers.get(data_type) + + @classmethod + def serialize_basic(cls, data, data_type, **kwargs): + """Serialize basic builting data type. + Serializes objects to str, int, float or bool. 
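# Illustrative sketch (editor's example, not part of the vendored file): the
# XML flavour of basic-type serialization lower-cases booleans, while the
# default path keeps the native Python value.
Serializer.serialize_basic(True, "bool", is_xml=True)   # -> 'true'
Serializer.serialize_basic(True, "bool")                # -> True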
+ + Possible kwargs: + - basic_types_serializers dict[str, callable] : If set, use the callable as serializer + - is_xml bool : If set, use xml_basic_types_serializers + + :param data: Object to be serialized. + :param str data_type: Type of object in the iterable. + """ + custom_serializer = cls._get_custom_serializers(data_type, **kwargs) + if custom_serializer: + return custom_serializer(data) + if data_type == "str": + return cls.serialize_unicode(data) + return eval(data_type)(data) # nosec + + @classmethod + def serialize_unicode(cls, data): + """Special handling for serializing unicode strings in Py2. + Encode to UTF-8 if unicode, otherwise handle as a str. + + :param data: Object to be serialized. + :rtype: str + """ + try: # If I received an enum, return its value + return data.value + except AttributeError: + pass + + try: + if isinstance(data, unicode): # type: ignore + # Don't change it, JSON and XML ElementTree are totally able + # to serialize correctly u'' strings + return data + except NameError: + return str(data) + else: + return str(data) + + def serialize_iter(self, data, iter_type, div=None, **kwargs): + """Serialize iterable. + + Supported kwargs: + - serialization_ctxt dict : The current entry of _attribute_map, or same format. + serialization_ctxt['type'] should be same as data_type. + - is_xml bool : If set, serialize as XML + + :param list attr: Object to be serialized. + :param str iter_type: Type of object in the iterable. + :param bool required: Whether the objects in the iterable must + not be None or empty. + :param str div: If set, this str will be used to combine the elements + in the iterable into a combined string. Default is 'None'. + :keyword bool do_quote: Whether to quote the serialized result of each iterable element. + Defaults to False. + :rtype: list, str + """ + if isinstance(data, str): + raise SerializationError("Refuse str type as a valid iter type.") + + serialization_ctxt = kwargs.get("serialization_ctxt", {}) + is_xml = kwargs.get("is_xml", False) + + serialized = [] + for d in data: + try: + serialized.append(self.serialize_data(d, iter_type, **kwargs)) + except ValueError as err: + if isinstance(err, SerializationError): + raise + serialized.append(None) + + if kwargs.get("do_quote", False): + serialized = ["" if s is None else quote(str(s), safe="") for s in serialized] + + if div: + serialized = ["" if s is None else str(s) for s in serialized] + serialized = div.join(serialized) + + if "xml" in serialization_ctxt or is_xml: + # XML serialization is more complicated + xml_desc = serialization_ctxt.get("xml", {}) + xml_name = xml_desc.get("name") + if not xml_name: + xml_name = serialization_ctxt["key"] + + # Create a wrap node if necessary (use the fact that Element and list have "append") + is_wrapped = xml_desc.get("wrapped", False) + node_name = xml_desc.get("itemsName", xml_name) + if is_wrapped: + final_result = _create_xml_node(xml_name, xml_desc.get("prefix", None), xml_desc.get("ns", None)) + else: + final_result = [] + # All list elements to "local_node" + for el in serialized: + if isinstance(el, ET.Element): + el_node = el + else: + el_node = _create_xml_node(node_name, xml_desc.get("prefix", None), xml_desc.get("ns", None)) + if el is not None: # Otherwise it writes "None" :-p + el_node.text = str(el) + final_result.append(el_node) + return final_result + return serialized + + def serialize_dict(self, attr, dict_type, **kwargs): + """Serialize a dictionary of objects. + + :param dict attr: Object to be serialized. 
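# Illustrative sketch (editor's example, not part of the vendored file):
# joining an iterable with a divider, as used for comma-separated query
# parameters; do_quote URL-encodes each element first.
serializer = Serializer()
serializer.serialize_iter(["read", "write"], "str", div=",")            # -> 'read,write'
serializer.serialize_iter(["a b", "c"], "str", div=",", do_quote=True)  # -> 'a%20b,c'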
+ :param str dict_type: Type of object in the dictionary. + :param bool required: Whether the objects in the dictionary must + not be None or empty. + :rtype: dict + """ + serialization_ctxt = kwargs.get("serialization_ctxt", {}) + serialized = {} + for key, value in attr.items(): + try: + serialized[self.serialize_unicode(key)] = self.serialize_data(value, dict_type, **kwargs) + except ValueError as err: + if isinstance(err, SerializationError): + raise + serialized[self.serialize_unicode(key)] = None + + if "xml" in serialization_ctxt: + # XML serialization is more complicated + xml_desc = serialization_ctxt["xml"] + xml_name = xml_desc["name"] + + final_result = _create_xml_node(xml_name, xml_desc.get("prefix", None), xml_desc.get("ns", None)) + for key, value in serialized.items(): + ET.SubElement(final_result, key).text = value + return final_result + + return serialized + + def serialize_object(self, attr, **kwargs): + """Serialize a generic object. + This will be handled as a dictionary. If object passed in is not + a basic type (str, int, float, dict, list) it will simply be + cast to str. + + :param dict attr: Object to be serialized. + :rtype: dict or str + """ + if attr is None: + return None + if isinstance(attr, ET.Element): + return attr + obj_type = type(attr) + if obj_type in self.basic_types: + return self.serialize_basic(attr, self.basic_types[obj_type], **kwargs) + if obj_type is _long_type: + return self.serialize_long(attr) + if obj_type is str: + return self.serialize_unicode(attr) + if obj_type is datetime.datetime: + return self.serialize_iso(attr) + if obj_type is datetime.date: + return self.serialize_date(attr) + if obj_type is datetime.time: + return self.serialize_time(attr) + if obj_type is datetime.timedelta: + return self.serialize_duration(attr) + if obj_type is decimal.Decimal: + return self.serialize_decimal(attr) + + # If it's a model or I know this dependency, serialize as a Model + elif obj_type in self.dependencies.values() or isinstance(attr, Model): + return self._serialize(attr) + + if obj_type == dict: + serialized = {} + for key, value in attr.items(): + try: + serialized[self.serialize_unicode(key)] = self.serialize_object(value, **kwargs) + except ValueError: + serialized[self.serialize_unicode(key)] = None + return serialized + + if obj_type == list: + serialized = [] + for obj in attr: + try: + serialized.append(self.serialize_object(obj, **kwargs)) + except ValueError: + pass + return serialized + return str(attr) + + @staticmethod + def serialize_enum(attr, enum_obj=None): + try: + result = attr.value + except AttributeError: + result = attr + try: + enum_obj(result) # type: ignore + return result + except ValueError: + for enum_value in enum_obj: # type: ignore + if enum_value.value.lower() == str(attr).lower(): + return enum_value.value + error = "{!r} is not valid value for enum {!r}" + raise SerializationError(error.format(attr, enum_obj)) + + @staticmethod + def serialize_bytearray(attr, **kwargs): + """Serialize bytearray into base-64 string. + + :param attr: Object to be serialized. + :rtype: str + """ + return b64encode(attr).decode() + + @staticmethod + def serialize_base64(attr, **kwargs): + """Serialize str into base-64 string. + + :param attr: Object to be serialized. + :rtype: str + """ + encoded = b64encode(attr).decode("ascii") + return encoded.strip("=").replace("+", "-").replace("/", "_") + + @staticmethod + def serialize_decimal(attr, **kwargs): + """Serialize Decimal object to float. 
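# Illustrative sketch (editor's example, not part of the vendored file): enum
# serialization falls back to a case-insensitive match on the enum *values*.
# "Detail" is a made-up enum; Enum is already imported at the top of this module.
class Detail(str, Enum):
    LOW = "low"
    HIGH = "high"

Serializer.serialize_enum("HIGH", enum_obj=Detail)        # -> 'high'
Serializer.serialize_enum(Detail.LOW, enum_obj=Detail)    # -> 'low'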
+ + :param attr: Object to be serialized. + :rtype: float + """ + return float(attr) + + @staticmethod + def serialize_long(attr, **kwargs): + """Serialize long (Py2) or int (Py3). + + :param attr: Object to be serialized. + :rtype: int/long + """ + return _long_type(attr) + + @staticmethod + def serialize_date(attr, **kwargs): + """Serialize Date object into ISO-8601 formatted string. + + :param Date attr: Object to be serialized. + :rtype: str + """ + if isinstance(attr, str): + attr = isodate.parse_date(attr) + t = "{:04}-{:02}-{:02}".format(attr.year, attr.month, attr.day) + return t + + @staticmethod + def serialize_time(attr, **kwargs): + """Serialize Time object into ISO-8601 formatted string. + + :param datetime.time attr: Object to be serialized. + :rtype: str + """ + if isinstance(attr, str): + attr = isodate.parse_time(attr) + t = "{:02}:{:02}:{:02}".format(attr.hour, attr.minute, attr.second) + if attr.microsecond: + t += ".{:02}".format(attr.microsecond) + return t + + @staticmethod + def serialize_duration(attr, **kwargs): + """Serialize TimeDelta object into ISO-8601 formatted string. + + :param TimeDelta attr: Object to be serialized. + :rtype: str + """ + if isinstance(attr, str): + attr = isodate.parse_duration(attr) + return isodate.duration_isoformat(attr) + + @staticmethod + def serialize_rfc(attr, **kwargs): + """Serialize Datetime object into RFC-1123 formatted string. + + :param Datetime attr: Object to be serialized. + :rtype: str + :raises: TypeError if format invalid. + """ + try: + if not attr.tzinfo: + _LOGGER.warning("Datetime with no tzinfo will be considered UTC.") + utc = attr.utctimetuple() + except AttributeError: + raise TypeError("RFC1123 object must be valid Datetime object.") + + return "{}, {:02} {} {:04} {:02}:{:02}:{:02} GMT".format( + Serializer.days[utc.tm_wday], + utc.tm_mday, + Serializer.months[utc.tm_mon], + utc.tm_year, + utc.tm_hour, + utc.tm_min, + utc.tm_sec, + ) + + @staticmethod + def serialize_iso(attr, **kwargs): + """Serialize Datetime object into ISO-8601 formatted string. + + :param Datetime attr: Object to be serialized. + :rtype: str + :raises: SerializationError if format invalid. + """ + if isinstance(attr, str): + attr = isodate.parse_datetime(attr) + try: + if not attr.tzinfo: + _LOGGER.warning("Datetime with no tzinfo will be considered UTC.") + utc = attr.utctimetuple() + if utc.tm_year > 9999 or utc.tm_year < 1: + raise OverflowError("Hit max or min date") + + microseconds = str(attr.microsecond).rjust(6, "0").rstrip("0").ljust(3, "0") + if microseconds: + microseconds = "." + microseconds + date = "{:04}-{:02}-{:02}T{:02}:{:02}:{:02}".format( + utc.tm_year, utc.tm_mon, utc.tm_mday, utc.tm_hour, utc.tm_min, utc.tm_sec + ) + return date + microseconds + "Z" + except (ValueError, OverflowError) as err: + msg = "Unable to serialize datetime object." + raise SerializationError(msg) from err + except AttributeError as err: + msg = "ISO-8601 object must be valid Datetime object." + raise TypeError(msg) from err + + @staticmethod + def serialize_unix(attr, **kwargs): + """Serialize Datetime object into IntTime format. + This is represented as seconds. + + :param Datetime attr: Object to be serialized. 
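# Illustrative sketch (editor's example, not part of the vendored file): the
# three datetime wire formats produced by the static helpers above for the
# same UTC instant; datetime is already imported at the top of this module.
dt = datetime.datetime(2024, 6, 11, 12, 30, 5, tzinfo=datetime.timezone.utc)
Serializer.serialize_rfc(dt)    # -> 'Tue, 11 Jun 2024 12:30:05 GMT'
Serializer.serialize_iso(dt)    # -> '2024-06-11T12:30:05.000Z'
Serializer.serialize_unix(dt)   # -> 1718109005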
+ :rtype: int + :raises: SerializationError if format invalid + """ + if isinstance(attr, int): + return attr + try: + if not attr.tzinfo: + _LOGGER.warning("Datetime with no tzinfo will be considered UTC.") + return int(calendar.timegm(attr.utctimetuple())) + except AttributeError: + raise TypeError("Unix time object must be valid Datetime object.") + + +def rest_key_extractor(attr, attr_desc, data): + key = attr_desc["key"] + working_data = data + + while "." in key: + # Need the cast, as for some reasons "split" is typed as list[str | Any] + dict_keys = cast(List[str], _FLATTEN.split(key)) + if len(dict_keys) == 1: + key = _decode_attribute_map_key(dict_keys[0]) + break + working_key = _decode_attribute_map_key(dict_keys[0]) + working_data = working_data.get(working_key, data) + if working_data is None: + # If at any point while following flatten JSON path see None, it means + # that all properties under are None as well + return None + key = ".".join(dict_keys[1:]) + + return working_data.get(key) + + +def rest_key_case_insensitive_extractor(attr, attr_desc, data): + key = attr_desc["key"] + working_data = data + + while "." in key: + dict_keys = _FLATTEN.split(key) + if len(dict_keys) == 1: + key = _decode_attribute_map_key(dict_keys[0]) + break + working_key = _decode_attribute_map_key(dict_keys[0]) + working_data = attribute_key_case_insensitive_extractor(working_key, None, working_data) + if working_data is None: + # If at any point while following flatten JSON path see None, it means + # that all properties under are None as well + return None + key = ".".join(dict_keys[1:]) + + if working_data: + return attribute_key_case_insensitive_extractor(key, None, working_data) + + +def last_rest_key_extractor(attr, attr_desc, data): + """Extract the attribute in "data" based on the last part of the JSON path key.""" + key = attr_desc["key"] + dict_keys = _FLATTEN.split(key) + return attribute_key_extractor(dict_keys[-1], None, data) + + +def last_rest_key_case_insensitive_extractor(attr, attr_desc, data): + """Extract the attribute in "data" based on the last part of the JSON path key. + + This is the case insensitive version of "last_rest_key_extractor" + """ + key = attr_desc["key"] + dict_keys = _FLATTEN.split(key) + return attribute_key_case_insensitive_extractor(dict_keys[-1], None, data) + + +def attribute_key_extractor(attr, _, data): + return data.get(attr) + + +def attribute_key_case_insensitive_extractor(attr, _, data): + found_key = None + lower_attr = attr.lower() + for key in data: + if lower_attr == key.lower(): + found_key = key + break + + return data.get(found_key) + + +def _extract_name_from_internal_type(internal_type): + """Given an internal type XML description, extract correct XML name with namespace. 
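# Illustrative sketch (editor's example, not part of the vendored file): how a
# flattened RestAPI key is resolved against nested response JSON. The
# attr_desc and data values below are made up.
attr_desc = {"key": "properties.modelName", "type": "str"}
data = {"properties": {"modelName": "my-model"}}

rest_key_extractor("model_name", attr_desc, data)   # -> 'my-model'
rest_key_case_insensitive_extractor(
    "model_name", attr_desc, {"Properties": {"ModelName": "my-model"}}
)                                                   # -> 'my-model'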
+ + :param dict internal_type: An model type + :rtype: tuple + :returns: A tuple XML name + namespace dict + """ + internal_type_xml_map = getattr(internal_type, "_xml_map", {}) + xml_name = internal_type_xml_map.get("name", internal_type.__name__) + xml_ns = internal_type_xml_map.get("ns", None) + if xml_ns: + xml_name = "{{{}}}{}".format(xml_ns, xml_name) + return xml_name + + +def xml_key_extractor(attr, attr_desc, data): + if isinstance(data, dict): + return None + + # Test if this model is XML ready first + if not isinstance(data, ET.Element): + return None + + xml_desc = attr_desc.get("xml", {}) + xml_name = xml_desc.get("name", attr_desc["key"]) + + # Look for a children + is_iter_type = attr_desc["type"].startswith("[") + is_wrapped = xml_desc.get("wrapped", False) + internal_type = attr_desc.get("internalType", None) + internal_type_xml_map = getattr(internal_type, "_xml_map", {}) + + # Integrate namespace if necessary + xml_ns = xml_desc.get("ns", internal_type_xml_map.get("ns", None)) + if xml_ns: + xml_name = "{{{}}}{}".format(xml_ns, xml_name) + + # If it's an attribute, that's simple + if xml_desc.get("attr", False): + return data.get(xml_name) + + # If it's x-ms-text, that's simple too + if xml_desc.get("text", False): + return data.text + + # Scenario where I take the local name: + # - Wrapped node + # - Internal type is an enum (considered basic types) + # - Internal type has no XML/Name node + if is_wrapped or (internal_type and (issubclass(internal_type, Enum) or "name" not in internal_type_xml_map)): + children = data.findall(xml_name) + # If internal type has a local name and it's not a list, I use that name + elif not is_iter_type and internal_type and "name" in internal_type_xml_map: + xml_name = _extract_name_from_internal_type(internal_type) + children = data.findall(xml_name) + # That's an array + else: + if internal_type: # Complex type, ignore itemsName and use the complex type name + items_name = _extract_name_from_internal_type(internal_type) + else: + items_name = xml_desc.get("itemsName", xml_name) + children = data.findall(items_name) + + if len(children) == 0: + if is_iter_type: + if is_wrapped: + return None # is_wrapped no node, we want None + else: + return [] # not wrapped, assume empty list + return None # Assume it's not there, maybe an optional node. + + # If is_iter_type and not wrapped, return all found children + if is_iter_type: + if not is_wrapped: + return children + else: # Iter and wrapped, should have found one node only (the wrap one) + if len(children) != 1: + raise DeserializationError( + "Tried to deserialize an array not wrapped, and found several nodes '{}'. Maybe you should declare this array as wrapped?".format( + xml_name + ) + ) + return list(children[0]) # Might be empty list and that's ok. + + # Here it's not a itertype, we should have found one element only or empty + if len(children) > 1: + raise DeserializationError("Find several XML '{}' where it was not expected".format(xml_name)) + return children[0] + + +class Deserializer(object): + """Response object model deserializer. + + :param dict classes: Class type dictionary for deserializing complex types. + :ivar list key_extractors: Ordered list of extractors to be used by this deserializer. 
+ """ + + basic_types = {str: "str", int: "int", bool: "bool", float: "float"} + + valid_date = re.compile(r"\d{4}[-]\d{2}[-]\d{2}T\d{2}:\d{2}:\d{2}" r"\.?\d*Z?[-+]?[\d{2}]?:?[\d{2}]?") + + def __init__(self, classes: Optional[Mapping[str, type]] = None): + self.deserialize_type = { + "iso-8601": Deserializer.deserialize_iso, + "rfc-1123": Deserializer.deserialize_rfc, + "unix-time": Deserializer.deserialize_unix, + "duration": Deserializer.deserialize_duration, + "date": Deserializer.deserialize_date, + "time": Deserializer.deserialize_time, + "decimal": Deserializer.deserialize_decimal, + "long": Deserializer.deserialize_long, + "bytearray": Deserializer.deserialize_bytearray, + "base64": Deserializer.deserialize_base64, + "object": self.deserialize_object, + "[]": self.deserialize_iter, + "{}": self.deserialize_dict, + } + self.deserialize_expected_types = { + "duration": (isodate.Duration, datetime.timedelta), + "iso-8601": (datetime.datetime), + } + self.dependencies: Dict[str, type] = dict(classes) if classes else {} + self.key_extractors = [rest_key_extractor, xml_key_extractor] + # Additional properties only works if the "rest_key_extractor" is used to + # extract the keys. Making it to work whatever the key extractor is too much + # complicated, with no real scenario for now. + # So adding a flag to disable additional properties detection. This flag should be + # used if your expect the deserialization to NOT come from a JSON REST syntax. + # Otherwise, result are unexpected + self.additional_properties_detection = True + + def __call__(self, target_obj, response_data, content_type=None): + """Call the deserializer to process a REST response. + + :param str target_obj: Target data type to deserialize to. + :param requests.Response response_data: REST response object. + :param str content_type: Swagger "produces" if available. + :raises: DeserializationError if deserialization fails. + :return: Deserialized object. + """ + data = self._unpack_content(response_data, content_type) + return self._deserialize(target_obj, data) + + def _deserialize(self, target_obj, data): + """Call the deserializer on a model. + + Data needs to be already deserialized as JSON or XML ElementTree + + :param str target_obj: Target data type to deserialize to. + :param object data: Object to deserialize. + :raises: DeserializationError if deserialization fails. + :return: Deserialized object. + """ + # This is already a model, go recursive just in case + if hasattr(data, "_attribute_map"): + constants = [name for name, config in getattr(data, "_validation", {}).items() if config.get("constant")] + try: + for attr, mapconfig in data._attribute_map.items(): + if attr in constants: + continue + value = getattr(data, attr) + if value is None: + continue + local_type = mapconfig["type"] + internal_data_type = local_type.strip("[]{}") + if internal_data_type not in self.dependencies or isinstance(internal_data_type, Enum): + continue + setattr(data, attr, self._deserialize(local_type, value)) + return data + except AttributeError: + return + + response, class_name = self._classify_target(target_obj, data) + + if isinstance(response, str): + return self.deserialize_data(data, response) + elif isinstance(response, type) and issubclass(response, Enum): + return self.deserialize_enum(data, response) + + if data is None: + return data + try: + attributes = response._attribute_map # type: ignore + d_attrs = {} + for attr, attr_desc in attributes.items(): + # Check empty string. 
If it's not empty, someone has a real "additionalProperties"... + if attr == "additional_properties" and attr_desc["key"] == "": + continue + raw_value = None + # Enhance attr_desc with some dynamic data + attr_desc = attr_desc.copy() # Do a copy, do not change the real one + internal_data_type = attr_desc["type"].strip("[]{}") + if internal_data_type in self.dependencies: + attr_desc["internalType"] = self.dependencies[internal_data_type] + + for key_extractor in self.key_extractors: + found_value = key_extractor(attr, attr_desc, data) + if found_value is not None: + if raw_value is not None and raw_value != found_value: + msg = ( + "Ignoring extracted value '%s' from %s for key '%s'" + " (duplicate extraction, follow extractors order)" + ) + _LOGGER.warning(msg, found_value, key_extractor, attr) + continue + raw_value = found_value + + value = self.deserialize_data(raw_value, attr_desc["type"]) + d_attrs[attr] = value + except (AttributeError, TypeError, KeyError) as err: + msg = "Unable to deserialize to object: " + class_name # type: ignore + raise DeserializationError(msg) from err + else: + additional_properties = self._build_additional_properties(attributes, data) + return self._instantiate_model(response, d_attrs, additional_properties) + + def _build_additional_properties(self, attribute_map, data): + if not self.additional_properties_detection: + return None + if "additional_properties" in attribute_map and attribute_map.get("additional_properties", {}).get("key") != "": + # Check empty string. If it's not empty, someone has a real "additionalProperties" + return None + if isinstance(data, ET.Element): + data = {el.tag: el.text for el in data} + + known_keys = { + _decode_attribute_map_key(_FLATTEN.split(desc["key"])[0]) + for desc in attribute_map.values() + if desc["key"] != "" + } + present_keys = set(data.keys()) + missing_keys = present_keys - known_keys + return {key: data[key] for key in missing_keys} + + def _classify_target(self, target, data): + """Check to see whether the deserialization target object can + be classified into a subclass. + Once classification has been determined, initialize object. + + :param str target: The target object type to deserialize to. + :param str/dict data: The response data to deserialize. + """ + if target is None: + return None, None + + if isinstance(target, str): + try: + target = self.dependencies[target] + except KeyError: + return target, target + + try: + target = target._classify(data, self.dependencies) # type: ignore + except AttributeError: + pass # Target is not a Model, no classify + return target, target.__class__.__name__ # type: ignore + + def failsafe_deserialize(self, target_obj, data, content_type=None): + """Ignores any errors encountered in deserialization, + and falls back to not deserializing the object. Recommended + for use in error deserialization, as we want to return the + HttpResponseError to users, and not have them deal with + a deserialization error. + + :param str target_obj: The target object type to deserialize to. + :param str/dict data: The response data to deserialize. + :param str content_type: Swagger "produces" if available. + """ + try: + return self(target_obj, data, content_type=content_type) + except: + _LOGGER.debug( + "Ran into a deserialization error. Ignoring since this is failsafe deserialization", exc_info=True + ) + return None + + @staticmethod + def _unpack_content(raw_data, content_type=None): + """Extract the correct structure for deserialization. 
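# Illustrative sketch (editor's example, not part of the vendored file):
# end-to-end use of the Deserializer on a JSON payload. "SampleInfo" is a
# hypothetical model class and the payload values are made up.
class SampleInfo(Model):
    _attribute_map = {
        "model_name": {"key": "model_name", "type": "str"},
        "created": {"key": "created", "type": "unix-time"},
    }

deserializer = Deserializer({"SampleInfo": SampleInfo})
info = deserializer(
    "SampleInfo",
    b'{"model_name": "my-model", "created": 1718109005}',
    content_type="application/json",
)
# info.model_name == 'my-model'; info.created is a timezone-aware datetime.
# failsafe_deserialize() would return None instead of raising on a bad payload.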
+ + If raw_data is a PipelineResponse, try to extract the result of RawDeserializer. + if we can't, raise. Your Pipeline should have a RawDeserializer. + + If not a pipeline response and raw_data is bytes or string, use content-type + to decode it. If no content-type, try JSON. + + If raw_data is something else, bypass all logic and return it directly. + + :param raw_data: Data to be processed. + :param content_type: How to parse if raw_data is a string/bytes. + :raises JSONDecodeError: If JSON is requested and parsing is impossible. + :raises UnicodeDecodeError: If bytes is not UTF8 + """ + # Assume this is enough to detect a Pipeline Response without importing it + context = getattr(raw_data, "context", {}) + if context: + if RawDeserializer.CONTEXT_NAME in context: + return context[RawDeserializer.CONTEXT_NAME] + raise ValueError("This pipeline didn't have the RawDeserializer policy; can't deserialize") + + # Assume this is enough to recognize universal_http.ClientResponse without importing it + if hasattr(raw_data, "body"): + return RawDeserializer.deserialize_from_http_generics(raw_data.text(), raw_data.headers) + + # Assume this enough to recognize requests.Response without importing it. + if hasattr(raw_data, "_content_consumed"): + return RawDeserializer.deserialize_from_http_generics(raw_data.text, raw_data.headers) + + if isinstance(raw_data, (str, bytes)) or hasattr(raw_data, "read"): + return RawDeserializer.deserialize_from_text(raw_data, content_type) # type: ignore + return raw_data + + def _instantiate_model(self, response, attrs, additional_properties=None): + """Instantiate a response model passing in deserialized args. + + :param response: The response model class. + :param d_attrs: The deserialized response attributes. + """ + if callable(response): + subtype = getattr(response, "_subtype_map", {}) + try: + readonly = [k for k, v in response._validation.items() if v.get("readonly")] + const = [k for k, v in response._validation.items() if v.get("constant")] + kwargs = {k: v for k, v in attrs.items() if k not in subtype and k not in readonly + const} + response_obj = response(**kwargs) + for attr in readonly: + setattr(response_obj, attr, attrs.get(attr)) + if additional_properties: + response_obj.additional_properties = additional_properties + return response_obj + except TypeError as err: + msg = "Unable to deserialize {} into model {}. ".format(kwargs, response) # type: ignore + raise DeserializationError(msg + str(err)) + else: + try: + for attr, value in attrs.items(): + setattr(response, attr, value) + return response + except Exception as exp: + msg = "Unable to populate response model. " + msg += "Type: {}, Error: {}".format(type(response), exp) + raise DeserializationError(msg) + + def deserialize_data(self, data, data_type): + """Process data for deserialization according to data type. + + :param str data: The response string to be deserialized. + :param str data_type: The type to deserialize to. + :raises: DeserializationError if deserialization fails. + :return: Deserialized object. 
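# Illustrative sketch (editor's example, not part of the vendored file):
# deserialize_data dispatches on the msrest type string, including the
# "[...]" list syntax.
deserializer = Deserializer()
deserializer.deserialize_data("2024-06-11", "date")   # -> datetime.date(2024, 6, 11)
deserializer.deserialize_data(["1", "2"], "[int]")    # -> [1, 2]
deserializer.deserialize_data("PT5M", "duration")     # -> datetime.timedelta(seconds=300)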
+ """ + if data is None: + return data + + try: + if not data_type: + return data + if data_type in self.basic_types.values(): + return self.deserialize_basic(data, data_type) + if data_type in self.deserialize_type: + if isinstance(data, self.deserialize_expected_types.get(data_type, tuple())): + return data + + is_a_text_parsing_type = lambda x: x not in ["object", "[]", r"{}"] + if isinstance(data, ET.Element) and is_a_text_parsing_type(data_type) and not data.text: + return None + data_val = self.deserialize_type[data_type](data) + return data_val + + iter_type = data_type[0] + data_type[-1] + if iter_type in self.deserialize_type: + return self.deserialize_type[iter_type](data, data_type[1:-1]) + + obj_type = self.dependencies[data_type] + if issubclass(obj_type, Enum): + if isinstance(data, ET.Element): + data = data.text + return self.deserialize_enum(data, obj_type) + + except (ValueError, TypeError, AttributeError) as err: + msg = "Unable to deserialize response data." + msg += " Data: {}, {}".format(data, data_type) + raise DeserializationError(msg) from err + else: + return self._deserialize(obj_type, data) + + def deserialize_iter(self, attr, iter_type): + """Deserialize an iterable. + + :param list attr: Iterable to be deserialized. + :param str iter_type: The type of object in the iterable. + :rtype: list + """ + if attr is None: + return None + if isinstance(attr, ET.Element): # If I receive an element here, get the children + attr = list(attr) + if not isinstance(attr, (list, set)): + raise DeserializationError("Cannot deserialize as [{}] an object of type {}".format(iter_type, type(attr))) + return [self.deserialize_data(a, iter_type) for a in attr] + + def deserialize_dict(self, attr, dict_type): + """Deserialize a dictionary. + + :param dict/list attr: Dictionary to be deserialized. Also accepts + a list of key, value pairs. + :param str dict_type: The object type of the items in the dictionary. + :rtype: dict + """ + if isinstance(attr, list): + return {x["key"]: self.deserialize_data(x["value"], dict_type) for x in attr} + + if isinstance(attr, ET.Element): + # Transform value into {"Key": "value"} + attr = {el.tag: el.text for el in attr} + return {k: self.deserialize_data(v, dict_type) for k, v in attr.items()} + + def deserialize_object(self, attr, **kwargs): + """Deserialize a generic object. + This will be handled as a dictionary. + + :param dict attr: Dictionary to be deserialized. + :rtype: dict + :raises: TypeError if non-builtin datatype encountered. + """ + if attr is None: + return None + if isinstance(attr, ET.Element): + # Do no recurse on XML, just return the tree as-is + return attr + if isinstance(attr, str): + return self.deserialize_basic(attr, "str") + obj_type = type(attr) + if obj_type in self.basic_types: + return self.deserialize_basic(attr, self.basic_types[obj_type]) + if obj_type is _long_type: + return self.deserialize_long(attr) + + if obj_type == dict: + deserialized = {} + for key, value in attr.items(): + try: + deserialized[key] = self.deserialize_object(value, **kwargs) + except ValueError: + deserialized[key] = None + return deserialized + + if obj_type == list: + deserialized = [] + for obj in attr: + try: + deserialized.append(self.deserialize_object(obj, **kwargs)) + except ValueError: + pass + return deserialized + + else: + error = "Cannot deserialize generic object with type: " + raise TypeError(error + str(obj_type)) + + def deserialize_basic(self, attr, data_type): + """Deserialize basic builtin data type from string. 
+ Will attempt to convert to str, int, float and bool. + This function will also accept '1', '0', 'true' and 'false' as + valid bool values. + + :param str attr: response string to be deserialized. + :param str data_type: deserialization data type. + :rtype: str, int, float or bool + :raises: TypeError if string format is not valid. + """ + # If we're here, data is supposed to be a basic type. + # If it's still an XML node, take the text + if isinstance(attr, ET.Element): + attr = attr.text + if not attr: + if data_type == "str": + # None or '', node is empty string. + return "" + else: + # None or '', node with a strong type is None. + # Don't try to model "empty bool" or "empty int" + return None + + if data_type == "bool": + if attr in [True, False, 1, 0]: + return bool(attr) + elif isinstance(attr, str): + if attr.lower() in ["true", "1"]: + return True + elif attr.lower() in ["false", "0"]: + return False + raise TypeError("Invalid boolean value: {}".format(attr)) + + if data_type == "str": + return self.deserialize_unicode(attr) + return eval(data_type)(attr) # nosec + + @staticmethod + def deserialize_unicode(data): + """Preserve unicode objects in Python 2, otherwise return data + as a string. + + :param str data: response string to be deserialized. + :rtype: str or unicode + """ + # We might be here because we have an enum modeled as string, + # and we try to deserialize a partial dict with enum inside + if isinstance(data, Enum): + return data + + # Consider this is real string + try: + if isinstance(data, unicode): # type: ignore + return data + except NameError: + return str(data) + else: + return str(data) + + @staticmethod + def deserialize_enum(data, enum_obj): + """Deserialize string into enum object. + + If the string is not a valid enum value it will be returned as-is + and a warning will be logged. + + :param str data: Response string to be deserialized. If this value is + None or invalid it will be returned as-is. + :param Enum enum_obj: Enum object to deserialize to. + :rtype: Enum + """ + if isinstance(data, enum_obj) or data is None: + return data + if isinstance(data, Enum): + data = data.value + if isinstance(data, int): + # Workaround. We might consider remove it in the future. + try: + return list(enum_obj.__members__.values())[data] + except IndexError: + error = "{!r} is not a valid index for enum {!r}" + raise DeserializationError(error.format(data, enum_obj)) + try: + return enum_obj(str(data)) + except ValueError: + for enum_value in enum_obj: + if enum_value.value.lower() == str(data).lower(): + return enum_value + # We don't fail anymore for unknown value, we deserialize as a string + _LOGGER.warning("Deserializer is not able to find %s as valid enum in %s", data, enum_obj) + return Deserializer.deserialize_unicode(data) + + @staticmethod + def deserialize_bytearray(attr): + """Deserialize string into bytearray. + + :param str attr: response string to be deserialized. + :rtype: bytearray + :raises: TypeError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + return bytearray(b64decode(attr)) # type: ignore + + @staticmethod + def deserialize_base64(attr): + """Deserialize base64 encoded string into string. + + :param str attr: response string to be deserialized. + :rtype: bytearray + :raises: TypeError if string format invalid. 
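# Illustrative sketch (editor's example, not part of the vendored file):
# base64url values arrive without padding and with -/_ substitutions; the
# helper below restores both before decoding.
Deserializer.deserialize_base64("aGVsbG8")        # -> b'hello'
Deserializer.deserialize_bytearray("aGVsbG8=")    # -> bytearray(b'hello')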
+ """ + if isinstance(attr, ET.Element): + attr = attr.text + padding = "=" * (3 - (len(attr) + 3) % 4) # type: ignore + attr = attr + padding # type: ignore + encoded = attr.replace("-", "+").replace("_", "/") + return b64decode(encoded) + + @staticmethod + def deserialize_decimal(attr): + """Deserialize string into Decimal object. + + :param str attr: response string to be deserialized. + :rtype: Decimal + :raises: DeserializationError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + try: + return decimal.Decimal(str(attr)) # type: ignore + except decimal.DecimalException as err: + msg = "Invalid decimal {}".format(attr) + raise DeserializationError(msg) from err + + @staticmethod + def deserialize_long(attr): + """Deserialize string into long (Py2) or int (Py3). + + :param str attr: response string to be deserialized. + :rtype: long or int + :raises: ValueError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + return _long_type(attr) # type: ignore + + @staticmethod + def deserialize_duration(attr): + """Deserialize ISO-8601 formatted string into TimeDelta object. + + :param str attr: response string to be deserialized. + :rtype: TimeDelta + :raises: DeserializationError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + try: + duration = isodate.parse_duration(attr) + except (ValueError, OverflowError, AttributeError) as err: + msg = "Cannot deserialize duration object." + raise DeserializationError(msg) from err + else: + return duration + + @staticmethod + def deserialize_date(attr): + """Deserialize ISO-8601 formatted string into Date object. + + :param str attr: response string to be deserialized. + :rtype: Date + :raises: DeserializationError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + if re.search(r"[^\W\d_]", attr, re.I + re.U): # type: ignore + raise DeserializationError("Date must have only digits and -. Received: %s" % attr) + # This must NOT use defaultmonth/defaultday. Using None ensure this raises an exception. + return isodate.parse_date(attr, defaultmonth=0, defaultday=0) + + @staticmethod + def deserialize_time(attr): + """Deserialize ISO-8601 formatted string into time object. + + :param str attr: response string to be deserialized. + :rtype: datetime.time + :raises: DeserializationError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + if re.search(r"[^\W\d_]", attr, re.I + re.U): # type: ignore + raise DeserializationError("Date must have only digits and -. Received: %s" % attr) + return isodate.parse_time(attr) + + @staticmethod + def deserialize_rfc(attr): + """Deserialize RFC-1123 formatted string into Datetime object. + + :param str attr: response string to be deserialized. + :rtype: Datetime + :raises: DeserializationError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + try: + parsed_date = email.utils.parsedate_tz(attr) # type: ignore + date_obj = datetime.datetime( + *parsed_date[:6], tzinfo=_FixedOffset(datetime.timedelta(minutes=(parsed_date[9] or 0) / 60)) + ) + if not date_obj.tzinfo: + date_obj = date_obj.astimezone(tz=TZ_UTC) + except ValueError as err: + msg = "Cannot deserialize to rfc datetime object." + raise DeserializationError(msg) from err + else: + return date_obj + + @staticmethod + def deserialize_iso(attr): + """Deserialize ISO-8601 formatted string into Datetime object. 
+ + :param str attr: response string to be deserialized. + :rtype: Datetime + :raises: DeserializationError if string format invalid. + """ + if isinstance(attr, ET.Element): + attr = attr.text + try: + attr = attr.upper() # type: ignore + match = Deserializer.valid_date.match(attr) + if not match: + raise ValueError("Invalid datetime string: " + attr) + + check_decimal = attr.split(".") + if len(check_decimal) > 1: + decimal_str = "" + for digit in check_decimal[1]: + if digit.isdigit(): + decimal_str += digit + else: + break + if len(decimal_str) > 6: + attr = attr.replace(decimal_str, decimal_str[0:6]) + + date_obj = isodate.parse_datetime(attr) + test_utc = date_obj.utctimetuple() + if test_utc.tm_year > 9999 or test_utc.tm_year < 1: + raise OverflowError("Hit max or min date") + except (ValueError, OverflowError, AttributeError) as err: + msg = "Cannot deserialize datetime object." + raise DeserializationError(msg) from err + else: + return date_obj + + @staticmethod + def deserialize_unix(attr): + """Serialize Datetime object into IntTime format. + This is represented as seconds. + + :param int attr: Object to be serialized. + :rtype: Datetime + :raises: DeserializationError if format invalid + """ + if isinstance(attr, ET.Element): + attr = int(attr.text) # type: ignore + try: + attr = int(attr) + date_obj = datetime.datetime.fromtimestamp(attr, TZ_UTC) + except ValueError as err: + msg = "Cannot deserialize to unix datetime object." + raise DeserializationError(msg) from err + else: + return date_obj diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_vendor.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_vendor.py new file mode 100644 index 000000000000..8ea240fb008b --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_vendor.py @@ -0,0 +1,48 @@ +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from abc import ABC +from typing import TYPE_CHECKING + +from ._configuration import ( + ChatCompletionsClientConfiguration, + EmbeddingsClientConfiguration, + ImageEmbeddingsClientConfiguration, +) + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core import PipelineClient + + from ._serialization import Deserializer, Serializer + + +class ChatCompletionsClientMixinABC(ABC): + """DO NOT use this class. It is for internal typing use only.""" + + _client: "PipelineClient" + _config: ChatCompletionsClientConfiguration + _serialize: "Serializer" + _deserialize: "Deserializer" + + +class EmbeddingsClientMixinABC(ABC): + """DO NOT use this class. It is for internal typing use only.""" + + _client: "PipelineClient" + _config: EmbeddingsClientConfiguration + _serialize: "Serializer" + _deserialize: "Deserializer" + + +class ImageEmbeddingsClientMixinABC(ABC): + """DO NOT use this class. 
It is for internal typing use only.""" + + _client: "PipelineClient" + _config: ImageEmbeddingsClientConfiguration + _serialize: "Serializer" + _deserialize: "Deserializer" diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py new file mode 100644 index 000000000000..be71c81bd282 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py @@ -0,0 +1,9 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +VERSION = "1.0.0b1" diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py new file mode 100644 index 000000000000..c31764c00803 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py @@ -0,0 +1,25 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from ._patch import ChatCompletionsClient +from ._patch import EmbeddingsClient +from ._patch import ImageEmbeddingsClient + + +from ._patch import load_client +from ._patch import patch_sdk as _patch_sdk + +__all__ = [ + "load_client", + "ChatCompletionsClient", + "EmbeddingsClient", + "ImageEmbeddingsClient", +] + + +_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py new file mode 100644 index 000000000000..fad042e5fcee --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. 
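# Illustrative sketch, not part of the generated sources: a minimal example of
# importing the asynchronous surface that azure/ai/inference/aio/__init__.py
# above re-exports from _patch.py (load_client, EmbeddingsClient and
# ImageEmbeddingsClient are exported alongside the client shown here). The
# endpoint and key are placeholders, not values taken from this change.
from azure.core.credentials import AzureKeyCredential
from azure.ai.inference.aio import ChatCompletionsClient

# Construction only; no network call is made until an operation is awaited.
client = ChatCompletionsClient(
    endpoint="https://<your-endpoint>",
    credential=AzureKeyCredential("<your-key>"),
)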
+# -------------------------------------------------------------------------- + +from copy import deepcopy +from typing import Any, Awaitable, TYPE_CHECKING, Union + +from azure.core import AsyncPipelineClient +from azure.core.credentials import AzureKeyCredential +from azure.core.pipeline import policies +from azure.core.rest import AsyncHttpResponse, HttpRequest + +from .._serialization import Deserializer, Serializer +from ._configuration import ( + ChatCompletionsClientConfiguration, + EmbeddingsClientConfiguration, + ImageEmbeddingsClientConfiguration, +) +from ._operations import ( + ChatCompletionsClientOperationsMixin, + EmbeddingsClientOperationsMixin, + ImageEmbeddingsClientOperationsMixin, +) + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core.credentials_async import AsyncTokenCredential + + +class ChatCompletionsClient(ChatCompletionsClientOperationsMixin): # pylint: disable=client-accepts-api-version-keyword + """ChatCompletionsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + _endpoint = "{endpoint}" + self._config = ChatCompletionsClientConfiguration(endpoint=endpoint, credential=credential, **kwargs) + _policies = kwargs.pop("policies", None) + if _policies is None: + _policies = [ + policies.RequestIdPolicy(**kwargs), + self._config.headers_policy, + self._config.user_agent_policy, + self._config.proxy_policy, + policies.ContentDecodePolicy(**kwargs), + self._config.redirect_policy, + self._config.retry_policy, + self._config.authentication_policy, + self._config.custom_hook_policy, + self._config.logging_policy, + policies.DistributedTracingPolicy(**kwargs), + policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None, + self._config.http_logging_policy, + ] + self._client: AsyncPipelineClient = AsyncPipelineClient(base_url=_endpoint, policies=_policies, **kwargs) + + self._serialize = Serializer() + self._deserialize = Deserializer() + self._serialize.client_side_validation = False + + def send_request( + self, request: HttpRequest, *, stream: bool = False, **kwargs: Any + ) -> Awaitable[AsyncHttpResponse]: + """Runs the network request through the client's chained policies. + + >>> from azure.core.rest import HttpRequest + >>> request = HttpRequest("GET", "https://www.example.org/") + + >>> response = await client.send_request(request) + + + For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request + + :param request: The network request you want to make. Required. + :type request: ~azure.core.rest.HttpRequest + :keyword bool stream: Whether the response payload will be streamed. Defaults to False. + :return: The response of your network call. Does not do error handling on your response. 
+ :rtype: ~azure.core.rest.AsyncHttpResponse + """ + + request_copy = deepcopy(request) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + + request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments) + return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore + + async def close(self) -> None: + await self._client.close() + + async def __aenter__(self) -> "ChatCompletionsClient": + await self._client.__aenter__() + return self + + async def __aexit__(self, *exc_details: Any) -> None: + await self._client.__aexit__(*exc_details) + + +class EmbeddingsClient(EmbeddingsClientOperationsMixin): # pylint: disable=client-accepts-api-version-keyword + """EmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + _endpoint = "{endpoint}" + self._config = EmbeddingsClientConfiguration(endpoint=endpoint, credential=credential, **kwargs) + _policies = kwargs.pop("policies", None) + if _policies is None: + _policies = [ + policies.RequestIdPolicy(**kwargs), + self._config.headers_policy, + self._config.user_agent_policy, + self._config.proxy_policy, + policies.ContentDecodePolicy(**kwargs), + self._config.redirect_policy, + self._config.retry_policy, + self._config.authentication_policy, + self._config.custom_hook_policy, + self._config.logging_policy, + policies.DistributedTracingPolicy(**kwargs), + policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None, + self._config.http_logging_policy, + ] + self._client: AsyncPipelineClient = AsyncPipelineClient(base_url=_endpoint, policies=_policies, **kwargs) + + self._serialize = Serializer() + self._deserialize = Deserializer() + self._serialize.client_side_validation = False + + def send_request( + self, request: HttpRequest, *, stream: bool = False, **kwargs: Any + ) -> Awaitable[AsyncHttpResponse]: + """Runs the network request through the client's chained policies. + + >>> from azure.core.rest import HttpRequest + >>> request = HttpRequest("GET", "https://www.example.org/") + + >>> response = await client.send_request(request) + + + For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request + + :param request: The network request you want to make. Required. + :type request: ~azure.core.rest.HttpRequest + :keyword bool stream: Whether the response payload will be streamed. Defaults to False. + :return: The response of your network call. Does not do error handling on your response. 
+ :rtype: ~azure.core.rest.AsyncHttpResponse + """ + + request_copy = deepcopy(request) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + + request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments) + return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore + + async def close(self) -> None: + await self._client.close() + + async def __aenter__(self) -> "EmbeddingsClient": + await self._client.__aenter__() + return self + + async def __aexit__(self, *exc_details: Any) -> None: + await self._client.__aexit__(*exc_details) + + +class ImageEmbeddingsClient(ImageEmbeddingsClientOperationsMixin): # pylint: disable=client-accepts-api-version-keyword + """ImageEmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + _endpoint = "{endpoint}" + self._config = ImageEmbeddingsClientConfiguration(endpoint=endpoint, credential=credential, **kwargs) + _policies = kwargs.pop("policies", None) + if _policies is None: + _policies = [ + policies.RequestIdPolicy(**kwargs), + self._config.headers_policy, + self._config.user_agent_policy, + self._config.proxy_policy, + policies.ContentDecodePolicy(**kwargs), + self._config.redirect_policy, + self._config.retry_policy, + self._config.authentication_policy, + self._config.custom_hook_policy, + self._config.logging_policy, + policies.DistributedTracingPolicy(**kwargs), + policies.SensitiveHeaderCleanupPolicy(**kwargs) if self._config.redirect_policy else None, + self._config.http_logging_policy, + ] + self._client: AsyncPipelineClient = AsyncPipelineClient(base_url=_endpoint, policies=_policies, **kwargs) + + self._serialize = Serializer() + self._deserialize = Deserializer() + self._serialize.client_side_validation = False + + def send_request( + self, request: HttpRequest, *, stream: bool = False, **kwargs: Any + ) -> Awaitable[AsyncHttpResponse]: + """Runs the network request through the client's chained policies. + + >>> from azure.core.rest import HttpRequest + >>> request = HttpRequest("GET", "https://www.example.org/") + + >>> response = await client.send_request(request) + + + For more information on this code flow, see https://aka.ms/azsdk/dpcodegen/python/send_request + + :param request: The network request you want to make. Required. + :type request: ~azure.core.rest.HttpRequest + :keyword bool stream: Whether the response payload will be streamed. Defaults to False. + :return: The response of your network call. Does not do error handling on your response. 
+ :rtype: ~azure.core.rest.AsyncHttpResponse + """ + + request_copy = deepcopy(request) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + + request_copy.url = self._client.format_url(request_copy.url, **path_format_arguments) + return self._client.send_request(request_copy, stream=stream, **kwargs) # type: ignore + + async def close(self) -> None: + await self._client.close() + + async def __aenter__(self) -> "ImageEmbeddingsClient": + await self._client.__aenter__() + return self + + async def __aexit__(self, *exc_details: Any) -> None: + await self._client.__aexit__(*exc_details) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py new file mode 100644 index 000000000000..15f6105a1624 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from typing import Any, TYPE_CHECKING, Union + +from azure.core.credentials import AzureKeyCredential +from azure.core.pipeline import policies + +from .._version import VERSION + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core.credentials_async import AsyncTokenCredential + + +class ChatCompletionsClientConfiguration: # pylint: disable=too-many-instance-attributes,name-too-long + """Configuration for ChatCompletionsClient. + + Note that all parameters used to create this instance are saved as instance + attributes. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + api_version: str = kwargs.pop("api_version", "2024-05-01-preview") + + if endpoint is None: + raise ValueError("Parameter 'endpoint' must not be None.") + if credential is None: + raise ValueError("Parameter 'credential' must not be None.") + + self.endpoint = endpoint + self.credential = credential + self.api_version = api_version + self.credential_scopes = kwargs.pop("credential_scopes", ["https://ml.azure.com/.default"]) + kwargs.setdefault("sdk_moniker", "ai-inference/{}".format(VERSION)) + self.polling_interval = kwargs.get("polling_interval", 30) + self._configure(**kwargs) + + def _infer_policy(self, **kwargs): + if isinstance(self.credential, AzureKeyCredential): + return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) + if hasattr(self.credential, "get_token"): + return policies.AsyncBearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) + raise TypeError(f"Unsupported credential: {self.credential}") + + def _configure(self, **kwargs: Any) -> None: + self.user_agent_policy = kwargs.get("user_agent_policy") or policies.UserAgentPolicy(**kwargs) + self.headers_policy = kwargs.get("headers_policy") or policies.HeadersPolicy(**kwargs) + self.proxy_policy = kwargs.get("proxy_policy") or policies.ProxyPolicy(**kwargs) + self.logging_policy = kwargs.get("logging_policy") or policies.NetworkTraceLoggingPolicy(**kwargs) + self.http_logging_policy = kwargs.get("http_logging_policy") or policies.HttpLoggingPolicy(**kwargs) + self.custom_hook_policy = kwargs.get("custom_hook_policy") or policies.CustomHookPolicy(**kwargs) + self.redirect_policy = kwargs.get("redirect_policy") or policies.AsyncRedirectPolicy(**kwargs) + self.retry_policy = kwargs.get("retry_policy") or policies.AsyncRetryPolicy(**kwargs) + self.authentication_policy = kwargs.get("authentication_policy") + if self.credential and not self.authentication_policy: + self.authentication_policy = self._infer_policy(**kwargs) + + +class EmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attributes,name-too-long + """Configuration for EmbeddingsClient. + + Note that all parameters used to create this instance are saved as instance + attributes. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + api_version: str = kwargs.pop("api_version", "2024-05-01-preview") + + if endpoint is None: + raise ValueError("Parameter 'endpoint' must not be None.") + if credential is None: + raise ValueError("Parameter 'credential' must not be None.") + + self.endpoint = endpoint + self.credential = credential + self.api_version = api_version + self.credential_scopes = kwargs.pop("credential_scopes", ["https://ml.azure.com/.default"]) + kwargs.setdefault("sdk_moniker", "ai-inference/{}".format(VERSION)) + self.polling_interval = kwargs.get("polling_interval", 30) + self._configure(**kwargs) + + def _infer_policy(self, **kwargs): + if isinstance(self.credential, AzureKeyCredential): + return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) + if hasattr(self.credential, "get_token"): + return policies.AsyncBearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) + raise TypeError(f"Unsupported credential: {self.credential}") + + def _configure(self, **kwargs: Any) -> None: + self.user_agent_policy = kwargs.get("user_agent_policy") or policies.UserAgentPolicy(**kwargs) + self.headers_policy = kwargs.get("headers_policy") or policies.HeadersPolicy(**kwargs) + self.proxy_policy = kwargs.get("proxy_policy") or policies.ProxyPolicy(**kwargs) + self.logging_policy = kwargs.get("logging_policy") or policies.NetworkTraceLoggingPolicy(**kwargs) + self.http_logging_policy = kwargs.get("http_logging_policy") or policies.HttpLoggingPolicy(**kwargs) + self.custom_hook_policy = kwargs.get("custom_hook_policy") or policies.CustomHookPolicy(**kwargs) + self.redirect_policy = kwargs.get("redirect_policy") or policies.AsyncRedirectPolicy(**kwargs) + self.retry_policy = kwargs.get("retry_policy") or policies.AsyncRetryPolicy(**kwargs) + self.authentication_policy = kwargs.get("authentication_policy") + if self.credential and not self.authentication_policy: + self.authentication_policy = self._infer_policy(**kwargs) + + +class ImageEmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attributes,name-too-long + """Configuration for ImageEmbeddingsClient. + + Note that all parameters used to create this instance are saved as instance + attributes. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + api_version: str = kwargs.pop("api_version", "2024-05-01-preview") + + if endpoint is None: + raise ValueError("Parameter 'endpoint' must not be None.") + if credential is None: + raise ValueError("Parameter 'credential' must not be None.") + + self.endpoint = endpoint + self.credential = credential + self.api_version = api_version + self.credential_scopes = kwargs.pop("credential_scopes", ["https://ml.azure.com/.default"]) + kwargs.setdefault("sdk_moniker", "ai-inference/{}".format(VERSION)) + self.polling_interval = kwargs.get("polling_interval", 30) + self._configure(**kwargs) + + def _infer_policy(self, **kwargs): + if isinstance(self.credential, AzureKeyCredential): + return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) + if hasattr(self.credential, "get_token"): + return policies.AsyncBearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) + raise TypeError(f"Unsupported credential: {self.credential}") + + def _configure(self, **kwargs: Any) -> None: + self.user_agent_policy = kwargs.get("user_agent_policy") or policies.UserAgentPolicy(**kwargs) + self.headers_policy = kwargs.get("headers_policy") or policies.HeadersPolicy(**kwargs) + self.proxy_policy = kwargs.get("proxy_policy") or policies.ProxyPolicy(**kwargs) + self.logging_policy = kwargs.get("logging_policy") or policies.NetworkTraceLoggingPolicy(**kwargs) + self.http_logging_policy = kwargs.get("http_logging_policy") or policies.HttpLoggingPolicy(**kwargs) + self.custom_hook_policy = kwargs.get("custom_hook_policy") or policies.CustomHookPolicy(**kwargs) + self.redirect_policy = kwargs.get("redirect_policy") or policies.AsyncRedirectPolicy(**kwargs) + self.retry_policy = kwargs.get("retry_policy") or policies.AsyncRetryPolicy(**kwargs) + self.authentication_policy = kwargs.get("authentication_policy") + if self.credential and not self.authentication_policy: + self.authentication_policy = self._infer_policy(**kwargs) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/__init__.py new file mode 100644 index 000000000000..d3ebd561f739 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/__init__.py @@ -0,0 +1,23 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. 
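# Illustrative sketch, not part of the generated sources: the configuration
# classes above select their authentication policy in _infer_policy() based on
# the credential type. The two constructions below mirror that behavior; the
# endpoint and key are placeholders, and DefaultAzureCredential comes from the
# separate azure-identity package (any AsyncTokenCredential implementation works).
from azure.core.credentials import AzureKeyCredential
from azure.identity.aio import DefaultAzureCredential
from azure.ai.inference.aio import EmbeddingsClient

# AzureKeyCredential -> AzureKeyCredentialPolicy: the key is sent as
# "Authorization: Bearer <key>" on every request.
key_client = EmbeddingsClient(
    endpoint="https://<your-endpoint>",
    credential=AzureKeyCredential("<your-key>"),
)

# AsyncTokenCredential -> AsyncBearerTokenCredentialPolicy: a token is acquired
# for "https://ml.azure.com/.default" unless credential_scopes overrides it.
entra_client = EmbeddingsClient(
    endpoint="https://<your-endpoint>",
    credential=DefaultAzureCredential(),
    credential_scopes=["https://ml.azure.com/.default"],
)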
+# -------------------------------------------------------------------------- + +from ._operations import ChatCompletionsClientOperationsMixin +from ._operations import EmbeddingsClientOperationsMixin +from ._operations import ImageEmbeddingsClientOperationsMixin + +from ._patch import __all__ as _patch_all +from ._patch import * # pylint: disable=unused-wildcard-import +from ._patch import patch_sdk as _patch_sdk + +__all__ = [ + "ChatCompletionsClientOperationsMixin", + "EmbeddingsClientOperationsMixin", + "ImageEmbeddingsClientOperationsMixin", +] +__all__.extend([p for p in _patch_all if p not in __all__]) +_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py new file mode 100644 index 000000000000..33344e718128 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py @@ -0,0 +1,974 @@ +# pylint: disable=too-many-lines,too-many-statements +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- +from io import IOBase +import json +import sys +from typing import Any, Callable, Dict, IO, List, Optional, Type, TypeVar, Union, overload + +from azure.core.exceptions import ( + ClientAuthenticationError, + HttpResponseError, + ResourceExistsError, + ResourceNotFoundError, + ResourceNotModifiedError, + map_error, +) +from azure.core.pipeline import PipelineResponse +from azure.core.rest import AsyncHttpResponse, HttpRequest +from azure.core.tracing.decorator_async import distributed_trace_async +from azure.core.utils import case_insensitive_dict + +from ... import models as _models +from ..._model_base import SdkJSONEncoder, _deserialize +from ..._operations._operations import ( + build_chat_completions_complete_request, + build_chat_completions_get_model_info_request, + build_embeddings_embed_request, + build_embeddings_get_model_info_request, + build_image_embeddings_embed_request, + build_image_embeddings_get_model_info_request, +) +from .._vendor import ChatCompletionsClientMixinABC, EmbeddingsClientMixinABC, ImageEmbeddingsClientMixinABC + +if sys.version_info >= (3, 9): + from collections.abc import MutableMapping +else: + from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports +JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object +_Unset: Any = object() +T = TypeVar("T") +ClsType = Optional[Callable[[PipelineResponse[HttpRequest, AsyncHttpResponse], T, Dict[str, Any]], Any]] + + +class ChatCompletionsClientOperationsMixin(ChatCompletionsClientMixinABC): + + @overload + async def _complete( + self, + body: JSON, + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.ChatCompletions: ... 
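# Illustrative sketch, not part of the generated sources: the _complete
# overloads in this mixin are wrapped by the public client surface defined in
# _patch.py, which this file does not show. The same "/chat/completions" route
# can be exercised with the client's documented send_request method; the
# endpoint, key, and message text are placeholders, and the body follows the
# JSON input template given in the _complete docstring below.
import asyncio

from azure.core.credentials import AzureKeyCredential
from azure.core.rest import HttpRequest
from azure.ai.inference.aio import ChatCompletionsClient


async def main() -> None:
    async with ChatCompletionsClient(
        endpoint="https://<your-endpoint>",
        credential=AzureKeyCredential("<your-key>"),
    ) as client:
        request = HttpRequest(
            "POST",
            "/chat/completions",
            params={"api-version": "2024-05-01-preview"},
            json={"messages": [{"role": "user", "content": "How many feet are in a mile?"}]},
        )
        response = await client.send_request(request)
        response.raise_for_status()
        print(response.json()["choices"][0]["message"]["content"])


asyncio.run(main())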
+ @overload + async def _complete( + self, + *, + messages: List[_models.ChatRequestMessage], + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + frequency_penalty: Optional[float] = None, + stream_parameter: Optional[bool] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any + ) -> _models.ChatCompletions: ... + @overload + async def _complete( + self, + body: IO[bytes], + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.ChatCompletions: ... + + @distributed_trace_async + async def _complete( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + messages: List[_models.ChatRequestMessage] = _Unset, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + frequency_penalty: Optional[float] = None, + stream_parameter: Optional[bool] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any + ) -> _models.ChatCompletions: + # pylint: disable=line-too-long + # pylint: disable=too-many-locals + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" + provided prompt data. The method makes a REST API call to the ``/chat/completions`` route + on the given endpoint. + + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :keyword messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] + :keyword unknown_params: Controls what happens if unknown parameters are passed in the JSON + request payload. + This sets the HTTP request header ``unknown-parameters``. Known values are: "error", "drop", + and "pass_through". Default value is None. + :paramtype unknown_params: str or ~azure.ai.inference.models.UnknownParams + :keyword frequency_penalty: A value that influences the probability of generated tokens + appearing based on their cumulative + frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. Default value is None. 
+ :paramtype frequency_penalty: float + :keyword stream_parameter: A value indicating whether chat completions should be streamed for + this request. Default value is None. + :paramtype stream_parameter: bool + :keyword presence_penalty: A value that influences the probability of generated tokens + appearing based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase + the + model's likelihood to output new topics. + Supported range is [-2, 2]. Default value is None. + :paramtype presence_penalty: float + :keyword temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. Default value is None. + :paramtype temperature: float + :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. Default value is None. + :paramtype top_p: float + :keyword max_tokens: The maximum number of tokens to generate. Default value is None. + :paramtype max_tokens: int + :keyword response_format: An object specifying the format that the model must output. Used to + enable JSON mode. Known values are: "text" and "json_object". Default value is None. + :paramtype response_format: str or ~azure.ai.inference.models.ChatCompletionsResponseFormat + :keyword stop: A collection of textual sequences that will end completions generation. Default + value is None. + :paramtype stop: list[str] + :keyword tools: The available tool definitions that the chat completions request can use, + including caller-defined functions. Default value is None. + :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :keyword tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolSelectionPreset"] type or a ChatCompletionsNamedToolSelection type. + Default value is None. + :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolSelectionPreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolSelection + :keyword seed: If specified, the system will make a best effort to sample deterministically + such that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed.". + Default value is None. + :paramtype seed: int + :return: ChatCompletions. The ChatCompletions is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ChatCompletions + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "messages": [ + chat_request_message + ], + "frequency_penalty": 0.0, # Optional. 
A value that influences the + probability of generated tokens appearing based on their cumulative frequency in + generated text. Positive values will make tokens less likely to appear as their + frequency increases and decrease the likelihood of the model repeating the same + statements verbatim. Supported range is [-2, 2]. + "max_tokens": 0, # Optional. The maximum number of tokens to generate. + "presence_penalty": 0.0, # Optional. A value that influences the probability + of generated tokens appearing based on their existing presence in generated text. + Positive values will make tokens less likely to appear when they already exist + and increase the model's likelihood to output new topics. Supported range is [-2, + 2]. + "response_format": "str", # Optional. An object specifying the format that + the model must output. Used to enable JSON mode. Known values are: "text" and + "json_object". + "seed": 0, # Optional. If specified, the system will make a best effort to + sample deterministically such that repeated requests with the same seed and + parameters should return the same result. Determinism is not guaranteed.". + "stop": [ + "str" # Optional. A collection of textual sequences that will end + completions generation. + ], + "stream": bool, # Optional. A value indicating whether chat completions + should be streamed for this request. + "temperature": 0.0, # Optional. The sampling temperature to use that + controls the apparent creativity of generated completions. Higher values will + make output more random while lower values will make results more focused and + deterministic. It is not recommended to modify temperature and top_p for the same + completions request as the interaction of these two settings is difficult to + predict. Supported range is [0, 1]. + "tool_choice": "str", # Optional. If specified, the model will configure + which of the provided tools it can use for the chat completions response. Is + either a Union[str, "_models.ChatCompletionsToolSelectionPreset"] type or a + ChatCompletionsNamedToolSelection type. + "tools": [ + chat_completions_tool_definition + ], + "top_p": 0.0 # Optional. An alternative to sampling with temperature called + nucleus sampling. This value causes the model to consider the results of tokens + with the provided probability mass. As an example, a value of 0.15 will cause + only the tokens comprising the top 15% of probability mass to be considered. It + is not recommended to modify temperature and top_p for the same completions + request as the interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + } + + # response body for status code(s): 200 + response == { + "choices": [ + { + "finish_reason": "str", # The reason that this chat + completions choice completed its generated. Required. Known values are: + "stop", "length", "content_filter", and "tool_calls". + "index": 0, # The ordered index associated with this chat + completions choice. Required. + "message": { + "content": "str", # The content of the message. + Required. + "role": "str", # The chat role associated with the + message. Required. Known values are: "system", "user", "assistant", + and "tool". + "tool_calls": [ + chat_completions_tool_call + ] + } + } + ], + "created": "2020-02-20 00:00:00", # The first timestamp associated with + generation activity for this completions response, represented as seconds since + the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required. 
+ "id": "str", # A unique identifier associated with this chat completions + response. Required. + "model": "str", # The model used for the chat completion. Required. + "usage": { + "capacity_type": "str", # Indicates whether your capacity has been + affected by the usage amount (token count) reported here. Required. Known + values are: "usage" and "fixed". + "completion_tokens": 0, # The number of tokens generated across all + completions emissions. Required. + "prompt_tokens": 0, # The number of tokens in the provided prompts + for the completions request. Required. + "total_tokens": 0 # The total number of tokens processed for the + completions request and response. Required. + } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + cls: ClsType[_models.ChatCompletions] = kwargs.pop("cls", None) + + if body is _Unset: + if messages is _Unset: + raise TypeError("missing required argument: messages") + body = { + "frequency_penalty": frequency_penalty, + "max_tokens": max_tokens, + "messages": messages, + "presence_penalty": presence_penalty, + "response_format": response_format, + "seed": seed, + "stop": stop, + "stream": stream_parameter, + "temperature": temperature, + "tool_choice": tool_choice, + "tools": tools, + "top_p": top_p, + } + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_chat_completions_complete_request( + unknown_params=unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ChatCompletions, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + @distributed_trace_async + async def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + The method makes a REST API call to the ``/info`` route on the given endpoint. + + :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "model_name": "str", # The name of the AI model. For example: ``Phi21``. + Required. + "model_provider_name": "str", # The model provider name. For example: + ``Microsoft Research``. Required. + "model_type": "str" # The type of the AI model. A Unique identifier for the + profile. Required. Known values are: "embeddings", "image_generation", + "text_generation", "image_embeddings", "audio_generation", and "chat". + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[_models.ModelInfo] = kwargs.pop("cls", None) + + _request = build_chat_completions_get_model_info_request( + api_version=self._config.api_version, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ModelInfo, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + +class EmbeddingsClientOperationsMixin(EmbeddingsClientMixinABC): + + @overload + async def _embed( + self, + body: JSON, + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + async def _embed( + self, + *, + input: List[str], + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + async def _embed( + self, + body: IO[bytes], + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... 
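# Illustrative sketch, not part of the generated sources: a request body for
# the text embeddings operation, following the JSON input template in the
# _embed docstring below. Such a dict can be passed as the positional `body`
# argument of the JSON overload above, or posted to the "/embeddings" route;
# the input strings and option values are placeholders.
embeddings_body = {
    "input": [
        "first phrase to embed",
        "second phrase to embed",
    ],
    # Optional settings; a model that does not support a value returns a 422 error.
    "dimensions": 1024,
    "encoding_format": "float",
    "input_type": "text",
}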
+ + @distributed_trace_async + async def _embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + input: List[str] = _Unset, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the ``/embeddings`` route on the given endpoint. + + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :keyword input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :paramtype input: list[str] + :keyword unknown_params: Controls what happens if unknown parameters are passed in the JSON + request payload. + This sets the HTTP request header ``unknown-parameters``. Known values are: "error", "drop", + and "pass_through". Default value is None. + :paramtype unknown_params: str or ~azure.ai.inference.models.UnknownParams + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. Known + values are: "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "input": [ + "str" # Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array of strings or + array of token arrays. Required. + ], + "dimensions": 0, # Optional. Optional. The number of dimensions the + resulting output embeddings should have. Passing null causes the model to use its + default value. Returns a 422 error if the model doesn't support the value or + parameter. + "encoding_format": "str", # Optional. Optional. The desired format for the + returned embeddings. Known values are: "base64", "binary", "float", "int8", + "ubinary", and "uint8". + "input_type": "str" # Optional. Optional. The type of the input. Returns a + 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". + } + + # response body for status code(s): 200 + response == { + "data": [ + { + "embedding": [ + 0.0 # List of embeddings value for the input prompt. + These represent a measurement of the vector-based relatedness of the + provided input. Required. 
+ ], + "index": 0 # Index of the prompt to which the EmbeddingItem + corresponds. Required. + } + ], + "id": "str", # Unique identifier for the embeddings result. Required. + "model": "str", # The model ID used to generate this result. Required. + "usage": { + "capacity_type": "str", # Indicates whether your capacity has been + affected by the usage amount (token count) reported here. Required. Known + values are: "usage" and "fixed". + "input_tokens": 0, # Number of tokens in the request prompt. + Required. + "prompt_tokens": 0, # Number of tokens used for the prompt sent to + the AI model. Typically identical to ``input_tokens``. However, certain AI + models may add extra tokens to the input hence the number can be higher. (for + example when input_type="query"). Required. + "total_tokens": 0 # Total number of tokens transacted in this + request/response. Required. + } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_embeddings_embed_request( + unknown_params=unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + @distributed_trace_async + async def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + The method makes a REST API call to the ``/info`` route on the given endpoint. + + :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "model_name": "str", # The name of the AI model. For example: ``Phi21``. + Required. + "model_provider_name": "str", # The model provider name. For example: + ``Microsoft Research``. Required. + "model_type": "str" # The type of the AI model. A Unique identifier for the + profile. Required. Known values are: "embeddings", "image_generation", + "text_generation", "image_embeddings", "audio_generation", and "chat". + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[_models.ModelInfo] = kwargs.pop("cls", None) + + _request = build_embeddings_get_model_info_request( + api_version=self._config.api_version, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ModelInfo, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + +class ImageEmbeddingsClientOperationsMixin(ImageEmbeddingsClientMixinABC): + + @overload + async def _embed( + self, + body: JSON, + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + async def _embed( + self, + *, + input: List[_models.EmbeddingInput], + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: ... + @overload + async def _embed( + self, + body: IO[bytes], + *, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + content_type: str = "application/json", + **kwargs: Any + ) -> _models.EmbeddingsResult: ... 
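# Illustrative sketch, not part of the generated sources: a request body for
# the image embeddings operation, matching the JSON input template in the
# _embed docstring below ("/images/embeddings" route). Each input item carries
# a required "image" string (PNG) and an optional "text" field; this hunk does
# not spell out how the PNG bytes are encoded into that string, so the base64
# encoding and the file name used here are assumptions.
import base64

with open("sample.png", "rb") as image_file:
    png_as_text = base64.b64encode(image_file.read()).decode("ascii")

image_embeddings_body = {
    "input": [
        {"image": png_as_text, "text": "optional caption fed to the model"}
    ],
    # Optional settings; a model that does not support a value returns a 422 error.
    "encoding_format": "float",
    "input_type": "document",
}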
+ + @distributed_trace_async + async def _embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + input: List[_models.EmbeddingInput] = _Unset, + unknown_params: Optional[Union[str, _models._enums.UnknownParams]] = None, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given images. + The method makes a REST API call to the ``/images/embeddings`` route on the given endpoint. + + :param body: Is either a JSON type or a IO[bytes] type. Required. + :type body: JSON or IO[bytes] + :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an + array. + The input must not exceed the max input tokens for the model. Required. + :paramtype input: list[~azure.ai.inference.models.EmbeddingInput] + :keyword unknown_params: Controls what happens if unknown parameters are passed in the JSON + request payload. + This sets the HTTP request header ``unknown-parameters``. Known values are: "error", "drop", + and "pass_through". Default value is None. + :paramtype unknown_params: str or ~azure.ai.inference.models.UnknownParams + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The number of dimensions the resulting output embeddings + should have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # JSON input template you can fill out and use as your body input. + body = { + "input": [ + { + "image": "str", # The input image, in PNG format. Required. + "text": "str" # Optional. Optional. The text input to feed + into the model (like DINO, CLIP). Returns a 422 error if the model + doesn't support the value or parameter. + } + ], + "dimensions": 0, # Optional. Optional. The number of dimensions the + resulting output embeddings should have. Passing null causes the model to use its + default value. Returns a 422 error if the model doesn't support the value or + parameter. + "encoding_format": "str", # Optional. Optional. The number of dimensions the + resulting output embeddings should have. Passing null causes the model to use its + default value. Returns a 422 error if the model doesn't support the value or + parameter. Known values are: "base64", "binary", "float", "int8", "ubinary", and + "uint8". + "input_type": "str" # Optional. Optional. The type of the input. 
Returns a + 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". + } + + # response body for status code(s): 200 + response == { + "data": [ + { + "embedding": [ + 0.0 # List of embeddings value for the input prompt. + These represent a measurement of the vector-based relatedness of the + provided input. Required. + ], + "index": 0 # Index of the prompt to which the EmbeddingItem + corresponds. Required. + } + ], + "id": "str", # Unique identifier for the embeddings result. Required. + "model": "str", # The model ID used to generate this result. Required. + "usage": { + "capacity_type": "str", # Indicates whether your capacity has been + affected by the usage amount (token count) reported here. Required. Known + values are: "usage" and "fixed". + "input_tokens": 0, # Number of tokens in the request prompt. + Required. + "prompt_tokens": 0, # Number of tokens used for the prompt sent to + the AI model. Typically identical to ``input_tokens``. However, certain AI + models may add extra tokens to the input hence the number can be higher. (for + example when input_type="query"). Required. + "total_tokens": 0 # Total number of tokens transacted in this + request/response. Required. + } + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_image_embeddings_embed_request( + unknown_params=unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore + + @distributed_trace_async + async def 
_get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + The method makes a REST API call to the ``/info`` route on the given endpoint. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError: + + Example: + .. code-block:: python + + # response body for status code(s): 200 + response == { + "model_name": "str", # The name of the AI model. For example: ``Phi21``. + Required. + "model_provider_name": "str", # The model provider name. For example: + ``Microsoft Research``. Required. + "model_type": "str" # The type of the AI model. A Unique identifier for the + profile. Required. Known values are: "embeddings", "image_generation", + "text_generation", "image_embeddings", "audio_generation", and "chat". + } + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = kwargs.pop("headers", {}) or {} + _params = kwargs.pop("params", {}) or {} + + cls: ClsType[_models.ModelInfo] = kwargs.pop("cls", None) + + _request = build_image_embeddings_get_model_info_request( + api_version=self._config.api_version, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.ModelInfo, response.json()) + + if cls: + return cls(pipeline_response, deserialized, {}) # type: ignore + + return deserialized # type: ignore diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_patch.py new file mode 100644 index 000000000000..f7dd32510333 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_patch.py @@ -0,0 +1,20 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +"""Customize generated code here. + +Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize +""" +from typing import List + +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level + + +def patch_sdk(): + """Do not remove from this file. 
+ + `patch_sdk` is a last resort escape hatch that allows you to do customizations + you can't accomplish using the techniques described in + https://aka.ms/azsdk/python/dpcodegen/python/customize + """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py new file mode 100644 index 000000000000..9b5347bb5bc8 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py @@ -0,0 +1,1098 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +# pylint: disable=too-many-lines) +"""Customize generated code here. + +Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize +""" +import json +import logging +import sys + +from io import IOBase +from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING + +from azure.core.pipeline import PipelineResponse +from azure.core.credentials import AzureKeyCredential +from azure.core.tracing.decorator_async import distributed_trace_async +from azure.core.utils import case_insensitive_dict +from azure.core.exceptions import ( + ClientAuthenticationError, + HttpResponseError, + map_error, + ResourceExistsError, + ResourceNotFoundError, + ResourceNotModifiedError, +) +from .. import models as _models +from .._model_base import SdkJSONEncoder, _deserialize +from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated +from ._client import EmbeddingsClient as EmbeddingsClientGenerated +from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated +from .._operations._operations import ( + build_chat_completions_complete_request, + build_embeddings_embed_request, + build_image_embeddings_embed_request, +) + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core.credentials_async import AsyncTokenCredential + +if sys.version_info >= (3, 9): + from collections.abc import MutableMapping +else: + from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object +_Unset: Any = object() +_LOGGER = logging.getLogger(__name__) + + +async def load_client( + endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any +) -> Union[ChatCompletionsClientGenerated, EmbeddingsClientGenerated, ImageEmbeddingsClientGenerated]: + # pylint: disable=line-too-long + """ + Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route + on the given endpoint, to determine the model type and therefore which client to instantiate. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials.TokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + :return: ChatCompletionsClient or EmbeddingsClient or ImageEmbeddingsClient + :rtype: ~azure.ai.inference.ChatCompletionsClient or ~azure.ai.inference.EmbeddingsClient + or ~azure.ai.inference.ImageEmbeddingsClient + :raises ~azure.core.exceptions.HttpResponseError + """ + + async with ChatCompletionsClient(endpoint, credential, **kwargs) as client: # Pick any of the clients, it does not matter. + model_info = await client.get_model_info() # type: ignore + + _LOGGER.info("model_info=%s", model_info) + if not model_info.model_type: + raise ValueError( + "The AI model information is missing a value for `model type`. Cannot create an appropriate client." + ) + + # TODO: Remove "completions" and "embedding" once Mistral Large and Cohere fixes their model type + if model_info.model_type in (_models.ModelType.CHAT, "completion"): + chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs) + chat_completion_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init + return chat_completion_client + + if model_info.model_type in (_models.ModelType.EMBEDDINGS, "embedding"): + embedding_client = EmbeddingsClient(endpoint, credential, **kwargs) + embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init + return embedding_client + + if model_info.model_type == _models.ModelType.IMAGE_EMBEDDINGS: + image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs) + image_embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init + return image_embedding_client + + raise ValueError(f"No client available to support AI model type `{model_info.model_type}`") + + +class ChatCompletionsClient(ChatCompletionsClientGenerated): + """ChatCompletionsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. 
+ :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + self._model_info: Optional[_models.ModelInfo] = None + super().__init__(endpoint=endpoint, credential=credential, **kwargs) + + @overload + async def complete( + self, + *, + messages: List[_models.ChatRequestMessage], + content_type: str = "application/json", + model_extras: Optional[Dict[str, Any]] = None, + extras: Optional[Dict[str, str]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Literal[False] = False, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> _models.ChatCompletions: + ... + + + @overload + async def complete( + self, + *, + messages: List[_models.ChatRequestMessage], + content_type: str = "application/json", + model_extras: Optional[Dict[str, Any]] = None, + extras: Optional[Dict[str, str]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Literal[True], + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> _models.AsyncStreamingChatCompletions: + ... + + + @overload + async def complete( + self, + *, + messages: List[_models.ChatRequestMessage], + content_type: str = "application/json", + model_extras: Optional[Dict[str, Any]] = None, + extras: Optional[Dict[str, str]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Optional[bool] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> Union[_models.AsyncStreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route + on the given endpoint. + When using this method with `stream=True`, the response is streamed + back to the client. Iterate over the resulting StreamingChatCompletions + object to get content updates as they arrive. By default, the response is a ChatCompletions object + (non-streaming). 
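+
+        Example (an illustrative sketch only; the endpoint URL and key are placeholders, and the
+        sketch assumes the message and result models accept the field names shown):
+
+            .. code-block:: python
+
+                from azure.core.credentials import AzureKeyCredential
+                from azure.ai.inference.aio import ChatCompletionsClient
+                from azure.ai.inference.models import SystemMessage, UserMessage
+
+                async with ChatCompletionsClient("https://<endpoint>", AzureKeyCredential("<key>")) as client:
+                    # Non-streaming call: the result is a ChatCompletions object.
+                    response = await client.complete(
+                        messages=[
+                            SystemMessage(content="You are a helpful assistant."),
+                            UserMessage(content="How many feet are in a mile?"),
+                        ]
+                    )
+                    print(response.choices[0].message.content)
+
+                    # Streaming call: the result is an AsyncStreamingChatCompletions to iterate over.
+                    updates = await client.complete(stream=True, messages=[UserMessage(content="Tell me a joke.")])
+                    async for update in updates:
+                        if update.choices:
+                            print(update.choices[0].delta.content or "", end="")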
+ + :keyword messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword extras: Extra parameters (in the form of string key-value pairs) that are not in the + standard request payload. + They will be passed to the service as-is in the root of the JSON request payload. + How the service handles these extra parameters depends on the value of the + ``extra-parameters`` + HTTP request header. Default value is None. + :paramtype extras: dict[str, str] + :keyword frequency_penalty: A value that influences the probability of generated tokens + appearing based on their cumulative frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. + Default value is None. + :paramtype frequency_penalty: float + :keyword presence_penalty: A value that influences the probability of generated tokens + appearing based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase + the model's likelihood to output new topics. + Supported range is [-2, 2]. + Default value is None. + :paramtype presence_penalty: float + :keyword temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype temperature: float + :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype top_p: float + :keyword max_tokens: The maximum number of tokens to generate. Default value is None. + :paramtype max_tokens: int + :keyword response_format: An object specifying the format that the model must output. Used to + enable JSON mode. Known values are: "text" and "json_object". Default value is None. 
+ :paramtype response_format: str or ~azure.ai.inference.models.ChatCompletionsResponseFormat + :keyword stop: A collection of textual sequences that will end completions generation. Default + value is None. + :paramtype stop: list[str] + :keyword stream: A value indicating whether chat completions should be streamed for this request. + Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. + Otherwise the response will be a ChatCompletions. + :paramtype stream: bool + :keyword tools: The available tool definitions that the chat completions request can use, + including caller-defined functions. Default value is None. + :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :keyword tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolSelectionPreset"] type or a ChatCompletionsNamedToolSelection type. + Default value is None. + :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolSelectionPreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolSelection + :keyword seed: If specified, the system will make a best effort to sample deterministically + such that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed.". + Default value is None. + :paramtype seed: int + :return: ChatCompletions for non-streaming, or AsyncStreamingChatCompletions for streaming. + :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + async def complete( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> Union[_models.AsyncStreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. + + :param body: An object of type MutableMapping[str, Any], such as a dictionary, that + specifies the full request payload. Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: ChatCompletions for non-streaming, or AsyncStreamingChatCompletions for streaming. + :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + async def complete( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> Union[_models.AsyncStreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. + + :param body: Specifies the full request payload. Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: ChatCompletions for non-streaming, or AsyncStreamingChatCompletions for streaming. 
+ :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + + @distributed_trace_async + async def complete( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + messages: List[_models.ChatRequestMessage] = _Unset, + model_extras: Optional[Dict[str, Any]] = None, + extras: Optional[Dict[str, str]] = None, + frequency_penalty: Optional[float] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional[Union[str, _models.ChatCompletionsResponseFormat]] = None, + stop: Optional[List[str]] = None, + stream: Optional[bool] = None, + tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, + tool_choice: Optional[ + Union[str, _models.ChatCompletionsToolSelectionPreset, _models.ChatCompletionsNamedToolSelection] + ] = None, + seed: Optional[int] = None, + **kwargs: Any, + ) -> Union[_models.AsyncStreamingChatCompletions, _models.ChatCompletions]: + # pylint: disable=line-too-long + # pylint: disable=too-many-locals + """Gets chat completions for the provided chat messages. + Completions support a wide variety of tasks and generate text that continues from or + "completes" provided prompt data. When using this method with `stream=True`, the response is streamed + back to the client. Iterate over the resulting ~azure.ai.inference.models.StreamingChatCompletions + object to get content updates as they arrive. + + :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type + that specifies the full request payload. Required. + :type body: JSON or IO[bytes] + :keyword messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword extras: Extra parameters (in the form of string key-value pairs) that are not in the + standard request payload. + They will be passed to the service as-is in the root of the JSON request payload. + How the service handles these extra parameters depends on the value of the + ``extra-parameters`` + HTTP request header. Default value is None. + :paramtype extras: dict[str, str] + :keyword frequency_penalty: A value that influences the probability of generated tokens + appearing based on their cumulative frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. + Default value is None. + :paramtype frequency_penalty: float + :keyword presence_penalty: A value that influences the probability of generated tokens + appearing based on their existing + presence in generated text. 
+ Positive values will make tokens less likely to appear when they already exist and increase + the model's likelihood to output new topics. + Supported range is [-2, 2]. + Default value is None. + :paramtype presence_penalty: float + :keyword temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype temperature: float + :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + Default value is None. + :paramtype top_p: float + :keyword max_tokens: The maximum number of tokens to generate. Default value is None. + :paramtype max_tokens: int + :keyword response_format: An object specifying the format that the model must output. Used to + enable JSON mode. Known values are: "text" and "json_object". Default value is None. + :paramtype response_format: str or ~azure.ai.inference.models.ChatCompletionsResponseFormat + :keyword stop: A collection of textual sequences that will end completions generation. Default + value is None. + :paramtype stop: list[str] + :keyword stream: A value indicating whether chat completions should be streamed for this request. + Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. + Otherwise the response will be a ChatCompletions. + :paramtype stream: bool + :keyword tools: The available tool definitions that the chat completions request can use, + including caller-defined functions. Default value is None. + :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :keyword tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolSelectionPreset"] type or a ChatCompletionsNamedToolSelection type. + Default value is None. + :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolSelectionPreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolSelection + :keyword seed: If specified, the system will make a best effort to sample deterministically + such that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed.". + Default value is None. + :paramtype seed: int + :return: ChatCompletions for non-streaming, or AsyncStreamingChatCompletions for streaming. 
+ :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions + :raises ~azure.core.exceptions.HttpResponseError + """ + error_map = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + _unknown_params: Union[_models._enums.UnknownParams, None] = None + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + + if body is _Unset: + if messages is _Unset: + raise TypeError("missing required argument: messages") + body = { + "extras": extras, + "frequency_penalty": frequency_penalty, + "max_tokens": max_tokens, + "messages": messages, + "presence_penalty": presence_penalty, + "response_format": response_format, + "seed": seed, + "stop": stop, + "stream": stream, + "temperature": temperature, + "tool_choice": tool_choice, + "tools": tools, + "top_p": top_p, + } + if model_extras is not None and bool(model_extras): + body.update(model_extras) + _unknown_params = _models._enums.UnknownParams.PASS_THROUGH # pylint: disable=protected-access + body = {k: v for k, v in body.items() if v is not None} + elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool): + stream = body["stream"] + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_chat_completions_complete_request( + unknown_params=_unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = stream or False + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + return _models.AsyncStreamingChatCompletions(response) + + return _deserialize(_models.ChatCompletions, response.json()) # pylint: disable=protected-access + + + @distributed_trace_async + async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + + :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError + """ + if not self._model_info: + self._model_info = await self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init + return self._model_info + + + def __str__(self) -> str: + # pylint: disable=client-method-name-no-double-underscore + return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() + + + # Remove this once https://github.com/Azure/autorest.python/issues/2619 is fixed, + # and you see the equivalent auto-generated method in _client.py return "Self" + async def __aenter__(self) -> Self: + await self._client.__aenter__() + return self + + +class EmbeddingsClient(EmbeddingsClientGenerated): + """EmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + self._model_info: Optional[_models.ModelInfo] = None + super().__init__(endpoint=endpoint, credential=credential, **kwargs) + + + @overload + async def embed( + self, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[str], + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :paramtype input: list[str] + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :keyword extras: Extra parameters (in the form of string key-value pairs) that are not in the + standard request payload. + They will be passed to the service as-is in the root of the JSON request payload. + How the service handles these extra parameters depends on the value of the + ``extra-parameters`` + HTTP request header. Default value is None. + :paramtype extras: dict[str, str] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. 
+ Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + async def embed( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :param body: An object of type MutableMapping[str, Any], such as a dictionary, that + specifies the full request payload. Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + async def embed( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :param body: Specifies the full request payload. Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @distributed_trace_async + async def embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[str] = _Unset, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given text prompts. + The method makes a REST API call to the `/embeddings` route on the given endpoint. + + :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type + that specifies the full request payload. Required. + :type body: JSON or IO[bytes] + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. 
+ :paramtype model_extras: dict[str, Any] + :keyword input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :paramtype input: list[str] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + _unknown_params: Union[_models._enums.UnknownParams, None] = None + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + if model_extras is not None and bool(model_extras): + body.update(model_extras) + _unknown_params = _models._enums.UnknownParams.PASS_THROUGH # pylint: disable=protected-access + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_embeddings_embed_request( + unknown_params=_unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + ) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = 
_deserialize(_models.EmbeddingsResult, response.json()) + + return deserialized # type: ignore + + + @distributed_trace_async + async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError + """ + if not self._model_info: + self._model_info = await self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init + return self._model_info + + + def __str__(self) -> str: + # pylint: disable=client-method-name-no-double-underscore + return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() + + + # Remove this once https://github.com/Azure/autorest.python/issues/2619 is fixed, + # and you see the equivalent auto-generated method in _client.py return "Self" + async def __aenter__(self) -> Self: + await self._client.__aenter__() + return self + + +class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated): + """ImageEmbeddingsClient. + + :param endpoint: Service host. Required. + :type endpoint: str + :param credential: Credential used to authenticate requests to the service. Is either a + AzureKeyCredential type or a TokenCredential type. Required. + :type credential: ~azure.core.credentials.AzureKeyCredential or + ~azure.core.credentials_async.AsyncTokenCredential + :keyword api_version: The API version to use for this operation. Default value is + "2024-05-01-preview". Note that overriding this default value may result in unsupported + behavior. + :paramtype api_version: str + """ + + def __init__( + self, endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any + ) -> None: + self._model_info: Optional[_models.ModelInfo] = None + super().__init__(endpoint=endpoint, credential=credential, **kwargs) + + + @overload + async def embed( + self, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[_models.EmbeddingInput], + content_type: str = "application/json", + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an + array. + The input must not exceed the max input tokens for the model. Required. + :paramtype input: list[~azure.ai.inference.models.EmbeddingInput] + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :keyword extras: Extra parameters (in the form of string key-value pairs) that are not in the + standard request payload. + They will be passed to the service as-is in the root of the JSON request payload. 
+ How the service handles these extra parameters depends on the value of the + ``extra-parameters`` + HTTP request header. Default value is None. + :paramtype extras: dict[str, str] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + async def embed( + self, + body: JSON, + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :param body: An object of type MutableMapping[str, Any], such as a dictionary, that + specifies the full request payload. Required. + :type body: JSON + :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @overload + async def embed( + self, + body: IO[bytes], + *, + content_type: str = "application/json", + **kwargs: Any, + ) -> _models.EmbeddingsResult: + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :param body: Specifies the full request payload. Required. + :type body: IO[bytes] + :keyword content_type: Body Parameter content-type. Content type parameter for binary body. + Default value is "application/json". + :paramtype content_type: str + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + + @distributed_trace_async + async def embed( + self, + body: Union[JSON, IO[bytes]] = _Unset, + *, + model_extras: Optional[Dict[str, Any]] = None, + input: List[_models.EmbeddingInput] = _Unset, + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, + input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, + **kwargs: Any, + ) -> _models.EmbeddingsResult: + # pylint: disable=line-too-long + """Return the embedding vectors for given images. + The method makes a REST API call to the `/images/embeddings` route on the given endpoint. + + :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type + that specifies the full request payload. 
Required. + :type body: JSON or IO[bytes] + :keyword model_extras: Additional, model-specific parameters that are not in the + standard request payload. They will be added as-is to the root of the JSON in the request body. + How the service handles these extra parameters depends on the value of the + ``unknown-parameters`` request header. Default value is None. + :paramtype model_extras: dict[str, Any] + :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an + array. + The input must not exceed the max input tokens for the model. Required. + :paramtype input: list[~azure.ai.inference.models.EmbeddingInput] + :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Default value is + None. + :paramtype dimensions: int + :keyword encoding_format: Optional. The desired format for the returned embeddings. + Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. + :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :keyword input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". Default value is None. + :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.EmbeddingsResult + :raises ~azure.core.exceptions.HttpResponseError + """ + error_map: MutableMapping[int, Type[HttpResponseError]] = { + 401: ClientAuthenticationError, + 404: ResourceNotFoundError, + 409: ResourceExistsError, + 304: ResourceNotModifiedError, + } + error_map.update(kwargs.pop("error_map", {}) or {}) + + _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) + _params = kwargs.pop("params", {}) or {} + _unknown_params: Union[_models._enums.UnknownParams, None] = None + + content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) + + if body is _Unset: + if input is _Unset: + raise TypeError("missing required argument: input") + body = { + "dimensions": dimensions, + "encoding_format": encoding_format, + "input": input, + "input_type": input_type, + } + if model_extras is not None and bool(model_extras): + body.update(model_extras) + _unknown_params = _models._enums.UnknownParams.PASS_THROUGH # pylint: disable=protected-access + body = {k: v for k, v in body.items() if v is not None} + content_type = content_type or "application/json" + _content = None + if isinstance(body, (IOBase, bytes)): + _content = body + else: + _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore + + _request = build_image_embeddings_embed_request( + unknown_params=_unknown_params, + content_type=content_type, + api_version=self._config.api_version, + content=_content, + headers=_headers, + params=_params, + ) + path_format_arguments = { + "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), + } + _request.url = self._client.format_url(_request.url, **path_format_arguments) + + _stream = kwargs.pop("stream", False) + pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access + _request, stream=_stream, **kwargs + 
) + + response = pipeline_response.http_response + + if response.status_code not in [200]: + if _stream: + await response.read() # Load the body in memory and close the socket + map_error(status_code=response.status_code, response=response, error_map=error_map) + raise HttpResponseError(response=response) + + if _stream: + deserialized = response.iter_bytes() + else: + deserialized = _deserialize(_models.EmbeddingsResult, response.json()) + + return deserialized # type: ignore + + + @distributed_trace_async + async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: + # pylint: disable=line-too-long + """Returns information about the AI model. + + :return: ModelInfo. The ModelInfo is compatible with MutableMapping + :rtype: ~azure.ai.inference.models.ModelInfo + :raises ~azure.core.exceptions.HttpResponseError + """ + if not self._model_info: + self._model_info = await self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init + return self._model_info + + + def __str__(self) -> str: + # pylint: disable=client-method-name-no-double-underscore + return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() + + + # Remove this once https://github.com/Azure/autorest.python/issues/2619 is fixed, + # and you see the equivalent auto-generated method in _client.py return "Self" + async def __aenter__(self) -> Self: + await self._client.__aenter__() + return self + +__all__: List[str] = [ + "load_client", + "ChatCompletionsClient", + "EmbeddingsClient", + "ImageEmbeddingsClient", +] # Add all objects you want publicly available to users at this package level + + +def patch_sdk(): + """Do not remove from this file. + + `patch_sdk` is a last resort escape hatch that allows you to do customizations + you can't accomplish using the techniques described in + https://aka.ms/azsdk/python/dpcodegen/python/customize + """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_vendor.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_vendor.py new file mode 100644 index 000000000000..dd91e1ea130f --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_vendor.py @@ -0,0 +1,48 @@ +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from abc import ABC +from typing import TYPE_CHECKING + +from ._configuration import ( + ChatCompletionsClientConfiguration, + EmbeddingsClientConfiguration, + ImageEmbeddingsClientConfiguration, +) + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from azure.core import AsyncPipelineClient + + from .._serialization import Deserializer, Serializer + + +class ChatCompletionsClientMixinABC(ABC): + """DO NOT use this class. It is for internal typing use only.""" + + _client: "AsyncPipelineClient" + _config: ChatCompletionsClientConfiguration + _serialize: "Serializer" + _deserialize: "Deserializer" + + +class EmbeddingsClientMixinABC(ABC): + """DO NOT use this class. 
It is for internal typing use only.""" + + _client: "AsyncPipelineClient" + _config: EmbeddingsClientConfiguration + _serialize: "Serializer" + _deserialize: "Deserializer" + + +class ImageEmbeddingsClientMixinABC(ABC): + """DO NOT use this class. It is for internal typing use only.""" + + _client: "AsyncPipelineClient" + _config: ImageEmbeddingsClientConfiguration + _serialize: "Serializer" + _deserialize: "Deserializer" diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py new file mode 100644 index 000000000000..8a7c4bbbb7b3 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from ._models import AssistantMessage +from ._models import ChatChoice +from ._models import ChatCompletions +from ._models import ChatCompletionsFunctionToolCall +from ._models import ChatCompletionsFunctionToolDefinition +from ._models import ChatCompletionsFunctionToolSelection +from ._models import ChatCompletionsNamedFunctionToolSelection +from ._models import ChatCompletionsNamedToolSelection +from ._models import ChatCompletionsToolCall +from ._models import ChatCompletionsToolDefinition +from ._models import ChatRequestMessage +from ._models import ChatResponseMessage +from ._models import CompletionsUsage +from ._models import EmbeddingInput +from ._models import EmbeddingItem +from ._models import EmbeddingsResult +from ._models import EmbeddingsUsage +from ._models import FunctionCall +from ._models import FunctionDefinition +from ._models import ModelInfo +from ._models import StreamingChatChoiceUpdate +from ._models import StreamingChatCompletionsUpdate +from ._models import SystemMessage +from ._models import ToolMessage +from ._models import UserMessage + +from ._enums import ChatCompletionsResponseFormat +from ._enums import ChatCompletionsToolSelectionPreset +from ._enums import ChatRole +from ._enums import EmbeddingEncodingFormat +from ._enums import EmbeddingInputType +from ._enums import CompletionsFinishReason +from ._enums import ModelType + +from ._patch import StreamingChatCompletions +from ._patch import AsyncStreamingChatCompletions +from ._patch import patch_sdk as _patch_sdk + +__all__ = [ + "StreamingChatCompletions", + "AsyncStreamingChatCompletions", + "AssistantMessage", + "ChatChoice", + "ChatCompletions", + "ChatCompletionsFunctionToolCall", + "ChatCompletionsFunctionToolDefinition", + "ChatCompletionsFunctionToolSelection", + "ChatCompletionsNamedFunctionToolSelection", + "ChatCompletionsNamedToolSelection", + "ChatCompletionsToolCall", + "ChatCompletionsToolDefinition", + "ChatRequestMessage", + "ChatResponseMessage", + "CompletionsUsage", + "EmbeddingInput", + "EmbeddingItem", + "EmbeddingsResult", + "EmbeddingsUsage", + "FunctionCall", + "FunctionDefinition", + "ModelInfo", + "StreamingChatChoiceUpdate", + "StreamingChatCompletionsUpdate", + "SystemMessage", + "ToolMessage", + "UserMessage", + "ChatCompletionsResponseFormat", + "ChatCompletionsToolSelectionPreset", + 
"ChatRole", + "EmbeddingEncodingFormat", + "EmbeddingInputType", + "CompletionsFinishReason", + "ModelType" +] + +_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py new file mode 100644 index 000000000000..0d191c4d176d --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +from enum import Enum +from azure.core import CaseInsensitiveEnumMeta + + +class ChatCompletionsResponseFormat(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """An representation of a response format configuration usable by Chat Completions. Can be used to + enable JSON + mode. + """ + + TEXT = "text" + """The standard Chat Completions response format that can freely generate text and is not + guaranteed to produce response + content that adheres to a specific schema.""" + JSON_OBJECT = "json_object" + """A response format for Chat Completions that restricts responses to emitting valid JSON objects.""" + + +class ChatCompletionsToolSelectionPreset(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Represents a generic policy for how a chat completions tool may be selected.""" + + AUTO = "auto" + """Specifies that the model may either use any of the tools provided in this chat completions + request or + instead return a standard chat completions response as if no tools were provided.""" + NONE = "none" + """Specifies that the model should not respond with a tool call and should instead provide a + standard chat + completions response. Response content may still be influenced by the provided tool + definitions.""" + REQUIRED = "required" + """Specifies that the model should respond with a call to one or more tools.""" + + +class ChatRole(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """A description of the intended purpose of a message within a chat completions interaction.""" + + SYSTEM = "system" + """The role that instructs or sets the behavior of the assistant.""" + USER = "user" + """The role that provides input for chat completions.""" + ASSISTANT = "assistant" + """The role that provides responses to system-instructed, user-prompted input.""" + TOOL = "tool" + """The role that represents extension tool activity within a chat completions operation.""" + + +class CompletionsFinishReason(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Representation of the manner in which a completions response concluded.""" + + STOPPED = "stop" + """Completions ended normally and reached its end of token generation.""" + TOKEN_LIMIT_REACHED = "length" + """Completions exhausted available token limits before generation could complete.""" + CONTENT_FILTERED = "content_filter" + """Completions generated a response that was identified as potentially sensitive per content + moderation policies.""" + TOOL_CALLS = "tool_calls" + """Completion ended with the model calling a provided tool for output.""" + + +class EmbeddingEncodingFormat(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """The format of the embeddings result. 
+ Returns a 422 error if the model doesn't support the value or parameter. + """ + + BASE64 = "base64" + """Base64""" + BINARY = "binary" + """Binary""" + FLOAT = "float" + """Floating point""" + INT8 = "int8" + """Signed 8-bit integer""" + UBINARY = "ubinary" + """ubinary""" + UINT8 = "uint8" + """Unsigned 8-bit integer""" + + +class EmbeddingInputType(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Represents the input types used for embedding search.""" + + TEXT = "text" + """to do""" + QUERY = "query" + """to do""" + DOCUMENT = "document" + """to do""" + + +class ModelType(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """The type of AI model.""" + + EMBEDDINGS = "embeddings" + """Embeddings.""" + IMAGE_GENERATION = "image_generation" + """Image generation""" + TEXT_GENERATION = "text_generation" + """Text generation""" + IMAGE_EMBEDDINGS = "image_embeddings" + """Image embeddings""" + AUDIO_GENERATION = "audio_generation" + """Audio generation""" + CHAT = "chat" + """Chat completions""" + + +class UnknownParams(str, Enum, metaclass=CaseInsensitiveEnumMeta): + """Controls what happens if unknown parameters are passed in the JSON request payload.""" + + ERROR = "error" + """The service will error if it detected unknown parameters in the request payload. This is the + service default.""" + DROP = "drop" + """The service will ignore (drop) unknown parameters in the request payload. It will only pass the + known parameters to the back-end AI model.""" + PASS_THROUGH = "pass_through" + """The service will pass unknown parameters to the back-end AI model.""" diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py new file mode 100644 index 000000000000..66fa73e9173b --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py @@ -0,0 +1,1089 @@ +# coding=utf-8 +# pylint: disable=too-many-lines +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- + +import datetime +import sys +from typing import Any, Dict, List, Literal, Mapping, Optional, TYPE_CHECKING, Union, overload + +from .. import _model_base +from .._model_base import rest_discriminator, rest_field +from ._enums import ChatRole + +if sys.version_info >= (3, 9): + from collections.abc import MutableMapping +else: + from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports + +if TYPE_CHECKING: + # pylint: disable=unused-import,ungrouped-imports + from .. import models as _models +JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object + + +class ChatRequestMessage(_model_base.Model): + """An abstract representation of a chat message as provided in a request. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + AssistantMessage, SystemMessage, ToolMessage, UserMessage + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message. Required. Known values are: "system", + "user", "assistant", and "tool". 
+ :vartype role: str or ~azure.ai.inference.models.ChatRole + """ + + __mapping__: Dict[str, _model_base.Model] = {} + role: str = rest_discriminator(name="role") + """The chat role associated with this message. Required. Known values are: \"system\", \"user\", + \"assistant\", and \"tool\".""" + + @overload + def __init__( + self, + *, + role: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class AssistantMessage(ChatRequestMessage, discriminator="assistant"): + """A request chat message representing response or action from the assistant. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'assistant' for + assistant messages. Required. The role that provides responses to system-instructed, + user-prompted input. + :vartype role: str or ~azure.ai.inference.models.ASSISTANT + :ivar content: The content of the message. + :vartype content: str + :ivar tool_calls: The tool calls that must be resolved and have their outputs appended to + subsequent input messages for the chat + completions request to resolve as configured. + :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] + """ + + role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'assistant' for assistant messages. + Required. The role that provides responses to system-instructed, user-prompted input.""" + content: Optional[str] = rest_field() + """The content of the message.""" + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() + """The tool calls that must be resolved and have their outputs appended to subsequent input + messages for the chat + completions request to resolve as configured.""" + + @overload + def __init__( + self, + *, + content: Optional[str] = None, + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = None, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, role=ChatRole.ASSISTANT, **kwargs) + + +class ChatChoice(_model_base.Model): + """The representation of a single prompt completion as part of an overall chat completions + request. + Generally, ``n`` choices are generated per provided prompt with a default value of 1. + Token limits and other settings may limit the number of choices generated. + + All required parameters must be populated in order to send to server. + + :ivar index: The ordered index associated with this chat completions choice. Required. + :vartype index: int + :ivar finish_reason: The reason that this chat completions choice completed its generated. + Required. Known values are: "stop", "length", "content_filter", and "tool_calls". + :vartype finish_reason: str or ~azure.ai.inference.models.CompletionsFinishReason + :ivar message: The chat message for a given chat completions prompt. Required. 
+ :vartype message: ~azure.ai.inference.models.ChatResponseMessage + """ + + index: int = rest_field() + """The ordered index associated with this chat completions choice. Required.""" + finish_reason: Union[str, "_models._enums.CompletionsFinishReason"] = rest_field() + """The reason that this chat completions choice completed its generated. Required. Known values + are: \"stop\", \"length\", \"content_filter\", and \"tool_calls\".""" + message: "_models.ChatResponseMessage" = rest_field() + """The chat message for a given chat completions prompt. Required.""" + + @overload + def __init__( + self, + *, + index: int, + finish_reason: Union[str, "_models._enums.CompletionsFinishReason"], + message: "_models.ChatResponseMessage", + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ChatCompletions(_model_base.Model): + """Representation of the response data from a chat completions request. + Completions support a wide variety of tasks and generate text that continues from or + "completes" + provided prompt data. + + All required parameters must be populated in order to send to server. + + :ivar id: A unique identifier associated with this chat completions response. Required. + :vartype id: str + :ivar created: The first timestamp associated with generation activity for this completions + response, + represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required. + :vartype created: ~datetime.datetime + :ivar model: The model used for the chat completion. Required. + :vartype model: str + :ivar usage: Usage information for tokens processed and generated as part of this completions + operation. Required. + :vartype usage: ~azure.ai.inference.models.CompletionsUsage + :ivar choices: The collection of completions choices associated with this completions response. + Generally, ``n`` choices are generated per provided prompt with a default value of 1. + Token limits and other settings may limit the number of choices generated. Required. + :vartype choices: list[~azure.ai.inference.models.ChatChoice] + """ + + id: str = rest_field() + """A unique identifier associated with this chat completions response. Required.""" + created: datetime.datetime = rest_field(format="unix-timestamp") + """The first timestamp associated with generation activity for this completions response, + represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required.""" + model: str = rest_field() + """The model used for the chat completion. Required.""" + usage: "_models.CompletionsUsage" = rest_field() + """Usage information for tokens processed and generated as part of this completions operation. + Required.""" + choices: List["_models.ChatChoice"] = rest_field() + """The collection of completions choices associated with this completions response. + Generally, ``n`` choices are generated per provided prompt with a default value of 1. + Token limits and other settings may limit the number of choices generated. Required.""" + + @overload + def __init__( + self, + *, + id: str, # pylint: disable=redefined-builtin + created: datetime.datetime, + model: str, + usage: "_models.CompletionsUsage", + choices: List["_models.ChatChoice"], + ): ... 
+ + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ChatCompletionsToolCall(_model_base.Model): + """An abstract representation of a tool call that must be resolved in a subsequent request to + perform the requested + chat completion. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + ChatCompletionsFunctionToolCall + + All required parameters must be populated in order to send to server. + + :ivar type: The object type. Required. Default value is None. + :vartype type: str + :ivar id: The ID of the tool call. Required. + :vartype id: str + """ + + __mapping__: Dict[str, _model_base.Model] = {} + type: str = rest_discriminator(name="type") + """The object type. Required. Default value is None.""" + id: str = rest_field() + """The ID of the tool call. Required.""" + + @overload + def __init__( + self, + *, + type: str, + id: str, # pylint: disable=redefined-builtin + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ChatCompletionsFunctionToolCall(ChatCompletionsToolCall, discriminator="function"): + """A tool call to a function tool, issued by the model in evaluation of a configured function + tool, that represents + a function invocation needed for a subsequent chat completions request to resolve. + + All required parameters must be populated in order to send to server. + + :ivar id: The ID of the tool call. Required. + :vartype id: str + :ivar type: The type of tool call, in this case always 'function'. Required. Default value is + "function". + :vartype type: str + :ivar function: The details of the function invocation requested by the tool call. Required. + :vartype function: ~azure.ai.inference.models.FunctionCall + """ + + type: Literal["function"] = rest_discriminator(name="type") # type: ignore + """The type of tool call, in this case always 'function'. Required. Default value is \"function\".""" + function: "_models.FunctionCall" = rest_field() + """The details of the function invocation requested by the tool call. Required.""" + + @overload + def __init__( + self, + *, + id: str, # pylint: disable=redefined-builtin + function: "_models.FunctionCall", + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, type="function", **kwargs) + + +class ChatCompletionsToolDefinition(_model_base.Model): + """An abstract representation of a tool that can be used by the model to improve a chat + completions response. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + ChatCompletionsFunctionToolDefinition + + All required parameters must be populated in order to send to server. + + :ivar type: The object type. Required. Default value is None. 
+ :vartype type: str + """ + + __mapping__: Dict[str, _model_base.Model] = {} + type: str = rest_discriminator(name="type") + """The object type. Required. Default value is None.""" + + @overload + def __init__( + self, + *, + type: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ChatCompletionsFunctionToolDefinition(ChatCompletionsToolDefinition, discriminator="function"): + """The definition information for a chat completions function tool that can call a function in + response to a tool call. + + All required parameters must be populated in order to send to server. + + :ivar type: The object name, which is always 'function'. Required. Default value is "function". + :vartype type: str + :ivar function: The function definition details for the function tool. Required. + :vartype function: ~azure.ai.inference.models.FunctionDefinition + """ + + type: Literal["function"] = rest_discriminator(name="type") # type: ignore + """The object name, which is always 'function'. Required. Default value is \"function\".""" + function: "_models.FunctionDefinition" = rest_field() + """The function definition details for the function tool. Required.""" + + @overload + def __init__( + self, + *, + function: "_models.FunctionDefinition", + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, type="function", **kwargs) + + +class ChatCompletionsFunctionToolSelection(_model_base.Model): + """A tool selection of a specific, named function tool that will limit chat completions to using + the named function. + + All required parameters must be populated in order to send to server. + + :ivar name: The name of the function that should be called. Required. + :vartype name: str + """ + + name: str = rest_field() + """The name of the function that should be called. Required.""" + + @overload + def __init__( + self, + *, + name: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ChatCompletionsNamedToolSelection(_model_base.Model): + """An abstract representation of an explicit, named tool selection to use for a chat completions + request. + + You probably want to use the sub-classes and not this class directly. Known sub-classes are: + ChatCompletionsNamedFunctionToolSelection + + All required parameters must be populated in order to send to server. + + :ivar type: The object type. Required. Default value is None. + :vartype type: str + """ + + __mapping__: Dict[str, _model_base.Model] = {} + type: str = rest_discriminator(name="type") + """The object type. Required. Default value is None.""" + + @overload + def __init__( + self, + *, + type: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. 
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ChatCompletionsNamedFunctionToolSelection( + ChatCompletionsNamedToolSelection, discriminator="function" +): # pylint: disable=name-too-long + """A tool selection of a specific, named function tool that will limit chat completions to using + the named function. + + All required parameters must be populated in order to send to server. + + :ivar type: The object type, which is always 'function'. Required. Default value is "function". + :vartype type: str + :ivar function: The function that should be called. Required. + :vartype function: ~azure.ai.inference.models.ChatCompletionsFunctionToolSelection + """ + + type: Literal["function"] = rest_discriminator(name="type") # type: ignore + """The object type, which is always 'function'. Required. Default value is \"function\".""" + function: "_models.ChatCompletionsFunctionToolSelection" = rest_field() + """The function that should be called. Required.""" + + @overload + def __init__( + self, + *, + function: "_models.ChatCompletionsFunctionToolSelection", + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, type="function", **kwargs) + + +class ChatResponseMessage(_model_base.Model): + """A representation of a chat message as received in a response. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with the message. Required. Known values are: "system", + "user", "assistant", and "tool". + :vartype role: str or ~azure.ai.inference.models.ChatRole + :ivar content: The content of the message. Required. + :vartype content: str + :ivar tool_calls: The tool calls that must be resolved and have their outputs appended to + subsequent input messages for the chat + completions request to resolve as configured. + :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] + """ + + role: Union[str, "_models.ChatRole"] = rest_field() + """The chat role associated with the message. Required. Known values are: \"system\", \"user\", + \"assistant\", and \"tool\".""" + content: str = rest_field() + """The content of the message. Required.""" + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() + """The tool calls that must be resolved and have their outputs appended to subsequent input + messages for the chat + completions request to resolve as configured.""" + + @overload + def __init__( + self, + *, + role: Union[str, "_models.ChatRole"], + content: str, + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = None, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class CompletionsUsage(_model_base.Model): + """Representation of the token counts processed for a completions request. + Counts consider all tokens across prompts, choices, choice alternates, best_of generations, and + other consumers. 
+ + All required parameters must be populated in order to send to server. + + :ivar completion_tokens: The number of tokens generated across all completions emissions. + Required. + :vartype completion_tokens: int + :ivar prompt_tokens: The number of tokens in the provided prompts for the completions request. + Required. + :vartype prompt_tokens: int + :ivar total_tokens: The total number of tokens processed for the completions request and + response. Required. + :vartype total_tokens: int + """ + + completion_tokens: int = rest_field() + """The number of tokens generated across all completions emissions. Required.""" + prompt_tokens: int = rest_field() + """The number of tokens in the provided prompts for the completions request. Required.""" + total_tokens: int = rest_field() + """The total number of tokens processed for the completions request and response. Required.""" + + @overload + def __init__( + self, + *, + completion_tokens: int, + prompt_tokens: int, + total_tokens: int, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class EmbeddingInput(_model_base.Model): + """Represents an image with optional text. + + All required parameters must be populated in order to send to server. + + :ivar image: The input image, in PNG format. Required. + :vartype image: str + :ivar text: Optional. The text input to feed into the model (like DINO, CLIP). + Returns a 422 error if the model doesn't support the value or parameter. + :vartype text: str + """ + + image: str = rest_field() + """The input image, in PNG format. Required.""" + text: Optional[str] = rest_field() + """Optional. The text input to feed into the model (like DINO, CLIP). + Returns a 422 error if the model doesn't support the value or parameter.""" + + @overload + def __init__( + self, + *, + image: str, + text: Optional[str] = None, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class EmbeddingItem(_model_base.Model): + """Representation of a single embeddings relatedness comparison. + + All required parameters must be populated in order to send to server. + + :ivar embedding: List of embeddings value for the input prompt. These represent a measurement + of the + vector-based relatedness of the provided input. Required. + :vartype embedding: list[float] + :ivar index: Index of the prompt to which the EmbeddingItem corresponds. Required. + :vartype index: int + """ + + embedding: List[float] = rest_field() + """List of embeddings value for the input prompt. These represent a measurement of the + vector-based relatedness of the provided input. Required.""" + index: int = rest_field() + """Index of the prompt to which the EmbeddingItem corresponds. Required.""" + + @overload + def __init__( + self, + *, + embedding: List[float], + index: int, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. 
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class EmbeddingsResult(_model_base.Model): + """Representation of the response data from an embeddings request. + Embeddings measure the relatedness of text strings and are commonly used for search, + clustering, + recommendations, and other similar scenarios. + + All required parameters must be populated in order to send to server. + + :ivar id: Unique identifier for the embeddings result. Required. + :vartype id: str + :ivar data: Embedding values for the prompts submitted in the request. Required. + :vartype data: list[~azure.ai.inference.models.EmbeddingItem] + :ivar usage: Usage counts for tokens input using the embeddings API. Required. + :vartype usage: ~azure.ai.inference.models.EmbeddingsUsage + :ivar model: The model ID used to generate this result. Required. + :vartype model: str + """ + + id: str = rest_field() + """Unique identifier for the embeddings result. Required.""" + data: List["_models.EmbeddingItem"] = rest_field() + """Embedding values for the prompts submitted in the request. Required.""" + usage: "_models.EmbeddingsUsage" = rest_field() + """Usage counts for tokens input using the embeddings API. Required.""" + model: str = rest_field() + """The model ID used to generate this result. Required.""" + + @overload + def __init__( + self, + *, + id: str, # pylint: disable=redefined-builtin + data: List["_models.EmbeddingItem"], + usage: "_models.EmbeddingsUsage", + model: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class EmbeddingsUsage(_model_base.Model): + """Measurement of the amount of tokens used in this request and response. + + All required parameters must be populated in order to send to server. + + :ivar input_tokens: Number of tokens in the request prompt. Required. + :vartype input_tokens: int + :ivar prompt_tokens: Number of tokens used for the prompt sent to the AI model. Typically + identical to ``input_tokens``. + However, certain AI models may add extra tokens to the input hence the number can be higher. + (for example when input_type="query"). Required. + :vartype prompt_tokens: int + :ivar total_tokens: Total number of tokens transacted in this request/response. Required. + :vartype total_tokens: int + """ + + input_tokens: int = rest_field() + """Number of tokens in the request prompt. Required.""" + prompt_tokens: int = rest_field() + """Number of tokens used for the prompt sent to the AI model. Typically identical to + ``input_tokens``. + However, certain AI models may add extra tokens to the input hence the number can be higher. + (for example when input_type=\"query\"). Required.""" + total_tokens: int = rest_field() + """Total number of tokens transacted in this request/response. Required.""" + + @overload + def __init__( + self, + *, + input_tokens: int, + prompt_tokens: int, + total_tokens: int, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. 
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class FunctionCall(_model_base.Model): + """The name and arguments of a function that should be called, as generated by the model. + + All required parameters must be populated in order to send to server. + + :ivar name: The name of the function to call. Required. + :vartype name: str + :ivar arguments: The arguments to call the function with, as generated by the model in JSON + format. + Note that the model does not always generate valid JSON, and may hallucinate parameters + not defined by your function schema. Validate the arguments in your code before calling + your function. Required. + :vartype arguments: str + """ + + name: str = rest_field() + """The name of the function to call. Required.""" + arguments: str = rest_field() + """The arguments to call the function with, as generated by the model in JSON format. + Note that the model does not always generate valid JSON, and may hallucinate parameters + not defined by your function schema. Validate the arguments in your code before calling + your function. Required.""" + + @overload + def __init__( + self, + *, + name: str, + arguments: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class FunctionDefinition(_model_base.Model): + """The definition of a caller-specified function that chat completions may invoke in response to + matching user input. + + All required parameters must be populated in order to send to server. + + :ivar name: The name of the function to be called. Required. + :vartype name: str + :ivar description: A description of what the function does. The model will use this description + when selecting the function and + interpreting its parameters. + :vartype description: str + :ivar parameters: The parameters the function accepts, described as a JSON Schema object. + :vartype parameters: any + """ + + name: str = rest_field() + """The name of the function to be called. Required.""" + description: Optional[str] = rest_field() + """A description of what the function does. The model will use this description when selecting the + function and + interpreting its parameters.""" + parameters: Optional[Any] = rest_field() + """The parameters the function accepts, described as a JSON Schema object.""" + + @overload + def __init__( + self, + *, + name: str, + description: Optional[str] = None, + parameters: Optional[Any] = None, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class ModelInfo(_model_base.Model): + """Represents some basic information about the AI model. + + All required parameters must be populated in order to send to server. + + :ivar model_name: The name of the AI model. For example: ``Phi21``. Required. + :vartype model_name: str + :ivar model_type: The type of the AI model. A Unique identifier for the profile. Required. 
+ Known values are: "embeddings", "image_generation", "text_generation", "image_embeddings", + "audio_generation", and "chat". + :vartype model_type: str or ~azure.ai.inference.models.ModelType + :ivar model_provider_name: The model provider name. For example: ``Microsoft Research``. + Required. + :vartype model_provider_name: str + """ + + model_name: str = rest_field() + """The name of the AI model. For example: ``Phi21``. Required.""" + model_type: Union[str, "_models._enums.ModelType"] = rest_field() + """The type of the AI model. A Unique identifier for the profile. Required. Known values are: + \"embeddings\", \"image_generation\", \"text_generation\", \"image_embeddings\", + \"audio_generation\", and \"chat\".""" + model_provider_name: str = rest_field() + """The model provider name. For example: ``Microsoft Research``. Required.""" + + @overload + def __init__( + self, + *, + model_name: str, + model_type: Union[str, "_models._enums.ModelType"], + model_provider_name: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class StreamingChatChoiceUpdate(_model_base.Model): + """Represents an update to a single prompt completion when the service is streaming updates + using Server Sent Events (SSE). + Generally, ``n`` choices are generated per provided prompt with a default value of 1. + Token limits and other settings may limit the number of choices generated. + + All required parameters must be populated in order to send to server. + + :ivar index: The ordered index associated with this chat completions choice. Required. + :vartype index: int + :ivar finish_reason: The reason that this chat completions choice completed its generated. + Required. Known values are: "stop", "length", "content_filter", and "tool_calls". + :vartype finish_reason: str or ~azure.ai.inference.models.CompletionsFinishReason + :ivar delta: An update to the chat message for a given chat completions prompt. Required. + :vartype delta: ~azure.ai.inference.models.ChatResponseMessage + """ + + index: int = rest_field() + """The ordered index associated with this chat completions choice. Required.""" + finish_reason: Union[str, "_models._enums.CompletionsFinishReason"] = rest_field() + """The reason that this chat completions choice completed its generated. Required. Known values + are: \"stop\", \"length\", \"content_filter\", and \"tool_calls\".""" + delta: "_models.ChatResponseMessage" = rest_field() + """An update to the chat message for a given chat completions prompt. Required.""" + + @overload + def __init__( + self, + *, + index: int, + finish_reason: Union[str, "_models._enums.CompletionsFinishReason"], + delta: "_models.ChatResponseMessage", + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class StreamingChatCompletionsUpdate(_model_base.Model): + """Represents a response update to a chat completions request, when the service is streaming + updates + using Server Sent Events (SSE). 
+ Completions support a wide variety of tasks and generate text that continues from or + "completes" + provided prompt data. + + All required parameters must be populated in order to send to server. + + :ivar id: A unique identifier associated with this chat completions response. Required. + :vartype id: str + :ivar created: The first timestamp associated with generation activity for this completions + response, + represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required. + :vartype created: ~datetime.datetime + :ivar model: The model used for the chat completion. Required. + :vartype model: str + :ivar usage: Usage information for tokens processed and generated as part of this completions + operation. Required. + :vartype usage: ~azure.ai.inference.models.CompletionsUsage + :ivar choices: An update to the collection of completion choices associated with this + completions response. + Generally, ``n`` choices are generated per provided prompt with a default value of 1. + Token limits and other settings may limit the number of choices generated. Required. + :vartype choices: list[~azure.ai.inference.models.StreamingChatChoiceUpdate] + """ + + id: str = rest_field() + """A unique identifier associated with this chat completions response. Required.""" + created: datetime.datetime = rest_field(format="unix-timestamp") + """The first timestamp associated with generation activity for this completions response, + represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required.""" + model: str = rest_field() + """The model used for the chat completion. Required.""" + usage: "_models.CompletionsUsage" = rest_field() + """Usage information for tokens processed and generated as part of this completions operation. + Required.""" + choices: List["_models.StreamingChatChoiceUpdate"] = rest_field() + """An update to the collection of completion choices associated with this completions response. + Generally, ``n`` choices are generated per provided prompt with a default value of 1. + Token limits and other settings may limit the number of choices generated. Required.""" + + @overload + def __init__( + self, + *, + id: str, # pylint: disable=redefined-builtin + created: datetime.datetime, + model: str, + usage: "_models.CompletionsUsage", + choices: List["_models.StreamingChatChoiceUpdate"], + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, **kwargs) + + +class SystemMessage(ChatRequestMessage, discriminator="system"): + """A request chat message containing system instructions that influence how the model will + generate a chat completions + response. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'system' for system + messages. Required. The role that instructs or sets the behavior of the assistant. + :vartype role: str or ~azure.ai.inference.models.SYSTEM + :ivar content: The contents of the system message. Required. + :vartype content: str + """ + + role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'system' for system messages. + Required. 
The role that instructs or sets the behavior of the assistant.""" + content: str = rest_field() + """The contents of the system message. Required.""" + + @overload + def __init__( + self, + *, + content: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, role=ChatRole.SYSTEM, **kwargs) + + +class ToolMessage(ChatRequestMessage, discriminator="tool"): + """A request chat message representing requested output from a configured tool. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'tool' for tool + messages. Required. The role that represents extension tool activity within a chat completions + operation. + :vartype role: str or ~azure.ai.inference.models.TOOL + :ivar content: The content of the message. Required. + :vartype content: str + :ivar tool_call_id: The ID of the tool call resolved by the provided content. Required. + :vartype tool_call_id: str + """ + + role: Literal[ChatRole.TOOL] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'tool' for tool messages. Required. + The role that represents extension tool activity within a chat completions operation.""" + content: str = rest_field() + """The content of the message. Required.""" + tool_call_id: str = rest_field() + """The ID of the tool call resolved by the provided content. Required.""" + + @overload + def __init__( + self, + *, + content: str, + tool_call_id: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, role=ChatRole.TOOL, **kwargs) + + +class UserMessage(ChatRequestMessage, discriminator="user"): + """A request chat message representing user input to the assistant. + + All required parameters must be populated in order to send to server. + + :ivar role: The chat role associated with this message, which is always 'user' for user + messages. Required. The role that provides input for chat completions. + :vartype role: str or ~azure.ai.inference.models.USER + :ivar content: The contents of the user message, with available input types varying by selected + model. Required. + :vartype content: str + """ + + role: Literal[ChatRole.USER] = rest_discriminator(name="role") # type: ignore + """The chat role associated with this message, which is always 'user' for user messages. Required. + The role that provides input for chat completions.""" + content: str = rest_field() + """The contents of the user message, with available input types varying by selected model. + Required.""" + + @overload + def __init__( + self, + *, + content: str, + ): ... + + @overload + def __init__(self, mapping: Mapping[str, Any]): + """ + :param mapping: raw JSON to initialize the model. 
+ :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=useless-super-delegation + super().__init__(*args, role=ChatRole.USER, **kwargs) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py new file mode 100644 index 000000000000..bbeda9d47f64 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py @@ -0,0 +1,184 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +"""Customize generated code here. + +Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize +""" +import asyncio +import json +import logging +import queue +import re + +from typing import List, AsyncIterator, Iterator +from azure.core.rest import HttpResponse, AsyncHttpResponse +from .. import models as _models + +logger = logging.getLogger(__name__) + + +class BaseStreamingChatCompletions: + """A base class for the sync and async streaming chat completions responses, holding any common code + to deserializes the Server Sent Events (SSE) response stream into chat completions updates, each one + represented by a StreamingChatCompletionsUpdate object. + """ + + # Enable detailed logs of SSE parsing. For development only, should be `False` by default. + _ENABLE_CLASS_LOGS = False + + # The prefix of each line in the SSE stream that contains a JSON string + # to deserialize into a StreamingChatCompletionsUpdate object + _SSE_DATA_EVENT_PREFIX = "data: " + + # The line indicating the end of the SSE stream + _SSE_DATA_EVENT_DONE = "data: [DONE]" + + def __init__(self): + self._queue: "queue.Queue[_models.StreamingChatCompletionsUpdate]" = queue.Queue() + self._incomplete_json = "" + self._done = False # Will be set to True when reading 'data: [DONE]' line + + def _deserialize_and_add_to_queue(self, element: bytes) -> bool: + + # Clear the queue of StreamingChatCompletionsUpdate before processing the next block + self._queue.queue.clear() + + # Convert `bytes` to string and split the string by newline, while keeping the new line char. + # the last may be a partial "line" that does not contain a newline char at the end. + line_list: List[str] = re.split(r"(?<=\n)", element.decode("utf-8")) + for index, line in enumerate(line_list): + + if self._ENABLE_CLASS_LOGS: + logger.debug("[Original line] %s", repr(line)) + + if index == 0: + line = self._incomplete_json + line + self._incomplete_json = "" + + if index == len(line_list) - 1 and not line.endswith("\n"): + self._incomplete_json = line + return False + + if self._ENABLE_CLASS_LOGS: + logger.debug("[Modified line] %s", repr(line)) + + if line == "\n": # Empty line, indicating flush output to client + continue + + if not line.startswith(self._SSE_DATA_EVENT_PREFIX): + raise ValueError(f"SSE event not supported (line `{line}`)") + + if line.startswith(self._SSE_DATA_EVENT_DONE): + if self._ENABLE_CLASS_LOGS: + logger.debug("[Done]") + return True + + # If you reached here, the line should contain `data: {...}\n` + # where the curly braces contain a valid JSON object. + # Deserialize it into a StreamingChatCompletionsUpdate object + # and add it to the queue. 
+ self._queue.put( + # pylint: disable=W0212 # Access to a protected member _deserialize of a client class + _models.StreamingChatCompletionsUpdate._deserialize( + json.loads(line[len(self._SSE_DATA_EVENT_PREFIX) : -1]), [] + ) + ) + + if self._ENABLE_CLASS_LOGS: + logger.debug("[Added to queue]") + + return False + + +class StreamingChatCompletions(BaseStreamingChatCompletions): + """Represents an interator over StreamingChatCompletionsUpdate objects. It can be used for either synchronous or + asynchronous iterations. The class deserializes the Server Sent Events (SSE) response stream + into chat completions updates, each one represented by a StreamingChatCompletionsUpdate object. + """ + + def __init__(self, response: HttpResponse): + super().__init__() + self._response = response + self._bytes_iterator: Iterator[bytes] = response.iter_bytes() + + def __iter__(self): + return self + + def __next__(self) -> _models.StreamingChatCompletionsUpdate: + while self._queue.empty() and not self._done: + self._done = self._read_next_block() + if self._queue.empty(): + raise StopIteration + return self._queue.get() + + def _read_next_block(self) -> bool: + if self._ENABLE_CLASS_LOGS: + logger.debug("[Reading next block]") + try: + element = self._bytes_iterator.__next__() + except StopIteration: + self.close() + return True + return self._deserialize_and_add_to_queue(element) + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.close() + + def close(self) -> None: + self._response.close() + + +class AsyncStreamingChatCompletions(BaseStreamingChatCompletions): + """Represents an async interator over StreamingChatCompletionsUpdate objects. + It can be used for either synchronous or asynchronous iterations. The class + deserializes the Server Sent Events (SSE) response stream into chat + completions updates, each one represented by a StreamingChatCompletionsUpdate object. + """ + + def __init__(self, response: AsyncHttpResponse): + super().__init__() + self._response = response + self._bytes_iterator: AsyncIterator[bytes] = response.iter_bytes() + + def __aiter__(self): + return self + + async def __anext__(self) -> _models.StreamingChatCompletionsUpdate: + while self._queue.empty() and not self._done: + self._done = await self._read_next_block_async() + if self._queue.empty(): + raise StopAsyncIteration + return self._queue.get() + + async def _read_next_block_async(self) -> bool: + if self._ENABLE_CLASS_LOGS: + logger.debug("[Reading next block]") + try: + element = await self._bytes_iterator.__anext__() + except StopAsyncIteration: + await self.aclose() + return True + return self._deserialize_and_add_to_queue(element) + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + asyncio.run(self.aclose()) + + async def aclose(self) -> None: + await self._response.close() + + +__all__: List[str] = [ + "StreamingChatCompletions", + "AsyncStreamingChatCompletions", +] # Add all objects you want publicly available to users at this package level + + +def patch_sdk(): + """Do not remove from this file. + + `patch_sdk` is a last resort escape hatch that allows you to do customizations + you can't accomplish using the techniques described in + https://aka.ms/azsdk/python/dpcodegen/python/customize + """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/py.typed b/sdk/ai/azure-ai-inference/azure/ai/inference/py.typed new file mode 100644 index 000000000000..e5aff4f83af8 --- /dev/null +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561. 
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/dev_requirements.txt b/sdk/ai/azure-ai-inference/dev_requirements.txt
new file mode 100644
index 000000000000..105486471444
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/dev_requirements.txt
@@ -0,0 +1,3 @@
+-e ../../../tools/azure-sdk-tools
+../../core/azure-core
+aiohttp
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/pyproject.toml b/sdk/ai/azure-ai-inference/pyproject.toml
new file mode 100644
index 000000000000..0817f7c7a6c2
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/pyproject.toml
@@ -0,0 +1,2 @@
+[tool.generate]
+autorest-post-process = true
diff --git a/sdk/ai/azure-ai-inference/samples/README.md b/sdk/ai/azure-ai-inference/samples/README.md
new file mode 100644
index 000000000000..8d9b0d40c77f
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/samples/README.md
@@ -0,0 +1,142 @@
+---
+page_type: sample
+languages:
+ - python
+products:
+ - azure
+ - azure-ai
+urlFragment: model-inference-samples
+---
+
+# Samples for Azure AI Inference client library for Python
+
+These are runnable console Python scripts that show how to do chat completions and text embeddings using the clients in this package. Samples in this folder use the synchronous clients. Samples in the subfolder `async_samples` use the asynchronous clients. The concepts are similar; you can easily adapt any of the synchronous samples to be asynchronous.
+
+## Prerequisites
+
+See [Prerequisites](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/README.md#prerequisites) here.
+
+## Setup
+
+* Clone or download this sample repository
+* Open a command prompt / terminal window in this samples folder
+* Install the client library for Python with pip:
+  ```bash
+  pip install azure-ai-inference
+  ```
+  or update an existing installation:
+  ```bash
+  pip install --upgrade azure-ai-inference
+  ```
+* If you plan to run the asynchronous client samples, install the additional package [aiohttp](https://pypi.org/project/aiohttp/):
+  ```bash
+  pip install aiohttp
+  ```
+
+## Set environment variables
+
+To construct any of the clients, you will need to pass in the endpoint URL. If you are using key authentication, you also need to pass in the key associated with your deployed AI model.
+
+* The endpoint URL has the form `https://your-deployment-name.your-azure-region.inference.ai.azure.com`, where `your-deployment-name` is your unique model deployment name and `your-azure-region` is the Azure region where the model is deployed (e.g. `eastus2`).
+
+* The key is a 32-character string.
+
+For convenience, and to promote the practice of not hard-coding secrets in your source code, all samples here assume the endpoint URL and key are stored in environment variables. You will need to set these environment variables before running the samples as-is. The environment variables are mentioned in the tables below.
+
+Note that the client library does not directly read these environment variables at run time. The sample code reads the environment variables and constructs the relevant client with these values.
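+
+For example, in a bash shell you could set the chat completions variables like this before running a sample (the endpoint and key values below are placeholders; substitute your own deployment values):
+
+```bash
+export CHAT_COMPLETIONS_ENDPOINT="https://your-deployment-name.your-azure-region.inference.ai.azure.com"
+export CHAT_COMPLETIONS_KEY="your-32-character-key"
+```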
+
+## Serverless API and Managed Compute Endpoints
+
+| Sample type | Endpoint environment variable name | Key environment variable name |
+|----------|----------|----------|
+| Chat completions | `CHAT_COMPLETIONS_ENDPOINT` | `CHAT_COMPLETIONS_KEY` |
+| Embeddings | `EMBEDDINGS_ENDPOINT` | `EMBEDDINGS_KEY` |
+
+
+To run against a Managed Compute Endpoint, some samples also have an optional environment variable `CHAT_COMPLETIONS_DEPLOYMENT_NAME`. This is the value used to set the HTTP request header `azureml-model-deployment` when constructing the client.
+
+## Azure OpenAI Endpoints
+
+| Sample type | Endpoint environment variable name | Key environment variable name |
+|----------|----------|----------|
+| Chat completions | `AOAI_CHAT_COMPLETIONS_ENDPOINT` | `AOAI_CHAT_COMPLETIONS_KEY` |
+| Embeddings | `AOAI_EMBEDDINGS_ENDPOINT` | `AOAI_EMBEDDINGS_KEY` |
+
+
+## Running the samples
+
+To run the first sample, type:
+
+```bash
+python sample_chat_completions.py
+```
+
+and similarly for the other samples.
+
+## Synchronous client samples
+
+### Chat completions
+
+|**File Name**|**Description**|
+|----------------|-------------|
+|[sample_chat_completions_streaming.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming.py) | One chat completion operation using a synchronous client and streaming response. |
+|[sample_chat_completions_streaming_with_entra_id_auth.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_entra_id_auth.py) | One chat completion operation using a synchronous client and streaming response, using Entra ID authentication. This sample also shows setting the `azureml-model-deployment` HTTP request header, which may be required for self-hosted endpoints. |
+|[sample_chat_completions.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions.py) | One chat completion operation using a synchronous client. |
+|[sample_chat_completions_with_history.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py) | Two chat completion operations using a synchronous client, with the second completion using chat history from the first. |
+|[sample_chat_completions_from_input_bytes.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_bytes.py) | One chat completion operation using a synchronous client, with input messages provided as `IO[bytes]`. |
+|[sample_chat_completions_from_input_json.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py) | One chat completion operation using a synchronous client, with input messages provided as `MutableMapping[str, Any]`. |
+|[sample_chat_completions_with_tools.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py) | Shows how to use a tool (function) in chat completions, for an AI model that supports tools. |
+|[sample_load_client.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_load_client.py) | Shows how to use the function `load_client` to create the appropriate synchronous client based on the provided endpoint URL. In this example, it creates a synchronous `ChatCompletionsClient`. |
+|[sample_get_model_info.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_get_model_info.py) | Get AI model information using the chat completions client. The same can be done with all other clients. |
+|[sample_chat_completions_with_model_extras.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_model_extras.py) | Chat completions with additional model-specific parameters. |
+|[sample_chat_completions_azure_openai.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py) | Chat completions against an Azure OpenAI endpoint. |
+
+### Text embeddings
+
+|**File Name**|**Description**|
+|----------------|-------------|
+|[sample_embeddings.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_embeddings.py) | One embeddings operation using a synchronous client. |
+|[sample_embeddings_azure_openai.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py) | One embeddings operation using a synchronous client, against an Azure OpenAI endpoint. |
+
+
+
+## Asynchronous client samples
+
+### Chat completions
+
+|**File Name**|**Description**|
+|----------------|-------------|
+|[sample_chat_completions_streaming_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_async.py) | One chat completion operation using an asynchronous client and streaming response. |
+|[sample_chat_completions_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_async.py) | One chat completion operation using an asynchronous client. |
+|[sample_load_client_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py) | Shows how to use the function `load_client` (imported from `azure.ai.inference.aio`) to create the appropriate asynchronous client based on the provided endpoint URL. In this example, it creates an asynchronous `EmbeddingsClient`. |
+|[sample_chat_completions_from_input_bytes_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_bytes_async.py) | One chat completion operation using an asynchronous client, with input messages provided as `IO[bytes]`. |
+|[sample_chat_completions_from_input_json_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py) | One chat completion operation using an asynchronous client, with input messages provided as `MutableMapping[str, Any]`. |
+|[sample_chat_completions_streaming_azure_openai_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py) | One chat completion operation using an asynchronous client and streaming response, against an Azure OpenAI endpoint. |
+
+### Text embeddings
+
+|**File Name**|**Description**|
+|----------------|-------------|
+|[sample_embeddings_async.py](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py) | One embeddings operation using an asynchronous client. |
+
+
+
+## Troubleshooting
+
+See [Troubleshooting](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/README.md#troubleshooting) here.
diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/example_chat.json b/sdk/ai/azure-ai-inference/samples/async_samples/example_chat.json
new file mode 100644
index 000000000000..ed95ff670792
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/samples/async_samples/example_chat.json
@@ -0,0 +1,13 @@
+{
+    "messages":
+    [
+        {
+            "role": "system",
+            "content": "You are a helpful assistant."
+        },
+        {
+            "role": "user",
+            "content": "How many feet are in a mile?"
+        }
+    ]
+}
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample1.png b/sdk/ai/azure-ai-inference/samples/async_samples/sample1.png
new file mode 100644
index 000000000000..ba18b500872f
Binary files /dev/null and b/sdk/ai/azure-ai-inference/samples/async_samples/sample1.png differ
diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample2.png b/sdk/ai/azure-ai-inference/samples/async_samples/sample2.png
new file mode 100644
index 000000000000..59d79ff28fc5
Binary files /dev/null and b/sdk/ai/azure-ai-inference/samples/async_samples/sample2.png differ
diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_async.py
new file mode 100644
index 000000000000..bb530e6f9dc5
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_async.py
@@ -0,0 +1,58 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+"""
+DESCRIPTION:
+    This sample demonstrates how to get a chat completion response
+    from the service using an asynchronous client.
+
+USAGE:
+    python sample_chat_completions_async.py
+
+    Set these two environment variables before running the sample:
+    1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form
+        https://<your-deployment-name>.<your-azure-region>.inference.ai.azure.com
+        where `your-deployment-name` is your unique AI Model deployment name, and
+        `your-azure-region` is the Azure region where your model is deployed.
+    2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret.
+"""
+import asyncio
+
+
+async def sample_chat_completions_async():
+    import os
+    from azure.ai.inference.aio import ChatCompletionsClient
+    from azure.ai.inference.models import SystemMessage, UserMessage
+    from azure.core.credentials import AzureKeyCredential
+
+    # Read the values of your model endpoint and key from environment variables
+    try:
+        endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"]
+        key = os.environ["CHAT_COMPLETIONS_KEY"]
+    except KeyError:
+        print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'")
+        print("Set them before running this sample.")
+        exit()
+
+    # Create a chat completions client for asynchronous operations
+    async with ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as client:
+
+        # Do a single chat completion operation
+        response = await client.complete(
+            messages=[
+                SystemMessage(content="You are a helpful assistant."),
+                UserMessage(content="How many feet are in a mile?"),
+            ]
+        )
+
+        # Print the response to the console
+        print(response.choices[0].message.content)
+
+
+async def main():
+    await sample_chat_completions_async()
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_bytes_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_bytes_async.py
new file mode 100644
index 000000000000..a57a093d30df
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_bytes_async.py
@@ -0,0 +1,67 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+"""
+DESCRIPTION:
+    This sample demonstrates how to get a chat completions response from
+    the service using an asynchronous client, and directly providing the
+    IO[bytes] request body (containing input chat messages).
+
+USAGE:
+    python sample_chat_completions_from_input_bytes_async.py
+
+    Set these two environment variables before running the sample:
+    1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form
+        https://<your-deployment-name>.<your-azure-region>.inference.ai.azure.com
+        where `your-deployment-name` is your unique AI Model deployment name, and
+        `your-azure-region` is the Azure region where your model is deployed.
+    2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret.
+""" +# mypy: disable-error-code="union-attr" +# pyright: reportAttributeAccessIssue=false + +import asyncio +import io + + +async def sample_chat_completions_from_input_bytes_async(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference.aio import ChatCompletionsClient + from azure.core.credentials import AzureKeyCredential + + async with ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as client: + + # Make a chat completion call, by directly providing the + # HTTP request body as IO[bytes], containing chat messages. + response = await client.complete(read_text_file("example_chat.json")) + + print(response.choices[0].message.content) + + +def read_text_file(file_name: str) -> io.BytesIO: + """ + Reads a text file and returns a BytesIO object with the file content in UTF-8 encoding. + The file is expected to be in the same directory as this Python script. + """ + from pathlib import Path + + with Path(__file__).with_name(file_name).open("r") as f: + return io.BytesIO(f.read().encode("utf-8")) + + +async def main(): + await sample_chat_completions_from_input_bytes_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py new file mode 100644 index 000000000000..670638fd05cc --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py @@ -0,0 +1,67 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using an asynchronous client, and directly providing the + JSON request body (containing input chat messages). + +USAGE: + python sample_chat_completions_from_input_json_async.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" +# mypy: disable-error-code="union-attr" +# pyright: reportAttributeAccessIssue=false + +import asyncio + + +async def sample_chat_completions_from_input_json_async(): + import os + from azure.ai.inference.aio import ChatCompletionsClient + from azure.core.credentials import AzureKeyCredential + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + async with ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as client: + + request_body = { + "messages": [ + { + "role": "system", + "content": "You are an AI assistant that helps people find information. 
Your replies are short, no more than two sentences.", + }, + {"role": "user", "content": "What year was construction of the International Space Station mostly done?"}, + { + "role": "assistant", + "content": "The main construction of the International Space Station (ISS) was completed between 1998 and 2011. During this period, more than 30 flights by US space shuttles and 40 by Russian rockets were conducted to transport components and modules to the station.", + }, + {"role": "user", "content": "And what was the estimated cost to build it?"}, + ] + } + + response = await client.complete(request_body) + + print(response.choices[0].message.content) + + +async def main(): + await sample_chat_completions_from_input_json_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_async.py new file mode 100644 index 000000000000..457d117a68c9 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_async.py @@ -0,0 +1,60 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completion streaming response + from the service using an asynchronous client. + +USAGE: + python sample_streaming_chat_completions_async.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" +import asyncio + + +async def sample_chat_completions_streaming_async(): + import os + from azure.ai.inference.aio import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage, StreamingChatCompletionsUpdate + from azure.core.credentials import AzureKeyCredential + + # Read the values of your model endpoint and key from environment variables + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + # Create chat completions client for synchronous operations + async with ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as client: + + # Do a single streaming chat completion operation. Start the operation and get a Future object. 
+ response = await client.complete( + stream=True, + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="Give me 5 good reasons why I should exercise every day."), + ], + ) + + # Iterate on the response to get chat completion updates, as they arrive from the service + async for update in response: + print(update.choices[0].delta.content or "", end="") + + +async def main(): + await sample_chat_completions_streaming_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py new file mode 100644 index 000000000000..5db3d52848a8 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py @@ -0,0 +1,91 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions streaming response from + the service using an asynchronous client, with an Azure OpenAI (AOAI) endpoint. + Two types of authentications are shown: key authentication and Entra ID + authentication. + +USAGE: + 1. Update `key_auth` below to `True` for key authentication, or `False` for + Entra ID authentication. + 2. Update `api_version` (the AOAI REST API version) as needed. + 3. Set one or two environment variables, depending on your authentication method: + * AOAI_CHAT_COMPLETIONS_ENDPOINT - Your AOAI endpoint URL, with partial path, in the form + https://.openai.azure.com/openai/deployments/ + where `your-unique-resource-name` is your globally unique AOAI resource name, + and `your-deployment-name` is your AI Model deployment name. + For example: https://your-unique-host.openai.azure.com/openai/deployments/gpt-4-turbo + * AOAI_CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. This + is only required for key authentication. + 4. Run the sample: + python sample_chat_completions_streaming_azure_openai_async.py +""" +import asyncio + +async def sample_chat_completions_streaming_azure_openai_async(): + import os + from azure.ai.inference.aio import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + + try: + endpoint = os.environ["AOAI_CHAT_COMPLETIONS_ENDPOINT"] + except KeyError: + print("Missing environment variable 'AOAI_CHAT_COMPLETIONS_ENDPOINT'") + print("Set it before running this sample.") + exit() + + key_auth = True # Set to True for key authentication, or False for Entra ID authentication. + + if key_auth: + from azure.core.credentials import AzureKeyCredential + + try: + key = os.environ["AOAI_CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'AOAI_CHAT_COMPLETIONS_KEY'") + print("Set it before running this sample.") + exit() + + client = ChatCompletionsClient( + endpoint=endpoint, + credential=AzureKeyCredential(""), # Pass in an empty value. + headers={"api-key": key}, + api_version="2024-02-15-preview", # AOAI api-version. Update as needed. 
+ ) + + else: # Entra ID authentication + from azure.identity import DefaultAzureCredential + + client = ChatCompletionsClient( + endpoint=endpoint, + credential=DefaultAzureCredential(exclude_interactive_browser_credential=False), + credential_scopes=["https://cognitiveservices.azure.com/.default"], + api_version="2024-02-15-preview", # AOAI api-version. Update as needed. + ) + + response = await client.complete( + stream=True, + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="Give me 5 good reasons why I should exercise every day."), + ] + ) + + # Iterate on the response to get chat completion updates, as they arrive from the service + async for update in response: + if len(update.choices) > 0: + print(update.choices[0].delta.content or "", end="") + + await client.close() + + +async def main(): + await sample_chat_completions_streaming_azure_openai_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py new file mode 100644 index 000000000000..02b894de948f --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py @@ -0,0 +1,55 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get embeddings for a list of sentences using an asynchronous client. + +USAGE: + python sample_embeddings_async.py + + Set these two environment variables before running the sample: + 1) EMBEDDINGS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) EMBEDDINGS_KEY - Your model key (a 32-character string). Keep it secret. +""" +import asyncio + + +async def sample_embeddings_async(): + import os + from azure.ai.inference.aio import EmbeddingsClient + from azure.core.credentials import AzureKeyCredential + + # Read the values of your model endpoint and key from environment variables + try: + endpoint = os.environ["EMBEDDINGS_ENDPOINT"] + key = os.environ["EMBEDDINGS_KEY"] + except KeyError: + print("Missing environment variable 'EMBEDDINGS_ENDPOINT' or 'EMBEDDINGS_KEY'") + print("Set them before running this sample.") + exit() + + # Create a text embeddings client for synchronous operations + async with EmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as client: + + # Do a single embeddings operation. Start the operation and get a Future object. 
+ response = await client.embed(input=["first phrase", "second phrase", "third phrase"]) + + print("Embeddings response:") + for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, ..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + + +async def main(): + await sample_embeddings_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_image_embeddings_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_image_embeddings_async.py new file mode 100644 index 000000000000..48e5c0fa85cd --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_image_embeddings_async.py @@ -0,0 +1,63 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get image embeddings vectors for + two input images, using an asynchronous client. + +USAGE: + python sample_image_embeddings_async.py + + Set these two environment variables before running the sample: + 1) IMAGE_EMBEDDINGS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) IMAGE_EMBEDDINGS_KEY - Your model key (a 32-character string). Keep it secret. +""" +import asyncio + + +async def sample_image_embeddings_async(): + import os + import base64 + + try: + endpoint = os.environ["IMAGE_EMBEDDINGS_ENDPOINT"] + key = os.environ["IMAGE_EMBEDDINGS_KEY"] + except KeyError: + print("Missing environment variable 'IMAGE_EMBEDDINGS_ENDPOINT' or 'IMAGE_EMBEDDINGS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference.aio import ImageEmbeddingsClient + from azure.ai.inference.models import EmbeddingInput + from azure.core.credentials import AzureKeyCredential + + with open("sample1.png", "rb") as f: + image1: str = base64.b64encode(f.read()).decode("utf-8") + with open("sample2.png", "rb") as f: + image2: str = base64.b64encode(f.read()).decode("utf-8") + + async with ImageEmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) as client: + + # Do a single image embeddings operation. Start the operation and get a Future object. + response = await client.embed(input=[EmbeddingInput(image=image1), EmbeddingInput(image=image2)]) + + print("Embeddings response:") + for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, " + f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + + +async def main(): + await sample_image_embeddings_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py new file mode 100644 index 000000000000..7006dbef78e0 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py @@ -0,0 +1,59 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to create an asynchronous client from a given endpoint URL using + the load_client() function, imported from azure.ai.inference.aio. + In this sample, we get an asynchronous embeddings client and do one embeddings call. + +USAGE: + python sample_load_client_async.py + + Set these two environment variables before running the sample: + 1) EMBEDDINGS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) EMBEDDINGS_KEY - Your model key (a 32-character string). Keep it secret. +""" +import asyncio + + +async def sample_load_client_async(): + import os + + try: + endpoint = os.environ["EMBEDDINGS_ENDPOINT"] + key = os.environ["EMBEDDINGS_KEY"] + except KeyError: + print("Missing environment variable 'EMBEDDINGS_ENDPOINT' or 'EMBEDDINGS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference.aio import load_client, EmbeddingsClient + from azure.core.credentials import AzureKeyCredential + + async with await load_client(endpoint=endpoint, credential=AzureKeyCredential(key)) as client: + + # This should create a client of type `EmbeddingsClient` + print(f"Created client of type `{type(client).__name__}`.") + + if isinstance(client, EmbeddingsClient): + response = await client.embed(input=["first phrase", "second phrase", "third phrase"]) + + print("Embeddings response:") + for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, ..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + + +async def main(): + await sample_load_client_async() + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdk/ai/azure-ai-inference/samples/example_chat.json b/sdk/ai/azure-ai-inference/samples/example_chat.json new file mode 100644 index 000000000000..f9acff2e18df --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/example_chat.json @@ -0,0 +1,13 @@ +{ + "messages": + [ + { + "role": "system", + "content": "You are a helpful assistant." + }, + { + "role": "user", + "content": "How many feet are in a mile?" + } + ] +} diff --git a/sdk/ai/azure-ai-inference/samples/sample1.png b/sdk/ai/azure-ai-inference/samples/sample1.png new file mode 100644 index 000000000000..ba18b500872f Binary files /dev/null and b/sdk/ai/azure-ai-inference/samples/sample1.png differ diff --git a/sdk/ai/azure-ai-inference/samples/sample2.png b/sdk/ai/azure-ai-inference/samples/sample2.png new file mode 100644 index 000000000000..59d79ff28fc5 Binary files /dev/null and b/sdk/ai/azure-ai-inference/samples/sample2.png differ diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions.py new file mode 100644 index 000000000000..24fefc7f0c84 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions.py @@ -0,0 +1,51 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using a synchronous client. 
+ +USAGE: + python sample_chat_completions.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" + +def sample_chat_completions(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + # [START chat_completions] + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + from azure.core.credentials import AzureKeyCredential + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + response = client.complete( + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="How many feet are in a mile?"), + ] + ) + + print(response.choices[0].message.content) + # [END chat_completions] + + +if __name__ == "__main__": + sample_chat_completions() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py new file mode 100644 index 000000000000..94818f6527e5 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py @@ -0,0 +1,81 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using a synchronous client, with an Azure OpenAI (AOAI) endpoint. + Two types of authentications are shown: key authentication and Entra ID + authentication. + +USAGE: + 1. Update `key_auth` below to `True` for key authentication, or `False` for + Entra ID authentication. + 2. Update `api_version` (the AOAI REST API version) as needed. + 3. Set one or two environment variables, depending on your authentication method: + * AOAI_CHAT_COMPLETIONS_ENDPOINT - Your AOAI endpoint URL, with partial path, in the form + https://.openai.azure.com/openai/deployments/ + where `your-unique-resource-name` is your globally unique AOAI resource name, + and `your-deployment-name` is your AI Model deployment name. + For example: https://your-unique-host.openai.azure.com/openai/deployments/gpt-4-turbo + * AOAI_CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. This + is only required for key authentication. + 4. Run the sample: + python sample_chat_completions_azure_openai.py +""" + + +def sample_chat_completions_azure_openai(): + import os + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + + try: + endpoint = os.environ["AOAI_CHAT_COMPLETIONS_ENDPOINT"] + except KeyError: + print("Missing environment variable 'AOAI_CHAT_COMPLETIONS_ENDPOINT'") + print("Set it before running this sample.") + exit() + + key_auth = True # Set to True for key authentication, or False for Entra ID authentication. 
+ + if key_auth: + from azure.core.credentials import AzureKeyCredential + + try: + key = os.environ["AOAI_CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'AOAI_CHAT_COMPLETIONS_KEY'") + print("Set it before running this sample.") + exit() + + client = ChatCompletionsClient( + endpoint=endpoint, + credential=AzureKeyCredential(""), # Pass in an empty value. + headers={"api-key": key}, + api_version="2024-02-15-preview", # AOAI api-version. Update as needed. + ) + + else: # Entra ID authentication + from azure.identity import DefaultAzureCredential + + client = ChatCompletionsClient( + endpoint=endpoint, + credential=DefaultAzureCredential(exclude_interactive_browser_credential=False), + credential_scopes=["https://cognitiveservices.azure.com/.default"], + api_version="2024-02-15-preview", # AOAI api-version. Update as needed. + ) + + response = client.complete( + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="How many feet are in a mile?"), + ] + ) + + print(response.choices[0].message.content) + + +if __name__ == "__main__": + sample_chat_completions_azure_openai() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_bytes.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_bytes.py new file mode 100644 index 000000000000..89fbd606ae27 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_bytes.py @@ -0,0 +1,63 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using a synchronous client, and directly providing the + IO[bytes] request body (containing input chat messages). + +USAGE: + python sample_chat_completions_from_input_bytes.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" +# mypy: disable-error-code="union-attr" +# pyright: reportAttributeAccessIssue=false + + +import io + + +def sample_chat_completions_from_input_bytes(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference import ChatCompletionsClient + from azure.core.credentials import AzureKeyCredential + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + # Make a chat completion call, by directly providing the + # HTTP request body as IO[bytes], containing chat messages. + response = client.complete(read_text_file("example_chat.json")) + + print(response.choices[0].message.content) + + +def read_text_file(file_name: str) -> io.BytesIO: + """ + Reads a text file and returns a BytesIO object with the file content in UTF-8 encoding. + The file is expected to be in the same directory as this Python script. 
+ """ + from pathlib import Path + + with Path(__file__).with_name(file_name).open("r") as f: + return io.BytesIO(f.read().encode("utf-8")) + + +if __name__ == "__main__": + sample_chat_completions_from_input_bytes() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py new file mode 100644 index 000000000000..f3495f7d4904 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py @@ -0,0 +1,66 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using a synchronous client, and directly providing the + JSON request body (containing input chat messages). + +USAGE: + python sample_chat_completions_from_input_json.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" +# mypy: disable-error-code="union-attr" +# pyright: reportAttributeAccessIssue=false + + +def sample_chat_completions_from_input_json(): + import os + from azure.ai.inference import ChatCompletionsClient + from azure.core.credentials import AzureKeyCredential + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + # [START chat_completions] + response = client.complete( + { + "messages": [ + { + "role": "system", + "content": "You are an AI assistant that helps people find information. Your replies are short, no more than two sentences.", + }, + { + "role": "user", + "content": "What year was construction of the International Space Station mostly done?", + }, + { + "role": "assistant", + "content": "The main construction of the International Space Station (ISS) was completed between 1998 and 2011. During this period, more than 30 flights by US space shuttles and 40 by Russian rockets were conducted to transport components and modules to the station.", + }, + {"role": "user", "content": "And what was the estimated cost to build it?"}, + ] + } + ) + # [END chat_completions] + + print(response.choices[0].message.content) + + +if __name__ == "__main__": + sample_chat_completions_from_input_json() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming.py new file mode 100644 index 000000000000..226489918000 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming.py @@ -0,0 +1,56 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completion streaming response + from the service using a synchronous client. 
+ +USAGE: + python sample_chat_completions_streaming.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" + + +def sample_chat_completions_streaming(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + # [START chat_completions_streaming] + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + from azure.core.credentials import AzureKeyCredential + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + response = client.complete( + stream=True, + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="Give me 5 good reasons why I should exercise every day."), + ], + ) + + for update in response: + print(update.choices[0].delta.content or "", end="") + + client.close() + # [END chat_completions_streaming] + + +if __name__ == "__main__": + sample_chat_completions_streaming() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_entra_id_auth.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_entra_id_auth.py new file mode 100644 index 000000000000..aac9a6e290d4 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_entra_id_auth.py @@ -0,0 +1,72 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to do chat completions with streaming, + using a synchronous client, with an Entra ID authentication. + It also shows how to set the optional HTTP request header `azureml-model-deployment`, + which is supported when you deploy a model using "Managed Compute Endpoints". + It can be used to target test deployment during staging, + instead of the default production deployment. + +USAGE: + python sample_chat_completions_streaming_with_entra_id_auth.py + + Set one or two of these environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_DEPLOYMENT_NAME - Optional. The value for the HTTP + request header `azureml-model-deployment`. +""" + + +def sample_chat_completions_streaming_with_entra_id_auth(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT'") + print("Set it. 
before running this sample.") + exit() + + try: + model_deployment = os.environ["CHAT_COMPLETIONS_DEPLOYMENT_NAME"] + except KeyError: + print("Could not read optional environment variable `CHAT_COMPLETIONS_DEPLOYMENT_NAME`.") + print("HTTP request header `azureml-model-deployment` will not be set.") + model_deployment = None + + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + from azure.identity import DefaultAzureCredential + + # For details on DefaultAzureCredential, see + # https://learn.microsoft.com/python/api/overview/azure/identity-readme#defaultazurecredential + + client = ChatCompletionsClient( + endpoint=endpoint, + credential=DefaultAzureCredential(exclude_interactive_browser_credential=False), + headers={"azureml-model-deployment": model_deployment} + ) + + response = client.complete( + stream=True, + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="Give me 5 good reasons why I should exercise every day."), + ], + ) + + for update in response: + print(update.choices[0].delta.content or "", end="") + + client.close() + + +if __name__ == "__main__": + sample_chat_completions_streaming_with_entra_id_auth() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py new file mode 100644 index 000000000000..7d3e8d7a74cf --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py @@ -0,0 +1,58 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using a synchronous client. Two completion calls are made, + the second one containing the chat history from the first one. + +USAGE: + python sample_chat_completions_with_history.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" + + +def sample_chat_completions_with_history(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage, AssistantMessage + from azure.core.credentials import AzureKeyCredential + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + messages = [ + SystemMessage( + content="You are an AI assistant that helps people find information. Your replies are short, no more than two sentences." 
+ ), + UserMessage(content="What year was construction of the international space station mostly done?"), + ] + + response = client.complete(messages=messages) + print(response.choices[0].message.content) + + messages.append(AssistantMessage(content=response.choices[0].message.content)) + messages.append(UserMessage(content="And what was the estimated cost to build it?")) + + response = client.complete(messages=messages) + print(response.choices[0].message.content) + + +if __name__ == "__main__": + sample_chat_completions_with_history() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_model_extras.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_model_extras.py new file mode 100644 index 000000000000..7b9e2e3577eb --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_model_extras.py @@ -0,0 +1,57 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get a chat completions response from + the service using a synchronous client, while supplying additional + model-specific parameters as part of the request. + See setting of the optional `model_extras` in the `complete` method. + Also see related setting of the optional `unknown-parameters` + parameter in the `complete` method. + +USAGE: + python sample_chat_completions_with_model_extras.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" + + +def sample_chat_completions_with_model_extras(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + from azure.core.credentials import AzureKeyCredential + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + # [START model_extras] + response = client.complete( + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="How many feet are in a mile?"), + ], + model_extras={"key1": "value1", "key2": "value2"}, # Optional. Additional parameters to pass to the model. + ) + # [END chat_completions] + + print(response.choices[0].message.content) + + +if __name__ == "__main__": + sample_chat_completions_with_model_extras() diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py new file mode 100644 index 000000000000..e9dbec8114cf --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py @@ -0,0 +1,133 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to do chat completions using a synchronous client, + with the assistance of tools. In this sample, we use a mock function tool to retrieve + flight information in order to answer a query about the next flight between two + cities. + +USAGE: + python sample_chat_completions_with_tools.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" + + +def sample_chat_completions_with_tools(): + import os + import json + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import ( + AssistantMessage, + ChatCompletionsFunctionToolCall, + ChatCompletionsFunctionToolDefinition, + CompletionsFinishReason, + FunctionDefinition, + SystemMessage, + ToolMessage, + UserMessage, + ) + from azure.core.credentials import AzureKeyCredential + + # Define a function that retrieves flight information + def get_flight_info(origin_city: str, destination_city: str): + """ + This is a mock function that returns information about the next + flight between two cities. + + Parameters: + origin_city (str): The name of the city where the flight originates. + destination_city (str): The destination city. + + Returns: + str: The airline name, fight number, date and time of the next flight between the cities. + """ + if origin_city == "Seattle" and destination_city == "Miami": + return "Delta airlines flight number 123 from Seattle to Miami, departing May 7th, 2024 at 10:00 AM." + else: + return "Sorry, I don't have that information." + + # Define a 'tool' that the model can use to retrieves flight information + flight_info = ChatCompletionsFunctionToolDefinition( + function=FunctionDefinition( + name="get_flight_info", + description="Returns information about the next flight between two cities. This includes the name of the airline, flight number and the date and time of the next flight", + parameters={ + "type": "object", + "properties": { + "origin_city": { + "type": "string", + "description": "The name of the city where the flight originates", + }, + "destination_city": { + "type": "string", + "description": "The flight destination city", + }, + }, + "required": ["origin_city", "destination_city"], + }, + ) + ) + + # Create a chat completion client. Make sure you selected a model that supports tools. 
+    # Create a chat completions client. Make sure you selected a model that supports tools.
+    client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
+
+    # Make a chat completions call asking for flight information, while providing a tool to handle the request
+    messages = [
+        SystemMessage(content="You are an assistant that helps users find flight information."),
+        UserMessage(content="What is the next flight from Seattle to Miami?"),
+    ]
+
+    response = client.complete(
+        messages=messages,
+        tools=[flight_info],
+    )
+
+    # The model should be asking for tool calls
+    if response.choices[0].finish_reason == CompletionsFinishReason.TOOL_CALLS:
+
+        # Append the previous model response to the chat history
+        messages.append(AssistantMessage(tool_calls=response.choices[0].message.tool_calls))
+
+        # The tool should be of type function call. Here we assume only one function call is required.
+        if response.choices[0].message.tool_calls is not None and len(response.choices[0].message.tool_calls) == 1:
+
+            tool_call = response.choices[0].message.tool_calls[0]
+
+            if isinstance(tool_call, ChatCompletionsFunctionToolCall):
+
+                function_args = json.loads(tool_call.function.arguments.replace("'", '"'))
+                print(f"Calling function `{tool_call.function.name}` with arguments {function_args}")
+                callable_func = locals()[tool_call.function.name]
+
+                function_response = callable_func(**function_args)
+                print(f"Function response = {function_response}")
+
+                # Provide the tool response to the model, by appending it to the chat history
+                messages.append(ToolMessage(tool_call_id=tool_call.id, content=function_response))
+
+                # With the additional tool information on hand, get another response from the model
+                response = client.complete(messages=messages, tools=[flight_info])
+
+                print(f"Model response = {response.choices[0].message.content}")
+
+
+if __name__ == "__main__":
+    sample_chat_completions_with_tools()
diff --git a/sdk/ai/azure-ai-inference/samples/sample_embeddings.py b/sdk/ai/azure-ai-inference/samples/sample_embeddings.py
new file mode 100644
index 000000000000..bddb19ce2a18
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/samples/sample_embeddings.py
@@ -0,0 +1,51 @@
+# ------------------------------------
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+# ------------------------------------
+"""
+DESCRIPTION:
+    This sample demonstrates how to get text embeddings for a list of sentences
+    using a synchronous client.
+
+USAGE:
+    python sample_embeddings.py
+
+    Set these two environment variables before running the sample:
+    1) EMBEDDINGS_ENDPOINT - Your endpoint URL, in the form
+        https://<your-deployment-name>.<your-azure-region>.inference.ai.azure.com
+        where `your-deployment-name` is your unique AI Model deployment name, and
+        `your-azure-region` is the Azure region where your model is deployed.
+    2) EMBEDDINGS_KEY - Your model key (a 32-character string). Keep it secret.
+""" + + +def sample_embeddings(): + import os + + try: + endpoint = os.environ["EMBEDDINGS_ENDPOINT"] + key = os.environ["EMBEDDINGS_KEY"] + except KeyError: + print("Missing environment variable 'EMBEDDINGS_ENDPOINT' or 'EMBEDDINGS_KEY'") + print("Set them before running this sample.") + exit() + + # [START embeddings] + from azure.ai.inference import EmbeddingsClient + from azure.core.credentials import AzureKeyCredential + + client = EmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + response = client.embed(input=["first phrase", "second phrase", "third phrase"]) + + for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, " + f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + # [END embeddings] + + +if __name__ == "__main__": + sample_embeddings() diff --git a/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py b/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py new file mode 100644 index 000000000000..2ae3cff830a1 --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py @@ -0,0 +1,80 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get text embeddings for a list of sentences + using a synchronous client, with an Azure OpenAI (AOAI) endpoint. + Two types of authentications are shown: key authentication and Entra ID + authentication. + +USAGE: + 1. Update `key_auth` below to `True` for key authentication, or `False` for + Entra ID authentication. + 2. Update `api_version` (the AOAI REST API version) as needed. + 3. Set one or two environment variables, depending on your authentication method: + * AOAI_EMBEDDINGS_ENDPOINT - Your AOAI endpoint URL, with partial path, in the form + https://.openai.azure.com/openai/deployments/ + where `your-unique-resource-name` is your globally unique AOAI resource name, + and `your-deployment-name` is your AI Model deployment name. + For example: https://your-unique-host.openai.azure.com/openai/deployments/gpt-4-turbo + * AOAI_EMBEDDINGS_KEY - Your model key (a 32-character string). Keep it secret. This + is only required for key authentication. + 4. Run the sample: + python sample_embeddings_azure_openai.py +""" + + +def sample_embeddings_azure_openai(): + import os + from azure.ai.inference import EmbeddingsClient + + try: + endpoint = os.environ["AOAI_EMBEDDINGS_ENDPOINT"] + except KeyError: + print("Missing environment variable 'AOAI_EMBEDDINGS_ENDPOINT'") + print("Set it before running this sample.") + exit() + + key_auth = True # Set to True for key authentication, or False for Entra ID authentication. + + if key_auth: + from azure.core.credentials import AzureKeyCredential + + try: + key = os.environ["AOAI_EMBEDDINGS_KEY"] + except KeyError: + print("Missing environment variable 'AOAI_EMBEDDINGS_KEY'") + print("Set it before running this sample.") + exit() + + client = EmbeddingsClient( + endpoint=endpoint, + credential=AzureKeyCredential(""), # Pass in an empty value. + headers={"api-key": key}, + api_version="2024-02-15-preview", # AOAI api-version. Update as needed. 
+ ) + + else: # Entra ID authentication + from azure.identity import DefaultAzureCredential + + client = EmbeddingsClient( + endpoint=endpoint, + credential=DefaultAzureCredential(exclude_interactive_browser_credential=False), + credential_scopes=["https://cognitiveservices.azure.com/.default"], + api_version="2024-02-15-preview", # AOAI api-version. Update as needed. + ) + + response = client.embed(input=["first phrase", "second phrase", "third phrase"]) + + for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, " + f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + + +if __name__ == "__main__": + sample_embeddings_azure_openai() diff --git a/sdk/ai/azure-ai-inference/samples/sample_get_model_info.py b/sdk/ai/azure-ai-inference/samples/sample_get_model_info.py new file mode 100644 index 000000000000..02f95ae7502a --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_get_model_info.py @@ -0,0 +1,49 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get information about the AI model, using the + synchronous chat completions client. Similarly can be done with the other + clients. + +USAGE: + python sample_get_model_info.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. +""" + + +def sample_get_model_info(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + from azure.ai.inference import ChatCompletionsClient + from azure.core.credentials import AzureKeyCredential + + client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + # [START get_model_info] + model_info = client.get_model_info() + + print(f"Model name: {model_info.model_name}") + print(f"Model provider name: {model_info.model_provider_name}") + print(f"Model type: {model_info.model_type}") + # [END get_model_info] + + +if __name__ == "__main__": + sample_get_model_info() diff --git a/sdk/ai/azure-ai-inference/samples/sample_image_embeddings.py b/sdk/ai/azure-ai-inference/samples/sample_image_embeddings.py new file mode 100644 index 000000000000..a7b7dc0e5b6b --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_image_embeddings.py @@ -0,0 +1,58 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to get image embeddings vectors for + two input images, using a synchronous client. 
+ +USAGE: + python sample_image_embeddings.py + + Set these two environment variables before running the sample: + 1) IMAGE_EMBEDDINGS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) IMAGE_EMBEDDINGS_KEY - Your model key (a 32-character string). Keep it secret. +""" + + +def sample_image_embeddings(): + import os + import base64 + + try: + endpoint = os.environ["IMAGE_EMBEDDINGS_ENDPOINT"] + key = os.environ["IMAGE_EMBEDDINGS_KEY"] + except KeyError: + print("Missing environment variable 'IMAGE_EMBEDDINGS_ENDPOINT' or 'IMAGE_EMBEDDINGS_KEY'") + print("Set them before running this sample.") + exit() + + # [START image_embeddings] + from azure.ai.inference import ImageEmbeddingsClient + from azure.ai.inference.models import EmbeddingInput + from azure.core.credentials import AzureKeyCredential + + with open("sample1.png", "rb") as f: + image1: str = base64.b64encode(f.read()).decode("utf-8") + with open("sample2.png", "rb") as f: + image2: str = base64.b64encode(f.read()).decode("utf-8") + + client = ImageEmbeddingsClient(endpoint=endpoint, credential=AzureKeyCredential(key)) + + response = client.embed(input=[EmbeddingInput(image=image1), EmbeddingInput(image=image2)]) + + for item in response.data: + length = len(item.embedding) + print( + f"data[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, " + f"..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + # [END image_embeddings] + + +if __name__ == "__main__": + sample_image_embeddings() diff --git a/sdk/ai/azure-ai-inference/samples/sample_load_client.py b/sdk/ai/azure-ai-inference/samples/sample_load_client.py new file mode 100644 index 000000000000..683a05cb9c9d --- /dev/null +++ b/sdk/ai/azure-ai-inference/samples/sample_load_client.py @@ -0,0 +1,58 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +""" +DESCRIPTION: + This sample demonstrates how to create a client from a given endpoint URL using + the load_client() function, imported from azure.ai.inference. + In this sample, we get a synchronous chat completions client and do one + chat completions call. + +USAGE: + python sample_load_client.py + + Set these two environment variables before running the sample: + 1) CHAT_COMPLETIONS_ENDPOINT - Your endpoint URL, in the form + https://..inference.ai.azure.com + where `your-deployment-name` is your unique AI Model deployment name, and + `your-azure-region` is the Azure region where your model is deployed. + 2) CHAT_COMPLETIONS_KEY - Your model key (a 32-character string). Keep it secret. 
+""" + + +def sample_load_client(): + import os + + try: + endpoint = os.environ["CHAT_COMPLETIONS_ENDPOINT"] + key = os.environ["CHAT_COMPLETIONS_KEY"] + except KeyError: + print("Missing environment variable 'CHAT_COMPLETIONS_ENDPOINT' or 'CHAT_COMPLETIONS_KEY'") + print("Set them before running this sample.") + exit() + + # [START load_client] + from azure.ai.inference import load_client, ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + from azure.core.credentials import AzureKeyCredential + + client = load_client(endpoint=endpoint, credential=AzureKeyCredential(key)) + + # This should create a client of type `ChatCompletionsClient` + print(f"Created client of type `{type(client).__name__}`.") + + if isinstance(client, ChatCompletionsClient): + response = client.complete( + messages=[ + SystemMessage(content="You are a helpful assistant."), + UserMessage(content="How many feet are in a mile?"), + ] + ) + + print(response.choices[0].message.content) + # [END load_client] + + +if __name__ == "__main__": + sample_load_client() diff --git a/sdk/ai/azure-ai-inference/setup.py b/sdk/ai/azure-ai-inference/setup.py new file mode 100644 index 000000000000..c7b5395a3f9f --- /dev/null +++ b/sdk/ai/azure-ai-inference/setup.py @@ -0,0 +1,71 @@ +# coding=utf-8 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for license information. +# Code generated by Microsoft (R) Python Code Generator. +# Changes may cause incorrect behavior and will be lost if the code is regenerated. +# -------------------------------------------------------------------------- +# coding: utf-8 + +import os +import re +from setuptools import setup, find_packages + + +PACKAGE_NAME = "azure-ai-inference" +PACKAGE_PPRINT_NAME = "Azure Ai Inference" + +# a-b-c => a/b/c +package_folder_path = PACKAGE_NAME.replace("-", "/") + +# Version extraction inspired from 'requests' +with open(os.path.join(package_folder_path, "_version.py"), "r") as fd: + version = re.search(r'^VERSION\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1) + +if not version: + raise RuntimeError("Cannot find version information") + + +setup( + name=PACKAGE_NAME, + version=version, + description="Microsoft {} Client Library for Python".format(PACKAGE_PPRINT_NAME), + long_description=open("README.md", "r").read(), + long_description_content_type="text/markdown", + license="MIT License", + author="Microsoft Corporation", + author_email="azpysdkhelp@microsoft.com", + url="https://github.com/Azure/azure-sdk-for-python/tree/main/sdk", + keywords="azure, azure sdk", + classifiers=[ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: MIT License", + ], + zip_safe=False, + packages=find_packages( + exclude=[ + "tests", + # Exclude packages that will be covered by PEP420 or nspkg + "azure", + "azure.ai", + ] + ), + include_package_data=True, + package_data={ + "azure.ai.inference": ["py.typed"], + }, + install_requires=[ + "isodate>=0.6.1", + "azure-core>=1.30.0", + "typing-extensions>=4.6.0", + ], + 
python_requires=">=3.8", +) diff --git a/sdk/ai/azure-ai-inference/tests/README.md b/sdk/ai/azure-ai-inference/tests/README.md new file mode 100644 index 000000000000..712d5411c35e --- /dev/null +++ b/sdk/ai/azure-ai-inference/tests/README.md @@ -0,0 +1,66 @@ +# Azure AI Model Inference client library tests for Python + +The instructions below are for running tests locally, on a Windows machine, against the live service. + +## Prerequisites + +The live tests were written against the AI models mentioned below. You will need to deploy them in [Azure AI Studio](https://ai.azure.com/) and have the endpoint and key for each one of them. + +- `Mistral-Large` for chat completion tests +- `Cohere-embed-v3-english` for embedding tests + + +## Setup + +- Clone or download this sample repository. +- Open a command prompt window in the folder `sdk\ai\azure-ai-inference`. +- If you want to run tests against the latest published client library, install it by running: + ```bash + pip install azure-ai-inference + ``` +- If you want to run tests against a locally built client library: + - First build the wheel: + ```bash + pip install wheel + pip install -r dev_requirements.txt + python setup.py bdist_wheel + ``` + - Then install the resulting local wheel (update version `1.0.0b1` to the current one): + ```bash + pip install dist\azure_ai_inference-1.0.0b1-py3-none-any.whl --user --force-reinstall + ``` + +## Set environment variables + +The tests read endpoints and keys from environemt variables. See the [Set environment variables](https://github.com/Azure/azure-sdk-for-python/blob/main/sdk/ai/azure-ai-inference/samples/README.md#set-environment-variables) section in the samples README.md file for the full list of environment variables that need to be set for all tests to pass. + +In addition, the following environment values **must be** defined, although not used. Assign any value to them: + +```bash +set AI_TENANT_ID=not-used +set AI_CLIENT_ID=not-used +set AI_CLIENT_SECRET=not-used +``` + +## Configure test proxy + +Configure the test proxy to run live service tests without recordings: + +```bash +set AZURE_TEST_RUN_LIVE=true +set AZURE_SKIP_LIVE_RECORDING=true +set PROXY_URL=http://localhost:5000 +set AZURE_TEST_USE_CLI_AUTH=true +``` + +## Run tests + +To run all tests, type: + +```bash +pytest +``` + +## Additional information + +See [test documentation](https://github.com/Azure/azure-sdk-for-python/blob/main/doc/dev/tests.md) for additional information, including how to set proxy recordings and run tests using recordings. \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/tests/chat.test.json b/sdk/ai/azure-ai-inference/tests/chat.test.json new file mode 100644 index 000000000000..c3440a386b9b --- /dev/null +++ b/sdk/ai/azure-ai-inference/tests/chat.test.json @@ -0,0 +1,13 @@ +{ + "messages": + [ + { + "role": "system", + "content": "ou are a helpful assistant." + }, + { + "role": "user", + "content": "How many feet are in a mile?" + } + ] +} diff --git a/sdk/ai/azure-ai-inference/tests/conftest.py b/sdk/ai/azure-ai-inference/tests/conftest.py new file mode 100644 index 000000000000..1ea8cf843682 --- /dev/null +++ b/sdk/ai/azure-ai-inference/tests/conftest.py @@ -0,0 +1,13 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+# ------------------------------------ + +import pytest +from devtools_testutils import test_proxy + + +# autouse=True will trigger this fixture on each pytest run, even if it's not explicitly used by a test method +@pytest.fixture(scope="session", autouse=True) +def start_proxy(test_proxy): + return diff --git a/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py b/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py new file mode 100644 index 000000000000..404ccef53ff9 --- /dev/null +++ b/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py @@ -0,0 +1,281 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +import azure.ai.inference as sdk +import azure.ai.inference.aio as async_sdk +import functools +import io +import json +import logging +import re +import sys + +from os import path +from pathlib import Path +from typing import List, Optional, Union +from devtools_testutils import AzureRecordedTestCase, EnvironmentVariableLoader +from azure.core.credentials import AzureKeyCredential +from azure.core.exceptions import AzureError +from azure.core.pipeline import PipelineRequest + + +# Set to True to enable SDK logging +LOGGING_ENABLED = True + +if LOGGING_ENABLED: + # Create a logger for the 'azure' SDK + # See https://docs.python.org/3/library/logging.html + logger = logging.getLogger("azure") + logger.setLevel(logging.DEBUG) # INFO or DEBUG + + # Configure a console output + handler = logging.StreamHandler(stream=sys.stdout) + logger.addHandler(handler) + +ServicePreparerChatCompletions = functools.partial( + EnvironmentVariableLoader, + "chat_completions", + chat_completions_endpoint="https://your-deployment-name.your-azure-region.inference.ai.azure.com", + chat_completions_key="00000000000000000000000000000000", +) + +ServicePreparerEmbeddings = functools.partial( + EnvironmentVariableLoader, + "embeddings", + embeddings_endpoint="https://your-deployment-name.your-azure-region.inference.ai.azure.com", + embeddings_key="00000000000000000000000000000000", +) + + +# The test class name needs to start with "Test" to get collected by pytest +class ModelClientTestBase(AzureRecordedTestCase): + + # Set to True to print out all results to the console + PRINT_RESULT = True + + # Regular expression describing the pattern of a result ID. 
Format allowed are: + # "183b56eb-8512-484d-be50-5d8df82301a2", "26ef25aa45424781865a2d38a4484274" and "Sanitized" + REGEX_RESULT_ID = re.compile( + r"^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$|^[0-9a-fA-F]{32}$|^Sanitized$" + ) + + # Methods to load credentials from environment variables + def _load_chat_credentials(self, *, bad_key: bool, **kwargs): + endpoint = kwargs.pop("chat_completions_endpoint") + key = "00000000000000000000000000000000" if bad_key else kwargs.pop("chat_completions_key") + credential = AzureKeyCredential(key) + return endpoint, credential + + def _load_embeddings_credentials(self, *, bad_key: bool, **kwargs): + endpoint = kwargs.pop("embeddings_endpoint") + key = "00000000000000000000000000000000" if bad_key else kwargs.pop("embeddings_key") + credential = AzureKeyCredential(key) + return endpoint, credential + + # Methods to create sync and async clients using Load_client() function + async def _load_async_chat_client(self, *, bad_key: bool = False, **kwargs) -> async_sdk.ChatCompletionsClient: + endpoint, credential = self._load_chat_credentials(bad_key=bad_key, **kwargs) + return await async_sdk.load_client(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + def _load_chat_client(self, *, bad_key: bool = False, **kwargs) -> sdk.ChatCompletionsClient: + endpoint, credential = self._load_chat_credentials(bad_key=bad_key, **kwargs) + return sdk.load_client(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + async def _load_async_embeddings_client(self, *, bad_key: bool = False, **kwargs) -> async_sdk.EmbeddingsClient: + endpoint, credential = self._load_embeddings_credentials(bad_key=bad_key, **kwargs) + return await async_sdk.load_client(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + def _load_embeddings_client(self, *, bad_key: bool = False, **kwargs) -> sdk.EmbeddingsClient: + endpoint, credential = self._load_embeddings_credentials(bad_key=bad_key, **kwargs) + return sdk.load_client(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + # Methos to create the different sync and async clients directly + def _create_async_chat_client(self, *, bad_key: bool = False, **kwargs) -> async_sdk.ChatCompletionsClient: + endpoint, credential = self._load_chat_credentials(bad_key=bad_key, **kwargs) + return async_sdk.ChatCompletionsClient(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + def _create_chat_client(self, *, bad_key: bool = False, **kwargs) -> sdk.ChatCompletionsClient: + endpoint, credential = self._load_chat_credentials(bad_key=bad_key, **kwargs) + return sdk.ChatCompletionsClient(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + def _create_async_embeddings_client(self, *, bad_key: bool = False, **kwargs) -> async_sdk.EmbeddingsClient: + endpoint, credential = self._load_embeddings_credentials(bad_key=bad_key, **kwargs) + return async_sdk.EmbeddingsClient(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + def _create_embeddings_client(self, *, sync: bool = True, bad_key: bool = False, **kwargs) -> sdk.EmbeddingsClient: + endpoint, credential = self._load_embeddings_credentials(bad_key=bad_key, **kwargs) + return sdk.EmbeddingsClient(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + def _create_embeddings_client_with_chat_completions_credentials(self, **kwargs) -> sdk.EmbeddingsClient: + endpoint = 
kwargs.pop("chat_completions_endpoint") + key = kwargs.pop("chat_completions_key") + credential = AzureKeyCredential(key) + return sdk.EmbeddingsClient(endpoint=endpoint, credential=credential, logging_enable=LOGGING_ENABLED) + + @staticmethod + def read_text_file(file_name: str) -> io.BytesIO: + """ + Reads a text file and returns a BytesIO object with the file content in UTF-8 encoding. + The file is expected to be in the same directory as this Python script. + """ + with Path(__file__).with_name(file_name).open("r") as f: + return io.BytesIO(f.read().encode("utf-8")) + + @staticmethod + def _print_model_info_result(model_info: sdk.models.ModelInfo): + if ModelClientTestBase.PRINT_RESULT: + print(" Model info:") + print("\tmodel_name: {}".format(model_info.model_name)) + print("\tmodel_type: {}".format(model_info.model_type)) + print("\tmodel_provider_name: {}".format(model_info.model_provider_name)) + + @staticmethod + def _validate_model_info_result( + model_info: sdk.models.ModelInfo, expected_model_type: Union[str, sdk.models.ModelType] + ): + assert model_info.model_name is not None + assert len(model_info.model_name) > 0 + assert model_info.model_provider_name is not None + assert len(model_info.model_provider_name) > 0 + assert model_info.model_type is not None + assert model_info.model_type == expected_model_type + + @staticmethod + def _validate_chat_completions_result(response: sdk.models.ChatCompletions, contains: List[str]): + assert any(item in response.choices[0].message.content for item in contains) + assert response.choices[0].message.role == sdk.models.ChatRole.ASSISTANT + assert response.choices[0].finish_reason == sdk.models.CompletionsFinishReason.STOPPED + assert response.choices[0].index == 0 + assert bool(ModelClientTestBase.REGEX_RESULT_ID.match(response.id)) + assert response.created is not None + assert response.created != "" + assert response.model is not None + assert response.model != "" + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens + + @staticmethod + def _validate_chat_completions_tool_result(response: sdk.models.ChatCompletions): + assert response.choices[0].message.content == None or response.choices[0].message.content == "" + assert response.choices[0].message.role == sdk.models.ChatRole.ASSISTANT + assert response.choices[0].finish_reason == sdk.models.CompletionsFinishReason.TOOL_CALLS + assert response.choices[0].index == 0 + function_args = json.loads(response.choices[0].message.tool_calls[0].function.arguments.replace("'", '"')) + print(function_args) + assert function_args["city"].lower() == "seattle" + assert function_args["days"] == "2" + assert bool(ModelClientTestBase.REGEX_RESULT_ID.match(response.id)) + assert response.created is not None + assert response.created != "" + assert response.model is not None + # assert response.model != "" + assert response.usage.prompt_tokens > 0 + assert response.usage.completion_tokens > 0 + assert response.usage.total_tokens == response.usage.prompt_tokens + response.usage.completion_tokens + + @staticmethod + def _validate_chat_completions_update(update: sdk.models.StreamingChatCompletionsUpdate, first: bool) -> str: + if first: + # Why is 'content','created' and 'object' missing in the first update? 
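+            # (Presumably because, as with OpenAI-style streaming, the first update typically
+            # carries only the assistant role; content arrives in the later updates validated below.)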
+ assert update.choices[0].delta.role == sdk.models.ChatRole.ASSISTANT + else: + assert update.choices[0].delta.role == None + assert update.choices[0].delta.content != None + assert update.created is not None + assert update.created != "" + assert update.choices[0].delta.tool_calls == None + assert update.choices[0].index == 0 + assert update.id is not None + assert bool(ModelClientTestBase.REGEX_RESULT_ID.match(update.id)) + assert update.model is not None + assert update.model != "" + if update.choices[0].delta.content != None: + return update.choices[0].delta.content + else: + return "" + + @staticmethod + def _validate_chat_completions_streaming_result(response: sdk.models.StreamingChatCompletions): + count = 0 + content = "" + for update in response: + content += ModelClientTestBase._validate_chat_completions_update(update, count == 0) + count += 1 + assert count > 2 + assert len(content) > 100 # Some arbitrary number + # The last update should have a finish reason and usage + assert update.choices[0].finish_reason == sdk.models.CompletionsFinishReason.STOPPED + assert update.usage.prompt_tokens > 0 + assert update.usage.completion_tokens > 0 + assert update.usage.total_tokens == update.usage.prompt_tokens + update.usage.completion_tokens + if ModelClientTestBase.PRINT_RESULT: + print(content) + + @staticmethod + async def _validate_async_chat_completions_streaming_result(response: sdk.models.AsyncStreamingChatCompletions): + count = 0 + content = "" + async for update in response: + content += ModelClientTestBase._validate_chat_completions_update(update, count == 0) + count += 1 + assert count > 2 + assert len(content) > 100 # Some arbitrary number + # The last update should have a finish reason and usage + assert update.choices[0].finish_reason == sdk.models.CompletionsFinishReason.STOPPED + assert update.usage.prompt_tokens > 0 + assert update.usage.completion_tokens > 0 + assert update.usage.total_tokens == update.usage.prompt_tokens + update.usage.completion_tokens + if ModelClientTestBase.PRINT_RESULT: + print(content) + + @staticmethod + def _print_chat_completions_result(response: sdk.models.ChatCompletions): + if ModelClientTestBase.PRINT_RESULT: + print(" Chat Completions response:") + for choice in response.choices: + print(f"\tchoices[0].message.content: {choice.message.content}") + print(f"\tchoices[0].message.tool_calls: {choice.message.tool_calls}") + print("\tchoices[0].message.role: {}".format(choice.message.role)) + print("\tchoices[0].finish_reason: {}".format(choice.finish_reason)) + print("\tchoices[0].index: {}".format(choice.index)) + print("\tid: {}".format(response.id)) + print("\tcreated: {}".format(response.created)) + print("\tmodel: {}".format(response.model)) + print("\tusage.prompt_tokens: {}".format(response.usage.prompt_tokens)) + print("\tusage.completion_tokens: {}".format(response.usage.completion_tokens)) + print("\tusage.total_tokens: {}".format(response.usage.total_tokens)) + + @staticmethod + def _validate_embeddings_result(response: sdk.models.EmbeddingsResult): + assert response is not None + assert response.data is not None + assert len(response.data) == 3 + for i in [0, 1, 2]: + assert response.data[i] is not None + assert response.data[i].index == i + assert len(response.data[i].embedding) == 1024 + assert response.data[i].embedding[0] != 0.0 + assert response.data[i].embedding[1023] != 0.0 + assert bool(ModelClientTestBase.REGEX_RESULT_ID.match(response.id)) + # assert len(response.model) > 0 # At the time of writing this test, this JSON 
field existed but was empty + # At the time of writing this test, input_tokens did not exist (I see completion tokens instead) + # assert response.usage.input_tokens > 0 + # assert response.usage.prompt_tokens > 0 + # assert response.total_tokens == response.usage.input_tokens + response.usage.prompt_tokens + + @staticmethod + def _print_embeddings_result(response: sdk.models.EmbeddingsResult): + if ModelClientTestBase.PRINT_RESULT: + print("Embeddings response:") + for item in response.data: + length = len(item.embedding) + print( + f"\tdata[{item.index}]: length={length}, [{item.embedding[0]}, {item.embedding[1]}, ..., {item.embedding[length-2]}, {item.embedding[length-1]}]" + ) + print(f"\tid: {response.id}") + print(f"\tmodel: {response.model}") + # print(f"\tusage.input_tokens: {response.usage.input_tokens}") # At the time of writing this test, this JSON field does not exist + print(f"\tusage.prompt_tokens: {response.usage.prompt_tokens}") + print(f"\tusage.total_tokens: {response.usage.total_tokens}") diff --git a/sdk/ai/azure-ai-inference/tests/test_model_inference_async_client.py b/sdk/ai/azure-ai-inference/tests/test_model_inference_async_client.py new file mode 100644 index 000000000000..3c8281428844 --- /dev/null +++ b/sdk/ai/azure-ai-inference/tests/test_model_inference_async_client.py @@ -0,0 +1,216 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +import inspect +import azure.ai.inference as sdk +import azure.ai.inference.aio as async_sdk + +from model_inference_test_base import ModelClientTestBase, ServicePreparerChatCompletions, ServicePreparerEmbeddings +from devtools_testutils.aio import recorded_by_proxy_async +from azure.core.exceptions import AzureError + + +# The test class name needs to start with "Test" to get collected by pytest +class TestModelAsyncClient(ModelClientTestBase): + + # ********************************************************************************** + # + # HAPPY PATH TESTS - TEXT EMBEDDINGS + # + # ********************************************************************************** + """live test with recording fails for this... why? + @ServicePreparerEmbeddings() + @recorded_by_proxy_async + async def test_async_load_embeddings_client(self, **kwargs): + + client = await self._load_async_embeddings_client(**kwargs) + assert isinstance(client, async_sdk.EmbeddingsClient) + assert client._model_info + + response1 = await client.get_model_info() + self._print_model_info_result(response1) + self._validate_model_info_result(response1, "embedding") # TODO: This should be ModelType.EMBEDDINGS once the model is fixed + await client.close() + """ + + @ServicePreparerEmbeddings() + @recorded_by_proxy_async + async def test_async_get_model_info_on_embeddings_client(self, **kwargs): + client = self._create_async_embeddings_client(**kwargs) + assert not client._model_info + + response1 = await client.get_model_info() + assert client._model_info + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "embedding" + ) # TODO: This should be ModelType.EMBEDDINGS once the model is fixed + + # Get the model info again. No network calls should be made here, + # as the response is cached in the client. 
+ response2 = await client.get_model_info() + self._print_model_info_result(response2) + assert response1 == response2 + + await client.close() + + @ServicePreparerEmbeddings() + @recorded_by_proxy_async + async def test_async_embeddings(self, **kwargs): + client = self._create_async_embeddings_client(**kwargs) + response = await client.embed(input=["first phrase", "second phrase", "third phrase"]) + self._print_embeddings_result(response) + self._validate_embeddings_result(response) + await client.close() + + # ********************************************************************************** + # + # HAPPY PATH TESTS - CHAT COMPLETIONS + # + # ********************************************************************************** + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_load_chat_completions_client(self, **kwargs): + + client = await self._load_async_chat_client(**kwargs) + assert isinstance(client, async_sdk.ChatCompletionsClient) + assert client._model_info + + response1 = await client.get_model_info() + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "completion" + ) # TODO: This should be ModelType.CHAT once the model is fixed + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_get_model_info_on_chat_client(self, **kwargs): + client = self._create_async_chat_client(**kwargs) + assert not client._model_info + + response1 = await client.get_model_info() + assert client._model_info + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "completion" + ) # TODO: This should be ModelType.CHAT once the model is fixed + + # Get the model info again. No network calls should be made here, + # as the response is cached in the client. 
+ response2 = await client.get_model_info() + self._print_model_info_result(response2) + assert response1 == response2 + + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_chat_completions_multi_turn(self, **kwargs): + messages = [ + sdk.models.SystemMessage(content="You are a helpful assistant answering questions regarding length units."), + sdk.models.UserMessage(content="How many feet are in a mile?"), + ] + client = self._create_async_chat_client(**kwargs) + response = await client.complete(messages=messages) + self._print_chat_completions_result(response) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + messages.append(sdk.models.AssistantMessage(content=response.choices[0].message.content)) + messages.append(sdk.models.UserMessage(content="and how many yards?")) + response = await client.complete(messages=messages) + self._print_chat_completions_result(response) + self._validate_chat_completions_result(response, ["1760", "1,760"]) + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_chat_completions_with_model_extras(self, **kwargs): + client = self._create_async_chat_client(**kwargs) + response = await client.complete( + messages=[sdk.models.UserMessage(content="How many feet are in a mile?")], + model_extras={ + "key1": 1, + "key2": True, + "key3": "Some value", + "key4": [1, 2, 3], + "key5": {"key6": 2, "key7": False, "key8": "Some other value", "key9": [4, 5, 6, 7]}, + }, + ) + self._print_chat_completions_result(response) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_chat_completions_streaming(self, **kwargs): + client = self._create_async_chat_client(Sync=False, **kwargs) + response = await client.complete( + stream=True, + messages=[ + sdk.models.SystemMessage(content="You are a helpful assistant."), + sdk.models.UserMessage(content="Give me 3 good reasons why I should exercise every day."), + ], + ) + await self._validate_async_chat_completions_streaming_result(response) + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_chat_completions_with_json_input(self, **kwargs): + client = self._create_async_chat_client(**kwargs) + request_body = { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "How many feet are in a mile?"}, + ] + } + response = await client.complete(request_body) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_chat_completions_with_bytes_input(self, **kwargs): + client = self._create_async_chat_client(**kwargs) + response = await client.complete(self.read_text_file("chat.test.json")) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + await client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy_async + async def test_async_chat_completions_streaming_with_json_input(self, **kwargs): + client = self._create_async_chat_client(**kwargs) + request_body = { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Give me 3 good reasons why I should exercise every day."}, + ], + "stream": True, + } + response = await client.complete(request_body) + await 
self._validate_async_chat_completions_streaming_result(response) + await client.close() + + # ********************************************************************************** + # + # ERROR TESTS + # + # ********************************************************************************** + + @ServicePreparerEmbeddings() + @recorded_by_proxy_async + async def test_embeddings_with_auth_failure(self, **kwargs): + client = self._create_async_embeddings_client(bad_key=True, **kwargs) + exception_caught = False + try: + response = await client.embed(input=["first phrase", "second phrase", "third phrase"]) + except AzureError as e: + exception_caught = True + print(e) + assert hasattr(e, "status_code") + assert e.status_code == 401 + assert "unauthorized" in e.message.lower() + await client.close() + assert exception_caught diff --git a/sdk/ai/azure-ai-inference/tests/test_model_inference_client.py b/sdk/ai/azure-ai-inference/tests/test_model_inference_client.py new file mode 100644 index 000000000000..c8e722d32026 --- /dev/null +++ b/sdk/ai/azure-ai-inference/tests/test_model_inference_client.py @@ -0,0 +1,283 @@ +# ------------------------------------ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# ------------------------------------ +import inspect +import azure.ai.inference as sdk + +from model_inference_test_base import ModelClientTestBase, ServicePreparerChatCompletions, ServicePreparerEmbeddings +from devtools_testutils import recorded_by_proxy +from azure.core.exceptions import AzureError + + +# The test class name needs to start with "Test" to get collected by pytest +class TestModelClient(ModelClientTestBase): + + # ********************************************************************************** + # + # HAPPY PATH TESTS - TEXT EMBEDDINGS + # + # ********************************************************************************** + + @ServicePreparerEmbeddings() + @recorded_by_proxy + def test_load_embeddings_client(self, **kwargs): + + client = self._load_embeddings_client(**kwargs) + assert isinstance(client, sdk.EmbeddingsClient) + assert client._model_info + response1 = client.get_model_info() + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "embedding" + ) # TODO: This should be ModelType.EMBEDDINGS once the model is fixed + client.close() + + @ServicePreparerEmbeddings() + @recorded_by_proxy + def test_get_model_info_on_embeddings_client(self, **kwargs): + + client = self._create_embeddings_client(**kwargs) + assert not client._model_info + + response1 = client.get_model_info() + assert client._model_info + + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "embedding" + ) # TODO: This should be ModelType.EMBEDDINGS once the model is fixed + + # Get the model info again. No network calls should be made here, + # as the response is cached in the client. 
+ response2 = client.get_model_info() + self._print_model_info_result(response2) + assert response1 == response2 + + client.close() + + @ServicePreparerEmbeddings() + @recorded_by_proxy + def test_embeddings(self, **kwargs): + client = self._create_embeddings_client(**kwargs) + response = client.embed(input=["first phrase", "second phrase", "third phrase"]) + self._print_embeddings_result(response) + self._validate_embeddings_result(response) + client.close() + + # ********************************************************************************** + # + # HAPPY PATH TESTS - CHAT COMPLETIONS + # + # ********************************************************************************** + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_load_chat_completions_client(self, **kwargs): + + client = self._load_chat_client(**kwargs) + assert isinstance(client, sdk.ChatCompletionsClient) + assert client._model_info + + response1 = client.get_model_info() + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "completion" + ) # TODO: This should be ModelType.CHAT once the model is fixed + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_get_model_info_on_chat_client(self, **kwargs): + + client = self._create_chat_client(**kwargs) + assert not client._model_info + + response1 = client.get_model_info() + assert client._model_info + + self._print_model_info_result(response1) + self._validate_model_info_result( + response1, "completion" + ) # TODO: This should be ModelType.CHAT once the model is fixed + + # Get the model info again. No network calls should be made here, + # as the response is cached in the client. + response2 = client.get_model_info() + self._print_model_info_result(response2) + assert response1 == response2 + + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_multi_turn(self, **kwargs): + client = self._create_chat_client(**kwargs) + messages = [ + sdk.models.SystemMessage(content="You are a helpful assistant answering questions regarding length units."), + sdk.models.UserMessage(content="How many feet are in a mile?"), + ] + response = client.complete(messages=messages) + self._print_chat_completions_result(response) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + messages.append(sdk.models.AssistantMessage(content=response.choices[0].message.content)) + messages.append(sdk.models.UserMessage(content="and how many yards?")) + response = client.complete(messages=messages) + self._print_chat_completions_result(response) + self._validate_chat_completions_result(response, ["1760", "1,760"]) + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_with_model_extras(self, **kwargs): + client = self._create_chat_client(**kwargs) + response = client.complete( + messages=[sdk.models.UserMessage(content="How many feet are in a mile?")], + model_extras={ + "key1": 1, + "key2": True, + "key3": "Some value", + "key4": [1, 2, 3], + "key5": {"key6": 2, "key7": False, "key8": "Some other value", "key9": [4, 5, 6, 7]}, + }, + ) + self._print_chat_completions_result(response) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_with_json_input(self, **kwargs): + client = self._create_chat_client(**kwargs) + request_body = { + "messages": [ + {"role": "system", "content": "You 
are a helpful assistant."}, + {"role": "user", "content": "How many feet are in a mile?"}, + ] + } + response = client.complete(request_body) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_with_bytes_input(self, **kwargs): + client = self._create_chat_client(**kwargs) + response = client.complete(self.read_text_file("chat.test.json")) + self._validate_chat_completions_result(response, ["5280", "5,280"]) + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_streaming(self, **kwargs): + client = self._create_chat_client(**kwargs) + response = client.complete( + stream=True, + messages=[ + sdk.models.SystemMessage(content="You are a helpful assistant."), + sdk.models.UserMessage(content="Give me 3 good reasons why I should exercise every day."), + ], + ) + self._validate_chat_completions_streaming_result(response) + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_streaming_with_json_input(self, **kwargs): + client = self._create_chat_client(**kwargs) + request_body = { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Give me 3 good reasons why I should exercise every day."}, + ], + "stream": True, + } + response = client.complete(request_body) + self._validate_chat_completions_streaming_result(response) + client.close() + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completions_with_tool(self, **kwargs): + forecast_tool = sdk.models.ChatCompletionsFunctionToolDefinition( + function=sdk.models.FunctionDefinition( + name="get_max_temperature", + description="A function that returns the forecasted maximum temperature IN a given city, a given few days from now, in Fahrenheit. 
It returns `unknown` if the forecast is not known.", + parameters={ + "type": "object", + "properties": { + "city": { + "type": "string", + "description": "The name of the city", + }, + "days": { + "type": "string", + "description": "The number of days from now, starting from 0, where 0 represents today, 1 represents tomorrow, etc.", + }, + }, + "required": ["city", "days"], + }, + ) + ) + client = self._create_chat_client(**kwargs) + messages = [ + sdk.models.SystemMessage(content="You are an assistant that helps users find weather information."), + sdk.models.UserMessage(content="what's the maximum temperature in Seattle two days from now?"), + ] + response = client.complete( + messages=messages, + tools=[forecast_tool], + ) + self._print_chat_completions_result(response) + self._validate_chat_completions_tool_result(response) + messages.append(sdk.models.AssistantMessage(tool_calls=response.choices[0].message.tool_calls)) + messages.append( + sdk.models.ToolMessage( + content="62", + tool_call_id=response.choices[0].message.tool_calls[0].id, + ) + ) + response = client.complete( + messages=messages, + tools=[forecast_tool], + ) + self._validate_chat_completions_result(response, ["62"]) + client.close() + + # ********************************************************************************** + # + # ERROR TESTS + # + # ********************************************************************************** + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_chat_completion_with_auth_failure(self, **kwargs): + client = self._create_chat_client(bad_key=True, **kwargs) + exception_caught = False + try: + response = client.complete(messages=[sdk.models.UserMessage(content="How many feet are in a mile?")]) + except AzureError as e: + exception_caught = True + print(e) + assert hasattr(e, "status_code") + assert e.status_code == 401 + assert "unauthorized" in e.message.lower() + client.close() + assert exception_caught + + @ServicePreparerChatCompletions() + @recorded_by_proxy + def test_embeddings_on_chat_completion_endpoint(self, **kwargs): + client = self._create_embeddings_client_with_chat_completions_credentials(**kwargs) + exception_caught = False + try: + response = client.embed(input=["first phrase", "second phrase", "third phrase"]) + except AzureError as e: + exception_caught = True + print(e) + assert hasattr(e, "status_code") + assert e.status_code == 404 or e.status_code == 405 # `404 - not found` or `405 - method not allowed` + assert "not found" in e.message.lower() or "not allowed" in e.message.lower() + client.close() + assert exception_caught diff --git a/sdk/ai/azure-ai-inference/tsp-location.yaml b/sdk/ai/azure-ai-inference/tsp-location.yaml new file mode 100644 index 000000000000..e107572a4177 --- /dev/null +++ b/sdk/ai/azure-ai-inference/tsp-location.yaml @@ -0,0 +1,4 @@ +directory: specification/ai/ModelClient +commit: 907e4e19cf76132ea281e060fedcfee0eb671e92 +repo: Azure/azure-rest-api-specs +additionalDirectories: diff --git a/sdk/ai/azure-ai-resources/setup.py b/sdk/ai/azure-ai-resources/setup.py index f688a10e2ba1..cb0abaa55318 100644 --- a/sdk/ai/azure-ai-resources/setup.py +++ b/sdk/ai/azure-ai-resources/setup.py @@ -42,7 +42,7 @@ url="https://github.com/Azure/azure-sdk-for-python", keywords="azure, azuresdk, azure sdk", classifiers=[ - "Development Status :: 4 - Beta", + "Development Status :: 7 - Inactive", "Programming Language :: Python", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", diff --git 
a/sdk/ai/ci.yml b/sdk/ai/ci.yml index d0ceb7e8d11f..5780d354da0a 100644 --- a/sdk/ai/ci.yml +++ b/sdk/ai/ci.yml @@ -29,23 +29,27 @@ extends: template: /eng/pipelines/templates/stages/archetype-sdk-client.yml parameters: ServiceDirectory: ai - TestTimeoutInMinutes: 75 - BuildDocs: true TestProxy: true + BuildDocs: true + TestTimeoutInMinutes: 60 + # The below were set before when azure-ai-generative and azure-ai-resources packages were built: # This is a short term solution to create API review for python azure-ml package only when running pipeline manually # Long term solution should be to have different versions on main branch and release branch for python package so APIView can have different revisions for each version. # Tracking issue: https://github.com/Azure/azure-sdk-for-python/issues/29196 - GenerateApiReviewForManualOnly: true + # GenerateApiReviewForManualOnly: true # This custom matrix config should be dropped once: # * The Azure SDKs removes Python 3.7 from the test matrix # * Once all of azure-ai-generative's extra packages can be installed on Python3.12 - MatrixConfigs: - - Name: ai_ci_matrix - Path: eng/pipelines/templates/stages/platform-matrix-ai.json - Selection: sparse - GenerateVMJobs: true + # MatrixConfigs: + # - Name: ai_ci_matrix + # Path: eng/pipelines/templates/stages/platform-matrix-ai.json + # Selection: sparse + # GenerateVMJobs: true Artifacts: - - name: azure-ai-generative - safeName: azureaigenerative - - name: azure-ai-resources - safeName: azureairesources + - name: azure-ai-inference + safeName: azureaiinference + # These packages are deprecated: + #- name: azure-ai-generative + # safeName: azureaigenerative + #- name: azure-ai-resources + # safeName: azureairesources diff --git a/sdk/ai/tests.yml b/sdk/ai/tests.yml.old similarity index 68% rename from sdk/ai/tests.yml rename to sdk/ai/tests.yml.old index 960b65f98853..b51eb6aaf15d 100644 --- a/sdk/ai/tests.yml +++ b/sdk/ai/tests.yml.old @@ -1,3 +1,4 @@ +# This was the tests.yml file that was used when azure-ai-generative and azure-ai-resources packages were built. trigger: none # NOTE: Service live tests are NOT enabled. This file only enables the analyze stage currently.