diff --git a/sdk/ai/azure-ai-inference/_meta.json b/sdk/ai/azure-ai-inference/_meta.json
new file mode 100644
index 000000000000..95f50a2eb96d
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/_meta.json
@@ -0,0 +1,6 @@
+{
+  "commit": "363e37a1282dd6750232ffd49ed07e6124d2675d",
+  "repository_url": "https://github.com/Azure/azure-rest-api-specs",
+  "typespec_src": "specification/ai/ModelClient",
+  "@azure-tools/typespec-python": "0.31.1"
+}
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py
index ff62b276a309..898076e89409 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/__init__.py
@@ -6,23 +6,25 @@
 # Changes may cause incorrect behavior and will be lost if the code is regenerated.
 # --------------------------------------------------------------------------
 
-from ._patch import ChatCompletionsClient
-from ._patch import EmbeddingsClient
-from ._patch import ImageEmbeddingsClient
+from ._client import ChatCompletionsClient
+from ._client import EmbeddingsClient
+from ._client import ImageEmbeddingsClient
 from ._version import VERSION
 
 __version__ = VERSION
 
-
-from ._patch import load_client
+try:
+    from ._patch import __all__ as _patch_all  # public names declared by the customization module
+    from ._patch import *  # pylint: disable=unused-wildcard-import
+except ImportError:
+    _patch_all = []  # fall back to exporting only the generated clients listed in __all__ below
 from ._patch import patch_sdk as _patch_sdk
 
 __all__ = [
-    "load_client",
     "ChatCompletionsClient",
     "EmbeddingsClient",
     "ImageEmbeddingsClient",
 ]
-
+__all__.extend([p for p in _patch_all if p not in __all__])
 
 _patch_sdk()
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py
index c4b1008c1e85..12ad7f29c71e 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py
@@ -4,7 +4,7 @@
 # Licensed under the MIT License. See License.txt in the project root for
 # license information.
 # --------------------------------------------------------------------------
-# pylint: disable=protected-access, arguments-differ, signature-differs, broad-except
+# pylint: disable=protected-access, arguments-differ, signature-differs, broad-except, too-many-lines
 
 import copy
 import calendar
@@ -19,6 +19,7 @@
 import email.utils
 from datetime import datetime, date, time, timedelta, timezone
 from json import JSONEncoder
+import xml.etree.ElementTree as ET
 from typing_extensions import Self
 import isodate
 from azure.core.exceptions import DeserializationError
@@ -123,7 +124,7 @@ def _serialize_datetime(o, format: typing.Optional[str] = None):
 
 def _is_readonly(p):
     try:
-        return p._visibility == ["read"]  # pylint: disable=protected-access
+        return p._visibility == ["read"]
     except AttributeError:
         return False
 
@@ -286,6 +287,12 @@ def _deserialize_decimal(attr):
     return decimal.Decimal(str(attr))
 
 
+def _deserialize_int_as_str(attr):
+    """Return ``attr`` as an int; values that are already ints are passed through untouched."""
+    # isinstance also matches int subclasses (e.g. bool), which are returned as-is rather than converted
+    return attr if isinstance(attr, int) else int(attr)
+
+
 _DESERIALIZE_MAPPING = {
     datetime: _deserialize_datetime,
     date: _deserialize_date,
@@ -307,9 +314,11 @@ def _deserialize_decimal(attr):
 
 
 def get_deserializer(annotation: typing.Any, rf: typing.Optional["_RestField"] = None):
+    if annotation is int and rf and rf._format == "str":
+        return _deserialize_int_as_str
     if rf and rf._format:
         return _DESERIALIZE_MAPPING_WITHFORMAT.get(rf._format)
-    return _DESERIALIZE_MAPPING.get(annotation)
+    return _DESERIALIZE_MAPPING.get(annotation)  # pyright: ignore
 
 
 def _get_type_alias_type(module_name: str, alias_name: str):
@@ -441,6 +450,10 @@ def _serialize(o, format: typing.Optional[str] = None):  # pylint: disable=too-m
         return float(o)
     if isinstance(o, enum.Enum):
         return o.value
+    if isinstance(o, int):
+        if format == "str":
+            return str(o)
+        return o
     try:
         # First try datetime.datetime
         return _serialize_datetime(o, format)
@@ -471,6 +484,8 @@ def _create_value(rf: typing.Optional["_RestField"], value: typing.Any) -> typin
         return value
     if rf._is_model:
         return _deserialize(rf._type, value)
+    if isinstance(value, ET.Element):
+        value = _deserialize(rf._type, value)
     return _serialize(value, rf._format)
 
 
@@ -489,10 +504,58 @@ def __init__(self, *args: typing.Any, **kwargs: typing.Any) -> None:
             for rest_field in self._attr_to_rest_field.values()
             if rest_field._default is not _UNSET
         }
-        if args:
-            dict_to_pass.update(
-                {k: _create_value(_get_rest_field(self._attr_to_rest_field, k), v) for k, v in args[0].items()}
-            )
+        if args:  # pylint: disable=too-many-nested-blocks
+            if isinstance(args[0], ET.Element):
+                existed_attr_keys = []
+                model_meta = getattr(self, "_xml", {})
+
+                for rf in self._attr_to_rest_field.values():
+                    prop_meta = getattr(rf, "_xml", {})
+                    xml_name = prop_meta.get("name", rf._rest_name)
+                    xml_ns = prop_meta.get("ns", model_meta.get("ns", None))
+                    if xml_ns:
+                        xml_name = "{" + xml_ns + "}" + xml_name
+
+                    # attribute
+                    if prop_meta.get("attribute", False) and args[0].get(xml_name) is not None:
+                        existed_attr_keys.append(xml_name)
+                        dict_to_pass[rf._rest_name] = _deserialize(rf._type, args[0].get(xml_name))
+                        continue
+
+                    # unwrapped element is array
+                    if prop_meta.get("unwrapped", False):
+                        # unwrapped array could either use prop items meta/prop meta
+                        if prop_meta.get("itemsName"):
+                            xml_name = prop_meta.get("itemsName")
+                            xml_ns = prop_meta.get("itemNs")
+                            if xml_ns:
+                                xml_name = "{" + xml_ns + "}" + xml_name
+                        items = args[0].findall(xml_name)  # pyright: ignore
+                        if len(items) > 0:
+                            existed_attr_keys.append(xml_name)
+                            dict_to_pass[rf._rest_name] = _deserialize(rf._type, items)
+                        continue
+
+                    # text element is primitive type
+                    if prop_meta.get("text", False):
+                        if args[0].text is not None:
+                            dict_to_pass[rf._rest_name] = _deserialize(rf._type, args[0].text)
+                        continue
+
+                    # wrapped element could be normal property or array, it should only have one element
+                    item = args[0].find(xml_name)
+                    if item is not None:
+                        existed_attr_keys.append(xml_name)
+                        dict_to_pass[rf._rest_name] = _deserialize(rf._type, item)
+
+                # rest thing is additional properties
+                for e in args[0]:
+                    if e.tag not in existed_attr_keys:
+                        dict_to_pass[e.tag] = _convert_element(e)
+            else:
+                dict_to_pass.update(
+                    {k: _create_value(_get_rest_field(self._attr_to_rest_field, k), v) for k, v in args[0].items()}
+                )
         else:
             non_attr_kwargs = [k for k in kwargs if k not in self._attr_to_rest_field]
             if non_attr_kwargs:
@@ -541,12 +604,10 @@ def __init_subclass__(cls, discriminator: typing.Optional[str] = None) -> None:
                 base.__mapping__[discriminator or cls.__name__] = cls  # type: ignore  # pylint: disable=no-member
 
     @classmethod
-    def _get_discriminator(cls, exist_discriminators) -> typing.Optional[str]:
+    def _get_discriminator(cls, exist_discriminators) -> typing.Optional["_RestField"]:
         for v in cls.__dict__.values():
-            if (
-                isinstance(v, _RestField) and v._is_discriminator and v._rest_name not in exist_discriminators
-            ):  # pylint: disable=protected-access
-                return v._rest_name  # pylint: disable=protected-access
+            if isinstance(v, _RestField) and v._is_discriminator and v._rest_name not in exist_discriminators:
+                return v
         return None
 
     @classmethod
@@ -554,11 +615,25 @@ def _deserialize(cls, data, exist_discriminators):
         if not hasattr(cls, "__mapping__"):  # pylint: disable=no-member
             return cls(data)
         discriminator = cls._get_discriminator(exist_discriminators)
-        exist_discriminators.append(discriminator)
-        mapped_cls = cls.__mapping__.get(data.get(discriminator), cls)  # pyright: ignore # pylint: disable=no-member
-        if mapped_cls == cls:
+        if discriminator is None:
             return cls(data)
-        return mapped_cls._deserialize(data, exist_discriminators)  # pylint: disable=protected-access
+        exist_discriminators.append(discriminator._rest_name)
+        if isinstance(data, ET.Element):
+            model_meta = getattr(cls, "_xml", {})
+            prop_meta = getattr(discriminator, "_xml", {})
+            xml_name = prop_meta.get("name", discriminator._rest_name)
+            xml_ns = prop_meta.get("ns", model_meta.get("ns", None))
+            if xml_ns:
+                xml_name = "{" + xml_ns + "}" + xml_name
+
+            if data.get(xml_name) is not None:
+                discriminator_value = data.get(xml_name)
+            else:
+                discriminator_value = data.find(xml_name).text  # pyright: ignore
+        else:
+            discriminator_value = data.get(discriminator._rest_name)
+        mapped_cls = cls.__mapping__.get(discriminator_value, cls)  # pyright: ignore # pylint: disable=no-member
+        return mapped_cls._deserialize(data, exist_discriminators)
 
     def as_dict(self, *, exclude_readonly: bool = False) -> typing.Dict[str, typing.Any]:
         """Return a dict that can be JSONify using json.dump.
@@ -624,6 +699,8 @@ def _deserialize_dict(
 ):
     if obj is None:
         return obj
+    if isinstance(obj, ET.Element):
+        obj = {child.tag: child for child in obj}
     return {k: _deserialize(value_deserializer, v, module) for k, v in obj.items()}
 
 
@@ -644,6 +721,8 @@ def _deserialize_sequence(
 ):
     if obj is None:
         return obj
+    if isinstance(obj, ET.Element):
+        obj = list(obj)
     return type(obj)(_deserialize(deserializer, entry, module) for entry in obj)
 
 
@@ -659,7 +738,7 @@ def _get_deserialize_callable_from_annotation(  # pylint: disable=R0911, R0915,
     module: typing.Optional[str],
     rf: typing.Optional["_RestField"] = None,
 ) -> typing.Optional[typing.Callable[[typing.Any], typing.Any]]:
-    if not annotation or annotation in [int, float]:
+    if not annotation:
         return None
 
     # is it a type alias?
@@ -734,7 +813,6 @@ def _get_deserialize_callable_from_annotation(  # pylint: disable=R0911, R0915,
     try:
         if annotation._name in ["List", "Set", "Tuple", "Sequence"]:  # pyright: ignore
             if len(annotation.__args__) > 1:  # pyright: ignore
-
                 entry_deserializers = [
                     _get_deserialize_callable_from_annotation(dt, module, rf)
                     for dt in annotation.__args__  # pyright: ignore
@@ -769,12 +847,23 @@ def _deserialize_default(
 def _deserialize_with_callable(
     deserializer: typing.Optional[typing.Callable[[typing.Any], typing.Any]],
     value: typing.Any,
-):
+):  # pylint: disable=too-many-return-statements
     try:
         if value is None or isinstance(value, _Null):
             return None
+        if isinstance(value, ET.Element):
+            if deserializer is str:
+                return value.text or ""
+            if deserializer is int:
+                return int(value.text) if value.text else None
+            if deserializer is float:
+                return float(value.text) if value.text else None
+            if deserializer is bool:
+                return value.text == "true" if value.text else None
         if deserializer is None:
             return value
+        if deserializer in [int, float, bool]:
+            return deserializer(value)
         if isinstance(deserializer, CaseInsensitiveEnumMeta):
             try:
                 return deserializer(value)
@@ -815,6 +904,7 @@ def __init__(
         default: typing.Any = _UNSET,
         format: typing.Optional[str] = None,
         is_multipart_file_input: bool = False,
+        xml: typing.Optional[typing.Dict[str, typing.Any]] = None,
     ):
         self._type = type
         self._rest_name_input = name
@@ -825,6 +915,7 @@ def __init__(
         self._default = default
         self._format = format
         self._is_multipart_file_input = is_multipart_file_input
+        self._xml = xml if xml is not None else {}
 
     @property
     def _class_type(self) -> typing.Any:
@@ -875,6 +966,7 @@ def rest_field(
     default: typing.Any = _UNSET,
     format: typing.Optional[str] = None,
     is_multipart_file_input: bool = False,
+    xml: typing.Optional[typing.Dict[str, typing.Any]] = None,
 ) -> typing.Any:
     return _RestField(
         name=name,
@@ -883,6 +975,7 @@ def rest_field(
         default=default,
         format=format,
         is_multipart_file_input=is_multipart_file_input,
+        xml=xml,
     )
 
 
@@ -891,5 +984,175 @@ def rest_discriminator(
     name: typing.Optional[str] = None,
     type: typing.Optional[typing.Callable] = None,  # pylint: disable=redefined-builtin
     visibility: typing.Optional[typing.List[str]] = None,
+    xml: typing.Optional[typing.Dict[str, typing.Any]] = None,
+) -> typing.Any:
+    return _RestField(name=name, type=type, is_discriminator=True, visibility=visibility, xml=xml)
+
+
+def serialize_xml(model: Model, exclude_readonly: bool = False) -> str:
+    """Render a model as an XML string.
+
+    :param Model model: The model instance to convert.
+    :param bool exclude_readonly: If true, read-only properties are omitted from the output.
+    :returns: The XML text built from the model's element tree.
+    :rtype: str
+    """
+    return ET.tostring(_get_element(model, exclude_readonly=exclude_readonly), encoding="unicode")  # type: ignore
+
+
+def _get_element(
+    o: typing.Any,
+    exclude_readonly: bool = False,
+    parent_meta: typing.Optional[typing.Dict[str, typing.Any]] = None,
+    wrapped_element: typing.Optional[ET.Element] = None,
+) -> typing.Union[ET.Element, typing.List[ET.Element]]:
+    """Convert a model, list, dict or primitive value into XML element(s).
+
+    :param any o: The value to convert.
+    :param bool exclude_readonly: Whether to skip read-only model properties.
+    :param dict parent_meta: XML metadata of the owning property; names the element built for a primitive.
+    :param ~xml.etree.ElementTree.Element wrapped_element: Element to fill for a model instead of a new one.
+    :returns: One element for a model or primitive, a list of elements for a list or dict.
+    :rtype: ~xml.etree.ElementTree.Element or list[~xml.etree.ElementTree.Element]
+    :raises ValueError: If ``o`` is not a model, list or dict and ``parent_meta`` is empty or missing.
+    """
+    if _is_model(o):
+        model_meta = getattr(o, "_xml", {})
+        # if prop is a model, then use the prop element directly, else generate a wrapper of model
+        if wrapped_element is None:
+            wrapped_element = _create_xml_element(
+                model_meta.get("name", o.__class__.__name__),
+                model_meta.get("prefix"),
+                model_meta.get("ns"),
+            )
+
+        readonly_props = []
+        if exclude_readonly:
+            readonly_props = [p._rest_name for p in o._attr_to_rest_field.values() if _is_readonly(p)]
+
+        for k, v in o.items():
+            # do not serialize readonly properties
+            if exclude_readonly and k in readonly_props:
+                continue
+            prop_rest_field = _get_rest_field(o._attr_to_rest_field, k)
+            if prop_rest_field:
+                prop_meta = getattr(prop_rest_field, "_xml").copy()
+                # use the wire name as xml name if no specific name is set
+                if prop_meta.get("name") is None:
+                    prop_meta["name"] = k
+            else:
+                # additional properties will not have rest field, use the wire name as xml name
+                prop_meta = {"name": k}
+
+            # if no ns for prop, use model's
+            if prop_meta.get("ns") is None and model_meta.get("ns"):
+                prop_meta["ns"] = model_meta.get("ns")
+                prop_meta["prefix"] = model_meta.get("prefix")
+
+            if prop_meta.get("unwrapped", False):
+                # unwrapped could only set on array
+                wrapped_element.extend(_get_element(v, exclude_readonly, prop_meta))
+            elif prop_meta.get("text", False):
+                # text could only set on primitive type
+                wrapped_element.text = _get_primitive_type_value(v)
+            elif prop_meta.get("attribute", False):
+                xml_name = prop_meta.get("name", k)
+                if prop_meta.get("ns"):
+                    if prop_meta.get("prefix"):  # ET.register_namespace cannot take a None prefix
+                        ET.register_namespace(prop_meta.get("prefix"), prop_meta.get("ns"))  # pyright: ignore
+                    xml_name = "{" + prop_meta.get("ns") + "}" + xml_name  # pyright: ignore
+                # attribute should be primitive type
+                wrapped_element.set(xml_name, _get_primitive_type_value(v))
+            else:
+                # other wrapped prop element
+                wrapped_element.append(_get_wrapped_element(v, exclude_readonly, prop_meta))
+        return wrapped_element
+    if isinstance(o, list):
+        return [_get_element(x, exclude_readonly, parent_meta) for x in o]  # type: ignore
+    if isinstance(o, dict):
+        inherited = {
+            "ns": parent_meta.get("ns") if parent_meta else None,
+            "prefix": parent_meta.get("prefix") if parent_meta else None,
+        }
+        return [_get_wrapped_element(v, exclude_readonly, {"name": k, **inherited}) for k, v in o.items()]
+
+    # primitive case need to create element based on parent_meta
+    if parent_meta:
+        return _get_wrapped_element(
+            o,
+            exclude_readonly,
+            {
+                "name": parent_meta.get("itemsName", parent_meta.get("name")),
+                "prefix": parent_meta.get("itemsPrefix", parent_meta.get("prefix")),
+                "ns": parent_meta.get("itemsNs", parent_meta.get("ns")),
+            },
+        )
+
+    raise ValueError(f"Could not serialize value into xml: {o}")
+
+
+def _get_wrapped_element(
+    v: typing.Any,
+    exclude_readonly: bool,
+    meta: typing.Optional[typing.Dict[str, typing.Any]],
+) -> ET.Element:
+    """Create the element described by ``meta`` (name, prefix, ns) and fill it from ``v``."""
+    info = meta or {}
+    element = _create_xml_element(info.get("name"), info.get("prefix"), info.get("ns"))
+    if isinstance(v, (dict, list)):
+        element.extend(_get_element(v, exclude_readonly, meta))
+    elif _is_model(v):
+        _get_element(v, exclude_readonly, meta, element)
+    else:
+        element.text = _get_primitive_type_value(v)
+    return element
+
+
+def _get_primitive_type_value(v) -> str:
+    """Return the XML text for a primitive: "true"/"false" for bools, "" for ``_Null``, else ``str(v)``."""
+    # identity checks: 1 and 0 compare equal to True/False but are still written as numbers
+    if v is True or v is False:
+        return "true" if v else "false"
+    if isinstance(v, _Null):
+        return ""
+    return str(v)
+
+
+def _create_xml_element(tag, prefix=None, ns=None):
+    """Build an element named ``tag``, qualified with ``ns`` when given; registers ``prefix`` for ``ns``."""
+    if ns and prefix:
+        ET.register_namespace(prefix, ns)
+    qualified = "{" + ns + "}" + tag if ns else tag
+    return ET.Element(qualified)
+
+
+def _deserialize_xml(
+    deserializer: typing.Any,
+    value: str,
 ) -> typing.Any:
-    return _RestField(name=name, type=type, is_discriminator=True, visibility=visibility)
+    element = ET.fromstring(value)  # nosec
+    return _deserialize(deserializer, element)
+
+
+def _convert_element(e: ET.Element):
+    """Convert an XML element into a dict, a list or its text, depending on its attributes and children.
+
+    Child tags repeated inside the dict form are collected into a list, even when an occurrence is empty.
+    """
+    # dict case
+    if len(e.attrib) > 0 or len({child.tag for child in e}) > 1:
+        dict_result: typing.Dict[str, typing.Any] = {}
+        for child in e:
+            converted = _convert_element(child)
+            # test key presence, not the stored value: an empty first occurrence is stored as None
+            if child.tag not in dict_result:
+                dict_result[child.tag] = converted
+            elif isinstance(dict_result[child.tag], list):
+                dict_result[child.tag].append(converted)
+            else:
+                dict_result[child.tag] = [dict_result[child.tag], converted]
+        dict_result.update(e.attrib)
+        return dict_result
+    if len(e) > 0:  # array case
+        return [_convert_element(child) for child in e]
+    return e.text  # primitive case
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py
index 3a24ee5736d3..f7145442e5d8 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py
@@ -246,7 +246,6 @@ def _complete(
         model: Optional[str] = None,
         **kwargs: Any
     ) -> _models.ChatCompletions:
-        # pylint: disable=too-many-locals
         """Gets chat completions for the provided chat messages.
         Completions support a wide variety of tasks and generate text that continues from or
         "completes"
@@ -335,7 +334,7 @@ def _complete(
         :rtype: ~azure.ai.inference.models.ChatCompletions
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -425,7 +424,7 @@ def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
         :rtype: ~azure.ai.inference.models.ModelInfo
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -556,7 +555,7 @@ def _embed(
         :rtype: ~azure.ai.inference.models.EmbeddingsResult
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -638,7 +637,7 @@ def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
         :rtype: ~azure.ai.inference.models.ModelInfo
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -772,7 +771,7 @@ def _embed(
         :rtype: ~azure.ai.inference.models.EmbeddingsResult
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -854,7 +853,7 @@ def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
         :rtype: ~azure.ai.inference.models.ModelInfo
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py
index 362fa75e2a91..f7dd32510333 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py
@@ -2,1244 +2,13 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 # ------------------------------------
-# pylint: disable=too-many-lines)
 """Customize generated code here.
 
 Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
-
-Why do we patch auto-generated code?
-1. Add support for input argument `model_extras` (all clients)
-2. Add support for function load_client
-3. Add support for setting sticky chat completions/embeddings input arguments in the client constructor
-4. Add support for get_model_info, while caching the result (all clients)
-5. Add support for chat completion streaming (ChatCompletionsClient client only)
-6. Add support for friendly print of result objects (__str__ method) (all clients)
-7. Add support for load() method in ImageUrl class (see /models/_patch.py).
-
 """
-import json
-import logging
-import sys
-
-from io import IOBase
-from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING, Iterable
-
-from azure.core.pipeline import PipelineResponse
-from azure.core.credentials import AzureKeyCredential
-from azure.core.tracing.decorator import distributed_trace
-from azure.core.utils import case_insensitive_dict
-from azure.core.exceptions import (
-    ClientAuthenticationError,
-    HttpResponseError,
-    map_error,
-    ResourceExistsError,
-    ResourceNotFoundError,
-    ResourceNotModifiedError,
-)
-from . import models as _models
-from ._model_base import SdkJSONEncoder, _deserialize
-from ._serialization import Serializer
-from ._operations._operations import (
-    build_chat_completions_complete_request,
-    build_embeddings_embed_request,
-    build_image_embeddings_embed_request,
-)
-from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated
-from ._client import EmbeddingsClient as EmbeddingsClientGenerated
-from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated
-
-if sys.version_info >= (3, 9):
-    from collections.abc import MutableMapping
-else:
-    from typing import MutableMapping  # type: ignore  # pylint: disable=ungrouped-imports
-
-if TYPE_CHECKING:
-    # pylint: disable=unused-import,ungrouped-imports
-    from azure.core.credentials import TokenCredential
-
-JSON = MutableMapping[str, Any]  # pylint: disable=unsubscriptable-object
-_Unset: Any = object()
-
-_SERIALIZER = Serializer()
-_SERIALIZER.client_side_validation = False
-
-_LOGGER = logging.getLogger(__name__)
-
-
-def load_client(
-    endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any
-) -> Union["ChatCompletionsClient", "EmbeddingsClient", "ImageEmbeddingsClient"]:
-    """
-    Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route
-    on the given endpoint, to determine the model type and therefore which client to instantiate.
-    This method will only work when using Serverless API or Managed Compute endpoint.
-    It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a TokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials.TokenCredential
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    :return: The appropriate synchronous client associated with the given endpoint
-    :rtype: ~azure.ai.inference.ChatCompletionsClient or ~azure.ai.inference.EmbeddingsClient
-     or ~azure.ai.inference.ImageEmbeddingsClient
-    :raises ~azure.core.exceptions.HttpResponseError:
-    """
-
-    with ChatCompletionsClient(
-        endpoint, credential, **kwargs
-    ) as client:  # Pick any of the clients, it does not matter.
-        model_info = client.get_model_info()  # type: ignore
-
-    _LOGGER.info("model_info=%s", model_info)
-    if not model_info.model_type:
-        raise ValueError(
-            "The AI model information is missing a value for `model type`. Cannot create an appropriate client."
-        )
-
-    # TODO: Remove "completions" and "embedding" once Mistral Large and Cohere fixes their model type
-    if model_info.model_type in (_models.ModelType.CHAT, "completion"):
-        chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs)
-        chat_completion_client._model_info = (  # pylint: disable=protected-access,attribute-defined-outside-init
-            model_info
-        )
-        return chat_completion_client
-
-    if model_info.model_type in (_models.ModelType.EMBEDDINGS, "embedding"):
-        embedding_client = EmbeddingsClient(endpoint, credential, **kwargs)
-        embedding_client._model_info = model_info  # pylint: disable=protected-access,attribute-defined-outside-init
-        return embedding_client
-
-    if model_info.model_type == _models.ModelType.IMAGE_EMBEDDINGS:
-        image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs)
-        image_embedding_client._model_info = (  # pylint: disable=protected-access,attribute-defined-outside-init
-            model_info
-        )
-        return image_embedding_client
-
-    raise ValueError(f"No client available to support AI model type `{model_info.model_type}`")
-
-
-class ChatCompletionsClient(ChatCompletionsClientGenerated):  # pylint: disable=too-many-instance-attributes
-    """ChatCompletionsClient.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a TokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials.TokenCredential
-    :keyword frequency_penalty: A value that influences the probability of generated tokens
-        appearing based on their cumulative frequency in generated text.
-        Positive values will make tokens less likely to appear as their frequency increases and
-        decrease the likelihood of the model repeating the same statements verbatim.
-        Supported range is [-2, 2].
-        Default value is None.
-    :paramtype frequency_penalty: float
-    :keyword presence_penalty: A value that influences the probability of generated tokens
-        appearing based on their existing
-        presence in generated text.
-        Positive values will make tokens less likely to appear when they already exist and increase
-        the model's likelihood to output new topics.
-        Supported range is [-2, 2].
-        Default value is None.
-    :paramtype presence_penalty: float
-    :keyword temperature: The sampling temperature to use that controls the apparent creativity of
-        generated completions.
-        Higher values will make output more random while lower values will make results more focused
-        and deterministic.
-        It is not recommended to modify temperature and top_p for the same completions request as the
-        interaction of these two settings is difficult to predict.
-        Supported range is [0, 1].
-        Default value is None.
-    :paramtype temperature: float
-    :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value
-        causes the
-        model to consider the results of tokens with the provided probability mass. As an example, a
-        value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be
-        considered.
-        It is not recommended to modify temperature and top_p for the same completions request as the
-        interaction of these two settings is difficult to predict.
-        Supported range is [0, 1].
-        Default value is None.
-    :paramtype top_p: float
-    :keyword max_tokens: The maximum number of tokens to generate. Default value is None.
-    :paramtype max_tokens: int
-    :keyword response_format: The format that the model must output. Use this to enable JSON mode
-        instead of the default text mode.
-        Note that to enable JSON mode, some AI models may also require you to instruct the model to
-        produce JSON via a system or user message. Default value is None.
-    :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat
-    :keyword stop: A collection of textual sequences that will end completions generation. Default
-        value is None.
-    :paramtype stop: list[str]
-    :keyword tools: The available tool definitions that the chat completions request can use,
-        including caller-defined functions. Default value is None.
-    :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition]
-    :keyword tool_choice: If specified, the model will configure which of the provided tools it can
-        use for the chat completions response. Is either a Union[str,
-        "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type.
-        Default value is None.
-    :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or
-        ~azure.ai.inference.models.ChatCompletionsNamedToolChoice
-    :keyword seed: If specified, the system will make a best effort to sample deterministically
-        such that repeated requests with the
-        same seed and parameters should return the same result. Determinism is not guaranteed.
-        Default value is None.
-    :paramtype seed: int
-    :keyword model: ID of the specific AI model to use, if more than one model is available on the
-        endpoint. Default value is None.
-    :paramtype model: str
-    :keyword model_extras: Additional, model-specific parameters that are not in the
-        standard request payload. They will be added as-is to the root of the JSON in the request body.
-        How the service handles these extra parameters depends on the value of the
-        ``extra-parameters`` request header. Default value is None.
-    :paramtype model_extras: dict[str, Any]
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    """
-
-    def __init__(
-        self,
-        endpoint: str,
-        credential: Union[AzureKeyCredential, "TokenCredential"],
-        *,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        self._model_info: Optional[_models.ModelInfo] = None
-
-        # Store default chat completions settings, to be applied in all future service calls
-        # unless overridden by arguments in the `complete` method.
-        self._frequency_penalty = frequency_penalty
-        self._presence_penalty = presence_penalty
-        self._temperature = temperature
-        self._top_p = top_p
-        self._max_tokens = max_tokens
-        self._response_format = response_format
-        self._stop = stop
-        self._tools = tools
-        self._tool_choice = tool_choice
-        self._seed = seed
-        self._model = model
-        self._model_extras = model_extras
-
-        super().__init__(endpoint, credential, **kwargs)
-
-    @overload
-    def complete(
-        self,
-        *,
-        messages: List[_models.ChatRequestMessage],
-        stream: Literal[False] = False,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.ChatCompletions: ...
-
-    @overload
-    def complete(
-        self,
-        *,
-        messages: List[_models.ChatRequestMessage],
-        stream: Literal[True],
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> Iterable[_models.StreamingChatCompletionsUpdate]: ...
-
-    @overload
-    def complete(
-        self,
-        *,
-        messages: List[_models.ChatRequestMessage],
-        stream: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route
-        on the given endpoint.
-        When using this method with `stream=True`, the response is streamed
-        back to the client. Iterate over the resulting StreamingChatCompletions
-        object to get content updates as they arrive. By default, the response is a ChatCompletions object
-        (non-streaming).
-
-        :keyword messages: The collection of context messages associated with this chat completions
-         request.
-         Typical usage begins with a chat message for the System role that provides instructions for
-         the behavior of the assistant, followed by alternating messages between the User and
-         Assistant roles. Required.
-        :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage]
-        :keyword stream: A value indicating whether chat completions should be streamed for this request.
-         Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions.
-         Otherwise the response will be a ChatCompletions.
-        :paramtype stream: bool
-        :keyword frequency_penalty: A value that influences the probability of generated tokens
-         appearing based on their cumulative frequency in generated text.
-         Positive values will make tokens less likely to appear as their frequency increases and
-         decrease the likelihood of the model repeating the same statements verbatim.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype frequency_penalty: float
-        :keyword presence_penalty: A value that influences the probability of generated tokens
-         appearing based on their existing
-         presence in generated text.
-         Positive values will make tokens less likely to appear when they already exist and increase
-         the model's likelihood to output new topics.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype presence_penalty: float
-        :keyword temperature: The sampling temperature to use that controls the apparent creativity of
-         generated completions.
-         Higher values will make output more random while lower values will make results more focused
-         and deterministic.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype temperature: float
-        :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value
-         causes the
-         model to consider the results of tokens with the provided probability mass. As an example, a
-         value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be
-         considered.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype top_p: float
-        :keyword max_tokens: The maximum number of tokens to generate. Default value is None.
-        :paramtype max_tokens: int
-        :keyword response_format: The format that the model must output. Use this to enable JSON mode
-         instead of the default text mode.
-         Note that to enable JSON mode, some AI models may also require you to instruct the model to
-         produce JSON via a system or user message. Default value is None.
-        :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat
-        :keyword stop: A collection of textual sequences that will end completions generation. Default
-         value is None.
-        :paramtype stop: list[str]
-        :keyword tools: The available tool definitions that the chat completions request can use,
-         including caller-defined functions. Default value is None.
-        :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition]
-        :keyword tool_choice: If specified, the model will configure which of the provided tools it can
-         use for the chat completions response. Is either a Union[str,
-         "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type.
-         Default value is None.
-        :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or
-         ~azure.ai.inference.models.ChatCompletionsNamedToolChoice
-        :keyword seed: If specified, the system will make a best effort to sample deterministically
-         such that repeated requests with the
-         same seed and parameters should return the same result. Determinism is not guaranteed.
-         Default value is None.
-        :paramtype seed: int
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    def complete(
-        self,
-        body: JSON,
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data.
-
-        :param body: An object of type MutableMapping[str, Any], such as a dictionary, that
-         specifies the full request payload. Required.
-        :type body: JSON
-        :keyword content_type: Body Parameter content-type. Content type parameter for JSON body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    def complete(
-        self,
-        body: IO[bytes],
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        # pylint: disable=too-many-locals
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data.
-
-        :param body: Specifies the full request payload. Required.
-        :type body: IO[bytes]
-        :keyword content_type: Body Parameter content-type. Content type parameter for binary body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @distributed_trace
-    def complete(
-        self,
-        body: Union[JSON, IO[bytes]] = _Unset,
-        *,
-        messages: List[_models.ChatRequestMessage] = _Unset,
-        stream: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        # pylint: disable=too-many-locals
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data. When using this method with `stream=True`, the response is streamed
-        back to the client. Iterate over the resulting :class:`~azure.ai.inference.models.StreamingChatCompletions`
-        object to get content updates as they arrive.
-
-        :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type
-         that specifies the full request payload. Required.
-        :type body: JSON or IO[bytes]
-        :keyword messages: The collection of context messages associated with this chat completions
-         request.
-         Typical usage begins with a chat message for the System role that provides instructions for
-         the behavior of the assistant, followed by alternating messages between the User and
-         Assistant roles. Required.
-        :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage]
-        :keyword stream: A value indicating whether chat completions should be streamed for this request.
-         Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions.
-         Otherwise the response will be a ChatCompletions.
-        :paramtype stream: bool
-        :keyword frequency_penalty: A value that influences the probability of generated tokens
-         appearing based on their cumulative frequency in generated text.
-         Positive values will make tokens less likely to appear as their frequency increases and
-         decrease the likelihood of the model repeating the same statements verbatim.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype frequency_penalty: float
-        :keyword presence_penalty: A value that influences the probability of generated tokens
-         appearing based on their existing
-         presence in generated text.
-         Positive values will make tokens less likely to appear when they already exist and increase
-         the model's likelihood to output new topics.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype presence_penalty: float
-        :keyword temperature: The sampling temperature to use that controls the apparent creativity of
-         generated completions.
-         Higher values will make output more random while lower values will make results more focused
-         and deterministic.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype temperature: float
-        :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value
-         causes the
-         model to consider the results of tokens with the provided probability mass. As an example, a
-         value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be
-         considered.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype top_p: float
-        :keyword max_tokens: The maximum number of tokens to generate. Default value is None.
-        :paramtype max_tokens: int
-        :keyword response_format: The format that the model must output. Use this to enable JSON mode
-         instead of the default text mode.
-         Note that to enable JSON mode, some AI models may also require you to instruct the model to
-         produce JSON via a system or user message. Default value is None.
-        :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat
-        :keyword stop: A collection of textual sequences that will end completions generation. Default
-         value is None.
-        :paramtype stop: list[str]
-        :keyword tools: The available tool definitions that the chat completions request can use,
-         including caller-defined functions. Default value is None.
-        :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition]
-        :keyword tool_choice: If specified, the model will configure which of the provided tools it can
-         use for the chat completions response. Is either a Union[str,
-         "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type.
-         Default value is None.
-        :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or
-         ~azure.ai.inference.models.ChatCompletionsNamedToolChoice
-        :keyword seed: If specified, the system will make a best effort to sample deterministically
-         such that repeated requests with the
-         same seed and parameters should return the same result. Determinism is not guaranteed.
-         Default value is None.
-        :paramtype seed: int
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        error_map = {
-            401: ClientAuthenticationError,
-            404: ResourceNotFoundError,
-            409: ResourceExistsError,
-            304: ResourceNotModifiedError,
-        }
-        error_map.update(kwargs.pop("error_map", {}) or {})
-
-        _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
-        _params = kwargs.pop("params", {}) or {}
-        _extra_parameters: Union[_models._enums.ExtraParameters, None] = None
-
-        content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None))
-
-        if body is _Unset:
-            if messages is _Unset:
-                raise TypeError("missing required argument: messages")
-            body = {
-                "messages": messages,
-                "stream": stream,
-                "frequency_penalty": frequency_penalty if frequency_penalty is not None else self._frequency_penalty,
-                "max_tokens": max_tokens if max_tokens is not None else self._max_tokens,
-                "model": model if model is not None else self._model,
-                "presence_penalty": presence_penalty if presence_penalty is not None else self._presence_penalty,
-                "response_format": response_format if response_format is not None else self._response_format,
-                "seed": seed if seed is not None else self._seed,
-                "stop": stop if stop is not None else self._stop,
-                "temperature": temperature if temperature is not None else self._temperature,
-                "tool_choice": tool_choice if tool_choice is not None else self._tool_choice,
-                "tools": tools if tools is not None else self._tools,
-                "top_p": top_p if top_p is not None else self._top_p,
-            }
-            if model_extras is not None and bool(model_extras):
-                body.update(model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            elif self._model_extras is not None and bool(self._model_extras):
-                body.update(self._model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            body = {k: v for k, v in body.items() if v is not None}
-        elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool):
-            stream = body["stream"]
-        content_type = content_type or "application/json"
-        _content = None
-        if isinstance(body, (IOBase, bytes)):
-            _content = body
-        else:
-            _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)  # type: ignore
-
-        _request = build_chat_completions_complete_request(
-            extra_params=_extra_parameters,
-            content_type=content_type,
-            api_version=self._config.api_version,
-            content=_content,
-            headers=_headers,
-            params=_params,
-        )
-        path_format_arguments = {
-            "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
-        }
-        _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
-        _stream = stream or False
-        pipeline_response: PipelineResponse = self._client._pipeline.run(  # pylint: disable=protected-access
-            _request, stream=_stream, **kwargs
-        )
-
-        response = pipeline_response.http_response
-
-        if response.status_code not in [200]:
-            if _stream:
-                response.read()  # Load the body in memory and close the socket
-            map_error(status_code=response.status_code, response=response, error_map=error_map)
-            raise HttpResponseError(response=response)
-
-        if _stream:
-            return _models.StreamingChatCompletions(response)
-
-        return _deserialize(_models._patch.ChatCompletions, response.json())  # pylint: disable=protected-access
-
-    @distributed_trace
-    def get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
-        # pylint: disable=line-too-long
-        """Returns information about the AI model.
-        The method makes a REST API call to the ``/info`` route on the given endpoint.
-        This method will only work when using Serverless API or Managed Compute endpoint.
-        It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-        :return: ModelInfo. The ModelInfo is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.ModelInfo
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        if not self._model_info:
-            self._model_info = self._get_model_info(**kwargs)  # pylint: disable=attribute-defined-outside-init
-        return self._model_info
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__()
-
-
-class EmbeddingsClient(EmbeddingsClientGenerated):
-    """EmbeddingsClient.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a TokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials.TokenCredential
-    :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-        have. Default value is None.
-    :paramtype dimensions: int
-    :keyword encoding_format: Optional. The desired format for the returned embeddings.
-        Known values are:
-        "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-    :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-    :keyword input_type: Optional. The type of the input. Known values are:
-        "text", "query", and "document". Default value is None.
-    :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-    :keyword model: ID of the specific AI model to use, if more than one model is available on the
-        endpoint. Default value is None.
-    :paramtype model: str
-    :keyword model_extras: Additional, model-specific parameters that are not in the
-        standard request payload. They will be added as-is to the root of the JSON in the request body.
-        How the service handles these extra parameters depends on the value of the
-        ``extra-parameters`` request header. Default value is None.
-    :paramtype model_extras: dict[str, Any]
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    """
-
-    def __init__(
-        self,
-        endpoint: str,
-        credential: Union[AzureKeyCredential, "TokenCredential"],
-        *,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        self._model_info: Optional[_models.ModelInfo] = None
-
-        # Store default embeddings settings, to be applied in all future service calls
-        # unless overridden by arguments in the `embed` method.
-        self._dimensions = dimensions
-        self._encoding_format = encoding_format
-        self._input_type = input_type
-        self._model = model
-        self._model_extras = model_extras
-
-        super().__init__(endpoint, credential, **kwargs)
-
-    @overload
-    def embed(
-        self,
-        *,
-        input: List[str],
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :keyword input: Input text to embed, encoded as a string or array of tokens.
-         To embed multiple inputs in a single request, pass an array
-         of strings or array of token arrays. Required.
-        :paramtype input: list[str]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    def embed(
-        self,
-        body: JSON,
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :param body: An object of type MutableMapping[str, Any], such as a dictionary, that
-         specifies the full request payload. Required.
-        :type body: JSON
-        :keyword content_type: Body Parameter content-type. Content type parameter for JSON body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    def embed(
-        self,
-        body: IO[bytes],
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :param body: Specifies the full request payload. Required.
-        :type body: IO[bytes]
-        :keyword content_type: Body Parameter content-type. Content type parameter for binary body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @distributed_trace
-    def embed(
-        self,
-        body: Union[JSON, IO[bytes]] = _Unset,
-        *,
-        input: List[str] = _Unset,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        # pylint: disable=line-too-long
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type
-         that specifies the full request payload. Required.
-        :type body: JSON or IO[bytes]
-        :keyword input: Input text to embed, encoded as a string or array of tokens.
-         To embed multiple inputs in a single request, pass an array
-         of strings or array of token arrays. Required.
-        :paramtype input: list[str]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
-            401: ClientAuthenticationError,
-            404: ResourceNotFoundError,
-            409: ResourceExistsError,
-            304: ResourceNotModifiedError,
-        }
-        error_map.update(kwargs.pop("error_map", {}) or {})
-
-        _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
-        _params = kwargs.pop("params", {}) or {}
-        _extra_parameters: Union[_models._enums.ExtraParameters, None] = None
-
-        content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None))
-
-        if body is _Unset:
-            if input is _Unset:
-                raise TypeError("missing required argument: input")
-            body = {
-                "input": input,
-                "dimensions": dimensions if dimensions is not None else self._dimensions,
-                "encoding_format": encoding_format if encoding_format is not None else self._encoding_format,
-                "input_type": input_type if input_type is not None else self._input_type,
-                "model": model if model is not None else self._model,
-            }
-            if model_extras is not None and bool(model_extras):
-                body.update(model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            elif self._model_extras is not None and bool(self._model_extras):
-                body.update(self._model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            body = {k: v for k, v in body.items() if v is not None}
-        content_type = content_type or "application/json"
-        _content = None
-        if isinstance(body, (IOBase, bytes)):
-            _content = body
-        else:
-            _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)  # type: ignore
-
-        _request = build_embeddings_embed_request(
-            extra_params=_extra_parameters,
-            content_type=content_type,
-            api_version=self._config.api_version,
-            content=_content,
-            headers=_headers,
-            params=_params,
-        )
-        path_format_arguments = {
-            "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
-        }
-        _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
-        _stream = kwargs.pop("stream", False)
-        pipeline_response: PipelineResponse = self._client._pipeline.run(  # pylint: disable=protected-access
-            _request, stream=_stream, **kwargs
-        )
-
-        response = pipeline_response.http_response
-
-        if response.status_code not in [200]:
-            if _stream:
-                response.read()  # Load the body in memory and close the socket
-            map_error(status_code=response.status_code, response=response, error_map=error_map)
-            raise HttpResponseError(response=response)
-
-        if _stream:
-            deserialized = response.iter_bytes()
-        else:
-            deserialized = _deserialize(
-                _models._patch.EmbeddingsResult, response.json()  # pylint: disable=protected-access
-            )
-
-        return deserialized  # type: ignore
-
-    @distributed_trace
-    def get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
-        # pylint: disable=line-too-long
-        """Returns information about the AI model.
-        The method makes a REST API call to the ``/info`` route on the given endpoint.
-        This method will only work when using Serverless API or Managed Compute endpoint.
-        It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-        :return: ModelInfo. The ModelInfo is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.ModelInfo
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        if not self._model_info:
-            self._model_info = self._get_model_info(**kwargs)  # pylint: disable=attribute-defined-outside-init
-        return self._model_info
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__()
-
-
-class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated):
-    """ImageEmbeddingsClient.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a TokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials.TokenCredential
-    :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-        have. Default value is None.
-    :paramtype dimensions: int
-    :keyword encoding_format: Optional. The desired format for the returned embeddings.
-        Known values are:
-        "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-    :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-    :keyword input_type: Optional. The type of the input. Known values are:
-        "text", "query", and "document". Default value is None.
-    :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-    :keyword model: ID of the specific AI model to use, if more than one model is available on the
-        endpoint. Default value is None.
-    :paramtype model: str
-    :keyword model_extras: Additional, model-specific parameters that are not in the
-        standard request payload. They will be added as-is to the root of the JSON in the request body.
-        How the service handles these extra parameters depends on the value of the
-        ``extra-parameters`` request header. Default value is None.
-    :paramtype model_extras: dict[str, Any]
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    """
-
-    def __init__(
-        self,
-        endpoint: str,
-        credential: Union[AzureKeyCredential, "TokenCredential"],
-        *,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        self._model_info: Optional[_models.ModelInfo] = None
-
-        # Store default embeddings settings, to be applied in all future service calls
-        # unless overridden by arguments in the `embed` method.
-        self._dimensions = dimensions
-        self._encoding_format = encoding_format
-        self._input_type = input_type
-        self._model = model
-        self._model_extras = model_extras
-
-        super().__init__(endpoint, credential, **kwargs)
-
-    @overload
-    def embed(
-        self,
-        *,
-        input: List[_models.EmbeddingInput],
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an
-         array.
-         The input must not exceed the max input tokens for the model. Required.
-        :paramtype input: list[~azure.ai.inference.models.EmbeddingInput]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    def embed(
-        self,
-        body: JSON,
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :param body: An object of type MutableMapping[str, Any], such as a dictionary, that
-         specifies the full request payload. Required.
-        :type body: JSON
-        :keyword content_type: Body Parameter content-type. Content type parameter for JSON body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    def embed(
-        self,
-        body: IO[bytes],
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :param body: Specifies the full request payload. Required.
-        :type body: IO[bytes]
-        :keyword content_type: Body Parameter content-type. Content type parameter for binary body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @distributed_trace
-    def embed(
-        self,
-        body: Union[JSON, IO[bytes]] = _Unset,
-        *,
-        input: List[_models.EmbeddingInput] = _Unset,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        # pylint: disable=line-too-long
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type
-         that specifies the full request payload. Required.
-        :type body: JSON or IO[bytes]
-        :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an
-         array.
-         The input must not exceed the max input tokens for the model. Required.
-        :paramtype input: list[~azure.ai.inference.models.EmbeddingInput]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
-            401: ClientAuthenticationError,
-            404: ResourceNotFoundError,
-            409: ResourceExistsError,
-            304: ResourceNotModifiedError,
-        }
-        error_map.update(kwargs.pop("error_map", {}) or {})
-
-        _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
-        _params = kwargs.pop("params", {}) or {}
-        _extra_parameters: Union[_models._enums.ExtraParameters, None] = None
-
-        content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None))
-
-        if body is _Unset:
-            if input is _Unset:
-                raise TypeError("missing required argument: input")
-            body = {
-                "input": input,
-                "dimensions": dimensions if dimensions is not None else self._dimensions,
-                "encoding_format": encoding_format if encoding_format is not None else self._encoding_format,
-                "input_type": input_type if input_type is not None else self._input_type,
-                "model": model if model is not None else self._model,
-            }
-            if model_extras is not None and bool(model_extras):
-                body.update(model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            elif self._model_extras is not None and bool(self._model_extras):
-                body.update(self._model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            body = {k: v for k, v in body.items() if v is not None}
-        content_type = content_type or "application/json"
-        _content = None
-        if isinstance(body, (IOBase, bytes)):
-            _content = body
-        else:
-            _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)  # type: ignore
-
-        _request = build_image_embeddings_embed_request(
-            extra_params=_extra_parameters,
-            content_type=content_type,
-            api_version=self._config.api_version,
-            content=_content,
-            headers=_headers,
-            params=_params,
-        )
-        path_format_arguments = {
-            "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
-        }
-        _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
-        _stream = kwargs.pop("stream", False)
-        pipeline_response: PipelineResponse = self._client._pipeline.run(  # pylint: disable=protected-access
-            _request, stream=_stream, **kwargs
-        )
-
-        response = pipeline_response.http_response
-
-        if response.status_code not in [200]:
-            if _stream:
-                response.read()  # Load the body in memory and close the socket
-            map_error(status_code=response.status_code, response=response, error_map=error_map)
-            raise HttpResponseError(response=response)
-
-        if _stream:
-            deserialized = response.iter_bytes()
-        else:
-            deserialized = _deserialize(
-                _models._patch.EmbeddingsResult, response.json()  # pylint: disable=protected-access
-            )
-
-        return deserialized  # type: ignore
-
-    @distributed_trace
-    def get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
-        # pylint: disable=line-too-long
-        """Returns information about the AI model.
-        The method makes a REST API call to the ``/info`` route on the given endpoint.
-        This method will only work when using Serverless API or Managed Compute endpoint.
-        It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-        :return: ModelInfo. The ModelInfo is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.ModelInfo
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        if not self._model_info:
-            self._model_info = self._get_model_info(**kwargs)  # pylint: disable=attribute-defined-outside-init
-        return self._model_info
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__()
-
+from typing import List
 
-__all__: List[str] = [
-    "load_client",
-    "ChatCompletionsClient",
-    "EmbeddingsClient",
-    "ImageEmbeddingsClient",
-]  # Add all objects you want publicly available to users at this package level
+__all__: List[str] = []  # Add all objects you want publicly available to users at this package level
 
 
 def patch_sdk():
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py
index 8139854b97bb..01a226bd7f14 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py
@@ -1,3 +1,4 @@
+# pylint: disable=too-many-lines
 # --------------------------------------------------------------------------
 #
 # Copyright (c) Microsoft Corporation. All rights reserved.
@@ -24,7 +25,6 @@
 #
 # --------------------------------------------------------------------------
 
-# pylint: skip-file
 # pyright: reportUnnecessaryTypeIgnoreComment=false
 
 from base64 import b64decode, b64encode
@@ -52,7 +52,6 @@
     MutableMapping,
     Type,
     List,
-    Mapping,
 )
 
 try:
@@ -91,6 +90,8 @@ def deserialize_from_text(cls, data: Optional[Union[AnyStr, IO]], content_type:
         :param data: Input, could be bytes or stream (will be decoded with UTF8) or text
         :type data: str or bytes or IO
         :param str content_type: The content type.
+        :return: The deserialized data.
+        :rtype: object
         """
         if hasattr(data, "read"):
             # Assume a stream
@@ -112,7 +113,7 @@ def deserialize_from_text(cls, data: Optional[Union[AnyStr, IO]], content_type:
             try:
                 return json.loads(data_as_str)
             except ValueError as err:
-                raise DeserializationError("JSON is invalid: {}".format(err), err)
+                raise DeserializationError("JSON is invalid: {}".format(err), err) from err
         elif "xml" in (content_type or []):
             try:
 
@@ -155,6 +156,11 @@ def deserialize_from_http_generics(cls, body_bytes: Optional[Union[AnyStr, IO]],
         Use bytes and headers to NOT use any requests/aiohttp or whatever
         specific implementation.
         Headers will tested for "content-type"
+
+        :param bytes body_bytes: The body of the response.
+        :param dict headers: The headers of the response.
+        :returns: The deserialized data.
+        :rtype: object
         """
         # Try to use content-type from headers if available
         content_type = None
@@ -184,15 +190,30 @@ class UTC(datetime.tzinfo):
     """Time Zone info for handling UTC"""
 
     def utcoffset(self, dt):
-        """UTF offset for UTC is 0."""
+        """UTC offset for UTC is 0.
+
+        :param datetime.datetime dt: The datetime
+        :returns: The offset
+        :rtype: datetime.timedelta
+        """
         return datetime.timedelta(0)
 
     def tzname(self, dt):
-        """Timestamp representation."""
+        """Timestamp representation.
+
+        :param datetime.datetime dt: The datetime
+        :returns: The timestamp representation
+        :rtype: str
+        """
         return "Z"
 
     def dst(self, dt):
-        """No daylight saving for UTC."""
+        """No daylight saving for UTC.
+
+        :param datetime.datetime dt: The datetime
+        :returns: The daylight saving time
+        :rtype: datetime.timedelta
+        """
         return datetime.timedelta(hours=1)
 
 
@@ -235,24 +256,26 @@ def __getinitargs__(self):
 _FLATTEN = re.compile(r"(?<!\\)\.")
 
 
-def attribute_transformer(key, attr_desc, value):
+def attribute_transformer(key, attr_desc, value):  # pylint: disable=unused-argument
     """A key transformer that returns the Python attribute.
 
     :param str key: The attribute name
     :param dict attr_desc: The attribute metadata
     :param object value: The value
     :returns: A key using attribute name
+    :rtype: tuple
     """
     return (key, value)
 
 
-def full_restapi_key_transformer(key, attr_desc, value):
+def full_restapi_key_transformer(key, attr_desc, value):  # pylint: disable=unused-argument
     """A key transformer that returns the full RestAPI key path.
 
-    :param str _: The attribute name
+    :param str key: The attribute name
     :param dict attr_desc: The attribute metadata
     :param object value: The value
     :returns: A list of keys using RestAPI syntax.
+    :rtype: tuple
     """
     keys = _FLATTEN.split(attr_desc["key"])
     return ([_decode_attribute_map_key(k) for k in keys], value)
@@ -265,19 +288,26 @@ def last_restapi_key_transformer(key, attr_desc, value):
     :param dict attr_desc: The attribute metadata
     :param object value: The value
     :returns: The last RestAPI key.
+    :rtype: tuple
     """
     key, value = full_restapi_key_transformer(key, attr_desc, value)
     return (key[-1], value)
 
 
 def _create_xml_node(tag, prefix=None, ns=None):
-    """Create a XML node."""
+    """Create an XML node.
+
+    :param str tag: The tag name
+    :param str prefix: The prefix
+    :param str ns: The namespace
+    :return: The XML node
+    :rtype: xml.etree.ElementTree.Element
+    """
     if prefix and ns:
         ET.register_namespace(prefix, ns)
     if ns:
         return ET.Element("{" + ns + "}" + tag)
-    else:
-        return ET.Element(tag)
+    return ET.Element(tag)
 
 
 class Model(object):
@@ -291,7 +321,7 @@ class Model(object):
 
     def __init__(self, **kwargs: Any) -> None:
         self.additional_properties: Optional[Dict[str, Any]] = {}
-        for k in kwargs:
+        for k in kwargs:  # pylint: disable=consider-using-dict-items
             if k not in self._attribute_map:
                 _LOGGER.warning("%s is not a known attribute of class %s and will be ignored", k, self.__class__)
             elif k in self._validation and self._validation[k].get("readonly", False):
@@ -300,13 +330,23 @@ def __init__(self, **kwargs: Any) -> None:
                 setattr(self, k, kwargs[k])
 
     def __eq__(self, other: Any) -> bool:
-        """Compare objects by comparing all attributes."""
+        """Compare objects by comparing all attributes.
+
+        :param object other: The object to compare
+        :returns: True if objects are equal
+        :rtype: bool
+        """
         if isinstance(other, self.__class__):
             return self.__dict__ == other.__dict__
         return False
 
     def __ne__(self, other: Any) -> bool:
-        """Compare objects by comparing all attributes."""
+        """Compare objects by comparing all attributes.
+
+        :param object other: The object to compare
+        :returns: True if objects are not equal
+        :rtype: bool
+        """
         return not self.__eq__(other)
 
     def __str__(self) -> str:
@@ -326,7 +366,11 @@ def is_xml_model(cls) -> bool:
 
     @classmethod
     def _create_xml_node(cls):
-        """Create XML node."""
+        """Create XML node.
+
+        :returns: The XML node
+        :rtype: xml.etree.ElementTree.Element
+        """
         try:
             xml_map = cls._xml_map  # type: ignore
         except AttributeError:
@@ -346,7 +390,9 @@ def serialize(self, keep_readonly: bool = False, **kwargs: Any) -> JSON:
         :rtype: dict
         """
         serializer = Serializer(self._infer_class_models())
-        return serializer._serialize(self, keep_readonly=keep_readonly, **kwargs)  # type: ignore
+        return serializer._serialize(  # type: ignore # pylint: disable=protected-access
+            self, keep_readonly=keep_readonly, **kwargs
+        )
 
     def as_dict(
         self,
@@ -380,12 +426,15 @@ def my_key_transformer(key, attr_desc, value):
 
         If you want XML serialization, you can pass the kwargs is_xml=True.
 
+        :param bool keep_readonly: If you want to serialize the readonly attributes
         :param function key_transformer: A key transformer function.
         :returns: A dict JSON compatible object
         :rtype: dict
         """
         serializer = Serializer(self._infer_class_models())
-        return serializer._serialize(self, key_transformer=key_transformer, keep_readonly=keep_readonly, **kwargs)  # type: ignore
+        return serializer._serialize(  # type: ignore # pylint: disable=protected-access
+            self, key_transformer=key_transformer, keep_readonly=keep_readonly, **kwargs
+        )
 
     @classmethod
     def _infer_class_models(cls):
@@ -395,7 +444,7 @@ def _infer_class_models(cls):
             client_models = {k: v for k, v in models.__dict__.items() if isinstance(v, type)}
             if cls.__name__ not in client_models:
                 raise ValueError("Not Autorest generated code")
-        except Exception:
+        except Exception:  # pylint: disable=broad-exception-caught
             # Assume it's not Autorest generated (tests?). Add ourselves as dependencies.
             client_models = {cls.__name__: cls}
         return client_models
@@ -408,6 +457,7 @@ def deserialize(cls: Type[ModelType], data: Any, content_type: Optional[str] = N
         :param str content_type: JSON by default, set application/xml if XML.
         :returns: An instance of this model
         :raises: DeserializationError if something went wrong
+        :rtype: ModelType
         """
         deserializer = Deserializer(cls._infer_class_models())
         return deserializer(cls.__name__, data, content_type=content_type)  # type: ignore
@@ -426,9 +476,11 @@ def from_dict(
         and last_rest_key_case_insensitive_extractor)
 
         :param dict data: A dict using RestAPI structure
+        :param function key_extractors: A key extractor function.
         :param str content_type: JSON by default, set application/xml if XML.
         :returns: An instance of this model
         :raises: DeserializationError if something went wrong
+        :rtype: ModelType
         """
         deserializer = Deserializer(cls._infer_class_models())
         deserializer.key_extractors = (  # type: ignore
@@ -448,7 +500,7 @@ def _flatten_subtype(cls, key, objects):
             return {}
         result = dict(cls._subtype_map[key])
         for valuetype in cls._subtype_map[key].values():
-            result.update(objects[valuetype]._flatten_subtype(key, objects))
+            result.update(objects[valuetype]._flatten_subtype(key, objects))  # pylint: disable=protected-access
         return result
 
     @classmethod
@@ -456,6 +508,11 @@ def _classify(cls, response, objects):
         """Check the class _subtype_map for any child classes.
         We want to ignore any inherited _subtype_maps.
         Remove the polymorphic key from the initial data.
+
+        :param dict response: The initial data
+        :param dict objects: The class objects
+        :returns: The class to be used
+        :rtype: class
         """
         for subtype_key in cls.__dict__.get("_subtype_map", {}).keys():
             subtype_value = None
@@ -501,11 +558,13 @@ def _decode_attribute_map_key(key):
     inside the received data.
 
     :param str key: A key string from the generated code
+    :returns: The decoded key
+    :rtype: str
     """
     return key.replace("\\.", ".")
 
 
-class Serializer(object):
+class Serializer(object):  # pylint: disable=too-many-public-methods
     """Request object model serializer."""
 
     basic_types = {str: "str", int: "int", bool: "bool", float: "float"}
@@ -560,13 +619,16 @@ def __init__(self, classes: Optional[Mapping[str, type]] = None):
         self.key_transformer = full_restapi_key_transformer
         self.client_side_validation = True
 
-    def _serialize(self, target_obj, data_type=None, **kwargs):
+    def _serialize(  # pylint: disable=too-many-nested-blocks, too-many-branches, too-many-statements, too-many-locals
+        self, target_obj, data_type=None, **kwargs
+    ):
         """Serialize data into a string according to type.
 
-        :param target_obj: The data to be serialized.
+        :param object target_obj: The data to be serialized.
         :param str data_type: The type to be serialized from.
         :rtype: str, dict
         :raises: SerializationError if serialization fails.
+        :returns: The serialized data.
         """
         key_transformer = kwargs.get("key_transformer", self.key_transformer)
         keep_readonly = kwargs.get("keep_readonly", False)
@@ -592,12 +654,14 @@ def _serialize(self, target_obj, data_type=None, **kwargs):
 
         serialized = {}
         if is_xml_model_serialization:
-            serialized = target_obj._create_xml_node()
+            serialized = target_obj._create_xml_node()  # pylint: disable=protected-access
         try:
-            attributes = target_obj._attribute_map
+            attributes = target_obj._attribute_map  # pylint: disable=protected-access
             for attr, attr_desc in attributes.items():
                 attr_name = attr
-                if not keep_readonly and target_obj._validation.get(attr_name, {}).get("readonly", False):
+                if not keep_readonly and target_obj._validation.get(  # pylint: disable=protected-access
+                    attr_name, {}
+                ).get("readonly", False):
                     continue
 
                 if attr_name == "additional_properties" and attr_desc["key"] == "":
@@ -633,7 +697,8 @@ def _serialize(self, target_obj, data_type=None, **kwargs):
                         if isinstance(new_attr, list):
                             serialized.extend(new_attr)  # type: ignore
                         elif isinstance(new_attr, ET.Element):
-                            # If the down XML has no XML/Name, we MUST replace the tag with the local tag. But keeping the namespaces.
+                            # If the down XML has no XML/Name,
+                            # we MUST replace the tag with the local tag. But keeping the namespaces.
                             if "name" not in getattr(orig_attr, "_xml_map", {}):
                                 splitted_tag = new_attr.tag.split("}")
                                 if len(splitted_tag) == 2:  # Namespace
@@ -664,17 +729,17 @@ def _serialize(self, target_obj, data_type=None, **kwargs):
         except (AttributeError, KeyError, TypeError) as err:
             msg = "Attribute {} in object {} cannot be serialized.\n{}".format(attr_name, class_name, str(target_obj))
             raise SerializationError(msg) from err
-        else:
-            return serialized
+        return serialized
 
     def body(self, data, data_type, **kwargs):
         """Serialize data intended for a request body.
 
-        :param data: The data to be serialized.
+        :param object data: The data to be serialized.
         :param str data_type: The type to be serialized from.
         :rtype: dict
         :raises: SerializationError if serialization fails.
         :raises: ValueError if data is None
+        :returns: The serialized request body
         """
 
         # Just in case this is a dict
@@ -703,7 +768,7 @@ def body(self, data, data_type, **kwargs):
                         attribute_key_case_insensitive_extractor,
                         last_rest_key_case_insensitive_extractor,
                     ]
-                data = deserializer._deserialize(data_type, data)
+                data = deserializer._deserialize(data_type, data)  # pylint: disable=protected-access
             except DeserializationError as err:
                 raise SerializationError("Unable to build a model: " + str(err)) from err
 
@@ -712,9 +777,11 @@ def body(self, data, data_type, **kwargs):
     def url(self, name, data, data_type, **kwargs):
         """Serialize data intended for a URL path.
 
-        :param data: The data to be serialized.
+        :param str name: The name of the URL path parameter.
+        :param object data: The data to be serialized.
         :param str data_type: The type to be serialized from.
         :rtype: str
+        :returns: The serialized URL path
         :raises: TypeError if serialization fails.
         :raises: ValueError if data is None
         """
@@ -728,21 +795,20 @@ def url(self, name, data, data_type, **kwargs):
                 output = output.replace("{", quote("{")).replace("}", quote("}"))
             else:
                 output = quote(str(output), safe="")
-        except SerializationError:
-            raise TypeError("{} must be type {}.".format(name, data_type))
-        else:
-            return output
+        except SerializationError as exc:
+            raise TypeError("{} must be type {}.".format(name, data_type)) from exc
+        return output
 
     def query(self, name, data, data_type, **kwargs):
         """Serialize data intended for a URL query.
 
-        :param data: The data to be serialized.
+        :param str name: The name of the query parameter.
+        :param object data: The data to be serialized.
         :param str data_type: The type to be serialized from.
-        :keyword bool skip_quote: Whether to skip quote the serialized result.
-        Defaults to False.
         :rtype: str, list
         :raises: TypeError if serialization fails.
         :raises: ValueError if data is None
+        :returns: The serialized query parameter
         """
         try:
             # Treat the list aside, since we don't want to encode the div separator
@@ -759,19 +825,20 @@ def query(self, name, data, data_type, **kwargs):
                 output = str(output)
             else:
                 output = quote(str(output), safe="")
-        except SerializationError:
-            raise TypeError("{} must be type {}.".format(name, data_type))
-        else:
-            return str(output)
+        except SerializationError as exc:
+            raise TypeError("{} must be type {}.".format(name, data_type)) from exc
+        return str(output)
 
     def header(self, name, data, data_type, **kwargs):
         """Serialize data intended for a request header.
 
-        :param data: The data to be serialized.
+        :param str name: The name of the header.
+        :param object data: The data to be serialized.
         :param str data_type: The type to be serialized from.
         :rtype: str
         :raises: TypeError if serialization fails.
         :raises: ValueError if data is None
+        :returns: The serialized header
         """
         try:
             if data_type in ["[str]"]:
@@ -780,21 +847,20 @@ def header(self, name, data, data_type, **kwargs):
             output = self.serialize_data(data, data_type, **kwargs)
             if data_type == "bool":
                 output = json.dumps(output)
-        except SerializationError:
-            raise TypeError("{} must be type {}.".format(name, data_type))
-        else:
-            return str(output)
+        except SerializationError as exc:
+            raise TypeError("{} must be type {}.".format(name, data_type)) from exc
+        return str(output)
 
     def serialize_data(self, data, data_type, **kwargs):
         """Serialize generic data according to supplied data type.
 
-        :param data: The data to be serialized.
+        :param object data: The data to be serialized.
         :param str data_type: The type to be serialized from.
-        :param bool required: Whether it's essential that the data not be
-         empty or None
         :raises: AttributeError if required data is None.
         :raises: ValueError if data is None
         :raises: SerializationError if serialization fails.
+        :returns: The serialized data.
+        :rtype: str, int, float, bool, dict, list
         """
         if data is None:
             raise ValueError("No value for given attribute")
@@ -805,7 +871,7 @@ def serialize_data(self, data, data_type, **kwargs):
             if data_type in self.basic_types.values():
                 return self.serialize_basic(data, data_type, **kwargs)
 
-            elif data_type in self.serialize_type:
+            if data_type in self.serialize_type:
                 return self.serialize_type[data_type](data, **kwargs)
 
             # If dependencies is empty, try with current data class
@@ -821,11 +887,10 @@ def serialize_data(self, data, data_type, **kwargs):
         except (ValueError, TypeError) as err:
             msg = "Unable to serialize value: {!r} as type: {!r}."
             raise SerializationError(msg.format(data, data_type)) from err
-        else:
-            return self._serialize(data, **kwargs)
+        return self._serialize(data, **kwargs)
 
     @classmethod
-    def _get_custom_serializers(cls, data_type, **kwargs):
+    def _get_custom_serializers(cls, data_type, **kwargs):  # pylint: disable=inconsistent-return-statements
         custom_serializer = kwargs.get("basic_types_serializers", {}).get(data_type)
         if custom_serializer:
             return custom_serializer
@@ -841,23 +906,26 @@ def serialize_basic(cls, data, data_type, **kwargs):
         - basic_types_serializers dict[str, callable] : If set, use the callable as serializer
         - is_xml bool : If set, use xml_basic_types_serializers
 
-        :param data: Object to be serialized.
+        :param object data: Object to be serialized.
         :param str data_type: Type of object in the iterable.
+        :rtype: str, int, float, bool
+        :return: serialized object
         """
         custom_serializer = cls._get_custom_serializers(data_type, **kwargs)
         if custom_serializer:
             return custom_serializer(data)
         if data_type == "str":
             return cls.serialize_unicode(data)
-        return eval(data_type)(data)  # nosec
+        return eval(data_type)(data)  # nosec # pylint: disable=eval-used
 
     @classmethod
     def serialize_unicode(cls, data):
         """Special handling for serializing unicode strings in Py2.
         Encode to UTF-8 if unicode, otherwise handle as a str.
 
-        :param data: Object to be serialized.
+        :param str data: Object to be serialized.
         :rtype: str
+        :return: serialized object
         """
         try:  # If I received an enum, return its value
             return data.value
@@ -871,8 +939,7 @@ def serialize_unicode(cls, data):
                 return data
         except NameError:
             return str(data)
-        else:
-            return str(data)
+        return str(data)
 
     def serialize_iter(self, data, iter_type, div=None, **kwargs):
         """Serialize iterable.
@@ -882,15 +949,13 @@ def serialize_iter(self, data, iter_type, div=None, **kwargs):
           serialization_ctxt['type'] should be same as data_type.
         - is_xml bool : If set, serialize as XML
 
-        :param list attr: Object to be serialized.
+        :param list data: Object to be serialized.
         :param str iter_type: Type of object in the iterable.
-        :param bool required: Whether the objects in the iterable must
-         not be None or empty.
         :param str div: If set, this str will be used to combine the elements
          in the iterable into a combined string. Default is 'None'.
-        :keyword bool do_quote: Whether to quote the serialized result of each iterable element.
         Defaults to False.
         :rtype: list, str
+        :return: serialized iterable
         """
         if isinstance(data, str):
             raise SerializationError("Refuse str type as a valid iter type.")
@@ -945,9 +1010,8 @@ def serialize_dict(self, attr, dict_type, **kwargs):
 
         :param dict attr: Object to be serialized.
         :param str dict_type: Type of object in the dictionary.
-        :param bool required: Whether the objects in the dictionary must
-         not be None or empty.
         :rtype: dict
+        :return: serialized dictionary
         """
         serialization_ctxt = kwargs.get("serialization_ctxt", {})
         serialized = {}
@@ -971,7 +1035,7 @@ def serialize_dict(self, attr, dict_type, **kwargs):
 
         return serialized
 
-    def serialize_object(self, attr, **kwargs):
+    def serialize_object(self, attr, **kwargs):  # pylint: disable=too-many-return-statements
         """Serialize a generic object.
         This will be handled as a dictionary. If object passed in is not
         a basic type (str, int, float, dict, list) it will simply be
@@ -979,6 +1043,7 @@ def serialize_object(self, attr, **kwargs):
 
         :param dict attr: Object to be serialized.
         :rtype: dict or str
+        :return: serialized object
         """
         if attr is None:
             return None
@@ -1003,7 +1068,7 @@ def serialize_object(self, attr, **kwargs):
             return self.serialize_decimal(attr)
 
         # If it's a model or I know this dependency, serialize as a Model
-        elif obj_type in self.dependencies.values() or isinstance(attr, Model):
+        if obj_type in self.dependencies.values() or isinstance(attr, Model):
             return self._serialize(attr)
 
         if obj_type == dict:
@@ -1034,56 +1099,61 @@ def serialize_enum(attr, enum_obj=None):
         try:
             enum_obj(result)  # type: ignore
             return result
-        except ValueError:
+        except ValueError as exc:
             for enum_value in enum_obj:  # type: ignore
                 if enum_value.value.lower() == str(attr).lower():
                     return enum_value.value
             error = "{!r} is not valid value for enum {!r}"
-            raise SerializationError(error.format(attr, enum_obj))
+            raise SerializationError(error.format(attr, enum_obj)) from exc
 
     @staticmethod
-    def serialize_bytearray(attr, **kwargs):
+    def serialize_bytearray(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize bytearray into base-64 string.
 
-        :param attr: Object to be serialized.
+        :param bytearray attr: Object to be serialized.
         :rtype: str
+        :return: serialized base64
         """
         return b64encode(attr).decode()
 
     @staticmethod
-    def serialize_base64(attr, **kwargs):
+    def serialize_base64(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize str into base-64 string.
 
-        :param attr: Object to be serialized.
+        :param str attr: Object to be serialized.
         :rtype: str
+        :return: serialized base64
         """
         encoded = b64encode(attr).decode("ascii")
         return encoded.strip("=").replace("+", "-").replace("/", "_")
 
     @staticmethod
-    def serialize_decimal(attr, **kwargs):
+    def serialize_decimal(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize Decimal object to float.
 
-        :param attr: Object to be serialized.
+        :param decimal attr: Object to be serialized.
         :rtype: float
+        :return: serialized decimal
         """
         return float(attr)
 
     @staticmethod
-    def serialize_long(attr, **kwargs):
+    def serialize_long(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize long (Py2) or int (Py3).
 
-        :param attr: Object to be serialized.
+        :param int attr: Object to be serialized.
         :rtype: int/long
+        :return: serialized long
         """
         return _long_type(attr)
 
     @staticmethod
-    def serialize_date(attr, **kwargs):
+    def serialize_date(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize Date object into ISO-8601 formatted string.
 
         :param Date attr: Object to be serialized.
         :rtype: str
+        :return: serialized date
         """
         if isinstance(attr, str):
             attr = isodate.parse_date(attr)
@@ -1091,11 +1161,12 @@ def serialize_date(attr, **kwargs):
         return t
 
     @staticmethod
-    def serialize_time(attr, **kwargs):
+    def serialize_time(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize Time object into ISO-8601 formatted string.
 
         :param datetime.time attr: Object to be serialized.
         :rtype: str
+        :return: serialized time
         """
         if isinstance(attr, str):
             attr = isodate.parse_time(attr)
@@ -1105,30 +1176,32 @@ def serialize_time(attr, **kwargs):
         return t
 
     @staticmethod
-    def serialize_duration(attr, **kwargs):
+    def serialize_duration(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize TimeDelta object into ISO-8601 formatted string.
 
         :param TimeDelta attr: Object to be serialized.
         :rtype: str
+        :return: serialized duration
         """
         if isinstance(attr, str):
             attr = isodate.parse_duration(attr)
         return isodate.duration_isoformat(attr)
 
     @staticmethod
-    def serialize_rfc(attr, **kwargs):
+    def serialize_rfc(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize Datetime object into RFC-1123 formatted string.
 
         :param Datetime attr: Object to be serialized.
         :rtype: str
         :raises: TypeError if format invalid.
+        :return: serialized rfc
         """
         try:
             if not attr.tzinfo:
                 _LOGGER.warning("Datetime with no tzinfo will be considered UTC.")
             utc = attr.utctimetuple()
-        except AttributeError:
-            raise TypeError("RFC1123 object must be valid Datetime object.")
+        except AttributeError as exc:
+            raise TypeError("RFC1123 object must be valid Datetime object.") from exc
 
         return "{}, {:02} {} {:04} {:02}:{:02}:{:02} GMT".format(
             Serializer.days[utc.tm_wday],
@@ -1141,12 +1214,13 @@ def serialize_rfc(attr, **kwargs):
         )
 
     @staticmethod
-    def serialize_iso(attr, **kwargs):
+    def serialize_iso(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize Datetime object into ISO-8601 formatted string.
 
         :param Datetime attr: Object to be serialized.
         :rtype: str
         :raises: SerializationError if format invalid.
+        :return: serialized iso
         """
         if isinstance(attr, str):
             attr = isodate.parse_datetime(attr)
@@ -1172,13 +1246,14 @@ def serialize_iso(attr, **kwargs):
             raise TypeError(msg) from err
 
     @staticmethod
-    def serialize_unix(attr, **kwargs):
+    def serialize_unix(attr, **kwargs):  # pylint: disable=unused-argument
         """Serialize Datetime object into IntTime format.
         This is represented as seconds.
 
         :param Datetime attr: Object to be serialized.
         :rtype: int
         :raises: SerializationError if format invalid
+        :return: serialized unix
         """
         if isinstance(attr, int):
             return attr
@@ -1186,11 +1261,11 @@ def serialize_unix(attr, **kwargs):
             if not attr.tzinfo:
                 _LOGGER.warning("Datetime with no tzinfo will be considered UTC.")
             return int(calendar.timegm(attr.utctimetuple()))
-        except AttributeError:
-            raise TypeError("Unix time object must be valid Datetime object.")
+        except AttributeError as exc:
+            raise TypeError("Unix time object must be valid Datetime object.") from exc
 
 
-def rest_key_extractor(attr, attr_desc, data):
+def rest_key_extractor(attr, attr_desc, data):  # pylint: disable=unused-argument
     key = attr_desc["key"]
     working_data = data
 
@@ -1211,7 +1286,9 @@ def rest_key_extractor(attr, attr_desc, data):
     return working_data.get(key)
 
 
-def rest_key_case_insensitive_extractor(attr, attr_desc, data):
+def rest_key_case_insensitive_extractor(  # pylint: disable=unused-argument, inconsistent-return-statements
+    attr, attr_desc, data
+):
     key = attr_desc["key"]
     working_data = data
 
@@ -1232,17 +1309,29 @@ def rest_key_case_insensitive_extractor(attr, attr_desc, data):
         return attribute_key_case_insensitive_extractor(key, None, working_data)
 
 
-def last_rest_key_extractor(attr, attr_desc, data):
-    """Extract the attribute in "data" based on the last part of the JSON path key."""
+def last_rest_key_extractor(attr, attr_desc, data):  # pylint: disable=unused-argument
+    """Extract the attribute in "data" based on the last part of the JSON path key.
+
+    :param str attr: The attribute to extract
+    :param dict attr_desc: The attribute description
+    :param dict data: The data to extract from
+    :rtype: object
+    :returns: The extracted attribute
+    """
     key = attr_desc["key"]
     dict_keys = _FLATTEN.split(key)
     return attribute_key_extractor(dict_keys[-1], None, data)
 
 
-def last_rest_key_case_insensitive_extractor(attr, attr_desc, data):
+def last_rest_key_case_insensitive_extractor(attr, attr_desc, data):  # pylint: disable=unused-argument
     """Extract the attribute in "data" based on the last part of the JSON path key.
 
     This is the case insensitive version of "last_rest_key_extractor"
+    :param str attr: The attribute to extract
+    :param dict attr_desc: The attribute description
+    :param dict data: The data to extract from
+    :rtype: object
+    :returns: The extracted attribute
     """
     key = attr_desc["key"]
     dict_keys = _FLATTEN.split(key)
@@ -1279,7 +1368,7 @@ def _extract_name_from_internal_type(internal_type):
     return xml_name
 
 
-def xml_key_extractor(attr, attr_desc, data):
+def xml_key_extractor(attr, attr_desc, data):  # pylint: disable=unused-argument,too-many-return-statements
     if isinstance(data, dict):
         return None
 
@@ -1331,22 +1420,21 @@ def xml_key_extractor(attr, attr_desc, data):
         if is_iter_type:
             if is_wrapped:
                 return None  # is_wrapped no node, we want None
-            else:
-                return []  # not wrapped, assume empty list
+            return []  # not wrapped, assume empty list
         return None  # Assume it's not there, maybe an optional node.
 
     # If is_iter_type and not wrapped, return all found children
     if is_iter_type:
         if not is_wrapped:
             return children
-        else:  # Iter and wrapped, should have found one node only (the wrap one)
-            if len(children) != 1:
-                raise DeserializationError(
-                    "Tried to deserialize an array not wrapped, and found several nodes '{}'. Maybe you should declare this array as wrapped?".format(
-                        xml_name
-                    )
+        # Iter and wrapped, should have found one node only (the wrap one)
+        if len(children) != 1:
+            raise DeserializationError(
+                "Tried to deserialize an array not wrapped, and found several nodes '{}'. Maybe you should declare this array as wrapped?".format(  # pylint: disable=line-too-long
+                    xml_name
                 )
-            return list(children[0])  # Might be empty list and that's ok.
+            )
+        return list(children[0])  # Might be empty list and that's ok.
 
     # Here it's not a itertype, we should have found one element only or empty
     if len(children) > 1:
@@ -1363,7 +1451,7 @@ class Deserializer(object):
 
     basic_types = {str: "str", int: "int", bool: "bool", float: "float"}
 
-    valid_date = re.compile(r"\d{4}[-]\d{2}[-]\d{2}T\d{2}:\d{2}:\d{2}" r"\.?\d*Z?[-+]?[\d{2}]?:?[\d{2}]?")
+    valid_date = re.compile(r"\d{4}[-]\d{2}[-]\d{2}T\d{2}:\d{2}:\d{2}\.?\d*Z?[-+]?[\d{2}]?:?[\d{2}]?")
 
     def __init__(self, classes: Optional[Mapping[str, type]] = None):
         self.deserialize_type = {
@@ -1403,11 +1491,12 @@ def __call__(self, target_obj, response_data, content_type=None):
         :param str content_type: Swagger "produces" if available.
         :raises: DeserializationError if deserialization fails.
         :return: Deserialized object.
+        :rtype: object
         """
         data = self._unpack_content(response_data, content_type)
         return self._deserialize(target_obj, data)
 
-    def _deserialize(self, target_obj, data):
+    def _deserialize(self, target_obj, data):  # pylint: disable=inconsistent-return-statements
         """Call the deserializer on a model.
 
         Data needs to be already deserialized as JSON or XML ElementTree
@@ -1416,12 +1505,13 @@ def _deserialize(self, target_obj, data):
         :param object data: Object to deserialize.
         :raises: DeserializationError if deserialization fails.
         :return: Deserialized object.
+        :rtype: object
         """
         # This is already a model, go recursive just in case
         if hasattr(data, "_attribute_map"):
             constants = [name for name, config in getattr(data, "_validation", {}).items() if config.get("constant")]
             try:
-                for attr, mapconfig in data._attribute_map.items():
+                for attr, mapconfig in data._attribute_map.items():  # pylint: disable=protected-access
                     if attr in constants:
                         continue
                     value = getattr(data, attr)
@@ -1440,13 +1530,13 @@ def _deserialize(self, target_obj, data):
 
         if isinstance(response, str):
             return self.deserialize_data(data, response)
-        elif isinstance(response, type) and issubclass(response, Enum):
+        if isinstance(response, type) and issubclass(response, Enum):
             return self.deserialize_enum(data, response)
 
         if data is None or data is CoreNull:
             return data
         try:
-            attributes = response._attribute_map  # type: ignore
+            attributes = response._attribute_map  # type: ignore # pylint: disable=protected-access
             d_attrs = {}
             for attr, attr_desc in attributes.items():
                 # Check empty string. If it's not empty, someone has a real "additionalProperties"...
@@ -1476,9 +1566,8 @@ def _deserialize(self, target_obj, data):
         except (AttributeError, TypeError, KeyError) as err:
             msg = "Unable to deserialize to object: " + class_name  # type: ignore
             raise DeserializationError(msg) from err
-        else:
-            additional_properties = self._build_additional_properties(attributes, data)
-            return self._instantiate_model(response, d_attrs, additional_properties)
+        additional_properties = self._build_additional_properties(attributes, data)
+        return self._instantiate_model(response, d_attrs, additional_properties)
 
     def _build_additional_properties(self, attribute_map, data):
         if not self.additional_properties_detection:
@@ -1505,6 +1594,8 @@ def _classify_target(self, target, data):
 
         :param str target: The target object type to deserialize to.
         :param str/dict data: The response data to deserialize.
+        :return: The classified target object and its class name.
+        :rtype: tuple
         """
         if target is None:
             return None, None
@@ -1516,7 +1607,7 @@ def _classify_target(self, target, data):
                 return target, target
 
         try:
-            target = target._classify(data, self.dependencies)  # type: ignore
+            target = target._classify(data, self.dependencies)  # type: ignore # pylint: disable=protected-access
         except AttributeError:
             pass  # Target is not a Model, no classify
         return target, target.__class__.__name__  # type: ignore
@@ -1531,10 +1622,12 @@ def failsafe_deserialize(self, target_obj, data, content_type=None):
         :param str target_obj: The target object type to deserialize to.
         :param str/dict data: The response data to deserialize.
         :param str content_type: Swagger "produces" if available.
+        :return: Deserialized object.
+        :rtype: object
         """
         try:
             return self(target_obj, data, content_type=content_type)
-        except:
+        except:  # pylint: disable=bare-except
             _LOGGER.debug(
                 "Ran into a deserialization error. Ignoring since this is failsafe deserialization", exc_info=True
             )
@@ -1552,10 +1645,12 @@ def _unpack_content(raw_data, content_type=None):
 
         If raw_data is something else, bypass all logic and return it directly.
 
-        :param raw_data: Data to be processed.
-        :param content_type: How to parse if raw_data is a string/bytes.
+        :param object raw_data: Data to be processed.
+        :param str content_type: How to parse if raw_data is a string/bytes.
         :raises JSONDecodeError: If JSON is requested and parsing is impossible.
         :raises UnicodeDecodeError: If bytes is not UTF8
+        :rtype: object
+        :return: Unpacked content.
         """
         # Assume this is enough to detect a Pipeline Response without importing it
         context = getattr(raw_data, "context", {})
@@ -1579,14 +1674,21 @@ def _unpack_content(raw_data, content_type=None):
     def _instantiate_model(self, response, attrs, additional_properties=None):
         """Instantiate a response model passing in deserialized args.
 
-        :param response: The response model class.
-        :param d_attrs: The deserialized response attributes.
+        :param Response response: The response model class.
+        :param dict attrs: The deserialized response attributes.
+        :param dict additional_properties: Additional properties to be set.
+        :rtype: Response
+        :return: The instantiated response model.
         """
         if callable(response):
             subtype = getattr(response, "_subtype_map", {})
             try:
-                readonly = [k for k, v in response._validation.items() if v.get("readonly")]
-                const = [k for k, v in response._validation.items() if v.get("constant")]
+                readonly = [
+                    k for k, v in response._validation.items() if v.get("readonly")  # pylint: disable=protected-access
+                ]
+                const = [
+                    k for k, v in response._validation.items() if v.get("constant")  # pylint: disable=protected-access
+                ]
                 kwargs = {k: v for k, v in attrs.items() if k not in subtype and k not in readonly + const}
                 response_obj = response(**kwargs)
                 for attr in readonly:
@@ -1596,7 +1698,7 @@ def _instantiate_model(self, response, attrs, additional_properties=None):
                 return response_obj
             except TypeError as err:
                 msg = "Unable to deserialize {} into model {}. ".format(kwargs, response)  # type: ignore
-                raise DeserializationError(msg + str(err))
+                raise DeserializationError(msg + str(err)) from err
         else:
             try:
                 for attr, value in attrs.items():
@@ -1605,15 +1707,16 @@ def _instantiate_model(self, response, attrs, additional_properties=None):
             except Exception as exp:
                 msg = "Unable to populate response model. "
                 msg += "Type: {}, Error: {}".format(type(response), exp)
-                raise DeserializationError(msg)
+                raise DeserializationError(msg) from exp
 
-    def deserialize_data(self, data, data_type):
+    def deserialize_data(self, data, data_type):  # pylint: disable=too-many-return-statements
         """Process data for deserialization according to data type.
 
         :param str data: The response string to be deserialized.
         :param str data_type: The type to deserialize to.
         :raises: DeserializationError if deserialization fails.
         :return: Deserialized object.
+        :rtype: object
         """
         if data is None:
             return data
@@ -1627,7 +1730,11 @@ def deserialize_data(self, data, data_type):
                 if isinstance(data, self.deserialize_expected_types.get(data_type, tuple())):
                     return data
 
-                is_a_text_parsing_type = lambda x: x not in ["object", "[]", r"{}"]
+                is_a_text_parsing_type = lambda x: x not in [  # pylint: disable=unnecessary-lambda-assignment
+                    "object",
+                    "[]",
+                    r"{}",
+                ]
                 if isinstance(data, ET.Element) and is_a_text_parsing_type(data_type) and not data.text:
                     return None
                 data_val = self.deserialize_type[data_type](data)
@@ -1647,14 +1754,14 @@ def deserialize_data(self, data, data_type):
             msg = "Unable to deserialize response data."
             msg += " Data: {}, {}".format(data, data_type)
             raise DeserializationError(msg) from err
-        else:
-            return self._deserialize(obj_type, data)
+        return self._deserialize(obj_type, data)
 
     def deserialize_iter(self, attr, iter_type):
         """Deserialize an iterable.
 
         :param list attr: Iterable to be deserialized.
         :param str iter_type: The type of object in the iterable.
+        :return: Deserialized iterable.
         :rtype: list
         """
         if attr is None:
@@ -1671,6 +1778,7 @@ def deserialize_dict(self, attr, dict_type):
         :param dict/list attr: Dictionary to be deserialized. Also accepts
          a list of key, value pairs.
         :param str dict_type: The object type of the items in the dictionary.
+        :return: Deserialized dictionary.
         :rtype: dict
         """
         if isinstance(attr, list):
@@ -1681,11 +1789,12 @@ def deserialize_dict(self, attr, dict_type):
             attr = {el.tag: el.text for el in attr}
         return {k: self.deserialize_data(v, dict_type) for k, v in attr.items()}
 
-    def deserialize_object(self, attr, **kwargs):
+    def deserialize_object(self, attr, **kwargs):  # pylint: disable=too-many-return-statements
         """Deserialize a generic object.
         This will be handled as a dictionary.
 
         :param dict attr: Dictionary to be deserialized.
+        :return: Deserialized object.
         :rtype: dict
         :raises: TypeError if non-builtin datatype encountered.
         """
@@ -1720,11 +1829,10 @@ def deserialize_object(self, attr, **kwargs):
                     pass
             return deserialized
 
-        else:
-            error = "Cannot deserialize generic object with type: "
-            raise TypeError(error + str(obj_type))
+        error = "Cannot deserialize generic object with type: "
+        raise TypeError(error + str(obj_type))
 
-    def deserialize_basic(self, attr, data_type):
+    def deserialize_basic(self, attr, data_type):  # pylint: disable=too-many-return-statements
         """Deserialize basic builtin data type from string.
         Will attempt to convert to str, int, float and bool.
         This function will also accept '1', '0', 'true' and 'false' as
@@ -1732,6 +1840,7 @@ def deserialize_basic(self, attr, data_type):
 
         :param str attr: response string to be deserialized.
         :param str data_type: deserialization data type.
+        :return: Deserialized basic type.
         :rtype: str, int, float or bool
         :raises: TypeError if string format is not valid.
         """
@@ -1743,24 +1852,23 @@ def deserialize_basic(self, attr, data_type):
                 if data_type == "str":
                     # None or '', node <a/> is empty string.
                     return ""
-                else:
-                    # None or '', node <a/> with a strong type is None.
-                    # Don't try to model "empty bool" or "empty int"
-                    return None
+                # None or '', node <a/> with a strong type is None.
+                # Don't try to model "empty bool" or "empty int"
+                return None
 
         if data_type == "bool":
             if attr in [True, False, 1, 0]:
                 return bool(attr)
-            elif isinstance(attr, str):
+            if isinstance(attr, str):
                 if attr.lower() in ["true", "1"]:
                     return True
-                elif attr.lower() in ["false", "0"]:
+                if attr.lower() in ["false", "0"]:
                     return False
             raise TypeError("Invalid boolean value: {}".format(attr))
 
         if data_type == "str":
             return self.deserialize_unicode(attr)
-        return eval(data_type)(attr)  # nosec
+        return eval(data_type)(attr)  # nosec # pylint: disable=eval-used
 
     @staticmethod
     def deserialize_unicode(data):
@@ -1768,6 +1876,7 @@ def deserialize_unicode(data):
         as a string.
 
         :param str data: response string to be deserialized.
+        :return: Deserialized string.
         :rtype: str or unicode
         """
         # We might be here because we have an enum modeled as string,
@@ -1781,8 +1890,7 @@ def deserialize_unicode(data):
                 return data
         except NameError:
             return str(data)
-        else:
-            return str(data)
+        return str(data)
 
     @staticmethod
     def deserialize_enum(data, enum_obj):
@@ -1794,6 +1902,7 @@ def deserialize_enum(data, enum_obj):
         :param str data: Response string to be deserialized. If this value is
          None or invalid it will be returned as-is.
         :param Enum enum_obj: Enum object to deserialize to.
+        :return: Deserialized enum object.
         :rtype: Enum
         """
         if isinstance(data, enum_obj) or data is None:
@@ -1804,9 +1913,9 @@ def deserialize_enum(data, enum_obj):
             # Workaround. We might consider remove it in the future.
             try:
                 return list(enum_obj.__members__.values())[data]
-            except IndexError:
+            except IndexError as exc:
                 error = "{!r} is not a valid index for enum {!r}"
-                raise DeserializationError(error.format(data, enum_obj))
+                raise DeserializationError(error.format(data, enum_obj)) from exc
         try:
             return enum_obj(str(data))
         except ValueError:
@@ -1822,6 +1931,7 @@ def deserialize_bytearray(attr):
         """Deserialize string into bytearray.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized bytearray
         :rtype: bytearray
         :raises: TypeError if string format invalid.
         """
@@ -1834,6 +1944,7 @@ def deserialize_base64(attr):
         """Deserialize base64 encoded string into string.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized base64 string
         :rtype: bytearray
         :raises: TypeError if string format invalid.
         """
@@ -1849,8 +1960,9 @@ def deserialize_decimal(attr):
         """Deserialize string into Decimal object.
 
         :param str attr: response string to be deserialized.
-        :rtype: Decimal
+        :return: Deserialized decimal
         :raises: DeserializationError if string format invalid.
+        :rtype: decimal.Decimal
         """
         if isinstance(attr, ET.Element):
             attr = attr.text
@@ -1865,6 +1977,7 @@ def deserialize_long(attr):
         """Deserialize string into long (Py2) or int (Py3).
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized int
         :rtype: long or int
         :raises: ValueError if string format invalid.
         """
@@ -1877,6 +1990,7 @@ def deserialize_duration(attr):
         """Deserialize ISO-8601 formatted string into TimeDelta object.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized duration
         :rtype: TimeDelta
         :raises: DeserializationError if string format invalid.
         """
@@ -1887,14 +2001,14 @@ def deserialize_duration(attr):
         except (ValueError, OverflowError, AttributeError) as err:
             msg = "Cannot deserialize duration object."
             raise DeserializationError(msg) from err
-        else:
-            return duration
+        return duration
 
     @staticmethod
     def deserialize_date(attr):
         """Deserialize ISO-8601 formatted string into Date object.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized date
         :rtype: Date
         :raises: DeserializationError if string format invalid.
         """
@@ -1910,6 +2024,7 @@ def deserialize_time(attr):
         """Deserialize ISO-8601 formatted string into time object.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized time
         :rtype: datetime.time
         :raises: DeserializationError if string format invalid.
         """
@@ -1924,6 +2039,7 @@ def deserialize_rfc(attr):
         """Deserialize RFC-1123 formatted string into Datetime object.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized RFC datetime
         :rtype: Datetime
         :raises: DeserializationError if string format invalid.
         """
@@ -1939,14 +2055,14 @@ def deserialize_rfc(attr):
         except ValueError as err:
             msg = "Cannot deserialize to rfc datetime object."
             raise DeserializationError(msg) from err
-        else:
-            return date_obj
+        return date_obj
 
     @staticmethod
     def deserialize_iso(attr):
         """Deserialize ISO-8601 formatted string into Datetime object.
 
         :param str attr: response string to be deserialized.
+        :return: Deserialized ISO datetime
         :rtype: Datetime
         :raises: DeserializationError if string format invalid.
         """
@@ -1976,8 +2092,7 @@ def deserialize_iso(attr):
         except (ValueError, OverflowError, AttributeError) as err:
             msg = "Cannot deserialize datetime object."
             raise DeserializationError(msg) from err
-        else:
-            return date_obj
+        return date_obj
 
     @staticmethod
     def deserialize_unix(attr):
@@ -1985,6 +2100,7 @@ def deserialize_unix(attr):
         This is represented as seconds.
 
         :param int attr: Object to be serialized.
+        :return: Deserialized datetime
         :rtype: Datetime
         :raises: DeserializationError if format invalid
         """
@@ -1996,5 +2112,4 @@ def deserialize_unix(attr):
         except ValueError as err:
             msg = "Cannot deserialize to unix datetime object."
             raise DeserializationError(msg) from err
-        else:
-            return date_obj
+        return date_obj
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py
index c7d155d924dd..be71c81bd282 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py
@@ -6,4 +6,4 @@
 # Changes may cause incorrect behavior and will be lost if the code is regenerated.
 # --------------------------------------------------------------------------
 
-VERSION = "1.0.0b5"
+VERSION = "1.0.0b1"
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py
index c31764c00803..e9e1b0469645 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/__init__.py
@@ -6,20 +6,22 @@
 # Changes may cause incorrect behavior and will be lost if the code is regenerated.
 # --------------------------------------------------------------------------
 
-from ._patch import ChatCompletionsClient
-from ._patch import EmbeddingsClient
-from ._patch import ImageEmbeddingsClient
+from ._client import ChatCompletionsClient
+from ._client import EmbeddingsClient
+from ._client import ImageEmbeddingsClient
 
-
-from ._patch import load_client
+try:
+    from ._patch import __all__ as _patch_all
+    from ._patch import *  # pylint: disable=unused-wildcard-import
+except ImportError:
+    _patch_all = []
 from ._patch import patch_sdk as _patch_sdk
 
 __all__ = [
-    "load_client",
     "ChatCompletionsClient",
     "EmbeddingsClient",
     "ImageEmbeddingsClient",
 ]
-
+__all__.extend([p for p in _patch_all if p not in __all__])
 
 _patch_sdk()
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py
index 0be948bd275d..70b2ccb0dfef 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py
@@ -115,7 +115,6 @@ async def _complete(
         model: Optional[str] = None,
         **kwargs: Any
     ) -> _models.ChatCompletions:
-        # pylint: disable=too-many-locals
         """Gets chat completions for the provided chat messages.
         Completions support a wide variety of tasks and generate text that continues from or
         "completes"
@@ -204,7 +203,7 @@ async def _complete(
         :rtype: ~azure.ai.inference.models.ChatCompletions
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -294,7 +293,7 @@ async def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
         :rtype: ~azure.ai.inference.models.ModelInfo
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -425,7 +424,7 @@ async def _embed(
         :rtype: ~azure.ai.inference.models.EmbeddingsResult
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -507,7 +506,7 @@ async def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
         :rtype: ~azure.ai.inference.models.ModelInfo
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -641,7 +640,7 @@ async def _embed(
         :rtype: ~azure.ai.inference.models.EmbeddingsResult
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
@@ -723,7 +722,7 @@ async def _get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
         :rtype: ~azure.ai.inference.models.ModelInfo
         :raises ~azure.core.exceptions.HttpResponseError:
         """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
+        error_map: MutableMapping[int, Type[HttpResponseError]] = {  # pylint: disable=unsubscriptable-object
             401: ClientAuthenticationError,
             404: ResourceNotFoundError,
             409: ResourceExistsError,
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py
index ac31fdb88108..f7dd32510333 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py
@@ -2,1227 +2,13 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
 # ------------------------------------
-# pylint: disable=too-many-lines)
 """Customize generated code here.
 
 Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
 """
-import json
-import logging
-import sys
+from typing import List
 
-from io import IOBase
-from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING, AsyncIterable
-
-from azure.core.pipeline import PipelineResponse
-from azure.core.credentials import AzureKeyCredential
-from azure.core.tracing.decorator_async import distributed_trace_async
-from azure.core.utils import case_insensitive_dict
-from azure.core.exceptions import (
-    ClientAuthenticationError,
-    HttpResponseError,
-    map_error,
-    ResourceExistsError,
-    ResourceNotFoundError,
-    ResourceNotModifiedError,
-)
-from .. import models as _models
-from .._model_base import SdkJSONEncoder, _deserialize
-from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated
-from ._client import EmbeddingsClient as EmbeddingsClientGenerated
-from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated
-from .._operations._operations import (
-    build_chat_completions_complete_request,
-    build_embeddings_embed_request,
-    build_image_embeddings_embed_request,
-)
-
-if TYPE_CHECKING:
-    # pylint: disable=unused-import,ungrouped-imports
-    from azure.core.credentials_async import AsyncTokenCredential
-
-if sys.version_info >= (3, 9):
-    from collections.abc import MutableMapping
-else:
-    from typing import MutableMapping  # type: ignore  # pylint: disable=ungrouped-imports
-
-JSON = MutableMapping[str, Any]  # pylint: disable=unsubscriptable-object
-_Unset: Any = object()
-_LOGGER = logging.getLogger(__name__)
-
-
-async def load_client(
-    endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any
-) -> Union["ChatCompletionsClient", "EmbeddingsClient", "ImageEmbeddingsClient"]:
-    # pylint: disable=line-too-long
-    """
-    Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route
-    on the given endpoint, to determine the model type and therefore which client to instantiate.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a AsyncTokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials_async.AsyncTokenCredential
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    :return: The appropriate asynchronous client associated with the given endpoint
-    :rtype: ~azure.ai.inference.aio.ChatCompletionsClient or ~azure.ai.inference.aio.EmbeddingsClient
-     or ~azure.ai.inference.aio.ImageEmbeddingsClient
-    :raises ~azure.core.exceptions.HttpResponseError:
-    """
-
-    async with ChatCompletionsClient(
-        endpoint, credential, **kwargs
-    ) as client:  # Pick any of the clients, it does not matter.
-        model_info = await client.get_model_info()  # type: ignore
-
-    _LOGGER.info("model_info=%s", model_info)
-    if not model_info.model_type:
-        raise ValueError(
-            "The AI model information is missing a value for `model type`. Cannot create an appropriate client."
-        )
-
-    # TODO: Remove "completions" and "embedding" once Mistral Large and Cohere fixes their model type
-    if model_info.model_type in (_models.ModelType.CHAT, "completion"):
-        chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs)
-        chat_completion_client._model_info = (  # pylint: disable=protected-access,attribute-defined-outside-init
-            model_info
-        )
-        return chat_completion_client
-
-    if model_info.model_type in (_models.ModelType.EMBEDDINGS, "embedding"):
-        embedding_client = EmbeddingsClient(endpoint, credential, **kwargs)
-        embedding_client._model_info = model_info  # pylint: disable=protected-access,attribute-defined-outside-init
-        return embedding_client
-
-    if model_info.model_type == _models.ModelType.IMAGE_EMBEDDINGS:
-        image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs)
-        image_embedding_client._model_info = (  # pylint: disable=protected-access,attribute-defined-outside-init
-            model_info
-        )
-        return image_embedding_client
-
-    raise ValueError(f"No client available to support AI model type `{model_info.model_type}`")
-
-
-class ChatCompletionsClient(ChatCompletionsClientGenerated):  # pylint: disable=too-many-instance-attributes
-    """ChatCompletionsClient.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a AsyncTokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials_async.AsyncTokenCredential
-    :keyword frequency_penalty: A value that influences the probability of generated tokens
-        appearing based on their cumulative frequency in generated text.
-        Positive values will make tokens less likely to appear as their frequency increases and
-        decrease the likelihood of the model repeating the same statements verbatim.
-        Supported range is [-2, 2].
-        Default value is None.
-    :paramtype frequency_penalty: float
-    :keyword presence_penalty: A value that influences the probability of generated tokens
-        appearing based on their existing
-        presence in generated text.
-        Positive values will make tokens less likely to appear when they already exist and increase
-        the model's likelihood to output new topics.
-        Supported range is [-2, 2].
-        Default value is None.
-    :paramtype presence_penalty: float
-    :keyword temperature: The sampling temperature to use that controls the apparent creativity of
-        generated completions.
-        Higher values will make output more random while lower values will make results more focused
-        and deterministic.
-        It is not recommended to modify temperature and top_p for the same completions request as the
-        interaction of these two settings is difficult to predict.
-        Supported range is [0, 1].
-        Default value is None.
-    :paramtype temperature: float
-    :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value
-        causes the
-        model to consider the results of tokens with the provided probability mass. As an example, a
-        value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be
-        considered.
-        It is not recommended to modify temperature and top_p for the same completions request as the
-        interaction of these two settings is difficult to predict.
-        Supported range is [0, 1].
-        Default value is None.
-    :paramtype top_p: float
-    :keyword max_tokens: The maximum number of tokens to generate. Default value is None.
-    :paramtype max_tokens: int
-    :keyword response_format: The format that the model must output. Use this to enable JSON mode
-        instead of the default text mode.
-        Note that to enable JSON mode, some AI models may also require you to instruct the model to
-        produce JSON via a system or user message. Default value is None.
-    :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat
-    :keyword stop: A collection of textual sequences that will end completions generation. Default
-        value is None.
-    :paramtype stop: list[str]
-    :keyword tools: The available tool definitions that the chat completions request can use,
-        including caller-defined functions. Default value is None.
-    :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition]
-    :keyword tool_choice: If specified, the model will configure which of the provided tools it can
-        use for the chat completions response. Is either a Union[str,
-        "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type.
-        Default value is None.
-    :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or
-        ~azure.ai.inference.models.ChatCompletionsNamedToolChoice
-    :keyword seed: If specified, the system will make a best effort to sample deterministically
-        such that repeated requests with the
-        same seed and parameters should return the same result. Determinism is not guaranteed.
-        Default value is None.
-    :paramtype seed: int
-    :keyword model: ID of the specific AI model to use, if more than one model is available on the
-        endpoint. Default value is None.
-    :paramtype model: str
-    :keyword model_extras: Additional, model-specific parameters that are not in the
-        standard request payload. They will be added as-is to the root of the JSON in the request body.
-        How the service handles these extra parameters depends on the value of the
-        ``extra-parameters`` request header. Default value is None.
-    :paramtype model_extras: dict[str, Any]
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    """
-
-    def __init__(
-        self,
-        endpoint: str,
-        credential: Union[AzureKeyCredential, "AsyncTokenCredential"],
-        *,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        self._model_info: Optional[_models.ModelInfo] = None
-
-        # Store default chat completions settings, to be applied in all future service calls
-        # unless overridden by arguments in the `complete` method.
-        self._frequency_penalty = frequency_penalty
-        self._presence_penalty = presence_penalty
-        self._temperature = temperature
-        self._top_p = top_p
-        self._max_tokens = max_tokens
-        self._response_format = response_format
-        self._stop = stop
-        self._tools = tools
-        self._tool_choice = tool_choice
-        self._seed = seed
-        self._model = model
-        self._model_extras = model_extras
-
-        super().__init__(endpoint, credential, **kwargs)
-
-    @overload
-    async def complete(
-        self,
-        *,
-        messages: List[_models.ChatRequestMessage],
-        stream: Literal[False] = False,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.ChatCompletions: ...
-
-    @overload
-    async def complete(
-        self,
-        *,
-        messages: List[_models.ChatRequestMessage],
-        stream: Literal[True],
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> AsyncIterable[_models.StreamingChatCompletionsUpdate]: ...
-
-    @overload
-    async def complete(
-        self,
-        *,
-        messages: List[_models.ChatRequestMessage],
-        stream: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route
-        on the given endpoint.
-        When using this method with `stream=True`, the response is streamed
-        back to the client. Iterate over the resulting StreamingChatCompletions
-        object to get content updates as they arrive. By default, the response is a ChatCompletions object
-        (non-streaming).
-
-        :keyword messages: The collection of context messages associated with this chat completions
-         request.
-         Typical usage begins with a chat message for the System role that provides instructions for
-         the behavior of the assistant, followed by alternating messages between the User and
-         Assistant roles. Required.
-        :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage]
-        :keyword stream: A value indicating whether chat completions should be streamed for this request.
-         Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions.
-         Otherwise the response will be a ChatCompletions.
-        :paramtype stream: bool
-        :keyword frequency_penalty: A value that influences the probability of generated tokens
-         appearing based on their cumulative frequency in generated text.
-         Positive values will make tokens less likely to appear as their frequency increases and
-         decrease the likelihood of the model repeating the same statements verbatim.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype frequency_penalty: float
-        :keyword presence_penalty: A value that influences the probability of generated tokens
-         appearing based on their existing
-         presence in generated text.
-         Positive values will make tokens less likely to appear when they already exist and increase
-         the model's likelihood to output new topics.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype presence_penalty: float
-        :keyword temperature: The sampling temperature to use that controls the apparent creativity of
-         generated completions.
-         Higher values will make output more random while lower values will make results more focused
-         and deterministic.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype temperature: float
-        :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value
-         causes the
-         model to consider the results of tokens with the provided probability mass. As an example, a
-         value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be
-         considered.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype top_p: float
-        :keyword max_tokens: The maximum number of tokens to generate. Default value is None.
-        :paramtype max_tokens: int
-        :keyword response_format: The format that the model must output. Use this to enable JSON mode
-         instead of the default text mode.
-         Note that to enable JSON mode, some AI models may also require you to instruct the model to
-         produce JSON via a system or user message. Default value is None.
-        :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat
-        :keyword stop: A collection of textual sequences that will end completions generation. Default
-         value is None.
-        :paramtype stop: list[str]
-        :keyword tools: The available tool definitions that the chat completions request can use,
-         including caller-defined functions. Default value is None.
-        :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition]
-        :keyword tool_choice: If specified, the model will configure which of the provided tools it can
-         use for the chat completions response. Is either a Union[str,
-         "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type.
-         Default value is None.
-        :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or
-         ~azure.ai.inference.models.ChatCompletionsNamedToolChoice
-        :keyword seed: If specified, the system will make a best effort to sample deterministically
-         such that repeated requests with the
-         same seed and parameters should return the same result. Determinism is not guaranteed.
-         Default value is None.
-        :paramtype seed: int
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    async def complete(
-        self,
-        body: JSON,
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data.
-
-        :param body: An object of type MutableMapping[str, Any], such as a dictionary, that
-         specifies the full request payload. Required.
-        :type body: JSON
-        :keyword content_type: Body Parameter content-type. Content type parameter for JSON body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    async def complete(
-        self,
-        body: IO[bytes],
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data.
-
-        :param body: Specifies the full request payload. Required.
-        :type body: IO[bytes]
-        :keyword content_type: Body Parameter content-type. Content type parameter for binary body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @distributed_trace_async
-    async def complete(
-        self,
-        body: Union[JSON, IO[bytes]] = _Unset,
-        *,
-        messages: List[_models.ChatRequestMessage] = _Unset,
-        stream: Optional[bool] = None,
-        frequency_penalty: Optional[float] = None,
-        presence_penalty: Optional[float] = None,
-        temperature: Optional[float] = None,
-        top_p: Optional[float] = None,
-        max_tokens: Optional[int] = None,
-        response_format: Optional[_models.ChatCompletionsResponseFormat] = None,
-        stop: Optional[List[str]] = None,
-        tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None,
-        tool_choice: Optional[
-            Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice]
-        ] = None,
-        seed: Optional[int] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]:
-        # pylint: disable=line-too-long
-        # pylint: disable=too-many-locals
-        """Gets chat completions for the provided chat messages.
-        Completions support a wide variety of tasks and generate text that continues from or
-        "completes" provided prompt data. When using this method with `stream=True`, the response is streamed
-        back to the client. Iterate over the resulting :class:`~azure.ai.inference.models.StreamingChatCompletions`
-        object to get content updates as they arrive.
-
-        :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type
-         that specifies the full request payload. Required.
-        :type body: JSON or IO[bytes]
-        :keyword messages: The collection of context messages associated with this chat completions
-         request.
-         Typical usage begins with a chat message for the System role that provides instructions for
-         the behavior of the assistant, followed by alternating messages between the User and
-         Assistant roles. Required.
-        :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage]
-        :keyword stream: A value indicating whether chat completions should be streamed for this request.
-         Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions.
-         Otherwise the response will be a ChatCompletions.
-        :paramtype stream: bool
-        :keyword frequency_penalty: A value that influences the probability of generated tokens
-         appearing based on their cumulative frequency in generated text.
-         Positive values will make tokens less likely to appear as their frequency increases and
-         decrease the likelihood of the model repeating the same statements verbatim.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype frequency_penalty: float
-        :keyword presence_penalty: A value that influences the probability of generated tokens
-         appearing based on their existing
-         presence in generated text.
-         Positive values will make tokens less likely to appear when they already exist and increase
-         the model's likelihood to output new topics.
-         Supported range is [-2, 2].
-         Default value is None.
-        :paramtype presence_penalty: float
-        :keyword temperature: The sampling temperature to use that controls the apparent creativity of
-         generated completions.
-         Higher values will make output more random while lower values will make results more focused
-         and deterministic.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype temperature: float
-        :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value
-         causes the
-         model to consider the results of tokens with the provided probability mass. As an example, a
-         value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be
-         considered.
-         It is not recommended to modify temperature and top_p for the same completions request as the
-         interaction of these two settings is difficult to predict.
-         Supported range is [0, 1].
-         Default value is None.
-        :paramtype top_p: float
-        :keyword max_tokens: The maximum number of tokens to generate. Default value is None.
-        :paramtype max_tokens: int
-        :keyword response_format: The format that the model must output. Use this to enable JSON mode
-         instead of the default text mode.
-         Note that to enable JSON mode, some AI models may also require you to instruct the model to
-         produce JSON via a system or user message. Default value is None.
-        :paramtype response_format: ~azure.ai.inference.models.ChatCompletionsResponseFormat
-        :keyword stop: A collection of textual sequences that will end completions generation. Default
-         value is None.
-        :paramtype stop: list[str]
-        :keyword tools: The available tool definitions that the chat completions request can use,
-         including caller-defined functions. Default value is None.
-        :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition]
-        :keyword tool_choice: If specified, the model will configure which of the provided tools it can
-         use for the chat completions response. Is either a Union[str,
-         "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type.
-         Default value is None.
-        :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or
-         ~azure.ai.inference.models.ChatCompletionsNamedToolChoice
-        :keyword seed: If specified, the system will make a best effort to sample deterministically
-         such that repeated requests with the
-         same seed and parameters should return the same result. Determinism is not guaranteed.
-         Default value is None.
-        :paramtype seed: int
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming.
-        :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        error_map = {
-            401: ClientAuthenticationError,
-            404: ResourceNotFoundError,
-            409: ResourceExistsError,
-            304: ResourceNotModifiedError,
-        }
-        error_map.update(kwargs.pop("error_map", {}) or {})
-
-        _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
-        _params = kwargs.pop("params", {}) or {}
-        _extra_parameters: Union[_models._enums.ExtraParameters, None] = None
-
-        content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None))
-
-        if body is _Unset:
-            if messages is _Unset:
-                raise TypeError("missing required argument: messages")
-            body = {
-                "messages": messages,
-                "stream": stream,
-                "frequency_penalty": frequency_penalty if frequency_penalty is not None else self._frequency_penalty,
-                "max_tokens": max_tokens if max_tokens is not None else self._max_tokens,
-                "model": model if model is not None else self._model,
-                "presence_penalty": presence_penalty if presence_penalty is not None else self._presence_penalty,
-                "response_format": response_format if response_format is not None else self._response_format,
-                "seed": seed if seed is not None else self._seed,
-                "stop": stop if stop is not None else self._stop,
-                "temperature": temperature if temperature is not None else self._temperature,
-                "tool_choice": tool_choice if tool_choice is not None else self._tool_choice,
-                "tools": tools if tools is not None else self._tools,
-                "top_p": top_p if top_p is not None else self._top_p,
-            }
-            if model_extras is not None and bool(model_extras):
-                body.update(model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            elif self._model_extras is not None and bool(self._model_extras):
-                body.update(self._model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            body = {k: v for k, v in body.items() if v is not None}
-        elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool):
-            stream = body["stream"]
-        content_type = content_type or "application/json"
-        _content = None
-        if isinstance(body, (IOBase, bytes)):
-            _content = body
-        else:
-            _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)  # type: ignore
-
-        _request = build_chat_completions_complete_request(
-            extra_params=_extra_parameters,
-            content_type=content_type,
-            api_version=self._config.api_version,
-            content=_content,
-            headers=_headers,
-            params=_params,
-        )
-        path_format_arguments = {
-            "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
-        }
-        _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
-        _stream = stream or False
-        pipeline_response: PipelineResponse = await self._client._pipeline.run(  # type: ignore # pylint: disable=protected-access
-            _request, stream=_stream, **kwargs
-        )
-
-        response = pipeline_response.http_response
-
-        if response.status_code not in [200]:
-            if _stream:
-                await response.read()  # Load the body in memory and close the socket
-            map_error(status_code=response.status_code, response=response, error_map=error_map)
-            raise HttpResponseError(response=response)
-
-        if _stream:
-            return _models.AsyncStreamingChatCompletions(response)
-
-        return _deserialize(_models._patch.ChatCompletions, response.json())  # pylint: disable=protected-access
-
-    @distributed_trace_async
-    async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
-        # pylint: disable=line-too-long
-        """Returns information about the AI model.
-        The method makes a REST API call to the ``/info`` route on the given endpoint.
-        This method will only work when using Serverless API or Managed Compute endpoint.
-        It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-        :return: ModelInfo. The ModelInfo is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.ModelInfo
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        if not self._model_info:
-            self._model_info = await self._get_model_info(**kwargs)  # pylint: disable=attribute-defined-outside-init
-        return self._model_info
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__()
-
-
-class EmbeddingsClient(EmbeddingsClientGenerated):
-    """EmbeddingsClient.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a AsyncTokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials_async.AsyncTokenCredential
-    :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-        have. Default value is None.
-    :paramtype dimensions: int
-    :keyword encoding_format: Optional. The desired format for the returned embeddings.
-        Known values are:
-        "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-    :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-    :keyword input_type: Optional. The type of the input. Known values are:
-        "text", "query", and "document". Default value is None.
-    :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-    :keyword model: ID of the specific AI model to use, if more than one model is available on the
-        endpoint. Default value is None.
-    :paramtype model: str
-    :keyword model_extras: Additional, model-specific parameters that are not in the
-        standard request payload. They will be added as-is to the root of the JSON in the request body.
-        How the service handles these extra parameters depends on the value of the
-        ``extra-parameters`` request header. Default value is None.
-    :paramtype model_extras: dict[str, Any]
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    """
-
-    def __init__(
-        self,
-        endpoint: str,
-        credential: Union[AzureKeyCredential, "AsyncTokenCredential"],
-        *,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        self._model_info: Optional[_models.ModelInfo] = None
-
-        # Store default embeddings settings, to be applied in all future service calls
-        # unless overridden by arguments in the `embed` method.
-        self._dimensions = dimensions
-        self._encoding_format = encoding_format
-        self._input_type = input_type
-        self._model = model
-        self._model_extras = model_extras
-
-        super().__init__(endpoint, credential, **kwargs)
-
-    @overload
-    async def embed(
-        self,
-        *,
-        input: List[str],
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :keyword input: Input text to embed, encoded as a string or array of tokens.
-         To embed multiple inputs in a single request, pass an array
-         of strings or array of token arrays. Required.
-        :paramtype input: list[str]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    async def embed(
-        self,
-        body: JSON,
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :param body: An object of type MutableMapping[str, Any], such as a dictionary, that
-         specifies the full request payload. Required.
-        :type body: JSON
-        :keyword content_type: Body Parameter content-type. Content type parameter for JSON body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    async def embed(
-        self,
-        body: IO[bytes],
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :param body: Specifies the full request payload. Required.
-        :type body: IO[bytes]
-        :keyword content_type: Body Parameter content-type. Content type parameter for binary body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @distributed_trace_async
-    async def embed(
-        self,
-        body: Union[JSON, IO[bytes]] = _Unset,
-        *,
-        input: List[str] = _Unset,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        # pylint: disable=line-too-long
-        """Return the embedding vectors for given text prompts.
-        The method makes a REST API call to the `/embeddings` route on the given endpoint.
-
-        :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type
-         that specifies the full request payload. Required.
-        :type body: JSON or IO[bytes]
-        :keyword input: Input text to embed, encoded as a string or array of tokens.
-         To embed multiple inputs in a single request, pass an array
-         of strings or array of token arrays. Required.
-        :paramtype input: list[str]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
-            401: ClientAuthenticationError,
-            404: ResourceNotFoundError,
-            409: ResourceExistsError,
-            304: ResourceNotModifiedError,
-        }
-        error_map.update(kwargs.pop("error_map", {}) or {})
-
-        _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
-        _params = kwargs.pop("params", {}) or {}
-        _extra_parameters: Union[_models._enums.ExtraParameters, None] = None
-
-        content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None))
-
-        if body is _Unset:
-            if input is _Unset:
-                raise TypeError("missing required argument: input")
-            body = {
-                "input": input,
-                "dimensions": dimensions if dimensions is not None else self._dimensions,
-                "encoding_format": encoding_format if encoding_format is not None else self._encoding_format,
-                "input_type": input_type if input_type is not None else self._input_type,
-                "model": model if model is not None else self._model,
-            }
-            if model_extras is not None and bool(model_extras):
-                body.update(model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            elif self._model_extras is not None and bool(self._model_extras):
-                body.update(self._model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            body = {k: v for k, v in body.items() if v is not None}
-        content_type = content_type or "application/json"
-        _content = None
-        if isinstance(body, (IOBase, bytes)):
-            _content = body
-        else:
-            _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)  # type: ignore
-
-        _request = build_embeddings_embed_request(
-            extra_params=_extra_parameters,
-            content_type=content_type,
-            api_version=self._config.api_version,
-            content=_content,
-            headers=_headers,
-            params=_params,
-        )
-        path_format_arguments = {
-            "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
-        }
-        _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
-        _stream = kwargs.pop("stream", False)
-        pipeline_response: PipelineResponse = await self._client._pipeline.run(  # type: ignore # pylint: disable=protected-access
-            _request, stream=_stream, **kwargs
-        )
-
-        response = pipeline_response.http_response
-
-        if response.status_code not in [200]:
-            if _stream:
-                await response.read()  # Load the body in memory and close the socket
-            map_error(status_code=response.status_code, response=response, error_map=error_map)
-            raise HttpResponseError(response=response)
-
-        if _stream:
-            deserialized = response.iter_bytes()
-        else:
-            deserialized = _deserialize(
-                _models._patch.EmbeddingsResult, response.json()  # pylint: disable=protected-access
-            )
-
-        return deserialized  # type: ignore
-
-    @distributed_trace_async
-    async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
-        # pylint: disable=line-too-long
-        """Returns information about the AI model.
-        The method makes a REST API call to the ``/info`` route on the given endpoint.
-        This method will only work when using Serverless API or Managed Compute endpoint.
-        It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-        :return: ModelInfo. The ModelInfo is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.ModelInfo
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        if not self._model_info:
-            self._model_info = await self._get_model_info(**kwargs)  # pylint: disable=attribute-defined-outside-init
-        return self._model_info
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__()
-
-
-class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated):
-    """ImageEmbeddingsClient.
-
-    :param endpoint: Service host. Required.
-    :type endpoint: str
-    :param credential: Credential used to authenticate requests to the service. Is either a
-     AzureKeyCredential type or a AsyncTokenCredential type. Required.
-    :type credential: ~azure.core.credentials.AzureKeyCredential or
-     ~azure.core.credentials_async.AsyncTokenCredential
-    :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-        have. Default value is None.
-    :paramtype dimensions: int
-    :keyword encoding_format: Optional. The desired format for the returned embeddings.
-        Known values are:
-        "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-    :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-    :keyword input_type: Optional. The type of the input. Known values are:
-        "text", "query", and "document". Default value is None.
-    :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-    :keyword model: ID of the specific AI model to use, if more than one model is available on the
-        endpoint. Default value is None.
-    :paramtype model: str
-    :keyword model_extras: Additional, model-specific parameters that are not in the
-        standard request payload. They will be added as-is to the root of the JSON in the request body.
-        How the service handles these extra parameters depends on the value of the
-        ``extra-parameters`` request header. Default value is None.
-    :paramtype model_extras: dict[str, Any]
-    :keyword api_version: The API version to use for this operation. Default value is
-     "2024-05-01-preview". Note that overriding this default value may result in unsupported
-     behavior.
-    :paramtype api_version: str
-    """
-
-    def __init__(
-        self,
-        endpoint: str,
-        credential: Union[AzureKeyCredential, "AsyncTokenCredential"],
-        *,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> None:
-
-        self._model_info: Optional[_models.ModelInfo] = None
-
-        # Store default embeddings settings, to be applied in all future service calls
-        # unless overridden by arguments in the `embed` method.
-        self._dimensions = dimensions
-        self._encoding_format = encoding_format
-        self._input_type = input_type
-        self._model = model
-        self._model_extras = model_extras
-
-        super().__init__(endpoint, credential, **kwargs)
-
-    @overload
-    async def embed(
-        self,
-        *,
-        input: List[_models.EmbeddingInput],
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an
-         array.
-         The input must not exceed the max input tokens for the model. Required.
-        :paramtype input: list[~azure.ai.inference.models.EmbeddingInput]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    async def embed(
-        self,
-        body: JSON,
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :param body: An object of type MutableMapping[str, Any], such as a dictionary, that
-         specifies the full request payload. Required.
-        :type body: JSON
-        :keyword content_type: Body Parameter content-type. Content type parameter for JSON body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @overload
-    async def embed(
-        self,
-        body: IO[bytes],
-        *,
-        content_type: str = "application/json",
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :param body: Specifies the full request payload. Required.
-        :type body: IO[bytes]
-        :keyword content_type: Body Parameter content-type. Content type parameter for binary body.
-         Default value is "application/json".
-        :paramtype content_type: str
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-
-    @distributed_trace_async
-    async def embed(
-        self,
-        body: Union[JSON, IO[bytes]] = _Unset,
-        *,
-        input: List[_models.EmbeddingInput] = _Unset,
-        dimensions: Optional[int] = None,
-        encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None,
-        input_type: Optional[Union[str, _models.EmbeddingInputType]] = None,
-        model: Optional[str] = None,
-        model_extras: Optional[Dict[str, Any]] = None,
-        **kwargs: Any,
-    ) -> _models.EmbeddingsResult:
-        # pylint: disable=line-too-long
-        """Return the embedding vectors for given images.
-        The method makes a REST API call to the `/images/embeddings` route on the given endpoint.
-
-        :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type
-         that specifies the full request payload. Required.
-        :type body: JSON or IO[bytes]
-        :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an
-         array.
-         The input must not exceed the max input tokens for the model. Required.
-        :paramtype input: list[~azure.ai.inference.models.EmbeddingInput]
-        :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should
-         have. Default value is None.
-        :paramtype dimensions: int
-        :keyword encoding_format: Optional. The desired format for the returned embeddings.
-         Known values are:
-         "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None.
-        :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat
-        :keyword input_type: Optional. The type of the input. Known values are:
-         "text", "query", and "document". Default value is None.
-        :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType
-        :keyword model: ID of the specific AI model to use, if more than one model is available on the
-         endpoint. Default value is None.
-        :paramtype model: str
-        :keyword model_extras: Additional, model-specific parameters that are not in the
-         standard request payload. They will be added as-is to the root of the JSON in the request body.
-         How the service handles these extra parameters depends on the value of the
-         ``extra-parameters`` request header. Default value is None.
-        :paramtype model_extras: dict[str, Any]
-        :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.EmbeddingsResult
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        error_map: MutableMapping[int, Type[HttpResponseError]] = {
-            401: ClientAuthenticationError,
-            404: ResourceNotFoundError,
-            409: ResourceExistsError,
-            304: ResourceNotModifiedError,
-        }
-        error_map.update(kwargs.pop("error_map", {}) or {})
-
-        _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {})
-        _params = kwargs.pop("params", {}) or {}
-        _extra_parameters: Union[_models._enums.ExtraParameters, None] = None
-
-        content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None))
-
-        if body is _Unset:
-            if input is _Unset:
-                raise TypeError("missing required argument: input")
-            body = {
-                "input": input,
-                "dimensions": dimensions if dimensions is not None else self._dimensions,
-                "encoding_format": encoding_format if encoding_format is not None else self._encoding_format,
-                "input_type": input_type if input_type is not None else self._input_type,
-                "model": model if model is not None else self._model,
-            }
-            if model_extras is not None and bool(model_extras):
-                body.update(model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            elif self._model_extras is not None and bool(self._model_extras):
-                body.update(self._model_extras)
-                _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH  # pylint: disable=protected-access
-            body = {k: v for k, v in body.items() if v is not None}
-        content_type = content_type or "application/json"
-        _content = None
-        if isinstance(body, (IOBase, bytes)):
-            _content = body
-        else:
-            _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True)  # type: ignore
-
-        _request = build_image_embeddings_embed_request(
-            extra_params=_extra_parameters,
-            content_type=content_type,
-            api_version=self._config.api_version,
-            content=_content,
-            headers=_headers,
-            params=_params,
-        )
-        path_format_arguments = {
-            "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True),
-        }
-        _request.url = self._client.format_url(_request.url, **path_format_arguments)
-
-        _stream = kwargs.pop("stream", False)
-        pipeline_response: PipelineResponse = await self._client._pipeline.run(  # type: ignore # pylint: disable=protected-access
-            _request, stream=_stream, **kwargs
-        )
-
-        response = pipeline_response.http_response
-
-        if response.status_code not in [200]:
-            if _stream:
-                await response.read()  # Load the body in memory and close the socket
-            map_error(status_code=response.status_code, response=response, error_map=error_map)
-            raise HttpResponseError(response=response)
-
-        if _stream:
-            deserialized = response.iter_bytes()
-        else:
-            deserialized = _deserialize(
-                _models._patch.EmbeddingsResult, response.json()  # pylint: disable=protected-access
-            )
-
-        return deserialized  # type: ignore
-
-    @distributed_trace_async
-    async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo:
-        # pylint: disable=line-too-long
-        """Returns information about the AI model.
-        The method makes a REST API call to the ``/info`` route on the given endpoint.
-        This method will only work when using Serverless API or Managed Compute endpoint.
-        It will not work for GitHub Models endpoint or Azure OpenAI endpoint.
-
-        :return: ModelInfo. The ModelInfo is compatible with MutableMapping
-        :rtype: ~azure.ai.inference.models.ModelInfo
-        :raises ~azure.core.exceptions.HttpResponseError:
-        """
-        if not self._model_info:
-            self._model_info = await self._get_model_info(**kwargs)  # pylint: disable=attribute-defined-outside-init
-        return self._model_info
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__()
-
-
-__all__: List[str] = [
-    "load_client",
-    "ChatCompletionsClient",
-    "EmbeddingsClient",
-    "ImageEmbeddingsClient",
-]  # Add all objects you want publicly available to users at this package level
+__all__: List[str] = []  # Add all objects you want publicly available to users at this package level
 
 
 def patch_sdk():
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py
index 1832edc83399..2244cab47521 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py
@@ -8,7 +8,7 @@
 
 from ._models import AssistantMessage
 from ._models import ChatChoice
-from ._patch import ChatCompletions
+from ._models import ChatCompletions
 from ._models import ChatCompletionsNamedToolChoice
 from ._models import ChatCompletionsNamedToolChoiceFunction
 from ._models import ChatCompletionsResponseFormat
@@ -22,12 +22,12 @@
 from ._models import ContentItem
 from ._models import EmbeddingInput
 from ._models import EmbeddingItem
-from ._patch import EmbeddingsResult
+from ._models import EmbeddingsResult
 from ._models import EmbeddingsUsage
 from ._models import FunctionCall
 from ._models import FunctionDefinition
 from ._models import ImageContentItem
-from ._patch import ImageUrl
+from ._models import ImageUrl
 from ._models import ModelInfo
 from ._models import StreamingChatChoiceUpdate
 from ._models import StreamingChatCompletionsUpdate
@@ -45,14 +45,11 @@
 from ._enums import EmbeddingInputType
 from ._enums import ImageDetailLevel
 from ._enums import ModelType
-
-from ._patch import StreamingChatCompletions
-from ._patch import AsyncStreamingChatCompletions
+from ._patch import __all__ as _patch_all
+from ._patch import *  # pylint: disable=unused-wildcard-import
 from ._patch import patch_sdk as _patch_sdk
 
 __all__ = [
-    "StreamingChatCompletions",
-    "AsyncStreamingChatCompletions",
     "AssistantMessage",
     "ChatChoice",
     "ChatCompletions",
@@ -92,5 +89,5 @@
     "ImageDetailLevel",
     "ModelType",
 ]
-
+__all__.extend([p for p in _patch_all if p not in __all__])
 _patch_sdk()
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py
index 830a93f75472..61443cbfbb85 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_enums.py
@@ -121,14 +121,14 @@ class ModelType(str, Enum, metaclass=CaseInsensitiveEnumMeta):
     """The type of AI model."""
 
     EMBEDDINGS = "embeddings"
-    """Embeddings."""
+    """A model capable of generating embeddings from text"""
     IMAGE_GENERATION = "image_generation"
-    """Image generation"""
+    """A model capable of generating images from an image and text description"""
     TEXT_GENERATION = "text_generation"
-    """Text generation"""
+    """A text generation model"""
     IMAGE_EMBEDDINGS = "image_embeddings"
-    """Image embeddings"""
+    """A model capable of generating embeddings from an image"""
     AUDIO_GENERATION = "audio_generation"
-    """Audio generation"""
-    CHAT = "chat"
-    """Chat completions"""
+    """A text-to-audio generative model"""
+    CHAT_COMPLETION = "chat_completion"
+    """A model capable of taking chat-formatted messages and generating responses"""
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py
index 4ac8f16f94d1..527d17045f19 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py
@@ -634,7 +634,7 @@ class EmbeddingItem(_model_base.Model):
     :vartype index: int
     """
 
-    embedding: Union["str", List[float]] = rest_field()
+    embedding: Union[str, List[float]] = rest_field()
     """List of embedding values for the input prompt. These represent a measurement of the
      vector-based relatedness of the provided input. Or a base64 encoded string of the embedding
      vector. Required. Is either a str type or a [float] type."""
@@ -915,7 +915,7 @@ class ModelInfo(_model_base.Model):
     :vartype model_name: str
     :ivar model_type: The type of the AI model. A Unique identifier for the profile. Required.
      Known values are: "embeddings", "image_generation", "text_generation", "image_embeddings",
-     "audio_generation", and "chat".
+     "audio_generation", and "chat_completion".
     :vartype model_type: str or ~azure.ai.inference.models.ModelType
     :ivar model_provider_name: The model provider name. For example: ``Microsoft Research``.
      Required.
@@ -927,7 +927,7 @@ class ModelInfo(_model_base.Model):
     model_type: Union[str, "_models.ModelType"] = rest_field()
     """The type of the AI model. A Unique identifier for the profile. Required. Known values are:
      \"embeddings\", \"image_generation\", \"text_generation\", \"image_embeddings\",
-     \"audio_generation\", and \"chat\"."""
+     \"audio_generation\", and \"chat_completion\"."""
     model_provider_name: str = rest_field()
     """The model provider name. For example: ``Microsoft Research``. Required."""
 
@@ -1269,7 +1269,7 @@ class UserMessage(ChatRequestMessage, discriminator="user"):
     role: Literal[ChatRole.USER] = rest_discriminator(name="role")  # type: ignore
     """The chat role associated with this message, which is always 'user' for user messages. Required.
      The role that provides input for chat completions."""
-    content: Union["str", List["_models.ContentItem"]] = rest_field()
+    content: Union[str, List["_models.ContentItem"]] = rest_field()
     """The contents of the user message, with available input types varying by selected model.
      Required. Is either a str type or a [ContentItem] type."""
 
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py
index b5950abe582a..f7dd32510333 100644
--- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py
+++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py
@@ -6,273 +6,9 @@
 
 Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
 """
-import asyncio
-import base64
-import json
-import logging
-import queue
-import re
-import sys
+from typing import List
 
-from typing import List, AsyncIterator, Iterator, Optional, Union
-from azure.core.rest import HttpResponse, AsyncHttpResponse
-from ._models import ImageUrl as ImageUrlGenerated
-from ._models import ChatCompletions as ChatCompletionsGenerated
-from ._models import EmbeddingsResult as EmbeddingsResultGenerated
-from .. import models as _models
-
-if sys.version_info >= (3, 11):
-    from typing import Self
-else:
-    from typing_extensions import Self
-
-logger = logging.getLogger(__name__)
-
-
-class ChatCompletions(ChatCompletionsGenerated):
-    """Representation of the response data from a chat completions request.
-    Completions support a wide variety of tasks and generate text that continues from or
-    "completes"
-    provided prompt data.
-
-
-    :ivar id: A unique identifier associated with this chat completions response. Required.
-    :vartype id: str
-    :ivar created: The first timestamp associated with generation activity for this completions
-     response,
-     represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required.
-    :vartype created: ~datetime.datetime
-    :ivar model: The model used for the chat completion. Required.
-    :vartype model: str
-    :ivar usage: Usage information for tokens processed and generated as part of this completions
-     operation. Required.
-    :vartype usage: ~azure.ai.inference.models.CompletionsUsage
-    :ivar choices: The collection of completions choices associated with this completions response.
-     Generally, ``n`` choices are generated per provided prompt with a default value of 1.
-     Token limits and other settings may limit the number of choices generated. Required.
-    :vartype choices: list[~azure.ai.inference.models.ChatChoice]
-    """
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return json.dumps(self.as_dict(), indent=2)
-
-
-class EmbeddingsResult(EmbeddingsResultGenerated):
-    """Representation of the response data from an embeddings request.
-    Embeddings measure the relatedness of text strings and are commonly used for search,
-    clustering,
-    recommendations, and other similar scenarios.
-
-
-    :ivar data: Embedding values for the prompts submitted in the request. Required.
-    :vartype data: list[~azure.ai.inference.models.EmbeddingItem]
-    :ivar usage: Usage counts for tokens input using the embeddings API. Required.
-    :vartype usage: ~azure.ai.inference.models.EmbeddingsUsage
-    :ivar model: The model ID used to generate this result. Required.
-    :vartype model: str
-    """
-
-    def __str__(self) -> str:
-        # pylint: disable=client-method-name-no-double-underscore
-        return json.dumps(self.as_dict(), indent=2)
-
-
-class ImageUrl(ImageUrlGenerated):
-
-    @classmethod
-    def load(
-        cls, *, image_file: str, image_format: str, detail: Optional[Union[str, "_models.ImageDetailLevel"]] = None
-    ) -> Self:
-        """
-        Create an ImageUrl object from a local image file. The method reads the image
-        file and encodes it as a base64 string, which together with the image format
-        is then used to format the JSON `url` value passed in the request payload.
-
-        :ivar image_file: The name of the local image file to load. Required.
-        :vartype image_file: str
-        :ivar image_format: The MIME type format of the image. For example: "jpeg", "png". Required.
-        :vartype image_format: str
-        :ivar detail: The evaluation quality setting to use, which controls relative prioritization of
-         speed, token consumption, and accuracy. Known values are: "auto", "low", and "high".
-        :vartype detail: str or ~azure.ai.inference.models.ImageDetailLevel
-        :return: An ImageUrl object with the image data encoded as a base64 string.
-        :rtype: ~azure.ai.inference.models.ImageUrl
-        :raises FileNotFoundError: when the image file could not be opened.
-        """
-        with open(image_file, "rb") as f:
-            image_data = base64.b64encode(f.read()).decode("utf-8")
-        url = f"data:image/{image_format};base64,{image_data}"
-        return cls(url=url, detail=detail)
-
-
-class BaseStreamingChatCompletions:
-    """A base class for the sync and async streaming chat completions responses, holding any common code
-    to deserializes the Server Sent Events (SSE) response stream into chat completions updates, each one
-    represented by a StreamingChatCompletionsUpdate object.
-    """
-
-    # Enable detailed logs of SSE parsing. For development only, should be `False` by default.
-    _ENABLE_CLASS_LOGS = False
-
-    # The prefix of each line in the SSE stream that contains a JSON string
-    # to deserialize into a StreamingChatCompletionsUpdate object
-    _SSE_DATA_EVENT_PREFIX = "data: "
-
-    # The line indicating the end of the SSE stream
-    _SSE_DATA_EVENT_DONE = "data: [DONE]"
-
-    def __init__(self):
-        self._queue: "queue.Queue[_models.StreamingChatCompletionsUpdate]" = queue.Queue()
-        self._incomplete_json = ""
-        self._done = False  # Will be set to True when reading 'data: [DONE]' line
-
-    def _deserialize_and_add_to_queue(self, element: bytes) -> bool:
-
-        # Clear the queue of StreamingChatCompletionsUpdate before processing the next block
-        self._queue.queue.clear()
-
-        # Convert `bytes` to string and split the string by newline, while keeping the new line char.
-        # the last may be a partial "line" that does not contain a newline char at the end.
-        line_list: List[str] = re.split(r"(?<=\n)", element.decode("utf-8"))
-        for index, line in enumerate(line_list):
-
-            if self._ENABLE_CLASS_LOGS:
-                logger.debug("[Original line] %s", repr(line))
-
-            if index == 0:
-                line = self._incomplete_json + line
-                self._incomplete_json = ""
-
-            if index == len(line_list) - 1 and not line.endswith("\n"):
-                self._incomplete_json = line
-                return False
-
-            if self._ENABLE_CLASS_LOGS:
-                logger.debug("[Modified line] %s", repr(line))
-
-            if line == "\n":  # Empty line, indicating flush output to client
-                continue
-
-            if not line.startswith(self._SSE_DATA_EVENT_PREFIX):
-                raise ValueError(f"SSE event not supported (line `{line}`)")
-
-            if line.startswith(self._SSE_DATA_EVENT_DONE):
-                if self._ENABLE_CLASS_LOGS:
-                    logger.debug("[Done]")
-                return True
-
-            # If you reached here, the line should contain `data: {...}\n`
-            # where the curly braces contain a valid JSON object.
-            # Deserialize it into a StreamingChatCompletionsUpdate object
-            # and add it to the queue.
-            # pylint: disable=W0212 # Access to a protected member _deserialize of a client class
-            update = _models.StreamingChatCompletionsUpdate._deserialize(
-                json.loads(line[len(self._SSE_DATA_EVENT_PREFIX) : -1]), []
-            )
-
-            # We skip any update that has a None or empty choices list
-            # (this is what OpenAI Python SDK does)
-            if update.choices:
-
-                # We update all empty content strings to None
-                # (this is what OpenAI Python SDK does)
-                # for choice in update.choices:
-                #    if not choice.delta.content:
-                #        choice.delta.content = None
-
-                self._queue.put(update)
-
-            if self._ENABLE_CLASS_LOGS:
-                logger.debug("[Added to queue]")
-
-        return False
-
-
-class StreamingChatCompletions(BaseStreamingChatCompletions):
-    """Represents an interator over StreamingChatCompletionsUpdate objects. It can be used for either synchronous or
-    asynchronous iterations. The class deserializes the Server Sent Events (SSE) response stream
-    into chat completions updates, each one represented by a StreamingChatCompletionsUpdate object.
-    """
-
-    def __init__(self, response: HttpResponse):
-        super().__init__()
-        self._response = response
-        self._bytes_iterator: Iterator[bytes] = response.iter_bytes()
-
-    def __iter__(self):
-        return self
-
-    def __next__(self) -> "_models.StreamingChatCompletionsUpdate":
-        while self._queue.empty() and not self._done:
-            self._done = self._read_next_block()
-        if self._queue.empty():
-            raise StopIteration
-        return self._queue.get()
-
-    def _read_next_block(self) -> bool:
-        if self._ENABLE_CLASS_LOGS:
-            logger.debug("[Reading next block]")
-        try:
-            element = self._bytes_iterator.__next__()
-        except StopIteration:
-            self.close()
-            return True
-        return self._deserialize_and_add_to_queue(element)
-
-    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-        self.close()
-
-    def close(self) -> None:
-        self._response.close()
-
-
-class AsyncStreamingChatCompletions(BaseStreamingChatCompletions):
-    """Represents an async interator over StreamingChatCompletionsUpdate objects.
-    It can be used for either synchronous or asynchronous iterations. The class
-    deserializes the Server Sent Events (SSE) response stream into chat
-    completions updates, each one represented by a StreamingChatCompletionsUpdate object.
-    """
-
-    def __init__(self, response: AsyncHttpResponse):
-        super().__init__()
-        self._response = response
-        self._bytes_iterator: AsyncIterator[bytes] = response.iter_bytes()
-
-    def __aiter__(self):
-        return self
-
-    async def __anext__(self) -> "_models.StreamingChatCompletionsUpdate":
-        while self._queue.empty() and not self._done:
-            self._done = await self._read_next_block_async()
-        if self._queue.empty():
-            raise StopAsyncIteration
-        return self._queue.get()
-
-    async def _read_next_block_async(self) -> bool:
-        if self._ENABLE_CLASS_LOGS:
-            logger.debug("[Reading next block]")
-        try:
-            element = await self._bytes_iterator.__anext__()
-        except StopAsyncIteration:
-            await self.aclose()
-            return True
-        return self._deserialize_and_add_to_queue(element)
-
-    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
-        asyncio.run(self.aclose())
-
-    async def aclose(self) -> None:
-        await self._response.close()
-
-
-__all__: List[str] = [
-    "ImageUrl",
-    "ChatCompletions",
-    "EmbeddingsResult",
-    "StreamingChatCompletions",
-    "AsyncStreamingChatCompletions",
-]  # Add all objects you want publicly available to users at this package level
+__all__: List[str] = []  # Add all objects you want publicly available to users at this package level
 
 
 def patch_sdk():
diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py
index ec2dd6afae75..25d6ce20cce7 100644
--- a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py
+++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_json_async.py
@@ -58,10 +58,7 @@ async def sample_chat_completions_from_input_json_async():
                     "role": "assistant",
                     "content": "The main construction of the International Space Station (ISS) was completed between 1998 and 2011. During this period, more than 30 flights by US space shuttles and 40 by Russian rockets were conducted to transport components and modules to the station.",
                 },
-                {
-                    "role": "user",
-                    "content": "And what was the estimated cost to build it?"
-                },
+                {"role": "user", "content": "And what was the estimated cost to build it?"},
             ]
         }
 
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py
index 925583af4772..78a9b9a42690 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json.py
@@ -58,10 +58,7 @@ def sample_chat_completions_from_input_json():
                     "role": "assistant",
                     "content": "The main construction of the International Space Station (ISS) was completed between 1998 and 2011. During this period, more than 30 flights by US space shuttles and 40 by Russian rockets were conducted to transport components and modules to the station.",
                 },
-                {
-                    "role": "user",
-                    "content": "And what was the estimated cost to build it?"
-                },
+                {"role": "user", "content": "And what was the estimated cost to build it?"},
             ]
         }
     )
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json_with_image_url.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json_with_image_url.py
index 912b98afccb8..83f3afceaa19 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json_with_image_url.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_json_with_image_url.py
@@ -54,9 +54,7 @@ def sample_chat_completions_from_input_json_with_image_url():
         model_deployment = None
 
     client = ChatCompletionsClient(
-        endpoint=endpoint,
-        credential=AzureKeyCredential(key),
-        headers={"azureml-model-deployment": model_deployment}
+        endpoint=endpoint, credential=AzureKeyCredential(key), headers={"azureml-model-deployment": model_deployment}
     )
 
     response = client.complete(
@@ -69,10 +67,7 @@ def sample_chat_completions_from_input_json_with_image_url():
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "text",
-                            "text": "What's in this image?"
-                        },
+                        {"type": "text", "text": "What's in this image?"},
                         {
                             "type": "image_url",
                             "image_url": {
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py
index da24328ece8b..beb3155f268c 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py
@@ -35,6 +35,7 @@
 
 use_azure_openai_endpoint = True
 
+
 def sample_chat_completions_streaming_with_tools():
     import os
     import json
@@ -79,11 +80,9 @@ def get_flight_info(origin_city: str, destination_city: str):
         str: The airline name, fight number, date and time of the next flight between the cities, in JSON format.
         """
         if origin_city == "Seattle" and destination_city == "Miami":
-            return json.dumps({
-                "airline": "Delta",
-                "flight_number": "DL123",
-                "flight_date": "May 7th, 2024",
-                "flight_time": "10:00AM"})
+            return json.dumps(
+                {"airline": "Delta", "flight_number": "DL123", "flight_date": "May 7th, 2024", "flight_time": "10:00AM"}
+            )
         return json.dumps({"error": "No flights found between the cities"})
 
     # Define a function 'tool' that the model can use to retrieves flight information
@@ -118,10 +117,7 @@ def get_flight_info(origin_city: str, destination_city: str):
         )
     else:
         # Create a chat completions client for Serverless API endpoint or Managed Compute endpoint
-        client = ChatCompletionsClient(
-            endpoint=endpoint,
-            credential=AzureKeyCredential(key)
-        )
+        client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
 
     # Make a streaming chat completions call asking for flight information, while providing a tool to handle the request
     messages = [
@@ -129,10 +125,7 @@ def get_flight_info(origin_city: str, destination_city: str):
         UserMessage(content="What is the next flights from Seattle to Miami?"),
     ]
 
-    response = client.complete(
-        messages=messages,
-        tools=[flight_info],
-        stream=True)
+    response = client.complete(messages=messages, tools=[flight_info], stream=True)
 
     # Note that in the above call we did not specify `tool_choice`. The service defaults to a setting equivalent
     # to specifying `tool_choice=ChatCompletionsToolChoicePreset.AUTO`. Other than ChatCompletionsToolChoicePreset
@@ -159,11 +152,7 @@ def get_flight_info(origin_city: str, destination_city: str):
         AssistantMessage(
             tool_calls=[
                 ChatCompletionsToolCall(
-                    id=tool_call_id,
-                    function=FunctionCall(
-                        name=function_name,
-                        arguments=function_args
-                    )
+                    id=tool_call_id, function=FunctionCall(name=function_name, arguments=function_args)
                 )
             ]
         )
@@ -177,19 +166,10 @@ def get_flight_info(origin_city: str, destination_city: str):
     print(f"Function response = {function_response}")
 
     # Append the function response as a tool message to the chat history
-    messages.append(
-        ToolMessage(
-            tool_call_id=tool_call_id,
-            content=function_response
-        )
-    )
+    messages.append(ToolMessage(tool_call_id=tool_call_id, content=function_response))
 
     # With the additional tools information on hand, get another streaming response from the model
-    response = client.complete(
-        messages=messages,
-        tools=[flight_info],
-        stream=True
-    )
+    response = client.complete(messages=messages, tools=[flight_info], stream=True)
 
     print("Model response = ", end="")
     for update in response:
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_defaults.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_defaults.py
index 36f43a5601a4..80936ee03142 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_defaults.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_defaults.py
@@ -43,10 +43,7 @@ def sample_chat_completions_with_defaults():
 
     # Create a client with default chat completions settings
     client = ChatCompletionsClient(
-        endpoint=endpoint,
-        credential=AzureKeyCredential(key),
-        temperature=0.5,
-        max_tokens=1000
+        endpoint=endpoint, credential=AzureKeyCredential(key), temperature=0.5, max_tokens=1000
     )
 
     # Call the service with the defaults specified above
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py
index 3d14a550ab68..2074c447fdfe 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py
@@ -64,14 +64,11 @@ def get_flight_info(origin_city: str, destination_city: str):
         str: The airline name, fight number, date and time of the next flight between the cities, in JSON format.
         """
         if origin_city == "Seattle" and destination_city == "Miami":
-            return json.dumps({
-                "airline": "Delta",
-                "flight_number": "DL123",
-                "flight_date": "May 7th, 2024",
-                "flight_time": "10:00AM"})
+            return json.dumps(
+                {"airline": "Delta", "flight_number": "DL123", "flight_date": "May 7th, 2024", "flight_time": "10:00AM"}
+            )
         return json.dumps({"error": "No flights found between the cities"})
 
-
     # Define a function 'tool' that the model can use to retrieves flight information
     flight_info = ChatCompletionsToolDefinition(
         function=FunctionDefinition(
@@ -95,10 +92,7 @@ def get_flight_info(origin_city: str, destination_city: str):
     )
 
     # Create a chat completion client. Make sure you selected a model that supports tools.
-    client = ChatCompletionsClient(
-        endpoint=endpoint,
-        credential=AzureKeyCredential(key)
-    )
+    client = ChatCompletionsClient(endpoint=endpoint, credential=AzureKeyCredential(key))
 
     # Make a chat completions call asking for flight information, while providing a tool to handle the request
     messages = [
diff --git a/sdk/ai/azure-ai-inference/samples/sample_embeddings_with_base64_encoding.py b/sdk/ai/azure-ai-inference/samples/sample_embeddings_with_base64_encoding.py
index 9d9ec9c5c492..248bccb83a55 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_embeddings_with_base64_encoding.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_embeddings_with_base64_encoding.py
@@ -44,13 +44,15 @@ def sample_embeddings_with_base64_encoding():
 
     # Request embeddings as base64 encoded strings
     response = client.embed(
-        input=["first phrase", "second phrase", "third phrase"],
-        encoding_format=EmbeddingEncodingFormat.BASE64)
+        input=["first phrase", "second phrase", "third phrase"], encoding_format=EmbeddingEncodingFormat.BASE64
+    )
 
     for item in response.data:
         # Display the start and end of the resulting base64 string
-        print(f"data[{item.index}] encoded (string length={len(item.embedding)}): "
-              f"\"{item.embedding[:32]}...{item.embedding[-32:]}\"")
+        print(
+            f"data[{item.index}] encoded (string length={len(item.embedding)}): "
+            f'"{item.embedding[:32]}...{item.embedding[-32:]}"'
+        )
 
         # For display purposes, decode the string into a list of floating point numbers.
         # Display the first and last two elements of the list.
diff --git a/sdk/ai/azure-ai-inference/samples/sample_image_embeddings_with_defaults.py b/sdk/ai/azure-ai-inference/samples/sample_image_embeddings_with_defaults.py
index 3ce84554ab4d..5282f22e4f45 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_image_embeddings_with_defaults.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_image_embeddings_with_defaults.py
@@ -49,10 +49,7 @@ def sample_image_embeddings_with_defaults():
 
     # Create a client with default embeddings settings
     client = ImageEmbeddingsClient(
-        endpoint=endpoint,
-        credential=AzureKeyCredential(key),
-        dimensions=1024,
-        input_type=EmbeddingInputType.QUERY
+        endpoint=endpoint, credential=AzureKeyCredential(key), dimensions=1024, input_type=EmbeddingInputType.QUERY
     )
 
     # Call the service with the defaults specified above
diff --git a/sdk/ai/azure-ai-inference/sdk_packaging.toml b/sdk/ai/azure-ai-inference/sdk_packaging.toml
new file mode 100644
index 000000000000..e7687fdae93b
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/sdk_packaging.toml
@@ -0,0 +1,2 @@
+[packaging]
+auto_update = false
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/tsp-location.yaml b/sdk/ai/azure-ai-inference/tsp-location.yaml
index df185250688b..e3d26332c4dd 100644
--- a/sdk/ai/azure-ai-inference/tsp-location.yaml
+++ b/sdk/ai/azure-ai-inference/tsp-location.yaml
@@ -1,4 +1,4 @@
 directory: specification/ai/ModelClient
-commit: 3e95e575e537024a02470cf59c7a78078dc10cd1
+commit: 363e37a1282dd6750232ffd49ed07e6124d2675d
 repo: Azure/azure-rest-api-specs
-additionalDirectories:
+additionalDirectories: