diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py index 60fb5e0306dd..32e920651062 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_build_mlindex.py @@ -38,8 +38,8 @@ def build_index( from azure.ai.generative.index._documents import DocumentChunksIterator, split_documents from azure.ai.generative.index._embeddings import EmbeddingsContainer from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 from azure.ai.generative.index._utils.logging import disable_mlflow - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 except ImportError as e: print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed") raise e @@ -176,7 +176,7 @@ def _create_mlindex_from_existing_acs( ) -> Index: try: from azure.ai.generative.index._embeddings import EmbeddingsContainer - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 except ImportError as e: print("In order to use build_index to build an Index locally, you must have azure-ai-generative[index] installed") raise e diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/__init__.py new file mode 100644 index 000000000000..5328bdc55b18 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/__init__.py @@ -0,0 +1,20 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""DataIndex configuration and operations.""" + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) + +from azure.ai.generative.index._dataindex.entities import Data, CitationRegex, DataIndex, Embedding, IndexSource, IndexStore, index_data +from azure.ai.generative.index._dataindex.operations import DataOperations + +__all__ = [ + "DataOperations", + "DataIndex", + "IndexSource", + "Data", + "CitationRegex", + "Embedding", + "IndexStore", + "index_data", +] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ---------------------------------------------------------
+
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/__init__.py
new file mode 100644
index 000000000000..14f3af959399
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/__init__.py
@@ -0,0 +1,23 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
+from .data_index import (
+    CitationRegexSchema,
+    DataIndexSchema,
+    DataIndexTypes,
+    EmbeddingSchema,
+    IndexSourceSchema,
+    IndexStoreSchema,
+)
+
+__all__ = [
+    "DataIndexSchema",
+    "IndexSourceSchema",
+    "CitationRegexSchema",
+    "EmbeddingSchema",
+    "IndexStoreSchema",
+    "DataIndexTypes",
+]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/data_index.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/data_index.py
new file mode 100644
index 000000000000..bfe70f705f37
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/_schema/_data_index/data_index.py
@@ -0,0 +1,226 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+# pylint: disable=unused-argument
+
+from marshmallow import fields, post_load
+
+from azure.ai.ml._schema.assets.data import DataSchema
+from azure.ai.ml._schema.core.fields import ArmVersionedStr, LocalPathField, NestedField, StringTransformedEnum, UnionField
+from azure.ai.ml._schema.core.schema import PatchedSchemaMeta
+from azure.ai.ml._schema.job.input_output_entry import generate_datastore_property
+from azure.ai.ml._utils._experimental import experimental
+from azure.ai.ml.constants._common import AssetTypes, AzureMLResourceType, InputOutputModes
+
+
+# FROM: azure.ai.ml._schema.job.input_output_entry
+def generate_path_property(azureml_type, **kwargs):
+    return UnionField(
+        [
+            ArmVersionedStr(azureml_type=azureml_type),
+            fields.Str(metadata={"pattern": r"^(http(s)?):.*"}),
+            fields.Str(metadata={"pattern": r"^(wasb(s)?):.*"}),
+            LocalPathField(pattern=r"^file:.*"),
+            LocalPathField(
+                pattern=r"^(?!(azureml|http(s)?|wasb(s)?|file):).*",
+            ),
+        ],
+        is_strict=True,
+        **kwargs,
+    )
+
+
+class DataIndexTypes:
+    """DataIndexTypes is an enumeration of values for the types of indexes which can be written to by DataIndex."""
+
+    ACS = "acs"
+    """Azure Cognitive Search index type."""
+    FAISS = "faiss"
+    """Faiss index type."""
+
+
+class CitationRegexSchema(metaclass=PatchedSchemaMeta):
+    match_pattern = fields.Str(
+        required=True,
+        metadata={"description": r"Regex to match citation in the citation_url + input file path. e.g. '(.*)/articles/(.*)(\.[^.]+)$'"},
+    )
+    replacement_pattern = fields.Str(
+        required=True,
+        metadata={"description": "Replacement string for citation. e.g. '\\1/\\2'"},
+    )
+
+    @post_load
+    def make(self, data, **kwargs):
+        from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex
+
+        return CitationRegex(**data)
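A quick sketch of how the two `CitationRegex` fields combine. The indexing components join `citation_url` with each chunk's file path and then apply the match/replacement pair; the paths and URL here are hypothetical, and plain `re.sub` stands in for that behavior:

```python
import re

# Hypothetical values matching the docstring examples above.
match_pattern = r"(.*)/articles/(.*)(\.[^.]+)$"  # groups: prefix, article path, extension
replacement_pattern = r"\1/\2"                   # drop '/articles/' and the file extension

citation_url = "https://contoso.example/docs"
file_path = "articles/how-to/search.md"

full_path = f"{citation_url}/{file_path}"
citation = re.sub(match_pattern, replacement_pattern, full_path)
print(citation)  # https://contoso.example/docs/how-to/search
```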
" + "Will compare input data to existing embeddings and only embed changed/new data, " + "reusing existing chunks." + }, + ) + + @post_load + def make(self, data, **kwargs): + from azure.ai.generative.index._dataindex.entities.data_index import Embedding + + return Embedding(**data) + + +class IndexStoreSchema(metaclass=PatchedSchemaMeta): + type = StringTransformedEnum( + allowed_values=[ + DataIndexTypes.ACS, + DataIndexTypes.FAISS, + ], + metadata={"description": "The type of index to write to. Currently supported types are 'acs' and 'faiss'."}, + ) + name = fields.Str( + required=False, + metadata={"description": "Name of the index to write to. If not specified, a name will be generated."}, + ) + connection = fields.Str( + required=False, + metadata={ + "description": "Connection reference to use for index information, " + "only needed for hosted indexes (such as Azure Cognitive Search)." + }, + ) + config = fields.Dict( + required=False, + metadata={ + "description": "Configuration for the index. Primary use is to configure Azure Cognitive Search specific settings." + "Such as custom `field_mapping` for known field types." + } + ) + + @post_load + def make(self, data, **kwargs): + from azure.ai.generative.index._dataindex.entities.data_index import IndexStore + + return IndexStore(**data) + + +@experimental +class DataIndexSchema(DataSchema): + source = NestedField(IndexSourceSchema, required=True, allow_none=False) + embedding = NestedField(EmbeddingSchema, required=True, allow_none=False) + index = NestedField(IndexStoreSchema, required=True, allow_none=False) + incremental_update = fields.Bool() + + @post_load + def make(self, data, **kwargs): + from azure.ai.generative.index._dataindex.entities.data_index import DataIndex + + return DataIndex(**data) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py new file mode 100644 index 000000000000..1c927e0b8972 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py @@ -0,0 +1,26 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py
new file mode 100644
index 000000000000..624f5ee88ecf
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/__init__.py
@@ -0,0 +1,6 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py
new file mode 100644
index 000000000000..1c927e0b8972
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/constants/_component.py
@@ -0,0 +1,26 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+class DataIndexComponentUri(object):
+    DATA_INDEX_COG_SEARCH = "azureml://registries/azureml/components/llm_ingest_dataset_to_acs_basic/labels/default"
+    DATA_INDEX_FAISS = "azureml://registries/azureml/components/llm_ingest_dataset_to_faiss_basic/labels/default"
+
+    @staticmethod
+    def with_registry(component_uri: str, registry_name: str) -> str:
+        return component_uri.replace("azureml://registries/azureml", f"azureml://registries/{registry_name}")
+
+
+class LLMRAGComponentUri(object):
+    LLM_RAG_CRACK_AND_CHUNK = "azureml://registries/azureml/components/llm_rag_crack_and_chunk/labels/default"
+    LLM_RAG_GENERATE_EMBEDDINGS = "azureml://registries/azureml/components/llm_rag_generate_embeddings/labels/default"
+    LLM_RAG_CRACK_AND_CHUNK_AND_EMBED = (
+        "azureml://registries/azureml/components/llm_rag_crack_and_chunk_and_embed/labels/default"
+    )
+    LLM_RAG_UPDATE_ACS_INDEX = "azureml://registries/azureml/components/llm_rag_update_acs_index/labels/default"
+    LLM_RAG_CREATE_FAISS_INDEX = "azureml://registries/azureml/components/llm_rag_create_faiss_index/labels/default"
+    LLM_RAG_REGISTER_MLINDEX_ASSET = (
+        "azureml://registries/azureml/components/llm_rag_register_mlindex_asset/labels/default"
+    )
+    LLM_RAG_VALIDATE_DEPLOYMENTS = "azureml://registries/azureml/components/llm_rag_validate_deployments/labels/default"
+    LLM_RAG_CREATE_PROMPTFLOW = "azureml://registries/azureml/components/llm_rag_create_promptflow/labels/default"
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/__init__.py
new file mode 100644
index 000000000000..37ec50189954
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/__init__.py
@@ -0,0 +1,19 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""DataIndex configuration and operations."""
+
+from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
+from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex, Data, DataIndex, Embedding, IndexSource, IndexStore
+from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data
+
+__all__ = [
+    "DataIndex",
+    "IndexSource",
+    "Data",
+    "CitationRegex",
+    "Embedding",
+    "IndexStore",
+    "index_data",
+    "build_model_protocol",
+]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/models.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/models.py
new file mode 100644
index 000000000000..22521bf94d4b
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/data_index/models.py
@@ -0,0 +1,26 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""DataIndex embedding model helpers."""
+import re
+from typing import Optional
+
+OPEN_AI_PROTOCOL_TEMPLATE = "azure_open_ai://deployment/{}/model/{}"
+OPEN_AI_PROTOCOL_REGEX_PATTERN = OPEN_AI_PROTOCOL_TEMPLATE.format(".*", ".*")
+OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE = "azure_open_ai://deployments?/{}"
+OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN = OPEN_AI_SHORT_FORM_PROTOCOL_TEMPLATE.format(".*")
+
+HUGGINGFACE_PROTOCOL_TEMPLATE = "hugging_face://model/{}"
+HUGGINGFACE_PROTOCOL_REGEX_PATTERN = HUGGINGFACE_PROTOCOL_TEMPLATE.format(".*")
+
+
+def build_model_protocol(model: Optional[str] = None):
+    """Build a model protocol from user input."""
+    if not model or re.match(OPEN_AI_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
+        return model
+    if re.match(OPEN_AI_SHORT_FORM_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
+        return model
+    if re.match(HUGGINGFACE_PROTOCOL_REGEX_PATTERN, model, re.IGNORECASE):
+        return model
+
+    return OPEN_AI_PROTOCOL_TEMPLATE.format(model, model)
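Note the fix above: the second assignment previously shadowed `OPEN_AI_PROTOCOL_REGEX_PATTERN`, and the short-form branch matched against the raw template (whose literal `{}` never matches), so short-form URIs were silently expanded again. A quick sanity sketch of the intended behavior (outputs follow from the templates, not from the PR's tests):

```python
from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol

# A bare deployment name expands into the long-form Azure OpenAI protocol,
# reusing the same string for deployment and model.
assert build_model_protocol("text-embedding-ada-002") == (
    "azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002"
)

# Strings already in a recognized protocol form pass through unchanged.
assert build_model_protocol("azure_open_ai://deployment/my-dep") == "azure_open_ai://deployment/my-dep"
assert build_model_protocol("hugging_face://model/sentence-transformers/all-mpnet-base-v2") == (
    "hugging_face://model/sentence-transformers/all-mpnet-base-v2"
)
assert build_model_protocol(None) is None
```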
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/__init__.py
new file mode 100644
index 000000000000..624f5ee88ecf
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/__init__.py
@@ -0,0 +1,6 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/_pipeline_decorator.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/_pipeline_decorator.py
new file mode 100644
index 000000000000..033d63b445df
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/dsl/_pipeline_decorator.py
@@ -0,0 +1,259 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# --------------------------------------------------------- + +# pylint: disable=protected-access + +import inspect +import logging +from collections import OrderedDict +from functools import wraps +from inspect import Parameter, signature +from pathlib import Path +from typing import Callable, Dict, List, Optional, TypeVar, Union, overload + +from typing_extensions import ParamSpec + +from azure.ai.ml._utils.utils import is_private_preview_enabled +from azure.ai.ml.entities import Data, Model, PipelineJob, PipelineJobSettings +from azure.ai.ml.entities._builders.pipeline import Pipeline +from azure.ai.ml.entities._inputs_outputs import Input, is_group +from azure.ai.ml.entities._job.pipeline._io import NodeOutput, PipelineInput, _GroupAttrDict +from azure.ai.ml.entities._job.pipeline._pipeline_expression import PipelineExpression +from azure.ai.ml.exceptions import ( + MultipleValueError, + ParamValueNotExistsError, + TooManyPositionalArgsError, + UnexpectedKeywordError, + UnsupportedParameterKindError, + UserErrorException, +) + +from azure.ai.ml.entities._builders import BaseNode +from azure.ai.ml.dsl._pipeline_component_builder import PipelineComponentBuilder, _is_inside_dsl_pipeline_func +from azure.ai.ml.dsl._pipeline_decorator import _validate_args +from azure.ai.ml.dsl._settings import _dsl_settings_stack +from azure.ai.ml.dsl._utils import _resolve_source_file + +SUPPORTED_INPUT_TYPES = ( + PipelineInput, + NodeOutput, + Input, + Model, + Data, # For the case use a Data object as an input, we will convert it to Input object + Pipeline, # For the case use a pipeline node as the input, we use its only one output as the real input. + str, + bool, + int, + float, + PipelineExpression, + _GroupAttrDict, +) +module_logger = logging.getLogger(__name__) + +T = TypeVar("T") +P = ParamSpec("P") + + +# Overload the returns a decorator when func is None +@overload +def pipeline( # type: ignore[misc] + # TODO: Bug 2876412 + func: None = None, + *, + name: Optional[str] = None, + version: Optional[str] = None, + display_name: Optional[str] = None, + description: Optional[str] = None, + experiment_name: Optional[str] = None, + tags: Optional[Dict[str, str]] = None, + **kwargs, +) -> Callable[[Callable[P, T]], Callable[P, PipelineJob]]: + ... + + +# Overload the returns a decorated function when func isn't None +@overload +def pipeline( + func: Optional[Callable[P, T]] = None, + *, + name: Optional[str] = None, + version: Optional[str] = None, + display_name: Optional[str] = None, + description: Optional[str] = None, + experiment_name: Optional[str] = None, + tags: Optional[Dict[str, str]] = None, + **kwargs, +) -> Callable[P, PipelineJob]: + ... + + +def pipeline( + func: Optional[Callable[P, T]] = None, + *, + name: Optional[str] = None, + version: Optional[str] = None, + display_name: Optional[str] = None, + description: Optional[str] = None, + experiment_name: Optional[str] = None, + tags: Optional[Dict[str, str]] = None, + **kwargs, +) -> Union[Callable[[Callable[P, T]], Callable[P, PipelineJob]], Callable[P, PipelineJob]]: + """Build a pipeline which contains all component nodes defined in this function. + + :param func: The user pipeline function to be decorated. + :type func: types.FunctionType + :keyword name: The name of pipeline component, defaults to function name. + :paramtype name: str + :keyword version: The version of pipeline component, defaults to "1". + :paramtype version: str + :keyword display_name: The display name of pipeline component, defaults to function name. 
+ :paramtype display_name: str + :keyword description: The description of the built pipeline. + :paramtype description: str + :keyword experiment_name: Name of the experiment the job will be created under, \ + if None is provided, experiment will be set to current directory. + :paramtype experiment_name: str + :keyword tags: The tags of pipeline component. + :paramtype tags: dict[str, str] + :keyword kwargs: A dictionary of additional configuration parameters. + :paramtype kwargs: dict + + .. admonition:: Example: + + .. literalinclude:: ../../../../samples/ml_samples_pipeline_job_configurations.py + :start-after: [START configure_pipeline] + :end-before: [END configure_pipeline] + :language: python + :dedent: 8 + :caption: Shows how to create a pipeline using this decorator. + :return: Either + * A decorator, if `func` is None + * The decorated `func` + :rtype: Union[ + Callable[[Callable], Callable[..., PipelineJob]], + Callable[P, PipelineJob] + ] + """ + get_component = kwargs.get("get_component", False) + + def pipeline_decorator(func: Callable[P, T]) -> Callable[P, PipelineJob]: + # pylint: disable=isinstance-second-argument-not-valid-type + if not isinstance(func, Callable): # type: ignore + raise UserErrorException(f"Dsl pipeline decorator accept only function type, got {type(func)}.") + + non_pipeline_inputs = kwargs.get("non_pipeline_inputs", []) or kwargs.get("non_pipeline_parameters", []) + # compute variable names changed from default_compute_targe -> compute -> default_compute -> none + # to support legacy usage, we support them with priority. + compute = kwargs.get("compute", None) + default_compute_target = kwargs.get("default_compute_target", None) + default_compute_target = kwargs.get("default_compute", None) or default_compute_target + continue_on_step_failure = kwargs.get("continue_on_step_failure", None) + on_init = kwargs.get("on_init", None) + on_finalize = kwargs.get("on_finalize", None) + + default_datastore = kwargs.get("default_datastore", None) + force_rerun = kwargs.get("force_rerun", None) + job_settings = { + "default_datastore": default_datastore, + "continue_on_step_failure": continue_on_step_failure, + "force_rerun": force_rerun, + "default_compute": default_compute_target, + "on_init": on_init, + "on_finalize": on_finalize, + } + func_entry_path = _resolve_source_file() + if not func_entry_path: + func_path = Path(inspect.getfile(func)) + # in notebook, func_path may be a fake path and will raise error when trying to resolve this fake path + if func_path.exists(): + func_entry_path = func_path.resolve().absolute() + + job_settings = {k: v for k, v in job_settings.items() if v is not None} + pipeline_builder = PipelineComponentBuilder( + func=func, + name=name, + version=version, + display_name=display_name, + description=description, + default_datastore=default_datastore, + tags=tags, + source_path=str(func_entry_path), + non_pipeline_inputs=non_pipeline_inputs, + ) + + @wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> PipelineJob: + # Default args will be added here. 
+            # pylint: disable=abstract-class-instantiated
+            # Note: push/pop stack here instead of putting it inside build()
+            # because we only want to enable dsl settings on top level pipeline
+            _dsl_settings_stack.push()  # use this stack to track on_init/on_finalize settings
+            try:
+                # Convert args to kwargs
+                provided_positional_kwargs = _validate_args(func, args, kwargs, non_pipeline_inputs)
+
+                # When pipeline supports variable params, update pipeline component to support the inputs in **kwargs.
+                pipeline_parameters = {
+                    k: v for k, v in provided_positional_kwargs.items() if k not in non_pipeline_inputs
+                }
+                pipeline_builder._update_inputs(pipeline_parameters)
+
+                non_pipeline_params_dict = {
+                    k: v for k, v in provided_positional_kwargs.items() if k in non_pipeline_inputs
+                }
+
+                # TODO: cache built pipeline component
+                pipeline_component = pipeline_builder.build(
+                    user_provided_kwargs=provided_positional_kwargs,
+                    non_pipeline_inputs_dict=non_pipeline_params_dict,
+                    non_pipeline_inputs=non_pipeline_inputs,
+                )
+            finally:
+                # use `finally` to ensure pop operation from the stack
+                dsl_settings = _dsl_settings_stack.pop()
+
+            # update on_init/on_finalize settings if init/finalize job is set
+            if dsl_settings.init_job_set:
+                job_settings["on_init"] = dsl_settings.init_job_name(pipeline_component.jobs)
+            if dsl_settings.finalize_job_set:
+                job_settings["on_finalize"] = dsl_settings.finalize_job_name(pipeline_component.jobs)
+
+            # TODO: pass compute & default_compute separately?
+            common_init_args = {
+                "experiment_name": experiment_name,
+                "component": pipeline_component,
+                "inputs": pipeline_parameters,
+                "tags": tags,
+            }
+            if _is_inside_dsl_pipeline_func() or get_component:
+                # on_init/on_finalize is not supported for pipeline component
+                if job_settings.get("on_init") is not None or job_settings.get("on_finalize") is not None:
+                    raise UserErrorException("On_init/on_finalize is not supported for pipeline component.")
+                # Build pipeline node instead of pipeline job if inside dsl.
+                built_pipeline = Pipeline(_from_component_func=True, **common_init_args)
+                if job_settings:
+                    module_logger.warning(
+                        ("Job settings %s on pipeline function %r are ignored when using inside PipelineJob."),
+                        job_settings,
+                        func.__name__,
+                    )
+            else:
+                built_pipeline = PipelineJob(
+                    jobs=pipeline_component.jobs,
+                    compute=compute,
+                    settings=PipelineJobSettings(**job_settings),
+                    **common_init_args,
+                )
+
+            return built_pipeline
+
+        wrapper._is_dsl_func = True  # type: ignore[attr-defined]
+        wrapper._job_settings = job_settings  # type: ignore[attr-defined]
+        wrapper._pipeline_builder = pipeline_builder  # type: ignore[attr-defined]
+        return wrapper
+
+    # enable use decorator without "()" if all arguments are default values
+    if func is not None:
+        return pipeline_decorator(func)
+    return pipeline_decorator
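The only delta from the upstream `azure.ai.ml.dsl.pipeline` decorator is the private `get_component` kwarg, which makes the wrapper return a reusable `Pipeline` node instead of a submittable `PipelineJob`. A hedged sketch of that behavior (the command step, environment URI, and paths are illustrative placeholders, not from this PR):

```python
from azure.ai.ml import Input, command
from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline

# A stand-in step so the pipeline has a node to build.
echo = command(
    command="echo ${{inputs.data}}",
    inputs={"data": Input(type="uri_folder")},
    environment="azureml://registries/azureml/environments/sklearn-1.5/labels/latest",
)

@pipeline(name="toy_ingest", get_component=True)
def toy_ingest(data: Input):
    echo(data=data)

# With get_component=True the call yields a Pipeline *node* (as if invoked inside
# another dsl.pipeline); job-level settings are ignored on this code path.
node = toy_ingest(data=Input(type="uri_folder", path="./docs"))
```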
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/__init__.py
new file mode 100644
index 000000000000..87592ba0f0af
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/__init__.py
@@ -0,0 +1,20 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""DataIndex entities."""
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
+from azure.ai.generative.index._dataindex.entities._assets import Data
+from azure.ai.generative.index._dataindex.entities.data_index import CitationRegex, DataIndex, Embedding, IndexSource, IndexStore
+from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data
+
+__all__ = [
+    "DataIndex",
+    "IndexSource",
+    "Data",
+    "CitationRegex",
+    "Embedding",
+    "IndexStore",
+    "index_data",
+]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/__init__.py
new file mode 100644
index 000000000000..e016d3136023
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/__init__.py
@@ -0,0 +1,11 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
+from azure.ai.generative.index._dataindex.entities._assets._artifacts import Data
+
+__all__ = [
+    "Data",
+]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/__init__.py
new file mode 100644
index 000000000000..fa5f7425e90f
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/__init__.py
@@ -0,0 +1,11 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
+
+from azure.ai.generative.index._dataindex.entities._assets._artifacts.data import Data
+
+__all__ = [
+    "Data",
+]
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/data.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/data.py
new file mode 100644
index 000000000000..25f83ac660e1
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_assets/_artifacts/data.py
@@ -0,0 +1,23 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+from azure.ai.ml.entities._assets._artifacts.data import Data
+
+
+@classmethod  # type: ignore[misc]
+# TODO: Bug 2874139
+def _resolve_cls_and_type(cls, data, params_override):
+    from azure.ai.ml.entities._data_import.data_import import DataImport
+    from azure.ai.generative.index._dataindex.entities.data_index import DataIndex
+
+    if "index" in data:
+        return DataIndex, None
+
+    if "source" in data:
+        return DataImport, None
+    return cls, None
+
+
+# Override the _resolve_cls_and_type function in the Data class to support serialization of DataIndex
+Data._resolve_cls_and_type = _resolve_cls_and_type
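A sketch of the dispatch this monkey-patch enables: when a Data-shaped dict is loaded, the class to instantiate is chosen by which keys are present (the dict literals are illustrative):

```python
from azure.ai.ml.entities._assets._artifacts.data import Data

# Importing the module applies the override above.
import azure.ai.generative.index._dataindex.entities._assets._artifacts.data  # noqa: F401

cls, _ = Data._resolve_cls_and_type({"index": {}}, params_override=None)       # -> DataIndex
cls, _ = Data._resolve_cls_and_type({"source": {}}, params_override=None)      # -> DataImport
cls, _ = Data._resolve_cls_and_type({"path": "./docs"}, params_override=None)  # -> Data
```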
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/__init__.py
new file mode 100644
index 000000000000..624f5ee88ecf
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/__init__.py
@@ -0,0 +1,6 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py
new file mode 100644
index 000000000000..edfa2f18b737
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py
@@ -0,0 +1,773 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+# pylint: disable=protected-access
+# pylint: disable=no-member
+# pylint: disable=unused-argument
+
+import json
+import re
+from typing import Any, Callable, Dict, Optional, Tuple, Union
+
+from azure.ai.ml._utils._experimental import experimental
+from azure.ai.ml.constants._common import AssetTypes, LegacyAssetTypes
+from azure.ai.ml.entities import PipelineJob
+from azure.ai.ml.entities._builders.base_node import pipeline_node_decorator
+from azure.ai.ml.entities._credentials import ManagedIdentityConfiguration, UserIdentityConfiguration
+from azure.ai.ml.entities._inputs_outputs import Input, Output
+from azure.ai.ml.entities._job.pipeline._component_translatable import ComponentTranslatableMixin
+from azure.ai.ml.entities._job.pipeline._io import NodeOutput, PipelineInput
+from azure.ai.ml.entities._workspace.connections.workspace_connection import WorkspaceConnection
+from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationErrorType, ValidationException
+from azure.ai.generative.index._dataindex._schema._data_index import DataIndexTypes
+from azure.ai.generative.index._dataindex.constants._component import LLMRAGComponentUri
+from azure.ai.generative.index._dataindex.entities.data_index import DataIndex
+
+SUPPORTED_INPUTS = [
+    LegacyAssetTypes.PATH,
+    AssetTypes.URI_FILE,
+    AssetTypes.URI_FOLDER,
+    AssetTypes.MLTABLE,
+]
+
+
+def _parse_input(input_value):
+    component_input, job_input = None, None
+    if isinstance(input_value, Input):
+        component_input = Input(**input_value._to_dict())
+        input_type = input_value.type
+        if input_type in SUPPORTED_INPUTS:
+            job_input = Input(**input_value._to_dict())
+    elif isinstance(input_value, dict):
+        # if user provided dict, we try to parse it to Input.
+        # for job input, only parse for path type
+        input_type = input_value.get("type", None)
+        if input_type in SUPPORTED_INPUTS:
+            job_input = Input(**input_value)
+        component_input = Input(**input_value)
+    elif isinstance(input_value, str):
+        # Input bindings
+        component_input = ComponentTranslatableMixin._to_input_builder_function(input_value)
+        job_input = input_value
+    elif isinstance(input_value, (PipelineInput, NodeOutput)):
+        # datatransfer node can accept PipelineInput/NodeOutput for export task.
+        if input_value._data is None or isinstance(input_value._data, Output):
+            data = Input(type=input_value.type, mode=input_value.mode)
+        else:
+            data = input_value._data
+        component_input, _ = _parse_input(data)
+        job_input = input_value
+    else:
+        msg = (
+            f"Unsupported input type: {type(input_value)}, only Input, dict, str, PipelineInput and NodeOutput are "
+            f"supported."
+        )
+        raise ValidationException(
+            message=msg,
+            no_personal_data_message=msg,
+            target=ErrorTarget.JOB,
+            error_type=ValidationErrorType.INVALID_VALUE,
+        )
+    return component_input, job_input
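A hedged sketch of the contract above: every user-supplied input is split into a component-side input and a job-side input (the path and binding string are hypothetical):

```python
from azure.ai.ml.entities._inputs_outputs import Input

# An Input of a supported asset type yields equivalent component and job inputs.
component_input, job_input = _parse_input(Input(type="uri_folder", path="./docs"))

# A string is treated as a binding: the component side becomes an Input builder,
# while the job side keeps the raw binding string.
component_input, job_input = _parse_input("${{parent.inputs.source_data}}")
```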
+
+
+def _parse_output(output_value):
+    component_output, job_output = None, None
+    if isinstance(output_value, Output):
+        component_output = Output(**output_value._to_dict())
+        job_output = Output(**output_value._to_dict())
+    elif not output_value:
+        # output value can be None or empty dictionary
+        # None output value will be packed into a JobOutput object with mode = ReadWriteMount & type = UriFolder
+        component_output = ComponentTranslatableMixin._to_output(output_value)
+        job_output = output_value
+    elif isinstance(output_value, dict):  # When output value is a non-empty dictionary
+        job_output = Output(**output_value)
+        component_output = Output(**output_value)
+    elif isinstance(output_value, str):  # When output is passed in from pipeline job yaml
+        job_output = output_value
+    else:
+        msg = f"Unsupported output type: {type(output_value)}, only Output and dict are supported."
+        raise ValidationException(
+            message=msg,
+            no_personal_data_message=msg,
+            target=ErrorTarget.JOB,
+            error_type=ValidationErrorType.INVALID_VALUE,
+        )
+    return component_output, job_output
+
+
+def _parse_inputs_outputs(io_dict: Dict, parse_func: Callable) -> Tuple[Dict, Dict]:
+    component_io_dict, job_io_dict = {}, {}
+    if io_dict:
+        for key, val in io_dict.items():
+            component_io, job_io = parse_func(val)
+            component_io_dict[key] = component_io
+            job_io_dict[key] = job_io
+    return component_io_dict, job_io_dict
+
+
+def _build_data_index(io_dict: Union[Dict, DataIndex]):
+    if io_dict is None:
+        return io_dict
+    if isinstance(io_dict, DataIndex):
+        component_io = io_dict
+    else:
+        if isinstance(io_dict, dict):
+            component_io = DataIndex(**io_dict)
+        else:
+            msg = "data_index only supports dict and DataIndex"
+            raise ValidationException(
+                message=msg,
+                no_personal_data_message=msg,
+                target=ErrorTarget.DATA,
+                error_category=ErrorCategory.USER_ERROR,
+                error_type=ValidationErrorType.INVALID_VALUE,
+            )
+
+    return component_io
+
+
+@experimental
+@pipeline_node_decorator
+def index_data(
+    *,
+    data_index: DataIndex,
+    description: Optional[str] = None,
+    tags: Optional[Dict] = None,
+    name: Optional[str] = None,
+    display_name: Optional[str] = None,
+    experiment_name: Optional[str] = None,
+    compute: Optional[str] = None,
+    serverless_instance_type: Optional[str] = None,
+    ml_client: Optional[Any] = None,
+    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
+    input_data_override: Optional[Input] = None,
+    **kwargs,
+) -> PipelineJob:
+    """
+    Create a PipelineJob object which can be used inside dsl.pipeline.
+
+    :keyword data_index: The data index configuration.
+    :type data_index: DataIndex
+    :keyword description: Description of the job.
+    :type description: str
+    :keyword tags: Tag dictionary. Tags can be added, removed, and updated.
+    :type tags: dict[str, str]
+    :keyword name: Name of the job.
+    :type name: str
+    :keyword display_name: Display name of the job.
+    :type display_name: str
+    :keyword experiment_name: Name of the experiment the job will be created under.
+    :type experiment_name: str
+    :keyword compute: The compute resource the job runs on.
+    :type compute: str
+    :keyword serverless_instance_type: The instance type to use for serverless compute.
+    :type serverless_instance_type: Optional[str]
+    :keyword ml_client: The ml client to use for the job.
+    :type ml_client: Any
+    :keyword identity: Identity configuration for the job.
+    :type identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]]
+    :keyword input_data_override: Input data override for the job.
+        Used to pipe output of step into DataIndex Job in a pipeline.
+    :type input_data_override: Optional[Input]
+    :return: A PipelineJob object.
+    :rtype: ~azure.ai.ml.entities.PipelineJob.
+    """
+    data_index = _build_data_index(data_index)
+
+    if data_index.index.type == DataIndexTypes.FAISS:
+        configured_component = data_index_faiss(
+            ml_client,
+            data_index,
+            description,
+            tags,
+            name,
+            display_name,
+            experiment_name,
+            compute,
+            serverless_instance_type,
+            identity,
+            input_data_override,
+        )
+    elif data_index.index.type == DataIndexTypes.ACS:
+        if kwargs.get("incremental_update", False):
+            configured_component = data_index_incremental_update_acs(
+                ml_client,
+                data_index,
+                description,
+                tags,
+                name,
+                display_name,
+                experiment_name,
+                compute,
+                serverless_instance_type,
+                identity,
+                input_data_override,
+            )
+        else:
+            configured_component = data_index_acs(
+                ml_client,
+                data_index,
+                description,
+                tags,
+                name,
+                display_name,
+                experiment_name,
+                compute,
+                serverless_instance_type,
+                identity,
+                input_data_override,
+            )
+    else:
+        raise ValueError(f"Unsupported index type: {data_index.index.type}")
+
+    configured_component.properties["azureml.mlIndexAssetName"] = data_index.name
+    configured_component.properties["azureml.mlIndexAssetKind"] = data_index.index.type
+    configured_component.properties["azureml.mlIndexAssetSource"] = "Data Asset"
+
+    return configured_component
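Usage sketch for the builder above. Workspace config, connection names, and paths are placeholders, and the entity keyword arguments follow the schemas earlier in this PR rather than a documented public API:

```python
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.generative.index._dataindex.entities.data_index import (
    DataIndex, Embedding, IndexSource, IndexStore,
)
from azure.ai.ml.entities import Data
from azure.ai.generative.index._dataindex.entities._builders.data_index_func import index_data

ml_client = MLClient.from_config(credential=DefaultAzureCredential())

data_index = DataIndex(
    name="product-docs-index",
    source=IndexSource(
        input_data=Data(type="uri_folder", path="./docs"),
        input_glob="**/*.md",
    ),
    embedding=Embedding(
        model="azure_open_ai://deployment/text-embedding-ada-002/model/text-embedding-ada-002",
        connection="azureml:my-aoai-connection",
    ),
    index=IndexStore(type="acs", connection="azureml:my-acs-connection"),
)

# index_data returns a PipelineJob wired to the llm_rag_* registry components.
job = index_data(data_index=data_index, ml_client=ml_client)
ml_client.jobs.create_or_update(job)
```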
+
+
+def data_index_incremental_update_acs(
+    ml_client: Any,
+    data_index: DataIndex,
+    description: Optional[str] = None,
+    tags: Optional[Dict] = None,
+    name: Optional[str] = None,
+    display_name: Optional[str] = None,
+    experiment_name: Optional[str] = None,
+    compute: Optional[str] = None,
+    serverless_instance_type: Optional[str] = None,
+    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
+    input_data_override: Optional[Input] = None,
+):
+    from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol
+    from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline
+
+    crack_and_chunk_and_embed_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CRACK_AND_CHUNK_AND_EMBED)
+    update_acs_index_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_UPDATE_ACS_INDEX)
+    register_mlindex_asset_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_REGISTER_MLINDEX_ASSET)
+
+    @pipeline(
+        name=name if name else "data_index_incremental_update_acs",
+        description=description,
+        tags=tags,
+        display_name=display_name if display_name else "LLM - Data to ACS (Incremental Update)",
+        experiment_name=experiment_name,
+        compute=compute,
+        get_component=True,
+    )
+    def data_index_acs_pipeline(
+        input_data: Input,
+        embeddings_model: str,
+        acs_config: str,
+        acs_connection_id: str,
+        aoai_connection_id: str,
+        embeddings_container: Input,
+        chunk_size: int = 768,
+        chunk_overlap: Optional[int] = 0,
+        input_glob: Optional[str] = "**/*",
+        citation_url: Optional[str] = None,
+        citation_replacement_regex: Optional[str] = None,
+    ):
+        """
+        Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
+
+        :param input_data: The input data to be indexed.
+ :type input_data: Input + :param embeddings_model: The embedding model to use when processing source data chunks. + :type embeddings_model: str + :param acs_config: The configuration for the Azure Cognitive Search index. + :type acs_config: str + :param acs_connection_id: The connection ID for the Azure Cognitive Search index. + :type acs_connection_id: str + :param chunk_size: The size of the chunks to break the input data into. + :type chunk_size: int + :param chunk_overlap: The number of tokens to overlap between chunks. + :type chunk_overlap: Optional[int] + :param input_glob: The glob pattern to use when searching for input data. + :type input_glob: Optional[str] + :param citation_url: The URL to use when generating citations for the input data. + :type citation_url: str + :param citation_replacement_regex: The regex to use when generating citations for the input data. + :type citation_replacement_regex: str + :param aoai_connection_id: The connection ID for the Azure Open AI service. + :type aoai_connection_id: str + :param embeddings_container: The container to use when caching embeddings. + :type embeddings_container: Input + :return: The URI of the generated Azure Cognitive Search index. + :rtype: str. + """ + if input_glob is None: + input_glob = "**/*" + if chunk_overlap is None: + chunk_overlap = 0 + + crack_and_chunk_and_embed = crack_and_chunk_and_embed_component( + input_data=input_data, + input_glob=input_glob, + chunk_size=chunk_size, + chunk_overlap=chunk_overlap, + citation_url=citation_url, + citation_replacement_regex=citation_replacement_regex, + embeddings_container=embeddings_container, + embeddings_model=embeddings_model, + embeddings_connection_id=aoai_connection_id, + ) + if compute is None or compute == "serverless": + use_automatic_compute(crack_and_chunk_and_embed, instance_type=serverless_instance_type) + if optional_pipeline_input_provided(embeddings_container): + crack_and_chunk_and_embed.outputs.embeddings = Output( + type="uri_folder", path=f"{embeddings_container.path}/{{name}}" + ) + if identity: + crack_and_chunk_and_embed.identity = identity + + update_acs_index = update_acs_index_component( + embeddings=crack_and_chunk_and_embed.outputs.embeddings, acs_config=acs_config + ) + if compute is None or compute == "serverless": + use_automatic_compute(update_acs_index, instance_type=serverless_instance_type) + update_acs_index.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_ACS"] = acs_connection_id + if identity: + update_acs_index.identity = identity + + register_mlindex_asset = register_mlindex_asset_component( + storage_uri=update_acs_index.outputs.index, + asset_name=data_index.name, + ) + if compute is None or compute == "serverless": + use_automatic_compute(register_mlindex_asset, instance_type=serverless_instance_type) + if identity: + register_mlindex_asset.identity = identity + return { + "mlindex_asset_uri": update_acs_index.outputs.index, + "mlindex_asset_id": register_mlindex_asset.outputs.asset_id, + } + + if input_data_override is not None: + input_data = input_data_override + else: + input_data = Input(type=data_index.source.input_data.type, path=data_index.source.input_data.path) + + acs_config = { + "index_name": data_index.index.name if data_index.index.name is not None else data_index.name, + "full_sync": True, + } + if data_index.index.config is not None: + acs_config.update(data_index.index.config) + + component = data_index_acs_pipeline( + input_data=input_data, + input_glob=data_index.source.input_glob, + 
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type] + chunk_overlap=data_index.source.chunk_overlap, + citation_url=data_index.source.citation_url, + citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) + if data_index.source.citation_url_replacement_regex + else None, + embeddings_model=build_model_protocol(data_index.embedding.model), + aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection), + embeddings_container=Input(type=AssetTypes.URI_FOLDER, path=data_index.embedding.cache_path) if data_index.embedding.cache_path else None, + acs_config=json.dumps(acs_config), + acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection), + ) + # Hack until full Component classes are implemented that can annotate the optional parameters properly + component.inputs["input_glob"]._meta.optional = True + component.inputs["chunk_size"]._meta.optional = True + component.inputs["chunk_overlap"]._meta.optional = True + component.inputs["citation_url"]._meta.optional = True + component.inputs["citation_replacement_regex"]._meta.optional = True + component.inputs["aoai_connection_id"]._meta.optional = True + component.inputs["embeddings_container"]._meta.optional = True + + if data_index.path: + component.outputs.mlindex_asset_uri = Output(type=AssetTypes.URI_FOLDER, path=data_index.path) + + return component + + +def data_index_faiss( + ml_client: Any, + data_index: DataIndex, + description: Optional[str] = None, + tags: Optional[Dict] = None, + name: Optional[str] = None, + display_name: Optional[str] = None, + experiment_name: Optional[str] = None, + compute: Optional[str] = None, + serverless_instance_type: Optional[str] = None, + identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None, + input_data_override: Optional[Input] = None, +): + from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol + from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline + + crack_and_chunk_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CRACK_AND_CHUNK) + generate_embeddings_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_GENERATE_EMBEDDINGS) + create_faiss_index_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CREATE_FAISS_INDEX) + register_mlindex_asset_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_REGISTER_MLINDEX_ASSET) + + @pipeline( + name=name if name else "data_index_faiss", + description=description, + tags=tags, + display_name=display_name if display_name else "LLM - Data to Faiss", + experiment_name=experiment_name, + compute=compute, + get_component=True, + ) + def data_index_faiss_pipeline( + input_data: Input, + embeddings_model: str, + embeddings_container: Input, + chunk_size: int = 1024, + data_source_glob: str = None, # type: ignore[assignment] + data_source_url: str = None, # type: ignore[assignment] + document_path_replacement_regex: str = None, # type: ignore[assignment] + aoai_connection_id: str = None, # type: ignore[assignment] + ): + """ + Generate embeddings for a `input_data` source and create a Faiss index from them. + + :param input_data: The input data to be indexed. + :type input_data: Input + :param embeddings_model: The embedding model to use when processing source data chunks. + :type embeddings_model: str + :param chunk_size: The size of the chunks to break the input data into. 
+ :type chunk_size: Optional[int] + :param data_source_glob: The glob pattern to use when searching for input data. + :type data_source_glob: str + :param data_source_url: The URL to use when generating citations for the input data. + :type data_source_url: str + :param document_path_replacement_regex: The regex to use when generating citations for the input data. + :type document_path_replacement_regex: str + :param aoai_connection_id: The connection ID for the Azure Open AI service. + :type aoai_connection_id: str + :param embeddings_container: The container to use when caching embeddings. + :type embeddings_container: Input + :return: The URI of the generated Faiss index. + :rtype: str. + """ + if chunk_size is None: + chunk_size = 1024 + + crack_and_chunk = crack_and_chunk_component( + input_data=input_data, + input_glob=data_source_glob, + chunk_size=chunk_size, + data_source_url=data_source_url, + document_path_replacement_regex=document_path_replacement_regex, + ) + if compute is None or compute == "serverless": + use_automatic_compute(crack_and_chunk, instance_type=serverless_instance_type) + if identity: + crack_and_chunk.identity = identity + + generate_embeddings = generate_embeddings_component( + chunks_source=crack_and_chunk.outputs.output_chunks, + embeddings_container=embeddings_container, + embeddings_model=embeddings_model, + ) + if compute is None or compute == "serverless": + use_automatic_compute(generate_embeddings, instance_type=serverless_instance_type) + if optional_pipeline_input_provided(aoai_connection_id): + generate_embeddings.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_AOAI"] = aoai_connection_id + if optional_pipeline_input_provided(embeddings_container): + generate_embeddings.outputs.embeddings = Output( + type="uri_folder", path=f"{embeddings_container.path}/{{name}}" + ) + if identity: + generate_embeddings.identity = identity + + create_faiss_index = create_faiss_index_component(embeddings=generate_embeddings.outputs.embeddings) + if compute is None or compute == "serverless": + use_automatic_compute(create_faiss_index, instance_type=serverless_instance_type) + if identity: + create_faiss_index.identity = identity + + register_mlindex_asset = register_mlindex_asset_component( + storage_uri=create_faiss_index.outputs.index, + asset_name=data_index.name, + ) + if compute is None or compute == "serverless": + use_automatic_compute(register_mlindex_asset, instance_type=serverless_instance_type) + if identity: + register_mlindex_asset.identity = identity + return { + "mlindex_asset_uri": create_faiss_index.outputs.index, + "mlindex_asset_id": register_mlindex_asset.outputs.asset_id, + } + + if input_data_override is not None: + input_data = input_data_override + else: + input_data = Input(type=data_index.source.input_data.type, path=data_index.source.input_data.path) + + component = data_index_faiss_pipeline( + input_data=input_data, + embeddings_model=build_model_protocol(data_index.embedding.model), + chunk_size=data_index.source.chunk_size, # type: ignore[arg-type] + data_source_glob=data_index.source.input_glob, # type: ignore[arg-type] + data_source_url=data_index.source.citation_url, # type: ignore[arg-type] + document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type] + if data_index.source.citation_url_replacement_regex + else None, + aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection), + 
embeddings_container=Input(type=AssetTypes.URI_FOLDER, path=data_index.embedding.cache_path) if data_index.embedding.cache_path else None, + ) + # Hack until full Component classes are implemented that can annotate the optional parameters properly + component.inputs["data_source_glob"]._meta.optional = True + component.inputs["data_source_url"]._meta.optional = True + component.inputs["document_path_replacement_regex"]._meta.optional = True + component.inputs["aoai_connection_id"]._meta.optional = True + component.inputs["embeddings_container"]._meta.optional = True + if data_index.path: + component.outputs.mlindex_asset_uri = Output(type=AssetTypes.URI_FOLDER, path=data_index.path) + + return component + + +def data_index_acs( + ml_client: Any, + data_index: DataIndex, + description: Optional[str] = None, + tags: Optional[Dict] = None, + name: Optional[str] = None, + display_name: Optional[str] = None, + experiment_name: Optional[str] = None, + compute: Optional[str] = None, + serverless_instance_type: Optional[str] = None, + identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None, + input_data_override: Optional[Input] = None, +): + from azure.ai.generative.index._dataindex.data_index.models import build_model_protocol + from azure.ai.generative.index._dataindex.dsl._pipeline_decorator import pipeline + + crack_and_chunk_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_CRACK_AND_CHUNK) + generate_embeddings_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_GENERATE_EMBEDDINGS) + update_acs_index_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_UPDATE_ACS_INDEX) + register_mlindex_asset_component = get_component_obj(ml_client, LLMRAGComponentUri.LLM_RAG_REGISTER_MLINDEX_ASSET) + + @pipeline( + name=name if name else "data_index_acs", + description=description, + tags=tags, + display_name=display_name if display_name else "LLM - Data to ACS", + experiment_name=experiment_name, + compute=compute, + get_component=True, + ) + def data_index_acs_pipeline( + input_data: Input, + embeddings_model: str, + acs_config: str, + acs_connection_id: str, + embeddings_container: Input, + chunk_size: int = 1024, + data_source_glob: str = None, # type: ignore[assignment] + data_source_url: str = None, # type: ignore[assignment] + document_path_replacement_regex: str = None, # type: ignore[assignment] + aoai_connection_id: str = None, # type: ignore[assignment] + ): + """ + Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index. + + :param input_data: The input data to be indexed. + :type input_data: Input + :param embeddings_model: The embedding model to use when processing source data chunks. + :type embeddings_model: str + :param acs_config: The configuration for the Azure Cognitive Search index. + :type acs_config: str + :param acs_connection_id: The connection ID for the Azure Cognitive Search index. + :type acs_connection_id: str + :param chunk_size: The size of the chunks to break the input data into. + :type chunk_size: Optional[int] + :param data_source_glob: The glob pattern to use when searching for input data. + :type data_source_glob: str + :param data_source_url: The URL to use when generating citations for the input data. + :type data_source_url: str + :param document_path_replacement_regex: The regex to use when generating citations for the input data. 
+ :type document_path_replacement_regex: str + :param aoai_connection_id: The connection ID for the Azure Open AI service. + :type aoai_connection_id: str + :param embeddings_container: The container to use when caching embeddings. + :type embeddings_container: Input + :return: The URI of the generated Azure Cognitive Search index. + :rtype: str. + """ + if chunk_size is None: + chunk_size = 1024 + + crack_and_chunk = crack_and_chunk_component( + input_data=input_data, + input_glob=data_source_glob, + chunk_size=chunk_size, + data_source_url=data_source_url, + document_path_replacement_regex=document_path_replacement_regex, + ) + if compute is None or compute == "serverless": + use_automatic_compute(crack_and_chunk, instance_type=serverless_instance_type) + if identity: + crack_and_chunk.identity = identity + + generate_embeddings = generate_embeddings_component( + chunks_source=crack_and_chunk.outputs.output_chunks, + embeddings_container=embeddings_container, + embeddings_model=embeddings_model, + ) + if compute is None or compute == "serverless": + use_automatic_compute(generate_embeddings, instance_type=serverless_instance_type) + if optional_pipeline_input_provided(aoai_connection_id): + generate_embeddings.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_AOAI"] = aoai_connection_id + if optional_pipeline_input_provided(embeddings_container): + generate_embeddings.outputs.embeddings = Output( + type="uri_folder", path=f"{embeddings_container.path}/{{name}}" + ) + if identity: + generate_embeddings.identity = identity + + update_acs_index = update_acs_index_component( + embeddings=generate_embeddings.outputs.embeddings, acs_config=acs_config + ) + if compute is None or compute == "serverless": + use_automatic_compute(update_acs_index, instance_type=serverless_instance_type) + update_acs_index.environment_variables["AZUREML_WORKSPACE_CONNECTION_ID_ACS"] = acs_connection_id + if identity: + update_acs_index.identity = identity + + register_mlindex_asset = register_mlindex_asset_component( + storage_uri=update_acs_index.outputs.index, + asset_name=data_index.name, + ) + if compute is None or compute == "serverless": + use_automatic_compute(register_mlindex_asset, instance_type=serverless_instance_type) + if identity: + register_mlindex_asset.identity = identity + return { + "mlindex_asset_uri": update_acs_index.outputs.index, + "mlindex_asset_id": register_mlindex_asset.outputs.asset_id, + } + + if input_data_override is not None: + input_data = input_data_override + else: + input_data = Input(type=data_index.source.input_data.type, path=data_index.source.input_data.path) + + acs_config = { + "index_name": data_index.index.name if data_index.index.name is not None else data_index.name, + } + if data_index.index.config is not None: + acs_config.update(data_index.index.config) + + component = data_index_acs_pipeline( + input_data=input_data, + embeddings_model=build_model_protocol(data_index.embedding.model), + acs_config=json.dumps(acs_config), + acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection), + chunk_size=data_index.source.chunk_size, # type: ignore[arg-type] + data_source_glob=data_index.source.input_glob, # type: ignore[arg-type] + data_source_url=data_index.source.citation_url, # type: ignore[arg-type] + document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type] + if data_index.source.citation_url_replacement_regex + else None, + 
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
+        embeddings_container=Input(type=AssetTypes.URI_FOLDER, path=data_index.embedding.cache_path) if data_index.embedding.cache_path else None,
+    )
+    # Hack until full Component classes are implemented that can annotate the optional parameters properly
+    component.inputs["data_source_glob"]._meta.optional = True
+    component.inputs["data_source_url"]._meta.optional = True
+    component.inputs["document_path_replacement_regex"]._meta.optional = True
+    component.inputs["aoai_connection_id"]._meta.optional = True
+    component.inputs["embeddings_container"]._meta.optional = True
+
+    if data_index.path:
+        component.outputs.mlindex_asset_uri = Output(type=AssetTypes.URI_FOLDER, path=data_index.path)
+
+    return component
+
+
+def optional_pipeline_input_provided(input: Optional[PipelineInput]):
+    """
+    Checks if optional pipeline inputs are provided.
+
+    :param input: The pipeline input to check.
+    :type input: Optional[PipelineInput]
+    :return: True if the input is not None and has a value, False otherwise.
+    :rtype: bool
+    """
+    return input is not None and input._data is not None
+
+
+def use_automatic_compute(component, instance_count=1, instance_type=None):
+    """
+    Configure input `component` to use automatic compute with `instance_count` and `instance_type`.
+
+    This avoids the need to provision a compute cluster to run the component.
+    :param component: The component to configure.
+    :type component: Any
+    :param instance_count: The number of instances to use.
+    :type instance_count: int
+    :param instance_type: The type of instance to use.
+    :type instance_type: str
+    :return: The configured component.
+    :rtype: Any
+    """
+    component.set_resources(
+        instance_count=instance_count,
+        instance_type=instance_type,
+        properties={"compute_specification": {"automatic": True}},
+    )
+    return component
+
+
+def get_component_obj(ml_client, component_uri):
+    from azure.ai.ml import MLClient
+
+    if not isinstance(component_uri, str):
+        # Assume Component object
+        return component_uri
+
+    matches = re.match(
+        r"azureml://registries/(?P<registry_name>.*)/components/(?P<component_name>.*)"
+        r"/(?P<identifier_type>.*)/(?P<identifier_name>.*)",
+        component_uri,
+    )
+    if matches is None:
+        from azure.ai.ml import load_component
+
+        # Assume local path to component
+        return load_component(source=component_uri)
+
+    registry_name = matches.group("registry_name")
+    registry_client = MLClient(
+        subscription_id=ml_client.subscription_id,
+        resource_group_name=ml_client.resource_group_name,
+        credential=ml_client._credential,
+        registry_name=registry_name,
+    )
+    component_obj = registry_client.components.get(
+        matches.group("component_name"),
+        **{matches.group("identifier_type").rstrip("s"): matches.group("identifier_name")},
+    )
+    return component_obj
+
+
+def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
+    if connection is None:
+        return ""
+
+    if isinstance(connection, str):
+        short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection)
+        if short_form:
+            connection_name = short_form.group("connection_name")
+        else:
+            # TODO: Handle long form connection sub/rg/ws, ideally reuse logic implemented by connections code.
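+            # Illustrative long-form shape this regex accepts (assumed ARM-style resource ID):
+            # "/subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.MachineLearningServices/workspaces/<ws>/connections/<name>"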
+ long_form = re.match(r"(azureml:/)?/.*/connections/(?P[^/]*)", connection) + connection_name = long_form.group("connection_name") if long_form else connection + + connection = ml_client.connections.get(connection_name) + elif hasattr(connection, "_workspace_connection"): + # Handle azure.ai.generative Connections + connection = connection._workspace_connection + + return connection.id diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/data_index.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/data_index.py new file mode 100644 index 000000000000..3351c18af283 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/data_index.py @@ -0,0 +1,278 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""DataIndex entities.""" + +# pylint: disable=no-member + +from os import PathLike +from pathlib import Path +from typing import Dict, Optional, Union + +from azure.ai.generative.index._dataindex._schema._data_index import DataIndexTypes +from azure.ai.ml._utils._experimental import experimental +from azure.ai.ml.constants._common import BASE_PATH_CONTEXT_KEY, PARAMS_OVERRIDE_KEY +from azure.ai.ml.entities._assets import Data +from azure.ai.ml.entities._inputs_outputs.utils import _remove_empty_values +from azure.ai.ml.entities._mixins import DictMixin +from azure.ai.ml.entities._util import load_from_dict + + +@experimental +class CitationRegex(DictMixin): + """ + :keyword match_pattern: Regex to match citation in the citation_url + input file path. + e.g. '(.*)/articles/(.*)(\\.[^.]+)$'. + :type match_pattern: str + :keyword replacement_pattern: Replacement string for citation. e.g. '\\1/\\2'. + :type replacement_pattern: str + """ + + def __init__( + self, + *, + match_pattern: str, + replacement_pattern: str, + ) -> None: + """Initialize a CitationRegex object.""" + self.match_pattern = match_pattern + self.replacement_pattern = replacement_pattern + + def _to_dict(self) -> Dict: + """Convert the Source object to a dict. + :return: The dictionary representation of the class + :rtype: Dict + """ + keys = [ + "match_pattern", + "replacement_pattern", + ] + result = {key: getattr(self, key) for key in keys} + return _remove_empty_values(result) + + +@experimental +class IndexSource(DictMixin): + """Congifuration for the destination index to write processed data to. + :keyword input_data: Input Data to index files from. MLTable type inputs will use `mode: eval_mount`. + :type input_data: Data + :keyword input_glob: Connection reference to use for embedding model information, + only needed for hosted embeddings models (such as Azure OpenAI). + :type input_glob: str, optional + :keyword chunk_size: Maximum number of tokens to put in each chunk. + :type chunk_size: int, optional + :keyword chunk_overlap: Number of tokens to overlap between chunks. + :type chunk_overlap: int, optional + :keyword citation_url: Base URL to join with file paths to create full source file URL for chunk metadata. + :type citation_url: str, optional + :keyword citation_url_replacement_regex: Regex match and replacement patterns for citation url. Useful if the paths + in `input_data` don't match the desired citation format. 
+    :type citation_url_replacement_regex: CitationRegex, optional
+    :raises ~azure.ai.ml.exceptions.ValidationException: Raised if the IndexSource object cannot be validated.
+        Details will be provided in the error message.
+    """
+
+    def __init__(
+        self,
+        *,
+        input_data: Data,
+        input_glob: Optional[str] = None,
+        chunk_size: Optional[int] = None,
+        chunk_overlap: Optional[int] = None,
+        citation_url: Optional[str] = None,
+        citation_url_replacement_regex: Optional[CitationRegex] = None,
+    ) -> None:
+        """Initialize an IndexSource object."""
+        self.input_data = input_data
+        self.input_glob = input_glob
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.citation_url = citation_url
+        self.citation_url_replacement_regex = citation_url_replacement_regex
+
+    def _to_dict(self) -> Dict:
+        """Convert the IndexSource object to a dict.
+        :return: The dictionary representation of the class
+        :rtype: Dict
+        """
+        keys = [
+            "input_data",
+            "input_glob",
+            "chunk_size",
+            "chunk_overlap",
+            "citation_url",
+            "citation_url_replacement_regex",
+        ]
+        result = {key: getattr(self, key) for key in keys}
+        return _remove_empty_values(result)
+
+
+@experimental
+class Embedding(DictMixin):
+    """Configuration for the embedding model used to process the source data.
+    :keyword model: The model to use to embed data. E.g. 'hugging_face://model/sentence-transformers/all-mpnet-base-v2'
+        or 'azure_open_ai://deployment/{deployment_name}/model/{model_name}'
+    :type model: str
+    :keyword connection: Connection reference to use for embedding model information,
+        only needed for hosted embeddings models (such as Azure OpenAI).
+    :type connection: str, optional
+    :keyword cache_path: Folder containing previously generated embeddings.
+        Should be parent folder of the 'embeddings' output path used for this component.
+        Will compare input data to existing embeddings and only embed changed/new data, reusing existing chunks.
+    :type cache_path: str, optional
+    :raises ~azure.ai.ml.exceptions.ValidationException: Raised if the Embedding object cannot be validated.
+        Details will be provided in the error message.
+    """
+
+    def __init__(
+        self,
+        *,
+        model: str,
+        connection: Optional[str] = None,
+        cache_path: Optional[str] = None,
+    ) -> None:
+        """Initialize an Embedding object."""
+        self.model = model
+        self.connection = connection
+        self.cache_path = cache_path
+
+    def _to_dict(self) -> Dict:
+        """Convert the Embedding object to a dict.
+        :return: The dictionary representation of the class
+        :rtype: Dict
+        """
+        keys = [
+            "model",
+            "connection",
+            "cache_path",
+        ]
+        result = {key: getattr(self, key) for key in keys}
+        return _remove_empty_values(result)
+
+
+@experimental
+class IndexStore(DictMixin):
+    """Configuration for the destination index to write processed data to.
+    :keyword type: The type of index to write to. Currently supported types are 'acs' and 'faiss'.
+    :type type: str
+    :keyword name: Name of index to update/create, only needed for hosted indexes (such as Azure Cognitive Search).
+    :type name: str, optional
+    :keyword connection: Connection reference to use for index information,
+        only needed for hosted indexes (such as Azure Cognitive Search).
+    :type connection: str, optional
+    :keyword config: Configuration for the index. Primary use is to configure Azure Cognitive Search specific
+        settings, such as a custom `field_mapping` for known field types.
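+        For example (keys shown are illustrative, not a fixed schema):
+        config={"field_mapping": {"content": "content", "url": "sourcepage"}}.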
+    :type config: dict, optional
+    :raises ~azure.ai.ml.exceptions.ValidationException: Raised if the IndexStore object cannot be validated.
+        Details will be provided in the error message.
+    """
+
+    def __init__(
+        self,
+        *,
+        type: str = DataIndexTypes.FAISS,
+        name: Optional[str] = None,
+        connection: Optional[str] = None,
+        config: Optional[Dict] = None,
+    ) -> None:
+        """Initialize an IndexStore object."""
+        self.type = type
+        self.name = name
+        self.connection = connection
+        self.config = config
+
+    def _to_dict(self) -> Dict:
+        """Convert the IndexStore object to a dict.
+        :return: The dictionary representation of the class
+        :rtype: Dict
+        """
+        keys = [
+            "type",
+            "name",
+            "connection",
+            "config",
+        ]
+        result = {key: getattr(self, key) for key in keys}
+        return _remove_empty_values(result)
+
+
+@experimental
+class DataIndex(Data):
+    """Data asset created by a data index job.
+    :param name: Name of the asset.
+    :type name: str
+    :param path: The path to the asset being created by the data index job.
+    :type path: str
+    :param source: The source data to be indexed.
+    :type source: IndexSource
+    :param embedding: The embedding model to use when processing source data chunks.
+    :type embedding: Embedding
+    :param index: The destination index to write processed data to.
+    :type index: IndexStore
+    :param incremental_update: Whether to update the index incrementally or not.
+    :type incremental_update: bool
+    :param version: Version of the asset created by running this DataIndex Job.
+    :type version: str
+    :param description: Description of the resource.
+    :type description: str
+    :param tags: Tag dictionary. Tags can be added, removed, and updated.
+    :type tags: dict[str, str]
+    :param properties: The asset property dictionary.
+    :type properties: dict[str, str]
+    :param kwargs: A dictionary of additional configuration parameters.
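+        These are passed through to the base Data asset initializer.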
+ :type kwargs: dict + """ + + def __init__( + self, + *, + name: str, + source: IndexSource, + embedding: Embedding, + index: IndexStore, + incremental_update: bool = False, + path: Optional[str] = None, + version: Optional[str] = None, + description: Optional[str] = None, + tags: Optional[Dict] = None, + properties: Optional[Dict] = None, + **kwargs, + ) -> None: + """Initialize a DataIndex object.""" + super().__init__( + name=name, + version=version, + description=description, + tags=tags, + properties=properties, + path=path, + **kwargs, + ) + self.source = source + self.embedding = embedding + self.index = index + self.incremental_update = incremental_update + + @classmethod + def _load( + cls, + data: Optional[Dict] = None, + yaml_path: Optional[Union[PathLike, str]] = None, + params_override: Optional[list] = None, + **kwargs, + ) -> "DataIndex": + from azure.ai.generative.index._dataindex._schema._data_index import DataIndexSchema + + data = data or {} + params_override = params_override or [] + context = { + BASE_PATH_CONTEXT_KEY: Path(yaml_path).parent if yaml_path else Path("./"), + PARAMS_OVERRIDE_KEY: params_override, + } + return load_from_dict(DataIndexSchema, data, context, **kwargs) + + def _to_dict(self) -> Dict: + # pylint: disable=no-member + from azure.ai.generative.index._dataindex._schema._data_index import DataIndexSchema + + return DataIndexSchema(context={BASE_PATH_CONTEXT_KEY: "./"}).dump(self) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/__init__.py new file mode 100644 index 000000000000..d058f334e5ac --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/__init__.py @@ -0,0 +1,12 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""DataIndex operations.""" + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) + +from azure.ai.generative.index._dataindex.operations._data_operations import DataOperations + +__all__ = [ + "DataOperations", +] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/_data_operations.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/_data_operations.py new file mode 100644 index 000000000000..4e7da155c15c --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/operations/_data_operations.py @@ -0,0 +1,115 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ---------------------------------------------------------
+
+# pylint: disable=protected-access
+# pylint: disable=no-member
+
+from typing import Optional, Union
+
+from azure.ai.ml._telemetry import ActivityType, monitor_with_activity
+from azure.ai.ml._utils._asset_utils import (
+    _validate_auto_delete_setting_in_data_output,
+    _validate_workspace_managed_datastore,
+)
+from azure.ai.ml._utils._experimental import experimental
+from azure.ai.ml.constants._common import (
+    AssetTypes,
+    AzureMLResourceType,
+)
+from azure.ai.ml.entities import PipelineJob, PipelineJobSettings
+from azure.ai.ml.entities._credentials import ManagedIdentityConfiguration, UserIdentityConfiguration
+from azure.ai.ml.entities._inputs_outputs import Input
+from azure.ai.ml.operations._data_operations import DataOperations, logger
+from azure.ai.generative.index._dataindex.data_index import index_data as index_data_func
+from azure.ai.generative.index._dataindex.entities.data_index import DataIndex
+
+
+@monitor_with_activity(logger, "Data.IndexData", ActivityType.PUBLICAPI)
+@experimental
+def index_data(
+    self,
+    data_index: DataIndex,
+    identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]] = None,
+    compute: str = "serverless",
+    serverless_instance_type: Optional[str] = None,
+    input_data_override: Optional[Input] = None,
+    submit_job: bool = True,
+    **kwargs,
+) -> PipelineJob:
+    """
+    Returns the data index job that is creating the data asset.
+
+    :param data_index: DataIndex object.
+    :type data_index: azure.ai.ml.entities._dataindex
+    :param identity: Identity configuration for the job.
+    :type identity: Optional[Union[ManagedIdentityConfiguration, UserIdentityConfiguration]]
+    :param compute: The compute target to use for the job. Default: "serverless".
+    :type compute: str
+    :param serverless_instance_type: The instance type to use for serverless compute.
+    :type serverless_instance_type: Optional[str]
+    :param input_data_override: Input data override for the job.
+        Used to pipe output of step into DataIndex Job in a pipeline.
+    :type input_data_override: Optional[Input]
+    :param submit_job: Whether to submit the job to the service. Default: True.
+    :type submit_job: bool
+    :return: The data index job object.
+    :rtype: ~azure.ai.ml.entities.PipelineJob
+    """
+    from azure.ai.ml import MLClient
+
+    default_name = "data_index_" + data_index.name
+    experiment_name = kwargs.pop("experiment_name", None) or default_name
+    data_index.type = AssetTypes.URI_FOLDER
+
+    # avoid specifying auto_delete_setting in job output now
+    _validate_auto_delete_setting_in_data_output(data_index.auto_delete_setting)
+
+    # block customer specified path on managed datastore
+    data_index.path = _validate_workspace_managed_datastore(data_index.path)
+
+    # TODO: This is import_data behavior, not sure if it should be default for index_data, or just be documented?
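+    # e.g. a hypothetical path "azureml://datastores/workspaceblobstore/paths/indexes/my-index"
+    # becomes ".../my-index/${{name}}" so each run writes under a unique, run-named sub-folder.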
+ if "${{name}}" not in data_index.path and "{name}" not in data_index.path: + data_index.path = data_index.path.rstrip("/") + "/${{name}}" + + index_job = index_data_func( + description=data_index.description or kwargs.pop("description", None) or default_name, + name=data_index.name or kwargs.pop("name", None), + display_name=kwargs.pop("display_name", None) or default_name, + experiment_name=experiment_name, + compute=compute, + serverless_instance_type=serverless_instance_type, + data_index=data_index, + ml_client=MLClient( + subscription_id=self._subscription_id, + resource_group_name=self._resource_group_name, + workspace_name=self._workspace_name, + credential=self._service_client._config.credential, + ), + identity=identity, + input_data_override=input_data_override, + **kwargs, + ) + index_pipeline = PipelineJob( + description=index_job.description, + tags=index_job.tags, + name=index_job.name, + display_name=index_job.display_name, + experiment_name=experiment_name, + properties=index_job.properties or {}, + settings=PipelineJobSettings(force_rerun=True, default_compute=compute), + jobs={default_name: index_job}, + ) + index_pipeline.properties["azureml.mlIndexAssetName"] = data_index.name + index_pipeline.properties["azureml.mlIndexAssetKind"] = data_index.index.type + index_pipeline.properties["azureml.mlIndexAssetSource"] = kwargs.pop("mlindex_asset_source", "Data Asset") + + if submit_job: + return self._all_operations.all_operations[AzureMLResourceType.JOB].create_or_update( + job=index_pipeline, skip_validation=True, **kwargs + ) + + return index_pipeline + + +DataOperations.index_data = index_data diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_docstore.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_docstore.py new file mode 100644 index 000000000000..3464e1aacbc7 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_docstore.py @@ -0,0 +1,91 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""DocumentStore.""" +from pathlib import Path +from typing import Dict, Optional, Union + +from azure.ai.generative.index._documents import Document, StaticDocument +from azure.ai.generative.index._utils.logging import get_logger + +logger = get_logger(__name__) + + +class FileBasedDocstore: + """Simple docstore which serializes to file and loads into memory.""" + + def __init__(self, _dict: Optional[Dict[str, Document]] = None): + """Initialize with dict.""" + self._dict = _dict if _dict is not None else {} + + def add(self, texts: Dict[str, Document]) -> None: + """ + Add texts to in memory dictionary. + + Args: + ---- + texts: dictionary of id -> document. + + Returns: + ------- + None + """ + overlapping = set(texts).intersection(self._dict) + if overlapping: + raise ValueError(f"Tried to add ids that already exist: {overlapping}") + self._dict = {**self._dict, **texts} + + def delete(self, ids: list) -> None: + """Deleting IDs from in memory dictionary.""" + overlapping = set(ids).intersection(self._dict) + if not overlapping: + raise ValueError(f"Tried to delete ids that does not exist: {ids}") + for _id in ids: + self._dict.pop(_id) + + def search(self, search: str) -> Union[Document, str]: + """ + Search via direct lookup. + + Args: + ---- + search: id of a document to search for. + + Returns: + ------- + Document if found, else error message. 
+ """ + if search not in self._dict: + return f"ID {search} not found." + else: + return self._dict[search] + + def save(self, output_path: str): + """ + Save to JSONL file. + + Args: + ---- + output_path: folder to save doctore contents in. + """ + output_path_obj = Path(output_path) + output_path_obj.mkdir(parents=True, exist_ok=True) + + with (output_path_obj / "docs.jsonl").open("w", encoding="utf-8") as f: + for doc in self._dict.values(): + json_line = doc.dumps() + f.write(json_line + "\n") + + @classmethod + def load(cls, input_path: str) -> "FileBasedDocstore": + """Load from JSONL file.""" + from fsspec.core import url_to_fs + + fs, uri = url_to_fs(input_path) + + documents: Optional[Dict[str, Document]] = {} + with fs.open(f"{input_path.rstrip('/')}/docs.jsonl") as f: + for line in f: + document = StaticDocument.loads(line.strip()) + documents[document.document_id] = document # type: ignore[index] + return cls(documents) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py index 4c27e08e1a80..c3538f847f09 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/__init__.py @@ -16,8 +16,8 @@ crack_documents, files_to_document_source, ) +from azure.ai.generative.index._documents.document import Document, StaticDocument from azure.ai.generative.index._utils.logging import get_logger -from azure.ai.resources._index._documents.document import Document, StaticDocument logger = get_logger(__name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py index f3360060e3e9..1edb6a1aeb42 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/chunking.py @@ -10,12 +10,11 @@ from functools import lru_cache from typing import Any, Iterable, Iterator, List, Optional, Sequence -from azure.ai.generative.index._documents.document import DocumentSource +from azure.ai.generative.index._documents.document import Document, DocumentSource, StaticDocument +from azure.ai.generative.index._langchain.vendor.text_splitter import TextSplitter from azure.ai.generative.index._utils import merge_dicts from azure.ai.generative.index._utils.logging import get_logger, safe_mlflow_log_metric -from azure.ai.resources._index._langchain.vendor.text_splitter import TextSplitter -from azure.ai.resources._index._utils.tokens import tiktoken_cache_dir, token_length_function -from azure.ai.resources._index._documents.document import Document, StaticDocument +from azure.ai.generative.index._utils.tokens import tiktoken_cache_dir, token_length_function logger = get_logger(__name__) @@ -69,7 +68,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter # Handle non-natural language splitters if file_extension == ".py": - from azure.ai.resources._index._langchain.vendor.text_splitter import Language, RecursiveCharacterTextSplitter + from azure.ai.generative.index._langchain.vendor.text_splitter import Language, RecursiveCharacterTextSplitter with tiktoken_cache_dir(): return RecursiveCharacterTextSplitter.from_tiktoken_encoder( **{ @@ -85,7 +84,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter # If configured to use NLTK for splitting on 
sentence boundaries use that for non-code text formats if use_nltk: _init_nltk() - from azure.ai.resources._index._langchain.vendor.text_splitter import NLTKTextSplitter + from azure.ai.generative.index._langchain.vendor.text_splitter import NLTKTextSplitter return NLTKTextSplitter( length_function=token_length_function(), @@ -98,7 +97,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter # Finally use any text format specific splitters formats_to_treat_as_txt_once_loaded = [".pdf", ".ppt", ".pptx", ".doc", ".docx", ".xls", ".xlsx"] if file_extension == ".txt" or file_extension in formats_to_treat_as_txt_once_loaded: - from azure.ai.resources._index._langchain.vendor.text_splitter import TokenTextSplitter + from azure.ai.generative.index._langchain.vendor.text_splitter import TokenTextSplitter with tiktoken_cache_dir(): return TokenTextSplitter( @@ -107,7 +106,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter **{**arguments, "disallowed_special": (), "allowed_special": "all"} ) elif file_extension == ".html" or file_extension == ".htm": - from azure.ai.resources._index._langchain.vendor.text_splitter import TokenTextSplitter + from azure.ai.generative.index._langchain.vendor.text_splitter import TokenTextSplitter logger.info("Using HTML splitter.") with tiktoken_cache_dir(): @@ -118,7 +117,7 @@ def get_langchain_splitter(file_extension: str, arguments: dict) -> TextSplitter ) elif file_extension == ".md": if use_rcts: - from azure.ai.resources._index._langchain.vendor.text_splitter import MarkdownTextSplitter + from azure.ai.generative.index._langchain.vendor.text_splitter import MarkdownTextSplitter with tiktoken_cache_dir(): return MarkdownTextSplitter.from_tiktoken_encoder( @@ -266,7 +265,7 @@ class MarkdownHeaderSplitter(TextSplitter): def __init__(self, remove_hyperlinks: bool = True, remove_images: bool = True, **kwargs: Any): """Initialize Markdown Header Splitter.""" - from azure.ai.resources._index._langchain.vendor.text_splitter import TokenTextSplitter + from azure.ai.generative.index._langchain.vendor.text_splitter import TokenTextSplitter self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images with tiktoken_cache_dir(): diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py index 8674d5840dc8..edc020ed77b4 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/cracking.py @@ -10,9 +10,9 @@ from typing import IO, Any, Callable, Iterator, List, Optional, Tuple, Type, Union from azure.ai.generative.index._documents.chunking import ChunkedDocument, DocumentSource +from azure.ai.generative.index._documents.document import Document, StaticDocument from azure.ai.generative.index._langchain.vendor.document_loaders.unstructured import UnstructuredFileIOLoader from azure.ai.generative.index._utils.logging import get_logger, safe_mlflow_log_metric -from azure.ai.resources._index._documents.document import Document, StaticDocument logger = get_logger(__name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py index cfc247d4e63a..3f08113fda9c 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py +++ 
b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_documents/document.py @@ -2,11 +2,16 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # --------------------------------------------------------- """Document abstraction.""" -from abc import ABC +import json +from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Optional +from typing import Any, Optional, Union from pathlib import Path +import mmh3 +from azure.ai.generative.index._utils.tokens import token_length_function + + @dataclass class DocumentSource: """Document Source.""" @@ -29,3 +34,126 @@ def get_metadata(self) -> dict: "url": self.url, "mtime": self.mtime, } + + +class Document(ABC): + """Document.""" + + document_id: str + + def __init__(self, document_id: str): + """Initialize Document.""" + self.document_id = document_id + + @abstractmethod + def modified_time(self) -> Any: + """Get the modified time of the document.""" + pass + + @abstractmethod + def load_data(self) -> str: + """Load the data of the document.""" + pass + + @abstractmethod + def get_metadata(self) -> dict: + """Get the metadata of the document.""" + pass + + @abstractmethod + def set_metadata(self, metadata: dict): + """Set the metadata of the document.""" + pass + + @property + def page_content(self) -> str: + """Get the page content of the document.""" + return self.load_data() + + @property + def metadata(self) -> dict: + """Get the metadata of the document.""" + return self.get_metadata() + + @metadata.setter + def metadata(self, value: dict): + """Set the metadata of the document.""" + self.set_metadata(value) + + @abstractmethod + def dumps(self) -> str: + """Dump the document to a json string.""" + pass + + @classmethod + @abstractmethod + def loads(cls, data: str) -> "Document": + """Load the document from a json string.""" + pass + + +class StaticDocument(Document): + """Static Document holds data in-memory.""" + + data: str + _metadata: dict + + def __init__(self, data: str, metadata: dict, document_id: Optional[str] = None, mtime=None): + """Initialize StaticDocument.""" + if document_id is None: + filename = metadata.get("source", {}).get("filename", None) + if filename is not None: + document_id = f"{filename}{metadata.get('source', {}).get('chunk_id', '')}" + else: + document_id = str(mmh3.hash128(data)) + + super().__init__(document_id) + self.data = data + self._metadata = metadata + self.mtime = mtime + + def modified_time(self) -> Any: + """Get the modified time of the document.""" + return self.mtime + + def load_data(self) -> str: + """Load the data of the document.""" + return self.data + + def get_metadata(self) -> dict: + """Get the metadata of the document.""" + # if "stats" in self._metadata: + # if "source" not in self._metadata: + # self._metadata["source"] = {} + # self._metadata["source"]["stats"] = self._metadata["stats"] + # del self._metadata["stats"] + + self._metadata = {**self._metadata, "stats": self.document_stats()} + return self._metadata + + def set_metadata(self, metadata: dict): + """Set the metadata of the document.""" + self._metadata = metadata + + def document_stats(self) -> dict: + """Get the stats of the document.""" + return { + "tiktokens": token_length_function()(self.data), + "chars": len(self.data), + "lines": len(self.data.splitlines()), + } + + def __repr__(self): + """Get the representation of the document.""" + return f"StaticDocument(id={self.document_id}, mtime={self.mtime}, metadata={self._metadata})" + + def dumps(self) -> str: + """Dump the 
document to a json string.""" + return json.dumps({"content": self.data, "metadata": self._metadata, "document_id": self.document_id}) + + @classmethod + def loads(cls, data: str) -> "StaticDocument": + """Load the document from a json string.""" + data_dict = json.loads(data) + metadata = data_dict["metadata"] + return cls(data_dict["content"], metadata, data_dict.get("document_id", metadata.get("document_id", metadata.get("id", mmh3.hash128(data_dict["content"]))))) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py index cc8fe662a705..c1d8f0049ce8 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/__init__.py @@ -21,15 +21,14 @@ import pyarrow.parquet as pq import yaml # type: ignore[import] from azure.core.credentials import TokenCredential -from azure.ai.generative.index._documents import DocumentChunksIterator, DocumentSource +from azure.ai.generative.index._documents import Document, DocumentChunksIterator, DocumentSource, StaticDocument +from azure.ai.generative.index._embeddings.openai import OpenAIEmbedder from azure.ai.generative.index._langchain.vendor.document_loaders.base import BaseLoader +from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings as Embedder +from azure.ai.generative.index._langchain.vendor.schema.document import Document as LangChainDocument +from azure.ai.generative.index._models import init_open_ai_from_config, parse_model_uri from azure.ai.generative.index._utils.logging import get_logger, track_activity from azure.ai.generative.index._utils.tokens import tiktoken_cache_dir -from azure.ai.resources._index._documents import Document, StaticDocument -from azure.ai.resources._index._embeddings.openai import OpenAIEmbedder -from azure.ai.resources._index._langchain.vendor.embeddings.base import Embeddings as Embedder -from azure.ai.resources._index._langchain.vendor.schema.document import Document as LangChainDocument -from azure.ai.resources._index._models import init_open_ai_from_config, parse_model_uri logger = get_logger(__name__) @@ -55,7 +54,7 @@ def get_langchain_embeddings(embedding_kind: str, arguments: dict, credential: O ) return embedder elif embedding_kind == "hugging_face": - from azure.ai.resources._index._langchain.vendor.embeddings.huggingface import HuggingFaceEmbeddings + from azure.ai.generative.index._langchain.vendor.embeddings.huggingface import HuggingFaceEmbeddings args = copy.deepcopy(arguments) @@ -1005,7 +1004,7 @@ def add_doc(doc_id, emb_doc, documents): import_faiss_or_so_help_me = dependable_faiss_import elif engine.endswith("indexes.faiss.FaissAndDocStore"): from azure.ai.generative.index._docstore import FileBasedDocstore - from azure.ai.generative.index._indexes.faiss import FaissAndDocStore, import_faiss_or_so_help_me # type: ignore[no-redef] + from azure.ai.generative.index._indexes.faiss import FaissAndDocStore, import_faiss_or_so_help_me def add_doc(doc_id, emb_doc, documents): documents.append( diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/openai.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/openai.py new file mode 100644 index 000000000000..efb1b982e71e --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_embeddings/openai.py @@ -0,0 +1,325 @@ +# 
--------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""OpenAI Embeddings generation and management tools.""" +import os +import time +from typing import Any, Dict, List, Optional + +from azure.ai.resources.constants._common import USER_AGENT_HEADER_KEY +from azure.ai.generative._user_agent import USER_AGENT +from azure.ai.generative.index._utils.logging import get_logger +from packaging import version + +logger = get_logger("embeddings.openai") + + +class OpenAIEmbedder: + """OpenAI Embedding client wrapper with retries.""" + + def __init__( + self, + api_base: str, + api_type: str, + api_version: Optional[str] = None, + api_key: Optional[str] = None, + azure_credential: Optional[Any] = None, + model: str = "text-embedding-ada-002", + deployment: Optional[str] = None, + batch_size: Optional[int] = None, + max_retries: Optional[int] = None, + embedding_ctx_length: Optional[int] = None, + show_progress_bar: bool = False, + openai_passthrough_args: Optional[dict] = None, + ): + """Initialize an OpenAI Embedding client.""" + self.api_base = api_base + self.api_type = api_type + self.api_key = api_key or os.getenv("AZURE_OPENAI_KEY") or "" + # TODO: If azure_credential set, check api_type is azure or azure_ad and setup auth accordingly + self.azure_credential = azure_credential + + if batch_size is None and "azure" in self.api_type: + batch_size = 16 + elif batch_size is None: + batch_size = 1000 + self.batch_size = int(batch_size) + self._dynamic_batch_size: Optional[int] = None + + if max_retries is None: + max_retries = 10 + self.max_retries = max_retries + + if model is None: + model = "text-embedding-ada-002" + self.model = model + + if "azure" in self.api_type and deployment is None: + raise ValueError("Azure OpenAI requires a deployment name.") + self.deployment = deployment + + if embedding_ctx_length is None: + embedding_ctx_length = 8191 + self.embedding_ctx_length = embedding_ctx_length + + self.show_progress_bar = show_progress_bar + self.openai_passthrough_args = openai_passthrough_args or {} + + try: + import openai + except ImportError as e: + raise ImportError("Please install openai via `pip install openai`") from e + + if version.parse(openai.version.VERSION) >= version.parse("1.0.0"): + self.openai_v1plus = True + self.api_version = api_version if api_version else "2023-05-15" + + if "azure" in self.api_type: + client = openai.AzureOpenAI( + api_key=self.api_key, + api_version=self.api_version, + azure_endpoint=self.api_base, + azure_deployment=self.deployment, + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + ) + else: + client = openai.OpenAI( + api_key=self.api_key, + base_url=self.api_base, + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + ) + + self.embedding_client = client.embeddings + + self._params = { + "model": self.model, + **self.openai_passthrough_args, + } + self._retry_exceptions = [ + openai._exceptions.APIStatusError, + openai._exceptions.APITimeoutError, + openai._exceptions.APIError, + openai._exceptions.APIConnectionError, + openai._exceptions.RateLimitError, + openai._exceptions.InternalServerError, + openai._exceptions.APIResponseValidationError, + ] + self._RateLimitError = openai._exceptions.RateLimitError + else: + self.openai_v1plus = False + self.api_version = api_version if api_version else "2023-03-15-preview" + self.embedding_client = openai.Embeddings + self._params = { + "model": self.model, + 
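+                # The pre-1.0 openai module takes connection details per request, so they are
+                # carried alongside the model in the request params (assumed rationale).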
"api_base": self.api_base, + "api_type": self.api_type, + "api_version": self.api_version, + "api_key": self.api_key, + **self.openai_passthrough_args, + } + if self.deployment is not None: + self._params["engine"] = self.deployment + self._retry_exceptions = [ + openai.error.Timeout, + openai.error.APIError, + openai.error.APIConnectionError, + openai.error.RateLimitError, + openai.error.ServiceUnavailableError, + ] + self._RateLimitError = openai.error.RateLimitError + + self._statistics = { + "num_retries": 0, + "time_spent_sleeping": 0, + "num_tokens": 0, + } + + @property + def _openai_client_params(self) -> dict: + return self._params + + @property + def _retryable_openai_errors(self) -> List[Exception]: + return self._retry_exceptions + + def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: + try: + if self._dynamic_batch_size is None: + return self._embed_request(tokenized_texts=tokenized_texts, **kwargs) + else: + embedding_response: Dict[str, List] = {"data": []} + for i in range(0, len(tokenized_texts), self._dynamic_batch_size): + embedding_response["data"].extend( + self._embed_request( + tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs + )["data"] + ) + except Exception as e: + err_msg = str(e) + if "Too many inputs" not in err_msg: + raise + + import re + match = re.match(r".*The max number of inputs is ([0-9]+).*", err_msg) + if match and match.group(1): + try: + self._dynamic_batch_size = int(match.group(1)) + except Exception: + logger.error( + "Failed to parse max number of inputs from error message, falling back to batch_size=1." + ) + self._dynamic_batch_size = 1 + logger.warning(f"Reducing batch_size to {self._dynamic_batch_size} and retrying.") + embedding_response: Dict[str, List] = {"data": []} # type: ignore[no-redef] + for i in range(0, len(tokenized_texts), self._dynamic_batch_size): + embedding_response["data"].extend( + self._embed_request( + tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs + )["data"] + ) + else: + raise + + return embedding_response + + def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: + try: + total_delay = 0 + last_exception = None + for retry in range(self.max_retries): + logger.info(f"Attempt {retry} to embed {len(tokenized_texts)} documents.") + try: + response = self.embedding_client.create( + input=tokenized_texts, + **kwargs, + ) + if self.openai_v1plus: + response = {"object": "list", "data": [{"object": "embedding", "embedding": d.embedding} for d in response.data]} + return response + except Exception as e: + err_msg = str(e) + logger.warning(f"Error embedding: {err_msg}", exc_info=e) + last_exception = e + retrying = False + for retryable_error in self._retryable_openai_errors: + if isinstance(e, type(retryable_error)): + retrying = True + + # Retry with retry-after if found in RateLimitError + if isinstance(e, self._RateLimitError): + logger.warning(f"Retrying error type {type(e)}.") + response_headers = e.headers if hasattr(e, "headers") else {} + if "Retry-After" in response_headers: + delay = int(response_headers["Retry-After"]) + logger.warning(f"OpenAI throws RateLimitError with Retry-After {delay} seconds.") + else: + # Wait for 1 minute as suggested by openai https://help.openai.com/en/articles/6897202-ratelimiterror + logger.warning("Retry after 60 seconds.") + delay = 60 + total_delay += delay + logger.warning(f"Sleeping for {delay} seconds before retrying.") + time.sleep(delay) + break + + if 
not retrying: + break + finally: + self._statistics["num_retries"] += retry + self._statistics["time_spent_sleeping"] += total_delay + + err_msg = f"Failed to embed {len(tokenized_texts)} documents after {total_delay}s and {retry} retries. {last_exception}" + logger.error(err_msg) # TODO: Add custom dimensions + raise RuntimeError(err_msg) + + def _embed(self, texts: List[str]) -> List[List[float]]: + """Embed the given texts.""" + import numpy as np + import tiktoken + + try: + encoding = tiktoken.encoding_for_model(self.model) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + encoding = tiktoken.get_encoding(model) + + tokenized_texts = [] + num_tokens = 0 + tokenized_texts_to_original_texts_indices = [] + for i, text in enumerate(texts): + if self.model.endswith("001"): + # Replace newlines, which can negatively affect performance. + # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + text = text.replace("\n", " ") + + tokens = encoding.encode( + text, + # TODO: Does this need to be configurable? Our use cases treat all text as raw data. + disallowed_special=(), + ) + # Text longer than a models context length can be split and the embeddings averaged to approximate full text + # See: https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb + for j in range(0, len(tokens), self.embedding_ctx_length): + tokenized_texts.append(tokens[j : j + self.embedding_ctx_length]) + num_tokens += len(tokenized_texts[-1]) + tokenized_texts_to_original_texts_indices.append(i) + + self._statistics["num_tokens"] += num_tokens + + if self.show_progress_bar: + try: + import tqdm + + _iter = tqdm.tqdm(range(0, len(tokenized_texts), self.batch_size)) + except ImportError: + _iter = range(0, len(tokenized_texts), self.batch_size) + else: + _iter = range(0, len(tokenized_texts), self.batch_size) + + batched_embeddings: List[List[float]] = [] + for i in _iter: + response = self._dynamic_batch_size_embed_request( + tokenized_texts=tokenized_texts[i : i + self.batch_size], + **self._openai_client_params, + ) + batched_embeddings.extend(r["embedding"] for r in response["data"]) + + embedding_results: List[List[List[float]]] = [[] for _ in range(len(texts))] + num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] + for i in range(len(tokenized_texts_to_original_texts_indices)): + embedding_results[tokenized_texts_to_original_texts_indices[i]].append(batched_embeddings[i]) + num_tokens_in_batch[tokenized_texts_to_original_texts_indices[i]].append(len(tokenized_texts[i])) + + embeddings: List[List[float]] = [[] for _ in range(len(texts))] + for i in range(len(texts)): + _result = embedding_results[i] + if len(_result) == 0: + average = self._embed_request(tokenized_texts="", **self._openai_client_params)["data"][0]["embedding"] # type: ignore[arg-type] + else: + average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) + embeddings[i] = (average / np.linalg.norm(average)).tolist() + + return embeddings + + def embed_documents(self, documents: List[str]) -> List[List[float]]: + """Batch embed documents.""" + return self._embed(documents) + + def embed_query(self, query: str) -> List[float]: + """Embed a single query.""" + return self.embed_documents([query])[0] + + # # TODO: _aembed + # async def aembed_documents(self, documents: List[str]) -> List[List[float]]: + # """Batch embed documents.""" + # return await 
self._aembed(documents) + + # async def aembed_query(self, query: str) -> List[float]: + # """Embed a single query.""" + # embeddings = await self.aembed_documents([query]) + # return embeddings[0] + + @property + def statistics(self) -> Dict[str, Any]: + """Return statistics about the last embedding request.""" + return self._statistics diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py index 7b8f97ac5356..0690940f7a3a 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_functions.py @@ -36,4 +36,4 @@ def get_native_index_client_from_index( path: Optional[Union[str, Path]], credential: Optional[TokenCredential] = None, ): - return DataplaneMLIndex(path).as_native_index_client(credential=credential) \ No newline at end of file + return DataplaneMLIndex(path).as_native_index_client(credential=credential) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/azure_search.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/azure_search.py new file mode 100644 index 000000000000..94f448c5763f --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/azure_search.py @@ -0,0 +1,21 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- +"""Azure Cognitive Search based Vector Index.""" +from types import ModuleType + +from azure.ai.generative.index._utils.logging import get_logger, version + +logger = get_logger("indexes.azure_search") + + +def import_azure_search_or_so_help_me() -> ModuleType: + """Import azure-search-documents if available, otherwise raise error.""" + try: + import azure.search.documents as azure_search_documents + except ImportError as e: + raise ImportError( + "Could not import azure-search-documents python package. " + f"Please install it with `pip install azure-ai-generative[cognitive_search]=={version}`" + ) from e + return azure_search_documents diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/faiss.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/faiss.py new file mode 100644 index 000000000000..301c1d7a9402 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_indexes/faiss.py @@ -0,0 +1,187 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# ---------------------------------------------------------
+"""Faiss based Vector Index using a file based DocumentStore."""
+import json
+import os
+from pathlib import Path
+from types import ModuleType
+from typing import Any, Callable, Dict, List, Tuple
+
+import numpy as np
+from azure.ai.generative.index._docstore import FileBasedDocstore
+from azure.ai.generative.index._documents import Document
+from azure.ai.generative.index._utils.logging import get_logger
+
+logger = get_logger("indexes.faiss")
+
+
+def import_faiss_or_so_help_me() -> ModuleType:
+    """Import faiss if available, otherwise raise error."""
+    try:
+        if os.getenv("FAISS_NO_AVX2", "false").lower() == "true":
+            from faiss import swigfaiss as faiss
+        else:
+            import faiss
+    except ImportError as e:
+        raise ImportError(
+            "Could not import faiss python package. "
+            "Please install it with `pip install faiss-gpu` (for CUDA supported GPU) "
+            "or `pip install faiss-cpu` (depending on Python version)."
+        ) from e
+    return faiss
+
+
+class FaissAndDocStore:
+    """Faiss based VectorStore using a file based DocumentStore."""
+
+    docstore: FileBasedDocstore
+    index: Any
+    query_embed: Callable[[str], List[float]]
+    index_to_doc_id: Dict[str, str]
+
+    def __init__(
+        self,
+        query_embed: Callable[[str], List[float]],
+        index: Any,
+        docstore: FileBasedDocstore,
+        index_to_doc_id: Dict[str, str]
+    ):
+        """Initialize FaissAndDocStore."""
+        self.query_embed = query_embed
+        self.index = index
+        self.docstore = docstore
+        self.index_to_doc_id = index_to_doc_id
+
+    def similarity_search_with_score_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """
+        Return docs most similar to the given embedding vector.
+
+        Args:
+        ----
+        embedding: Embedding vector to look up documents similar to.
+        k: Number of Documents to return. Defaults to 4.
+        kwargs: kwargs to be passed to similarity search. Can include:
+            score_threshold: Optional, a floating point value between 0 to 1 to
+                filter the resulting set of retrieved docs
+
+        Returns:
+        -------
+        List of documents most similar to the query text and L2 distance
+        in float for each. Lower score represents more similarity.
+        """
+        vector = np.array([embedding], dtype=np.float32)
+        scores, indices = self.index.search(vector, k)
+        docs = []
+        for j, i in enumerate(indices[0]):
+            if i == -1:
+                # This happens when not enough docs are returned.
+                continue
+            _id = self.index_to_doc_id[str(i)]
+            doc = self.docstore.search(_id)
+            if not isinstance(doc, Document):
+                raise ValueError(f"Could not find document for id {_id}, got {doc}")
+            docs.append((doc, scores[0][j]))
+
+        score_threshold = kwargs.get("score_threshold")
+        if score_threshold is not None:
+            docs = [
+                (doc, similarity)
+                for doc, similarity in docs
+                if similarity > score_threshold
+            ]
+        return docs[:k]
+
+    def similarity_search_with_score(self, query: str, k: int = 8, **kwargs: Any) -> List[Tuple[Document, float]]:
+        """
+        Return docs most similar to query.
+
+        Args:
+        ----
+        query: Text to look up documents similar to.
+        k: Number of Documents to return. Defaults to 8.
+
+        Returns:
+        -------
+        List of documents most similar to the query text with
+        L2 distance in float. Lower score represents more similarity.
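+        Hypothetical usage: `store.similarity_search_with_score("what is faiss?", k=4)`
+        returns up to k (Document, score) pairs ordered by ascending L2 distance.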
+ """ + embedding = self.query_embed(query) + docs = self.similarity_search_with_score_by_vector(embedding, k, **kwargs) + return docs + + def similarity_search_by_vector(self, embedding: List[float], k: int = 8, **kwargs) -> List[Document]: + """ + Return docs most similar to embedding vector. + + Args: + ---- + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. + + Returns: + ------- + List of Documents most similar to the embedding. + """ + docs_and_scores = self.similarity_search_with_score_by_vector(embedding, k, **kwargs) + return [doc for doc, _ in docs_and_scores] + + def similarity_search(self, query: str, k: int = 8, **kwargs) -> List[Document]: + """ + Return docs most similar to query. + + Args: + ---- + query: Text to look up documents similar to. + k: Number of Documents to return. + + Returns: + ------- + List of Documents most similar to the query. + """ + docs_and_scores = self.similarity_search_with_score(query, k, **kwargs) + return [doc for doc, _ in docs_and_scores] + + def save(self, output_path: str): + """Write index and docstore to output_path.""" + output_path_obj = Path(output_path) + output_path_obj.mkdir(exist_ok=True, parents=True) + + faiss = import_faiss_or_so_help_me() + faiss.write_index(self.index, str(output_path_obj / "index.faiss")) + + self.docstore.save(str(output_path_obj / "docstore")) + + with (output_path_obj / "index_to_doc_id.json").open("w") as f: + json.dump(self.index_to_doc_id, f) + + def save_local(self, output_path: str): + """Same as save, alias to match langchain.vectorstores.FAISS.""" + return self.save(output_path) + + @classmethod + def load(cls, input_path: str, query_embed: Callable[[str], List[float]]) -> "FaissAndDocStore": + """Read index and docstore from input_path.""" + import tempfile + + from fsspec.core import url_to_fs + + logger.info(f"Loading FaissAndDocStore from: {input_path}") + fs, uri = url_to_fs(input_path) + + with tempfile.TemporaryDirectory() as tmpdir: + fs.download(f"{uri.rstrip('/')}/index.faiss", str(tmpdir)) + faiss = import_faiss_or_so_help_me() + index = faiss.read_index(f"{tmpdir.rstrip('/')}/index.faiss") + + with fs.open(f"{uri.rstrip('/')}/index_to_doc_id.json", "r") as f: + index_to_doc_id = json.load(f) + + docstore = FileBasedDocstore.load(f"{input_path.rstrip('/')}/docstore") + + return cls(query_embed, index, docstore, index_to_doc_id) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py index 7141bd822a55..543ba1c211f2 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/acs.py @@ -7,16 +7,16 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple from azure.ai.generative.index._utils.logging import get_logger -from azure.ai.resources._index._utils.requests import send_post_request +from azure.ai.generative.index._utils.requests import send_post_request try: from langchain.schema.document import Document from langchain.schema.embeddings import Embeddings from langchain.schema.vectorstore import VectorStore except ImportError: - from azure.ai.resources._index._langchain.vendor.embeddings.base import Embeddings - from azure.ai.resources._index._langchain.vendor.schema.document import Document - from azure.ai.resources._index._langchain.vendor.vectorstores.base import VectorStore + from azure.ai.generative.index._langchain.vendor.embeddings.base 
import Embeddings + from azure.ai.generative.index._langchain.vendor.schema.document import Document + from azure.ai.generative.index._langchain.vendor.vectorstores.base import VectorStore logger = get_logger("langchain.acs") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py index 6604dec8bd49..5682b44555ae 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/docstore.py @@ -4,9 +4,9 @@ """Langchain compatible Docstore which serializes to jsonl.""" from typing import Dict, Union +from azure.ai.generative.index._docstore import FileBasedDocstore from azure.ai.generative.index._embeddings import WrappedLangChainDocument -from azure.ai.resources._index._documents import Document -from azure.ai.resources._index._docstore import FileBasedDocstore +from azure.ai.generative.index._documents import Document from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.document import Document as LangChainDocument @@ -15,7 +15,7 @@ class FileBasedDocStore(Docstore, AddableMixin): """Simple docstore which serializes to file and loads into memory.""" def __init__(self, docstore: FileBasedDocstore): - """Initialize with azure.ai.resources._index._docstore.FileBasedDocstore.""" + """Initialize with azure.ai.generative.index._docstore.FileBasedDocstore.""" self.docstore = docstore def add(self, texts: Dict[str, LangChainDocument]) -> None: diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py index dc8025e48a78..d94590722a4e 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/faiss.py @@ -2,9 +2,9 @@ # Copyright (c) Microsoft Corporation. All rights reserved. 
 # ---------------------------------------------------------
 """Faiss based VectorStore using a file based DocumentStore."""
+from azure.ai.generative.index._indexes.faiss import FaissAndDocStore
 from azure.ai.generative.index._langchain.docstore import FileBasedDocStore
 from azure.ai.generative.index._utils.logging import get_logger
-from azure.ai.resources._index._indexes.faiss import FaissAndDocStore
 from langchain.vectorstores import FAISS
 from langchain.vectorstores.base import VectorStore
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py
index 34bfddcfd02f..ce4e9e340788 100644
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/openai.py
@@ -8,7 +8,7 @@ def patch_openai_embedding_retries(logger, activity_logger, max_seconds_retrying
     """Patch the openai embedding to retry on failure."""
     from datetime import datetime
 
-    from azure.ai.resources._index._langchain.vendor.embeddings import openai as langchain_openai
+    from azure.ai.generative.index._langchain.vendor.embeddings import openai as langchain_openai
     from tenacity import (
         retry,
         retry_if_exception_type,
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/__init__.py
new file mode 100644
index 000000000000..624f5ee88ecf
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/__init__.py
@@ -0,0 +1,6 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+
+
+__path__ = __import__("pkgutil").extend_path(__path__, __name__)
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/base.py
new file mode 100644
index 000000000000..c054c26cfc69
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/docstore/base.py
@@ -0,0 +1,25 @@
+"""Interface to access to place that stores documents."""
+from abc import ABC, abstractmethod
+from typing import Dict, Union
+
+from azure.ai.generative.index._langchain.vendor.schema.document import Document
+
+
+class Docstore(ABC):
+    """Interface to access to place that stores documents."""
+
+    @abstractmethod
+    def search(self, search: str) -> Union[str, Document]:
+        """Search for document.
+
+        If page exists, return the page summary, and a Document object.
+        If page does not exist, return similar entries.
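+
+        Example (illustrative sketch; ``FileBasedDocstore`` in this package is
+        one concrete implementation):
+
+        .. code-block:: python
+
+            result = docstore.search("doc-42")
+            if isinstance(result, str):
+                print(result)  # miss: a message, possibly listing similar entries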
+ """ + + +class AddableMixin(ABC): + """Mixin class that supports adding texts.""" + + @abstractmethod + def add(self, texts: Dict[str, Document]) -> None: + """Add more documents.""" diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py index 7af9218aa109..5bb2836af8a8 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/base.py @@ -5,8 +5,8 @@ from abc import ABC, abstractmethod from typing import Iterator, List, Optional -from azure.ai.resources._index._langchain.vendor.schema.document import Document -from azure.ai.resources._index._langchain.vendor.text_splitter import RecursiveCharacterTextSplitter, TextSplitter +from azure.ai.generative.index._langchain.vendor.schema.document import Document +from azure.ai.generative.index._langchain.vendor.text_splitter import RecursiveCharacterTextSplitter, TextSplitter class BaseLoader(ABC): diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py index 002047dae436..bd507e0517b1 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/document_loaders/unstructured.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from typing import IO, Any, Callable, Dict, List, Optional, Sequence, Union -from azure.ai.resources._index._langchain.vendor.schema.document import Document +from azure.ai.generative.index._langchain.vendor.schema.document import Document from azure.ai.generative.index._langchain.vendor.document_loaders.base import BaseLoader diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/base.py new file mode 100644 index 000000000000..e9996c3c3f0f --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/base.py @@ -0,0 +1,25 @@ +# This file has been copied as is. 
+# Last Sync: 2023-08-24 +# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 +from abc import ABC, abstractmethod +from typing import List + + +class Embeddings(ABC): + """Interface for embedding models.""" + + @abstractmethod + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Embed search docs.""" + + @abstractmethod + def embed_query(self, text: str) -> List[float]: + """Embed query text.""" + + async def aembed_documents(self, texts: List[str]) -> List[List[float]]: + """Asynchronous Embed search docs.""" + raise NotImplementedError + + async def aembed_query(self, text: str) -> List[float]: + """Asynchronous Embed query text.""" + raise NotImplementedError diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/huggingface.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/huggingface.py new file mode 100644 index 000000000000..9495b815e7cc --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/huggingface.py @@ -0,0 +1,256 @@ +# This file has been copied as is. +# Last Sync: 2023-08-24 +# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings + +DEFAULT_MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" +DEFAULT_INSTRUCT_MODEL = "hkunlp/instructor-large" +DEFAULT_BGE_MODEL = "BAAI/bge-large-en" +DEFAULT_EMBED_INSTRUCTION = "Represent the document for retrieval: " +DEFAULT_QUERY_INSTRUCTION = ( + "Represent the question for retrieving supporting documents: " +) +DEFAULT_QUERY_BGE_INSTRUCTION_EN = ( + "Represent this question for searching relevant passages: " +) +DEFAULT_QUERY_BGE_INSTRUCTION_ZH = "为这个句子生成表示以用于检索相关文章:" + + +@dataclass +class HuggingFaceEmbeddings(Embeddings): + """HuggingFace sentence_transformers embedding models. + + To use, you should have the ``sentence_transformers`` python package installed. + + Example: + .. code-block:: python + + from langchain.embeddings import HuggingFaceEmbeddings + + model_name = "sentence-transformers/all-mpnet-base-v2" + model_kwargs = {'device': 'cpu'} + encode_kwargs = {'normalize_embeddings': False} + hf = HuggingFaceEmbeddings( + model_name=model_name, + model_kwargs=model_kwargs, + encode_kwargs=encode_kwargs + ) + """ + + client: Any = field(init=False) #: :meta private: + model_name: str = DEFAULT_MODEL_NAME + """Model name to use.""" + cache_folder: Optional[str] = None + """Path to store models. + Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" + model_kwargs: Dict[str, Any] = field(default_factory=dict) + """Key word arguments to pass to the model.""" + encode_kwargs: Dict[str, Any] = field(default_factory=dict) + """Key word arguments to pass when calling the `encode` method of the model.""" + multi_process: bool = False + """Run encode() on multiple GPUs.""" + + def __post_init__(self, **kwargs: Any): + """Initialize the sentence_transformer.""" + try: + import sentence_transformers + + except ImportError as exc: + raise ImportError( + "Could not import sentence_transformers python package. " + "Please install it with `pip install sentence_transformers`." 
+ ) from exc + + self.client = sentence_transformers.SentenceTransformer( + self.model_name, cache_folder=self.cache_folder, **self.model_kwargs + ) + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Compute doc embeddings using a HuggingFace transformer model. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + import sentence_transformers + + texts = list(map(lambda x: x.replace("\n", " "), texts)) + if self.multi_process: + pool = self.client.start_multi_process_pool() + embeddings = self.client.encode_multi_process(texts, pool) + sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool) + else: + embeddings = self.client.encode(texts, **self.encode_kwargs) + + return embeddings.tolist() + + def embed_query(self, text: str) -> List[float]: + """Compute query embeddings using a HuggingFace transformer model. + + Args: + text: The text to embed. + + Returns: + Embeddings for the text. + """ + return self.embed_documents([text])[0] + + +@dataclass +class HuggingFaceInstructEmbeddings(Embeddings): + """Wrapper around sentence_transformers embedding models. + + To use, you should have the ``sentence_transformers`` + and ``InstructorEmbedding`` python packages installed. + + Example: + .. code-block:: python + + from langchain.embeddings import HuggingFaceInstructEmbeddings + + model_name = "hkunlp/instructor-large" + model_kwargs = {'device': 'cpu'} + encode_kwargs = {'normalize_embeddings': True} + hf = HuggingFaceInstructEmbeddings( + model_name=model_name, + model_kwargs=model_kwargs, + encode_kwargs=encode_kwargs + ) + """ + + client: Any = field(init=False) #: :meta private: + model_name: str = DEFAULT_INSTRUCT_MODEL + """Model name to use.""" + cache_folder: Optional[str] = None + """Path to store models. + Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" + model_kwargs: Dict[str, Any] = field(default_factory=dict) + """Key word arguments to pass to the model.""" + encode_kwargs: Dict[str, Any] = field(default_factory=dict) + """Key word arguments to pass when calling the `encode` method of the model.""" + embed_instruction: str = DEFAULT_EMBED_INSTRUCTION + """Instruction to use for embedding documents.""" + query_instruction: str = DEFAULT_QUERY_INSTRUCTION + """Instruction to use for embedding query.""" + + def __post_init__(self, **kwargs: Any): + """Initialize the sentence_transformer.""" + try: + from InstructorEmbedding import INSTRUCTOR + + self.client = INSTRUCTOR( + self.model_name, cache_folder=self.cache_folder, **self.model_kwargs + ) + except ImportError as e: + raise ImportError("Dependencies for InstructorEmbedding not found.") from e + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Compute doc embeddings using a HuggingFace instruct model. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + instruction_pairs = [[self.embed_instruction, text] for text in texts] + embeddings = self.client.encode(instruction_pairs, **self.encode_kwargs) + return embeddings.tolist() + + def embed_query(self, text: str) -> List[float]: + """Compute query embeddings using a HuggingFace instruct model. + + Args: + text: The text to embed. + + Returns: + Embeddings for the text. 
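+
+        Example (sketch; downloads the default instructor model on first use):
+
+        .. code-block:: python
+
+            hf = HuggingFaceInstructEmbeddings()
+            vector = hf.embed_query("How do I build an index?")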
+ """ + instruction_pair = [self.query_instruction, text] + embedding = self.client.encode([instruction_pair], **self.encode_kwargs)[0] + return embedding.tolist() + + +@dataclass +class HuggingFaceBgeEmbeddings(Embeddings): + """HuggingFace BGE sentence_transformers embedding models. + + To use, you should have the ``sentence_transformers`` python package installed. + + Example: + .. code-block:: python + + from langchain.embeddings import HuggingFaceBgeEmbeddings + + model_name = "BAAI/bge-large-en" + model_kwargs = {'device': 'cpu'} + encode_kwargs = {'normalize_embeddings': True} + hf = HuggingFaceBgeEmbeddings( + model_name=model_name, + model_kwargs=model_kwargs, + encode_kwargs=encode_kwargs + ) + """ + + client: Any = field(init=False) #: :meta private: + model_name: str = DEFAULT_BGE_MODEL + """Model name to use.""" + cache_folder: Optional[str] = None + """Path to store models. + Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" + model_kwargs: Dict[str, Any] = field(default_factory=dict) + """Key word arguments to pass to the model.""" + encode_kwargs: Dict[str, Any] = field(default_factory=dict) + """Key word arguments to pass when calling the `encode` method of the model.""" + query_instruction: str = DEFAULT_QUERY_BGE_INSTRUCTION_EN + """Instruction to use for embedding query.""" + + def __post_init__(self, **kwargs: Any): + """Initialize the sentence_transformer.""" + try: + import sentence_transformers + + except ImportError as exc: + raise ImportError( + "Could not import sentence_transformers python package. " + "Please install it with `pip install sentence_transformers`." + ) from exc + + self.client = sentence_transformers.SentenceTransformer( + self.model_name, cache_folder=self.cache_folder, **self.model_kwargs + ) + if "-zh" in self.model_name: + self.query_instruction = DEFAULT_QUERY_BGE_INSTRUCTION_ZH + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Compute doc embeddings using a HuggingFace transformer model. + + Args: + texts: The list of texts to embed. + + Returns: + List of embeddings, one for each text. + """ + texts = [t.replace("\n", " ") for t in texts] + embeddings = self.client.encode(texts, **self.encode_kwargs) + return embeddings.tolist() + + def embed_query(self, text: str) -> List[float]: + """Compute query embeddings using a HuggingFace transformer model. + + Args: + text: The text to embed. + + Returns: + Embeddings for the text. + """ + text = text.replace("\n", " ") + embedding = self.client.encode( + self.query_instruction + text, **self.encode_kwargs + ) + return embedding.tolist() diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/openai.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/openai.py new file mode 100644 index 000000000000..2d1446426017 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/embeddings/openai.py @@ -0,0 +1,439 @@ +# This file has been slightly modified to not rely on Pydantic. 
+# Last Sync: 2023-08-24 +# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 +from __future__ import annotations + +import logging +from dataclasses import dataclass, field +from typing import ( + Any, + Callable, + Dict, + List, + Literal, + Optional, + Sequence, + Set, + Tuple, + Union, +) + +import openai +import numpy as np +from tenacity import ( + AsyncRetrying, + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_attempt, + wait_exponential, +) + +from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings + +logger = logging.getLogger(__name__) + + +def _create_retry_decorator(embeddings: OpenAIEmbeddings) -> Callable[[Any], Any]: + import openai + + min_seconds = 4 + max_seconds = 10 + # Wait 2^x * 1 second between each retry starting with + # 4 seconds, then up to 10 seconds, then 10 seconds afterwards + return retry( + reraise=True, + stop=stop_after_attempt(embeddings.max_retries), + wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), + retry=( + retry_if_exception_type(openai.error.Timeout) + | retry_if_exception_type(openai.error.APIError) + | retry_if_exception_type(openai.error.APIConnectionError) + | retry_if_exception_type(openai.error.RateLimitError) + | retry_if_exception_type(openai.error.ServiceUnavailableError) + ), + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + + +def _async_retry_decorator(embeddings: OpenAIEmbeddings) -> Any: + import openai + + min_seconds = 4 + max_seconds = 10 + # Wait 2^x * 1 second between each retry starting with + # 4 seconds, then up to 10 seconds, then 10 seconds afterwards + async_retrying = AsyncRetrying( + reraise=True, + stop=stop_after_attempt(embeddings.max_retries), + wait=wait_exponential(multiplier=1, min=min_seconds, max=max_seconds), + retry=( + retry_if_exception_type(openai.error.Timeout) + | retry_if_exception_type(openai.error.APIError) + | retry_if_exception_type(openai.error.APIConnectionError) + | retry_if_exception_type(openai.error.RateLimitError) + | retry_if_exception_type(openai.error.ServiceUnavailableError) + ), + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + + def wrap(func: Callable) -> Callable: + async def wrapped_f(*args: Any, **kwargs: Any) -> Callable: + async for _ in async_retrying: + return await func(*args, **kwargs) + raise AssertionError("this is unreachable") + + return wrapped_f + + return wrap + + +# https://stackoverflow.com/questions/76469415/getting-embeddings-of-length-1-from-langchain-openaiembeddings +def _check_response(response: dict) -> dict: + if any(len(d["embedding"]) == 1 for d in response["data"]): + import openai + + raise openai.error.APIError("OpenAI API returned an empty embedding") + return response + + +def embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any: + """Use tenacity to retry the embedding call.""" + retry_decorator = _create_retry_decorator(embeddings) + + @retry_decorator + def _embed_with_retry(**kwargs: Any) -> Any: + response = embeddings.client.create(**kwargs) + return _check_response(response) + + return _embed_with_retry(**kwargs) + + +async def async_embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any: + """Use tenacity to retry the embedding call.""" + + @_async_retry_decorator(embeddings) + async def _async_embed_with_retry(**kwargs: Any) -> Any: + response = await embeddings.client.acreate(**kwargs) + return _check_response(response) + + return await _async_embed_with_retry(**kwargs) + + +@dataclass +class 
OpenAIEmbeddings(Embeddings):
+    """OpenAI embedding models.
+
+    To use, you should have the ``openai`` python package installed, and the
+    environment variable ``OPENAI_API_KEY`` set with your API key or pass it
+    as a named parameter to the constructor.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import OpenAIEmbeddings
+            openai = OpenAIEmbeddings(openai_api_key="my-api-key")
+
+    In order to use the library with Microsoft Azure endpoints, you need to set
+    the OPENAI_API_TYPE, OPENAI_API_BASE, OPENAI_API_KEY and OPENAI_API_VERSION.
+    The OPENAI_API_TYPE must be set to 'azure' and the others correspond to
+    the properties of your endpoint.
+    In addition, the deployment name must be passed as the model parameter.
+
+    Example:
+        .. code-block:: python
+
+            import os
+
+            os.environ["OPENAI_API_TYPE"] = "azure"
+            os.environ["OPENAI_API_BASE"] = "https://<your-endpoint>.openai.azure.com/"
+            os.environ["OPENAI_API_KEY"] = "your AzureOpenAI key"
+            os.environ["OPENAI_API_VERSION"] = "2023-05-15"
+    """
+
+    @property
+    def _invocation_params(self) -> Dict:
+        openai_args = {
+            "model": self.model,
+            "request_timeout": self.request_timeout,
+            "headers": self.headers,
+            "api_key": self.openai_api_key,
+            "organization": self.openai_organization,
+            "api_base": self.openai_api_base,
+            "api_type": self.openai_api_type,
+            "api_version": self.openai_api_version,
+            **self.model_kwargs,
+        }
+        if self.openai_api_type in ("azure", "azure_ad", "azuread"):
+            openai_args["engine"] = self.deployment
+        if self.openai_proxy:
+            try:
+                import openai
+            except ImportError:
+                raise ImportError(
+                    "Could not import openai python package. "
+                    "Please install it with `pip install openai`."
+                )
+
+            openai.proxy = {
+                "http": self.openai_proxy,
+                "https": self.openai_proxy,
+            }  # type: ignore[assignment]  # noqa: E501
+        return openai_args
+
+    # please refer to
+    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
+    def _get_len_safe_embeddings(
+        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
+    ) -> List[List[float]]:
+        embeddings: List[List[float]] = [[] for _ in range(len(texts))]
+        try:
+            import tiktoken
+        except ImportError:
+            raise ImportError(
+                "Could not import tiktoken python package. "
+                "This is needed in order to for OpenAIEmbeddings. "
+                "Please install it with `pip install tiktoken`."
+            )
+
+        tokens = []
+        indices = []
+        model_name = self.tiktoken_model_name or self.model
+        try:
+            encoding = tiktoken.encoding_for_model(model_name)
+        except KeyError:
+            logger.warning("Warning: model not found. Using cl100k_base encoding.")
+            model = "cl100k_base"
+            encoding = tiktoken.get_encoding(model)
+        for i, text in enumerate(texts):
+            if self.model.endswith("001"):
+                # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
+                # replace newlines, which can negatively affect performance.
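+                # (Assumption based on the linked issue: only the older *-001
+                # models are sensitive to newlines; newer models such as
+                # text-embedding-ada-002 are not, hence the endswith("001") guard.)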
+ text = text.replace("\n", " ") + token = encoding.encode( + text, + allowed_special=self.allowed_special, + disallowed_special=self.disallowed_special, + ) + for j in range(0, len(token), self.embedding_ctx_length): + tokens.append(token[j : j + self.embedding_ctx_length]) + indices.append(i) + + batched_embeddings: List[List[float]] = [] + _chunk_size = chunk_size or self.chunk_size + + if self.show_progress_bar: + try: + from tqdm.auto import tqdm + + _iter = tqdm(range(0, len(tokens), _chunk_size)) + except ImportError: + _iter = range(0, len(tokens), _chunk_size) + else: + _iter = range(0, len(tokens), _chunk_size) + + for i in _iter: + response = embed_with_retry( + self, + input=tokens[i : i + _chunk_size], + **self._invocation_params, + ) + batched_embeddings.extend(r["embedding"] for r in response["data"]) + + results: List[List[List[float]]] = [[] for _ in range(len(texts))] + num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] + for i in range(len(indices)): + results[indices[i]].append(batched_embeddings[i]) + num_tokens_in_batch[indices[i]].append(len(tokens[i])) + + for i in range(len(texts)): + _result = results[i] + if len(_result) == 0: + average = embed_with_retry( + self, + input="", + **self._invocation_params, + )[ + "data" + ][0]["embedding"] + else: + average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) + embeddings[i] = (average / np.linalg.norm(average)).tolist() + + return embeddings + + # please refer to + # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb + async def _aget_len_safe_embeddings( + self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None + ) -> List[List[float]]: + embeddings: List[List[float]] = [[] for _ in range(len(texts))] + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to for OpenAIEmbeddings. " + "Please install it with `pip install tiktoken`." + ) + + tokens = [] + indices = [] + model_name = self.tiktoken_model_name or self.model + try: + encoding = tiktoken.encoding_for_model(model_name) + except KeyError: + logger.warning("Warning: model not found. Using cl100k_base encoding.") + model = "cl100k_base" + encoding = tiktoken.get_encoding(model) + for i, text in enumerate(texts): + if self.model.endswith("001"): + # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # replace newlines, which can negatively affect performance. 
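+                # (Same *-001 newline workaround as in the synchronous
+                # _get_len_safe_embeddings above.)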
+ text = text.replace("\n", " ") + token = encoding.encode( + text, + allowed_special=self.allowed_special, + disallowed_special=self.disallowed_special, + ) + for j in range(0, len(token), self.embedding_ctx_length): + tokens.append(token[j : j + self.embedding_ctx_length]) + indices.append(i) + + batched_embeddings: List[List[float]] = [] + _chunk_size = chunk_size or self.chunk_size + for i in range(0, len(tokens), _chunk_size): + response = await async_embed_with_retry( + self, + input=tokens[i : i + _chunk_size], + **self._invocation_params, + ) + batched_embeddings.extend(r["embedding"] for r in response["data"]) + + results: List[List[List[float]]] = [[] for _ in range(len(texts))] + num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))] + for i in range(len(indices)): + results[indices[i]].append(batched_embeddings[i]) + num_tokens_in_batch[indices[i]].append(len(tokens[i])) + + for i in range(len(texts)): + _result = results[i] + if len(_result) == 0: + average = ( + await async_embed_with_retry( + self, + input="", + **self._invocation_params, + ) + )["data"][0]["embedding"] + else: + average = np.average(_result, axis=0, weights=num_tokens_in_batch[i]) + embeddings[i] = (average / np.linalg.norm(average)).tolist() + + return embeddings + + def embed_documents( + self, texts: List[str], chunk_size: Optional[int] = 0 + ) -> List[List[float]]: + """Call out to OpenAI's embedding endpoint for embedding search docs. + + Args: + texts: The list of texts to embed. + chunk_size: The chunk size of embeddings. If None, will use the chunk size + specified by the class. + + Returns: + List of embeddings, one for each text. + """ + # NOTE: to keep things simple, we assume the list may contain texts longer + # than the maximum context and use length-safe embedding function. + return self._get_len_safe_embeddings(texts, engine=self.deployment) + + async def aembed_documents( + self, texts: List[str], chunk_size: Optional[int] = 0 + ) -> List[List[float]]: + """Call out to OpenAI's embedding endpoint async for embedding search docs. + + Args: + texts: The list of texts to embed. + chunk_size: The chunk size of embeddings. If None, will use the chunk size + specified by the class. + + Returns: + List of embeddings, one for each text. + """ + # NOTE: to keep things simple, we assume the list may contain texts longer + # than the maximum context and use length-safe embedding function. + return await self._aget_len_safe_embeddings(texts, engine=self.deployment) + + def embed_query(self, text: str) -> List[float]: + """Call out to OpenAI's embedding endpoint for embedding query text. + + Args: + text: The text to embed. + + Returns: + Embedding for the text. + """ + return self.embed_documents([text])[0] + + async def aembed_query(self, text: str) -> List[float]: + """Call out to OpenAI's embedding endpoint async for embedding query text. + + Args: + text: The text to embed. + + Returns: + Embedding for the text. 
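+
+        Example (sketch; assumes ``OPENAI_API_KEY`` is set and the ``openai``
+        package is installed):
+
+        .. code-block:: python
+
+            embeddings = OpenAIEmbeddings()
+            vector = await embeddings.aembed_query("hello world")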
+ """ + embeddings = await self.aembed_documents([text]) + return embeddings[0] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/__init__.py new file mode 100644 index 000000000000..a72d8893762e --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/__init__.py @@ -0,0 +1,35 @@ +# This class has been copied from 'langchain/langchain/schema.py +# Last Sync: 2023-09-05 +# Tag: v0.0.220 +from abc import ABC, abstractmethod +from typing import ( + List, +) + +from azure.ai.generative.index._langchain.vendor.schema.document import Document + + +class BaseRetriever(ABC): + """Base interface for retrievers.""" + + @abstractmethod + def get_relevant_documents(self, query: str) -> List[Document]: + """Get documents relevant for a query. + + Args: + query: string to find relevant documents for + + Returns: + List of relevant documents + """ + + @abstractmethod + async def aget_relevant_documents(self, query: str) -> List[Document]: + """Get documents relevant for a query. + + Args: + query: string to find relevant documents for + + Returns: + List of relevant documents + """ diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/document.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/document.py new file mode 100644 index 000000000000..ae3b99d56c4c --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/schema/document.py @@ -0,0 +1,83 @@ +# This file has been slightly modified to not rely on Pydantic for the Document class. +# Last Sync: 2023-08-24 +# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 +from __future__ import annotations + +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from typing import Any, Sequence + + +@dataclass +class Document: + """Class for storing a piece of text and associated metadata.""" + + page_content: str + """String text.""" + metadata: dict = field(default_factory=dict) + """Arbitrary metadata about the page content (e.g., source, relationships to other + documents, etc.). + """ + + +class BaseDocumentTransformer(ABC): + """Abstract base class for document transformation systems. + + A document transformation system takes a sequence of Documents and returns a + sequence of transformed Documents. + + Example: + .. code-block:: python + + class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel): + embeddings: Embeddings + similarity_fn: Callable = cosine_similarity + similarity_threshold: float = 0.95 + + class Config: + arbitrary_types_allowed = True + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + stateful_documents = get_stateful_documents(documents) + embedded_documents = _get_embeddings_from_stateful_docs( + self.embeddings, stateful_documents + ) + included_idxs = _filter_similar_embeddings( + embedded_documents, self.similarity_fn, self.similarity_threshold + ) + return [stateful_documents[i] for i in sorted(included_idxs)] + + async def atransform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + raise NotImplementedError + + """ # noqa: E501 + + @abstractmethod + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + """Transform a list of documents. 
+
+        Args:
+            documents: A sequence of Documents to be transformed.
+
+        Returns:
+            A list of transformed Documents.
+        """
+
+    @abstractmethod
+    async def atransform_documents(
+        self, documents: Sequence[Document], **kwargs: Any
+    ) -> Sequence[Document]:
+        """Asynchronously transform a list of documents.
+
+        Args:
+            documents: A sequence of Documents to be transformed.
+
+        Returns:
+            A list of transformed Documents.
+        """
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/text_splitter.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/text_splitter.py
new file mode 100644
index 000000000000..c9fb26ead05e
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/text_splitter.py
@@ -0,0 +1,1051 @@
+# Not all of this file has been vendored, just the parts we use.
+# Last Sync: 2023-08-24
+# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10
+"""**Text Splitters** are classes for splitting text.
+
+
+**Class hierarchy:**
+
+.. code-block::
+
+    BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
+                                                 RecursiveCharacterTextSplitter -->  <name>TextSplitter
+
+Note: **MarkdownHeaderTextSplitter** does not derive from TextSplitter.
+
+
+**Main helpers:**
+
+.. code-block::
+
+    Document, Tokenizer, Language, LineType, HeaderType
+
+"""
+from __future__ import annotations
+
+import copy
+import logging
+import re
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from enum import Enum
+from typing import (
+    AbstractSet,
+    Any,
+    Callable,
+    Collection,
+    Dict,
+    Iterable,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Tuple,
+    Type,
+    TypedDict,
+    TypeVar,
+    Union,
+    cast,
+)
+
+from azure.ai.generative.index._langchain.vendor.schema.document import Document
+from azure.ai.generative.index._langchain.vendor.schema.document import BaseDocumentTransformer
+
+logger = logging.getLogger(__name__)
+
+TS = TypeVar("TS", bound="TextSplitter")
+
+
+def _split_text_with_regex(
+    text: str, separator: str, keep_separator: bool
+) -> List[str]:
+    # Now that we have the separator, split the text
+    if separator:
+        if keep_separator:
+            # The parentheses in the pattern keep the delimiters in the result.
+            _splits = re.split(f"({separator})", text)
+            splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
+            if len(_splits) % 2 == 0:
+                splits += _splits[-1:]
+            splits = [_splits[0]] + splits
+        else:
+            splits = re.split(separator, text)
+    else:
+        splits = list(text)
+    return [s for s in splits if s != ""]
+
+
+class TextSplitter(BaseDocumentTransformer, ABC):
+    """Interface for splitting text into chunks."""
+
+    def __init__(
+        self,
+        chunk_size: int = 4000,
+        chunk_overlap: int = 200,
+        length_function: Callable[[str], int] = len,
+        keep_separator: bool = False,
+        add_start_index: bool = False,
+    ) -> None:
+        """Create a new TextSplitter.
+
+        Args:
+            chunk_size: Maximum size of chunks to return
+            chunk_overlap: Overlap in characters between chunks
+            length_function: Function that measures the length of given chunks
+            keep_separator: Whether to keep the separator in the chunks
+            add_start_index: If `True`, includes chunk's start index in metadata
+        """
+        if chunk_overlap > chunk_size:
+            raise ValueError(
+                f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
+                f"({chunk_size}), should be smaller."
+ ) + self._chunk_size = chunk_size + self._chunk_overlap = chunk_overlap + self._length_function = length_function + self._keep_separator = keep_separator + self._add_start_index = add_start_index + + @abstractmethod + def split_text(self, text: str) -> List[str]: + """Split text into multiple components.""" + + def create_documents( + self, texts: List[str], metadatas: Optional[List[dict]] = None + ) -> List[Document]: + """Create documents from a list of texts.""" + _metadatas = metadatas or [{}] * len(texts) + documents = [] + for i, text in enumerate(texts): + index = -1 + for chunk in self.split_text(text): + metadata = copy.deepcopy(_metadatas[i]) + if self._add_start_index: + index = text.find(chunk, index + 1) + metadata["start_index"] = index + new_doc = Document(page_content=chunk, metadata=metadata) + documents.append(new_doc) + return documents + + def split_documents(self, documents: Iterable[Document]) -> List[Document]: + """Split documents.""" + texts, metadatas = [], [] + for doc in documents: + texts.append(doc.page_content) + metadatas.append(doc.metadata) + return self.create_documents(texts, metadatas=metadatas) + + def _join_docs(self, docs: List[str], separator: str) -> Optional[str]: + text = separator.join(docs) + text = text.strip() + if text == "": + return None + else: + return text + + def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]: + # We now want to combine these smaller pieces into medium size + # chunks to send to the LLM. + separator_len = self._length_function(separator) + + docs = [] + current_doc: List[str] = [] + total = 0 + for d in splits: + _len = self._length_function(d) + if ( + total + _len + (separator_len if len(current_doc) > 0 else 0) + > self._chunk_size + ): + if total > self._chunk_size: + logger.warning( + f"Created a chunk of size {total}, " + f"which is longer than the specified {self._chunk_size}" + ) + if len(current_doc) > 0: + doc = self._join_docs(current_doc, separator) + if doc is not None: + docs.append(doc) + # Keep on popping if: + # - we have a larger chunk than in the chunk overlap + # - or if we still have any chunks and the length is long + while total > self._chunk_overlap or ( + total + _len + (separator_len if len(current_doc) > 0 else 0) + > self._chunk_size + and total > 0 + ): + total -= self._length_function(current_doc[0]) + ( + separator_len if len(current_doc) > 1 else 0 + ) + current_doc = current_doc[1:] + current_doc.append(d) + total += _len + (separator_len if len(current_doc) > 1 else 0) + doc = self._join_docs(current_doc, separator) + if doc is not None: + docs.append(doc) + return docs + + @classmethod + def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter: + """Text splitter that uses HuggingFace tokenizer to count length.""" + try: + from transformers import PreTrainedTokenizerBase + + if not isinstance(tokenizer, PreTrainedTokenizerBase): + raise ValueError( + "Tokenizer received was not an instance of PreTrainedTokenizerBase" + ) + + def _huggingface_tokenizer_length(text: str) -> int: + return len(tokenizer.encode(text)) + + except ImportError: + raise ValueError( + "Could not import transformers python package. " + "Please install it with `pip install transformers`." 
+ ) + return cls(length_function=_huggingface_tokenizer_length, **kwargs) + + @classmethod + def from_tiktoken_encoder( + cls: Type[TS], + encoding_name: str = "gpt2", + model_name: Optional[str] = None, + allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), + disallowed_special: Union[Literal["all"], Collection[str]] = "all", + **kwargs: Any, + ) -> TS: + """Text splitter that uses tiktoken encoder to count length.""" + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to calculate max_tokens_for_prompt. " + "Please install it with `pip install tiktoken`." + ) + + if model_name is not None: + enc = tiktoken.encoding_for_model(model_name) + else: + enc = tiktoken.get_encoding(encoding_name) + + def _tiktoken_encoder(text: str) -> int: + return len( + enc.encode( + text, + allowed_special=allowed_special, + disallowed_special=disallowed_special, + ) + ) + + if issubclass(cls, TokenTextSplitter): + extra_kwargs = { + "encoding_name": encoding_name, + "model_name": model_name, + "allowed_special": allowed_special, + "disallowed_special": disallowed_special, + } + kwargs = {**kwargs, **extra_kwargs} + + return cls(length_function=_tiktoken_encoder, **kwargs) + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + """Transform sequence of documents by splitting them.""" + return self.split_documents(list(documents)) + + async def atransform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + """Asynchronously transform a sequence of documents by splitting them.""" + raise NotImplementedError + + +class CharacterTextSplitter(TextSplitter): + """Splitting text that looks at characters.""" + + def __init__( + self, separator: str = "\n\n", is_separator_regex: bool = False, **kwargs: Any + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs) + self._separator = separator + self._is_separator_regex = is_separator_regex + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively split the large input into a bunch of smaller ones. + separator = ( + self._separator if self._is_separator_regex else re.escape(self._separator) + ) + splits = _split_text_with_regex(text, separator, self._keep_separator) + _separator = "" if self._keep_separator else self._separator + return self._merge_splits(splits, _separator) + + +class LineType(TypedDict): + """Line type as typed dict.""" + + metadata: Dict[str, str] + content: str + + +class HeaderType(TypedDict): + """Header type as typed dict.""" + + level: int + name: str + data: str + + +class MarkdownHeaderTextSplitter: + """Splitting markdown files based on specified headers.""" + + def __init__( + self, headers_to_split_on: List[Tuple[str, str]], return_each_line: bool = False + ): + """Create a new MarkdownHeaderTextSplitter. 
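+
+        For example, ``headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]``
+        tracks the top two heading levels and records each heading's text in the
+        metadata of the chunks beneath it.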
+ + Args: + headers_to_split_on: Headers we want to track + return_each_line: Return each line w/ associated headers + """ + # Output line-by-line or aggregated into chunks w/ common headers + self.return_each_line = return_each_line + # Given the headers we want to split on, + # (e.g., "#, ##, etc") order by length + self.headers_to_split_on = sorted( + headers_to_split_on, key=lambda split: len(split[0]), reverse=True + ) + + def aggregate_lines_to_chunks(self, lines: List[LineType]) -> List[Document]: + """Combine lines with common metadata into chunks + Args: + lines: Line of text / associated header metadata + """ + aggregated_chunks: List[LineType] = [] + + for line in lines: + if ( + aggregated_chunks + and aggregated_chunks[-1]["metadata"] == line["metadata"] + ): + # If the last line in the aggregated list + # has the same metadata as the current line, + # append the current content to the last lines's content + aggregated_chunks[-1]["content"] += " \n" + line["content"] + else: + # Otherwise, append the current line to the aggregated list + aggregated_chunks.append(line) + + return [ + Document(page_content=chunk["content"], metadata=chunk["metadata"]) + for chunk in aggregated_chunks + ] + + def split_text(self, text: str) -> List[Document]: + """Split markdown file + Args: + text: Markdown file""" + + # Split the input text by newline character ("\n"). + lines = text.split("\n") + # Final output + lines_with_metadata: List[LineType] = [] + # Content and metadata of the chunk currently being processed + current_content: List[str] = [] + current_metadata: Dict[str, str] = {} + # Keep track of the nested header structure + # header_stack: List[Dict[str, Union[int, str]]] = [] + header_stack: List[HeaderType] = [] + initial_metadata: Dict[str, str] = {} + + for line in lines: + stripped_line = line.strip() + # Check each line against each of the header types (e.g., #, ##) + for sep, name in self.headers_to_split_on: + # Check if line starts with a header that we intend to split on + if stripped_line.startswith(sep) and ( + # Header with no text OR header is followed by space + # Both are valid conditions that sep is being used a header + len(stripped_line) == len(sep) + or stripped_line[len(sep)] == " " + ): + # Ensure we are tracking the header as metadata + if name is not None: + # Get the current header level + current_header_level = sep.count("#") + + # Pop out headers of lower or same level from the stack + while ( + header_stack + and header_stack[-1]["level"] >= current_header_level + ): + # We have encountered a new header + # at the same or higher level + popped_header = header_stack.pop() + # Clear the metadata for the + # popped header in initial_metadata + if popped_header["name"] in initial_metadata: + initial_metadata.pop(popped_header["name"]) + + # Push the current header to the stack + header: HeaderType = { + "level": current_header_level, + "name": name, + "data": stripped_line[len(sep) :].strip(), + } + header_stack.append(header) + # Update initial_metadata with the current header + initial_metadata[name] = header["data"] + + # Add the previous line to the lines_with_metadata + # only if current_content is not empty + if current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + "metadata": current_metadata.copy(), + } + ) + current_content.clear() + + break + else: + if stripped_line: + current_content.append(stripped_line) + elif current_content: + lines_with_metadata.append( + { + "content": "\n".join(current_content), + 
"metadata": current_metadata.copy(), + } + ) + current_content.clear() + + current_metadata = initial_metadata.copy() + + if current_content: + lines_with_metadata.append( + {"content": "\n".join(current_content), "metadata": current_metadata} + ) + + # lines_with_metadata has each line with associated header metadata + # aggregate these into chunks based on common metadata + if not self.return_each_line: + return self.aggregate_lines_to_chunks(lines_with_metadata) + else: + return [ + Document(page_content=chunk["content"], metadata=chunk["metadata"]) + for chunk in lines_with_metadata + ] + + +# should be in newer Python versions (3.10+) +# @dataclass(frozen=True, kw_only=True, slots=True) +@dataclass(frozen=True) +class Tokenizer: + chunk_overlap: int + tokens_per_chunk: int + decode: Callable[[list[int]], str] + encode: Callable[[str], List[int]] + + +def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]: + """Split incoming text and return chunks using tokenizer.""" + splits: List[str] = [] + input_ids = tokenizer.encode(text) + start_idx = 0 + cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[start_idx:cur_idx] + while start_idx < len(input_ids): + splits.append(tokenizer.decode(chunk_ids)) + start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap + cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids)) + chunk_ids = input_ids[start_idx:cur_idx] + return splits + + +class TokenTextSplitter(TextSplitter): + """Splitting text to tokens using model tokenizer.""" + + def __init__( + self, + encoding_name: str = "gpt2", + model_name: Optional[str] = None, + allowed_special: Union[Literal["all"], AbstractSet[str]] = set(), + disallowed_special: Union[Literal["all"], Collection[str]] = "all", + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs) + try: + import tiktoken + except ImportError: + raise ImportError( + "Could not import tiktoken python package. " + "This is needed in order to for TokenTextSplitter. " + "Please install it with `pip install tiktoken`." + ) + + if model_name is not None: + enc = tiktoken.encoding_for_model(model_name) + else: + enc = tiktoken.get_encoding(encoding_name) + self._tokenizer = enc + self._allowed_special = allowed_special + self._disallowed_special = disallowed_special + + def split_text(self, text: str) -> List[str]: + def _encode(_text: str) -> List[int]: + return self._tokenizer.encode( + _text, + allowed_special=self._allowed_special, + disallowed_special=self._disallowed_special, + ) + + tokenizer = Tokenizer( + chunk_overlap=self._chunk_overlap, + tokens_per_chunk=self._chunk_size, + decode=self._tokenizer.decode, + encode=_encode, + ) + + return split_text_on_tokens(text=text, tokenizer=tokenizer) + + +class SentenceTransformersTokenTextSplitter(TextSplitter): + """Splitting text to tokens using sentence model tokenizer.""" + + def __init__( + self, + chunk_overlap: int = 50, + model_name: str = "sentence-transformers/all-mpnet-base-v2", + tokens_per_chunk: Optional[int] = None, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(**kwargs, chunk_overlap=chunk_overlap) + + try: + from sentence_transformers import SentenceTransformer + except ImportError: + raise ImportError( + "Could not import sentence_transformer python package. " + "This is needed in order to for SentenceTransformersTokenTextSplitter. " + "Please install it with `pip install sentence-transformers`." 
+ ) + + self.model_name = model_name + self._model = SentenceTransformer(self.model_name) + self.tokenizer = self._model.tokenizer + self._initialize_chunk_configuration(tokens_per_chunk=tokens_per_chunk) + + def _initialize_chunk_configuration( + self, *, tokens_per_chunk: Optional[int] + ) -> None: + self.maximum_tokens_per_chunk = cast(int, self._model.max_seq_length) + + if tokens_per_chunk is None: + self.tokens_per_chunk = self.maximum_tokens_per_chunk + else: + self.tokens_per_chunk = tokens_per_chunk + + if self.tokens_per_chunk > self.maximum_tokens_per_chunk: + raise ValueError( + f"The token limit of the models '{self.model_name}'" + f" is: {self.maximum_tokens_per_chunk}." + f" Argument tokens_per_chunk={self.tokens_per_chunk}" + f" > maximum token limit." + ) + + def split_text(self, text: str) -> List[str]: + def encode_strip_start_and_stop_token_ids(text: str) -> List[int]: + return self._encode(text)[1:-1] + + tokenizer = Tokenizer( + chunk_overlap=self._chunk_overlap, + tokens_per_chunk=self.tokens_per_chunk, + decode=self.tokenizer.decode, + encode=encode_strip_start_and_stop_token_ids, + ) + + return split_text_on_tokens(text=text, tokenizer=tokenizer) + + def count_tokens(self, *, text: str) -> int: + return len(self._encode(text)) + + _max_length_equal_32_bit_integer: int = 2**32 + + def _encode(self, text: str) -> List[int]: + token_ids_with_start_and_end_token_ids = self.tokenizer.encode( + text, + max_length=self._max_length_equal_32_bit_integer, + truncation="do_not_truncate", + ) + return token_ids_with_start_and_end_token_ids + + +class Language(str, Enum): + """Enum of the programming languages.""" + + CPP = "cpp" + GO = "go" + JAVA = "java" + JS = "js" + PHP = "php" + PROTO = "proto" + PYTHON = "python" + RST = "rst" + RUBY = "ruby" + RUST = "rust" + SCALA = "scala" + SWIFT = "swift" + MARKDOWN = "markdown" + LATEX = "latex" + HTML = "html" + SOL = "sol" + + +class RecursiveCharacterTextSplitter(TextSplitter): + """Splitting text by recursively look at characters. + + Recursively tries to split by different characters to find one + that works. + """ + + def __init__( + self, + separators: Optional[List[str]] = None, + keep_separator: bool = True, + is_separator_regex: bool = False, + **kwargs: Any, + ) -> None: + """Create a new TextSplitter.""" + super().__init__(keep_separator=keep_separator, **kwargs) + self._separators = separators or ["\n\n", "\n", " ", ""] + self._is_separator_regex = is_separator_regex + + def _split_text(self, text: str, separators: List[str]) -> List[str]: + """Split incoming text and return chunks.""" + final_chunks = [] + # Get appropriate separator to use + separator = separators[-1] + new_separators = [] + for i, _s in enumerate(separators): + _separator = _s if self._is_separator_regex else re.escape(_s) + if _s == "": + separator = _s + break + if re.search(_separator, text): + separator = _s + new_separators = separators[i + 1 :] + break + + _separator = separator if self._is_separator_regex else re.escape(separator) + splits = _split_text_with_regex(text, _separator, self._keep_separator) + + # Now go merging things, recursively splitting longer texts. 
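+        # Short pieces are buffered in _good_splits and packed together by
+        # _merge_splits; anything still longer than chunk_size is re-split with
+        # the remaining (finer) separators, or emitted as-is when none are left.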
+ _good_splits = [] + _separator = "" if self._keep_separator else separator + for s in splits: + if self._length_function(s) < self._chunk_size: + _good_splits.append(s) + else: + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + _good_splits = [] + if not new_separators: + final_chunks.append(s) + else: + other_info = self._split_text(s, new_separators) + final_chunks.extend(other_info) + if _good_splits: + merged_text = self._merge_splits(_good_splits, _separator) + final_chunks.extend(merged_text) + return final_chunks + + def split_text(self, text: str) -> List[str]: + return self._split_text(text, self._separators) + + @classmethod + def from_language( + cls, language: Language, **kwargs: Any + ) -> RecursiveCharacterTextSplitter: + separators = cls.get_separators_for_language(language) + return cls(separators=separators, is_separator_regex=True, **kwargs) + + @staticmethod + def get_separators_for_language(language: Language) -> List[str]: + if language == Language.CPP: + return [ + # Split along class definitions + "\nclass ", + # Split along function definitions + "\nvoid ", + "\nint ", + "\nfloat ", + "\ndouble ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.GO: + return [ + # Split along function definitions + "\nfunc ", + "\nvar ", + "\nconst ", + "\ntype ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.JAVA: + return [ + # Split along class definitions + "\nclass ", + # Split along method definitions + "\npublic ", + "\nprotected ", + "\nprivate ", + "\nstatic ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.JS: + return [ + # Split along function definitions + "\nfunction ", + "\nconst ", + "\nlet ", + "\nvar ", + "\nclass ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nswitch ", + "\ncase ", + "\ndefault ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.PHP: + return [ + # Split along function definitions + "\nfunction ", + # Split along class definitions + "\nclass ", + # Split along control flow statements + "\nif ", + "\nforeach ", + "\nwhile ", + "\ndo ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.PROTO: + return [ + # Split along message definitions + "\nmessage ", + # Split along service definitions + "\nservice ", + # Split along enum definitions + "\nenum ", + # Split along option definitions + "\noption ", + # Split along import statements + "\nimport ", + # Split along syntax declarations + "\nsyntax ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.PYTHON: + return [ + # First, try to split along class definitions + "\nclass ", + "\ndef ", + "\n\tdef ", + # Now split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.RST: + return [ + # Split along section titles + "\n=+\n", + "\n-+\n", + "\n\\*+\n", + # Split along directive markers + "\n\n.. 
*\n\n", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.RUBY: + return [ + # Split along method definitions + "\ndef ", + "\nclass ", + # Split along control flow statements + "\nif ", + "\nunless ", + "\nwhile ", + "\nfor ", + "\ndo ", + "\nbegin ", + "\nrescue ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.RUST: + return [ + # Split along function definitions + "\nfn ", + "\nconst ", + "\nlet ", + # Split along control flow statements + "\nif ", + "\nwhile ", + "\nfor ", + "\nloop ", + "\nmatch ", + "\nconst ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.SCALA: + return [ + # Split along class definitions + "\nclass ", + "\nobject ", + # Split along method definitions + "\ndef ", + "\nval ", + "\nvar ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\nmatch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.SWIFT: + return [ + # Split along function definitions + "\nfunc ", + # Split along class definitions + "\nclass ", + "\nstruct ", + "\nenum ", + # Split along control flow statements + "\nif ", + "\nfor ", + "\nwhile ", + "\ndo ", + "\nswitch ", + "\ncase ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.MARKDOWN: + return [ + # First, try to split along Markdown headings (starting with level 2) + "\n#{1,6} ", + # Note the alternative syntax for headings (below) is not handled here + # Heading level 2 + # --------------- + # End of code block + "```\n", + # Horizontal lines + "\n\\*\\*\\*+\n", + "\n---+\n", + "\n___+\n", + # Note that this splitter doesn't handle horizontal lines defined + # by *three or more* of ***, ---, or ___, but this is not handled + "\n\n", + "\n", + " ", + "", + ] + elif language == Language.LATEX: + return [ + # First, try to split along Latex sections + "\n\\\\chapter{", + "\n\\\\section{", + "\n\\\\subsection{", + "\n\\\\subsubsection{", + # Now split by environments + "\n\\\\begin{enumerate}", + "\n\\\\begin{itemize}", + "\n\\\\begin{description}", + "\n\\\\begin{list}", + "\n\\\\begin{quote}", + "\n\\\\begin{quotation}", + "\n\\\\begin{verse}", + "\n\\\\begin{verbatim}", + # Now split by math environments + "\n\\\begin{align}", + "$$", + "$", + # Now split by the normal type of lines + " ", + "", + ] + elif language == Language.HTML: + return [ + # First, try to split along HTML tags + " None: + """Initialize the NLTK splitter.""" + super().__init__(**kwargs) + try: + from nltk.tokenize import sent_tokenize + + self._tokenizer = sent_tokenize + except ImportError: + raise ImportError( + "NLTK is not installed, please install it with `pip install nltk`." + ) + self._separator = separator + + def split_text(self, text: str) -> List[str]: + """Split incoming text and return chunks.""" + # First we naively split the large input into a bunch of smaller ones. 
+ splits = self._tokenizer(text) + return self._merge_splits(splits, self._separator) + + +class MarkdownTextSplitter(RecursiveCharacterTextSplitter): + """Attempts to split the text along Markdown-formatted headings.""" + + def __init__(self, **kwargs: Any) -> None: + """Initialize a MarkdownTextSplitter.""" + separators = self.get_separators_for_language(Language.MARKDOWN) + super().__init__(separators=separators, **kwargs) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/math.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/math.py new file mode 100644 index 000000000000..41e1b6a0bd00 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/utils/math.py @@ -0,0 +1,61 @@ +# This file has been copied as is. +# Last Sync: 2023-08-24 +# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 +"""Math utils.""" +from typing import List, Optional, Tuple, Union + +import numpy as np + +Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] + + +def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: + """Row-wise cosine similarity between two equal-width matrices.""" + if len(X) == 0 or len(Y) == 0: + return np.array([]) + X = np.array(X) + Y = np.array(Y) + if X.shape[1] != Y.shape[1]: + raise ValueError( + f"Number of columns in X and Y must be the same. X has shape {X.shape} " + f"and Y has shape {Y.shape}." + ) + + X_norm = np.linalg.norm(X, axis=1) + Y_norm = np.linalg.norm(Y, axis=1) + # Ignore divide by zero errors run time warnings as those are handled below. + with np.errstate(divide="ignore", invalid="ignore"): + similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm) + similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0 + return similarity + + +def cosine_similarity_top_k( + X: Matrix, + Y: Matrix, + top_k: Optional[int] = 5, + score_threshold: Optional[float] = None, +) -> Tuple[List[Tuple[int, int]], List[float]]: + """Row-wise cosine similarity with optional top-k and score threshold filtering. + + Args: + X: Matrix. + Y: Matrix, same width as X. + top_k: Max number of results to return. + score_threshold: Minimum cosine similarity of results. + + Returns: + Tuple of two lists. First contains two-tuples of indices (X_idx, Y_idx), + second contains corresponding cosine similarities. 
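+
+    Example:
+        A minimal illustrative sketch (the vectors below are made-up values,
+        not library data):
+
+        .. code-block:: python
+
+            X = [[1.0, 0.0], [0.0, 1.0]]  # two query embeddings
+            Y = [[1.0, 0.0], [0.7, 0.7]]  # two candidate embeddings
+            idxs, scores = cosine_similarity_top_k(X, Y, top_k=2, score_threshold=0.9)
+            # Only the exact match clears the 0.9 threshold:
+            # idxs == [(0, 0)], scores == [1.0]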
+ """ + if len(X) == 0 or len(Y) == 0: + return [], [] + score_array = cosine_similarity(X, Y) + score_threshold = score_threshold or -1.0 + score_array[score_array < score_threshold] = 0 + top_k = min(top_k or len(score_array), np.count_nonzero(score_array)) + top_k_idxs = np.argpartition(score_array, -top_k, axis=None)[-top_k:] + top_k_idxs = top_k_idxs[np.argsort(score_array.ravel()[top_k_idxs])][::-1] + ret_idxs = np.unravel_index(top_k_idxs, score_array.shape) + scores = score_array.ravel()[top_k_idxs].tolist() + return list(zip(*ret_idxs)), scores # type: ignore diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/__init__.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/base.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/base.py new file mode 100644 index 000000000000..32ac4a7f6b8c --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/base.py @@ -0,0 +1,431 @@ +# This file has been slightly modified to not rely on Pydantic. +# Last Sync: 2023-09-05 +# Tag: v0.0.220 +"""Interface for vector stores.""" +from __future__ import annotations + +import asyncio +import warnings +from abc import ABC, abstractmethod +from dataclasses import dataclass, field +from functools import partial +from typing import ( + Any, + ClassVar, + Collection, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, + TypeVar, +) + +from azure.ai.generative.index._langchain.vendor.schema.document import Document +from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings +from azure.ai.generative.index._langchain.vendor.schema import BaseRetriever + +VST = TypeVar("VST", bound="VectorStore") + + +class VectorStore(ABC): + """Interface for vector stores.""" + + @abstractmethod + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + kwargs: vectorstore specific parameters + + Returns: + List of ids from adding the texts into the vectorstore. + """ + + def delete(self, ids: List[str]) -> Optional[bool]: + """Delete by vector ID. + + Args: + ids: List of ids to delete. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. + """ + + raise NotImplementedError( + "delete_by_id method must be implemented by subclass." 
+        )
+
+    async def aadd_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore."""
+        raise NotImplementedError
+
+    def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]:
+        """Run more documents through the embeddings and add to the vectorstore.
+
+        Args:
+            documents (List[Document]): Documents to add to the vectorstore.
+
+        Returns:
+            List[str]: List of IDs of the added texts.
+        """
+        # TODO: Handle the case where the user doesn't provide ids on the Collection
+        texts = [doc.page_content for doc in documents]
+        metadatas = [doc.metadata for doc in documents]
+        return self.add_texts(texts, metadatas, **kwargs)
+
+    async def aadd_documents(
+        self, documents: List[Document], **kwargs: Any
+    ) -> List[str]:
+        """Run more documents through the embeddings and add to the vectorstore.
+
+        Args:
+            documents (List[Document]): Documents to add to the vectorstore.
+
+        Returns:
+            List[str]: List of IDs of the added texts.
+        """
+        texts = [doc.page_content for doc in documents]
+        metadatas = [doc.metadata for doc in documents]
+        return await self.aadd_texts(texts, metadatas, **kwargs)
+
+    def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]:
+        """Return docs most similar to query using specified search type."""
+        if search_type == "similarity":
+            return self.similarity_search(query, **kwargs)
+        elif search_type == "mmr":
+            return self.max_marginal_relevance_search(query, **kwargs)
+        else:
+            raise ValueError(
+                f"search_type of {search_type} not allowed. Expected "
+                "search_type to be 'similarity' or 'mmr'."
+            )
+
+    async def asearch(
+        self, query: str, search_type: str, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query using specified search type."""
+        if search_type == "similarity":
+            return await self.asimilarity_search(query, **kwargs)
+        elif search_type == "mmr":
+            return await self.amax_marginal_relevance_search(query, **kwargs)
+        else:
+            raise ValueError(
+                f"search_type of {search_type} not allowed. Expected "
+                "search_type to be 'similarity' or 'mmr'."
+            )
+
+    @abstractmethod
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query."""
+
+    def similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores in the range [0, 1].
+
+        0 is dissimilar, 1 is most similar.
+
+        Args:
+            query: input text
+            k: Number of Documents to return. Defaults to 4.
+            **kwargs: kwargs to be passed to similarity search.
Should include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + docs_and_similarities = self._similarity_search_with_relevance_scores( + query, k=k, **kwargs + ) + if any( + similarity < 0.0 or similarity > 1.0 + for _, similarity in docs_and_similarities + ): + warnings.warn( + "Relevance scores must be between" + f" 0 and 1, got {docs_and_similarities}" + ) + + score_threshold = kwargs.get("score_threshold") + if score_threshold is not None: + docs_and_similarities = [ + (doc, similarity) + for doc, similarity in docs_and_similarities + if similarity >= score_threshold + ] + if len(docs_and_similarities) == 0: + warnings.warn( + "No relevant docs were retrieved using the relevance score" + f" threshold {score_threshold}" + ) + return docs_and_similarities + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores, normalized on a scale from 0 to 1. + + 0 is dissimilar, 1 is most similar. + """ + raise NotImplementedError + + async def asimilarity_search_with_relevance_scores( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial(self.similarity_search_with_relevance_scores, query, k, **kwargs) + return await asyncio.get_event_loop().run_in_executor(None, func) + + async def asimilarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial(self.similarity_search, query, k, **kwargs) + return await asyncio.get_event_loop().run_in_executor(None, func) + + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query vector. + """ + raise NotImplementedError + + async def asimilarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to embedding vector.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial(self.similarity_search_by_vector, embedding, k, **kwargs) + return await asyncio.get_event_loop().run_in_executor(None, func) + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. 
+ fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + raise NotImplementedError + + async def amax_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial( + self.max_marginal_relevance_search, query, k, fetch_k, lambda_mult, **kwargs + ) + return await asyncio.get_event_loop().run_in_executor(None, func) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + raise NotImplementedError + + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + raise NotImplementedError + + @classmethod + def from_documents( + cls: Type[VST], + documents: List[Document], + embedding: Embeddings, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from documents and embeddings.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs) + + @classmethod + async def afrom_documents( + cls: Type[VST], + documents: List[Document], + embedding: Embeddings, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from documents and embeddings.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return await cls.afrom_texts(texts, embedding, metadatas=metadatas, **kwargs) + + @classmethod + @abstractmethod + def from_texts( + cls: Type[VST], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from texts and embeddings.""" + + @classmethod + async def afrom_texts( + cls: Type[VST], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from texts and embeddings.""" + raise NotImplementedError + + def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever: + return VectorStoreRetriever(vectorstore=self, **kwargs) + + +@dataclass +class VectorStoreRetriever(BaseRetriever): + 
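+    """Retriever class for VectorStore."""
+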
vectorstore: VectorStore + search_type: str = "similarity" + search_kwargs: dict = field(default_factory=dict) + allowed_search_types: ClassVar[Collection[str]] = ( + "similarity", + "similarity_score_threshold", + "mmr", + ) + + def get_relevant_documents(self, query: str) -> List[Document]: + if self.search_type == "similarity": + docs = self.vectorstore.similarity_search(query, **self.search_kwargs) + elif self.search_type == "similarity_score_threshold": + docs_and_similarities = ( + self.vectorstore.similarity_search_with_relevance_scores( + query, **self.search_kwargs + ) + ) + docs = [doc for doc, _ in docs_and_similarities] + elif self.search_type == "mmr": + docs = self.vectorstore.max_marginal_relevance_search( + query, **self.search_kwargs + ) + else: + raise ValueError(f"search_type of {self.search_type} not allowed.") + return docs + + async def aget_relevant_documents(self, query: str) -> List[Document]: + if self.search_type == "similarity": + docs = await self.vectorstore.asimilarity_search( + query, **self.search_kwargs + ) + elif self.search_type == "similarity_score_threshold": + docs_and_similarities = ( + await self.vectorstore.asimilarity_search_with_relevance_scores( + query, **self.search_kwargs + ) + ) + docs = [doc for doc, _ in docs_and_similarities] + elif self.search_type == "mmr": + docs = await self.vectorstore.amax_marginal_relevance_search( + query, **self.search_kwargs + ) + else: + raise ValueError(f"search_type of {self.search_type} not allowed.") + return docs + + def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: + """Add documents to vectorstore.""" + return self.vectorstore.add_documents(documents, **kwargs) + + async def aadd_documents( + self, documents: List[Document], **kwargs: Any + ) -> List[str]: + """Add documents to vectorstore.""" + return await self.vectorstore.aadd_documents(documents, **kwargs) diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/faiss.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/faiss.py new file mode 100644 index 000000000000..7d811ab56f2f --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/faiss.py @@ -0,0 +1,631 @@ +"""Wrapper around FAISS vector database.""" +from __future__ import annotations + +import math +import os +import pickle +import uuid +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple + +import numpy as np + +from azure.ai.generative.index._langchain.vendor.docstore.base import AddableMixin, Docstore +from azure.ai.generative.index._langchain.vendor.schema.document import Document +from azure.ai.generative.index._langchain.vendor.embeddings.base import Embeddings +from azure.ai.generative.index._langchain.vendor.vectorstores.base import VectorStore +from azure.ai.generative.index._langchain.vendor.vectorstores.utils import maximal_marginal_relevance + + +def dependable_faiss_import(no_avx2: Optional[bool] = None) -> Any: + """ + Import faiss if available, otherwise raise error. + If FAISS_NO_AVX2 environment variable is set, it will be considered + to load FAISS with no AVX2 optimization. + + Args: + no_avx2: Load FAISS strictly with no AVX2 optimization + so that the vectorstore is portable and compatible with other devices. 
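+
+    Example:
+        An illustrative sketch of typical usage:
+
+        .. code-block:: python
+
+            faiss = dependable_faiss_import()
+            index = faiss.IndexFlatL2(64)  # a flat L2 index over 64-dim vectors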
+ """ + if no_avx2 is None and "FAISS_NO_AVX2" in os.environ: + no_avx2 = bool(os.getenv("FAISS_NO_AVX2")) + + try: + if no_avx2: + from faiss import swigfaiss as faiss + else: + import faiss + except ImportError: + raise ImportError( + "Could not import faiss python package. " + "Please install it with `pip install faiss` " + "or `pip install faiss-cpu` (depending on Python version)." + ) + return faiss + + +def _default_relevance_score_fn(score: float) -> float: + """Return a similarity score on a scale [0, 1].""" + # The 'correct' relevance function + # may differ depending on a few things, including: + # - the distance / similarity metric used by the VectorStore + # - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + # - embedding dimensionality + # - etc. + # This function converts the euclidean norm of normalized embeddings + # (0 is most similar, sqrt(2) most dissimilar) + # to a similarity function (0 to 1) + return 1.0 - score / math.sqrt(2) + + +class FAISS(VectorStore): + """Wrapper around FAISS vector database. + + To use, you should have the ``faiss`` python package installed. + + Example: + .. code-block:: python + + from langchain import FAISS + faiss = FAISS(embedding_function, index, docstore, index_to_docstore_id) + + """ + + def __init__( + self, + embedding_function: Callable, + index: Any, + docstore: Docstore, + index_to_docstore_id: Dict[int, str], + relevance_score_fn: Optional[ + Callable[[float], float] + ] = _default_relevance_score_fn, + normalize_L2: bool = False, + ): + """Initialize with necessary components.""" + self.embedding_function = embedding_function + self.index = index + self.docstore = docstore + self.index_to_docstore_id = index_to_docstore_id + self.relevance_score_fn = relevance_score_fn + self._normalize_L2 = normalize_L2 + + def __add( + self, + texts: Iterable[str], + embeddings: Iterable[List[float]], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + documents = [] + for i, text in enumerate(texts): + metadata = metadatas[i] if metadatas else {} + documents.append(Document(page_content=text, metadata=metadata)) + if ids is None: + ids = [str(uuid.uuid4()) for _ in texts] + # Add to the index, the index_to_id mapping, and the docstore. + starting_len = len(self.index_to_docstore_id) + faiss = dependable_faiss_import() + vector: np.ndarray = np.array(embeddings, dtype=np.float32) + if self._normalize_L2: + faiss.normalize_L2(vector) + self.index.add(vector) + # Get list of index, id, and docs. + full_info = [(starting_len + i, ids[i], doc) for i, doc in enumerate(documents)] + # Add information to docstore and index. + self.docstore.add({_id: doc for _, _id, doc in full_info}) + index_to_id = {index: _id for index, _id, _ in full_info} + self.index_to_docstore_id.update(index_to_id) + return [_id for _, _id, _ in full_info] + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of unique IDs. 
+ + Returns: + List of ids from adding the texts into the vectorstore. + """ + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + # Embed and create the documents. + embeddings = [self.embedding_function(text) for text in texts] + return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs) + + def add_embeddings( + self, + text_embeddings: Iterable[Tuple[str, List[float]]], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + text_embeddings: Iterable pairs of string and embedding to + add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + ids: Optional list of unique IDs. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if not isinstance(self.docstore, AddableMixin): + raise ValueError( + "If trying to add texts, the underlying docstore should support " + f"adding items, which {self.docstore} does not" + ) + # Embed and create the documents. + texts, embeddings = zip(*text_embeddings) + + return self.__add(texts, embeddings, metadatas=metadatas, ids=ids, **kwargs) + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. + + Args: + embedding: Embedding vector to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + **kwargs: kwargs to be passed to similarity search. Can include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of documents most similar to the query text and L2 distance + in float for each. Lower score represents more similarity. + """ + faiss = dependable_faiss_import() + vector: np.ndarray = np.array([embedding], dtype=np.float32) + if self._normalize_L2: + faiss.normalize_L2(vector) + scores, indices = self.index.search(vector, k if filter is None else fetch_k) + docs = [] + for j, i in enumerate(indices[0]): + if i == -1: + # This happens when not enough docs are returned. + continue + _id = self.index_to_docstore_id[i] + doc = self.docstore.search(_id) + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + if filter is not None: + filter = { + key: [value] if not isinstance(value, list) else value + for key, value in filter.items() + } + if all(doc.metadata.get(key) in value for key, value in filter.items()): + docs.append((doc, scores[0][j])) + else: + docs.append((doc, scores[0][j])) + + score_threshold = kwargs.get("score_threshold") + if score_threshold is not None: + docs = [ + (doc, similarity) + for doc, similarity in docs + if similarity >= score_threshold + ] + return docs[:k] + + def similarity_search_with_score( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. 
+ k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + + Returns: + List of documents most similar to the query text with + L2 distance in float. Lower score represents more similarity. + """ + embedding = self.embedding_function(query) + docs = self.similarity_search_with_score_by_vector( + embedding, + k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + return docs + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + + Returns: + List of Documents most similar to the embedding. + """ + docs_and_scores = self.similarity_search_with_score_by_vector( + embedding, + k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + return [doc for doc, _ in docs_and_scores] + + def similarity_search( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Document]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + fetch_k: (Optional[int]) Number of Documents to fetch before filtering. + Defaults to 20. + + Returns: + List of Documents most similar to the query. + """ + docs_and_scores = self.similarity_search_with_score( + query, k, filter=filter, fetch_k=fetch_k, **kwargs + ) + return [doc for doc, _ in docs_and_scores] + + def max_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + *, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, Any]] = None, + ) -> List[Tuple[Document, float]]: + """Return docs and their similarity scores selected using the maximal marginal + relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch before filtering to + pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents and similarity scores selected by maximal marginal + relevance and score for each. + """ + scores, indices = self.index.search( + np.array([embedding], dtype=np.float32), + fetch_k if filter is None else fetch_k * 2, + ) + if filter is not None: + filtered_indices = [] + for i in indices[0]: + if i == -1: + # This happens when not enough docs are returned. 
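+                    # FAISS pads the result with -1 indices when the index holds
+                    # fewer vectors than requested, so they are skipped here.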
+ continue + _id = self.index_to_docstore_id[i] + doc = self.docstore.search(_id) + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + if all(doc.metadata.get(key) == value for key, value in filter.items()): + filtered_indices.append(i) + indices = np.array([filtered_indices]) + # -1 happens when not enough docs are returned. + embeddings = [self.index.reconstruct(int(i)) for i in indices[0] if i != -1] + mmr_selected = maximal_marginal_relevance( + np.array([embedding], dtype=np.float32), + embeddings, + k=k, + lambda_mult=lambda_mult, + ) + selected_indices = [indices[0][i] for i in mmr_selected] + selected_scores = [scores[0][i] for i in mmr_selected] + docs_and_scores = [] + for i, score in zip(selected_indices, selected_scores): + if i == -1: + # This happens when not enough docs are returned. + continue + _id = self.index_to_docstore_id[i] + doc = self.docstore.search(_id) + if not isinstance(doc, Document): + raise ValueError(f"Could not find document for id {_id}, got {doc}") + docs_and_scores.append((doc, score)) + return docs_and_scores + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch before filtering to + pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + docs_and_scores = self.max_marginal_relevance_search_with_score_by_vector( + embedding, k=k, fetch_k=fetch_k, lambda_mult=lambda_mult, filter=filter + ) + return [doc for doc, _ in docs_and_scores] + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, Any]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch before filtering (if needed) to + pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + embedding = self.embedding_function(query) + docs = self.max_marginal_relevance_search_by_vector( + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + return docs + + def merge_from(self, target: FAISS) -> None: + """Merge another FAISS object with the current one. + + Add the target FAISS to the current one. + + Args: + target: FAISS object you wish to merge into the current one + + Returns: + None. 
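+
+        Example:
+            An illustrative sketch (``embeddings`` stands in for any Embeddings
+            implementation; both stores must share the same embedding space):
+
+            .. code-block:: python
+
+                vs1 = FAISS.from_texts(["hello"], embeddings)
+                vs2 = FAISS.from_texts(["world"], embeddings)
+                vs1.merge_from(vs2)
+                assert len(vs1.index_to_docstore_id) == 2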
+ """ + if not isinstance(self.docstore, AddableMixin): + raise ValueError("Cannot merge with this type of docstore") + # Numerical index for target docs are incremental on existing ones + starting_len = len(self.index_to_docstore_id) + + # Merge two IndexFlatL2 + self.index.merge_from(target.index) + + # Get id and docs from target FAISS object + full_info = [] + for i, target_id in target.index_to_docstore_id.items(): + doc = target.docstore.search(target_id) + if not isinstance(doc, Document): + raise ValueError("Document should be returned") + full_info.append((starting_len + i, target_id, doc)) + + # Add information to docstore and index_to_docstore_id. + self.docstore.add({_id: doc for _, _id, doc in full_info}) + index_to_id = {index: _id for index, _id, _ in full_info} + self.index_to_docstore_id.update(index_to_id) + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> FAISS: + """Construct FAISS wrapper from raw documents. + + This is a user friendly interface that: + 1. Embeds documents. + 2. Creates an in memory docstore + 3. Initializes the FAISS database + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain import FAISS + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + faiss = FAISS.from_texts(texts, embeddings) + """ + embeddings = embedding.embed_documents(texts) + return cls.__from( + texts, + embeddings, + embedding, + metadatas=metadatas, + ids=ids, + **kwargs, + ) + + @classmethod + def from_embeddings( + cls, + text_embeddings: List[Tuple[str, List[float]]], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> FAISS: + """Construct FAISS wrapper from raw documents. + + This is a user friendly interface that: + 1. Embeds documents. + 2. Creates an in memory docstore + 3. Initializes the FAISS database + + This is intended to be a quick way to get started. + + Example: + .. code-block:: python + + from langchain import FAISS + from langchain.embeddings import OpenAIEmbeddings + embeddings = OpenAIEmbeddings() + text_embeddings = embeddings.embed_documents(texts) + text_embedding_pairs = list(zip(texts, text_embeddings)) + faiss = FAISS.from_embeddings(text_embedding_pairs, embeddings) + """ + texts = [t[0] for t in text_embeddings] + embeddings = [t[1] for t in text_embeddings] + return cls.__from( + texts, + embeddings, + embedding, + metadatas=metadatas, + ids=ids, + **kwargs, + ) + + def save_local(self, folder_path: str, index_name: str = "index") -> None: + """Save FAISS index, docstore, and index_to_docstore_id to disk. + + Args: + folder_path: folder path to save index, docstore, + and index_to_docstore_id to. 
+ index_name: for saving with a specific index file name + """ + path = Path(folder_path) + path.mkdir(exist_ok=True, parents=True) + + # save index separately since it is not picklable + faiss = dependable_faiss_import() + faiss.write_index( + self.index, str(path / "{index_name}.faiss".format(index_name=index_name)) + ) + + # save docstore and index_to_docstore_id + with open(path / "{index_name}.pkl".format(index_name=index_name), "wb") as f: + pickle.dump((self.docstore, self.index_to_docstore_id), f) + + @classmethod + def load_local( + cls, folder_path: str, embeddings: Embeddings, index_name: str = "index" + ) -> FAISS: + """Load FAISS index, docstore, and index_to_docstore_id from disk. + + Args: + folder_path: folder path to load index, docstore, + and index_to_docstore_id from. + embeddings: Embeddings to use when generating queries + index_name: for saving with a specific index file name + """ + path = Path(folder_path) + # load index separately since it is not picklable + faiss = dependable_faiss_import() + index = faiss.read_index( + str(path / "{index_name}.faiss".format(index_name=index_name)) + ) + + # load docstore and index_to_docstore_id + with open(path / "{index_name}.pkl".format(index_name=index_name), "rb") as f: + docstore, index_to_docstore_id = pickle.load(f) + return cls(embeddings.embed_query, index, docstore, index_to_docstore_id) + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + filter: Optional[Dict[str, Any]] = None, + fetch_k: int = 20, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and their similarity scores on a scale from 0 to 1.""" + if self.relevance_score_fn is None: + raise ValueError( + "normalize_score_fn must be provided to" + " FAISS constructor to normalize scores" + ) + docs_and_scores = self.similarity_search_with_score( + query, + k=k, + filter=filter, + fetch_k=fetch_k, + **kwargs, + ) + return [(doc, self.relevance_score_fn(score)) for doc, score in docs_and_scores] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/utils.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/utils.py new file mode 100644 index 000000000000..c10bde09e79b --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_langchain/vendor/vectorstores/utils.py @@ -0,0 +1,41 @@ +"""Utility functions for working with vectors and vectorstores.""" + +from typing import List + +import numpy as np + +from azure.ai.generative.index._langchain.vendor.utils.math import cosine_similarity + + +def maximal_marginal_relevance( + query_embedding: np.ndarray, + embedding_list: list, + lambda_mult: float = 0.5, + k: int = 4, +) -> List[int]: + """Calculate maximal marginal relevance.""" + if min(k, len(embedding_list)) <= 0: + return [] + if query_embedding.ndim == 1: + query_embedding = np.expand_dims(query_embedding, axis=0) + similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0] + most_similar = int(np.argmax(similarity_to_query)) + idxs = [most_similar] + selected = np.array([embedding_list[most_similar]]) + while len(idxs) < min(k, len(embedding_list)): + best_score = -np.inf + idx_to_add = -1 + similarity_to_selected = cosine_similarity(embedding_list, selected) + for i, query_score in enumerate(similarity_to_query): + if i in idxs: + continue + redundant_score = max(similarity_to_selected[i]) + equation_score = ( + lambda_mult * query_score - (1 - lambda_mult) * redundant_score + ) + if 
equation_score > best_score: + best_score = equation_score + idx_to_add = i + idxs.append(idx_to_add) + selected = np.append(selected, [embedding_list[idx_to_add]], axis=0) + return idxs diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py index 6ffd231a5e1a..6b352ef70bdf 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_mlindex.py @@ -13,7 +13,7 @@ from azure.core.credentials import TokenCredential from azure.ai.generative.index._documents import Document, DocumentChunksIterator from azure.ai.generative.index._embeddings import EmbeddingsContainer -from azure.ai.resources._index._utils.connections import ( +from azure.ai.generative.index._utils.connections import ( BaseConnection, WorkspaceConnection, get_connection_by_id_v2, @@ -158,7 +158,7 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None) langchain_pkg_version = pkg_version.parse(langchain_version) if index_kind == "acs": - from azure.ai.resources._index._indexes.azure_search import import_azure_search_or_so_help_me + from azure.ai.generative.index._indexes.azure_search import import_azure_search_or_so_help_me import_azure_search_or_so_help_me() @@ -274,11 +274,11 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None) f"Failed to load FAISS Index using installed version of langchain, retrying with vendored FAISS VectorStore.\n{e}" ) - from azure.ai.resources._index._langchain.vendor.vectorstores.faiss import FAISS + from azure.ai.generative.index._langchain.vendor.vectorstores.faiss import FAISS store = FAISS.load_local(str(tmpdir), embeddings) elif engine.endswith("indexes.faiss.FaissAndDocStore"): - from azure.ai.resources._index._indexes.faiss import FaissAndDocStore + from azure.ai.generative.index._indexes.faiss import FaissAndDocStore error_fmt_str = """Failed to import langchain faiss bridge module with: {e}\n" This could be due to an incompatible change in langchain since this bridge was implemented. If you understand what has changed you could implement your own wrapper of azure.ai.tools.mlindex._indexes.faiss.FaissAndDocStore. @@ -381,7 +381,7 @@ def as_native_index_client(self, credential: Optional[TokenCredential] = None): """ Converts MLIndex config into a client for the underlying Index, may download files. - An azure.search.documents.SearchClient for acs indexes or an azure.ai.resources._index._indexes.indexFaissAndDocStore for faiss indexes. + An azure.search.documents.SearchClient for acs indexes or an azure.ai.generative.index._indexes.indexFaissAndDocStore for faiss indexes. 
""" index_kind = self.index_config.get("kind", None) if index_kind == "acs": @@ -396,7 +396,7 @@ def as_native_index_client(self, credential: Optional[TokenCredential] = None): api_version=self.index_config.get("api_version", "2023-07-01-preview"), ) elif index_kind == "faiss": - from azure.ai.resources._index._indexes.faiss import FaissAndDocStore + from azure.ai.generative.index._indexes.faiss import FaissAndDocStore embeddings = self.get_langchain_embeddings(credential=credential) @@ -457,7 +457,7 @@ def override_connections( else: self.embeddings_config["connection_type"] = "workspace_connection" if isinstance(embedding_connection, str): - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 embedding_connection = get_connection_by_id_v2(embedding_connection, credential=credential) self.embeddings_config["connection"] = {"id": get_id_from_connection(embedding_connection)} if index_connection: @@ -466,7 +466,7 @@ def override_connections( else: self.index_config["connection_type"] = "workspace_connection" if isinstance(index_connection, str): - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 index_connection = get_connection_by_id_v2(index_connection, credential=credential) self.index_config["connection"] = {"id": get_id_from_connection(index_connection)} self.save(just_config=True) # type: ignore[call-arg] @@ -619,7 +619,7 @@ def from_documents( if isinstance(embeddings_model, str): connection_args = {} if "open_ai" in embeddings_model: - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 if embeddings_connection: if isinstance(embeddings_connection, str): @@ -725,7 +725,7 @@ def from_embeddings_container( ) elif index_type == "acs": from azure.ai.generative.index._tasks.update_acs import create_index_from_raw_embeddings - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 if not index_connection: index_config = { diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py new file mode 100644 index 000000000000..d3fe34dd68b4 --- /dev/null +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py @@ -0,0 +1,218 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# --------------------------------------------------------- +"""Language model classes.""" +import copy +import json +import os +from typing import Dict, Optional, Union + +from azure.core.credentials import TokenCredential +from azure.ai.generative.constants._common import USER_AGENT_HEADER_KEY +from azure.ai.generative.index._utils.connections import ( + connection_to_credential, + get_connection_by_id_v2, + get_connection_credential, +) +from azure.ai.generative.index._utils.logging import get_logger +from azure.ai.generative._user_agent import USER_AGENT + +try: + from azure.ai.resources.entities import BaseConnection +except Exception: + BaseConnection = None +try: + from azure.ai.ml.entities import WorkspaceConnection +except Exception: + WorkspaceConnection = None + +logger = get_logger(__name__) + + +def parse_model_uri(uri: str, **kwargs) -> dict: + """Parse a model URI into a dictionary of configuration parameters.""" + scheme, details = uri.split("://") + + def split_details(details): + details = details.split("/") + dets = {} + for i in range(0, len(details), 2): + dets[details[i]] = details[i + 1] + return dets + + config = {**kwargs} + if scheme == "azure_open_ai": + config = {**split_details(details), **config} + config["kind"] = "open_ai" + if "endpoint" in config: + if config["endpoint"] and (".openai." in config["endpoint"] or ".api.cognitive." in config["endpoint"] or ".cognitiveservices." in config["endpoint"]): + config["api_base"] = config["endpoint"].rstrip("/") + else: + config["api_base"] = f"https://{config['endpoint']}.openai.azure.com" + config["api_type"] = "azure" + config["api_version"] = kwargs.get("api_version") if kwargs.get("api_version") is not None else "2023-03-15-preview" + # Azure OpenAI has a batch_size limit of 16 + if "batch_size" not in config: + config["batch_size"] = "16" + elif scheme == "open_ai": + config["kind"] = "open_ai" + config = {**split_details(details), **config} + config["api_type"] = "open_ai" + elif scheme == "hugging_face": + config["kind"] = "hugging_face" + config["model"] = details.split("model/")[1] + elif scheme == "none": + config["kind"] = "none" + else: + raise ValueError(f"Unknown model kind: {scheme}") + + return config + + +def init_open_ai_from_config(config: dict, credential: Optional[TokenCredential]) -> Dict: + """Initialize an OpenAI model from a configuration dictionary.""" + import openai + + logger.debug("OpenAI arguments: \n") + logger.debug("\n".join(f"{k}={v}" if k != "key" and k != "api_key" else f"{k}=[REDACTED]" for k, v in config.items())) + + try: + if config.get("key") is not None: + config["api_key"] = config.get("key") + elif "connection_type" not in config: + if config.get("api_key") is None: + config["api_key"] = os.environ.get("OPENAI_API_KEY", None) + if config["api_key"] is None and "azure" in config["api_type"]: + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential(process_timeout=60) if credential is None else credential + config["api_key"] = credential.get_token("https://cognitiveservices.azure.com/.default").token + config["api_type"] = "azure_ad" + else: + if config["connection_type"] == "workspace_connection": + connection_id = config.get("connection", {}).get("id", "") + connection = get_connection_by_id_v2(connection_id, credential=credential) + # Only change base, version, and type in AOAI case + if hasattr(connection, "type"): + connection_obj: Union[WorkspaceConnection, BaseConnection] = connection + if connection_obj.type == "azure_open_ai": + 
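+                        # Connection metadata keys are not consistently cased across
+                        # services ("apiVersion" vs "ApiVersion"), so check both
+                        # before falling back to a default.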
config["api_base"] = connection_obj.target + connection_metadata = connection_obj.metadata + config["api_version"] = connection_obj.metadata.get("apiVersion", connection_metadata.get("ApiVersion", "2023-07-01-preview")) + config["api_type"] = connection_obj.metadata.get("apiType", connection_metadata.get("ApiType", "azure")).lower() + elif isinstance(connection, dict) and connection.get("properties", {}).get("category", None) == "AzureOpenAI": + config["api_base"] = connection.get("properties", {}).get("target") + connection_metadata = connection.get("properties", {}).get("metadata", {}) + config["api_version"] = connection_metadata.get("apiVersion", connection_metadata.get("ApiVersion", "2023-03-15-preview")) + config["api_type"] = connection_metadata.get("apiType", connection_metadata.get("ApiType", "azure")).lower() + + if config["api_type"] == "azure_ad" or config["api_type"] == "azuread": + from azure.identity import DefaultAzureCredential + + credential = DefaultAzureCredential(process_timeout=60) if credential is None else credential + else: + credential = connection_to_credential(connection) + else: + credential = get_connection_credential(config) + + if not hasattr(credential, "key"): + # Add hack to check for "BAKER-OPENAI-API-KEY" + if config.get("connection_type", "workspace_keyvault") == "workspace_keyvault": + new_args = copy.deepcopy(config) + new_args["connection"]["key"] = "BAKER-OPENAI-API-KEY" + credential = get_connection_credential(new_args) + + if hasattr(credential, "key"): + config["api_key"] = credential.key # type: ignore[union-attr] + else: + config["api_key"] = credential.get_token("https://cognitiveservices.azure.com/.default").token # type: ignore[union-attr] + config["api_type"] = "azure_ad" + except Exception as e: + if "OPENAI_API_KEY" in os.environ: + logger.warning(f"Failed to get credential for ACS with {e}, falling back to openai 0.x env vars.") + config["api_key"] = os.environ["OPENAI_API_KEY"] + config["api_type"] = os.environ.get("OPENAI_API_TYPE", "azure") + config["api_base"] = os.environ.get("OPENAI_API_BASE", openai.api_base if hasattr(openai, "api_base") else openai.base_url) + config["api_version"] = os.environ.get("OPENAI_API_VERSION", openai.api_version) + elif "AZURE_OPENAI_KEY" in os.environ: + logger.warning(f"Failed to get credential for ACS with {e}, falling back to openai 1.x env vars.") + config["api_key"] = os.environ["AZURE_OPENAI_KEY"] + config["api_type"] = os.environ.get("OPENAI_API_TYPE", "azure") + config["azure_endpoint"] = os.environ.get("AZURE_OPENAI_ENDPOINT") + config["api_version"] = os.environ.get("OPENAI_API_VERSION", openai.api_version) + else: + raise e + + if openai.api_type and "azure" in openai.api_type: + config["api_version"] = config.get("api_version", "2023-03-15-preview") + + return config + + +# TODO: Vendor langchain deps or move to langchain module. 
+def init_llm(model_config: dict, **kwargs): + """Initialize a language model from a model configuration.""" + from langchain.chat_models.azure_openai import AzureChatOpenAI + from langchain.chat_models.openai import ChatOpenAI + from langchain.llms import AzureOpenAI + + llm = None + logger.debug(f"model_config: {json.dumps(model_config, indent=2)}") + model_kwargs = { + "frequency_penalty": model_config.get("frequency_penalty", 0), + "presence_penalty": model_config.get("presence_penalty", 0), + } + if model_config.get("stop") is not None: + model_kwargs["stop"] = model_config.get("stop") + if model_config.get("kind") == "open_ai" and model_config.get("api_type") == "azure": + model_config = init_open_ai_from_config(model_config, credential=None) + if model_config["model"].startswith("gpt-3.5-turbo") or model_config["model"].startswith("gpt-35-turbo") or model_config["model"].startswith("gpt-4"): + logger.info(f"Initializing AzureChatOpenAI with model {model_config['model']} with kwargs: {model_kwargs}") + + llm = AzureChatOpenAI( + deployment_name=model_config["deployment"], + model=model_config["model"], + max_tokens=model_config.get("max_tokens"), + model_kwargs=model_kwargs, + openai_api_key=model_config.get("api_key"), + openai_api_base=model_config.get("api_base"), + openai_api_type=model_config.get("api_type"), + openai_api_version=model_config.get("api_version"), + max_retries=model_config.get("max_retries", 3), + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + **kwargs + ) # type: ignore + if model_config.get("temperature", None) is not None: + llm.temperature = model_config.get("temperature") + else: + logger.info(f"Initializing AzureOpenAI with model {model_config['model']} with kwargs: {model_kwargs}") + + llm = AzureOpenAI( + deployment_name=model_config["deployment"], + model=model_config["model"], + max_tokens=model_config.get("max_tokens"), + model_kwargs=model_kwargs, + openai_api_key=model_config.get("api_key"), + max_retries=model_config.get("max_retries", 3), + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + **kwargs + ) # type: ignore + if model_config.get("temperature", None) is not None: + llm.temperature = model_config.get("temperature") + elif model_config.get("kind") == "open_ai" and model_config.get("api_type") == "open_ai": + logger.info(f"Initializing OpenAI with model {model_config['model']} with kwargs: {model_kwargs}") + model_config = init_open_ai_from_config(model_config, credential=None) + llm = ChatOpenAI( + model=model_config["model"], + max_tokens=model_config.get("max_tokens"), + model_kwargs=model_kwargs, + openai_api_key=model_config.get("api_key"), + default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, + **kwargs + ) # type: ignore + if model_config.get("temperature", None) is not None: + llm.temperature = model_config.get("temperature") + else: + raise ValueError(f"Unsupported llm kind: {model_config.get('kind')}") + + return llm diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py index 7632b67d1d9b..85187a934d8b 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk.py @@ -16,6 +16,7 @@ from azure.ai.generative.index._documents import ( SUPPORTED_EXTENSIONS, ChunkedDocument, + Document, DocumentChunksIterator, DocumentSource, ) @@ -29,7 +30,6 @@ safe_mlflow_start_run, track_activity, ) -from 
azure.ai.resources._index._documents import Document logger = get_logger("crack_and_chunk") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py index bb0aab5fcee1..45562e5aa8a6 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed.py @@ -21,7 +21,7 @@ from azure.ai.generative.index._embeddings import DataEmbeddedDocument, EmbeddedDocumentSource, EmbeddingsContainer from azure.ai.generative.index._mlindex import MLIndex from azure.ai.generative.index._tasks.crack_and_chunk import custom_loading, get_activity_logging_filter, str2bool -from azure.ai.generative.index._documents.document import DocumentSource +from azure.ai.generative.index._documents.document import Document, DocumentSource from azure.ai.generative.index._utils.logging import ( _logger_factory, enable_appinsights_logging, @@ -30,7 +30,6 @@ safe_mlflow_start_run, track_activity, ) -from azure.ai.resources._index._documents import Document logger = get_logger("crack_and_chunk_and_embed") @@ -63,7 +62,7 @@ def crack_and_chunk_and_embed( if isinstance(embeddings_connection, str): connection_args["connection"] = {"id": embeddings_connection} else: - from azure.ai.resources._index._utils.connections import get_id_from_connection + from azure.ai.generative.index._utils.connections import get_id_from_connection connection_args["connection"] = {"id": get_id_from_connection(embeddings_connection)} diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py index d27b907fd395..7527a7a6d73e 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/crack_and_chunk_and_embed_and_index.py @@ -81,17 +81,17 @@ def crack_and_chunk_and_embed_and_index( if index_connection is not None: connection_args["connection_type"] = "workspace_connection" if isinstance(embeddings_connection, str): - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 connection_args["connection"] = {"id": index_connection} connection = get_connection_by_id_v2(index_connection) else: - from azure.ai.resources._index._utils.connections import get_id_from_connection + from azure.ai.generative.index._utils.connections import get_id_from_connection connection_args["connection"] = {"id": get_id_from_connection(index_connection)} connection = index_connection - from azure.ai.resources._index._utils.connections import ( + from azure.ai.generative.index._utils.connections import ( get_metadata_from_connection, get_target_from_connection, ) @@ -107,7 +107,7 @@ def crack_and_chunk_and_embed_and_index( ) elif index_type == "faiss": logger.info(f"Creating Faiss index from embeddings_container with config {index_config}") - mlindex = embeddings_container.write_as_faiss_mlindex(output_path, engine="azure.ai.resources._index._indexes.faiss.FaissAndDocStore") + mlindex = embeddings_container.write_as_faiss_mlindex(output_path, engine="azure.ai.generative.index._indexes.faiss.FaissAndDocStore") else: raise ValueError(f"Unsupported 
index_type {index_type}") diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py index fde34ef29576..44dad28aaae2 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed.py @@ -13,6 +13,10 @@ from typing import Iterator, List, Optional import pandas as pd +from azure.ai.generative.index._documents import ( + Document, + StaticDocument, +) from azure.ai.generative.index._embeddings import EmbeddingsContainer from azure.ai.generative.index._utils.logging import ( _logger_factory, @@ -23,7 +27,6 @@ safe_mlflow_start_run, track_activity, ) -from azure.ai.resources._index._documents import Document, StaticDocument logger = get_logger("embed") @@ -276,7 +279,7 @@ def main(args, logger, activity_logger): connection_args["connection"] = {"id": connection_id} else: if "open_ai" in args.embeddings_model: - from azure.ai.resources._index._utils.azureml import get_workspace_from_environment + from azure.ai.generative.index._utils.azureml import get_workspace_from_environment ws = get_workspace_from_environment() connection_args["connection_type"] = "workspace_keyvault" diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py index 5f0c478967de..4c8a3db230ea 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/embed_prs.py @@ -11,7 +11,7 @@ import pandas as pd from azure.ai.generative.index._embeddings import EmbeddingsContainer from azure.ai.generative.index._tasks.embed import read_chunks_into_documents -from azure.ai.resources._index._utils.azureml import get_workspace_from_environment +from azure.ai.generative.index._utils.azureml import get_workspace_from_environment from azure.ai.generative.index._utils.logging import ( _logger_factory, enable_appinsights_logging, diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py index 5a379a9f4b22..eb14809d04d7 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/generate_qa.py @@ -13,7 +13,7 @@ import pandas as pd from azureml.core import Run from azure.ai.generative.index.data_generation.qa import QADataGenerator, GenerationResult, QAType -from azure.ai.resources._index._utils.connections import (get_connection_by_id_v2, +from azure.ai.generative.index._utils.connections import (get_connection_by_id_v2, get_connection_credential, connection_to_credential) from azure.ai.generative.index._utils.logging import (enable_appinsights_logging, @@ -26,7 +26,7 @@ def get_model_config(llm_config: Dict[str, Union[str, int]], openai_api_type: str, openai_api_version: str, activity_logger: Logger): """Get model_config from llm_config. llm_config format is used in Baker pipelines. 
- model_config format is accepted by `azure.ai.resources._index._models.init_llm()`.""" + model_config format is accepted by `azure.ai.generative.index._models.init_llm()`.""" model_config = llm_config.copy() model_config['kind'] = model_config['type'] del model_config['type'] diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py index 67c3e8844010..8776f22c0a7d 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/git_clone.py @@ -16,7 +16,7 @@ def main(args, logger, activity_logger): try: connection_id = os.environ.get('AZUREML_WORKSPACE_CONNECTION_ID_GIT') if connection_id is not None and connection_id != '': - from azure.ai.resources._index._utils.connections import get_connection_by_id_v2 + from azure.ai.generative.index._utils.connections import get_connection_by_id_v2 connection = get_connection_by_id_v2(connection_id) if args.git_repository != connection['properties']['target']: diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py index 286164d2f431..2cda41ab5cb5 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_acs.py @@ -15,7 +15,7 @@ from azure.ai.generative.index._embeddings import EmbeddingsContainer, ReferenceEmbeddedDocument from azure.ai.generative.index._mlindex import MLIndex -from azure.ai.resources._index._utils.connections import get_connection_credential +from azure.ai.generative.index._utils.connections import get_connection_credential from azure.ai.generative.index._utils.logging import ( _logger_factory, enable_appinsights_logging, @@ -486,7 +486,7 @@ def main(args, logger, activity_logger): if args.connection_id is not None: connection_args["connection_type"] = "workspace_connection" connection_args["connection"] = {"id": args.connection_id} - from azure.ai.resources._index._utils.connections import ( + from azure.ai.generative.index._utils.connections import ( get_connection_by_id_v2, get_metadata_from_connection, get_target_from_connection, diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py index da6a8efa6411..f8e7c24132ad 100644 --- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py +++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_tasks/update_pinecone.py @@ -14,7 +14,7 @@ from azure.core.credentials import AzureKeyCredential, TokenCredential from azure.ai.generative.index._embeddings import EmbeddingsContainer, ReferenceEmbeddedDocument from azure.ai.generative.index._mlindex import MLIndex -from azure.ai.resources._index._utils.connections import get_connection_credential +from azure.ai.generative.index._utils.connections import get_connection_credential from azure.ai.generative.index._utils.logging import ( _logger_factory, get_logger, @@ -303,7 +303,7 @@ def main(args, logger, activity_logger): if args.connection_id is not None: connection_args["connection_type"] = "workspace_connection" connection_args["connection"] = {"id": args.connection_id} - from azure.ai.resources._index._utils.connections import ( + from azure.ai.generative.index._utils.connections import ( 
        get_connection_by_id_v2,
         get_metadata_from_connection,
     )
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/azureml.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/azureml.py
new file mode 100644
index 000000000000..f0958f52ca87
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/azureml.py
@@ -0,0 +1,44 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""Functions for interacting with AzureML."""
+from typing import Dict, List
+
+from azure.ai.generative.index._utils.logging import get_logger
+
+logger = get_logger(__name__)
+
+
+def get_workspace_from_environment():
+    """Get the workspace from the run context if running in Azure, otherwise return None."""
+    from azureml.core import Run
+
+    run = Run.get_context()
+    if hasattr(run, "experiment"):
+        # We are running in Azure
+        return run.experiment.workspace
+    else:
+        return None
+
+
+def get_secret_from_workspace(name: str, workspace=None) -> str:
+    """Get a secret from the workspace if running in Azure, otherwise get it from the environment."""
+    secrets = get_secrets_from_workspace([name], workspace)
+    return secrets[name]
+
+
+def get_secrets_from_workspace(names: List[str], workspace=None) -> Dict[str, str]:
+    """Get secrets from the workspace if running in Azure, otherwise get them from the environment."""
+    import os
+
+    ws = get_workspace_from_environment() if workspace is None else workspace
+    if ws:
+        keyvault = ws.get_default_keyvault()
+        secrets = keyvault.get_secrets(names)
+        logger.info("Run context and secrets retrieved", extra={"print": True})
+    else:
+        secrets = {}
+        for name in names:
+            secrets[name] = os.environ.get(name, os.environ.get(name.replace("-", "_")))
+
+    return secrets
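[Editor's note: an illustrative call for the secret helpers added above, not part of the diff. Outside an AzureML run there is no workspace, so lookups fall back to environment variables; the secret name is a placeholder.]

import os

from azure.ai.generative.index._utils.azureml import get_secret_from_workspace

os.environ["MY_API_KEY"] = "dummy-value"
# "MY-API-KEY" itself is not set, so the helper falls back to the underscored variant.
print(get_secret_from_workspace("MY-API-KEY"))  # -> "dummy-value"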
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/connections.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/connections.py
new file mode 100644
index 000000000000..8640f42fe130
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/connections.py
@@ -0,0 +1,321 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""MLIndex auth connection utilities."""
+import json
+import os
+import re
+from typing import Any, Dict, Optional, Union
+
+from azure.ai.generative.index._utils.logging import get_logger
+from azure.ai.generative.index._utils.requests import create_session_with_retry, send_post_request
+
+try:
+    from azure.ai.resources.entities import BaseConnection
+except Exception:
+    BaseConnection = None
+try:
+    from azure.ai.ml import MLClient
+    from azure.ai.ml.entities import WorkspaceConnection
+except Exception:
+    MLClient = None
+    WorkspaceConnection = None
+try:
+    from azure.core.credentials import TokenCredential
+except Exception:
+    TokenCredential = object
+
+logger = get_logger("connections")
+
+
+def get_pinecone_environment(config, credential: Optional[TokenCredential] = None):
+    """Get the Pinecone project environment from a connection."""
+    connection_type = config.get("connection_type", None)
+    if connection_type != "workspace_connection":
+        raise ValueError(f"Unsupported connection type for Pinecone index: {connection_type}")
+
+    connection_id = config.get("connection", {}).get("id")
+    connection = get_connection_by_id_v2(connection_id, credential=credential)
+    return get_metadata_from_connection(connection)["environment"]
+
+
+def get_connection_credential(config, credential: Optional[TokenCredential] = None):
+    """Get a credential for a connection."""
+    try:
+        from azure.core.credentials import AzureKeyCredential
+    except ImportError as e:
+        raise ValueError(
+            "Could not import azure-core python package. "
+            "Please install it with `pip install azure-core`."
+        ) from e
+    try:
+        from azure.identity import DefaultAzureCredential
+    except ImportError as e:
+        raise ValueError(
+            "Could not import azure-identity python package. "
+            "Please install it with `pip install azure-identity`."
+        ) from e
+
+    if config.get("connection_type", None) == "workspace_keyvault":
+        from azureml.core import Run, Workspace
+        run = Run.get_context()
+        if hasattr(run, "experiment"):
+            ws = run.experiment.workspace
+        else:
+            try:
+                ws = Workspace(
+                    subscription_id=config.get("connection", {}).get("subscription"),
+                    resource_group=config.get("connection", {}).get("resource_group"),
+                    workspace_name=config.get("connection", {}).get("workspace")
+                )
+            except Exception as e:
+                logger.warning(f"Could not get workspace '{config.get('connection', {}).get('workspace')}': {e}")
+                # Fall back to looking for key in environment.
+                import os
+                key = os.environ.get(config.get("connection", {}).get("key"))
+                if key is None:
+                    raise ValueError(f"Could not get workspace '{config.get('connection', {}).get('workspace')}' and no key named '{config.get('connection', {}).get('key')}' in environment")
+                return AzureKeyCredential(key)
+
+        keyvault = ws.get_default_keyvault()
+        connection_credential = AzureKeyCredential(keyvault.get_secret(config.get("connection", {}).get("key")))
+    elif config.get("connection_type", None) == "workspace_connection":
+        connection_id = config.get("connection", {}).get("id")
+        connection = get_connection_by_id_v2(connection_id, credential=credential)
+        connection_credential = connection_to_credential(connection)
+    elif config.get("connection_type", None) == "environment":
+        import os
+        key = os.environ.get(config.get("connection", {}).get("key", "OPENAI_API_KEY"))
+        connection_credential = (credential if credential is not None else DefaultAzureCredential(process_timeout=60)) if key is None else AzureKeyCredential(key)
+    else:
+        connection_credential = credential if credential is not None else DefaultAzureCredential(process_timeout=60)
+
+    return connection_credential
+
+
+def workspace_connection_to_credential(connection: Union[dict, BaseConnection, WorkspaceConnection]):
+    """Get a credential for a workspace connection."""
+    return connection_to_credential(connection)
+
+
+def connection_to_credential(connection: Union[dict, BaseConnection, WorkspaceConnection]):
+    """Get a credential for a workspace connection."""
+    if isinstance(connection, dict):
+        props = connection["properties"]
+        auth_type = props.get("authType", props.get("AuthType"))
+        if auth_type == "ApiKey":
+            from azure.core.credentials import AzureKeyCredential
+            return AzureKeyCredential(props["credentials"]["key"])
+        elif auth_type == "PAT":
+            from azure.core.credentials import AccessToken
+            return AccessToken(props["credentials"]["pat"], props.get("expiresOn", None))
+        elif auth_type == "CustomKeys":
+            # OpenAI connections are made with CustomKeys auth, so we can try to access the key using known structure
+            from azure.core.credentials import AzureKeyCredential
+            if connection.get("metadata", {}).get("azureml.flow.connection_type", None) == "OpenAI":
+                # Try to get the key with api_key; if that fails, fall back to regular CustomKeys handling
+                try:
+                    key = props["credentials"]["keys"]["api_key"]
+                    return AzureKeyCredential(key)
+                except Exception as e:
+                    logger.warning(f"Could not get key using api_key, using default handling: {e}")
+            key_dict = props["credentials"]["keys"]
+            if len(key_dict.keys()) != 1:
+                raise ValueError(f"Only connections with a single key can be used. 
Number of keys present: {len(key_dict.keys())}") + return AzureKeyCredential(props["credentials"]["keys"][list(key_dict.keys())[0]]) + else: + raise ValueError(f"Unknown auth type '{auth_type}'") + elif isinstance(connection, WorkspaceConnection): + if connection.credentials.type.lower() == "api_key": + from azure.core.credentials import AzureKeyCredential + return AzureKeyCredential(connection.credentials.key) + elif connection.credentials.type.lower() == "pat": + from azure.core.credentials import AccessToken + return AccessToken(connection.credentials.pat, connection.credentials.expires_on) + elif connection.credentials.type.lower() == "custom_keys": + if connection._metadata.get("azureml.flow.connection_type", "").lower() == "openai": + from azure.core.credentials import AzureKeyCredential + try: + key = connection.credentials.keys.api_key + return AzureKeyCredential(key) + except Exception as e: + logger.warning(f"Could not get key using api_key, using default handling: {e}") + key_dict = connection.credentials.keys + if len(key_dict.keys()) != 1: + raise ValueError(f"Only connections with a single key can be used. Number of keys present: {len(key_dict.keys())}") + return AzureKeyCredential(connection.credentials.keys[list(key_dict.keys())[0]]) + else: + raise ValueError(f"Unknown auth type '{connection.credentials.type}' for connection '{connection.name}'") + else: + if connection.credentials.type.lower() == "api_key": + from azure.core.credentials import AzureKeyCredential + return AzureKeyCredential(connection.credentials.key) + else: + raise ValueError(f"Unknown auth type '{connection.credentials.type}' for connection '{connection.name}'") + + +def get_connection_by_id_v2(connection_id: str, credential: Optional[TokenCredential] = None, client: str = "sdk") -> Union[Dict[str, Dict[str, Dict[str, Any]]], WorkspaceConnection, BaseConnection]: + """ + Get a connection by id using azure.ai.ml or azure.ai.generative. + + If azure.ai.ml is installed, use that, otherwise use azure.ai.generative. + """ + uri_match = re.match(r"/subscriptions/(.*)/resourceGroups/(.*)/providers/Microsoft.MachineLearningServices/workspaces/(.*)/connections/(.*)", connection_id, flags=re.IGNORECASE) + + if uri_match is None: + logger.error(f"Invalid connection_id {connection_id}, expecting Azure Machine Learning resource ID") + raise ValueError(f"Invalid connection id {connection_id}") + + logger.info(f"Getting workspace connection: {uri_match.group(4)}") + + from azureml.dataprep.api._aml_auth._azureml_token_authentication import AzureMLTokenAuthentication + + if credential is None: + from azure.identity import DefaultAzureCredential + + if os.environ.get("AZUREML_RUN_ID", None) is not None: + credential = AzureMLTokenAuthentication._initialize_aml_token_auth() + else: + credential = credential if credential is not None else DefaultAzureCredential(process_timeout=60) + + logger.info(f"Using auth: {type(credential)}") + + if client == "sdk" and MLClient is not None: + logger.info("Getting workspace connection via MLClient") + ml_client = MLClient( + credential=credential, + subscription_id=uri_match.group(1), + resource_group_name=uri_match.group(2), + workspace_name=uri_match.group(3) + ) + + if os.environ.get("AZUREML_RUN_ID", None) is not None: + # In AzureML Run context, we need to use workspaces internal endpoint that will accept AzureMLToken auth. 
+ old_base_url = ml_client.connections._operation._client._base_url + ml_client.connections._operation._client._base_url = f"{os.environ.get('AZUREML_SERVICE_ENDPOINT')}/rp/workspaces" + + logger.info(f"Using ml_client base_url: {ml_client.connections._operation._client._base_url}") + + list_secrets_response = ml_client.connections._operation.list_secrets( + connection_name=uri_match.group(4), + resource_group_name=ml_client.resource_group_name, + workspace_name=ml_client.workspace_name, + ) + connection = WorkspaceConnection._from_rest_object(list_secrets_response) + logger.info(f"Got Connection: {connection.id}") + + if os.environ.get("AZUREML_RUN_ID", None) is not None: + ml_client.connections._operation._client._base_url = old_base_url + else: + logger.info("Getting workspace connection via REST as fallback") + return get_connection_by_id_v1(connection_id, credential) + + return connection + + +def get_id_from_connection(connection: Union[dict, WorkspaceConnection, BaseConnection]) -> str: + """Get a connection id from a connection.""" + if isinstance(connection, dict): + return connection["id"] + elif isinstance(connection, WorkspaceConnection): + return connection.id + elif isinstance(connection, BaseConnection): + return connection.id + else: + raise ValueError(f"Unknown connection type: {type(connection)}") + + +def get_target_from_connection(connection: Union[dict, WorkspaceConnection, BaseConnection]) -> str: + """Get a connection target from a connection.""" + if isinstance(connection, dict): + return connection["properties"]["target"] + elif isinstance(connection, WorkspaceConnection): + return connection.target + elif isinstance(connection, BaseConnection): + return connection.target + else: + raise ValueError(f"Unknown connection type: {type(connection)}") + + +def get_metadata_from_connection(connection: Union[dict, WorkspaceConnection, BaseConnection]) -> dict: + """Get a connection metadata from a connection.""" + if isinstance(connection, dict): + return connection["properties"]["metadata"] + elif isinstance(connection, WorkspaceConnection): + return connection.metadata + elif isinstance(connection, BaseConnection): + return connection.metadata + else: + raise ValueError(f"Unknown connection type: {type(connection)}") + + +def get_connection_by_name_v2(workspace, name: str) -> dict: + """Get a connection from a workspace.""" + if hasattr(workspace._auth, "get_token"): + bearer_token = workspace._auth.get_token("https://management.azure.com/.default").token + else: + bearer_token = workspace._auth.token + + endpoint = workspace.service_context._get_endpoint("api") + url = f"{endpoint}/rp/workspaces/subscriptions/{workspace.subscription_id}/resourcegroups/{workspace.resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace.name}/connections/{name}/listsecrets?api-version=2023-02-01-preview" + resp = send_post_request(url, { + "Authorization": f"Bearer {bearer_token}", + "content-type": "application/json" + }, {}) + + return resp.json() + + +def get_connection_by_id_v1(connection_id: str, credential: Optional[TokenCredential] = None) -> dict: + """Get a connection from a workspace.""" + uri_match = re.match(r"/subscriptions/(.*)/resourceGroups/(.*)/providers/Microsoft.MachineLearningServices/workspaces/(.*)/connections/(.*)", connection_id) + + if uri_match is None: + logger.error(f"Invalid connection_id {connection_id}, expecting Azure Machine Learning resource ID") + raise ValueError(f"Invalid connection id {connection_id}") + + from azureml.core 
import Run, Workspace
+    run = Run.get_context()
+    if hasattr(run, "experiment"):
+        ws = run.experiment.workspace
+    else:
+        try:
+            ws = Workspace(
+                subscription_id=uri_match.group(1),
+                resource_group=uri_match.group(2),
+                workspace_name=uri_match.group(3)
+            )
+        except Exception as e:
+            logger.warning(f"Could not get workspace '{uri_match.group(3)}': {e}")
+            raise ValueError(f"Could not get workspace '{uri_match.group(3)}'") from e
+
+    return get_connection_by_name_v2(ws, uri_match.group(4))
+
+
+def send_put_request(url, headers, payload):
+    """Send a PUT request."""
+    with create_session_with_retry() as session:
+        response = session.put(url, data=json.dumps(payload), headers=headers)
+        # Raise an exception if the response contains an HTTP error status code
+        response.raise_for_status()
+
+    return response.json()
+
+
+def create_connection_v2(workspace, name, category: str, target: str, auth_type: str, credentials: dict, metadata: str):
+    """Create a connection in a workspace."""
+    url = f"https://management.azure.com/subscriptions/{workspace.subscription_id}/resourcegroups/{workspace.resource_group}/providers/Microsoft.MachineLearningServices/workspaces/{workspace.name}/connections/{name}?api-version=2023-04-01-preview"
+
+    resp = send_put_request(url, {
+        "Authorization": f"Bearer {workspace._auth.get_token('https://management.azure.com/.default').token}",
+        "content-type": "application/json"
+    }, {
+        "properties": {
+            "category": category,
+            "target": target,
+            "authType": auth_type,
+            "credentials": credentials,
+            "metadata": metadata
+        }
+    })
+
+    return resp
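[Editor's note: a hedged usage sketch for the connection helpers above, not part of the diff. The ARM-style id segments are placeholders, and DefaultAzureCredential is assumed to be able to authenticate against the workspace.]

from azure.ai.generative.index._utils.connections import (
    connection_to_credential,
    get_connection_by_id_v2,
    get_target_from_connection,
)

connection_id = (
    "/subscriptions/<sub-id>/resourceGroups/<rg>"
    "/providers/Microsoft.MachineLearningServices/workspaces/<ws>/connections/<name>"
)
connection = get_connection_by_id_v2(connection_id)
print(get_target_from_connection(connection))
credential = connection_to_credential(connection)  # AzureKeyCredential for ApiKey connections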
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/deployment.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/deployment.py
new file mode 100644
index 000000000000..47823f648e3d
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/deployment.py
@@ -0,0 +1,40 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""Azure OpenAI deployment related utils."""
+import openai
+from azure.core.credentials import AzureKeyCredential
+from azure.ai.generative.index._utils.connections import (
+    connection_to_credential,
+    get_metadata_from_connection,
+    get_target_from_connection,
+)
+from openai.api_resources.deployment import Deployment
+from openai.util import convert_to_dict
+
+
+def infer_deployment(aoai_connection, model_name):
+    """Infer deployment name in an AOAI connection, given model name."""
+    if model_name is None or model_name == "":
+        raise ValueError("Parameter 'model_name' has no value. Deployment inference cannot be performed.")
+    connection_metadata = get_metadata_from_connection(aoai_connection)
+    openai.api_type = connection_metadata.get("ApiType", connection_metadata.get("apiType", "azure"))
+    openai.api_version = connection_metadata.get(
+        "ApiVersion", connection_metadata.get("apiVersion", "2023-03-15-preview")
+    )
+    api_base = get_target_from_connection(aoai_connection)
+    if hasattr(openai, "api_base"):
+        openai.api_base = api_base
+    else:
+        openai.base_url = api_base
+    credential = connection_to_credential(aoai_connection)
+    openai.api_key = credential.key if isinstance(credential, AzureKeyCredential) else credential.get_token().token
+    deployment_list = convert_to_dict(
+        Deployment.list(api_key=openai.api_key, api_base=api_base, api_type=openai.api_type)
+    )
+    for deployment in deployment_list["data"]:
+        if deployment["model"] == model_name:
+            return deployment["id"]
+    raise Exception(
+        f"Deployment for model={model_name} not found in AOAI workspace. Please retry with correct model name or create a deployment."
+    )
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py
index aa27b50b259c..df24cbff88df 100644
--- a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/git.py
@@ -8,7 +8,7 @@
 
 import git
 
-from azure.ai.resources._index._utils.azureml import get_secret_from_workspace
+from azure.ai.generative.index._utils.azureml import get_secret_from_workspace
 from azure.ai.generative.index._utils.logging import get_logger
 
 logger = get_logger("utils.git")
diff --git a/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/requests.py b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/requests.py
new file mode 100644
index 000000000000..8e71dcb11e6f
--- /dev/null
+++ b/sdk/ai/azure-ai-generative/azure/ai/generative/index/_utils/requests.py
@@ -0,0 +1,58 @@
+# ---------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# ---------------------------------------------------------
+"""Request utilities."""
+import json
+
+
+def create_session_with_retry(retry=3):
+    """
+    Create a requests.Session with retry enabled.
+
+    :type retry: int
+    :return: A session that retries failed requests.
+    :rtype: requests.Session
+    """
+    import requests
+    from requests.adapters import HTTPAdapter
+
+    retry_policy = _get_retry_policy(num_retry=retry)
+
+    session = requests.Session()
+    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
+    session.mount("http://", HTTPAdapter(max_retries=retry_policy))
+    return session
+
+
+def _get_retry_policy(num_retry=3):
+    """
+    Request retry policy with increasing backoff.
+
+    :return: Returns the msrest or requests REST client retry policy.
+    :rtype: urllib3.Retry
+    """
+    from urllib3 import Retry
+
+    status_forcelist = [413, 429, 500, 502, 503, 504]
+    backoff_factor = 0.4
+    retry_policy = Retry(
+        total=num_retry,
+        read=num_retry,
+        connect=num_retry,
+        backoff_factor=backoff_factor,
+        status_forcelist=status_forcelist,
+        # By default this is True. We set it to false to get the full error trace, including url and
+        # status code of the last retry. Otherwise, the error message is 'too many 500 error responses',
+        # which is not useful.
+        raise_on_status=False
+    )
+    return retry_policy
+
+
+def send_post_request(url, headers, payload):
+    """Send a POST request."""
+    with create_session_with_retry() as session:
+        response = session.post(url, data=json.dumps(payload), headers=headers)
+        # Raise an exception if the response contains an HTTP error status code
+        response.raise_for_status()
+
+    return response
diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py
index d66c9ed36b4a..b3c532c44507 100644
--- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py
+++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_docstore.py
@@ -83,9 +83,9 @@ def load(cls, input_path: str) -> "FileBasedDocstore":
 
         fs, uri = url_to_fs(input_path)
 
-        documents: Optional[Dict[str, Document]] = {}
+        documents: Dict[str, Document] = {}
         with fs.open(f"{input_path.rstrip('/')}/docs.jsonl") as f:
             for line in f:
                 document = StaticDocument.loads(line.strip())
-                documents[document.document_id] = document  # type: ignore[index]
+                documents[document.document_id] = document
         return cls(documents)
diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py
index 77289df1b785..5e9db92dd4a3 100644
--- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py
+++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_documents/document.py
@@ -127,7 +127,7 @@ def dumps(self) -> str:
         return json.dumps({"content": self.data, "metadata": self._metadata, "document_id": self.document_id})
 
     @classmethod
-    def loads(cls, data: str) -> "StaticDocument":
+    def loads(cls, data: str) -> "Document":
         """Load the document from a json string."""
         data_dict = json.loads(data)
         metadata = data_dict["metadata"]
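[Editor's note: a round-trip sketch for the loads/dumps pair touched above, not part of the diff; it assumes StaticDocument's constructor takes the document data and a metadata dict, matching the fields serialized by dumps.]

from azure.ai.resources._index._documents.document import StaticDocument

doc = StaticDocument("hello world", {"source": {"filename": "a.txt"}})
restored = StaticDocument.loads(doc.dumps())
print(restored.data)  # -> "hello world"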
diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py
index af0bd4fe84f6..f59f8a1df16a 100644
--- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py
+++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/__init__.py
@@ -8,7 +8,6 @@
 from collections import OrderedDict
 from typing import Callable, List, Optional, Union
 
-import cloudpickle
 from azure.core.credentials import TokenCredential
 from azure.ai.resources._index._embeddings.openai import OpenAIEmbedder
 from azure.ai.resources._index._langchain.vendor.embeddings.base import Embeddings as Embedder
@@ -219,33 +218,9 @@ def embeddings_container_local_path(self, value):
         """Set the path to the embeddings container."""
         self._embeddings_container_path = value
 
-    def as_langchain_embeddings(self, credential: Optional[TokenCredential] = None) -> Embedder:
-        """Returns a langchain Embedder that can be used to embed text."""
-        return get_langchain_embeddings(self.kind, self.arguments, credential=credential)
-
     @staticmethod
     def from_uri(uri: str, credential: Optional[TokenCredential] = None, **kwargs) -> "EmbeddingsContainer":
         """Create an embeddings object from a URI."""
         config = parse_model_uri(uri, **kwargs)
         kwargs["credential"] = credential
-        return EmbeddingsContainer(**{**config, **kwargs})
-
-    @staticmethod
-    def from_metadata(metadata: dict) -> "EmbeddingsContainer":
-        """Create an embeddings object from metadata."""
-        schema_version = metadata.get("schema_version", "1")
-        if schema_version == "1":
-            embeddings = EmbeddingsContainer(metadata["kind"], **metadata["arguments"])
-            return embeddings
-        elif schema_version == "2":
-            kind = metadata["kind"]
-            del metadata["kind"]
-            if kind == "custom":
-                metadata["embedding_fn"] = cloudpickle.loads(
-                    gzip.decompress(metadata["pickled_embedding_fn"]))
-                del metadata["pickled_embedding_fn"]
-
-            embeddings = EmbeddingsContainer(kind, **metadata)
-            return embeddings
-        else:
-            raise ValueError(f"Schema version {schema_version} is not supported")
+        return EmbeddingsContainer(**{**config, **kwargs})
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py
index 8f43bba263ef..d18acfc9cc4e 100644
--- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py
+++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_embeddings/openai.py
@@ -44,7 +44,6 @@
         elif batch_size is None:
             batch_size = 1000
         self.batch_size = int(batch_size)
-        self._dynamic_batch_size: Optional[int] = None
 
         if max_retries is None:
             max_retries = 10
@@ -144,16 +143,7 @@ def _retryable_openai_errors(self) -> List[Exception]:
 
     def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict:
         try:
-            if self._dynamic_batch_size is None:
-                return self._embed_request(tokenized_texts=tokenized_texts, **kwargs)
-            else:
-                embedding_response: Dict[str, List] = {"data": []}
-                for i in range(0, len(tokenized_texts), self._dynamic_batch_size):
-                    embedding_response["data"].extend(
-                        self._embed_request(
-                            tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs
-                        )["data"]
-                    )
+            return self._embed_request(tokenized_texts=tokenized_texts, **kwargs)
         except Exception as e:
             err_msg = str(e)
             if "Too many inputs" not in err_msg:
@@ -163,20 +153,14 @@
             match = re.match(r".*The max number of inputs is ([0-9]+).*", err_msg)
             if match and match.group(1):
                 try:
-                    self._dynamic_batch_size = int(match.group(1))
+                    self.batch_size = int(match.group(1))
                 except Exception:
-                    logger.error(
-                        "Failed to parse max number of inputs from error message, falling back to batch_size=1."
- ) - self._dynamic_batch_size = 1 - logger.warning(f"Reducing batch_size to {self._dynamic_batch_size} and retrying.") - embedding_response: Dict[str, List] = {"data": []} # type: ignore[no-redef] - for i in range(0, len(tokenized_texts), self._dynamic_batch_size): - embedding_response["data"].extend( - self._embed_request( - tokenized_texts=tokenized_texts[i : i + self._dynamic_batch_size], **kwargs - )["data"] - ) + logger.error("Failed to parse max number of inputs from error message, falling back to batch_size=1.") + self.batch_size = 1 + logger.warning(f"Reducing batch_size to {self.batch_size} and retrying.") + embedding_response: Dict[str, List] = {"data": []} + for i in range(0, len(tokenized_texts), self.batch_size): + embedding_response["data"].extend(self._embed_request(tokenized_texts=tokenized_texts[i : i + self.batch_size], **kwargs)["data"]) else: raise @@ -184,6 +168,8 @@ def _dynamic_batch_size_embed_request(self, tokenized_texts: List[List[int]], ** def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: try: + min_seconds = 4 + max_seconds = 10 total_delay = 0 last_exception = None for retry in range(self.max_retries): @@ -204,6 +190,7 @@ def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: for retryable_error in self._retryable_openai_errors: if isinstance(e, type(retryable_error)): retrying = True + import openai # Retry with retry-after if found in RateLimitError if isinstance(e, self._RateLimitError): @@ -216,10 +203,10 @@ def _embed_request(self, tokenized_texts: List[List[int]], **kwargs) -> dict: # Wait for 1 minute as suggested by openai https://help.openai.com/en/articles/6897202-ratelimiterror logger.warning("Retry after 60 seconds.") delay = 60 - total_delay += delay - logger.warning(f"Sleeping for {delay} seconds before retrying.") - time.sleep(delay) - break + total_delay += delay + logger.warning(f"Sleeping for {delay} seconds before retrying.") + time.sleep(delay) + break if not retrying: break @@ -254,7 +241,8 @@ def _embed(self, texts: List[str]) -> List[List[float]]: tokens = encoding.encode( text, - # TODO: Does this need to be configurable? Our use cases treat all text as raw data. + # TODO: Do these need to be configurable? Our use cases treat all text as raw data. 
+ allowed_special="all", disallowed_special=(), ) # Text longer than a models context length can be split and the embeddings averaged to approximate full text diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py index 049d81fa2ad3..cdcb1db6e9e2 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_indexes/faiss.py @@ -149,15 +149,15 @@ def similarity_search(self, query: str, k: int = 8, **kwargs) -> List[Document]: def save(self, output_path: Union[str, Path]): """Write index and docstore to output_path.""" - output_path_obj = Path(output_path) - output_path_obj.mkdir(exist_ok=True, parents=True) + output_path = Path(output_path) + output_path.mkdir(exist_ok=True, parents=True) faiss = import_faiss_or_so_help_me() - faiss.write_index(self.index, str(output_path_obj / "index.faiss")) + faiss.write_index(self.index, str(output_path / "index.faiss")) - self.docstore.save(str(output_path_obj / "docstore")) + self.docstore.save(str(output_path / "docstore")) - with (output_path_obj / "index_to_doc_id.json").open("w") as f: + with (output_path / "index_to_doc_id.json").open("w") as f: json.dump(self.index_to_doc_id, f) def save_local(self, output_path: str): diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/__init__.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/__init__.py new file mode 100644 index 000000000000..624f5ee88ecf --- /dev/null +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/__init__.py @@ -0,0 +1,6 @@ +# --------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# --------------------------------------------------------- + + +__path__ = __import__("pkgutil").extend_path(__path__, __name__) diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/math.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/math.py new file mode 100644 index 000000000000..41e1b6a0bd00 --- /dev/null +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_langchain/vendor/utils/math.py @@ -0,0 +1,61 @@ +# This file has been copied as is. +# Last Sync: 2023-08-24 +# Commit: 3e5cda3405ec1aa369fe90253d88f3e26a03db10 +"""Math utils.""" +from typing import List, Optional, Tuple, Union + +import numpy as np + +Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] + + +def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: + """Row-wise cosine similarity between two equal-width matrices.""" + if len(X) == 0 or len(Y) == 0: + return np.array([]) + X = np.array(X) + Y = np.array(Y) + if X.shape[1] != Y.shape[1]: + raise ValueError( + f"Number of columns in X and Y must be the same. X has shape {X.shape} " + f"and Y has shape {Y.shape}." + ) + + X_norm = np.linalg.norm(X, axis=1) + Y_norm = np.linalg.norm(Y, axis=1) + # Ignore divide by zero errors run time warnings as those are handled below. 
+    with np.errstate(divide="ignore", invalid="ignore"):
+        similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
+    similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
+    return similarity
+
+
+def cosine_similarity_top_k(
+    X: Matrix,
+    Y: Matrix,
+    top_k: Optional[int] = 5,
+    score_threshold: Optional[float] = None,
+) -> Tuple[List[Tuple[int, int]], List[float]]:
+    """Row-wise cosine similarity with optional top-k and score threshold filtering.
+
+    Args:
+        X: Matrix.
+        Y: Matrix, same width as X.
+        top_k: Max number of results to return.
+        score_threshold: Minimum cosine similarity of results.
+
+    Returns:
+        Tuple of two lists. First contains two-tuples of indices (X_idx, Y_idx),
+        second contains corresponding cosine similarities.
+    """
+    if len(X) == 0 or len(Y) == 0:
+        return [], []
+    score_array = cosine_similarity(X, Y)
+    score_threshold = score_threshold or -1.0
+    score_array[score_array < score_threshold] = 0
+    top_k = min(top_k or len(score_array), np.count_nonzero(score_array))
+    top_k_idxs = np.argpartition(score_array, -top_k, axis=None)[-top_k:]
+    top_k_idxs = top_k_idxs[np.argsort(score_array.ravel()[top_k_idxs])][::-1]
+    ret_idxs = np.unravel_index(top_k_idxs, score_array.shape)
+    scores = score_array.ravel()[top_k_idxs].tolist()
+    return list(zip(*ret_idxs)), scores  # type: ignore
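[Editor's note: a quick numeric check for the vendored math helpers above, not part of the diff.]

import numpy as np

from azure.ai.resources._index._langchain.vendor.utils.math import cosine_similarity_top_k

X = np.array([[1.0, 0.0], [0.0, 1.0]])
Y = np.array([[1.0, 0.0], [0.7, 0.7], [0.0, 1.0]])
idxs, scores = cosine_similarity_top_k(X, Y, top_k=2)
# The two exact matches (X row 0 with Y row 0, X row 1 with Y row 2) both score 1.0;
# tie order between them is implementation-defined.
print(idxs, scores)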
diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py
index 4887fa70d87a..ff24452b8e57 100644
--- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py
+++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_mlindex.py
@@ -11,8 +11,12 @@
 from azure.ai.ml.entities import Data
 from azure.core.credentials import TokenCredential
 from azure.ai.resources._index._documents import Document
 from azure.ai.resources._index._embeddings import EmbeddingsContainer
 from azure.ai.resources._index._utils.connections import (
     get_connection_credential,
     get_connection_by_id_v2,
 )
+
+# Editor's note: the original hunk imported `from_metadata` via
+# `from azure.ai.resources._index._embeddings.EmbeddingsContainer import from_metadata`,
+# which cannot work because EmbeddingsContainer is a class, not a module. Binding the
+# staticmethod here keeps the shortened call sites below valid.
+from_metadata = EmbeddingsContainer.from_metadata
@@ -142,7 +142,7 @@ def description(self, value: str):
 
     def get_langchain_embeddings(self, credential: Optional[TokenCredential] = None):
         """Get the LangChainEmbeddings from the MLIndex."""
-        embeddings = EmbeddingsContainer.from_metadata(self.embeddings_config.copy())
+        embeddings = from_metadata(self.embeddings_config.copy())
 
         return embeddings.as_langchain_embeddings(credential=credential)
@@ -233,7 +233,7 @@ def as_langchain_vectorstore(self, credential: Optional[TokenCredential] = None)
         if engine == "langchain.vectorstores.FAISS":
             from azure.ai.resources._index._langchain.vendor.vectorstores.faiss import FAISS
 
-            embeddings = EmbeddingsContainer.from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential)
+            embeddings = from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential)
 
             fs, uri = url_to_fs(self.base_uri)
@@ -253,7 +253,7 @@
                 logger.warning(error_fmt_str.format(e=e))
                 azureml_faiss_as_langchain_faiss = None  # type: ignore[assignment]
 
-            embeddings = EmbeddingsContainer.from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential)
+            embeddings = from_metadata(self.embeddings_config.copy()).as_langchain_embeddings(credential=credential)
 
             store: FaissAndDocStore = FaissAndDocStore.load(self.base_uri, embeddings.embed_query)  # type: ignore[no-redef]
             if azureml_faiss_as_langchain_faiss is not None:
diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py
index 472648664231..5d0c912b4545 100644
--- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py
+++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_models.py
@@ -8,14 +8,12 @@
 from typing import Optional
 
 from azure.core.credentials import TokenCredential
-from azure.ai.resources.constants._common import USER_AGENT_HEADER_KEY
 from azure.ai.resources._index._utils.connections import (
     connection_to_credential,
     get_connection_by_id_v2,
     get_connection_credential,
 )
 from azure.ai.resources._index._utils.logging import get_logger
-from azure.ai.resources._user_agent import USER_AGENT
 
 logger = get_logger(__name__)
 
@@ -132,75 +130,7 @@
     else:
         raise e
 
-    if openai.api_type and "azure" in openai.api_type:
+    if "azure" in (openai.api_type or ""):  # preserve the original guard against api_type being None
         config["api_version"] = config.get("api_version", "2023-03-15-preview")
 
-    return config
-
-
-# TODO: Vendor langchain deps or move to langchain module.
-def init_llm(model_config: dict, **kwargs):
-    """Initialize a language model from a model configuration."""
-    from langchain.chat_models.azure_openai import AzureChatOpenAI
-    from langchain.chat_models.openai import ChatOpenAI
-    from langchain.llms import AzureOpenAI
-
-    llm = None
-    logger.debug(f"model_config: {json.dumps(model_config, indent=2)}")
-    model_kwargs = {
-        "frequency_penalty": model_config.get("frequency_penalty", 0),
-        "presence_penalty": model_config.get("presence_penalty", 0),
-    }
-    if model_config.get("stop") is not None:
-        model_kwargs["stop"] = model_config.get("stop")
-    if model_config.get("kind") == "open_ai" and model_config.get("api_type") == "azure":
-        model_config = init_open_ai_from_config(model_config, credential=None)
-        if model_config["model"].startswith("gpt-3.5-turbo") or model_config["model"].startswith("gpt-35-turbo") or model_config["model"].startswith("gpt-4"):
-            logger.info(f"Initializing AzureChatOpenAI with model {model_config['model']} with kwargs: {model_kwargs}")
-
-            llm = AzureChatOpenAI(
-                deployment_name=model_config["deployment"],
-                model=model_config["model"],
-                max_tokens=model_config.get("max_tokens"),
-                model_kwargs=model_kwargs,
-                openai_api_key=model_config.get("api_key"),
-                openai_api_base=model_config.get("api_base"),
-                openai_api_type=model_config.get("api_type"),
-                openai_api_version=model_config.get("api_version"),
-                max_retries=model_config.get("max_retries", 3),
-                default_headers={USER_AGENT_HEADER_KEY: USER_AGENT},
-                **kwargs
-            )  # type: ignore
-            if model_config.get("temperature", None) is not None:
-                llm.temperature = model_config.get("temperature")
-        else:
-            logger.info(f"Initializing AzureOpenAI with model {model_config['model']} with kwargs: {model_kwargs}")
-
-            llm = AzureOpenAI(
-                deployment_name=model_config["deployment"],
-                model=model_config["model"],
-                max_tokens=model_config.get("max_tokens"),
-                model_kwargs=model_kwargs,
-                openai_api_key=model_config.get("api_key"),
-                max_retries=model_config.get("max_retries", 3),
-                default_headers={USER_AGENT_HEADER_KEY: USER_AGENT},
-                **kwargs
-            )  # type: ignore
-            if model_config.get("temperature", None) is not None:
-                llm.temperature = model_config.get("temperature")
-    elif model_config.get("kind") == "open_ai" and model_config.get("api_type") == "open_ai":
-        logger.info(f"Initializing OpenAI with model {model_config['model']} 
with kwargs: {model_kwargs}") - model_config = init_open_ai_from_config(model_config, credential=None) - llm = ChatOpenAI( - model=model_config["model"], - max_tokens=model_config.get("max_tokens"), - model_kwargs=model_kwargs, - openai_api_key=model_config.get("api_key"), - default_headers={USER_AGENT_HEADER_KEY: USER_AGENT}, - **kwargs - ) # type: ignore - if model_config.get("temperature", None) is not None: - llm.temperature = model_config.get("temperature") - else: - raise ValueError(f"Unsupported llm kind: {model_config.get('kind')}") - - return llm \ No newline at end of file + return config \ No newline at end of file diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py index 93886f0a22dc..0224afff459f 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/connections.py @@ -27,16 +27,6 @@ logger = get_logger("connections") -def get_pinecone_environment(config, credential: Optional[TokenCredential] = None): - """Get the Pinecone project environment from a connection.""" - connection_type = config.get("connection_type", None) - if connection_type != "workspace_connection": - raise ValueError(f"Unsupported connection type for Pinecone index: {connection_type}") - - connection_id = config.get("connection", {}).get("id") - connection = get_connection_by_id_v2(connection_id, credential=credential) - return get_metadata_from_connection(connection)["environment"] - def get_connection_credential(config, credential: Optional[TokenCredential] = None): """Get a credential for a connection.""" diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py index da74a9abd2bf..a47ff50e0013 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_index/_utils/logging.py @@ -229,7 +229,7 @@ def _try_get_run_info(): info["location"] = location try: from azureml.core import Run - run: Run = Run.get_context() # type: ignore[annotation-unchecked] + run: Run = Run.get_context() if hasattr(run, "experiment"): info["parent_run_id"] = run.properties.get("azureml.pipelinerunid", "Unknown") info["mlIndexAssetKind"] = run.properties.get("azureml.mlIndexAssetKind", "Unknown") diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py b/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py index 60288bf64701..d8330340d105 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/_utils/_ai_client_utils.py @@ -10,6 +10,7 @@ from azure.ai.ml._file_utils.file_utils import traverse_up_path_and_find_file from azure.ai.ml.exceptions import ErrorCategory, ErrorTarget, ValidationException +from azure.ai.ml.constants._common import LOCAL_PATH def find_config_file_path( path: Optional[Union[os.PathLike, str]] = None, diff --git a/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py b/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py index 36d224eeffbe..08204339d176 100644 --- a/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py +++ b/sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py @@ -361,7 +361,7 @@ def build_index_on_cloud( source=IndexSource( input_data=Data( 
type="uri_folder", - path=".", + path="", ), input_glob=input_glob, chunk_size=chunk_size, diff --git a/sdk/ai/azure-ai-resources/cspell.json b/sdk/ai/azure-ai-resources/cspell.json index 586a18e8c44d..e37ee76a4934 100644 --- a/sdk/ai/azure-ai-resources/cspell.json +++ b/sdk/ai/azure-ai-resources/cspell.json @@ -1,3 +1,3 @@ { - "ignoreWords": ["redef", "llms"] + "ignoreWords": ["redef"] }