diff --git a/sdk/ai/azure-ai-projects/README.md b/sdk/ai/azure-ai-projects/README.md index 48e321c765dc..9167efc97eb5 100644 --- a/sdk/ai/azure-ai-projects/README.md +++ b/sdk/ai/azure-ai-projects/README.md @@ -254,16 +254,19 @@ dataset: DatasetVersion = project_client.datasets.upload_file( name=dataset_name, version=dataset_version_1, file_path=data_file, + connection_name=connection_name, ) print(dataset) print( - f"Upload all files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files." + f"Upload files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files." ) dataset = project_client.datasets.upload_folder( name=dataset_name, version=dataset_version_2, folder=data_folder, + connection_name=connection_name, + file_pattern=re.compile(r"\.(txt|csv|md)$", re.IGNORECASE), ) print(dataset) @@ -271,15 +274,9 @@ print(f"Get an existing Dataset version `{dataset_version_1}`:") dataset = project_client.datasets.get(name=dataset_name, version=dataset_version_1) print(dataset) -""" -TODO: TypeSpec needs to be fixed for this to work. "body" should be removed. print(f"Get credentials of an existing Dataset version `{dataset_version_1}`:") -asset_credential = project_client.datasets.get_credentials( - name=dataset_name, - version=dataset_version_1, - body=None) +asset_credential = project_client.datasets.get_credentials(name=dataset_name, version=dataset_version_1) print(asset_credential) -""" print("List latest versions of all Datasets:") for dataset in project_client.datasets.list(): diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/aio/_patch.py b/sdk/ai/azure-ai-projects/azure/ai/projects/aio/_patch.py index 533e9e00191f..9e68a875bf47 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/aio/_patch.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/aio/_patch.py @@ -6,6 +6,7 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ +import os from typing import List, Any, TYPE_CHECKING from typing_extensions import Self from azure.core.credentials_async import AsyncTokenCredential @@ -17,6 +18,21 @@ # pylint: disable=unused-import,ungrouped-imports from azure.ai.agents.aio import AgentsClient +_console_logging_enabled: bool = os.environ.get("ENABLE_AZURE_AI_PROJECTS_CONSOLE_LOGGING", "False").lower() in ( + "true", + "1", + "yes", +) +if _console_logging_enabled: + import sys + import logging + + azure_logger = logging.getLogger("azure") + azure_logger.setLevel(logging.DEBUG) + azure_logger.addHandler(logging.StreamHandler(stream=sys.stdout)) + identity_logger = logging.getLogger("azure.identity") + identity_logger.setLevel(logging.ERROR) + class AIProjectClient(AIProjectClientGenerated): # pylint: disable=too-many-instance-attributes """AIProjectClient. 
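Note: the module-level gate added above turns on SDK console logging when the `ENABLE_AZURE_AI_PROJECTS_CONSOLE_LOGGING` environment variable is truthy, and the client `__init__` below defaults `logging_enable` to the same value. A minimal sketch of opting in from application code, assuming the variable name introduced in this patch and the `PROJECT_ENDPOINT` variable used by the samples:

```python
import asyncio
import os

# Set the flag before azure.ai.projects.aio is imported so the
# module-level check in _patch.py sees it.
os.environ["ENABLE_AZURE_AI_PROJECTS_CONSOLE_LOGGING"] = "true"

from azure.identity.aio import DefaultAzureCredential
from azure.ai.projects.aio import AIProjectClient


async def main() -> None:
    async with DefaultAzureCredential() as credential:
        # logging_enable now defaults to the environment-variable value; an
        # explicit keyword argument still overrides the setdefault call.
        async with AIProjectClient(
            endpoint=os.environ["PROJECT_ENDPOINT"], credential=credential
        ) as project_client:
            async for dataset in project_client.datasets.list():
                print(dataset.name)


if __name__ == "__main__":
    asyncio.run(main())
```

Passing `logging_enable` explicitly when constructing the client takes precedence over the environment variable, since `setdefault` only applies when the keyword is absent.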
@@ -55,6 +71,8 @@ class AIProjectClient(AIProjectClientGenerated): # pylint: disable=too-many-ins def __init__(self, endpoint: str, credential: AsyncTokenCredential, **kwargs: Any) -> None: + kwargs.setdefault("logging_enable", _console_logging_enabled) + self._kwargs = kwargs.copy() self._patched_user_agent = _patch_user_agent(self._kwargs.pop("user_agent", None)) diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/aio/operations/_patch_datasets_async.py b/sdk/ai/azure-ai-projects/azure/ai/projects/aio/operations/_patch_datasets_async.py index 305c16f07961..1e0950397cb1 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/aio/operations/_patch_datasets_async.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/aio/operations/_patch_datasets_async.py @@ -7,9 +7,12 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ +import os +import re import logging -from typing import Any, Tuple +from typing import Any, Tuple, Optional from pathlib import Path +from urllib.parse import urlsplit from azure.storage.blob.aio import ContainerClient from azure.core.tracing.decorator_async import distributed_trace_async @@ -22,7 +25,6 @@ PendingUploadType, PendingUploadResponse, ) -from ...models._enums import CredentialType logger = logging.getLogger(__name__) @@ -43,64 +45,42 @@ async def _create_dataset_and_get_its_container_client( self, name: str, input_version: str, + connection_name: Optional[str] = None, ) -> Tuple[ContainerClient, str]: pending_upload_response: PendingUploadResponse = await self.pending_upload( name=name, version=input_version, - body=PendingUploadRequest(pending_upload_type=PendingUploadType.BLOB_REFERENCE), + body=PendingUploadRequest( + pending_upload_type=PendingUploadType.BLOB_REFERENCE, + connection_name=connection_name, + ), ) output_version: str = input_version if not pending_upload_response.blob_reference: - raise ValueError("Blob reference for consumption is not present") - if not pending_upload_response.blob_reference.credential.type: - raise ValueError("Credential type is not present") - if pending_upload_response.blob_reference.credential.type != CredentialType.SAS: - raise ValueError("Credential type is not SAS") - if not pending_upload_response.blob_reference.blob_uri: - raise ValueError("Blob URI is not present or empty") - - if logger.getEffectiveLevel() == logging.DEBUG: - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.pending_upload_id = %s.", - pending_upload_response.pending_upload_id, - ) - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.pending_upload_type = %s.", - pending_upload_response.pending_upload_type, - ) # == PendingUploadType.BLOB_REFERENCE - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.blob_uri = %s.", - pending_upload_response.blob_reference.blob_uri, - ) # Hosted on behalf of (HOBO) not visible to the user. If the form of: "https://.blob.core.windows.net/?" 
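Note: the rewritten `_create_dataset_and_get_its_container_client` helper now validates the SAS credential returned by `pending_upload` and builds the container client from `credential.sas_uri` instead of `blob_uri`; the erroneous `await` on `from_container_url` is also dropped, since that classmethod is a plain constructor even in the aio package. A stand-alone sketch of the same storage call, with a hypothetical `sas_url` standing in for `blob_reference.credential.sas_uri`:

```python
from azure.storage.blob.aio import ContainerClient


async def upload_with_sas(sas_url: str, blob_name: str, local_path: str) -> str:
    """Upload one local file to the container addressed by a SAS URL."""
    # from_container_url is a synchronous classmethod, so it is not awaited,
    # matching the fix in this patch.
    async with ContainerClient.from_container_url(container_url=sas_url) as container_client:
        with open(local_path, "rb") as data:
            blob_client = await container_client.upload_blob(name=blob_name, data=data)
            return blob_client.url  # still carries the SAS query string at this point
```

The dataset helpers strip the SAS query string from this URL before persisting it as `data_uri`, as shown further down in the diff.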
- logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.storage_account_arm_id = %s.", - pending_upload_response.blob_reference.storage_account_arm_id, - ) # /subscriptions/<>/resourceGroups/<>/Microsoft.Storage/accounts/<> - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.credential.sas_uri = %s.", - pending_upload_response.blob_reference.credential.sas_uri, - ) - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.credential.type = %s.", - pending_upload_response.blob_reference.credential.type, - ) # == CredentialType.SAS + raise ValueError("Blob reference is not present") + if not pending_upload_response.blob_reference.credential: + raise ValueError("SAS credential are not present") + if not pending_upload_response.blob_reference.credential.sas_uri: + raise ValueError("SAS URI is missing or empty") # For overview on Blob storage SDK in Python see: # https://learn.microsoft.com/azure/storage/blobs/storage-quickstart-blobs-python # https://learn.microsoft.com/azure/storage/blobs/storage-blob-upload-python - # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-from-container-url + # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-from-container-url return ( - await ContainerClient.from_container_url( - container_url=pending_upload_response.blob_reference.blob_uri, # Of the form: "https://.blob.core.windows.net/?" + ContainerClient.from_container_url( + container_url=pending_upload_response.blob_reference.credential.sas_uri, # Of the form: "https://.blob.core.windows.net/?" ), output_version, ) @distributed_trace_async - async def upload_file(self, *, name: str, version: str, file_path: str, **kwargs: Any) -> DatasetVersion: + async def upload_file( + self, *, name: str, version: str, file_path: str, connection_name: Optional[str] = None, **kwargs: Any + ) -> DatasetVersion: """Upload file to a blob storage, and create a dataset that references this file. This method uses the `ContainerClient.upload_blob` method from the azure-storage-blob package to upload the file. Any keyword arguments provided will be passed to the `upload_blob` method. @@ -111,6 +91,9 @@ async def upload_file(self, *, name: str, version: str, file_path: str, **kwargs :paramtype version: str :keyword file_path: The file name (including optional path) to be uploaded. Required. :paramtype file_path: str + :keyword connection_name: The name of an Azure Storage Account connection, where the file should be uploaded. + If not specified, the default Azure Storage Account connection will be used. Optional. + :paramtype connection_name: str :return: The created dataset version. :rtype: ~azure.ai.projects.models.DatasetVersion :raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request. @@ -123,7 +106,9 @@ async def upload_file(self, *, name: str, version: str, file_path: str, **kwargs raise ValueError("The provided file is actually a folder. 
Use method `upload_folder` instead") container_client, output_version = await self._create_dataset_and_get_its_container_client( - name=name, input_version=version + name=name, + input_version=version, + connection_name=connection_name, ) async with container_client: @@ -137,25 +122,37 @@ async def upload_file(self, *, name: str, version: str, file_path: str, **kwargs blob_name, ) - # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-upload-blob - with await container_client.upload_blob(name=blob_name, data=data, **kwargs) as blob_client: - + # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-upload-blob + async with await container_client.upload_blob(name=blob_name, data=data, **kwargs) as blob_client: logger.debug("[upload_file] Done uploading") + # Remove the SAS token from the URL (remove all query strings). + # The resulting format should be "https://.blob.core.windows.net//" + data_uri = urlsplit(blob_client.url)._replace(query="").geturl() + dataset_version = await self.create_or_update( name=name, version=output_version, body=FileDatasetVersion( # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url # Per above doc the ".url" contains SAS token... should this be stripped away? - data_uri=blob_client.url, # ".blob.windows.core.net//" + data_uri=data_uri, ), ) return dataset_version @distributed_trace_async - async def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: Any) -> DatasetVersion: + async def upload_folder( + self, + *, + name: str, + version: str, + folder: str, + connection_name: Optional[str] = None, + file_pattern: Optional[re.Pattern] = None, + **kwargs: Any, + ) -> DatasetVersion: """Upload all files in a folder and its sub folders to a blob storage, while maintaining relative paths, and create a dataset that references this folder. This method uses the `ContainerClient.upload_blob` method from the azure-storage-blob package @@ -166,7 +163,13 @@ async def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: :keyword version: The version identifier for the dataset. Required. :paramtype version: str :keyword folder: The folder name (including optional path) to be uploaded. Required. - :paramtype file: str + :paramtype folder: str + :keyword connection_name: The name of an Azure Storage Account connection, where the file should be uploaded. + If not specified, the default Azure Storage Account connection will be used. Optional. + :paramtype connection_name: str + :keyword file_pattern: A regex pattern to filter files to be uploaded. Only files matching the pattern + will be uploaded. Optional. + :paramtype file_pattern: re.Pattern :return: The created dataset version. :rtype: ~azure.ai.projects.models.DatasetVersion :raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request. @@ -178,40 +181,43 @@ async def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: raise ValueError("The provided folder is actually a file. 
Use method `upload_file` instead.") container_client, output_version = await self._create_dataset_and_get_its_container_client( - name=name, input_version=version + name=name, input_version=version, connection_name=connection_name ) async with container_client: # Recursively traverse all files in the folder files_uploaded: bool = False - for file_path in path_folder.rglob("*"): # `rglob` matches all files and folders recursively - if file_path.is_file(): # Check if the path is a file. Skip folders. - blob_name = file_path.relative_to(path_folder) # Blob name relative to the folder + for root, _, files in os.walk(folder): + for file in files: + if file_pattern and not file_pattern.search(file): + continue # Skip files that do not match the pattern + file_path = os.path.join(root, file) + blob_name = os.path.relpath(file_path, folder).replace("\\", "/") # Ensure correct format for Azure logger.debug( "[upload_folder] Start uploading file `%s` as blob `%s`.", file_path, blob_name, ) - with file_path.open( - "rb" - ) as data: # Open the file for reading in binary mode # TODO: async version? - # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-upload-blob - container_client.upload_blob(name=str(blob_name), data=data, **kwargs) - logger.debug("[upload_folder] Done uploaded.") + with open(file=file_path, mode="rb") as data: # Open the file for reading in binary mode + # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-upload-blob + await container_client.upload_blob(name=str(blob_name), data=data, **kwargs) + logger.debug("[upload_folder] Done uploading file") files_uploaded = True + logger.debug("[upload_folder] Done uploaded.") if not files_uploaded: raise ValueError("The provided folder is empty.") + # Remove the SAS token from the URL (remove all query strings). + # The resulting format should be "https://.blob.core.windows.net/" + # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-url + data_uri = urlsplit(container_client.url)._replace(query="").geturl() + dataset_version = await self.create_or_update( name=name, version=output_version, - body=FolderDatasetVersion( - # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url - # Per above doc the ".url" contains SAS token... should this be stripped away? - data_uri=container_client.url, # ".blob.windows.core.net/ ?" - ), + body=FolderDatasetVersion(data_uri=data_uri), ) return dataset_version diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_models.py b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_models.py index fcae19c3e4e7..11e0872870e8 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/models/_models.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/models/_models.py @@ -559,8 +559,7 @@ class BlobReference(_Model): """Blob reference details. :ivar blob_uri: Blob URI path for client to upload data. Example: - `https://blob.windows.core.net/Container/Path `_. - Required. + ``https://blob.windows.core.net/Container/Path``. Required. :vartype blob_uri: str :ivar storage_account_arm_id: ARM ID of the storage account to use. Required. 
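Note on the new `file_pattern` keyword accepted by `upload_folder`: per the `os.walk` loop above, the pattern is applied with `re.Pattern.search` to the bare file name rather than the relative path, so anchoring on the extension is the typical use. A small illustration using the same pattern as the updated samples; the candidate names are made up:

```python
import re

file_pattern = re.compile(r"\.(txt|csv|md)$", re.IGNORECASE)

candidates = ["notes.TXT", "data.csv", "image.png", "README.md", "archive.csv.bak"]
selected = [name for name in candidates if file_pattern.search(name)]
print(selected)  # ['notes.TXT', 'data.csv', 'README.md']
```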
:vartype storage_account_arm_id: str @@ -569,8 +568,7 @@ class BlobReference(_Model): """ blob_uri: str = rest_field(name="blobUri", visibility=["read", "create", "update", "delete", "query"]) - """Blob URI path for client to upload data. Example: `https://blob.windows.core.net/Container/Path - `_. Required.""" + """Blob URI path for client to upload data. Example: ``https://blob.windows.core.net/Container/Path``. Required.""" storage_account_arm_id: str = rest_field( name="storageAccountArmId", visibility=["read", "create", "update", "delete", "query"] ) @@ -740,14 +738,17 @@ class DatasetVersion(_Model): You probably want to use the sub-classes and not this class directly. Known sub-classes are: FileDatasetVersion, FolderDatasetVersion - :ivar data_uri: URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330 - `_. Required. + :ivar data_uri: URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required. :vartype data_uri: str :ivar type: Dataset type. Required. Known values are: "uri_file" and "uri_folder". :vartype type: str or ~azure.ai.projects.models.DatasetType - :ivar is_reference: Indicates if dataset is reference only or managed by dataset service. If - true, the underlying data will be deleted when the dataset version is deleted. + :ivar is_reference: Indicates if the dataset holds a reference to the storage, or the dataset + manages storage itself. If true, the underlying data will not be deleted when the dataset + version is deleted. :vartype is_reference: bool + :ivar connection_name: The Azure Storage Account connection name. Required if + startPendingUploadVersion was not called before creating the Dataset. + :vartype connection_name: str :ivar id: Asset ID, a unique identifier for the asset. :vartype id: str :ivar name: The name of the resource. Required. @@ -762,13 +763,15 @@ class DatasetVersion(_Model): __mapping__: Dict[str, _Model] = {} data_uri: str = rest_field(name="dataUri", visibility=["read", "create"]) - """URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330 - `_. Required.""" + """URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required.""" type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) """Dataset type. Required. Known values are: \"uri_file\" and \"uri_folder\".""" is_reference: Optional[bool] = rest_field(name="isReference", visibility=["read"]) - """Indicates if dataset is reference only or managed by dataset service. If true, the underlying - data will be deleted when the dataset version is deleted.""" + """Indicates if the dataset holds a reference to the storage, or the dataset manages storage + itself. If true, the underlying data will not be deleted when the dataset version is deleted.""" + connection_name: Optional[str] = rest_field(name="connectionName", visibility=["read", "create"]) + """The Azure Storage Account connection name. Required if startPendingUploadVersion was not called + before creating the Dataset.""" id: Optional[str] = rest_field(visibility=["read"]) """Asset ID, a unique identifier for the asset.""" name: str = rest_field(visibility=["read"]) @@ -786,6 +789,7 @@ def __init__( *, data_uri: str, type: str, + connection_name: Optional[str] = None, description: Optional[str] = None, tags: Optional[Dict[str, str]] = None, ) -> None: ... 
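Note: with `connection_name` now writable on the dataset models, a dataset version can be registered directly over an existing blob without going through `upload_file`. A hedged sketch; the endpoint, connection, and data URI values are placeholders, and per the docstring above `connection_name` is required when no pending upload was started for the version:

```python
import os

from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import FileDatasetVersion

with AIProjectClient(
    endpoint=os.environ["PROJECT_ENDPOINT"], credential=DefaultAzureCredential()
) as project_client:
    dataset = project_client.datasets.create_or_update(
        name="dataset-test",
        version="1.0",
        body=FileDatasetVersion(
            # Placeholder URI of a blob that already exists in the connected storage account.
            data_uri="https://<account>.blob.core.windows.net/<container>/folder/sample_file.txt",
            connection_name=os.environ.get("CONNECTION_NAME", "<my-storage-connection>"),
        ),
    )
    print(dataset)
```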
@@ -1067,12 +1071,15 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class FileDatasetVersion(DatasetVersion, discriminator="uri_file"): """FileDatasetVersion Definition. - :ivar data_uri: URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330 - `_. Required. + :ivar data_uri: URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required. :vartype data_uri: str - :ivar is_reference: Indicates if dataset is reference only or managed by dataset service. If - true, the underlying data will be deleted when the dataset version is deleted. + :ivar is_reference: Indicates if the dataset holds a reference to the storage, or the dataset + manages storage itself. If true, the underlying data will not be deleted when the dataset + version is deleted. :vartype is_reference: bool + :ivar connection_name: The Azure Storage Account connection name. Required if + startPendingUploadVersion was not called before creating the Dataset. + :vartype connection_name: str :ivar id: Asset ID, a unique identifier for the asset. :vartype id: str :ivar name: The name of the resource. Required. @@ -1095,6 +1102,7 @@ def __init__( self, *, data_uri: str, + connection_name: Optional[str] = None, description: Optional[str] = None, tags: Optional[Dict[str, str]] = None, ) -> None: ... @@ -1113,12 +1121,15 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class FolderDatasetVersion(DatasetVersion, discriminator="uri_folder"): """FileDatasetVersion Definition. - :ivar data_uri: URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330 - `_. Required. + :ivar data_uri: URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required. :vartype data_uri: str - :ivar is_reference: Indicates if dataset is reference only or managed by dataset service. If - true, the underlying data will be deleted when the dataset version is deleted. + :ivar is_reference: Indicates if the dataset holds a reference to the storage, or the dataset + manages storage itself. If true, the underlying data will not be deleted when the dataset + version is deleted. :vartype is_reference: bool + :ivar connection_name: The Azure Storage Account connection name. Required if + startPendingUploadVersion was not called before creating the Dataset. + :vartype connection_name: str :ivar id: Asset ID, a unique identifier for the asset. :vartype id: str :ivar name: The name of the resource. Required. @@ -1141,6 +1152,7 @@ def __init__( self, *, data_uri: str, + connection_name: Optional[str] = None, description: Optional[str] = None, tags: Optional[Dict[str, str]] = None, ) -> None: ... @@ -1347,7 +1359,7 @@ class PendingUploadRequest(_Model): :ivar pending_upload_id: If PendingUploadId is not provided, a random GUID will be used. :vartype pending_upload_id: str - :ivar connection_name: Name of Azure blob storage connection to use for generating temporary + :ivar connection_name: Azure Storage Account connection name to use for generating temporary SAS token. :vartype connection_name: str :ivar pending_upload_type: BlobReference is the only supported type. Required. 
Blob Reference @@ -1362,7 +1374,7 @@ class PendingUploadRequest(_Model): connection_name: Optional[str] = rest_field( name="connectionName", visibility=["read", "create", "update", "delete", "query"] ) - """Name of Azure blob storage connection to use for generating temporary SAS token.""" + """Azure Storage Account connection name to use for generating temporary SAS token.""" pending_upload_type: Literal[PendingUploadType.BLOB_REFERENCE] = rest_field( name="pendingUploadType", visibility=["read", "create", "update", "delete", "query"] ) diff --git a/sdk/ai/azure-ai-projects/azure/ai/projects/operations/_patch_datasets.py b/sdk/ai/azure-ai-projects/azure/ai/projects/operations/_patch_datasets.py index 6f9bbc609e39..4cacc6e286dc 100644 --- a/sdk/ai/azure-ai-projects/azure/ai/projects/operations/_patch_datasets.py +++ b/sdk/ai/azure-ai-projects/azure/ai/projects/operations/_patch_datasets.py @@ -7,9 +7,12 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ +import os +import re import logging -from typing import Any, Tuple +from typing import Any, Tuple, Optional from pathlib import Path +from urllib.parse import urlsplit from azure.storage.blob import ContainerClient from azure.core.tracing.decorator import distributed_trace from ._operations import DatasetsOperations as DatasetsOperationsGenerated @@ -21,7 +24,6 @@ PendingUploadType, PendingUploadResponse, ) -from ..models._enums import CredentialType logger = logging.getLogger(__name__) @@ -42,49 +44,25 @@ def _create_dataset_and_get_its_container_client( self, name: str, input_version: str, + connection_name: Optional[str] = None, ) -> Tuple[ContainerClient, str]: pending_upload_response: PendingUploadResponse = self.pending_upload( name=name, version=input_version, - body=PendingUploadRequest(pending_upload_type=PendingUploadType.BLOB_REFERENCE), + body=PendingUploadRequest( + pending_upload_type=PendingUploadType.BLOB_REFERENCE, + connection_name=connection_name, + ), ) output_version: str = input_version if not pending_upload_response.blob_reference: - raise ValueError("Blob reference for consumption is not present") - if not pending_upload_response.blob_reference.credential.type: - raise ValueError("Credential type is not present") - if pending_upload_response.blob_reference.credential.type != CredentialType.SAS: - raise ValueError("Credential type is not SAS") - if not pending_upload_response.blob_reference.blob_uri: - raise ValueError("Blob URI is not present or empty") - - if logger.getEffectiveLevel() == logging.DEBUG: - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.pending_upload_id = %s.", - pending_upload_response.pending_upload_id, - ) - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.pending_upload_type = %s.", - pending_upload_response.pending_upload_type, - ) # == PendingUploadType.BLOB_REFERENCE - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.blob_uri = %s.", - pending_upload_response.blob_reference.blob_uri, - ) # Hosted on behalf of (HOBO) not visible to the user. If the form of: "https://.blob.core.windows.net/?" 
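Note: both upload helpers now forward `connection_name` into `PendingUploadRequest`, which is what scopes the temporary SAS to a specific Storage Account connection. For completeness, a sketch of making that lower-level call yourself; the dataset name and version are placeholders:

```python
import os

from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient
from azure.ai.projects.models import PendingUploadRequest, PendingUploadType

with AIProjectClient(
    endpoint=os.environ["PROJECT_ENDPOINT"], credential=DefaultAzureCredential()
) as project_client:
    response = project_client.datasets.pending_upload(
        name="dataset-test",
        version="1.0",
        body=PendingUploadRequest(
            pending_upload_type=PendingUploadType.BLOB_REFERENCE,
            connection_name=os.environ.get("CONNECTION_NAME"),
        ),
    )
    # This SAS URI is what the helpers pass to ContainerClient.from_container_url.
    print(response.blob_reference.credential.sas_uri)
```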
- logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.storage_account_arm_id = %s.", - pending_upload_response.blob_reference.storage_account_arm_id, - ) # /subscriptions/<>/resourceGroups/<>/Microsoft.Storage/accounts/<> - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.credential.sas_uri = %s.", - pending_upload_response.blob_reference.credential.sas_uri, - ) - logger.debug( - "[_create_dataset_and_get_its_container_client] pending_upload_response.blob_reference.credential.type = %s.", - pending_upload_response.blob_reference.credential.type, - ) # == CredentialType.SAS + raise ValueError("Blob reference is not present") + if not pending_upload_response.blob_reference.credential: + raise ValueError("SAS credential are not present") + if not pending_upload_response.blob_reference.credential.sas_uri: + raise ValueError("SAS URI is missing or empty") # For overview on Blob storage SDK in Python see: # https://learn.microsoft.com/azure/storage/blobs/storage-quickstart-blobs-python @@ -93,13 +71,15 @@ def _create_dataset_and_get_its_container_client( # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-from-container-url return ( ContainerClient.from_container_url( - container_url=pending_upload_response.blob_reference.blob_uri, # Of the form: "https://.blob.core.windows.net/?" + container_url=pending_upload_response.blob_reference.credential.sas_uri # Of the form: "https://.blob.core.windows.net/?" ), output_version, ) @distributed_trace - def upload_file(self, *, name: str, version: str, file_path: str, **kwargs: Any) -> DatasetVersion: + def upload_file( + self, *, name: str, version: str, file_path: str, connection_name: Optional[str] = None, **kwargs: Any + ) -> DatasetVersion: """Upload file to a blob storage, and create a dataset that references this file. This method uses the `ContainerClient.upload_blob` method from the azure-storage-blob package to upload the file. Any keyword arguments provided will be passed to the `upload_blob` method. @@ -110,6 +90,9 @@ def upload_file(self, *, name: str, version: str, file_path: str, **kwargs: Any) :paramtype version: str :keyword file_path: The file name (including optional path) to be uploaded. Required. :paramtype file_path: str + :keyword connection_name: The name of an Azure Storage Account connection, where the file should be uploaded. + If not specified, the default Azure Storage Account connection will be used. Optional. + :paramtype connection_name: str :return: The created dataset version. :rtype: ~azure.ai.projects.models.DatasetVersion :raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request. @@ -122,7 +105,7 @@ def upload_file(self, *, name: str, version: str, file_path: str, **kwargs: Any) raise ValueError("The provided file is actually a folder. 
Use method `upload_folder` instead") container_client, output_version = self._create_dataset_and_get_its_container_client( - name=name, input_version=version + name=name, input_version=version, connection_name=connection_name ) with container_client: @@ -138,23 +121,35 @@ def upload_file(self, *, name: str, version: str, file_path: str, **kwargs: Any) # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-upload-blob with container_client.upload_blob(name=blob_name, data=data, **kwargs) as blob_client: - logger.debug("[upload_file] Done uploading") + # Remove the SAS token from the URL (remove all query strings). + # The resulting format should be "https://.blob.core.windows.net//" + data_uri = urlsplit(blob_client.url)._replace(query="").geturl() + dataset_version = self.create_or_update( name=name, version=output_version, body=FileDatasetVersion( # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url # Per above doc the ".url" contains SAS token... should this be stripped away? - data_uri=blob_client.url, # ".blob.windows.core.net//" + data_uri=data_uri, ), ) return dataset_version @distributed_trace - def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: Any) -> DatasetVersion: + def upload_folder( + self, + *, + name: str, + version: str, + folder: str, + connection_name: Optional[str] = None, + file_pattern: Optional[re.Pattern] = None, + **kwargs: Any, + ) -> DatasetVersion: """Upload all files in a folder and its sub folders to a blob storage, while maintaining relative paths, and create a dataset that references this folder. This method uses the `ContainerClient.upload_blob` method from the azure-storage-blob package @@ -165,7 +160,13 @@ def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: Any) :keyword version: The version identifier for the dataset. Required. :paramtype version: str :keyword folder: The folder name (including optional path) to be uploaded. Required. - :paramtype file: str + :paramtype folder: str + :keyword connection_name: The name of an Azure Storage Account connection, where the file should be uploaded. + If not specified, the default Azure Storage Account connection will be used. Optional. + :paramtype connection_name: str + :keyword file_pattern: A regex pattern to filter files to be uploaded. Only files matching the pattern + will be uploaded. Optional. + :paramtype file_pattern: re.Pattern :return: The created dataset version. :rtype: ~azure.ai.projects.models.DatasetVersion :raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request. @@ -177,39 +178,45 @@ def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: Any) raise ValueError("The provided folder is actually a file. Use method `upload_file` instead.") container_client, output_version = self._create_dataset_and_get_its_container_client( - name=name, input_version=version + name=name, + input_version=version, + connection_name=connection_name, ) with container_client: # Recursively traverse all files in the folder files_uploaded: bool = False - for file_path in path_folder.rglob("*"): # `rglob` matches all files and folders recursively - if file_path.is_file(): # Check if the path is a file. Skip folders. 
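Note: the `data_uri` stored on the dataset is now the blob (or container) URL with the SAS query string removed via `urlsplit`. A quick illustration with a made-up URL:

```python
from urllib.parse import urlsplit

blob_url = (
    "https://myaccount.blob.core.windows.net/mycontainer/folder/file.txt"
    "?sv=2025-01-01&se=2025-01-02T00%3A00%3A00Z&sig=REDACTED"
)
# SplitResult is a named tuple, so _replace(query="") drops the token while
# keeping scheme, host, and path intact.
data_uri = urlsplit(blob_url)._replace(query="").geturl()
print(data_uri)  # https://myaccount.blob.core.windows.net/mycontainer/folder/file.txt
```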
- blob_name = file_path.relative_to(path_folder) # Blob name relative to the folder + for root, _, files in os.walk(folder): + for file in files: + if file_pattern and not file_pattern.search(file): + continue # Skip files that do not match the pattern + file_path = os.path.join(root, file) + blob_name = os.path.relpath(file_path, folder).replace("\\", "/") # Ensure correct format for Azure logger.debug( "[upload_folder] Start uploading file `%s` as blob `%s`.", file_path, blob_name, ) - with file_path.open("rb") as data: # Open the file for reading in binary mode - # TODO: Is there an upload_folder? + with open(file=file_path, mode="rb") as data: # Open the file for reading in binary mode # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-upload-blob container_client.upload_blob(name=str(blob_name), data=data, **kwargs) - logger.debug("[upload_folder] Done uploaded.") + logger.debug("[upload_folder] Done uploading file") files_uploaded = True + logger.debug("[upload_folder] Done uploaded.") if not files_uploaded: raise ValueError("The provided folder is empty.") + # Remove the SAS token from the URL (remove all query strings). + # The resulting format should be "https://.blob.core.windows.net/" + # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-url + data_uri = urlsplit(container_client.url)._replace(query="").geturl() + dataset_version = self.create_or_update( name=name, version=output_version, - body=FolderDatasetVersion( - # See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url - # Per above doc the ".url" contains SAS token... should this be stripped away? - data_uri=container_client.url, # ".blob.windows.core.net/ ?" 
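Note: in the `os.walk` rewrite, blob names are computed relative to the uploaded folder and Windows path separators are normalized, so nested files keep their layout inside the container. A tiny sketch of that computation with placeholder paths:

```python
import os

folder = os.path.join("data", "upload")
file_path = os.path.join(folder, "sub", "notes.txt")

blob_name = os.path.relpath(file_path, folder).replace("\\", "/")
print(blob_name)  # sub/notes.txt on both Windows and POSIX
```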
- ), + body=FolderDatasetVersion(data_uri=data_uri), ) return dataset_version diff --git a/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations.py b/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations.py index c148f824e765..bdd6a44c053b 100644 --- a/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations.py +++ b/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations.py @@ -68,6 +68,7 @@ def test_datasets_create_or_update(self, aiproject_endpoint): "name": "str", "type": "uri_file", "version": "str", + "connectionName": "str", "description": "str", "id": "str", "isReference": bool, diff --git a/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations_async.py b/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations_async.py index 1aafb652e6b4..6db1ecba7504 100644 --- a/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations_async.py +++ b/sdk/ai/azure-ai-projects/generated_tests/test_ai_project_datasets_operations_async.py @@ -69,6 +69,7 @@ async def test_datasets_create_or_update(self, aiproject_endpoint): "name": "str", "type": "uri_file", "version": "str", + "connectionName": "str", "description": "str", "id": "str", "isReference": bool, diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets.py index 7881a0bb23a4..06d4f6e7070b 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets.py @@ -20,18 +20,21 @@ Set these environment variables with your own values: 1) PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your Azure AI Foundry project. - 2) DATASET_NAME - Optional. The name of the Dataset to create and use in this sample. - 3) DATASET_VERSION_1 - Optional. The first version of the Dataset to create and use in this sample. - 4) DATASET_VERSION_2 - Optional. The second version of the Dataset to create and use in this sample. - 5) DATA_FOLDER - Optional. The folder path where the data files for upload are located. + 2) CONNECTION_NAME - Required. The name of the Azure Storage Account connection to use for uploading files. + 3) DATASET_NAME - Optional. The name of the Dataset to create and use in this sample. + 4) DATASET_VERSION_1 - Optional. The first version of the Dataset to create and use in this sample. + 5) DATASET_VERSION_2 - Optional. The second version of the Dataset to create and use in this sample. + 6) DATA_FOLDER - Optional. The folder path where the data files for upload are located. """ import os +import re from azure.identity import DefaultAzureCredential from azure.ai.projects import AIProjectClient from azure.ai.projects.models import DatasetVersion endpoint = os.environ["PROJECT_ENDPOINT"] +connection_name = os.environ["CONNECTION_NAME"] dataset_name = os.environ.get("DATASET_NAME", "dataset-test") dataset_version_1 = os.environ.get("DATASET_VERSION_1", "1.0") dataset_version_2 = os.environ.get("DATASET_VERSION_2", "2.0") @@ -53,16 +56,19 @@ name=dataset_name, version=dataset_version_1, file_path=data_file, + connection_name=connection_name, ) print(dataset) print( - f"Upload all files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files." 
+ f"Upload files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files." ) dataset = project_client.datasets.upload_folder( name=dataset_name, version=dataset_version_2, folder=data_folder, + connection_name=connection_name, + file_pattern=re.compile(r"\.(txt|csv|md)$", re.IGNORECASE), ) print(dataset) @@ -70,15 +76,9 @@ dataset = project_client.datasets.get(name=dataset_name, version=dataset_version_1) print(dataset) - """ - TODO: TypeSpec needs to be fixed for this to work. "body" should be removed. print(f"Get credentials of an existing Dataset version `{dataset_version_1}`:") - asset_credential = project_client.datasets.get_credentials( - name=dataset_name, - version=dataset_version_1, - body=None) + asset_credential = project_client.datasets.get_credentials(name=dataset_name, version=dataset_version_1) print(asset_credential) - """ print("List latest versions of all Datasets:") for dataset in project_client.datasets.list(): diff --git a/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets_async.py b/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets_async.py index cd921590e5b3..8fbff37e0dda 100644 --- a/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets_async.py +++ b/sdk/ai/azure-ai-projects/samples/datasets/sample_datasets_async.py @@ -20,14 +20,16 @@ Set these environment variables with your own values: 1) PROJECT_ENDPOINT - Required. The Azure AI Project endpoint, as found in the overview page of your Azure AI Foundry project. - 2) DATASET_NAME - Optional. The name of the Dataset to create and use in this sample. - 3) DATASET_VERSION_1 - Optional. The first version of the Dataset to create and use in this sample. - 4) DATASET_VERSION_2 - Optional. The second version of the Dataset to create and use in this sample. - 5) DATA_FOLDER - Optional. The folder path where the data files for upload are located. + 2) CONNECTION_NAME - Required. The name of the Azure Storage Account connection to use for uploading files. + 3) DATASET_NAME - Optional. The name of the Dataset to create and use in this sample. + 4) DATASET_VERSION_1 - Optional. The first version of the Dataset to create and use in this sample. + 5) DATASET_VERSION_2 - Optional. The second version of the Dataset to create and use in this sample. + 6) DATA_FOLDER - Optional. The folder path where the data files for upload are located. """ import asyncio import os +import re from azure.identity.aio import DefaultAzureCredential from azure.ai.projects.aio import AIProjectClient from azure.ai.projects.models import DatasetVersion @@ -41,6 +43,7 @@ async def main() -> None: endpoint = os.environ["PROJECT_ENDPOINT"] + connection_name = os.environ["CONNECTION_NAME"] dataset_name = os.environ.get("DATASET_NAME", "dataset-test") dataset_version_1 = os.environ.get("DATASET_VERSION_1", "1.0") dataset_version_2 = os.environ.get("DATASET_VERSION_2", "2.0") @@ -56,16 +59,19 @@ async def main() -> None: name=dataset_name, version=dataset_version_1, file_path=data_file, + connection_name=connection_name, ) print(dataset) print( - f"Upload all files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files." + f"Upload files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files." 
) dataset = await project_client.datasets.upload_folder( name=dataset_name, version=dataset_version_2, folder=data_folder, + connection_name=connection_name, + file_pattern=re.compile(r"\.(txt|csv|md)$", re.IGNORECASE), ) print(dataset) @@ -73,6 +79,12 @@ async def main() -> None: dataset = await project_client.datasets.get(name=dataset_name, version=dataset_version_1) print(dataset) + print(f"Get credentials of an existing Dataset version `{dataset_version_1}`:") + asset_credential = await project_client.datasets.get_credentials( + name=dataset_name, version=dataset_version_1 + ) + print(asset_credential) + print("List latest versions of all Datasets:") async for dataset in project_client.datasets.list(): print(dataset) diff --git a/sdk/ai/azure-ai-projects/tests/samples/test_samples.py b/sdk/ai/azure-ai-projects/tests/samples/test_samples.py index 899e2840cc01..3e539b3c443e 100644 --- a/sdk/ai/azure-ai-projects/tests/samples/test_samples.py +++ b/sdk/ai/azure-ai-projects/tests/samples/test_samples.py @@ -16,9 +16,8 @@ class TestSamples: To run this test: * 'cd' to the folder '/sdk/ai/azure-ai-projects' in your azure-sdk-for-python repo. - * Define the environment variable PROJECT_ENDPOINT with the endpoint of the Azure AI Foundry project used for testing. - * You may also want to make sure ENABLE_AZURE_AI_PROJECTS_CONSOLE_LOGGING is not define (or defined to "false") - so you don't get too much console output. + * set PROJECT_ENDPOINT= - Define your Azure AI Foundry project endpoint used by the test. + * set ENABLE_AZURE_AI_PROJECTS_CONSOLE_LOGGING=false - to make sure logging is not enabled in the test, to reduce console spew. * Run: pytest tests/samples/test_samples.py::TestSamples * Load the resulting report in Excel: tests\samples\samples_report.csv """ @@ -48,18 +47,6 @@ def teardown_class(cls): exception_message = f'"{exception_string.splitlines()[0]}"' if exception_string else "" writer.writerow([f"{'PASS' if passed else 'FAIL'}", test_name, exception_message]) - """ - report_lines = [] - if len(cls._results) > 0: - for test_name, (passed, exception_string) in cls._results.items(): - exception_summary = f"\"{exception_string.splitlines()[0]}\"" if exception_string else "" - report_lines.append(f"{'PASS' if passed else 'FAIL'}, {test_name}, {exception_summary}") - report_content = "\n".join(report_lines) - report_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "samples_report.csv") - with open(report_path, "w", encoding="utf-8") as f: - f.write(report_content) - """ - @classmethod def _set_env_vars(cls, sample_name: str, **kwargs): """ diff --git a/sdk/ai/azure-ai-projects/tsp-location.yaml b/sdk/ai/azure-ai-projects/tsp-location.yaml index 7db567c71dd0..22421471ffc1 100644 --- a/sdk/ai/azure-ai-projects/tsp-location.yaml +++ b/sdk/ai/azure-ai-projects/tsp-location.yaml @@ -1,4 +1,4 @@ directory: specification/ai/Azure.AI.Projects -commit: 46ce90da0e24d277b7c1cb984ee77116138233e4 +commit: 07a63adf249cb199d5abd179448c92cd6e3446c8 repo: Azure/azure-rest-api-specs additionalDirectories:
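Note: a condensed, hedged recap of the updated synchronous sample flow, combining the new `connection_name` and `file_pattern` keywords with the simplified `get_credentials` call. Environment variable names follow the samples above; the `DATA_FOLDER` default here is a placeholder:

```python
import os
import re

from azure.identity import DefaultAzureCredential
from azure.ai.projects import AIProjectClient

with AIProjectClient(
    endpoint=os.environ["PROJECT_ENDPOINT"], credential=DefaultAzureCredential()
) as project_client:
    dataset = project_client.datasets.upload_folder(
        name=os.environ.get("DATASET_NAME", "dataset-test"),
        version=os.environ.get("DATASET_VERSION_2", "2.0"),
        folder=os.environ.get("DATA_FOLDER", "./data_folder"),
        connection_name=os.environ["CONNECTION_NAME"],
        file_pattern=re.compile(r"\.(txt|csv|md)$", re.IGNORECASE),
    )
    print(dataset)

    # `body` is no longer part of the get_credentials signature.
    asset_credential = project_client.datasets.get_credentials(
        name=dataset.name, version=dataset.version
    )
    print(asset_credential)
```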