Merged
Changes from 1 commit
dargilco committed May 10, 2025
commit 00cefdc5b2960595f6b4b0973f9fd9a2f9e1be8c
13 changes: 5 additions & 8 deletions sdk/ai/azure-ai-projects/README.md
@@ -254,32 +254,29 @@ dataset: DatasetVersion = project_client.datasets.upload_file(
name=dataset_name,
version=dataset_version_1,
file_path=data_file,
connection_name=connection_name,
)
print(dataset)

print(
f"Upload all files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files."
f"Upload files in a folder (including sub-folders) and create a new version `{dataset_version_2}` in the same Dataset, to reference the files."
)
dataset = project_client.datasets.upload_folder(
name=dataset_name,
version=dataset_version_2,
folder=data_folder,
connection_name=connection_name,
file_pattern=re.compile(r"\.(txt|csv|md)$", re.IGNORECASE),
)
print(dataset)

print(f"Get an existing Dataset version `{dataset_version_1}`:")
dataset = project_client.datasets.get(name=dataset_name, version=dataset_version_1)
print(dataset)

"""
TODO: TypeSpec needs to be fixed for this to work. "body" should be removed.
print(f"Get credentials of an existing Dataset version `{dataset_version_1}`:")
asset_credential = project_client.datasets.get_credentials(
name=dataset_name,
version=dataset_version_1,
body=None)
asset_credential = project_client.datasets.get_credentials(name=dataset_name, version=dataset_version_1)
print(asset_credential)
"""

print("List latest versions of all Datasets:")
for dataset in project_client.datasets.list():
@@ -7,9 +7,12 @@

Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize
"""
import os
import re
import logging
from typing import Any, Tuple, Optional
from pathlib import Path
from urllib.parse import urlsplit
from azure.storage.blob.aio import ContainerClient
from azure.core.tracing.decorator_async import distributed_trace_async

@@ -22,7 +25,6 @@
PendingUploadType,
PendingUploadResponse,
)
from ...models._enums import CredentialType

logger = logging.getLogger(__name__)

@@ -67,7 +69,7 @@ async def _create_dataset_and_get_its_container_client(
# https://learn.microsoft.com/azure/storage/blobs/storage-quickstart-blobs-python
# https://learn.microsoft.com/azure/storage/blobs/storage-blob-upload-python

# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-from-container-url
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-from-container-url
return (
ContainerClient.from_container_url(
container_url=pending_upload_response.blob_reference.credential.sas_uri, # Of the form: "https://<account>.blob.core.windows.net/<container>?<sasToken>"
@@ -120,28 +122,37 @@ async def upload_file(
blob_name,
)

# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-upload-blob
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-upload-blob
async with await container_client.upload_blob(name=blob_name, data=data, **kwargs) as blob_client:

logger.debug("[upload_file] Done uploading")

file_dataset_version = FileDatasetVersion(
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url
# Per above doc the ".url" contains SAS token... should this be stripped away?
data_uri=blob_client.url, # "<account>.blob.windows.core.net/<container>/<file_name>"
)
file_dataset_version.is_reference = True # TODO: Update TypeSpec to make this writable.
# Remove the SAS token from the URL (remove all query strings).
# The resulting format should be "https://<account>.blob.core.windows.net/<container>/<file_name>"
data_uri = urlsplit(blob_client.url)._replace(query="").geturl()

dataset_version = await self.create_or_update(
name=name,
version=output_version,
body=file_dataset_version,
body=FileDatasetVersion(
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url
# Per the doc above, `.url` includes the SAS token; the query string was stripped via `urlsplit` before this point.
data_uri=data_uri,
),
)

return dataset_version
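
For context, here is a minimal standalone sketch of the SAS-stripping step used above; the URL is a made-up example, and only the Python standard library is involved:

```python
from urllib.parse import urlsplit

# Hypothetical blob URL as returned by BlobClient.url, with a SAS token in the query string.
sas_url = "https://myaccount.blob.core.windows.net/mycontainer/data.csv?sv=2024-01-01&sig=abc123"

# _replace(query="") drops everything after "?", leaving the bare blob URI.
data_uri = urlsplit(sas_url)._replace(query="").geturl()
print(data_uri)  # https://myaccount.blob.core.windows.net/mycontainer/data.csv
```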

@distributed_trace_async
async def upload_folder(self, *, name: str, version: str, folder: str, **kwargs: Any) -> DatasetVersion:
async def upload_folder(
self,
*,
name: str,
version: str,
folder: str,
connection_name: Optional[str] = None,
file_pattern: Optional[re.Pattern] = None,
**kwargs: Any,
) -> DatasetVersion:
"""Upload all files in a folder and its sub folders to a blob storage, while maintaining
relative paths, and create a dataset that references this folder.
This method uses the `ContainerClient.upload_blob` method from the azure-storage-blob package
@@ -152,7 +163,13 @@ async def upload_folder(self, *, name: str, version: str, folder: str, **kwargs:
:keyword version: The version identifier for the dataset. Required.
:paramtype version: str
:keyword folder: The folder name (including optional path) to be uploaded. Required.
:paramtype file: str
:paramtype folder: str
:keyword connection_name: The name of an Azure Storage Account connection, where the files should be uploaded.
If not specified, the default Azure Storage Account connection will be used. Optional.
:paramtype connection_name: str
:keyword file_pattern: A regex pattern to filter files to be uploaded. Only files matching the pattern
will be uploaded. Optional.
:paramtype file_pattern: re.Pattern
:return: The created dataset version.
:rtype: ~azure.ai.projects.models.DatasetVersion
:raises ~azure.core.exceptions.HttpResponseError: If an error occurs during the HTTP request.
@@ -164,40 +181,43 @@ async def upload_folder(self, *, name: str, version: str, folder: str, **kwargs:
raise ValueError("The provided folder is actually a file. Use method `upload_file` instead.")

container_client, output_version = await self._create_dataset_and_get_its_container_client(
name=name, input_version=version
name=name, input_version=version, connection_name=connection_name
)

async with container_client:

# Recursively traverse all files in the folder
files_uploaded: bool = False
for file_path in path_folder.rglob("*"): # `rglob` matches all files and folders recursively
if file_path.is_file(): # Check if the path is a file. Skip folders.
blob_name = file_path.relative_to(path_folder) # Blob name relative to the folder
for root, _, files in os.walk(folder):
for file in files:
if file_pattern and not file_pattern.search(file):
continue # Skip files that do not match the pattern
file_path = os.path.join(root, file)
blob_name = os.path.relpath(file_path, folder).replace("\\", "/") # Ensure correct format for Azure
logger.debug(
"[upload_folder] Start uploading file `%s` as blob `%s`.",
file_path,
blob_name,
)
with file_path.open(
"rb"
) as data: # Open the file for reading in binary mode # TODO: async version?
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.containerclient?view=azure-python#azure-storage-blob-containerclient-upload-blob
container_client.upload_blob(name=str(blob_name), data=data, **kwargs)
logger.debug("[upload_folder] Done uploaded.")
with open(file=file_path, mode="rb") as data: # Open the file for reading in binary mode
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-upload-blob
await container_client.upload_blob(name=str(blob_name), data=data, **kwargs)
logger.debug("[upload_folder] Done uploading file")
files_uploaded = True
logger.debug("[upload_folder] Done uploaded.")

if not files_uploaded:
raise ValueError("The provided folder is empty.")

# Remove the SAS token from the URL (remove all query strings).
# The resulting format should be "https://<account>.blob.core.windows.net/<container>"
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.aio.containerclient?view=azure-python#azure-storage-blob-aio-containerclient-url
data_uri = urlsplit(container_client.url)._replace(query="").geturl()

dataset_version = await self.create_or_update(
name=name,
version=output_version,
body=FolderDatasetVersion(
# See https://learn.microsoft.com/python/api/azure-storage-blob/azure.storage.blob.blobclient?view=azure-python#azure-storage-blob-blobclient-url
# Per above doc the ".url" contains SAS token... should this be stripped away?
data_uri=container_client.url, # "<account>.blob.windows.core.net/<container> ?"
),
body=FolderDatasetVersion(data_uri=data_uri),
)

return dataset_version
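
To tie the pieces together, here is a usage sketch mirroring the README snippet above with the new `connection_name` and `file_pattern` keywords. The endpoint format, dataset name/version, folder path, and connection name are illustrative placeholders, not values from this PR:

```python
import asyncio
import re

from azure.identity.aio import DefaultAzureCredential
from azure.ai.projects.aio import AIProjectClient


async def main() -> None:
    async with DefaultAzureCredential() as credential:
        async with AIProjectClient(
            endpoint="https://<your-resource>.services.ai.azure.com/api/projects/<project>",
            credential=credential,
        ) as project_client:
            # Upload every .txt/.csv/.md file under ./data_folder, preserving relative paths.
            dataset = await project_client.datasets.upload_folder(
                name="my-dataset",
                version="1",
                folder="./data_folder",
                connection_name="my-storage-connection",  # optional; default connection when omitted
                file_pattern=re.compile(r"\.(txt|csv|md)$", re.IGNORECASE),
            )
            print(dataset)


if __name__ == "__main__":
    asyncio.run(main())
```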
44 changes: 24 additions & 20 deletions sdk/ai/azure-ai-projects/azure/ai/projects/models/_models.py
@@ -559,8 +559,7 @@ class BlobReference(_Model):
"""Blob reference details.

:ivar blob_uri: Blob URI path for client to upload data. Example:
`https://blob.windows.core.net/Container/Path <https://blob.windows.core.net/Container/Path>`_.
Required.
``https://blob.windows.core.net/Container/Path``. Required.
:vartype blob_uri: str
:ivar storage_account_arm_id: ARM ID of the storage account to use. Required.
:vartype storage_account_arm_id: str
@@ -569,8 +568,7 @@
"""

blob_uri: str = rest_field(name="blobUri", visibility=["read", "create", "update", "delete", "query"])
"""Blob URI path for client to upload data. Example: `https://blob.windows.core.net/Container/Path
<https://blob.windows.core.net/Container/Path>`_. Required."""
"""Blob URI path for client to upload data. Example: ``https://blob.windows.core.net/Container/Path``. Required."""
storage_account_arm_id: str = rest_field(
name="storageAccountArmId", visibility=["read", "create", "update", "delete", "query"]
)
@@ -740,15 +738,17 @@ class DatasetVersion(_Model):
You probably want to use the sub-classes and not this class directly. Known sub-classes are:
FileDatasetVersion, FolderDatasetVersion

:ivar data_uri: URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330
<https://go.microsoft.com/fwlink/?linkid=2202330>`_. Required.
:ivar data_uri: URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required.
:vartype data_uri: str
:ivar type: Dataset type. Required. Known values are: "uri_file" and "uri_folder".
:vartype type: str or ~azure.ai.projects.models.DatasetType
:ivar is_reference: Indicates if the dataset holds a reference to the storage, or the dataset
manages storage itself. If true, the underlying data will not be deleted when the dataset
version is deleted.
:vartype is_reference: bool
:ivar connection_name: The Azure Storage Account connection name. Required if
startPendingUploadVersion was not called before creating the Dataset.
:vartype connection_name: str
:ivar id: Asset ID, a unique identifier for the asset.
:vartype id: str
:ivar name: The name of the resource. Required.
@@ -763,15 +763,15 @@

__mapping__: Dict[str, _Model] = {}
data_uri: str = rest_field(name="dataUri", visibility=["read", "create"])
"""URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330
<https://go.microsoft.com/fwlink/?linkid=2202330>`_. Required."""
"""URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required."""
type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"])
"""Dataset type. Required. Known values are: \"uri_file\" and \"uri_folder\"."""
is_reference: Optional[bool] = rest_field(
name="isReference", visibility=["read", "create", "update", "delete", "query"]
)
is_reference: Optional[bool] = rest_field(name="isReference", visibility=["read"])
"""Indicates if the dataset holds a reference to the storage, or the dataset manages storage
itself. If true, the underlying data will not be deleted when the dataset version is deleted."""
connection_name: Optional[str] = rest_field(name="connectionName", visibility=["read", "create"])
"""The Azure Storage Account connection name. Required if startPendingUploadVersion was not called
before creating the Dataset."""
id: Optional[str] = rest_field(visibility=["read"])
"""Asset ID, a unique identifier for the asset."""
name: str = rest_field(visibility=["read"])
Expand All @@ -789,7 +789,7 @@ def __init__(
*,
data_uri: str,
type: str,
is_reference: Optional[bool] = None,
connection_name: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
) -> None: ...
@@ -1071,13 +1071,15 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
class FileDatasetVersion(DatasetVersion, discriminator="uri_file"):
"""FileDatasetVersion Definition.

:ivar data_uri: URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330
<https://go.microsoft.com/fwlink/?linkid=2202330>`_. Required.
:ivar data_uri: URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required.
:vartype data_uri: str
:ivar is_reference: Indicates if the dataset holds a reference to the storage, or the dataset
manages storage itself. If true, the underlying data will not be deleted when the dataset
version is deleted.
:vartype is_reference: bool
:ivar connection_name: The Azure Storage Account connection name. Required if
startPendingUploadVersion was not called before creating the Dataset.
:vartype connection_name: str
:ivar id: Asset ID, a unique identifier for the asset.
:vartype id: str
:ivar name: The name of the resource. Required.
@@ -1100,7 +1102,7 @@ def __init__(
self,
*,
data_uri: str,
is_reference: Optional[bool] = None,
connection_name: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
) -> None: ...
@@ -1119,13 +1121,15 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
class FolderDatasetVersion(DatasetVersion, discriminator="uri_folder"):
"""FileDatasetVersion Definition.

:ivar data_uri: URI of the data. Example: `https://go.microsoft.com/fwlink/?linkid=2202330
<https://go.microsoft.com/fwlink/?linkid=2202330>`_. Required.
:ivar data_uri: URI of the data. Example: ``https://go.microsoft.com/fwlink/?linkid=2202330``. Required.
:vartype data_uri: str
:ivar is_reference: Indicates if the dataset holds a reference to the storage, or the dataset
manages storage itself. If true, the underlying data will not be deleted when the dataset
version is deleted.
:vartype is_reference: bool
:ivar connection_name: The Azure Storage Account connection name. Required if
startPendingUploadVersion was not called before creating the Dataset.
:vartype connection_name: str
:ivar id: Asset ID, a unique identifier for the asset.
:vartype id: str
:ivar name: The name of the resource. Required.
@@ -1148,7 +1152,7 @@ def __init__(
self,
*,
data_uri: str,
is_reference: Optional[bool] = None,
connection_name: Optional[str] = None,
description: Optional[str] = None,
tags: Optional[Dict[str, str]] = None,
) -> None: ...
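
For reference, a minimal sketch constructing the two dataset-version models with the updated keyword set (`connection_name` replacing the now read-only `is_reference`); the URIs, connection name, and tags are placeholders:

```python
from azure.ai.projects.models import FileDatasetVersion, FolderDatasetVersion

# A file-backed dataset version; `type` is fixed to "uri_file" by the subclass discriminator.
file_version = FileDatasetVersion(
    data_uri="https://myaccount.blob.core.windows.net/mycontainer/data.csv",
    connection_name="my-storage-connection",  # optional; omit to use the default connection
    description="Single-file dataset",
    tags={"source": "example"},
)

# A folder-backed dataset version; `type` is fixed to "uri_folder".
folder_version = FolderDatasetVersion(
    data_uri="https://myaccount.blob.core.windows.net/mycontainer",
    connection_name="my-storage-connection",
)
```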
@@ -1355,7 +1359,7 @@ class PendingUploadRequest(_Model):

:ivar pending_upload_id: If PendingUploadId is not provided, a random GUID will be used.
:vartype pending_upload_id: str
:ivar connection_name: Name of Azure blob storage connection to use for generating temporary
:ivar connection_name: Azure Storage Account connection name to use for generating temporary
SAS token.
:vartype connection_name: str
:ivar pending_upload_type: BlobReference is the only supported type. Required. Blob Reference
@@ -1370,7 +1374,7 @@
connection_name: Optional[str] = rest_field(
name="connectionName", visibility=["read", "create", "update", "delete", "query"]
)
"""Name of Azure blob storage connection to use for generating temporary SAS token."""
"""Azure Storage Account connection name to use for generating temporary SAS token."""
pending_upload_type: Literal[PendingUploadType.BLOB_REFERENCE] = rest_field(
name="pendingUploadType", visibility=["read", "create", "update", "delete", "query"]
)