Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
69 commits
Select commit Hold shift + click to select a range
991a274
sync changes and sample for vector search control plane
simorenoh Mar 21, 2024
3a3a652
Update index_management.py
simorenoh Mar 21, 2024
20f533c
Update index_management.py
simorenoh Mar 21, 2024
09f33b7
async and samples
simorenoh Mar 28, 2024
8e527fd
sync and async tests
simorenoh Mar 28, 2024
7c44137
Update CHANGELOG.md
simorenoh Mar 28, 2024
7eb5439
developed typehints
simorenoh Mar 28, 2024
c428476
skip tests
simorenoh Mar 29, 2024
58000fd
create_if_not_exists, README
simorenoh Apr 2, 2024
4c4b1ab
Update README.md
simorenoh Apr 2, 2024
0e6b24f
add provisional, add dimension limit
simorenoh Apr 3, 2024
b42f3cb
Merge branch 'main' into vector-search-query
simorenoh Apr 16, 2024
fef391d
adds sync changes, adds changelog
simorenoh May 3, 2024
8583dbf
async changes
simorenoh May 3, 2024
158f60f
some comments addressed
simorenoh May 3, 2024
c880436
Update CHANGELOG.md
simorenoh May 3, 2024
a414f05
bug fix on ordering
simorenoh May 8, 2024
d217210
ordering bug fix
simorenoh May 8, 2024
8869ea4
fix datetime
simorenoh May 8, 2024
0c6d8eb
samples added
simorenoh May 8, 2024
30b0645
small fixes
simorenoh May 9, 2024
5056d89
fix some additional PQ logic
simorenoh May 9, 2024
358deae
last bit of pq fixes
simorenoh May 9, 2024
617c709
Update non_streaming_order_by_aggregator.py
simorenoh May 9, 2024
73e3709
memory optimization
simorenoh May 10, 2024
6bb8090
Update sdk/cosmos/azure-cosmos/azure/cosmos/_execution_context/aio/do…
simorenoh May 10, 2024
326b155
Merge branch 'main' into vector-search-query
simorenoh May 10, 2024
540a645
addressing comments
simorenoh May 10, 2024
98a4fc9
test name fix, improve readme/ samples
simorenoh May 10, 2024
d487519
add sync tests, improve readme
simorenoh May 10, 2024
abd2bc0
async tests
simorenoh May 10, 2024
a0547b1
pylint
simorenoh May 10, 2024
07acb93
remove print
simorenoh May 10, 2024
7cd5b92
pylint
simorenoh May 10, 2024
5834b29
adds env variable
simorenoh May 10, 2024
f615f3e
adds JS tests
simorenoh May 13, 2024
0081bbe
error logic improvements
simorenoh May 13, 2024
674f483
readme updates
simorenoh May 13, 2024
0e26bf6
more fixes to logic
simorenoh May 13, 2024
a65eb0a
oops
simorenoh May 13, 2024
6563bc3
memory optimization
simorenoh May 13, 2024
9935dc1
Update sdk/cosmos/azure-cosmos/README.md
simorenoh May 13, 2024
ad36a9c
update variable for naming conventions
simorenoh May 13, 2024
86b78b7
remove/ comment out diskANN
simorenoh May 13, 2024
3cff42f
offset + limit fix, tests fixes
simorenoh May 14, 2024
dd187dd
add capabilities env var flag
simorenoh May 14, 2024
d2fbb1b
use feature flag for existing query tests
simorenoh May 14, 2024
fe7742a
disable emulator for query tests
simorenoh May 14, 2024
7cd4d9d
missed some tests
simorenoh May 14, 2024
b3876c6
Update test_aggregate.py
simorenoh May 14, 2024
d8bc50d
Update test-resources.bicep
simorenoh May 15, 2024
1e699e4
forgot tests were being skipped
simorenoh May 15, 2024
e79839b
Update sdk/cosmos/azure-cosmos/test/test_vector_policy.py
Pilchie May 15, 2024
16860dc
Update sdk/cosmos/azure-cosmos/test/test_vector_policy_async.py
Pilchie May 15, 2024
1431e9e
test fixes
simorenoh May 15, 2024
28bef5b
Merge branch 'vector-search-query' of https://github.com/simorenoh/az…
simorenoh May 15, 2024
8701b80
Update README.md
simorenoh May 15, 2024
58af1bb
create separate db for vectors
simorenoh May 15, 2024
9bfdf57
tests
simorenoh May 15, 2024
45e5b6d
tests
simorenoh May 15, 2024
c4a7c60
more tests
simorenoh May 15, 2024
b6dbe45
small bit
simorenoh May 15, 2024
fca1294
final fixes hopefully
simorenoh May 15, 2024
445ba94
raise time limit on test so it doesnt fail
simorenoh May 15, 2024
f64775d
Update test_query_vector_similarity_async.py
simorenoh May 15, 2024
ae9524d
add date for release prep
simorenoh May 15, 2024
e616c4a
Merge branch 'main' into vector-search-query
simorenoh May 15, 2024
8ad2591
Update CHANGELOG.md
simorenoh May 15, 2024
fd10e89
Merge branch 'main' into vector-search-query
simorenoh May 15, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
async tests
  • Loading branch information
simorenoh committed May 10, 2024
commit abd2bc015b98ba510de289a157e34be370e1d212
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ def setUp(self):

def tearDown(self):
    """Delete every container this test case may have created.

    Cleanup is best-effort: a ``CosmosHttpResponseError`` raised while deleting
    one container is ignored so that the remaining containers are still
    deleted instead of being leaked.
    """
    # NOTE(review): this method was recovered from a diff hunk (7 -> 9 lines),
    # so the un-prefixed container id may be the pre-change line -- confirm.
    # Keeping it is harmless because each failed delete is tolerated on its own.
    for prefix in ("", "quantized", "flat", "diskANN"):
        try:
            self.created_db.delete_container(prefix + self.TEST_CONTAINER_ID)
        except exceptions.CosmosHttpResponseError:
            # Deliberately ignored so one failed delete cannot block the others.
            pass

Expand Down
205 changes: 205 additions & 0 deletions sdk/cosmos/azure-cosmos/test/test_query_vector_similarity_async.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
# The MIT License (MIT)
# Copyright (c) Microsoft Corporation. All rights reserved.

import unittest
import uuid

import pytest

import azure.cosmos.exceptions as exceptions
from azure.cosmos.aio import CosmosClient, DatabaseProxy
import azure.cosmos.exceptions as exceptions
import test_config
import vector_test_data
from azure.cosmos import http_constants, DatabaseProxy
from azure.cosmos.partition_key import PartitionKey


def verify_ordering(item_list, distance_function):
    """Assert that vector query results are in the expected similarity order.

    Two things are checked: the ``"text"`` of the i-th result equals the i-th
    entry of ``vector_test_data.get_ordered_item_texts()``, and the
    ``"SimilarityScore"`` values are monotonic.

    :param item_list: Query results; each item is indexed by ``"text"`` and
        ``"SimilarityScore"``.
    :param distance_function: Distance function the query used. For
        ``"euclidean"`` scores must be non-decreasing; for any other value
        they must be non-increasing.
    :raises AssertionError: If a text or a pair of adjacent scores is out of order.
    """
    if not item_list:
        # Nothing to verify; also avoids loading the expected data needlessly.
        return

    # The expected ordering does not change between iterations, so fetch it once.
    expected_texts = vector_test_data.get_ordered_item_texts()
    for position, item in enumerate(item_list):
        assert item["text"] == expected_texts[position]

    ascending = distance_function == "euclidean"
    for current, following in zip(item_list, item_list[1:]):
        if ascending:
            assert current["SimilarityScore"] <= following["SimilarityScore"]
        else:
            assert current["SimilarityScore"] >= following["SimilarityScore"]


@pytest.mark.skip  # skipping these while we get an emulator
class TestVectorSimilarityQuery(unittest.IsolatedAsyncioTestCase):
    """Async tests for vector similarity (``VectorDistance``) query behavior.

    The class derives from ``IsolatedAsyncioTestCase`` so that the ``async``
    fixtures and test methods are actually awaited. Every test gets its own
    client and three containers (quantizedFlat/cosine, flat/euclidean and
    diskANN/dotproduct), all removed again in ``asyncTearDown``.
    """

    # NOTE(review): this module imports DatabaseProxy from both azure.cosmos.aio
    # and azure.cosmos; the later (sync) import wins, so this annotation names
    # the sync proxy although an aio proxy is assigned -- confirm and clean up.
    created_db: DatabaseProxy = None
    client: CosmosClient = None
    config = test_config.TestConfig
    host = config.host
    masterKey = config.masterKey
    connectionPolicy = config.connectionPolicy
    TEST_DATABASE_ID = config.TEST_DATABASE_ID
    TEST_CONTAINER_ID = "Vector Similarity Container " + str(uuid.uuid4())

    @classmethod
    def setUpClass(cls):
        """Fail fast when the account placeholders were never replaced.

        :raises Exception: If ``masterKey`` or ``host`` still hold their
            placeholder values.
        """
        if (cls.masterKey == '[YOUR_KEY_HERE]' or
                cls.host == '[YOUR_ENDPOINT_HERE]'):
            raise Exception(
                "You must specify your Azure Cosmos account values for "
                "'masterKey' and 'host' at the top of this class to run the "
                "tests.")

    async def _create_vector_container(self, id_prefix, embedding_type, distance_function):
        """Create one test container with the given vector index type and distance function."""
        return await self.created_db.create_container(
            id=id_prefix + self.TEST_CONTAINER_ID,
            partition_key=PartitionKey(path="/pk"),
            offer_throughput=test_config.TestConfig.THROUGHPUT_FOR_5_PARTITIONS,
            indexing_policy=test_config.get_vector_indexing_policy(embedding_type=embedding_type),
            vector_embedding_policy=test_config.get_vector_embedding_policy(
                data_type="float32", distance_function=distance_function))

    @staticmethod
    def _vanilla_query(top, vector_string):
        """Build a TOP query that relies on the container's own embedding policy."""
        return "SELECT TOP {} c.text, VectorDistance(c.embedding, [{}]) AS " \
               "SimilarityScore FROM c ORDER BY VectorDistance(c.embedding, [{}])".format(str(top),
                                                                                        vector_string,
                                                                                        vector_string)

    @staticmethod
    def _specs_query(top, vector_string, distance_function):
        """Build a TOP query that names the distance function explicitly in the query."""
        distance_call = "VectorDistance(c.embedding, [{}], false, {{'distanceFunction': '{}'}})".format(
            vector_string, distance_function)
        return "SELECT TOP {} c.text, {} AS SimilarityScore FROM c ORDER BY {}".format(
            str(top), distance_call, distance_call)

    async def asyncSetUp(self):
        """Create a client, the three vector containers, and load the test items."""
        # The client is created per test because asyncTearDown closes it after
        # every test; a class-level client would be closed for later tests.
        self.client = CosmosClient(self.host, self.masterKey)
        self.created_db = self.client.get_database_client(self.TEST_DATABASE_ID)

        self.created_quantized_cosine_container = await self._create_vector_container(
            "quantized", "quantizedFlat", "cosine")
        self.created_flat_euclidean_container = await self._create_vector_container(
            "flat", "flat", "euclidean")
        self.created_diskANN_dotproduct_container = await self._create_vector_container(
            "diskANN", "diskANN", "dotproduct")

        for item in vector_test_data.get_vector_items():
            await self.created_quantized_cosine_container.create_item(item)
            await self.created_flat_euclidean_container.create_item(item)
            await self.created_diskANN_dotproduct_container.create_item(item)

    async def asyncTearDown(self):
        """Delete the three test containers (best-effort) and always close the client."""
        try:
            for id_prefix in ("quantized", "flat", "diskANN"):
                try:
                    await self.created_db.delete_container(id_prefix + self.TEST_CONTAINER_ID)
                except exceptions.CosmosHttpResponseError:
                    # Deliberately ignored so one failed delete cannot block the others.
                    pass
        finally:
            await self.client.close()

    async def test_wrong_queries_async(self):
        """Unsupported vector queries must be rejected with the documented errors."""
        vector_string = vector_test_data.get_embedding_string("I am having a wonderful day.")
        # try to send a vector search query without limit filters
        query = "SELECT c.text, VectorDistance(c.embedding, [{}]) AS " \
                "SimilarityScore FROM c ORDER BY VectorDistance(c.embedding, [{}])".format(vector_string,
                                                                                         vector_string)
        with self.assertRaises(ValueError, msg="Client should not allow queries without filters.") as caught:
            [item async for item in self.created_quantized_cosine_container.query_items(
                query=query, enable_cross_partition_query=True)]
        # NOTE(review): the expected text ends with a stray single quote after
        # the final period -- confirm it matches the message the client raises.
        assert "Executing a vector search query without TOP or LIMIT can consume many RUs very fast and" \
               " have long runtimes. Please ensure you are using one of the two filters with your" \
               " vector search query.'" in caught.exception.args[0]

        # try to send a vector search query specifying the ordering as ASC or DESC
        query = "SELECT c.text, VectorDistance(c.embedding, [{}]) AS " \
                "SimilarityScore FROM c ORDER BY VectorDistance(c.embedding, [{}]) ASC".format(vector_string,
                                                                                             vector_string)
        with self.assertRaises(exceptions.CosmosHttpResponseError,
                               msg="Client should not allow queries with ASC/DESC.") as caught:
            [item async for item in self.created_quantized_cosine_container.query_items(
                query=query, enable_cross_partition_query=True)]
        assert caught.exception.status_code == http_constants.StatusCodes.BAD_REQUEST
        assert "Specifying a sorting order (ASC or DESC) with VectorDistance" \
               " function is not supported." in caught.exception.message

    async def test_ordering_distances_async(self):
        """Results must be correctly ordered for every distance function on every container.

        A container queried with the distance function of its own embedding
        policy gets the plain query; the other containers get a query that
        names the distance function explicitly.
        """
        # load up previously calculated embedding for the given string
        vector_string = vector_test_data.get_embedding_string("I am having a wonderful day.")
        # Each container keyed by the distance function of its embedding policy.
        containers_by_policy = {
            "euclidean": self.created_flat_euclidean_container,
            "cosine": self.created_quantized_cosine_container,
            "dotproduct": self.created_diskANN_dotproduct_container,
        }
        for distance_function in ("euclidean", "cosine", "dotproduct"):
            for top in range(1, 11):
                # we define queries with and without specs to directly use the embeddings in our container policies
                vanilla_query = self._vanilla_query(top, vector_string)
                specs_query = self._specs_query(top, vector_string, distance_function)
                for policy_function, container in containers_by_policy.items():
                    query = vanilla_query if policy_function == distance_function else specs_query
                    results = [item async for item in container.query_items(query=query,
                                                                            enable_cross_partition_query=True)]
                    verify_ordering(results, distance_function)

    async def test_vector_query_pagination(self):
        """A TOP 8 query read with ``max_item_count=3`` must yield 3 pages and 8 ordered items."""
        # load up previously calculated embedding for the given string
        vector_string = vector_test_data.get_embedding_string("I am having a wonderful day.")

        query = "SELECT TOP 8 c.text, VectorDistance(c.embedding, [{}], false, {{'distanceFunction': 'cosine'}}) AS " \
                "SimilarityScore FROM c ORDER BY VectorDistance(c.embedding, [{}], false, {{'distanceFunction': " \
                "'cosine'}})".format(vector_string, vector_string)

        query_iterable = self.created_quantized_cosine_container.query_items(query=query,
                                                                             enable_cross_partition_query=True,
                                                                             max_item_count=3)
        all_fetched_res = []
        count = 0
        # by_page() is iterated with ``async for`` and each page is itself an
        # async iterator of items, so walk every page and drain it.
        async for page in query_iterable.by_page():
            count += 1
            all_fetched_res.extend([item async for item in page])
        assert count == 3
        assert len(all_fetched_res) == 8
        verify_ordering(all_fetched_res, "cosine")


# Run the tests through unittest's command-line runner when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()