From 1b05e0cd908a127b2887d795011fc281253d5fc0 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Mon, 16 Oct 2023 14:07:55 +0000 Subject: [PATCH 01/11] mango: add $beginsWith operator Adds a `$beginsWith` operator to selectors, with json and text index support. This is a compliment / precursor to optimising `$regex` support as proposed in https://github.com/apache/couchdb/pull/4776. For `json` indexes, a $beginsWith operator translates into a key range query, as is common practice for _view queries. For example, to find all rows with a key beginning with "W", we can use a range `start_key="W", end_key="W\ufff0"`. Given Mango uses compound keys, this is slightly more complex in practice, but the idea is the same. As with other range operators (`$gt`, `$gte`, etc), `$beginsWith` can be used in combination with equality operators and result sorting but must result in a contiguous key range. That is, a range of `start_key=[10, "W"], end_key=[10, "W\ufff0", {}]` would be valid, but `start_key=["W", 10], end_key=["W\ufff0", 10, {}]` would not, because the second element of the key may result in a non-contiguous range. For text indexes, `$beginsWith` translates to a Lucene query on the specified field of `W*`. If a non-string operand is provided to `$beginsWith`, the request will fail with a 400 / `invalid_operator` error. --- src/mango/src/mango_idx_view.erl | 6 ++ src/mango/src/mango_selector.erl | 32 ++++++++ src/mango/src/mango_selector_text.erl | 3 + src/mango/test/03-operator-test.py | 16 ++++ src/mango/test/25-beginswith-test.py | 112 ++++++++++++++++++++++++++ 5 files changed, 169 insertions(+) create mode 100644 src/mango/test/25-beginswith-test.py diff --git a/src/mango/src/mango_idx_view.erl b/src/mango/src/mango_idx_view.erl index 25d75d55d0a..d1650e987cc 100644 --- a/src/mango/src/mango_idx_view.erl +++ b/src/mango/src/mango_idx_view.erl @@ -306,6 +306,8 @@ indexable({[{<<"$gt">>, _}]}) -> true; indexable({[{<<"$gte">>, _}]}) -> true; +indexable({[{<<"$beginsWith">>, _}]}) -> + true; % This is required to improve index selection for covering indexes. % Making `$exists` indexable should not cause problems in other cases. indexable({[{<<"$exists">>, _}]}) -> @@ -412,6 +414,10 @@ range(_, _, LCmp, Low, HCmp, High) -> % operators but its all straight forward once you figure out how % we're basically just narrowing our logical ranges. +% beginsWith requires both a high and low bound +range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) -> + {LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High), + range({[{<<"$lte">>, <>}]}, LCmp0, Low0, HCmp0, High0); range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) -> case range_pos(Low, Arg, High) of min -> diff --git a/src/mango/src/mango_selector.erl b/src/mango/src/mango_selector.erl index 59be7a6ebaf..c1b4d7c282e 100644 --- a/src/mango/src/mango_selector.erl +++ b/src/mango/src/mango_selector.erl @@ -135,6 +135,8 @@ norm_ops({[{<<"$text">>, Arg}]}) when {[{<<"$default">>, {[{<<"$text">>, Arg}]}}]}; norm_ops({[{<<"$text">>, Arg}]}) -> ?MANGO_ERROR({bad_arg, '$text', Arg}); +norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) -> + Cond; % Not technically an operator but we pass it through here % so that this function accepts its own output. This exists % so that $text can have a field name value which simplifies @@ -514,6 +516,11 @@ match({[{<<"$mod">>, [D, R]}]}, Value, _Cmp) when is_integer(Value) -> Value rem D == R; match({[{<<"$mod">>, _}]}, _Value, _Cmp) -> false; +match({[{<<"$beginsWith">>, Prefix}]}, Value, _Cmp) when is_binary(Prefix), is_binary(Value) -> + string:prefix(Value, Prefix) /= nomatch; +% When Value is not a string, do not match +match({[{<<"$beginsWith">>, Prefix}]}, _, _Cmp) when is_binary(Prefix) -> + false; match({[{<<"$regex">>, Regex}]}, Value, _Cmp) when is_binary(Value) -> try match == re:run(Value, Regex, [{capture, none}]) @@ -1054,4 +1061,29 @@ fields_nor_test() -> }, ?assertEqual([<<"field1">>, <<"field2">>], fields_of(Selector2)). +match_beginswith_test() -> + Doc = + {[ + {<<"_id">>, <<"foo">>}, + {<<"_rev">>, <<"bar">>}, + {<<"user_id">>, 11} + ]}, + Check = fun(Field, Prefix) -> + Selector = {[{Field, {[{<<"$beginsWith">>, Prefix}]}}]}, + % Call match_int/2 to avoid ERROR for missing metric; this is confusing + % in the middle of test output. + match_int(mango_selector:normalize(Selector), Doc) + end, + [ + % matching + ?assertEqual(true, Check(<<"_id">>, <<"f">>)), + % no match (user_id is not a binary string) + ?assertEqual(false, Check(<<"user_id">>, <<"f">>)), + % invalid (prefix is not a binary string) + ?assertThrow( + {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}}, + Check(<<"user_id">>, 1) + ) + ]. + -endif. diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl index 1f8609ac27b..fc9280d85de 100644 --- a/src/mango/src/mango_selector_text.erl +++ b/src/mango/src/mango_selector_text.erl @@ -142,6 +142,9 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) -> true -> FieldExists; false -> {op_not, {FieldExists, false}} end; +convert(Path, {[{<<"$beginsWith">>, Arg}]}) -> + PrefixSearch = [value_str(Arg), <<"*">>], + {op_field, {make_field(Path, Arg), PrefixSearch}}; % We're not checking the actual type here, just looking for % anything that has a possibility of matching by checking % for the field name. We use the same logic for $exists on diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py index 70e3fbc5f24..b43aacf5f69 100644 --- a/src/mango/test/03-operator-test.py +++ b/src/mango/test/03-operator-test.py @@ -141,6 +141,22 @@ def test_exists_false_returns_missing_but_not_null(self): for d in docs: self.assertNotIn("twitter", d) + def test_beginswith(self): + docs = self.db.find({"location.state": {"$beginsWith": "New"}}) + self.assertEqual(len(docs), 2) + self.assertUserIds([2, 10], docs) + + # non-string prefixes should return an error + def test_beginswith_invalid_prefix(self): + docs = self.db.find({"location.state": {"$beginsWith": 123}}) + self.assertEqual(len(docs), 2) + + # non-string values in documents should not match the prefix, + # but should not error + def test_beginswith_invalid_prefix(self): + docs = self.db.find({"user_id": {"$beginsWith": "Foo"}}) + self.assertEqual(len(docs), 0) + class OperatorJSONTests(mango.UserDocsTests, BaseOperatorTests.Common): # START: text indexes do not support range queries across type boundaries so only diff --git a/src/mango/test/25-beginswith-test.py b/src/mango/test/25-beginswith-test.py new file mode 100644 index 00000000000..76772c24399 --- /dev/null +++ b/src/mango/test/25-beginswith-test.py @@ -0,0 +1,112 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +import copy +import mango + +DOCS = [ + {"_id": "aaa", "name": "Jimi", "location": "AUS", "age": 27}, + {"_id": "abc", "name": "Eddie", "location": "AND", "age": 65}, + {"_id": "bbb", "name": "Harry", "location": "CAN", "age": 21}, + {"_id": "ccc", "name": "Eddie", "location": "DEN", "age": 37}, + {"_id": "ddd", "name": "Jones", "location": "ETH", "age": 49}, +] + + +def to_utf8_bytes(list): + return [x.encode() for x in list] + + +class BeginsWithOperator(mango.DbPerClass): + def setUp(self): + self.db.recreate() + self.db.save_docs(copy.deepcopy(DOCS)) + self.db.create_index(["location"]) + self.db.create_index(["name", "location"]) + + def assertDocIds(self, user_ids, docs): + user_ids_returned = list(d["_id"] for d in docs) + user_ids.sort() + user_ids_returned.sort() + self.assertEqual(user_ids, user_ids_returned) + + def test_basic(self): + docs = self.db.find({"location": {"$beginsWith": "A"}}) + + self.assertEqual(len(docs), 2) + self.assertDocIds(["aaa", "abc"], docs) + + def test_json_range(self): + explain = self.db.find({"location": {"$beginsWith": "A"}}, explain=True) + self.assertEqual(explain["mrargs"]["start_key"], ["A"]) + end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b""]) + + def test_compound_key(self): + selector = {"name": "Eddie", "location": {"$beginsWith": "A"}} + explain = self.db.find(selector, explain=True) + + self.assertEqual(explain["mrargs"]["start_key"], ["Eddie", "A"]) + end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b""]) + + docs = self.db.find(selector) + self.assertEqual(len(docs), 1) + self.assertDocIds(["abc"], docs) + + def test_sort_asc(self): + selector = {"location": {"$beginsWith": "A"}} + explain = self.db.find(selector, sort=["location"], explain=True) + + self.assertEqual(explain["mrargs"]["start_key"], ["A"]) + end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b""]) + self.assertEqual(explain["mrargs"]["direction"], "fwd") + + def test_sort_desc(self): + selector = {"location": {"$beginsWith": "A"}} + explain = self.db.find(selector, sort=[{"location": "desc"}], explain=True) + + start_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(start_key_bytes, [b"A"]) + self.assertEqual(explain["mrargs"]["end_key"], ["A"]) + self.assertEqual(explain["mrargs"]["direction"], "rev") + + def test_all_docs_range(self): + explain = self.db.find({"_id": {"$beginsWith": "a"}}, explain=True) + self.assertEqual(explain["mrargs"]["start_key"], "a") + end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbd"]) + + def test_no_index(self): + selector = {"foo": {"$beginsWith": "a"}} + resp_explain = self.db.find(selector, explain=True) + + self.assertEqual(resp_explain["index"]["type"], "special") + self.assertEqual(resp_explain["mrargs"]["start_key"], None) + self.assertEqual(resp_explain["mrargs"]["end_key"], "") + + def test_invalid_operand(self): + try: + self.db.find({"_id": {"$beginsWith": True}}) + except Exception as e: + self.assertEqual(e.response.status_code, 400) + resp = e.response.json() + self.assertEqual(resp["error"], "invalid_operator") + else: + raise AssertionError("expected find error") + + def test_does_not_match_non_string_value(self): + selector = {"age": {"$beginsWith": "a"}} + docs = self.db.find(selector) + + self.assertEqual(len(docs), 0) From 80378afcb46778b7a336e77ae600fb9fd857484e Mon Sep 17 00:00:00 2001 From: Will Holley Date: Wed, 18 Oct 2023 13:04:44 +0000 Subject: [PATCH 02/11] add text eunit tests --- src/mango/src/mango_selector_text.erl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl index fc9280d85de..6ba4221a8ef 100644 --- a/src/mango/src/mango_selector_text.erl +++ b/src/mango/src/mango_selector_text.erl @@ -142,8 +142,9 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) -> true -> FieldExists; false -> {op_not, {FieldExists, false}} end; -convert(Path, {[{<<"$beginsWith">>, Arg}]}) -> - PrefixSearch = [value_str(Arg), <<"*">>], +convert(Path, {[{<<"$beginsWith">>, Arg}]}) when is_binary(Arg) -> + Suffix = <<"*">>, + PrefixSearch = value_str(<>), {op_field, {make_field(Path, Arg), PrefixSearch}}; % We're not checking the actual type here, just looking for % anything that has a possibility of matching by checking @@ -824,6 +825,12 @@ convert_nor_test() -> }) ). +convert_beginswith_test() -> + ?assertEqual( + {op_field, {[[<<"field">>], <<":">>, <<"string">>],<<"\"foo\\*\"">>}}, + convert_selector(#{<<"field">> => #{<<"$beginsWith">> => <<"foo">>}}) + ). + to_query_test() -> F = fun(S) -> iolist_to_binary(to_query(S)) end, Input = {<<"name">>, <<"value">>}, From 0292f508eb7ff47aef910b3a4577ea8ffc35ae45 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Thu, 19 Oct 2023 13:42:33 +0000 Subject: [PATCH 03/11] refactor mango integration tests --- src/mango/test/25-beginswith-test.py | 60 ++++++++++++++++++---------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/src/mango/test/25-beginswith-test.py b/src/mango/test/25-beginswith-test.py index 76772c24399..c136cebea19 100644 --- a/src/mango/test/25-beginswith-test.py +++ b/src/mango/test/25-beginswith-test.py @@ -33,6 +33,10 @@ def setUp(self): self.db.create_index(["location"]) self.db.create_index(["name", "location"]) + def get_mrargs(self, selector, sort=None): + explain = self.db.find(selector, sort=sort, explain=True) + return explain["mrargs"] + def assertDocIds(self, user_ids, docs): user_ids_returned = list(d["_id"] for d in docs) user_ids.sort() @@ -46,54 +50,57 @@ def test_basic(self): self.assertDocIds(["aaa", "abc"], docs) def test_json_range(self): - explain = self.db.find({"location": {"$beginsWith": "A"}}, explain=True) - self.assertEqual(explain["mrargs"]["start_key"], ["A"]) - end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + mrargs = self.get_mrargs({"location": {"$beginsWith": "A"}}) + + self.assertEqual(mrargs["start_key"], ["A"]) + end_key_bytes = to_utf8_bytes(mrargs["end_key"]) self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b""]) def test_compound_key(self): selector = {"name": "Eddie", "location": {"$beginsWith": "A"}} - explain = self.db.find(selector, explain=True) + mrargs = self.get_mrargs(selector) - self.assertEqual(explain["mrargs"]["start_key"], ["Eddie", "A"]) - end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(mrargs["start_key"], ["Eddie", "A"]) + end_key_bytes = to_utf8_bytes(mrargs["end_key"]) self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b""]) docs = self.db.find(selector) self.assertEqual(len(docs), 1) self.assertDocIds(["abc"], docs) - def test_sort_asc(self): + def test_sort(self): selector = {"location": {"$beginsWith": "A"}} - explain = self.db.find(selector, sort=["location"], explain=True) + mrargs = self.get_mrargs(selector, sort=["location"]) - self.assertEqual(explain["mrargs"]["start_key"], ["A"]) - end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + self.assertEqual(mrargs["start_key"], ["A"]) + end_key_bytes = to_utf8_bytes(mrargs["end_key"]) self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b""]) - self.assertEqual(explain["mrargs"]["direction"], "fwd") + self.assertEqual(mrargs["direction"], "fwd") def test_sort_desc(self): selector = {"location": {"$beginsWith": "A"}} - explain = self.db.find(selector, sort=[{"location": "desc"}], explain=True) + mrargs = self.get_mrargs(selector, sort=[{"location": "desc"}]) - start_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + start_key_bytes = to_utf8_bytes(mrargs["end_key"]) self.assertEqual(start_key_bytes, [b"A"]) - self.assertEqual(explain["mrargs"]["end_key"], ["A"]) - self.assertEqual(explain["mrargs"]["direction"], "rev") + self.assertEqual(mrargs["end_key"], ["A"]) + self.assertEqual(mrargs["direction"], "rev") def test_all_docs_range(self): - explain = self.db.find({"_id": {"$beginsWith": "a"}}, explain=True) - self.assertEqual(explain["mrargs"]["start_key"], "a") - end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"]) + mrargs = self.get_mrargs({"_id": {"$beginsWith": "a"}}) + + self.assertEqual(mrargs["start_key"], "a") + end_key_bytes = to_utf8_bytes(mrargs["end_key"]) self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbd"]) def test_no_index(self): selector = {"foo": {"$beginsWith": "a"}} resp_explain = self.db.find(selector, explain=True) + mrargs = resp_explain["mrargs"] self.assertEqual(resp_explain["index"]["type"], "special") - self.assertEqual(resp_explain["mrargs"]["start_key"], None) - self.assertEqual(resp_explain["mrargs"]["end_key"], "") + self.assertEqual(mrargs["start_key"], None) + self.assertEqual(mrargs["end_key"], "") def test_invalid_operand(self): try: @@ -106,7 +113,16 @@ def test_invalid_operand(self): raise AssertionError("expected find error") def test_does_not_match_non_string_value(self): - selector = {"age": {"$beginsWith": "a"}} - docs = self.db.find(selector) + docs = self.db.find({"age": {"$beginsWith": "a"}}) + self.assertEqual(len(docs), 0) + def test_no_matches(self): + docs = self.db.find({"name": {"$beginsWith": "Z"}}) self.assertEqual(len(docs), 0) + + def test_case_sensitivity(self): + docs = self.db.find({"name": {"$beginsWith": "j"}}) + self.assertEqual(len(docs), 0) + + docs = self.db.find({"name": {"$beginsWith": "J"}}) + self.assertEqual(len(docs), 2) From de1f01205dcd9ff9854d2a9fc3280af78877d9a8 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Thu, 19 Oct 2023 13:43:09 +0000 Subject: [PATCH 04/11] erlfmt fixes --- src/mango/src/mango_selector_text.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl index 6ba4221a8ef..4a50ff9ba6a 100644 --- a/src/mango/src/mango_selector_text.erl +++ b/src/mango/src/mango_selector_text.erl @@ -144,7 +144,7 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) -> end; convert(Path, {[{<<"$beginsWith">>, Arg}]}) when is_binary(Arg) -> Suffix = <<"*">>, - PrefixSearch = value_str(<>), + PrefixSearch = value_str(<>), {op_field, {make_field(Path, Arg), PrefixSearch}}; % We're not checking the actual type here, just looking for % anything that has a possibility of matching by checking @@ -827,7 +827,7 @@ convert_nor_test() -> convert_beginswith_test() -> ?assertEqual( - {op_field, {[[<<"field">>], <<":">>, <<"string">>],<<"\"foo\\*\"">>}}, + {op_field, {[[<<"field">>], <<":">>, <<"string">>], <<"\"foo\\*\"">>}}, convert_selector(#{<<"field">> => #{<<"$beginsWith">> => <<"foo">>}}) ). From bf2de997f58167f1c35e70337f46aa4939271b6f Mon Sep 17 00:00:00 2001 From: Will Holley Date: Tue, 24 Oct 2023 12:30:32 +0000 Subject: [PATCH 05/11] use SubTest for sort tests --- src/mango/test/25-beginswith-test.py | 36 ++++++++++++++++------------ 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/mango/test/25-beginswith-test.py b/src/mango/test/25-beginswith-test.py index c136cebea19..3b5134b6514 100644 --- a/src/mango/test/25-beginswith-test.py +++ b/src/mango/test/25-beginswith-test.py @@ -70,21 +70,27 @@ def test_compound_key(self): def test_sort(self): selector = {"location": {"$beginsWith": "A"}} - mrargs = self.get_mrargs(selector, sort=["location"]) - - self.assertEqual(mrargs["start_key"], ["A"]) - end_key_bytes = to_utf8_bytes(mrargs["end_key"]) - self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b""]) - self.assertEqual(mrargs["direction"], "fwd") - - def test_sort_desc(self): - selector = {"location": {"$beginsWith": "A"}} - mrargs = self.get_mrargs(selector, sort=[{"location": "desc"}]) - - start_key_bytes = to_utf8_bytes(mrargs["end_key"]) - self.assertEqual(start_key_bytes, [b"A"]) - self.assertEqual(mrargs["end_key"], ["A"]) - self.assertEqual(mrargs["direction"], "rev") + cases = [ + { + "sort": ["location"], + "start_key": [b"A"], + "end_key": [b"A\xef\xbf\xbd", b""], + "direction": "fwd", + }, + { + "sort": [{"location": "desc"}], + "start_key": [b"A\xef\xbf\xbd", b""], + "end_key": [b"A"], + "direction": "rev", + }, + ] + + for case in cases: + with self.subTest(sort=case["sort"]): + mrargs = self.get_mrargs(selector, sort=case["sort"]) + self.assertEqual(to_utf8_bytes(mrargs["start_key"]), case["start_key"]) + self.assertEqual(to_utf8_bytes(mrargs["end_key"]), case["end_key"]) + self.assertEqual(mrargs["direction"], case["direction"]) def test_all_docs_range(self): mrargs = self.get_mrargs({"_id": {"$beginsWith": "a"}}) From c700f122ed8478cd5b95d0d3031875073d35b5d0 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Tue, 24 Oct 2023 14:12:16 +0000 Subject: [PATCH 06/11] Refactor generated tests --- src/mango/src/mango_selector.erl | 84 +++++++++++++++----------------- 1 file changed, 38 insertions(+), 46 deletions(-) diff --git a/src/mango/src/mango_selector.erl b/src/mango/src/mango_selector.erl index c1b4d7c282e..93d3b10ca6c 100644 --- a/src/mango/src/mango_selector.erl +++ b/src/mango/src/mango_selector.erl @@ -659,6 +659,14 @@ fields({[]}) -> -ifdef(TEST). -include_lib("eunit/include/eunit.hrl"). +-define(TEST_DOC, + {[ + {<<"_id">>, <<"foo">>}, + {<<"_rev">>, <<"bar">>}, + {<<"user_id">>, 11} + ]} +). + is_constant_field_basic_test() -> Selector = normalize({[{<<"A">>, <<"foo">>}]}), Field = <<"A">>, @@ -998,30 +1006,22 @@ has_required_fields_or_nested_or_false_test() -> Normalized = normalize(Selector), ?assertEqual(false, has_required_fields(Normalized, RequiredFields)). +check_match(Selector) -> + % Call match_int/2 to avoid ERROR for missing metric; this is confusing + % in the middle of test output. + match_int(mango_selector:normalize(Selector), ?TEST_DOC). + %% This test shows the shape match/2 expects for its arguments. -match_demo_test_() -> - Doc = - {[ - {<<"_id">>, <<"foo">>}, - {<<"_rev">>, <<"bar">>}, - {<<"user_id">>, 11} - ]}, - Check = fun(Selector) -> - % Call match_int/2 to avoid ERROR for missing metric; this is confusing - % in the middle of test output. - match_int(mango_selector:normalize(Selector), Doc) - end, - [ - % matching - ?_assertEqual(true, Check({[{<<"user_id">>, 11}]})), - ?_assertEqual(true, Check({[{<<"_id">>, <<"foo">>}]})), - ?_assertEqual(true, Check({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"bar">>}]})), - % non-matching - ?_assertEqual(false, Check({[{<<"user_id">>, 1234}]})), - % string 11 doesn't match number 11 - ?_assertEqual(false, Check({[{<<"user_id">>, <<"11">>}]})), - ?_assertEqual(false, Check({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"quux">>}]})) - ]. +match_demo_test() -> + % matching + ?assertEqual(true, check_match({[{<<"user_id">>, 11}]})), + ?assertEqual(true, check_match({[{<<"_id">>, <<"foo">>}]})), + ?assertEqual(true, check_match({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"bar">>}]})), + % non-matching + ?assertEqual(false, check_match({[{<<"user_id">>, 1234}]})), + % string 11 doesn't match number 11 + ?assertEqual(false, check_match({[{<<"user_id">>, <<"11">>}]})), + ?assertEqual(false, check_match({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"quux">>}]})). fields_of(Selector) -> fields(test_util:as_selector(Selector)). @@ -1061,29 +1061,21 @@ fields_nor_test() -> }, ?assertEqual([<<"field1">>, <<"field2">>], fields_of(Selector2)). +check_beginswith(Field, Prefix) -> + Selector = {[{Field, {[{<<"$beginsWith">>, Prefix}]}}]}, + % Call match_int/2 to avoid ERROR for missing metric; this is confusing + % in the middle of test output. + match_int(mango_selector:normalize(Selector), ?TEST_DOC). + match_beginswith_test() -> - Doc = - {[ - {<<"_id">>, <<"foo">>}, - {<<"_rev">>, <<"bar">>}, - {<<"user_id">>, 11} - ]}, - Check = fun(Field, Prefix) -> - Selector = {[{Field, {[{<<"$beginsWith">>, Prefix}]}}]}, - % Call match_int/2 to avoid ERROR for missing metric; this is confusing - % in the middle of test output. - match_int(mango_selector:normalize(Selector), Doc) - end, - [ - % matching - ?assertEqual(true, Check(<<"_id">>, <<"f">>)), - % no match (user_id is not a binary string) - ?assertEqual(false, Check(<<"user_id">>, <<"f">>)), - % invalid (prefix is not a binary string) - ?assertThrow( - {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}}, - Check(<<"user_id">>, 1) - ) - ]. + % matching + ?assertEqual(true, check_beginswith(<<"_id">>, <<"f">>)), + % no match (user_id is not a binary string) + ?assertEqual(false, check_beginswith(<<"user_id">>, <<"f">>)), + % invalid (prefix is not a binary string) + ?assertThrow( + {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}}, + check_beginswith(<<"user_id">>, 1) + ). -endif. From 3a94e00d9ccc38427859b391e16f998439109623 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Thu, 26 Oct 2023 12:42:02 +0000 Subject: [PATCH 07/11] Fix lucene support --- src/docs/src/api/database/find.rst | 136 ++++++++++++++------------ src/mango/src/mango_selector_text.erl | 3 +- src/mango/test/03-operator-test.py | 41 +++++--- 3 files changed, 103 insertions(+), 77 deletions(-) diff --git a/src/docs/src/api/database/find.rst b/src/docs/src/api/database/find.rst index d2535070816..e94326b53ae 100644 --- a/src/docs/src/api/database/find.rst +++ b/src/docs/src/api/database/find.rst @@ -673,68 +673,74 @@ In addition, some 'meta' condition operators are available. Some condition operators accept any valid JSON content as the argument. Other condition operators require the argument to be in a specific JSON format. -+---------------+-------------+------------+-----------------------------------+ -| Operator type | Operator | Argument | Purpose | -+===============+=============+============+===================================+ -| (In)equality | ``$lt`` | Any JSON | The field is less than the | -| | | | argument. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$lte`` | Any JSON | The field is less than or equal to| -| | | | the argument. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$eq`` | Any JSON | The field is equal to the argument| -+---------------+-------------+------------+-----------------------------------+ -| | ``$ne`` | Any JSON | The field is not equal to the | -| | | | argument. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$gte`` | Any JSON | The field is greater than or equal| -| | | | to the argument. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$gt`` | Any JSON | The field is greater than the | -| | | | to the argument. | -+---------------+-------------+------------+-----------------------------------+ -| Object | ``$exists`` | Boolean | Check whether the field exists or | -| | | | not, regardless of its value. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$type`` | String | Check the document field's type. | -| | | | Valid values are ``"null"``, | -| | | | ``"boolean"``, ``"number"``, | -| | | | ``"string"``, ``"array"``, and | -| | | | ``"object"``. | -+---------------+-------------+------------+-----------------------------------+ -| Array | ``$in`` | Array of | The document field must exist in | -| | | JSON values| the list provided. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$nin`` | Array of | The document field not must exist | -| | | JSON values| in the list provided. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$size`` | Integer | Special condition to match the | -| | | | length of an array field in a | -| | | | document. Non-array fields cannot | -| | | | match this condition. | -+---------------+-------------+------------+-----------------------------------+ -| Miscellaneous | ``$mod`` | [Divisor, | Divisor is a non-zero integer, | -| | | Remainder] | Remainder is any integer. | -| | | | Non-integer values result in a | -| | | | 404. Matches documents where | -| | | | ``field % Divisor == Remainder`` | -| | | | is true, and only when the | -| | | | document field is an integer. | -+---------------+-------------+------------+-----------------------------------+ -| | ``$regex`` | String | A regular expression pattern to | -| | | | match against the document field. | -| | | | Only matches when the field is a | -| | | | string value and matches the | -| | | | supplied regular expression. The | -| | | | matching algorithms are based on | -| | | | the Perl Compatible Regular | -| | | | Expression (PCRE) library. For | -| | | | more information about what is | -| | | | implemented, see the see the | -| | | | `Erlang Regular Expression | -| | | | `_. | -+---------------+-------------+------------+-----------------------------------+ ++---------------+-----------------+-------------+------------------------------------+ +| Operator type | Operator | Argument | Purpose | ++===============+=================+=============+====================================+ +| (In)equality | ``$lt`` | Any JSON | The field is less than the | +| | | | argument. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$lte`` | Any JSON | The field is less than or equal to | +| | | | the argument. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$eq`` | Any JSON | The field is equal to the argument | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$ne`` | Any JSON | The field is not equal to the | +| | | | argument. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$gte`` | Any JSON | The field is greater than or equal | +| | | | to the argument. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$gt`` | Any JSON | The field is greater than the | +| | | | to the argument. | ++---------------+-----------------+-------------+------------------------------------+ +| Object | ``$exists`` | Boolean | Check whether the field exists or | +| | | | not, regardless of its value. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$type`` | String | Check the document field's type. | +| | | | Valid values are ``"null"``, | +| | | | ``"boolean"``, ``"number"``, | +| | | | ``"string"``, ``"array"``, and | +| | | | ``"object"``. | ++---------------+-----------------+-------------+------------------------------------+ +| Array | ``$in`` | Array of | The document field must exist in | +| | | JSON values | the list provided. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$nin`` | Array of | The document field not must exist | +| | | JSON values | in the list provided. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$size`` | Integer | Special condition to match the | +| | | | length of an array field in a | +| | | | document. Non-array fields cannot | +| | | | match this condition. | ++---------------+-----------------+-------------+------------------------------------+ +| Miscellaneous | ``$mod`` | [Divisor, | Divisor is a non-zero integer, | +| | | Remainder] | Remainder is any integer. | +| | | | Non-integer values result in a | +| | | | 404. Matches documents where | +| | | | ``field % Divisor == Remainder`` | +| | | | is true, and only when the | +| | | | document field is an integer. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$regex`` | String | A regular expression pattern to | +| | | | match against the document field. | +| | | | Only matches when the field is a | +| | | | string value and matches the | +| | | | supplied regular expression. The | +| | | | matching algorithms are based on | +| | | | the Perl Compatible Regular | +| | | | Expression (PCRE) library. For | +| | | | more information about what is | +| | | | implemented, see the see the | +| | | | `Erlang Regular Expression | +| | | | `_. | ++---------------+-----------------+-------------+------------------------------------+ +| | ``$beginsWith`` | String | Matches where the document field | +| | | | begins with the specified prefix | +| | | | (case-sensitive). If the document | +| | | | field contains a non-string value, | +| | | | the document is not matched. | ++---------------+-----------------+-------------+------------------------------------+ .. warning:: Regular expressions do not work with indexes, so they should not be used to @@ -754,8 +760,10 @@ can itself be another operator with arguments of its own. This enables us to build up more complex selector expressions. However, only equality operators such as ``$eq``, ``$gt``, ``$gte``, ``$lt``, -and ``$lte`` (but not ``$ne``) can be used as the basis of a query. You should -include at least one of these in a selector. +``$lte`` and ``$beginsWith`` (but not ``$ne``) can be used as the basis +of a query that can make efficient use of a ``json`` index. You should +include at least one of these in a selector, or consider using +a ``text`` index if more flexibility is required. For example, if you try to perform a query that attempts to match all documents that have a field called `afieldname` containing a value that begins with the diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl index 4a50ff9ba6a..7d8f739232d 100644 --- a/src/mango/src/mango_selector_text.erl +++ b/src/mango/src/mango_selector_text.erl @@ -143,8 +143,9 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) -> false -> {op_not, {FieldExists, false}} end; convert(Path, {[{<<"$beginsWith">>, Arg}]}) when is_binary(Arg) -> + Prefix = mango_util:lucene_escape_query_value(Arg), Suffix = <<"*">>, - PrefixSearch = value_str(<>), + PrefixSearch = <>, {op_field, {make_field(Path, Arg), PrefixSearch}}; % We're not checking the actual type here, just looking for % anything that has a possibility of matching by checking diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py index b43aacf5f69..3b1a4656502 100644 --- a/src/mango/test/03-operator-test.py +++ b/src/mango/test/03-operator-test.py @@ -10,12 +10,13 @@ # License for the specific language governing permissions and limitations under # the License. +from requests.exceptions import HTTPError import mango import unittest class BaseOperatorTests: - class Common(object): + class Common(unittest.TestCase): def assertUserIds(self, user_ids, docs): user_ids_returned = list(d["user_id"] for d in docs) user_ids.sort() @@ -142,20 +143,36 @@ def test_exists_false_returns_missing_but_not_null(self): self.assertNotIn("twitter", d) def test_beginswith(self): - docs = self.db.find({"location.state": {"$beginsWith": "New"}}) - self.assertEqual(len(docs), 2) - self.assertUserIds([2, 10], docs) + cases = [ + {"prefix": "New", "user_ids": [2, 10]}, + { + # test escaped characters - note the space in the test string + "prefix": "New ", + "user_ids": [2, 10], + }, + { + # non-string values in documents should not match the prefix, + # but should not error + "prefix": "Foo", + "user_ids": [], + }, + {"prefix": " New", "user_ids": []}, + ] - # non-string prefixes should return an error - def test_beginswith_invalid_prefix(self): - docs = self.db.find({"location.state": {"$beginsWith": 123}}) - self.assertEqual(len(docs), 2) + for case in cases: + with self.subTest(prefix=case["prefix"]): + selector = {"location.state": {"$beginsWith": case["prefix"]}} + docs = self.db.find(selector) + self.assertEqual(len(docs), len(case["user_ids"])) + self.assertUserIds(case["user_ids"], docs) - # non-string values in documents should not match the prefix, - # but should not error + # non-string prefixes should return an error def test_beginswith_invalid_prefix(self): - docs = self.db.find({"user_id": {"$beginsWith": "Foo"}}) - self.assertEqual(len(docs), 0) + cases = [123, True, [], {}] + for prefix in cases: + with self.subTest(prefix=prefix): + with self.assertRaises(HTTPError): + self.db.find({"location.state": {"$beginsWith": prefix}}) class OperatorJSONTests(mango.UserDocsTests, BaseOperatorTests.Common): From b07e0673f2f2d18ff5304b0380d04f20b0f0bf24 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Thu, 26 Oct 2023 14:32:57 +0000 Subject: [PATCH 08/11] additional special character tests --- src/mango/test/03-operator-test.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py index 3b1a4656502..8d216d9d8de 100644 --- a/src/mango/test/03-operator-test.py +++ b/src/mango/test/03-operator-test.py @@ -143,19 +143,19 @@ def test_exists_false_returns_missing_but_not_null(self): self.assertNotIn("twitter", d) def test_beginswith(self): + self.db.save_docs( + [ + {"user_id": 99, "location": {"state": ":Bar"}}, + ] + ) + cases = [ {"prefix": "New", "user_ids": [2, 10]}, - { - # test escaped characters - note the space in the test string - "prefix": "New ", - "user_ids": [2, 10], - }, - { - # non-string values in documents should not match the prefix, - # but should not error - "prefix": "Foo", - "user_ids": [], - }, + # test characters that require escaping + {"prefix": "New ", "user_ids": [2, 10]}, + {"prefix": ":", "user_ids": [99]}, + {"prefix": "Foo", "user_ids": []}, + {"prefix": '"Foo', "user_ids": []}, {"prefix": " New", "user_ids": []}, ] From a5af77787bd58207bc63d822e9959103755325d8 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Thu, 26 Oct 2023 15:05:58 +0000 Subject: [PATCH 09/11] fixup eunit tests --- src/mango/src/mango_selector_text.erl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mango/src/mango_selector_text.erl b/src/mango/src/mango_selector_text.erl index 7d8f739232d..e4f15d00d14 100644 --- a/src/mango/src/mango_selector_text.erl +++ b/src/mango/src/mango_selector_text.erl @@ -828,7 +828,7 @@ convert_nor_test() -> convert_beginswith_test() -> ?assertEqual( - {op_field, {[[<<"field">>], <<":">>, <<"string">>], <<"\"foo\\*\"">>}}, + {op_field, {[[<<"field">>], <<":">>, <<"string">>], <<"foo*">>}}, convert_selector(#{<<"field">> => #{<<"$beginsWith">> => <<"foo">>}}) ). From 8e29e6023ae33a8c94bd9890f4f81841b800535e Mon Sep 17 00:00:00 2001 From: Will Holley Date: Mon, 30 Oct 2023 09:07:19 +0000 Subject: [PATCH 10/11] address PR comments --- src/docs/src/api/database/find.rst | 12 +++++++----- src/mango/test/03-operator-test.py | 6 +++++- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/docs/src/api/database/find.rst b/src/docs/src/api/database/find.rst index e94326b53ae..5380280fd1c 100644 --- a/src/docs/src/api/database/find.rst +++ b/src/docs/src/api/database/find.rst @@ -200,8 +200,9 @@ A simple selector, inspecting specific fields: You can create more complex selector expressions by combining operators. For best performance, it is best to combine 'combination' or -'array logical' operators, such as ``$regex``, with an equality -operators such as ``$eq``, ``$gt``, ``$gte``, ``$lt``, and ``$lte`` +'array logical' operators, such as ``$regex``, with an operator +that defines a contiguous range of keys such as ``$eq``, +``$gt``, ``$gte``, ``$lt``, ``$lte``, and ``$beginsWith`` (but not ``$ne``). For more information about creating complex selector expressions, see :ref:`creating selector expressions `. @@ -759,11 +760,12 @@ In general, whenever you have an operator that takes an argument, that argument can itself be another operator with arguments of its own. This enables us to build up more complex selector expressions. -However, only equality operators such as ``$eq``, ``$gt``, ``$gte``, ``$lt``, -``$lte`` and ``$beginsWith`` (but not ``$ne``) can be used as the basis +However, only operators that define a contiguous range of values +such as ``$eq``, ``$gt``, ``$gte``, ``$lt``, ``$lte``, +and ``$beginsWith`` (but not ``$ne``) can be used as the basis of a query that can make efficient use of a ``json`` index. You should include at least one of these in a selector, or consider using -a ``text`` index if more flexibility is required. +a ``text`` index if greater flexibility is required. For example, if you try to perform a query that attempts to match all documents that have a field called `afieldname` containing a value that begins with the diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py index 8d216d9d8de..3d4503b4f7e 100644 --- a/src/mango/test/03-operator-test.py +++ b/src/mango/test/03-operator-test.py @@ -171,8 +171,12 @@ def test_beginswith_invalid_prefix(self): cases = [123, True, [], {}] for prefix in cases: with self.subTest(prefix=prefix): - with self.assertRaises(HTTPError): + try: self.db.find({"location.state": {"$beginsWith": prefix}}) + except Exception as e: + self.assertEqual(e.response.status_code, 400) + else: + raise AssertionError("expected request to fail") class OperatorJSONTests(mango.UserDocsTests, BaseOperatorTests.Common): From 5c88819dd2bf04ac3003f06631e65623fab84a49 Mon Sep 17 00:00:00 2001 From: Will Holley Date: Mon, 30 Oct 2023 16:06:36 +0000 Subject: [PATCH 11/11] remove superfluous import --- src/mango/test/03-operator-test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mango/test/03-operator-test.py b/src/mango/test/03-operator-test.py index 3d4503b4f7e..1dfd1a72510 100644 --- a/src/mango/test/03-operator-test.py +++ b/src/mango/test/03-operator-test.py @@ -10,7 +10,6 @@ # License for the specific language governing permissions and limitations under # the License. -from requests.exceptions import HTTPError import mango import unittest