diff --git a/fuzz/url_pattern.cc b/fuzz/url_pattern.cc index cbc1c8c06..f7c9da3e6 100644 --- a/fuzz/url_pattern.cc +++ b/fuzz/url_pattern.cc @@ -6,10 +6,32 @@ #include "ada.cpp" #include "ada.h" -extern "C" int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) { +std::string bytesToAlphanumeric(const std::string& source) { + static const char alphanumeric[] = + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "0123456789"; + + std::string result; + result.reserve(source.size()); + + for (char byte : source) { + int index = static_cast(byte) % (sizeof(alphanumeric) - 1); + result.push_back(alphanumeric[index]); + } + + return result; +} + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) { FuzzedDataProvider fdp(data, size); - std::string source = fdp.ConsumeRandomLengthString(256); - std::string base_source = fdp.ConsumeRandomLengthString(256); + // We do not want to trigger arbitrary regex matching. + std::string source = + "/" + bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50)) + "/" + + bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50)); + std::string base_source = + "/" + bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50)) + "/" + + bytesToAlphanumeric(fdp.ConsumeRandomLengthString(50)); // Without base or options auto result = ada::parse_url_pattern(source, nullptr, nullptr); diff --git a/fuzz/url_pattern.options b/fuzz/url_pattern.options index c1025fc28..a299d38f7 100644 --- a/fuzz/url_pattern.options +++ b/fuzz/url_pattern.options @@ -1,4 +1,5 @@ [libfuzzer] dict = url.dict -max_len = 512 -rss_limit_mb = 3000 +max_len = 100 +rss_limit_mb = 16000 +timeout = 60 diff --git a/include/ada.h b/include/ada.h index 7c579d95d..30a884fe9 100644 --- a/include/ada.h +++ b/include/ada.h @@ -30,6 +30,7 @@ #include "ada/url_pattern-inl.h" #include "ada/url_pattern_helpers.h" #include "ada/url_pattern_helpers-inl.h" +#include "ada/url_pattern_regex.h" // Public API #include "ada/ada_version.h" diff --git 
a/include/ada/implementation.h b/include/ada/implementation.h index 33bf67978..ae3ef2080 100644 --- a/include/ada/implementation.h +++ b/include/ada/implementation.h @@ -11,10 +11,8 @@ #include "ada/parser.h" #include "ada/common_defs.h" -#include "ada/encoding_type.h" #include "ada/url.h" -#include "ada/state.h" -#include "ada/url_aggregator.h" +#include "ada/url_pattern_regex.h" namespace ada { enum class errors : uint8_t { type_error }; @@ -56,12 +54,17 @@ bool can_parse(std::string_view input, * @param input valid UTF-8 string or URLPatternInit struct * @param base_url an optional valid UTF-8 string * @param options an optional url_pattern_options struct + * @param provider an optional regex provider. If not provided, it will + * use ada::url_pattern_regex::std_regex_provider * @return url_pattern instance */ -ada_warn_unused tl::expected parse_url_pattern( - std::variant input, - const std::string_view* base_url = nullptr, - const url_pattern_options* options = nullptr); +template +ada_warn_unused tl::expected, errors> +parse_url_pattern(std::variant input, + const std::string_view* base_url = nullptr, + const url_pattern_options* options = nullptr, + std::optional provider = std::nullopt); /** * Computes a href string from a file path. 
The function assumes diff --git a/include/ada/parser.h b/include/ada/parser.h index 80e97decc..02b7dadf1 100644 --- a/include/ada/parser.h +++ b/include/ada/parser.h @@ -9,6 +9,7 @@ #include #include "ada/expected.h" +#include "ada/url_pattern_regex.h" /** * @private @@ -16,6 +17,7 @@ namespace ada { struct url_aggregator; struct url; +template class url_pattern; struct url_pattern_options; struct url_pattern_init; @@ -51,9 +53,11 @@ extern template url_aggregator parse_url_impl( extern template url parse_url_impl(std::string_view user_input, const url* base_url); -tl::expected parse_url_pattern_impl( +template +tl::expected, errors> parse_url_pattern_impl( std::variant input, - const std::string_view* base_url, const url_pattern_options* options); + const std::string_view* base_url, const url_pattern_options* options, + regex_provider&& provider); } // namespace ada::parser diff --git a/include/ada/url_aggregator.h b/include/ada/url_aggregator.h index 66f7991c3..053e577fa 100644 --- a/include/ada/url_aggregator.h +++ b/include/ada/url_aggregator.h @@ -222,9 +222,11 @@ struct url_aggregator : url_base { friend url_aggregator parser::parse_url_impl( std::string_view, const url_aggregator *); // url_pattern methods - friend tl::expected parse_url_pattern_impl( - std::variant input, - const std::string_view *base_url, const url_pattern_options *options); + template + friend tl::expected, errors> + parse_url_pattern_impl(std::variant input, + const std::string_view *base_url, + const url_pattern_options *options); std::string buffer{}; url_components components{}; diff --git a/include/ada/url_pattern-inl.h b/include/ada/url_pattern-inl.h index 2ad9e0af4..7e621bc86 100644 --- a/include/ada/url_pattern-inl.h +++ b/include/ada/url_pattern-inl.h @@ -24,7 +24,8 @@ inline bool url_pattern_component_result::operator==( return input == other.input && groups == other.groups; } -inline std::string url_pattern_component::to_string() const { +template +std::string 
url_pattern_component::to_string() const { #ifdef ADA_HAS_FORMAT return std::format(R"({{"pattern": "{}", "has_regexp_groups": {}}})", pattern, has_regexp_groups ? "true" : "false" //, @@ -34,9 +35,10 @@ inline std::string url_pattern_component::to_string() const { #endif } -inline url_pattern_component_result -url_pattern_component::create_component_match_result( - std::string_view input, const std::smatch& exec_result) { +template +url_pattern_component_result +url_pattern_component::create_component_match_result( + std::string_view input, std::vector&& exec_result) { // Let result be a new URLPatternComponentResult. // Set result["input"] to input. // Let groups be a record. @@ -44,33 +46,27 @@ url_pattern_component::create_component_match_result( url_pattern_component_result{.input = std::string(input), .groups = {}}; // If input is empty, then groups will always be empty. - if (input.empty()) { + if (input.empty() || exec_result.empty()) { return result; } // Optimization: Let's reserve the size. result.groups.reserve(exec_result.size() - 1); - size_t group_index = 0; - // Let index be 1. - // While index is less than Get(execResult, "length"): - for (size_t index = 1; index < exec_result.size(); index++) { - // Let name be component’s group name list[index - 1]. - // Let value be Get(execResult, ToString(index)). - // Set groups[name] to value. - auto exec = exec_result[index]; - if (!exec.matched) continue; + // We explicitly start iterating from 0 even though the spec + // says we should start from 1. This case is handled by the + // std_regex_provider. 
+ for (size_t index = 0; index < exec_result.size(); index++) { result.groups.insert({ - group_name_list[group_index], - exec.str(), + group_name_list[index], + std::move(exec_result[index]), }); - - group_index++; } return result; } -inline std::string url_pattern::to_string() const { +template +std::string url_pattern::to_string() const { #ifdef ADA_HAS_FORMAT return std::format( R"({{"protocol_component": "{}", "username_component": {}, "password_component": {}, "hostname_component": {}, "port_component": {}, "pathname_component": {}, "search_component": {}, "hash_component": {}, "ignore_case": {}}})", @@ -84,42 +80,60 @@ inline std::string url_pattern::to_string() const { #endif } -inline std::string_view url_pattern::get_protocol() const ada_lifetime_bound { +template +std::string_view url_pattern::get_protocol() const + ada_lifetime_bound { // Return this's associated URL pattern's protocol component's pattern string. return protocol_component.pattern; } -inline std::string_view url_pattern::get_username() const ada_lifetime_bound { +template +std::string_view url_pattern::get_username() const + ada_lifetime_bound { // Return this's associated URL pattern's username component's pattern string. return username_component.pattern; } -inline std::string_view url_pattern::get_password() const ada_lifetime_bound { +template +std::string_view url_pattern::get_password() const + ada_lifetime_bound { // Return this's associated URL pattern's password component's pattern string. return password_component.pattern; } -inline std::string_view url_pattern::get_hostname() const ada_lifetime_bound { +template +std::string_view url_pattern::get_hostname() const + ada_lifetime_bound { // Return this's associated URL pattern's hostname component's pattern string. 
return hostname_component.pattern; } -inline std::string_view url_pattern::get_port() const ada_lifetime_bound { +template +std::string_view url_pattern::get_port() const + ada_lifetime_bound { // Return this's associated URL pattern's port component's pattern string. return port_component.pattern; } -inline std::string_view url_pattern::get_pathname() const ada_lifetime_bound { +template +std::string_view url_pattern::get_pathname() const + ada_lifetime_bound { // Return this's associated URL pattern's pathname component's pattern string. return pathname_component.pattern; } -inline std::string_view url_pattern::get_search() const ada_lifetime_bound { +template +std::string_view url_pattern::get_search() const + ada_lifetime_bound { // Return this's associated URL pattern's search component's pattern string. return search_component.pattern; } -inline std::string_view url_pattern::get_hash() const ada_lifetime_bound { +template +std::string_view url_pattern::get_hash() const + ada_lifetime_bound { // Return this's associated URL pattern's hash component's pattern string. return hash_component.pattern; } - -inline bool url_pattern::ignore_case() const { return ignore_case_; } - -inline bool url_pattern::has_regexp_groups() const { +template +bool url_pattern::ignore_case() const { + return ignore_case_; +} +template +bool url_pattern::has_regexp_groups() const { // If this's associated URL pattern's has regexp groups, then return true. 
return protocol_component.has_regexp_groups || username_component.has_regexp_groups || diff --git a/include/ada/url_pattern.h b/include/ada/url_pattern.h index 4c0b897be..bfa9e4fdb 100644 --- a/include/ada/url_pattern.h +++ b/include/ada/url_pattern.h @@ -7,8 +7,8 @@ #include "ada/implementation.h" #include "ada/expected.h" +#include "ada/url_pattern_regex.h" -#include #include #include #include @@ -18,10 +18,11 @@ namespace ada { namespace parser { template + typename url_pattern_options, typename regex_provider> tl::expected parse_url_pattern_impl( std::variant input, - const std::string_view* base_url, const url_pattern_options* options); + const std::string_view* base_url, const url_pattern_options* options, + regex_provider&& provider); } // Important: C++20 allows us to use concept rather than `using` or `typedef @@ -205,19 +206,19 @@ struct url_pattern_component_result { #endif // ADA_TESTING }; +template class url_pattern_component { public: url_pattern_component() = default; // This function explicitly takes a std::string because it is moved. // To avoid unnecessary copy, move each value while calling the constructor. 
- url_pattern_component(std::string&& new_pattern, std::regex&& new_regexp, - std::regex_constants::syntax_option_type new_flags, + url_pattern_component(std::string&& new_pattern, + typename regex_provider::regex_type&& new_regexp, std::vector&& new_group_name_list, bool new_has_regexp_groups) : regexp(std::move(new_regexp)), pattern(std::move(new_pattern)), - flags(new_flags), group_name_list(new_group_name_list), has_regexp_groups(new_has_regexp_groups) {} @@ -225,17 +226,17 @@ class url_pattern_component { template static tl::expected compile( std::string_view input, F& encoding_callback, - url_pattern_compile_component_options& options); + url_pattern_compile_component_options& options, + const regex_provider& provider); // @see https://urlpattern.spec.whatwg.org/#create-a-component-match-result url_pattern_component_result create_component_match_result( - std::string_view input, const std::smatch& exec_result); + std::string_view input, std::vector&& exec_result); std::string to_string() const; - std::regex regexp{}; + typename regex_provider::regex_type regexp{}; std::string pattern{}; - std::regex_constants::syntax_option_type flags = std::regex::ECMAScript; std::vector group_name_list{}; bool has_regexp_groups = false; }; @@ -268,12 +269,11 @@ struct url_pattern_options { // defined in https://wicg.github.io/urlpattern. 
// More information about the URL Pattern syntax can be found at // https://developer.mozilla.org/en-US/docs/Web/API/URL_Pattern_API +template class url_pattern { public: - url_pattern() = default; - explicit url_pattern(std::optional&& input, - std::optional&& base_url, - std::optional&& options); + explicit url_pattern(regex_provider&& new_regex_provider) + : regex_provider_(new_regex_provider) {} /** * @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-exec @@ -294,46 +294,48 @@ class url_pattern { const url_pattern_input& input, std::string_view* base_url_string); // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-protocol - std::string_view get_protocol() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_protocol() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-username - std::string_view get_username() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_username() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-password - std::string_view get_password() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_password() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-hostname - std::string_view get_hostname() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_hostname() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-port - std::string_view get_port() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_port() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-pathname - std::string_view get_pathname() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_pathname() const ada_lifetime_bound; // @see https://urlpattern.spec.whatwg.org/#dom-urlpattern-search - std::string_view get_search() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_search() const ada_lifetime_bound; // @see 
https://urlpattern.spec.whatwg.org/#dom-urlpattern-hash - std::string_view get_hash() const ada_lifetime_bound; + [[nodiscard]] std::string_view get_hash() const ada_lifetime_bound; // If ignoreCase is true, the JavaScript regular expression created for each // pattern must use the `vi` flag. Otherwise, they must use the `v` flag. - bool ignore_case() const; + [[nodiscard]] bool ignore_case() const; // @see https://urlpattern.spec.whatwg.org/#url-pattern-has-regexp-groups - bool has_regexp_groups() const; + [[nodiscard]] bool has_regexp_groups() const; - std::string to_string() const; + [[nodiscard]] std::string to_string() const; - url_pattern_component protocol_component{}; - url_pattern_component username_component{}; - url_pattern_component password_component{}; - url_pattern_component hostname_component{}; - url_pattern_component port_component{}; - url_pattern_component pathname_component{}; - url_pattern_component search_component{}; - url_pattern_component hash_component{}; + url_pattern_component protocol_component{}; + url_pattern_component username_component{}; + url_pattern_component password_component{}; + url_pattern_component hostname_component{}; + url_pattern_component port_component{}; + url_pattern_component pathname_component{}; + url_pattern_component search_component{}; + url_pattern_component hash_component{}; bool ignore_case_ = false; + regex_provider regex_provider_; template + typename url_pattern_options, typename regex_provider_> friend tl::expected parser::parse_url_pattern_impl( std::variant input, - const std::string_view* base_url, const url_pattern_options* options); + const std::string_view* base_url, const url_pattern_options* options, + regex_provider_&& provider); }; } // namespace ada diff --git a/include/ada/url_pattern_helpers-inl.h b/include/ada/url_pattern_helpers-inl.h index 37311bb2b..3e0b3a07b 100644 --- a/include/ada/url_pattern_helpers-inl.h +++ b/include/ada/url_pattern_helpers-inl.h @@ -39,20 +39,23 @@ inline 
std::string to_string(token_type type) { } } -inline void constructor_string_parser::rewind() { +template +void constructor_string_parser::rewind() { // Set parser’s token index to parser’s component start. token_index = component_start; // Set parser’s token increment to 0. token_increment = 0; } -inline bool constructor_string_parser::is_hash_prefix() { +template +bool constructor_string_parser::is_hash_prefix() { // Return the result of running is a non-special pattern char given parser, // parser’s token index and "#". return is_non_special_pattern_char(token_index, "#"); } -inline bool constructor_string_parser::is_search_prefix() { +template +bool constructor_string_parser::is_search_prefix() { // If result of running is a non-special pattern char given parser, parser’s // token index and "?" is true, then return true. if (is_non_special_pattern_char(token_index, "?")) { @@ -84,7 +87,8 @@ inline bool constructor_string_parser::is_search_prefix() { previous_token->type == token_type::ASTERISK); } -inline bool constructor_string_parser::is_non_special_pattern_char( +template +bool constructor_string_parser::is_non_special_pattern_char( size_t index, std::string_view value) { // Let token be the result of running get a safe token given parser and index. auto token = get_safe_token(index); @@ -105,7 +109,9 @@ inline bool constructor_string_parser::is_non_special_pattern_char( token->type == token_type::INVALID_CHAR; } -inline const Token* constructor_string_parser::get_safe_token(size_t index) { +template +const Token* constructor_string_parser::get_safe_token( + size_t index) { // If index is less than parser’s token list's size, then return parser’s // token list[index]. 
if (index < token_list.size()) [[likely]] { @@ -123,19 +129,22 @@ inline const Token* constructor_string_parser::get_safe_token(size_t index) { return &token_list.back(); } -inline bool constructor_string_parser::is_group_open() const { +template +bool constructor_string_parser::is_group_open() const { // If parser’s token list[parser’s token index]'s type is "open", then return // true. return token_list[token_index].type == token_type::OPEN; } -inline bool constructor_string_parser::is_group_close() const { +template +bool constructor_string_parser::is_group_close() const { // If parser’s token list[parser’s token index]'s type is "close", then return // true. return token_list[token_index].type == token_type::CLOSE; } -inline bool constructor_string_parser::next_is_authority_slashes() { +template +bool constructor_string_parser::next_is_authority_slashes() { // If the result of running is a non-special pattern char given parser, // parser’s token index + 1, and "/" is false, then return false. if (!is_non_special_pattern_char(token_index + 1, "/")) { @@ -149,14 +158,16 @@ inline bool constructor_string_parser::next_is_authority_slashes() { return true; } -inline bool constructor_string_parser::is_protocol_suffix() { +template +bool constructor_string_parser::is_protocol_suffix() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and ":". return is_non_special_pattern_char(token_index, ":"); } -inline void constructor_string_parser::change_state(State new_state, - size_t skip) { +template +void constructor_string_parser::change_state(State new_state, + size_t skip) { // If parser’s state is not "init", not "authority", and not "done", then set // parser’s result[parser’s state] to the result of running make a component // string given parser. 
@@ -254,7 +265,8 @@ inline void constructor_string_parser::change_state(State new_state, token_increment = 0; } -inline std::string constructor_string_parser::make_component_string() { +template +std::string constructor_string_parser::make_component_string() { // Assert: parser’s token index is less than parser’s token list's size. ADA_ASSERT_TRUE(token_index < token_list.size()); @@ -273,37 +285,43 @@ inline std::string constructor_string_parser::make_component_string() { end_index - component_start_input_index); } -inline bool constructor_string_parser::is_an_identity_terminator() { +template +bool constructor_string_parser::is_an_identity_terminator() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and "@". return is_non_special_pattern_char(token_index, "@"); } -inline bool constructor_string_parser::is_pathname_start() { +template +bool constructor_string_parser::is_pathname_start() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and "/". return is_non_special_pattern_char(token_index, "/"); } -inline bool constructor_string_parser::is_password_prefix() { +template +bool constructor_string_parser::is_password_prefix() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and ":". return is_non_special_pattern_char(token_index, ":"); } -inline bool constructor_string_parser::is_an_ipv6_open() { +template +bool constructor_string_parser::is_an_ipv6_open() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and "[". return is_non_special_pattern_char(token_index, "["); } -inline bool constructor_string_parser::is_an_ipv6_close() { +template +bool constructor_string_parser::is_an_ipv6_close() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and "]". 
return is_non_special_pattern_char(token_index, "]"); } -inline bool constructor_string_parser::is_port_prefix() { +template +bool constructor_string_parser::is_port_prefix() { // Return the result of running is a non-special pattern char given parser, // parser’s token index, and ":". return is_non_special_pattern_char(token_index, ":"); diff --git a/include/ada/url_pattern_helpers.h b/include/ada/url_pattern_helpers.h index 4d9c29f65..d62c0e612 100644 --- a/include/ada/url_pattern_helpers.h +++ b/include/ada/url_pattern_helpers.h @@ -139,6 +139,7 @@ class Tokenizer { }; // @see https://urlpattern.spec.whatwg.org/#constructor-string-parser +template struct constructor_string_parser { explicit constructor_string_parser(std::string_view new_input, std::vector&& new_token_list) @@ -154,7 +155,8 @@ struct constructor_string_parser { bool is_search_prefix(); // @see https://urlpattern.spec.whatwg.org/#parse-a-constructor-string - static tl::expected parse(std::string_view input); + static tl::expected parse(std::string_view input, + regex_provider provider); // @see https://urlpattern.spec.whatwg.org/#constructor-string-parser-state enum class State { @@ -185,7 +187,8 @@ struct constructor_string_parser { // @see // https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag - std::optional compute_protocol_matches_special_scheme_flag(); + std::optional compute_protocol_matches_special_scheme_flag( + regex_provider provider); // @see https://urlpattern.spec.whatwg.org/#next-is-authority-slashes bool next_is_authority_slashes(); @@ -321,8 +324,9 @@ bool is_ipv6_address(std::string_view input) noexcept; // @see // https://urlpattern.spec.whatwg.org/#protocol-component-matches-a-special-scheme +template bool protocol_component_matches_special_scheme( - ada::url_pattern_component& input); + ada::url_pattern_component& input); // @see https://urlpattern.spec.whatwg.org/#convert-a-modifier-to-a-string std::string 
convert_modifier_to_string(url_pattern_part_modifier modifier); diff --git a/include/ada/url_pattern_regex.h b/include/ada/url_pattern_regex.h new file mode 100644 index 000000000..4116cf863 --- /dev/null +++ b/include/ada/url_pattern_regex.h @@ -0,0 +1,54 @@ +/** + * @file url_pattern_regex.h + * @brief Declaration for the URL Pattern regex provider + */ +#ifndef ADA_URL_PATTERN_REGEX_H +#define ADA_URL_PATTERN_REGEX_H + +#include +#include + +namespace ada::url_pattern_regex { + +template +concept regex_concept = requires(T t, std::string_view pattern, + bool ignore_case, std::string_view input) { + // Ensure the class has a type alias 'regex_type' + typename T::regex_type; + + // Function to create a regex instance + { + T::create_instance(pattern, ignore_case) + } -> std::same_as>; + + // Function to perform regex search + { + T::regex_search(input, std::declval()) + } -> std::same_as>>; + + // Function to match regex pattern + { + T::regex_match(input, std::declval()) + } -> std::same_as; + + // Copy constructor + { T(std::declval()) } -> std::same_as; + + // Move constructor + { T(std::declval()) } -> std::same_as; +}; + +class std_regex_provider { + public: + std_regex_provider() = default; + using regex_type = std::regex; + static std::optional create_instance(std::string_view pattern, + bool ignore_case); + static std::optional> regex_search( + std::string_view input, const regex_type& pattern); + static bool regex_match(std::string_view input, const regex_type& pattern); +}; + +} // namespace ada::url_pattern_regex + +#endif // ADA_URL_PATTERN_REGEX_H diff --git a/src/ada.cpp b/src/ada.cpp index 3d35569dd..36c66f2d9 100644 --- a/src/ada.cpp +++ b/src/ada.cpp @@ -10,4 +10,5 @@ #include "url_aggregator.cpp" #include "url_pattern.cpp" #include "url_pattern_helpers.cpp" +#include "url_pattern_regex.cpp" #include "ada_c.cpp" diff --git a/src/implementation.cpp b/src/implementation.cpp index cad5af5ff..a4553600b 100644 --- a/src/implementation.cpp +++ 
b/src/implementation.cpp @@ -79,10 +79,37 @@ ada_warn_unused std::string to_string(ada::encoding_type type) { } } -ada_warn_unused tl::expected parse_url_pattern( - std::variant input, - const std::string_view* base_url, const url_pattern_options* options) { - return parser::parse_url_pattern_impl(std::move(input), base_url, options); +template +ada_warn_unused tl::expected, errors> +parse_url_pattern(std::variant input, + const std::string_view* base_url, + const url_pattern_options* options, + std::optional provider) { + return parser::parse_url_pattern_impl( + std::move(input), base_url, options, + provider.value_or(url_pattern_regex::std_regex_provider())); } +template ada_warn_unused + tl::expected, errors> + parse_url_pattern( + std::variant input, + const std::string_view* base_url, const url_pattern_options* options, + std::optional provider); + +template result> +url_pattern::exec( + const url_pattern_input& input, std::string_view* base_url); + +template result url_pattern::test( + const url_pattern_input& input, std::string_view* base_url); +namespace parser { +template tl::expected, + errors> +parse_url_pattern_impl(std::variant input, + const std::string_view* base_url, + const url_pattern_options* options, + url_pattern_regex::std_regex_provider&& provider); +} // namespace parser + } // namespace ada diff --git a/src/parser.cpp b/src/parser.cpp index 09c6ad283..b3816d61c 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -898,17 +898,20 @@ result_type parse_url_impl(std::string_view user_input, return url; } -tl::expected parse_url_pattern_impl( +template +tl::expected, errors> parse_url_pattern_impl( std::variant input, - const std::string_view* base_url, const url_pattern_options* options) { + const std::string_view* base_url, const url_pattern_options* options, + regex_provider&& provider) { // Let init be null. 
url_pattern_init init; // If input is a scalar value string then: if (std::holds_alternative(input)) { // Set init to the result of running parse a constructor string given input. - auto parse_result = url_pattern_helpers::constructor_string_parser::parse( - std::get(input)); + auto parse_result = + url_pattern_helpers::constructor_string_parser::parse( + std::get(input), provider); if (!parse_result) { ada_log("constructor_string_parser::parse failed"); return tl::unexpected(parse_result.error()); @@ -983,15 +986,15 @@ tl::expected parse_url_pattern_impl( } // Let urlPattern be a new URL pattern. - auto url_pattern_ = url_pattern{}; + url_pattern url_pattern_(std::move(provider)); // Set urlPattern’s protocol component to the result of compiling a component // given processedInit["protocol"], canonicalize a protocol, and default // options. - auto protocol_component = url_pattern_component::compile( + auto protocol_component = url_pattern_component::compile( processed_init->protocol.value(), url_pattern_helpers::canonicalize_protocol, - url_pattern_compile_component_options::DEFAULT); + url_pattern_compile_component_options::DEFAULT, provider); if (!protocol_component) { ada_log("url_pattern_component::compile failed for protocol ", processed_init->protocol.value()); @@ -1002,10 +1005,10 @@ tl::expected parse_url_pattern_impl( // Set urlPattern’s username component to the result of compiling a component // given processedInit["username"], canonicalize a username, and default // options. 
- auto username_component = url_pattern_component::compile( + auto username_component = url_pattern_component::compile( processed_init->username.value(), url_pattern_helpers::canonicalize_username, - url_pattern_compile_component_options::DEFAULT); + url_pattern_compile_component_options::DEFAULT, provider); if (!username_component) { ada_log("url_pattern_component::compile failed for username ", processed_init->username.value()); @@ -1016,10 +1019,10 @@ tl::expected parse_url_pattern_impl( // Set urlPattern’s password component to the result of compiling a component // given processedInit["password"], canonicalize a password, and default // options. - auto password_component = url_pattern_component::compile( + auto password_component = url_pattern_component::compile( processed_init->password.value(), url_pattern_helpers::canonicalize_password, - url_pattern_compile_component_options::DEFAULT); + url_pattern_compile_component_options::DEFAULT, provider); if (!password_component) { ada_log("url_pattern_component::compile failed for password ", processed_init->password.value()); @@ -1038,10 +1041,10 @@ tl::expected parse_url_pattern_impl( // then set urlPattern’s hostname component to the result of compiling a // component given processedInit["hostname"], canonicalize an IPv6 hostname, // and hostname options. 
- auto hostname_component = url_pattern_component::compile( + auto hostname_component = url_pattern_component::compile( processed_init->hostname.value(), url_pattern_helpers::canonicalize_ipv6_hostname, - url_pattern_compile_component_options::DEFAULT); + url_pattern_compile_component_options::DEFAULT, provider); if (!hostname_component) { ada_log("url_pattern_component::compile failed for ipv6 hostname ", processed_init->hostname.value()); @@ -1052,10 +1055,10 @@ tl::expected parse_url_pattern_impl( // Otherwise, set urlPattern’s hostname component to the result of compiling // a component given processedInit["hostname"], canonicalize a hostname, and // hostname options. - auto hostname_component = url_pattern_component::compile( + auto hostname_component = url_pattern_component::compile( processed_init->hostname.value(), url_pattern_helpers::canonicalize_hostname, - url_pattern_compile_component_options::HOSTNAME); + url_pattern_compile_component_options::HOSTNAME, provider); if (!hostname_component) { ada_log("url_pattern_component::compile failed for hostname ", processed_init->hostname.value()); @@ -1066,9 +1069,9 @@ tl::expected parse_url_pattern_impl( // Set urlPattern’s port component to the result of compiling a component // given processedInit["port"], canonicalize a port, and default options. - auto port_component = url_pattern_component::compile( + auto port_component = url_pattern_component::compile( processed_init->port.value(), url_pattern_helpers::canonicalize_port, - url_pattern_compile_component_options::DEFAULT); + url_pattern_compile_component_options::DEFAULT, provider); if (!port_component) { ada_log("url_pattern_component::compile failed for port ", processed_init->port.value()); @@ -1086,8 +1089,8 @@ tl::expected parse_url_pattern_impl( // TODO: Optimization opportunity: Simplify this if statement. 
// If the result of running protocol component matches a special scheme given // urlPattern’s protocol component is true, then: - if (url_pattern_helpers::protocol_component_matches_special_scheme( - url_pattern_.protocol_component)) { + if (url_pattern_helpers::protocol_component_matches_special_scheme< + regex_provider>(url_pattern_.protocol_component)) { // Let pathCompileOptions be copy of the pathname options with the ignore // case property set to options["ignoreCase"]. auto path_compile_options = url_pattern_compile_component_options::PATHNAME; @@ -1098,9 +1101,10 @@ tl::expected parse_url_pattern_impl( // Set urlPattern’s pathname component to the result of compiling a // component given processedInit["pathname"], canonicalize a pathname, and // pathCompileOptions. - auto pathname_component = url_pattern_component::compile( + auto pathname_component = url_pattern_component::compile( processed_init->pathname.value(), - url_pattern_helpers::canonicalize_pathname, path_compile_options); + url_pattern_helpers::canonicalize_pathname, path_compile_options, + provider); if (!pathname_component) { ada_log("url_pattern_component::compile failed for pathname ", processed_init->pathname.value()); @@ -1111,9 +1115,10 @@ tl::expected parse_url_pattern_impl( // Otherwise set urlPattern’s pathname component to the result of compiling // a component given processedInit["pathname"], canonicalize an opaque // pathname, and compileOptions. 
- auto pathname_component = url_pattern_component::compile( + auto pathname_component = url_pattern_component::compile( processed_init->pathname.value(), - url_pattern_helpers::canonicalize_opaque_pathname, compile_options); + url_pattern_helpers::canonicalize_opaque_pathname, compile_options, + provider); if (!pathname_component) { ada_log("url_pattern_component::compile failed for opaque pathname ", processed_init->pathname.value()); @@ -1124,9 +1129,9 @@ tl::expected parse_url_pattern_impl( // Set urlPattern’s search component to the result of compiling a component // given processedInit["search"], canonicalize a search, and compileOptions. - auto search_component = url_pattern_component::compile( + auto search_component = url_pattern_component::compile( processed_init->search.value(), url_pattern_helpers::canonicalize_search, - compile_options); + compile_options, provider); if (!search_component) { ada_log("url_pattern_component::compile failed for search ", processed_init->search.value()); @@ -1136,9 +1141,9 @@ tl::expected parse_url_pattern_impl( // Set urlPattern’s hash component to the result of compiling a component // given processedInit["hash"], canonicalize a hash, and compileOptions. 
- auto hash_component = url_pattern_component::compile( + auto hash_component = url_pattern_component::compile( processed_init->hash.value(), url_pattern_helpers::canonicalize_hash, - compile_options); + compile_options, provider); if (!hash_component) { ada_log("url_pattern_component::compile failed for hash ", processed_init->hash.value()); @@ -1165,4 +1170,4 @@ template url parse_url(std::string_view user_input, const url* base_url = nullptr); template url_aggregator parse_url( std::string_view user_input, const url_aggregator* base_url = nullptr); -} // namespace ada::parser +} // namespace ada::parser \ No newline at end of file diff --git a/src/url_pattern.cpp b/src/url_pattern.cpp index 95ec41ded..7096b5019 100644 --- a/src/url_pattern.cpp +++ b/src/url_pattern.cpp @@ -2,7 +2,6 @@ #include #include -#include #include namespace ada { @@ -450,10 +449,13 @@ std::string url_pattern_init::to_string() const { return answer; } +template template -tl::expected url_pattern_component::compile( +tl::expected, errors> +url_pattern_component::compile( std::string_view input, F& encoding_callback, - url_pattern_compile_component_options& options) { + url_pattern_compile_component_options& options, + const regex_provider& provider) { ada_log("url_pattern_component::compile input: ", input); // Let part list be the result of running parse a pattern string given input, // options, and encoding callback. @@ -473,13 +475,6 @@ tl::expected url_pattern_component::compile( ada_log("regular expression string: ", regular_expression_string); - // Let flags be an empty string. - // If options’s ignore case is true then set flags to "vi". - // Otherwise set flags to "v" - auto flags = options.ignore_case - ? std::regex::icase | std::regex_constants::ECMAScript - : std::regex_constants::ECMAScript; - // Let pattern string be the result of running generate a pattern // string given part list and options. 
auto pattern_string = @@ -488,12 +483,10 @@ tl::expected url_pattern_component::compile( // Let regular expression be RegExpCreate(regular expression string, // flags). If this throws an exception, catch it, and throw a // TypeError. - std::regex regular_expression; - try { - regular_expression = std::regex(regular_expression_string, flags); - } catch (std::regex_error& error) { - (void)error; - ada_log("std::regex_error: ", error.what()); + std::optional regular_expression = + provider.create_instance(regular_expression_string, options.ignore_case); + + if (!regular_expression) { return tl::unexpected(errors::type_error); } @@ -507,20 +500,22 @@ tl::expected url_pattern_component::compile( // Return a new component whose pattern string is pattern string, regular // expression is regular expression, group name list is name list, and has // regexp groups is has regexp groups. - return url_pattern_component(std::move(pattern_string), - std::move(regular_expression), flags, - std::move(name_list), has_regexp_groups); + return url_pattern_component( + std::move(pattern_string), std::move(*regular_expression), + std::move(name_list), has_regexp_groups); } -result> url_pattern::exec( - const url_pattern_input& input, std::string_view* base_url = nullptr) { +template +result> url_pattern::exec( + const url_pattern_input& input, std::string_view* base_url) { // Return the result of match given this's associated URL pattern, input, and // baseURL if given. return match(input, base_url); } -result url_pattern::test(const url_pattern_input& input, - std::string_view* base_url = nullptr) { +template +result url_pattern::test(const url_pattern_input& input, + std::string_view* base_url) { // TODO: Optimization opportunity. Rather than returning `url_pattern_result` // Implement a fast path just like `can_parse()` in ada_url. 
// Let result be the result of match given this's associated URL pattern, @@ -532,7 +527,8 @@ result url_pattern::test(const url_pattern_input& input, return tl::unexpected(errors::type_error); } -result> url_pattern::match( +template +result> url_pattern::match( const url_pattern_input& input, std::string_view* base_url_string) { std::string protocol{}; std::string username{}; @@ -681,60 +677,45 @@ result> url_pattern::match( } } - auto regex_flags = std::regex_constants::match_any; - // Let protocolExecResult be RegExpBuiltinExec(urlPattern’s protocol // component's regular expression, protocol). - std::smatch protocol_exec_result_value; auto protocol_exec_result = - std::regex_search(protocol, protocol_exec_result_value, - protocol_component.regexp, regex_flags); + regex_provider::regex_search(protocol, protocol_component.regexp); // Let usernameExecResult be RegExpBuiltinExec(urlPattern’s username // component's regular expression, username). - std::smatch username_exec_result_value; auto username_exec_result = - std::regex_search(username, username_exec_result_value, - username_component.regexp, regex_flags); + regex_provider::regex_search(username, username_component.regexp); // Let passwordExecResult be RegExpBuiltinExec(urlPattern’s password // component's regular expression, password). - std::smatch password_exec_result_value; auto password_exec_result = - std::regex_search(password, password_exec_result_value, - password_component.regexp, regex_flags); + regex_provider::regex_search(password, password_component.regexp); // Let hostnameExecResult be RegExpBuiltinExec(urlPattern’s hostname // component's regular expression, hostname). 
- std::smatch hostname_exec_result_value; auto hostname_exec_result = - std::regex_search(hostname, hostname_exec_result_value, - hostname_component.regexp, regex_flags); + regex_provider::regex_search(hostname, hostname_component.regexp); // Let portExecResult be RegExpBuiltinExec(urlPattern’s port component's // regular expression, port). - std::smatch port_exec_result_value; - auto port_exec_result = std::regex_search(port, port_exec_result_value, - port_component.regexp, regex_flags); + auto port_exec_result = + regex_provider::regex_search(port, port_component.regexp); // Let pathnameExecResult be RegExpBuiltinExec(urlPattern’s pathname // component's regular expression, pathname). - std::smatch pathname_exec_result_value; auto pathname_exec_result = - std::regex_search(pathname, pathname_exec_result_value, - pathname_component.regexp, regex_flags); + regex_provider::regex_search(pathname, pathname_component.regexp); // Let searchExecResult be RegExpBuiltinExec(urlPattern’s search component's // regular expression, search). - std::smatch search_exec_result_value; - auto search_exec_result = std::regex_search( - search, search_exec_result_value, search_component.regexp, regex_flags); + auto search_exec_result = + regex_provider::regex_search(search, search_component.regexp); // Let hashExecResult be RegExpBuiltinExec(urlPattern’s hash component's // regular expression, hash). - std::smatch hash_exec_result_value; - auto hash_exec_result = std::regex_search(hash, hash_exec_result_value, - hash_component.regexp, regex_flags); + auto hash_exec_result = + regex_provider::regex_search(hash, hash_component.regexp); // If protocolExecResult, usernameExecResult, passwordExecResult, // hostnameExecResult, portExecResult, pathnameExecResult, searchExecResult, @@ -752,42 +733,42 @@ result> url_pattern::match( // Set result["protocol"] to the result of creating a component match result // given urlPattern’s protocol component, protocol, and protocolExecResult. 
result.protocol = protocol_component.create_component_match_result( - protocol, protocol_exec_result_value); + protocol, std::move(*protocol_exec_result)); // Set result["username"] to the result of creating a component match result // given urlPattern’s username component, username, and usernameExecResult. result.username = username_component.create_component_match_result( - username, username_exec_result_value); + username, std::move(*username_exec_result)); // Set result["password"] to the result of creating a component match result // given urlPattern’s password component, password, and passwordExecResult. result.password = password_component.create_component_match_result( - password, password_exec_result_value); + password, std::move(*password_exec_result)); // Set result["hostname"] to the result of creating a component match result // given urlPattern’s hostname component, hostname, and hostnameExecResult. result.hostname = hostname_component.create_component_match_result( - hostname, hostname_exec_result_value); + hostname, std::move(*hostname_exec_result)); // Set result["port"] to the result of creating a component match result given // urlPattern’s port component, port, and portExecResult. result.port = port_component.create_component_match_result( - port, port_exec_result_value); + port, std::move(*port_exec_result)); // Set result["pathname"] to the result of creating a component match result // given urlPattern’s pathname component, pathname, and pathnameExecResult. result.pathname = pathname_component.create_component_match_result( - pathname, pathname_exec_result_value); + pathname, std::move(*pathname_exec_result)); // Set result["search"] to the result of creating a component match result // given urlPattern’s search component, search, and searchExecResult. 
result.search = search_component.create_component_match_result( - search, search_exec_result_value); + search, std::move(*search_exec_result)); // Set result["hash"] to the result of creating a component match result given // urlPattern’s hash component, hash, and hashExecResult. result.hash = hash_component.create_component_match_result( - hash, hash_exec_result_value); + hash, std::move(*hash_exec_result)); return result; } diff --git a/src/url_pattern_helpers.cpp b/src/url_pattern_helpers.cpp index 56927635b..ea9138791 100644 --- a/src/url_pattern_helpers.cpp +++ b/src/url_pattern_helpers.cpp @@ -189,17 +189,20 @@ std::string generate_segment_wildcard_regexp( ada_log("generate_segment_wildcard_regexp result: ", result); return result; } - +template bool protocol_component_matches_special_scheme( - url_pattern_component& component) { + url_pattern_component& component) { auto regex = component.regexp; - return std::regex_match("http", regex) || std::regex_match("https", regex) || - std::regex_match("ws", regex) || std::regex_match("wss", regex) || - std::regex_match("ftp", regex); + return regex_provider::regex_match("http", regex) || + regex_provider::regex_match("https", regex) || + regex_provider::regex_match("ws", regex) || + regex_provider::regex_match("wss", regex) || + regex_provider::regex_match("ftp", regex); } -inline std::optional -constructor_string_parser::compute_protocol_matches_special_scheme_flag() { +template +inline std::optional constructor_string_parser:: + compute_protocol_matches_special_scheme_flag(regex_provider provider) { ada_log( "constructor_string_parser::compute_protocol_matches_special_scheme_" "flag"); @@ -208,9 +211,9 @@ constructor_string_parser::compute_protocol_matches_special_scheme_flag() { auto protocol_string = make_component_string(); // Let protocol component be the result of compiling a component given // protocol string, canonicalize a protocol, and default options. 
- auto protocol_component = url_pattern_component::compile( + auto protocol_component = url_pattern_component::compile( protocol_string, canonicalize_protocol, - url_pattern_compile_component_options::DEFAULT); + url_pattern_compile_component_options::DEFAULT, provider); if (!protocol_component) { ada_log("url_pattern_component::compile failed for protocol_string ", protocol_string); @@ -470,8 +473,10 @@ tl::expected canonicalize_hash(std::string_view input) { return tl::unexpected(errors::type_error); } -tl::expected constructor_string_parser::parse( - std::string_view input) { +template +tl::expected +constructor_string_parser::parse(std::string_view input, + regex_provider provider) { ada_log("constructor_string_parser::parse input=", input); // Let parser be a new constructor string parser whose input is input and // token list is the result of running tokenize given input and "lenient". @@ -564,7 +569,8 @@ tl::expected constructor_string_parser::parse( if (parser.is_protocol_suffix()) { // Run compute protocol matches a special scheme flag given parser. if (const auto error = - parser.compute_protocol_matches_special_scheme_flag()) { + parser.compute_protocol_matches_special_scheme_flag( + provider)) { ada_log("compute_protocol_matches_special_scheme_flag failed"); return tl::unexpected(*error); } @@ -1298,5 +1304,4 @@ std::string generate_pattern_string( // Return result. return result; } - -} // namespace ada::url_pattern_helpers +} // namespace ada::url_pattern_helpers \ No newline at end of file diff --git a/src/url_pattern_regex.cpp b/src/url_pattern_regex.cpp new file mode 100644 index 000000000..431058f87 --- /dev/null +++ b/src/url_pattern_regex.cpp @@ -0,0 +1,50 @@ +#include +#include "ada/url_pattern_regex.h" + +namespace ada::url_pattern_regex { +std::optional std_regex_provider::create_instance( + std::string_view pattern, bool ignore_case) { + // Let flags be an empty string. + // If options’s ignore case is true then set flags to "vi". 
+ // Otherwise set flags to "v" + auto flags = ignore_case + ? std::regex::icase | std::regex_constants::ECMAScript + : std::regex_constants::ECMAScript; + try { + return std::regex(pattern.data(), pattern.size(), flags); + } catch (const std::regex_error& e) { + (void)e; + ada_log("std_regex_provider::create_instance failed:", e.what()); + return std::nullopt; + } +} + +std::optional> std_regex_provider::regex_search( + std::string_view input, const std::regex& pattern) { + std::string input_str( + input.begin(), + input.end()); // Convert string_view to string for regex_search + std::smatch match_result; + if (!std::regex_search(input_str, match_result, pattern, + std::regex_constants::match_any)) { + return std::nullopt; + } + std::vector matches; + if (match_result.empty()) { + return matches; + } + matches.reserve(match_result.size()); + for (size_t i = 1; i < match_result.size(); ++i) { + if (auto entry = match_result[i]; entry.matched) { + matches.emplace_back(entry.str()); + } + } + return matches; +} + +bool std_regex_provider::regex_match(std::string_view input, + const std::regex& pattern) { + return std::regex_match(input.begin(), input.end(), pattern); +} + +} // namespace ada::url_pattern_regex diff --git a/tests/wpt_urlpattern_tests.cpp b/tests/wpt_urlpattern_tests.cpp index 6b9263a85..5f58434ce 100644 --- a/tests/wpt_urlpattern_tests.cpp +++ b/tests/wpt_urlpattern_tests.cpp @@ -28,17 +28,11 @@ TEST(wpt_urlpattern_tests, parse_pattern_string_basic_tests) { ASSERT_TRUE(part_list); } -TEST(wpt_urlpattern_tests, compile_basic_tests) { - auto protocol_component = ada::url_pattern_component::compile( - "*", ada::url_pattern_helpers::canonicalize_protocol, - ada::url_pattern_compile_component_options::DEFAULT); - ASSERT_TRUE(protocol_component); -} - TEST(wpt_urlpattern_tests, basic_tests) { auto init = ada::url_pattern_init{}; init.pathname = "/books"; - auto url = ada::parse_url_pattern(init); + auto url = + ada::parse_url_pattern(init); 
ASSERT_TRUE(url); ASSERT_EQ(url->get_protocol(), "*"); ASSERT_EQ(url->get_hostname(), "*"); @@ -224,7 +218,9 @@ parse_pattern_field(ondemand::array& patterns) { return std::tuple(*init_str, base_url, options); } -tl::expected parse_pattern( +tl::expected, + ada::errors> +parse_pattern( std::variant& init_variant, std::optional& base_url, std::optional& options) {