From f0492f6b2383f9ef3a265eb1982570739e534f21 Mon Sep 17 00:00:00 2001 From: Matt Duncan Date: Fri, 20 Jun 2025 11:46:44 -0700 Subject: [PATCH 1/3] feat(issues): Add hex parameterization experiment --- src/sentry/grouping/parameterization.py | 44 ++++++++++++++++--- src/sentry/grouping/strategies/message.py | 4 +- .../sentry/grouping/test_parameterization.py | 8 ++++ 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/src/sentry/grouping/parameterization.py b/src/sentry/grouping/parameterization.py index c06a719e833f2b..2d1bb742fa63f4 100644 --- a/src/sentry/grouping/parameterization.py +++ b/src/sentry/grouping/parameterization.py @@ -180,8 +180,36 @@ def compiled_pattern(self) -> re.Pattern[str]: ), ] +EXPERIMENT_PARAMETERIZATION_REGEXES = [ + ( + ParameterizationRegex( + name="hex", + raw_pattern=r""" + # Hex value with 0x or 0X prefix + (\b0[xX][0-9a-fA-F]+\b) | + + # Hex value without 0x or 0X prefix exactly 4 or 8 bytes long. + # + # We don't need to lookahead for a-f since if it contains at + # least one number it must contain at least one a-f otherwise it + # would have matched "int". + # + # (?=.*[0-9]): At least one 0-9 is in the 16 match. + # [0-9a-f]{16}: Exactly 16 hex characters (0-9, a-f). 
+ (\b(?=.*[0-9])[0-9a-f]{8,16}\b) + """, + ) + if r.name == "hex" + else r + ) + for r in DEFAULT_PARAMETERIZATION_REGEXES.copy() +] + DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES} +EXPERIMENT_PARAMETERIZATION_REGEXES_MAP = { + r.name: r.pattern for r in EXPERIMENT_PARAMETERIZATION_REGEXES +} @dataclasses.dataclass @@ -273,14 +301,15 @@ def __init__( self, regex_pattern_keys: Sequence[str], experiments: Sequence[ParameterizationExperiment] = (), + enable_regex_experiments: bool = False, ): + self._enable_regex_experiments = enable_regex_experiments self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys) self._experiments = experiments self.matches_counter: defaultdict[str, int] = defaultdict(int) - @staticmethod - def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]: + def _make_regex_from_patterns(self, pattern_keys: Sequence[str]) -> re.Pattern[str]: """ Takes list of pattern keys and returns a compiled regex pattern that matches any of them. @@ -292,9 +321,14 @@ def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]: so we can use newlines and indentation for better legibility in patterns above. 
""" - return re.compile( - rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}" - ) + if self._enable_regex_experiments: + return re.compile( + rf"(?x){'|'.join(EXPERIMENT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}" + ) + else: + return re.compile( + rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}" + ) def parametrize_w_regex(self, content: str) -> str: """ diff --git a/src/sentry/grouping/strategies/message.py b/src/sentry/grouping/strategies/message.py index 0ed8d1f1981dec..01e1fecadb410e 100644 --- a/src/sentry/grouping/strategies/message.py +++ b/src/sentry/grouping/strategies/message.py @@ -85,7 +85,9 @@ def _should_run_experiment(experiment_name: str) -> bool: ) parameterizer = Parameterizer( - regex_pattern_keys=REGEX_PATTERN_KEYS, experiments=(UniqueIdExperiment,) + regex_pattern_keys=REGEX_PATTERN_KEYS, + experiments=(UniqueIdExperiment,), + enable_regex_experiments=_should_run_experiment("regex"), ) normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment) diff --git a/tests/sentry/grouping/test_parameterization.py b/tests/sentry/grouping/test_parameterization.py index 3504abaf28edaa..f89d6cfa46e482 100644 --- a/tests/sentry/grouping/test_parameterization.py +++ b/tests/sentry/grouping/test_parameterization.py @@ -9,6 +9,7 @@ def parameterizer(): return Parameterizer( regex_pattern_keys=REGEX_PATTERN_KEYS, experiments=(UniqueIdExperiment,), + enable_regex_experiments=True, ) @@ -108,6 +109,8 @@ def parameterizer(): """blah had a problem""", ), ("hex", """blah 0x9af8c3b had a problem""", """blah had a problem"""), + ("hex", """blah 9af8c3b0 had a problem""", """blah had a problem"""), + ("hex", """blah 9af8c3b09af8c3b0 had a problem""", """blah had a problem"""), ("float", """blah 0.23 had a problem""", """blah had a problem"""), ("int", """blah 23 had a problem""", """blah had a problem"""), ( @@ -147,6 +150,11 @@ def parameterizer(): """A quick brown fox jumped 
over the lazy dog""", """A quick brown fox jumped over the lazy dog""", ), + ( + "Not confidently a hex", + """blah aaffccbb had a problem""", + """blah aaffccbb had a problem""", + ), ], ) def test_parameterize_standard(name, input, expected, parameterizer): From 4b274d07afc3d61082a2a4e9b4718aa45519995b Mon Sep 17 00:00:00 2001 From: Matt Duncan Date: Fri, 20 Jun 2025 12:46:16 -0700 Subject: [PATCH 2/3] chore(issues): Register regex experiment rollout --- src/sentry/options/defaults.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index 98d4051a870b01..693c220724a153 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2629,6 +2629,11 @@ default=0.0, flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE, ) +register( + "grouping.experiments.parameterization.regex", + default=0.0, + flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE, +) # TODO: For now, only a small number of projects are going through a grouping config transition at # any given time, so we're sampling at 100% in order to be able to get good signal. Once we've fully From 92be6c829233e9151e96c6b47b55f820e57153b3 Mon Sep 17 00:00:00 2001 From: Matt Duncan Date: Fri, 20 Jun 2025 13:50:16 -0700 Subject: [PATCH 3/3] fix(issues): Fix 9-15 character hex parameterization and add tests --- src/sentry/grouping/parameterization.py | 7 ++++--- tests/sentry/grouping/test_parameterization.py | 15 ++++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/sentry/grouping/parameterization.py b/src/sentry/grouping/parameterization.py index 2d1bb742fa63f4..a733484a07ec4a 100644 --- a/src/sentry/grouping/parameterization.py +++ b/src/sentry/grouping/parameterization.py @@ -194,9 +194,10 @@ def compiled_pattern(self) -> re.Pattern[str]: # least one number it must contain at least one a-f otherwise it # would have matched "int". 
# - # (?=.*[0-9]): At least one 0-9 is in the 16 match. - # [0-9a-f]{16}: Exactly 16 hex characters (0-9, a-f). - (\b(?=.*[0-9])[0-9a-f]{8,16}\b) + # (?=.*[0-9]): At least one 0-9 is in the match. + # [0-9a-f]{8/16}: Exactly 8 or 16 hex characters (0-9, a-f). + (\b(?=.*[0-9])[0-9a-f]{8}\b) | + (\b(?=.*[0-9])[0-9a-f]{16}\b) """, ) if r.name == "hex" diff --git a/tests/sentry/grouping/test_parameterization.py b/tests/sentry/grouping/test_parameterization.py index f89d6cfa46e482..615dc7302ac720 100644 --- a/tests/sentry/grouping/test_parameterization.py +++ b/tests/sentry/grouping/test_parameterization.py @@ -111,6 +111,16 @@ def parameterizer(): ("hex", """blah 0x9af8c3b had a problem""", """blah had a problem"""), ("hex", """blah 9af8c3b0 had a problem""", """blah had a problem"""), ("hex", """blah 9af8c3b09af8c3b0 had a problem""", """blah had a problem"""), + ( + "hex - missing numbers", + """blah aaffccbb had a problem""", + """blah aaffccbb had a problem""", + ), + ( + "hex - not 4 or 8 bytes", + """blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""", + """blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""", + ), ("float", """blah 0.23 had a problem""", """blah had a problem"""), ("int", """blah 23 had a problem""", """blah had a problem"""), ( @@ -150,11 +160,6 @@ def parameterizer(): """A quick brown fox jumped over the lazy dog""", """A quick brown fox jumped over the lazy dog""", ), - ( - "Not confidently a hex", - """blah aaffccbb had a problem""", - """blah aaffccbb had a problem""", - ), ], ) def test_parameterize_standard(name, input, expected, parameterizer):