diff --git a/src/sentry/grouping/parameterization.py b/src/sentry/grouping/parameterization.py index c06a719e833f2b..a733484a07ec4a 100644 --- a/src/sentry/grouping/parameterization.py +++ b/src/sentry/grouping/parameterization.py @@ -180,8 +180,37 @@ def compiled_pattern(self) -> re.Pattern[str]: ), ] +EXPERIMENT_PARAMETERIZATION_REGEXES = [ + ( + ParameterizationRegex( + name="hex", + raw_pattern=r""" + # Hex value with 0x or 0X prefix + (\b0[xX][0-9a-fA-F]+\b) | + + # Hex value without 0x or 0X prefix exactly 4 or 8 bytes long. + # + # We don't need to lookahead for a-f since we if it contains at + # least one number it must contain at least one a-f otherwise it + # would have matched "int". + # + # (?=[a-f]*[0-9]): At least one 0-9 is in the match. + # [0-9a-f]{8/16}: Exactly 8 or 16 hex characters (0-9, a-f). + (\b(?=[a-f]*[0-9])[0-9a-f]{8}\b) | + (\b(?=[a-f]*[0-9])[0-9a-f]{16}\b) + """, + ) + if r.name == "hex" + else r + ) + for r in DEFAULT_PARAMETERIZATION_REGEXES.copy() +] + DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES} +EXPERIMENT_PARAMETERIZATION_REGEXES_MAP = { + r.name: r.pattern for r in EXPERIMENT_PARAMETERIZATION_REGEXES +} @dataclasses.dataclass @@ -273,14 +302,15 @@ def __init__( self, regex_pattern_keys: Sequence[str], experiments: Sequence[ParameterizationExperiment] = (), + enable_regex_experiments: bool = False, ): + self._enable_regex_experiments = enable_regex_experiments self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys) self._experiments = experiments self.matches_counter: defaultdict[str, int] = defaultdict(int) - @staticmethod - def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]: + def _make_regex_from_patterns(self, pattern_keys: Sequence[str]) -> re.Pattern[str]: """ Takes list of pattern keys and returns a compiled regex pattern that matches any of them. 
@@ -292,9 +322,14 @@ def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]: so we can use newlines and indentation for better legibility in patterns above. """ - return re.compile( - rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}" - ) + if self._enable_regex_experiments: + return re.compile( + rf"(?x){'|'.join(EXPERIMENT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}" + ) + else: + return re.compile( + rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}" + ) def parametrize_w_regex(self, content: str) -> str: """ diff --git a/src/sentry/grouping/strategies/message.py b/src/sentry/grouping/strategies/message.py index 0ed8d1f1981dec..01e1fecadb410e 100644 --- a/src/sentry/grouping/strategies/message.py +++ b/src/sentry/grouping/strategies/message.py @@ -85,7 +85,9 @@ def _should_run_experiment(experiment_name: str) -> bool: ) parameterizer = Parameterizer( - regex_pattern_keys=REGEX_PATTERN_KEYS, experiments=(UniqueIdExperiment,) + regex_pattern_keys=REGEX_PATTERN_KEYS, + experiments=(UniqueIdExperiment,), + enable_regex_experiments=_should_run_experiment("regex"), ) normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment) diff --git a/src/sentry/options/defaults.py b/src/sentry/options/defaults.py index a48369b5a9d2da..6a8e8c4499c278 100644 --- a/src/sentry/options/defaults.py +++ b/src/sentry/options/defaults.py @@ -2624,6 +2624,11 @@ default=0.0, flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE, ) +register( + "grouping.experiments.parameterization.regex", + default=0.0, + flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE, +) # TODO: For now, only a small number of projects are going through a grouping config transition at # any given time, so we're sampling at 100% in order to be able to get good signal. 
Once we've fully diff --git a/tests/sentry/grouping/test_parameterization.py b/tests/sentry/grouping/test_parameterization.py index 3504abaf28edaa..615dc7302ac720 100644 --- a/tests/sentry/grouping/test_parameterization.py +++ b/tests/sentry/grouping/test_parameterization.py @@ -9,6 +9,7 @@ def parameterizer(): return Parameterizer( regex_pattern_keys=REGEX_PATTERN_KEYS, experiments=(UniqueIdExperiment,), + enable_regex_experiments=True, ) @@ -108,6 +109,18 @@ def parameterizer(): """blah had a problem""", ), ("hex", """blah 0x9af8c3b had a problem""", """blah had a problem"""), + ("hex", """blah 9af8c3b0 had a problem""", """blah had a problem"""), + ("hex", """blah 9af8c3b09af8c3b0 had a problem""", """blah had a problem"""), + ( + "hex - missing numbers", + """blah aaffccbb had a problem""", + """blah aaffccbb had a problem""", + ), + ( + "hex - not 4 or 8 bytes", + """blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""", + """blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""", + ), ("float", """blah 0.23 had a problem""", """blah had a problem"""), ("int", """blah 23 had a problem""", """blah had a problem"""), (