Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 40 additions & 5 deletions src/sentry/grouping/parameterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,37 @@ def compiled_pattern(self) -> re.Pattern[str]:
),
]

# Experimental variant of DEFAULT_PARAMETERIZATION_REGEXES: every entry is reused as-is
# except "hex", which is swapped for a broader pattern that also recognizes hex values
# written without a 0x/0X prefix.
#
# The digit requirement is expressed as an inline lookahead on the un-prefixed
# alternatives only. Per the author's review note, the built-in lookahead attribute on
# ParameterizationRegex is deliberately not used, since it must not affect the existing
# 0x-prefixed alternative.
EXPERIMENT_PARAMETERIZATION_REGEXES = [
    (
        ParameterizationRegex(
            name="hex",
            raw_pattern=r"""
                # Hex value with 0x or 0X prefix
                (\b0[xX][0-9a-fA-F]+\b) |

                # Hex value without 0x or 0X prefix exactly 4 or 8 bytes long.
                #
                # We don't need to lookahead for a-f: if the value contains at
                # least one number it must also contain at least one a-f,
                # otherwise it would have matched "int".
                #
                # (?=[a-f]*[0-9]): At least one 0-9 is in the candidate itself.
                #     The lookahead is restricted to hex characters on purpose;
                #     an unbounded ".*" would be satisfied by a digit anywhere
                #     later on the line and let all-letter words through.
                # [0-9a-f]{8/16}: Exactly 8 or 16 hex characters (0-9, a-f).
                (\b(?=[a-f]*[0-9])[0-9a-f]{8}\b) |
                (\b(?=[a-f]*[0-9])[0-9a-f]{16}\b)
            """,
        )
        if r.name == "hex"
        else r
    )
    # Iterating does not mutate the source list, so no defensive copy is needed.
    for r in DEFAULT_PARAMETERIZATION_REGEXES
]


# Lookup tables from a regex's `name` to its `pattern`, one per regex list.
# `_make_regex_from_patterns` indexes into one of these by pattern key when building the
# combined regex: the EXPERIMENT map when regex experiments are enabled, the DEFAULT map
# otherwise.
DEFAULT_PARAMETERIZATION_REGEXES_MAP = {r.name: r.pattern for r in DEFAULT_PARAMETERIZATION_REGEXES}
EXPERIMENT_PARAMETERIZATION_REGEXES_MAP = {
    r.name: r.pattern for r in EXPERIMENT_PARAMETERIZATION_REGEXES
}


@dataclasses.dataclass
Expand Down Expand Up @@ -273,14 +302,15 @@ def __init__(
self,
regex_pattern_keys: Sequence[str],
experiments: Sequence[ParameterizationExperiment] = (),
enable_regex_experiments: bool = False,
):
self._enable_regex_experiments = enable_regex_experiments
self._parameterization_regex = self._make_regex_from_patterns(regex_pattern_keys)
self._experiments = experiments

self.matches_counter: defaultdict[str, int] = defaultdict(int)

@staticmethod
def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]:
def _make_regex_from_patterns(self, pattern_keys: Sequence[str]) -> re.Pattern[str]:
"""
Takes list of pattern keys and returns a compiled regex pattern that matches any of them.

Expand All @@ -292,9 +322,14 @@ def _make_regex_from_patterns(pattern_keys: Sequence[str]) -> re.Pattern[str]:
so we can use newlines and indentation for better legibility in patterns above.
"""

return re.compile(
rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
)
if self._enable_regex_experiments:
return re.compile(
rf"(?x){'|'.join(EXPERIMENT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
)
else:
return re.compile(
rf"(?x){'|'.join(DEFAULT_PARAMETERIZATION_REGEXES_MAP[k] for k in pattern_keys)}"
)

def parametrize_w_regex(self, content: str) -> str:
"""
Expand Down
4 changes: 3 additions & 1 deletion src/sentry/grouping/strategies/message.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,9 @@ def _should_run_experiment(experiment_name: str) -> bool:
)

parameterizer = Parameterizer(
regex_pattern_keys=REGEX_PATTERN_KEYS, experiments=(UniqueIdExperiment,)
regex_pattern_keys=REGEX_PATTERN_KEYS,
experiments=(UniqueIdExperiment,),
enable_regex_experiments=_should_run_experiment("regex"),
)

normalized = parameterizer.parameterize_all(trimmed, _should_run_experiment)
Expand Down
5 changes: 5 additions & 0 deletions src/sentry/options/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -2624,6 +2624,11 @@
default=0.0,
flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE,
)
# Option for the "regex" parameterization experiment, registered with a default of 0.0 and
# the FLAG_RATE flag.
# NOTE(review): presumably the sample rate consulted by `_should_run_experiment("regex")`
# in the message grouping strategy — confirm against that helper.
register(
    "grouping.experiments.parameterization.regex",
    default=0.0,
    flags=FLAG_ADMIN_MODIFIABLE | FLAG_AUTOMATOR_MODIFIABLE | FLAG_RATE,
)

# TODO: For now, only a small number of projects are going through a grouping config transition at
# any given time, so we're sampling at 100% in order to be able to get good signal. Once we've fully
Expand Down
13 changes: 13 additions & 0 deletions tests/sentry/grouping/test_parameterization.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ def parameterizer():
return Parameterizer(
regex_pattern_keys=REGEX_PATTERN_KEYS,
experiments=(UniqueIdExperiment,),
enable_regex_experiments=True,
)


Expand Down Expand Up @@ -108,6 +109,18 @@ def parameterizer():
"""blah <date> had a problem""",
),
("hex", """blah 0x9af8c3b had a problem""", """blah <hex> had a problem"""),
("hex", """blah 9af8c3b0 had a problem""", """blah <hex> had a problem"""),
("hex", """blah 9af8c3b09af8c3b0 had a problem""", """blah <hex> had a problem"""),
(
"hex - missing numbers",
"""blah aaffccbb had a problem""",
"""blah aaffccbb had a problem""",
),
(
"hex - not 4 or 8 bytes",
"""blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""",
"""blah 4aaa 9aaaaaaaa 10aaaaaaaa 15aaaaaaaaaaaaa 17aaaaaaaaaaaaaaa had a problem""",
),
("float", """blah 0.23 had a problem""", """blah <float> had a problem"""),
("int", """blah 23 had a problem""", """blah <int> had a problem"""),
(
Expand Down
Loading