-
Notifications
You must be signed in to change notification settings - Fork 41
Pex 552/on demand detection triggers #416
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
b1aa8b9
af111b7
22def48
a993ed8
48be3ae
c0a3bea
dd74f91
9b09545
b8e6c12
1349ff0
3bcb748
150c209
bf024cc
842125c
1a8ff7e
7a33f57
20b1d44
5208931
4b413ad
290a5f4
2854098
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,14 +69,30 @@ class TimeoutConfig(IntEnum): | |
| """ | ||
|
|
||
| # base amount to sleep for before beginning exponential backoff during testing | ||
| BASE_SLEEP = 60 | ||
| BASE_SLEEP = 2 | ||
|
|
||
| # NOTE: Some detections take longer to generate their risk/notables than other; testing has | ||
| # shown 270s to likely be sufficient for all detections in 99% of runs; however we have | ||
| # encountered a handful of transient failures in the last few months. Since our success rate | ||
| # is at 100% now, we will round this to a flat 300s to accomodate these outliers. | ||
| # shown 30s to likely be sufficient for all detections in 99% of runs and less than 1% of detections | ||
| # would need 60s and 90s to wait for risk/notables; therefore 30s is a reasonable interval for max | ||
| # wait time. | ||
| # Max amount to wait before timing out during exponential backoff | ||
| MAX_SLEEP = 300 | ||
| MAX_SLEEP = 30 | ||
|
|
||
| # NOTE: Based on testing, 99% of detections will generate risk/notables within 30s, and the remaining 1% of | ||
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| # detections may need up to 150s to finish; so this is a reasonable total maximum wait time | ||
| # Total wait time before giving up on waiting for risk/notables to be generated | ||
| TOTAL_MAX_WAIT = 180 | ||
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # NOTE: Based on testing, there is 1% detections couldn't generate risk/notables within single dispatch, and | ||
| # they needed to be retried; 90s is a reasonable wait time before retrying dispatching the SavedSearch | ||
| # Wait time before retrying dispatching the SavedSearch | ||
| RETRY_DISPATCH = 90 | ||
|
|
||
| # NOTE: Based on testing, 99% of detections will generate risk/notables within 30s, and the validation of risks | ||
| # and notables would take around 5 to 10 seconds; so before adding additional wait time, we let the validation | ||
| # process work as the default wait time until we reach the ADD_WAIT_TIME and add additional wait time | ||
| # Time elased before adding additional wait time | ||
xqi-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ADD_WAIT_TIME = 30 | ||
|
|
||
|
|
||
| # TODO (#226): evaluate sane defaults for timeframe for integration testing (e.g. 5y is good | ||
|
|
@@ -441,7 +457,7 @@ def dispatch(self) -> splunklib.Job: | |
| """Dispatches the SavedSearch | ||
|
|
||
| Dispatches the SavedSearch entity, returning a Job object representing the search job. | ||
| :return: a splunklib.Job object representing the search job | ||
| :return: a splunklib.Job object representing the search job when the SavedSearch is finished running | ||
| """ | ||
| self.logger.debug(f"Dispatching {self.name}...") | ||
| try: | ||
|
|
@@ -911,7 +927,15 @@ def notable_in_risk_dm(self) -> bool: | |
| return True | ||
| return False | ||
|
|
||
| def validate_risk_notable_events(self) -> IntegrationTestResult | None: | ||
| def validate_risk_notable_events(self) -> tuple[bool, str | None]: | ||
xqi-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| """Validates the existence of risk and notable events | ||
|
|
||
| Returns a bool indicating whether validating risks and notables is successful or not, | ||
| and a message indicating the reason for failure (if any). | ||
|
|
||
| :return: True if validation passes, False if it fails; None if no message, or a string | ||
| message indicating the reason for failure | ||
| """ | ||
| try: | ||
| # Validate risk events | ||
| if self.has_risk_analysis_action: | ||
|
|
@@ -951,12 +975,10 @@ def validate_risk_notable_events(self) -> IntegrationTestResult | None: | |
| else: | ||
| self.logger.debug(f"No notable action defined for '{self.name}'") | ||
|
|
||
| return None | ||
| return True, None | ||
| except ValidationFailed as e: | ||
| self.logger.error(f"Risk/notable validation failed: {e}") | ||
| return IntegrationTestResult( | ||
| status=TestResultStatus.FAIL, message=f"TEST FAILED: {e}" | ||
| ) | ||
| return False, str(e) | ||
|
|
||
| # NOTE: it would be more ideal to switch this to a system which gets the handle of the saved search job and polls | ||
| # it for completion, but that seems more tricky | ||
|
|
@@ -1019,33 +1041,45 @@ def test( | |
| self.update_pbar(TestingStates.FORCE_RUN) | ||
| self.force_run() | ||
|
|
||
| max_total_wait = 150 | ||
| wait_time = 2 | ||
| max_wait = 30 | ||
| max_total_wait = TimeoutConfig.TOTAL_MAX_WAIT | ||
|
||
| wait_time = TimeoutConfig.BASE_SLEEP | ||
| max_wait = TimeoutConfig.MAX_SLEEP | ||
| time_elapsed = 0 | ||
|
|
||
| while time_elapsed <= max_total_wait: | ||
| if time_elapsed > 90: | ||
| # wait at least 90 seconds to rerun the SavedSearch | ||
| if time_elapsed > TimeoutConfig.RETRY_DISPATCH: | ||
| self.dispatch() | ||
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| start_time = time.time() | ||
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| time.sleep(wait_time) | ||
|
|
||
| # wait at least 30 seconds before adding to the wait time | ||
| if time_elapsed > TimeoutConfig.ADD_WAIT_TIME: | ||
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| time.sleep(wait_time) | ||
| elapsed_sleep_time += wait_time | ||
| wait_time = min(max_wait, wait_time * 2) | ||
|
|
||
| # reset the result to None on each loop iteration | ||
| result = self.validate_risk_notable_events() | ||
| result = None | ||
|
|
||
| validate_pass, error = self.validate_risk_notable_events() | ||
cmcginley-splunk marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
|
||
| # if result is still None, then all checks passed and we can break the loop | ||
| if result is None: | ||
| # if result is True, then all checks passed and we can break the loop | ||
| if validate_pass: | ||
| result = IntegrationTestResult( | ||
| status=TestResultStatus.PASS, | ||
| message=f"TEST PASSED: Expected risk and/or notable events were created for: {self.name}", | ||
| wait_duration=elapsed_sleep_time, | ||
| ) | ||
| break | ||
| else: | ||
| result = IntegrationTestResult( | ||
| status=TestResultStatus.FAIL, | ||
| message=f"TEST FAILED: {error}", | ||
| ) | ||
|
|
||
| end_time = time.time() | ||
| time_elapsed += end_time - start_time | ||
| wait_time = min(max_wait, wait_time * 2) | ||
|
|
||
| # TODO (PEX-436): should cleanup be in a finally block so it runs even on exception? | ||
| # cleanup the created events, disable the detection and return the result | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.