-
Notifications
You must be signed in to change notification settings - Fork 4.8k
OCPBUGS-48186: Add kubelet and CRI-O panic detection invariant test #30243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
Signed-off-by: Pannaga Rao Bhoja Ramamanohara
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -74,12 +74,21 @@ func intervalsFromNodeLogs(ctx context.Context, kubeClient kubernetes.Interface, | |
| } | ||
| newSystemdCoreDumpIntervals := intervalsFromSystemdCoreDumpLogs(nodeName, systemdCoreDumpLogs) | ||
|
|
||
| crioLogs, err := getNodeLog(ctx, kubeClient, nodeName, "crio") | ||
| if err != nil { | ||
| fmt.Fprintf(os.Stderr, "Error getting node crio logs from %s: %s", nodeName, err.Error()) | ||
| errCh <- err | ||
| return | ||
| } | ||
| newCrioLogs := eventsFromCrioLogs(nodeName, crioLogs) | ||
|
||
|
|
||
| lock.Lock() | ||
| defer lock.Unlock() | ||
| ret = append(ret, newEvents...) | ||
| ret = append(ret, newOVSEvents...) | ||
| ret = append(ret, newNetworkManagerIntervals...) | ||
| ret = append(ret, newSystemdCoreDumpIntervals...) | ||
| ret = append(ret, newCrioLogs...) | ||
| }(ctx, node.Name) | ||
| } | ||
| wg.Wait() | ||
|
|
@@ -118,6 +127,7 @@ func eventsFromKubeletLogs(nodeName string, kubeletLog []byte) monitorapi.Interv | |
| ret = append(ret, leaseUpdateError(nodeLocator, currLine)...) | ||
| ret = append(ret, leaseFailBackOff(nodeLocator, currLine)...) | ||
| ret = append(ret, parse(nodeName, currLine)...) | ||
| ret = append(ret, kubeletPanicDetected(nodeName, currLine)...) | ||
| } | ||
|
|
||
| return ret | ||
|
|
@@ -712,3 +722,66 @@ func getNodeLog(ctx context.Context, client kubernetes.Interface, nodeName, syst | |
|
|
||
| return ioutil.ReadAll(in) | ||
| } | ||
|
|
||
// panicHeadlineRegex matches any log line that contains "panic:" or "fatal error:".
// The pattern is unanchored, so the text may appear anywhere in the line.
| var panicHeadlineRegex = regexp.MustCompile(`(panic:|fatal error:)`) | ||
|
|
||
// kubeletPanicDetected inspects a single kubelet log line for a panic headline.
//
// It returns nil unless logLine matches panicHeadlineRegex. On a match it returns
// exactly one interval built with monitorapi.SourceKubeletLog and monitorapi.Error,
// located on nodeName and carrying the reason monitorapi.KubeletPanic.
| func kubeletPanicDetected(nodeName, logLine string) monitorapi.Intervals { | ||
| if !panicHeadlineRegex.MatchString(logLine) { | ||
| return nil | ||
| } | ||
|
|
||
// The current wall-clock year is passed alongside the line; presumably the journal
// timestamp in logLine carries no year — TODO confirm against utility.SystemdJournalLogTime.
// NOTE(review): if so, a line logged before New Year but processed after it may get the wrong year.
| failureTime := utility.SystemdJournalLogTime(logLine, time.Now().Year()) | ||
| nodeLocator := monitorapi.NewLocator().NodeFromName(nodeName) | ||
|
|
||
// The interval starts at failureTime and ends one second later.
| return monitorapi.Intervals{ | ||
| monitorapi.NewInterval(monitorapi.SourceKubeletLog, monitorapi.Error). | ||
| Locator(nodeLocator). | ||
| Message(monitorapi.NewMessage().Reason(monitorapi.KubeletPanic). | ||
| HumanMessage("kubelet panic detected, check logs for details")). | ||
| Display(). | ||
| Build(failureTime, failureTime.Add(1*time.Second)), | ||
| } | ||
| } | ||
|
|
||
| // eventsFromCrioLogs scans crioLog line by line and returns the intervals produced for nodeName; the result is empty but non-nil when nothing is detected. | ||
| // Right now it only detects panics (via crioPanicDetected), but more detectors can be added as needed. | ||
| func eventsFromCrioLogs(nodeName string, crioLog []byte) monitorapi.Intervals { | ||
| ret := monitorapi.Intervals{} | ||
|
|
||
| scanner := bufio.NewScanner(bytes.NewBuffer(crioLog)) | ||
| for scanner.Scan() { | ||
| currLine := scanner.Text() | ||
| ret = append(ret, crioPanicDetected(nodeName, currLine)...) | ||
| } | ||
// NOTE(review): scanner.Err() is not checked here, so a scan that stops early
// (for example on a line longer than the scanner's buffer) ends silently and
// the remaining log lines are never inspected.
|
|
||
| return ret | ||
| } | ||
|
|
||
// crioPanicDetected inspects a single CRI-O log line for a panic headline.
//
// It returns nil unless logLine matches panicHeadlineRegex. On a match it returns
// exactly one interval built with monitorapi.SourceCrioLog and monitorapi.Error,
// located on nodeName and carrying the reason monitorapi.CrioPanic.
| func crioPanicDetected(nodeName, logLine string) monitorapi.Intervals { | ||
| if !panicHeadlineRegex.MatchString(logLine) { | ||
| return nil | ||
| } | ||
|
|
||
// The current wall-clock year is passed alongside the line; presumably the journal
// timestamp in logLine carries no year — TODO confirm against utility.SystemdJournalLogTime.
// NOTE(review): if so, a line logged before New Year but processed after it may get the wrong year.
| failureTime := utility.SystemdJournalLogTime(logLine, time.Now().Year()) | ||
| nodeLocator := monitorapi.NewLocator().NodeFromName(nodeName) | ||
|
|
||
// The interval starts at failureTime and ends one second later.
| return monitorapi.Intervals{ | ||
| monitorapi.NewInterval(monitorapi.SourceCrioLog, monitorapi.Error). | ||
| Locator(nodeLocator). | ||
| Message(monitorapi.NewMessage().Reason(monitorapi.CrioPanic). | ||
| HumanMessage("CRI-O panic detected, check logs for details")). | ||
| Display(). | ||
| Build(failureTime, failureTime.Add(1*time.Second)), | ||
| } | ||
| } | ||
PannagaRao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // findKubeletAndCrioPanics returns, in input order, every interval whose Message.Reason is KubeletPanic or CrioPanic; the result is nil when none match. | ||
| func findKubeletAndCrioPanics(intervals monitorapi.Intervals) monitorapi.Intervals { | ||
| var panics monitorapi.Intervals | ||
| for _, interval := range intervals { | ||
| if interval.Message.Reason == monitorapi.KubeletPanic || interval.Message.Reason == monitorapi.CrioPanic { | ||
| panics = append(panics, interval) | ||
| } | ||
| } | ||
| return panics | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.