Skip to content

Commit b782202

Browse files
authored
CP-52524: Generate an alert when various host kernel taints are set (xapi-project#6128)
Issue an alert about a broken host kernel if any of bits 4, 5, 7, 9, or 14 is set in `/proc/sys/kernel/tainted`, indicating that some kind of error was encountered and that the future behaviour of the kernel might no longer be predictable or safe (though it should generally recover reasonably well). Only one alert per tainted bit per boot should be issued. Distinguish between two levels of error, as suggested by the Foundations team: Major (bits 4, 5, 7 - these are all things that might cause a host crash, but are unlikely to corrupt whatever data has already been written out) and Warning (bits 9, 14 - these might be a concern and could be raised with Support, but are usually not severe enough to crash the host). The alert should serve as an indicator during issue investigation to look for the cause of the taint.
2 parents e2f96bf + aaabb6c commit b782202

File tree

7 files changed

+112
-19
lines changed

7 files changed

+112
-19
lines changed

ocaml/xapi-consts/api_messages.ml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,12 @@ let host_internal_certificate_expiring_07 =
360360

361361
let failed_login_attempts = addMessage "FAILED_LOGIN_ATTEMPTS" 3L
362362

363+
(* Registers the message "HOST_KERNEL_ENCOUNTERED_ERROR_<which>" with
   priority 2L, where [which] is the suffix naming the kind of kernel error. *)
let kernel_is_broken which =
  let message_name = "HOST_KERNEL_ENCOUNTERED_ERROR_" ^ which in
  addMessage message_name 2L
365+
366+
(* Registers the message "HOST_KERNEL_ENCOUNTERED_WARNING_<which>" with
   priority 3L, where [which] is the suffix naming the kind of kernel
   warning. *)
let kernel_is_broken_warning which =
  let message_name = "HOST_KERNEL_ENCOUNTERED_WARNING_" ^ which in
  addMessage message_name 3L
368+
363369
let tls_verification_emergency_disabled =
364370
addMessage "TLS_VERIFICATION_EMERGENCY_DISABLED" 3L
365371

ocaml/xapi/dbsync_slave.ml

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -63,21 +63,24 @@ let create_localhost ~__context info =
6363
in
6464
()
6565

66-
(* TODO cat /proc/stat for btime ? *)
6766
(* Returns the boot time of the host, taken from the "btime" line of
   /proc/stat (seconds since the Unix epoch). If the file cannot be read or
   parsed, the failure is logged and [Date.epoch] is returned instead. *)
let get_start_time () =
  try
    let stat_lines =
      String.split '\n' (String.trim (Unixext.string_of_file "/proc/stat"))
    in
    (* List.find raises Not_found when there is no btime line; that is handled
       by the catch-all below like any other failure. *)
    let btime_line =
      List.find (fun line -> String.starts_with ~prefix:"btime" line) stat_lines
    in
    match String.split ' ' btime_line with
    | _ :: seconds :: _ ->
        let booted_at = Date.of_unix_time (float_of_string seconds) in
        debug "%s: system booted at %s" __FUNCTION__ (Date.to_rfc3339 booted_at) ;
        booted_at
    | _ ->
        failwith "Couldn't parse /proc/stat"
  with e ->
    debug "%s: Calculating boot time failed with '%s'" __FUNCTION__
      (ExnHelper.string_of_exn e) ;
    Date.epoch
8285

8386
(* not sufficient just to fill in this data on create time [Xen caps may change if VT enabled in BIOS etc.] *)

ocaml/xapi/xapi_host.ml

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2923,6 +2923,81 @@ let emergency_reenable_tls_verification ~__context =
29232923
Helpers.touch_file Constants.verify_certificates_path ;
29242924
Db.Host.set_tls_verification_enabled ~__context ~self ~value:true
29252925

2926+
(** Issue an alert if /proc/sys/kernel/tainted indicates particular kernel
    errors. Will send only one alert per taint bit per boot.

    The value is a closure over internal state: the list of alerts that may
    still be issued is computed lazily on the first call (by discarding the
    alerts for which a message was already recorded after the boot time found
    under the "boot_time" key of the host's other_config), and each later call
    removes from that list the alerts it issues.

    @raise Not_found if the host's other_config has no "boot_time" key
    @raise Failure if the boot time or the taint value cannot be parsed *)
let alert_if_kernel_broken =
  let __context = Context.make "host_kernel_error_alert_startup_check" in
  (* Only add an alert if
     (a) an alert wasn't already issued for the currently booted kernel *)
  let possible_alerts =
    ref
      ( lazy
        ((* Check all the alerts since last reboot. Only done once at toolstack
            startup, we track if alerts have been issued afterwards internally *)
         let self = Helpers.get_localhost ~__context in
         let boot_time =
           Db.Host.get_other_config ~__context ~self
           |> List.assoc "boot_time"
           |> float_of_string
         in
         let all_alerts =
           [
             (* processor reported a Machine Check Exception (MCE) *)
             (4, Api_messages.kernel_is_broken "MCE")
           ; (* bad page referenced or some unexpected page flags *)
             (5, Api_messages.kernel_is_broken "BAD_PAGE")
           ; (* kernel died recently, i.e. there was an OOPS or BUG *)
             (7, Api_messages.kernel_is_broken "BUG")
           ; (* kernel issued warning *)
             (9, Api_messages.kernel_is_broken_warning "WARN")
           ; (* soft lockup occurred *)
             (14, Api_messages.kernel_is_broken_warning "SOFT_LOCKUP")
           ]
         in
         (* The set of existing messages does not depend on the alert being
            examined, so fetch it once instead of once per alert. *)
         let existing_messages =
           Helpers.call_api_functions ~__context (fun rpc session_id ->
               Client.Client.Message.get_all_records ~rpc ~session_id
           )
         in
         all_alerts
         |> List.filter (fun (_, alert_message) ->
                let alert_already_issued_for_this_boot =
                  existing_messages
                  |> List.exists (fun (_, record) ->
                         record.API.message_name = fst alert_message
                         && API.Date.is_later
                              ~than:(API.Date.of_unix_time boot_time)
                              record.API.message_timestamp
                     )
                in
                (* Keep only the alerts that have NOT been issued yet: those
                   are the ones that may still need to be raised. *)
                not alert_already_issued_for_this_boot
            )
        )
      )
  in
  (* and (b) if we found a problem *)
  fun ~__context ->
    let self = Helpers.get_localhost ~__context in
    let remaining_alerts = Lazy.force !possible_alerts in
    (* Read the taint mask once per invocation. The contents are trimmed
       before parsing because int_of_string rejects surrounding whitespace
       such as a trailing newline. *)
    let tainted =
      Unixext.string_of_file "/proc/sys/kernel/tainted"
      |> String.trim
      |> int_of_string
    in
    possible_alerts :=
      Lazy.from_val
        (remaining_alerts
        |> List.filter (fun (alert_bit, alert_message) ->
               let is_bit_tainted = (tainted lsr alert_bit) land 1 = 1 in
               if is_bit_tainted then (
                 let host = Db.Host.get_name_label ~__context ~self in
                 let body =
                   Printf.sprintf "<body><host>%s</host></body>" host
                 in
                 Xapi_alert.add ~msg:alert_message ~cls:`Host
                   ~obj_uuid:(Db.Host.get_uuid ~__context ~self)
                   ~body ;
                 false (* alert issued, remove from the list *)
               ) else
                 true (* keep in the list, alert can be issued later *)
           )
        )
3000+
29263001
let alert_if_tls_verification_was_emergency_disabled ~__context =
29273002
let tls_verification_enabled_locally =
29283003
Stunnel_client.get_verify_by_default ()

ocaml/xapi/xapi_host.mli

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -540,6 +540,8 @@ val set_numa_affinity_policy :
540540

541541
val emergency_disable_tls_verification : __context:Context.t -> unit
542542

543+
(** Checks /proc/sys/kernel/tainted and raises a host alert for each monitored
    kernel taint bit (4, 5, 7, 9, 14) that is set. Intended to issue at most
    one alert per taint bit per boot. *)
val alert_if_kernel_broken : __context:Context.t -> unit
544+
543545
val alert_if_tls_verification_was_emergency_disabled :
544546
__context:Context.t -> unit
545547

ocaml/xapi/xapi_periodic_scheduler_init.ml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,13 @@ let register ~__context =
106106
(Xapi_periodic_scheduler.Periodic freq) freq
107107
Xapi_pool.alert_failed_login_attempts
108108
) ;
109+
Xapi_periodic_scheduler.add_to_queue "broken_kernel"
110+
(Xapi_periodic_scheduler.Periodic 600.) 600. (fun () ->
111+
Server_helpers.exec_with_new_task
112+
"Periodic alert if the running kernel is broken in some serious way."
113+
(fun __context -> Xapi_host.alert_if_kernel_broken ~__context
114+
)
115+
) ;
109116
Xapi_periodic_scheduler.add_to_queue
110117
"Period alert if TLS verification emergency disabled"
111118
(Xapi_periodic_scheduler.Periodic 600.) 600. (fun () ->

ocaml/xenopsd/xc/domain.ml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -835,12 +835,12 @@ let create_channels ~xc uuid domid =
835835
let numa_hierarchy =
836836
let open Xenctrlext in
837837
let open Topology in
838-
Lazy.from_fun (fun () ->
839-
let xcext = get_handle () in
840-
let distances = (numainfo xcext).distances in
841-
let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in
842-
NUMA.make ~distances ~cpu_to_node
843-
)
838+
lazy
839+
(let xcext = get_handle () in
840+
let distances = (numainfo xcext).distances in
841+
let cpu_to_node = cputopoinfo xcext |> Array.map (fun t -> t.node) in
842+
NUMA.make ~distances ~cpu_to_node
843+
)
844844

845845
let numa_mutex = Mutex.create ()
846846

quality-gate.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
set -e
44

55
list-hd () {
6-
N=294
6+
N=293
77
LIST_HD=$(git grep -r --count 'List.hd' -- **/*.ml | cut -d ':' -f 2 | paste -sd+ - | bc)
88
if [ "$LIST_HD" -eq "$N" ]; then
99
echo "OK counted $LIST_HD List.hd usages"

0 commit comments

Comments
 (0)