diff --git a/doc/guix.texi b/doc/guix.texi index a6c1556977..14e502da84 100644 --- a/doc/guix.texi +++ b/doc/guix.texi @@ -88,6 +88,7 @@ Copyright @copyright{} 2020 John Soo@* Copyright @copyright{} 2020 Jonathan Brielmaier@* Copyright @copyright{} 2020 Edgar Vincent@* Copyright @copyright{} 2021 Maxime Devos@* +Copyright @copyright{} 2021 B. Wilson@* Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.3 or @@ -31442,6 +31443,86 @@ parameters, can be done as follow: @end lisp @end deffn +@cindex rasdaemon +@cindex Platform Reliability, Availability and Serviceability daemon +@subsubheading Rasdaemon Service + +The Rasdaemon service provides a daemon which monitors the platform Reliablity, +Availability and Serviceability (RAS) reports from the Linux kernel trace +events, logging them to syslogd. + +Reliability, Availability and Serviceability is a concept used on servers meant +to measure their robustness. + +@strong{Relability} is the probability that a system will produce correct +outputs: + +@itemize @bullet +@item Generally measured as Mean Time Between Failures (MTBF), and +@item Enhanced by features that help to avoid, detect and repair hardware +faults +@end itemize + +@strong{Availability} is the probability that a system is operational at a +given time: + +@itemize @bullet +@item Generally measured as a percentage of downtime per a period of time, and +@item Often uses mechanisms to detect and correct hardware faults in runtime. +@end itemize + +@strong{Serviceability} is the simplicity and speed with which a system can be +repaired or maintained: + +@itemize @bullet +@item Generally measured on Mean Time Between Repair (MTBR). +@end itemize + + +Among the monitoring measures, the most usual ones include: + +@itemize @bullet +@item CPU – detect errors at instruction execution and at L1/L2/L3 caches; +@item Memory – add error correction logic (ECC) to detect and correct errors; +@item I/O – add CRC checksums for transferred data; +@item Storage – RAID, journal file systems, checksums, Self-Monitoring, +Analysis and Reporting Technology (SMART). +@end itemize + +By monitoring the number of occurrences of error detections, it is possible to +identify if the probability of hardware errors is increasing, and, on such +case, do a preventive maintenance to replace a degraded component while those +errors are correctable. + +For detailed information about the types of error events gathered and how to +make sense of them, see the kernel administrator's guide at +@url{https://www.kernel.org/doc/html/latest/admin-guide/ras.html}. + +@defvr {Scheme Variable} rasdaemon-service-type +Service type for the @command{rasdaemon} service. It accepts a +@code{rasdaemon-configuration} object. Instantiating like + +@lisp +(service rasdaemon-service-type) +@end lisp + +will load with a default configuration, which monitors all events and logs to +syslogd. +@end defvr + +@deftp {Data Type} rasdaemon-configuration +The data type representing the configuration of @command{rasdaemon}. + +@table @asis +@item @code{record?} (default: @code{#f}) + +A boolean indicating whether to record the events in an SQLite database. This +provides a more structured access to the information contained in the log file. +The database location is hard-coded to @file{/var/lib/rasdaemon/ras-mc_event.db}. + +@end table +@end deftp + @cindex zram @cindex compressed swap @cindex Compressed RAM-based block devices diff --git a/gnu/services/linux.scm b/gnu/services/linux.scm index 340b330030..2eb02ac5a3 100644 --- a/gnu/services/linux.scm +++ b/gnu/services/linux.scm @@ -3,6 +3,7 @@ ;;; Copyright © 2020 Brice Waegeneire ;;; Copyright © 2020 Efraim Flashner ;;; Copyright © 2021 raid5atemyhomework +;;; Copyright © 2021 B. Wilson ;;; ;;; This file is part of GNU Guix. ;;; @@ -47,6 +48,11 @@ kernel-module-loader-service-type + rasdaemon-configuration + rasdaemon-configuration? + rasdaemon-configuration-record? + rasdaemon-service-type + zram-device-configuration zram-device-configuration? zram-device-configuration-size @@ -188,6 +194,49 @@ representation." (extend append) (default-value '()))) + +;;; +;;; Reliability, Availability, and Serviceability (RAS) daemon +;;; + +(define-record-type* + rasdaemon-configuration make-rasdaemon-configuration + rasdaemon-configuration? + (record? rasdaemon-configuration-record? (default #f))) + +(define (rasdaemon-configuration->command-line-args config) + "Translate to its command line arguments + representation" + (let ((record? (rasdaemon-configuration-record? config))) + `(,(file-append rasdaemon "/sbin/rasdaemon") + "--foreground" ,@(if record? '("--record") '())))) + +(define (rasdaemon-activation config) + (let ((record? (rasdaemon-configuration-record? config)) + (rasdaemon-dir "/var/lib/rasdaemon")) + (with-imported-modules '((guix build utils)) + #~(if #$record? (mkdir-p #$rasdaemon-dir))))) + +(define (rasdaemon-shepherd-service config) + (shepherd-service + (documentation "Run rasdaemon") + (provision '(rasdaemon)) + (requirement '(syslogd)) + (start #~(make-forkexec-constructor + '#$(rasdaemon-configuration->command-line-args config))) + (stop #~(make-kill-destructor)))) + +(define rasdaemon-service-type + (service-type + (name 'rasdaemon) + (default-value (rasdaemon-configuration)) + (extensions + (list (service-extension shepherd-root-service-type + (compose list rasdaemon-shepherd-service)) + (service-extension activation-service-type rasdaemon-activation))) + (compose concatenate) + (description "Run @command{rasdaemon}, the RAS monitor"))) + ;;; ;;; Kernel module loader.