#
# Example config file for mcelog
# mcelog is the user space backend that decodes and processes machine check events
# (cpu hardware errors) reported by the CPU to the kernel
#

# general format
#optionname = value
# white space is not allowed in value currently, except at the end where it is dropped
#

# In general all command line options that are not commands work here.
# See man mcelog or mcelog --help for a list.
# e.g. to enable the --no-syslog option use
#no-syslog = yes (or no to disable)
# when the option has an argument
#logfile = /tmp/logfile
# below are the options which are not command line options.

# Set CPU type for which mcelog decodes events:
#cpu = type
# For valid values for type please see mcelog --help.
# If this value is set incorrectly the decoded output will be likely incorrect.
# By default when this parameter is not set mcelog uses the CPU it is running on
# on very new kernels the mcelog events reported by the kernel also carry
# the CPU type which is used too when available and not overridden.

# Enable daemon mode:
#daemon = yes
# By default mcelog just processes the currently pending events and exits.
# In daemon mode it will keep running as a daemon in the background and poll
# the kernel for events and then decode them.

# Filter out known broken events by default.
filter = yes
# Don't log memory errors individually.
# They still get accounted if that is enabled.
#filter-memory-errors = yes

# output in undecoded raw format to be easier machine readable
# (default is decoded).
#raw = yes

# Set CPU MHz to decode uptime from time stamp counter (output
# unreliable, not needed on new kernels which report the event time
# directly). A lot of systems don't have a linear time stamp clock
# and the output is wrong then.
# Normally mcelog tries to figure out if the TSC is reliable
# and only uses the current frequency then.
# Setting a frequency forces timestamp decoding.
# This setting is obsolete with modern kernels which report the time
# directly.
#cpumhz = 1800.00

# log output options
# Log decoded machine checks in syslog (default stdout or syslog for daemon)
#syslog = yes
# Log decoded machine checks in syslog with error level
#syslog-error = yes
# Never log anything to syslog
#no-syslog = yes
# Append log output to logfile instead of stdout. Only when no syslog logging is active
#logfile = filename

# Use SMBIOS information to decode DIMMs (needs root).
# This function is not recommended to use right now and generally not needed.
# The exception is memdb prepopulation, which is configured separately below.
#dmi = no

# When in daemon mode run as this user after set up.
# Note that the triggers will run as this user too.
# Setting this to non root will mean that triggers cannot take some corrective
# action, like offlining objects.
#run-credentials-user = root

# group to run as daemon with
# default to the group of the run-credentials-user
#run-credentials-group = nobody

[server]
# user allowed to access client socket.
# when set to * match any
# root is always allowed to access.
# default: root only
client-user = root
# group allowed to access mcelog
# When no group is configured any group matches (but still user checking).
# when set to * match any
#client-group = root
# Path to the unix socket for client<->server communication.
# When no socket-path is configured the server will not start
#socket-path = /var/run/mcelog-client
# When mcelog starts it checks if a server is already running. This configures the timeout
# for this check.
#initial-ping-timeout = 2
#

[dimm]
# Is the in memory DIMM error tracking enabled?
# Only works on systems with integrated memory controller and
# which are supported.
# Only takes effect in daemon mode.
dimm-tracking-enabled = yes
# Use DMI information from the BIOS to prepopulate DIMM database.
# Note this might not work with all BIOS and requires mcelog to run as root.
# Alternative is to let mcelog create DIMM objects on demand.
dmi-prepopulate = yes
#
# Execute these triggers when the rate of corrected or uncorrected
# Errors per DIMM exceeds the threshold.
# Note when the hardware does not report DIMMs this might also
# be per channel.
# The default of 10/24h was reasonable for server quality
# DDR3 DIMMs as of 2009/10. Newer systems can benefit from
# more aggressive page offline when corrected errors are seen
# See:
# https://www.intel.com/content/dam/www/public/us/en/documents/intel-and-samsung-mrt-improving-memory-reliability-at-data-centers.pdf
# for details.
#uc-error-trigger = dimm-error-trigger
uc-error-threshold = 1 / 24h
#ce-error-trigger = dimm-error-trigger
ce-error-threshold = 2 / 24h

[socket]
# Enable memory error accounting per socket.
socket-tracking-enabled = yes

# Threshold and trigger for uncorrected memory errors on a socket.
# mem-uc-error-trigger = socket-memory-error-trigger
mem-uc-error-threshold = 100 / 24h

# Trigger script for corrected memory errors on a socket.
mem-ce-error-trigger = socket-memory-error-trigger

# Threshold on when to trigger a correct error for the socket.
mem-ce-error-threshold = 100 / 24h

# Log socket error threshold explicitly?
mem-ce-error-log = yes

# Trigger script for uncorrected bus error events
bus-uc-threshold-trigger = bus-error-trigger

# Trigger script for uncorrected IOMCA errors
iomca-threshold-trigger = iomca-error-trigger

# Trigger script for other uncategorized errors
unknown-threshold-trigger = unknown-error-trigger

[cache]
# Processing of cache error thresholds reported by Intel CPUs.
cache-threshold-trigger = cache-error-trigger

# Should cache threshold events be logged explicitly?
cache-threshold-log = yes

[page]
# Memory error accounting per 4K memory page.
# Threshold for the correct memory errors trigger script.
memory-ce-threshold = 10 / 24h

# Trigger script for corrected errors.
# memory-ce-trigger = page-error-trigger

# Memory error counter per 4K memory page.
# Threshold for the counter replacements trigger script.
memory-ce-counter-replacement-threshold = 20 / 24h

# Trigger script for counter replacements.
memory-ce-counter-replacement-trigger = page-error-counter-replacement-trigger

# Should page threshold events be logged explicitly?
memory-ce-log = yes

# specify the internal action in mcelog to exceeding a page error threshold
# this is done in addition to executing the trigger script if available
# off              no action
# account          only account errors
# soft             try to soft-offline page without killing any processes
#                  This requires an uptodate kernel. Might not be successful.
# hard             try to hard-offline page by killing processes
#                  Requires an uptodate kernel. Might not be successful.
# soft-then-hard   First try to soft offline, then try hard offlining
#memory-ce-action = off|account|soft|hard|soft-then-hard
memory-ce-action = soft

# Trigger script before doing soft memory offline
# this trigger will scan and run all the scripts in the page-error-pre-soft-trigger.extern
memory-pre-sync-soft-ce-trigger = page-error-pre-sync-soft-trigger

# Trigger script after completing soft memory offline
# this trigger will scan and run all the scripts in the page-error-post-soft-trigger.extern
memory-post-sync-soft-ce-trigger = page-error-post-sync-soft-trigger

[trigger]
# Maximum number of running triggers
children-max = 2
# execute triggers in this directory
directory = /etc/mcelog