aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Tony Luck <tony.luck@intel.com> 2022-05-23 08:55:23 -0700
committer: Andi Kleen <ak@linux.intel.com> 2022-05-23 09:11:50 -0700
commit: c75cd42e245d6eba31ce8f8aeee38ef4084a9f50 (patch)
tree: 4d273f1871c18baad708d49b95753c58cd7ce303
parent: d975fc16fd37b3e136114c01ced263de1456acf1 (diff)
download: mcelog-c75cd42e245d6eba31ce8f8aeee38ef4084a9f50.tar.gz
mcelog: Reduce default threshold for corrected error page offline (tag: v182)
The "ten per 24 hours" threshold has been in place for over a decade. A study of large numbers of errors on modern systems suggests reducing that threshold to "two per 24 hours".

https://www.intel.com/content/dam/www/public/us/en/documents/intel-and-samsung-mrt-improving-memory-reliability-at-data-centers.pdf

Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
-rw-r--r-- mcelog.conf 10
1 files changed, 7 insertions, 3 deletions
diff --git a/mcelog.conf b/mcelog.conf
index 41b465a..86ddca0 100644
--- a/mcelog.conf
+++ b/mcelog.conf
@@ -109,12 +109,16 @@ dmi-prepopulate = yes
# Errors per DIMM exceeds the threshold.
# Note when the hardware does not report DIMMs this might also
# be per channel.
-# The default of 10/24h is reasonable for server quality
-# DDR3 DIMMs as of 2009/10.
+# The default of 10/24h was reasonable for server quality
+# DDR3 DIMMs as of 2009/10. Newer systems can benefit from
+# more aggressive page offline when corrected errors are seen
+# See:
+# https://www.intel.com/content/dam/www/public/us/en/documents/intel-and-samsung-mrt-improving-memory-reliability-at-data-centers.pdf
+# for details.
#uc-error-trigger = dimm-error-trigger
uc-error-threshold = 1 / 24h
#ce-error-trigger = dimm-error-trigger
-ce-error-threshold = 10 / 24h
+ce-error-threshold = 2 / 24h
[socket]
# Enable memory error accounting per socket.