aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAndi Kleen <github@halobates.de>2018-05-14 15:12:30 -0700
committerGitHub <noreply@github.com>2018-05-14 15:12:30 -0700
commitc79b2e61436f6158c73d049f5fc01eb264124589 (patch)
treebcce77dac498da95f56d29fa7654bf193b22fb6a
parent0a310abe7b0788ef100b5b2a8eb7c63f04bff1d7 (diff)
parent9f86b857838624e655abf5744d58c71102101451 (diff)
downloadmcelog-c79b2e61436f6158c73d049f5fc01eb264124589.tar.gz
Merge pull request #68 from zoucao-ali/master
add a pre and post trigger when mcelog do memory soft offline
-rw-r--r--Makefile4
-rw-r--r--bus.c4
-rw-r--r--dimm.c2
-rw-r--r--leaky-bucket.h1
-rw-r--r--mcelog.conf8
-rw-r--r--mcelog.conf.523
-rw-r--r--memdb.c4
-rw-r--r--memdb.h2
-rw-r--r--page.c38
-rw-r--r--trigger.c7
-rw-r--r--trigger.h8
-rw-r--r--triggers/page-error-post-sync-soft-trigger39
-rwxr-xr-xtriggers/page-error-pre-sync-soft-trigger39
-rw-r--r--unknown.c2
-rw-r--r--yellow.c2
15 files changed, 165 insertions, 18 deletions
diff --git a/Makefile b/Makefile
index 57373af..903a90c 100644
--- a/Makefile
+++ b/Makefile
@@ -21,7 +21,9 @@ TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \
socket-memory-error-trigger \
bus-error-trigger \
iomca-error-trigger \
- unknown-error-trigger
+ unknown-error-trigger \
+ page-error-pre-sync-soft-trigger \
+ page-error-post-sync-soft-trigger
all: mcelog
diff --git a/bus.c b/bus.c
index 84eca30..df56dc7 100644
--- a/bus.c
+++ b/bus.c
@@ -82,7 +82,7 @@ void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr,
env[ei] = NULL;
assert(ei < MAX_ENV);
- run_trigger(bus_trigger, NULL, env);
+ run_trigger(bus_trigger, NULL, env, false);
for (i = 0; i < ei; i++)
free(env[i]);
free(msg);
@@ -119,7 +119,7 @@ void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn)
env[ei] = NULL;
assert(ei < MAX_ENV);
- run_trigger(iomca_trigger, NULL, env);
+ run_trigger(iomca_trigger, NULL, env, false);
for (i = 0; i < ei; i++)
free(env[i]);
free(msg);
diff --git a/dimm.c b/dimm.c
index cd41301..26d0118 100644
--- a/dimm.c
+++ b/dimm.c
@@ -374,7 +374,7 @@ void new_error(unsigned long long addr, unsigned long max_error, char *trigger)
Lprintf("Large number of corrected errors in memory at %s", loc);
Lprintf("Consider replacing it");
if (trigger && trigger[0])
- run_trigger(trigger, loc, val, max_error);
+ run_trigger(trigger, loc, val, max_error, false);
}
}
free(devs);
diff --git a/leaky-bucket.h b/leaky-bucket.h
index 860ba3c..18bd02d 100644
--- a/leaky-bucket.h
+++ b/leaky-bucket.h
@@ -2,6 +2,7 @@
#define LEAKY_BUCKET_H 1
#include <time.h>
+#include <stdbool.h>
/* Leaky bucket algorithm for triggers */
diff --git a/mcelog.conf b/mcelog.conf
index 54e2b91..b0ccc2d 100644
--- a/mcelog.conf
+++ b/mcelog.conf
@@ -174,6 +174,14 @@ memory-ce-log = yes
#memory-ce-action = off|account|soft|hard|soft-then-hard
memory-ce-action = soft
+# Trigger script before doing soft memory offline
+# This trigger scans and runs all the scripts in the page-error-pre-sync-soft-trigger.extern directory
+memory-pre-sync-soft-ce-trigger = page-error-pre-sync-soft-trigger
+
+# Trigger script after completing soft memory offline
+# This trigger scans and runs all the scripts in the page-error-post-sync-soft-trigger.extern directory
+memory-post-sync-soft-ce-trigger = page-error-post-sync-soft-trigger
+
[trigger]
# Maximum number of running triggers
children-max = 2
diff --git a/mcelog.conf.5 b/mcelog.conf.5
index c87f23e..261e026 100644
--- a/mcelog.conf.5
+++ b/mcelog.conf.5
@@ -1,5 +1,5 @@
-." Auto generated mcelog.conf manpage. Do not edit.
+.\" Auto generated mcelog.conf manpage. Do not edit.
.TH "mcelog.conf" 5 "mcelog"
.SH NAME
@@ -8,12 +8,11 @@ mcelog.conf \- mcelog.conf reference
.B /etc/mcelog.conf
.SH DESCRIPTION
-/etc/mcelog.conf is the main configuration file for
+/etc/mcelog.conf is the main configuration file for
.B mcelog(8).
-This is configuration file separated into sections including
+This is configuration file separated into sections including
a default section.
-
General format
.PP
.B optionname = value
@@ -266,6 +265,18 @@ soft-then-hard First try to soft offline, then try hard offlining
.B memory-ce-action = soft
.PP
.PP
+Trigger script before doing soft memory offline
+This trigger scans and runs all the scripts in the page-error-pre-sync-soft-trigger.extern directory
+.PP
+.B memory-pre-sync-soft-ce-trigger = page-error-pre-sync-soft-trigger
+.PP
+.PP
+Trigger script after completing soft memory offline
+This trigger scans and runs all the scripts in the page-error-post-sync-soft-trigger.extern directory
+.PP
+.B memory-post-sync-soft-ce-trigger = page-error-post-sync-soft-trigger
+.PP
+.PP
.SS "The trigger config section"
Maximum number of running triggers
.PP
@@ -277,7 +288,7 @@ Execute triggers in this directory
.PP
.SH SEE ALSO
-.BR mcelog (8)
-,
+.BR mcelog (8),
+.BR mcelog.triggers (5)
.B http://www.mcelog.org
diff --git a/memdb.c b/memdb.c
index 0aa6dd6..a15bef9 100644
--- a/memdb.c
+++ b/memdb.c
@@ -211,11 +211,11 @@ account_memdb(struct err_triggers *t, struct memdimm *md, struct mce *m)
if (m->status & MCI_STATUS_UC) {
md->uc.count++;
if (__bucket_account(&t->uc_bucket_conf, &md->uc.bucket, 1, m->time))
- memdb_trigger(msg, md, m->time, &md->uc, &t->uc_bucket_conf);
+ memdb_trigger(msg, md, m->time, &md->uc, &t->uc_bucket_conf, false);
} else {
md->ce.count++;
if (__bucket_account(&t->ce_bucket_conf, &md->ce.bucket, 1, m->time))
- memdb_trigger(msg, md, m->time, &md->ce, &t->ce_bucket_conf);
+ memdb_trigger(msg, md, m->time, &md->ce, &t->ce_bucket_conf, false);
}
free(msg);
}
diff --git a/memdb.h b/memdb.h
index 458143a..09ddd44 100644
--- a/memdb.h
+++ b/memdb.h
@@ -20,5 +20,5 @@ void memory_error(struct mce *m, int channel, int dimm, unsigned corr_err_cnt,
struct memdimm;
void memdb_trigger(char *msg, struct memdimm *md, time_t t,
- struct err_type *et, struct bucket_conf *bc);
+ struct err_type *et, struct bucket_conf *bc, bool sync);
struct memdimm *get_memdimm(int socketid, int channel, int dimm, int insert);
diff --git a/page.c b/page.c
index 324088e..a6edf5a 100644
--- a/page.c
+++ b/page.c
@@ -30,6 +30,7 @@
#include <errno.h>
#include <string.h>
#include "memutil.h"
+#include "trigger.h"
#include "mcelog.h"
#include "rbtree.h"
#include "leaky-bucket.h"
@@ -55,6 +56,7 @@ struct mempage {
static struct rb_root mempage_root;
static struct bucket_conf page_trigger_conf;
+static char *page_error_pre_soft_trigger, *page_error_post_soft_trigger;
static const char *page_state[] = {
[PAGE_ONLINE] = "online",
@@ -221,7 +223,26 @@ void account_page_error(struct mce *m, int channel, int dimm)
memdb_trigger(msg, md, t, &mp->ce, &page_trigger_conf);
free(msg);
mp->triggered = 1;
- offline_action(mp, addr);
+
+ if (offline == OFFLINE_SOFT || offline == OFFLINE_SOFT_THEN_HARD) {
+ struct bucket_conf page_soft_trigger_conf;
+
+ memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf));
+ page_soft_trigger_conf.trigger = page_error_pre_soft_trigger;
+ asprintf(&msg, "pre soft trigger run for page %llx", addr);
+ memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, true);
+ free(msg);
+
+ offline_action(mp, addr);
+
+ memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf));
+ page_soft_trigger_conf.trigger = page_error_post_soft_trigger;
+ asprintf(&msg, "post soft trigger run for page %llx", addr);
+ memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, true);
+ free(msg);
+
+ } else
+ offline_action(mp, addr);
}
}
@@ -262,4 +283,19 @@ void page_setup(void)
Lprintf("Kernel does not support page offline interface\n");
offline = OFFLINE_ACCOUNT;
}
+
+ page_error_pre_soft_trigger = config_string("page", "memory-pre-sync-soft-ce-trigger");
+
+ if (page_error_pre_soft_trigger && trigger_check(page_error_pre_soft_trigger) < 0) {
+ SYSERRprintf("Cannot access page soft pre trigger `%s'",
+ page_error_pre_soft_trigger);
+ exit(1);
+ }
+
+ page_error_post_soft_trigger= config_string("page", "memory-post-sync-soft-ce-trigger");
+ if (page_error_post_soft_trigger && trigger_check(page_error_post_soft_trigger) < 0) {
+ SYSERRprintf("Cannot access page soft post trigger `%s'",
+ page_error_post_soft_trigger);
+ exit(1);
+ }
}
diff --git a/trigger.c b/trigger.c
index 4041950..7924607 100644
--- a/trigger.c
+++ b/trigger.c
@@ -20,6 +20,7 @@
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
+#include <stdbool.h>
#include <string.h>
#include <sys/wait.h>
#include "trigger.h"
@@ -40,6 +41,8 @@ static int num_children;
static int children_max = 4;
static char *trigger_dir;
+static void finish_child(pid_t child, int status);
+
pid_t mcelog_fork(const char *name)
{
pid_t child;
@@ -58,9 +61,11 @@ pid_t mcelog_fork(const char *name)
}
// note: trigger must be allocated, e.g. from config
-void run_trigger(char *trigger, char *argv[], char **env)
+void run_trigger(char *trigger, char *argv[], char **env, bool sync)
{
pid_t child;
+ int status;
+
char *fallback_argv[] = {
trigger,
NULL,
diff --git a/trigger.h b/trigger.h
index e897dc4..f377506 100644
--- a/trigger.h
+++ b/trigger.h
@@ -1,5 +1,11 @@
-void run_trigger(char *trigger, char *argv[], char **env);
+#ifndef __TRIGGER_H__
+#define __TRIGGER_H__
+
+#include <stdbool.h>
+void run_trigger(char *trigger, char *argv[], char **env, bool sync);
void trigger_setup(void);
void trigger_wait(void);
int trigger_check(char *);
pid_t mcelog_fork(const char *thread_name);
+
+#endif
diff --git a/triggers/page-error-post-sync-soft-trigger b/triggers/page-error-post-sync-soft-trigger
new file mode 100644
index 0000000..e8e6ec6
--- /dev/null
+++ b/triggers/page-error-post-sync-soft-trigger
@@ -0,0 +1,39 @@
+#!/bin/sh
+# This shell script is run synchronously by mcelog in daemon mode right after
+# its soft offline attempt for a page that exceeded the corrected error threshold.
+# It sources the optional .local hook and every executable in the .extern directory.
+#
+# environment:
+# THRESHOLD human readable threshold status
+# MESSAGE Human readable consolidated error message
+# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on
+# what triggered the event
+# LOCATION Consolidated location as a single string
+# DMI_LOCATION DIMM location from DMI/SMBIOS if available
+# DMI_NAME DIMM identifier from DMI/SMBIOS if available
+# DIMM DIMM number reported by hardware
+# CHANNEL Channel number reported by hardware
+# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
+# CECOUNT Total corrected error count for DIMM
+# UCCOUNT Total uncorrected error count for DIMM
+# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds)
+# THRESHOLD_COUNT Total number of events in current threshold time period of specific type
+#
+# note: will run as mcelog configured user
+# this can be changed in mcelog.conf
+
+logger -s -p daemon.err -t mcelog "$MESSAGE"
+logger -s -p daemon.err -t mcelog "Location: $LOCATION"
+
+[ -x ./page-error-post-sync-soft-trigger.local ] && . ./page-error-post-sync-soft-trigger.local
+
+if [ -d page-error-post-sync-soft-trigger.extern ]
+then
+	ls page-error-post-sync-soft-trigger.extern |
+	while IFS= read -r item	# -r and empty IFS keep backslashes and surrounding blanks in names intact
+	do
+		[ -x "./page-error-post-sync-soft-trigger.extern/$item" ] && . "./page-error-post-sync-soft-trigger.extern/$item"
+	done
+fi
+
+exit 0
diff --git a/triggers/page-error-pre-sync-soft-trigger b/triggers/page-error-pre-sync-soft-trigger
new file mode 100755
index 0000000..27269c6
--- /dev/null
+++ b/triggers/page-error-pre-sync-soft-trigger
@@ -0,0 +1,39 @@
+#!/bin/sh
+# This shell script is run synchronously by mcelog in daemon mode just before
+# it tries to soft offline a page that exceeded the corrected error threshold.
+# It sources the optional .local hook and every executable in the .extern directory.
+#
+# environment:
+# THRESHOLD human readable threshold status
+# MESSAGE Human readable consolidated error message
+# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on
+# what triggered the event
+# LOCATION Consolidated location as a single string
+# DMI_LOCATION DIMM location from DMI/SMBIOS if available
+# DMI_NAME DIMM identifier from DMI/SMBIOS if available
+# DIMM DIMM number reported by hardware
+# CHANNEL Channel number reported by hardware
+# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM
+# CECOUNT Total corrected error count for DIMM
+# UCCOUNT Total uncorrected error count for DIMM
+# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds)
+# THRESHOLD_COUNT Total number of events in current threshold time period of specific type
+#
+# note: will run as mcelog configured user
+# this can be changed in mcelog.conf
+
+logger -s -p daemon.err -t mcelog "$MESSAGE"
+logger -s -p daemon.err -t mcelog "Location: $LOCATION"
+
+[ -x ./page-error-pre-sync-soft-trigger.local ] && . ./page-error-pre-sync-soft-trigger.local
+
+if [ -d page-error-pre-sync-soft-trigger.extern ]
+then
+	ls page-error-pre-sync-soft-trigger.extern |
+	while IFS= read -r item	# -r and empty IFS keep backslashes and surrounding blanks in names intact
+	do
+		[ -x "./page-error-pre-sync-soft-trigger.extern/$item" ] && . "./page-error-pre-sync-soft-trigger.extern/$item"
+	done
+fi
+
+exit 0
diff --git a/unknown.c b/unknown.c
index d78015b..d2c0627 100644
--- a/unknown.c
+++ b/unknown.c
@@ -73,7 +73,7 @@ void run_unknown_trigger(int socket, int cpu, struct mce *log)
env[ei] = NULL;
assert(ei < MAX_ENV);
- run_trigger(unknown_trigger, NULL, env);
+ run_trigger(unknown_trigger, NULL, env, false);
for (i = 0; i < ei; i++)
free(env[i]);
free(msg);
diff --git a/yellow.c b/yellow.c
index dc45c1a..a077943 100644
--- a/yellow.c
+++ b/yellow.c
@@ -95,7 +95,7 @@ void run_yellow_trigger(int cpu, int tnum, int lnum, char *ts, char *ls, int soc
env[ei] = NULL;
assert(ei < MAX_ENV);
- run_trigger(yellow_trigger, NULL, env);
+ run_trigger(yellow_trigger, NULL, env, false);
for (i = 0; i < ei; i++)
free(env[i]);
out: