diff options
author | Andi Kleen <github@halobates.de> | 2018-05-14 15:12:30 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2018-05-14 15:12:30 -0700 |
commit | c79b2e61436f6158c73d049f5fc01eb264124589 (patch) | |
tree | bcce77dac498da95f56d29fa7654bf193b22fb6a | |
parent | 0a310abe7b0788ef100b5b2a8eb7c63f04bff1d7 (diff) | |
parent | 9f86b857838624e655abf5744d58c71102101451 (diff) | |
download | mcelog-c79b2e61436f6158c73d049f5fc01eb264124589.tar.gz |
Merge pull request #68 from zoucao-ali/master
add a pre and post trigger when mcelog do memory soft offline
-rw-r--r-- | Makefile | 4 | ||||
-rw-r--r-- | bus.c | 4 | ||||
-rw-r--r-- | dimm.c | 2 | ||||
-rw-r--r-- | leaky-bucket.h | 1 | ||||
-rw-r--r-- | mcelog.conf | 8 | ||||
-rw-r--r-- | mcelog.conf.5 | 23 | ||||
-rw-r--r-- | memdb.c | 4 | ||||
-rw-r--r-- | memdb.h | 2 | ||||
-rw-r--r-- | page.c | 38 | ||||
-rw-r--r-- | trigger.c | 7 | ||||
-rw-r--r-- | trigger.h | 8 | ||||
-rw-r--r-- | triggers/page-error-post-sync-soft-trigger | 39 | ||||
-rwxr-xr-x | triggers/page-error-pre-sync-soft-trigger | 39 | ||||
-rw-r--r-- | unknown.c | 2 | ||||
-rw-r--r-- | yellow.c | 2 |
15 files changed, 165 insertions, 18 deletions
@@ -21,7 +21,9 @@ TRIGGERS=cache-error-trigger dimm-error-trigger page-error-trigger \ socket-memory-error-trigger \ bus-error-trigger \ iomca-error-trigger \ - unknown-error-trigger + unknown-error-trigger \ + page-error-pre-sync-soft-trigger \ + page-error-post-sync-soft-trigger all: mcelog @@ -82,7 +82,7 @@ void run_bus_trigger(int socket, int cpu, char *level, char *pp, char *rrrr, env[ei] = NULL; assert(ei < MAX_ENV); - run_trigger(bus_trigger, NULL, env); + run_trigger(bus_trigger, NULL, env, false); for (i = 0; i < ei; i++) free(env[i]); free(msg); @@ -119,7 +119,7 @@ void run_iomca_trigger(int socket, int cpu, int seg, int bus, int dev, int fn) env[ei] = NULL; assert(ei < MAX_ENV); - run_trigger(iomca_trigger, NULL, env); + run_trigger(iomca_trigger, NULL, env, false); for (i = 0; i < ei; i++) free(env[i]); free(msg); @@ -374,7 +374,7 @@ void new_error(unsigned long long addr, unsigned long max_error, char *trigger) Lprintf("Large number of corrected errors in memory at %s", loc); Lprintf("Consider replacing it"); if (trigger && trigger[0]) - run_trigger(trigger, loc, val, max_error); + run_trigger(trigger, loc, val, max_error, false); } } free(devs); diff --git a/leaky-bucket.h b/leaky-bucket.h index 860ba3c..18bd02d 100644 --- a/leaky-bucket.h +++ b/leaky-bucket.h @@ -2,6 +2,7 @@ #define LEAKY_BUCKET_H 1 #include <time.h> +#include <stdbool.h> /* Leaky bucket algorithm for triggers */ diff --git a/mcelog.conf b/mcelog.conf index 54e2b91..b0ccc2d 100644 --- a/mcelog.conf +++ b/mcelog.conf @@ -174,6 +174,14 @@ memory-ce-log = yes #memory-ce-action = off|account|soft|hard|soft-then-hard memory-ce-action = soft +# Trigger script before doing soft memory offline +# this trigger will scan and run all the scripts in the page-error-pre-sync-soft-trigger.extern +memory-pre-sync-soft-ce-trigger = page-error-pre-sync-soft-trigger + +# Trigger script after completing soft memory offline +# this trigger will scan and run all the scripts in the 
page-error-post-sync-soft-trigger.extern +memory-post-sync-soft-ce-trigger = page-error-post-sync-soft-trigger + [trigger] # Maximum number of running triggers children-max = 2 diff --git a/mcelog.conf.5 b/mcelog.conf.5 index c87f23e..261e026 100644 --- a/mcelog.conf.5 +++ b/mcelog.conf.5 @@ -1,5 +1,5 @@ -." Auto generated mcelog.conf manpage. Do not edit. +.\" Auto generated mcelog.conf manpage. Do not edit. .TH "mcelog.conf" 5 "mcelog" .SH NAME @@ -8,12 +8,11 @@ mcelog.conf \- mcelog.conf reference .B /etc/mcelog.conf .SH DESCRIPTION -/etc/mcelog.conf is the main configuration file for +/etc/mcelog.conf is the main configuration file for .B mcelog(8). -This is configuration file separated into sections including +This is configuration file separated into sections including a default section. - General format .PP .B optionname = value @@ -266,6 +265,18 @@ soft-then-hard First try to soft offline, then try hard offlining .B memory-ce-action = soft .PP .PP +Trigger script before doing soft memory offline +this trigger will scan and run all the scripts in the page-error-pre-sync-soft-trigger.extern +.PP +.B memory-pre-sync-soft-ce-trigger = page-error-pre-sync-soft-trigger +.PP +.PP +Trigger script after completing soft memory offline +this trigger will scan and run all the scripts in the page-error-post-sync-soft-trigger.extern +.PP +.B memory-post-sync-soft-ce-trigger = page-error-post-sync-soft-trigger +.PP +.PP .SS "The trigger config section" Maximum number of running triggers .PP @@ -277,7 +288,7 @@ Execute triggers in this directory .PP .SH SEE ALSO -.BR mcelog (8) -, +.BR mcelog (8), +.BR mcelog.triggers (5) .B http://www.mcelog.org @@ -211,11 +211,11 @@ account_memdb(struct err_triggers *t, struct memdimm *md, struct mce *m) if (m->status & MCI_STATUS_UC) { md->uc.count++; if (__bucket_account(&t->uc_bucket_conf, &md->uc.bucket, 1, m->time)) - memdb_trigger(msg, md, m->time, &md->uc, &t->uc_bucket_conf); + memdb_trigger(msg, md, m->time, &md->uc, &t->uc_bucket_conf, false); 
} else { md->ce.count++; if (__bucket_account(&t->ce_bucket_conf, &md->ce.bucket, 1, m->time)) - memdb_trigger(msg, md, m->time, &md->ce, &t->ce_bucket_conf); + memdb_trigger(msg, md, m->time, &md->ce, &t->ce_bucket_conf, false); } free(msg); } @@ -20,5 +20,5 @@ void memory_error(struct mce *m, int channel, int dimm, unsigned corr_err_cnt, struct memdimm; void memdb_trigger(char *msg, struct memdimm *md, time_t t, - struct err_type *et, struct bucket_conf *bc); + struct err_type *et, struct bucket_conf *bc, bool sync); struct memdimm *get_memdimm(int socketid, int channel, int dimm, int insert); @@ -30,6 +30,7 @@ #include <errno.h> #include <string.h> #include "memutil.h" +#include "trigger.h" #include "mcelog.h" #include "rbtree.h" #include "leaky-bucket.h" @@ -55,6 +56,7 @@ struct mempage { static struct rb_root mempage_root; static struct bucket_conf page_trigger_conf; +static char *page_error_pre_soft_trigger, *page_error_post_soft_trigger; static const char *page_state[] = { [PAGE_ONLINE] = "online", @@ -221,7 +223,26 @@ void account_page_error(struct mce *m, int channel, int dimm) memdb_trigger(msg, md, t, &mp->ce, &page_trigger_conf); free(msg); mp->triggered = 1; - offline_action(mp, addr); + + if (offline == OFFLINE_SOFT || offline == OFFLINE_SOFT_THEN_HARD) { + struct bucket_conf page_soft_trigger_conf; + + memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf)); + page_soft_trigger_conf.trigger = page_error_pre_soft_trigger; + asprintf(&msg, "pre soft trigger run for page %llx", addr); + memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, true); + free(msg); + + offline_action(mp, addr); + + memcpy(&page_soft_trigger_conf, &page_trigger_conf, sizeof(struct bucket_conf)); + page_soft_trigger_conf.trigger = page_error_post_soft_trigger; + asprintf(&msg, "post soft trigger run for page %llx", addr); + memdb_trigger(msg, md, t, &mp->ce, &page_soft_trigger_conf, true); + free(msg); + + } else + offline_action(mp, addr); } } @@ 
-262,4 +283,19 @@ void page_setup(void) Lprintf("Kernel does not support page offline interface\n"); offline = OFFLINE_ACCOUNT; } + + page_error_pre_soft_trigger = config_string("page", "memory-pre-sync-soft-ce-trigger"); + + if (page_error_pre_soft_trigger && trigger_check(page_error_pre_soft_trigger) < 0) { + SYSERRprintf("Cannot access page soft pre trigger `%s'", + page_error_pre_soft_trigger); + exit(1); + } + + page_error_post_soft_trigger= config_string("page", "memory-post-sync-soft-ce-trigger"); + if (page_error_post_soft_trigger && trigger_check(page_error_post_soft_trigger) < 0) { + SYSERRprintf("Cannot access page soft post trigger `%s'", + page_error_post_soft_trigger); + exit(1); + } } @@ -20,6 +20,7 @@ #include <unistd.h> #include <stdlib.h> #include <signal.h> +#include <stdbool.h> #include <string.h> #include <sys/wait.h> #include "trigger.h" @@ -40,6 +41,8 @@ static int num_children; static int children_max = 4; static char *trigger_dir; +static void finish_child(pid_t child, int status); + pid_t mcelog_fork(const char *name) { pid_t child; @@ -58,9 +61,11 @@ pid_t mcelog_fork(const char *name) } // note: trigger must be allocated, e.g. 
from config -void run_trigger(char *trigger, char *argv[], char **env) +void run_trigger(char *trigger, char *argv[], char **env, bool sync) { pid_t child; + int status; + char *fallback_argv[] = { trigger, NULL, @@ -1,5 +1,11 @@ -void run_trigger(char *trigger, char *argv[], char **env); +#ifndef __TRIGGER_H__ +#define __TRIGGER_H__ + +#include <stdbool.h> +void run_trigger(char *trigger, char *argv[], char **env, bool sync); void trigger_setup(void); void trigger_wait(void); int trigger_check(char *); pid_t mcelog_fork(const char *thread_name); + +#endif diff --git a/triggers/page-error-post-sync-soft-trigger b/triggers/page-error-post-sync-soft-trigger new file mode 100644 index 0000000..e8e6ec6 --- /dev/null +++ b/triggers/page-error-post-sync-soft-trigger @@ -0,0 +1,39 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a page +# in memory exceeds a pre-configured corrected error threshold. +# mcelog internally also supports offlining the page through the kernel. 
+# +# environment: +# THRESHOLD human readable threshold status +# MESSAGE Human readable consolidated error message +# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on +# what triggered the event +# LOCATION Consolidated location as a single string +# DMI_LOCATION DIMM location from DMI/SMBIOS if available +# DMI_NAME DIMM identifier from DMI/SMBIOS if available +# DIMM DIMM number reported by hardware +# CHANNEL Channel number reported by hardware +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CECOUNT Total corrected error count for DIMM +# UCCOUNT Total uncorrected error count for DIMM +# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds) +# THRESHOLD_COUNT Total number of events in current threshold time period of specific type +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./page-error-post-sync-soft-trigger.local ] && . ./page-error-post-sync-soft-trigger.local + +if [ -d page-error-post-sync-soft-trigger.extern ] +then + ls page-error-post-sync-soft-trigger.extern | + while read item + do + [ -x ./page-error-post-sync-soft-trigger.extern/$item ] && . ./page-error-post-sync-soft-trigger.extern/$item + done +fi + +exit 0 diff --git a/triggers/page-error-pre-sync-soft-trigger b/triggers/page-error-pre-sync-soft-trigger new file mode 100755 index 0000000..27269c6 --- /dev/null +++ b/triggers/page-error-pre-sync-soft-trigger @@ -0,0 +1,39 @@ +#!/bin/sh +# This shell script can be executed by mcelog in daemon mode when a page +# in memory exceeds a pre-configured corrected error threshold. +# mcelog internally also supports offlining the page through the kernel. 
+# +# environment: +# THRESHOLD human readable threshold status +# MESSAGE Human readable consolidated error message +# TOTALCOUNT total count of errors for current DIMM of CE/UC depending on +# what triggered the event +# LOCATION Consolidated location as a single string +# DMI_LOCATION DIMM location from DMI/SMBIOS if available +# DMI_NAME DIMM identifier from DMI/SMBIOS if available +# DIMM DIMM number reported by hardware +# CHANNEL Channel number reported by hardware +# SOCKETID Socket ID of CPU that includes the memory controller with the DIMM +# CECOUNT Total corrected error count for DIMM +# UCCOUNT Total uncorrected error count for DIMM +# LASTEVENT Time stamp of event that triggered threshold (in time_t format, seconds) +# THRESHOLD_COUNT Total number of events in current threshold time period of specific type +# +# note: will run as mcelog configured user +# this can be changed in mcelog.conf + +logger -s -p daemon.err -t mcelog "$MESSAGE" +logger -s -p daemon.err -t mcelog "Location: $LOCATION" + +[ -x ./page-error-pre-soft-trigger.local ] && . ./page-error-pre-soft-trigger.local + +if [ -d page-error-pre-sync-soft-trigger.extern ] +then + ls page-error-pre-sync-soft-trigger.extern | + while read item + do + [ -x ./page-error-pre-sync-soft-trigger.extern/$item ] && . ./page-error-pre-sync-soft-trigger.extern/$item + done +fi + +exit 0 @@ -73,7 +73,7 @@ void run_unknown_trigger(int socket, int cpu, struct mce *log) env[ei] = NULL; assert(ei < MAX_ENV); - run_trigger(unknown_trigger, NULL, env); + run_trigger(unknown_trigger, NULL, env, false); for (i = 0; i < ei; i++) free(env[i]); free(msg); @@ -95,7 +95,7 @@ void run_yellow_trigger(int cpu, int tnum, int lnum, char *ts, char *ls, int soc env[ei] = NULL; assert(ei < MAX_ENV); - run_trigger(yellow_trigger, NULL, env); + run_trigger(yellow_trigger, NULL, env, false); for (i = 0; i < ei; i++) free(env[i]); out: |