diff options
author | Len Brown <len.brown@intel.com> | 2017-05-26 22:50:54 -0700 |
---|---|---|
committer | Len Brown <len.brown@intel.com> | 2017-05-26 22:50:54 -0700 |
commit | a34611e6b6897001c0ba0187a8737b13e4e7f7f7 (patch) | |
tree | 048291c1cbde932581d6b7cf146499b3ba595c97 | |
parent | a351e9b9fc24e982ec2f0e76379a49826036da12 (diff) | |
download | linux-shepherd.tar.gz |
shepherd: Scheduler Helper Daemon (branch: shepherd)
v1 - as shown at OSTS
Signed-off-by: Len Brown <len.brown@intel.com>
-rw-r--r-- | tools/power/shepherd/Makefile | 6 | ||||
-rw-r--r-- | tools/power/shepherd/cpuset.c | 413 | ||||
-rw-r--r-- | tools/power/shepherd/shepherd.1 | 61 | ||||
-rw-r--r-- | tools/power/shepherd/shepherd.c | 1201 |
4 files changed, 1681 insertions, 0 deletions
diff --git a/tools/power/shepherd/Makefile b/tools/power/shepherd/Makefile new file mode 100644 index 00000000000000..6a74f892b44e71 --- /dev/null +++ b/tools/power/shepherd/Makefile @@ -0,0 +1,6 @@ + +shepherd: shepherd.c + cc -o shepherd shepherd.c +# cc -o shepherd shepherd.c -lcpuset +clean: + rm shepherd diff --git a/tools/power/shepherd/cpuset.c b/tools/power/shepherd/cpuset.c new file mode 100644 index 00000000000000..f875949c12bf78 --- /dev/null +++ b/tools/power/shepherd/cpuset.c @@ -0,0 +1,413 @@ +/* + * Terminology: + * + * cpuset - (libc) cpu_set_t data structure represents set of CPUs + * cpumask - string with hex mask (e.g. "0x00000001") + * cpulist - string with CPU ranges (e.g. "0-3,5,7,8") + * + * Based on code from taskset.c and Linux kernel. + * + * This file may be redistributed under the terms of the + * GNU Lesser General Public License. + * + * Copyright (C) 2010 Karel Zak <kzak@redhat.com> + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sched.h> +#include <errno.h> +#include <string.h> +#include <ctype.h> +#include <sys/syscall.h> + +#include "cpuset.h" +#define cpuset_nbits(setsize) (8 * (setsize)) + +#if 0 +#include "c.h" + +static inline int val_to_char(int v) +{ + if (v >= 0 && v < 10) + return '0' + v; + else if (v >= 10 && v < 16) + return ('a' - 10) + v; + else + return -1; +} + +static inline int char_to_val(int c) +{ + int cl; + + cl = tolower(c); + if (c >= '0' && c <= '9') + return c - '0'; + else if (cl >= 'a' && cl <= 'f') + return cl + (10 - 'a'); + else + return -1; +} + +#endif +static const char *nexttoken(const char *q, int sep) +{ + if (q) + q = strchr(q, sep); + if (q) + q++; + return q; +} + +#if 0 +/* + * Number of bits in a CPU bitmask on current system + */ +int get_max_number_of_cpus(void) +{ +#ifdef SYS_sched_getaffinity + int n, cpus = 2048; + size_t setsize; + cpu_set_t *set = cpuset_alloc(cpus, &setsize, NULL); + + if (!set) + return -1; /* error */ + + for (;;) { + 
CPU_ZERO_S(setsize, set); + + /* the library version does not return size of cpumask_t */ + n = syscall(SYS_sched_getaffinity, 0, setsize, set); + + if (n < 0 && errno == EINVAL && cpus < 1024 * 1024) { + cpuset_free(set); + cpus *= 2; + set = cpuset_alloc(cpus, &setsize, NULL); + if (!set) + return -1; /* error */ + continue; + } + cpuset_free(set); + return n * 8; + } +#endif + return -1; +} + +/* + * Allocates a new set for ncpus and returns size in bytes and size in bits + */ +cpu_set_t *cpuset_alloc(int ncpus, size_t *setsize, size_t *nbits) +{ + cpu_set_t *set = CPU_ALLOC(ncpus); + + if (!set) + return NULL; + if (setsize) + *setsize = CPU_ALLOC_SIZE(ncpus); + if (nbits) + *nbits = cpuset_nbits(CPU_ALLOC_SIZE(ncpus)); + return set; +} + +void cpuset_free(cpu_set_t *set) +{ + CPU_FREE(set); +} + +#if !HAVE_DECL_CPU_ALLOC +/* Please, use CPU_COUNT_S() macro. This is fallback */ +int __cpuset_count_s(size_t setsize, const cpu_set_t *set) +{ + int s = 0; + const __cpu_mask *p = set->__bits; + const __cpu_mask *end = &set->__bits[setsize / sizeof (__cpu_mask)]; + + while (p < end) { + __cpu_mask l = *p++; + + if (l == 0) + continue; +# if LONG_BIT > 32 + l = (l & 0x5555555555555555ul) + ((l >> 1) & 0x5555555555555555ul); + l = (l & 0x3333333333333333ul) + ((l >> 2) & 0x3333333333333333ul); + l = (l & 0x0f0f0f0f0f0f0f0ful) + ((l >> 4) & 0x0f0f0f0f0f0f0f0ful); + l = (l & 0x00ff00ff00ff00fful) + ((l >> 8) & 0x00ff00ff00ff00fful); + l = (l & 0x0000ffff0000fffful) + ((l >> 16) & 0x0000ffff0000fffful); + l = (l & 0x00000000fffffffful) + ((l >> 32) & 0x00000000fffffffful); +# else + l = (l & 0x55555555ul) + ((l >> 1) & 0x55555555ul); + l = (l & 0x33333333ul) + ((l >> 2) & 0x33333333ul); + l = (l & 0x0f0f0f0ful) + ((l >> 4) & 0x0f0f0f0ful); + l = (l & 0x00ff00fful) + ((l >> 8) & 0x00ff00fful); + l = (l & 0x0000fffful) + ((l >> 16) & 0x0000fffful); +# endif + s += l; + } + return s; +} +#endif + +#endif /* 0 */ +/* + * Returns human readable representation of the cpuset. 
The output format is + * a list of CPUs with ranges (for example, "0,1,3-9"). + */ +char *cpulist_create(char *str, size_t len, + cpu_set_t *set, size_t setsize) +{ + size_t i; + char *ptr = str; + int entry_made = 0; + size_t max = cpuset_nbits(setsize); + + for (i = 0; i < max; i++) { + if (CPU_ISSET_S(i, setsize, set)) { + int rlen; + size_t j, run = 0; + entry_made = 1; + for (j = i + 1; j < max; j++) { + if (CPU_ISSET_S(j, setsize, set)) + run++; + else + break; + } + if (!run) + rlen = snprintf(ptr, len, "%zu,", i); + else if (run == 1) { + rlen = snprintf(ptr, len, "%zu,%zu,", i, i + 1); + i++; + } else { + rlen = snprintf(ptr, len, "%zu-%zu,", i, i + run); + i += run; + } + if (rlen < 0 || (size_t) rlen + 1 > len) + return NULL; + ptr += rlen; + if (rlen > 0 && len > (size_t) rlen) + len -= rlen; + else + len = 0; + } + } + ptr -= entry_made; + *ptr = '\0'; + + return str; +} + +#if 0 +/* + * Returns string with CPU mask. + */ +char *cpumask_create(char *str, size_t len, + cpu_set_t *set, size_t setsize) +{ + char *ptr = str; + char *ret = NULL; + int cpu; + + for (cpu = cpuset_nbits(setsize) - 4; cpu >= 0; cpu -= 4) { + char val = 0; + + if (len == (size_t) (ptr - str)) + break; + + if (CPU_ISSET_S(cpu, setsize, set)) + val |= 1; + if (CPU_ISSET_S(cpu + 1, setsize, set)) + val |= 2; + if (CPU_ISSET_S(cpu + 2, setsize, set)) + val |= 4; + if (CPU_ISSET_S(cpu + 3, setsize, set)) + val |= 8; + + if (!ret && val) + ret = ptr; + *ptr++ = val_to_char(val); + } + *ptr = '\0'; + return ret ? ret : ptr - 1; +} + +/* + * Parses string with CPUs mask. 
+ */ +int cpumask_parse(const char *str, cpu_set_t *set, size_t setsize) +{ + int len = strlen(str); + const char *ptr = str + len - 1; + int cpu = 0; + + /* skip 0x, it's all hex anyway */ + if (len > 1 && !memcmp(str, "0x", 2L)) + str += 2; + + CPU_ZERO_S(setsize, set); + + while (ptr >= str) { + char val; + + /* cpu masks in /sys uses comma as a separator */ + if (*ptr == ',') + ptr--; + + val = char_to_val(*ptr); + if (val == (char) -1) + return -1; + if (val & 1) + CPU_SET_S(cpu, setsize, set); + if (val & 2) + CPU_SET_S(cpu + 1, setsize, set); + if (val & 4) + CPU_SET_S(cpu + 2, setsize, set); + if (val & 8) + CPU_SET_S(cpu + 3, setsize, set); + len--; + ptr--; + cpu += 4; + } + + return 0; +} + +#endif /* 0 */ +/* + * Parses string with list of CPU ranges. + * Returns 0 on success. + * Returns 1 on error. + * Returns 2 if fail is set and a cpu number passed in the list doesn't fit + * into the cpu_set. If fail is not set cpu numbers that do not fit are + * ignored and 0 is returned instead. 
+ */ +int cpulist_parse(const char *str, cpu_set_t *set, size_t setsize, int fail) +{ + size_t max = cpuset_nbits(setsize); + const char *p, *q; + int r = 0; + + q = str; + CPU_ZERO_S(setsize, set); + + while (p = q, q = nexttoken(q, ','), p) { + unsigned int a; /* beginning of range */ + unsigned int b; /* end of range */ + unsigned int s; /* stride */ + const char *c1, *c2; + char c; + + if ((r = sscanf(p, "%u%c", &a, &c)) < 1) + return 1; + b = a; + s = 1; + + c1 = nexttoken(p, '-'); + c2 = nexttoken(p, ','); + if (c1 != NULL && (c2 == NULL || c1 < c2)) { + if ((r = sscanf(c1, "%u%c", &b, &c)) < 1) + return 1; + c1 = nexttoken(c1, ':'); + if (c1 != NULL && (c2 == NULL || c1 < c2)) { + if ((r = sscanf(c1, "%u%c", &s, &c)) < 1) + return 1; + if (s == 0) + return 1; + } + } + + if (!(a <= b)) + return 1; + while (a <= b) { + if (fail && (a >= max)) + return 2; + CPU_SET_S(a, setsize, set); + a += s; + } + } + + if (r == 2) + return 1; + return 0; +} + +#if 0 +#ifdef TEST_PROGRAM + +#include <getopt.h> + +int main(int argc, char *argv[]) +{ + cpu_set_t *set; + size_t setsize, buflen, nbits; + char *buf, *mask = NULL, *range = NULL; + int ncpus = 2048, rc, c; + + static const struct option longopts[] = { + { "ncpus", 1, 0, 'n' }, + { "mask", 1, 0, 'm' }, + { "range", 1, 0, 'r' }, + { NULL, 0, 0, 0 } + }; + + while ((c = getopt_long(argc, argv, "n:m:r:", longopts, NULL)) != -1) { + switch(c) { + case 'n': + ncpus = atoi(optarg); + break; + case 'm': + mask = strdup(optarg); + break; + case 'r': + range = strdup(optarg); + break; + default: + goto usage_err; + } + } + + if (!mask && !range) + goto usage_err; + + set = cpuset_alloc(ncpus, &setsize, &nbits); + if (!set) + err(EXIT_FAILURE, "failed to allocate cpu set"); + + /* + fprintf(stderr, "ncpus: %d, cpuset bits: %zd, cpuset bytes: %zd\n", + ncpus, nbits, setsize); + */ + + buflen = 7 * nbits; + buf = malloc(buflen); + if (!buf) + err(EXIT_FAILURE, "failed to allocate cpu set buffer"); + + if (mask) + rc = 
cpumask_parse(mask, set, setsize); + else + rc = cpulist_parse(range, set, setsize, 0); + + if (rc) + errx(EXIT_FAILURE, "failed to parse string: %s", mask ? : range); + + printf("%-15s = %15s ", mask ? : range, + cpumask_create(buf, buflen, set, setsize)); + printf("[%s]\n", cpulist_create(buf, buflen, set, setsize)); + + free(buf); + free(mask); + free(range); + cpuset_free(set); + + return EXIT_SUCCESS; + +usage_err: + fprintf(stderr, + "usage: %s [--ncpus <num>] --mask <mask> | --range <list>", + program_invocation_short_name); + exit(EXIT_FAILURE); +} +#endif +#endif /* 0 */ diff --git a/tools/power/shepherd/shepherd.1 b/tools/power/shepherd/shepherd.1 new file mode 100644 index 00000000000000..f4591d9688cfd0 --- /dev/null +++ b/tools/power/shepherd/shepherd.1 @@ -0,0 +1,61 @@ +.\" shepherd(1) manpage +.\" +.\" Copyright (C) 2017 Len Brown +.\" +.\" Released under GPLv2 +.\" +.TH SHEPHERD 8 "May 2017" "util-linux" "User Commands" +.SH NAME +shepherd \- ScHEduler helPER Daemon +.SH SYNOPSIS +.B shepherd +[options] +.IR command\ [ argument ...] +.br +.B shepherd +[options] +.SH DESCRIPTION +.PP +.B shepherd +implements system-wide power-aware scheduling from a user-space daemon, using +the thread affinity. +.SH OPTIONS +.BR \-d ,\ \-\-debug +Enable verbose debugging output, repeat for even more... +.TP +.TP +.BR \-h ,\ \-\-help +Display version and help text and exit. +.SH USAGE +.TP +Shepherd will fork into a daemon, unless invoked with --debug. +.B shepherd +.IR command\ [ arguments ] +.TP +You can also retrieve the CPU affinity of an existing task: +.B shepherd \-p +.I pid +.TP +Or set it: +.B shepherd \-p +.SH PERMISSIONS +shepherd must run as root. +.SH EXIT VALUE +0 for success, non-zero for failure. +.SH NOTES +.pp +.SH SEE ALSO +.BR taskset (1), +.BR sched_setaffinity (2), +.BR sched_getaffinity (2) +.sp +See +.SH AUTHOR +Written by Len Brown +.SH COPYRIGHT +Copyright \(co 2017 Len Brown, Intel Corp. 
+This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +.SH AVAILABILITY +The shepherd command is part of the util-linux package and is available from +ftp://ftp.kernel.org/pub/linux/utils/util-linux/. diff --git a/tools/power/shepherd/shepherd.c b/tools/power/shepherd/shepherd.c new file mode 100644 index 00000000000000..cdf199ec188281 --- /dev/null +++ b/tools/power/shepherd/shepherd.c @@ -0,0 +1,1201 @@ +/* + * shepherd.c + * Scheduler Helper Daemon + * + * see shepherd.1 for details + * + * This program is free software, released under GPLv2 + * Copyright (C) 2016 Len Brown, Intel Corp <len.brown@intel.com> + */ + +#define _GNU_SOURCE /* See feature_test_macros(7) */ +#include <sched.h> + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <getopt.h> +#include <errno.h> +#include <signal.h> +#include <sched.h> +#include <stddef.h> +#include <string.h> +#include <sys/prctl.h> +#include <time.h> +#include <err.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <syslog.h> + +#include "cpuset.h" + +#include "cpuset.c" + +#define UNUSED(x) unsigned int UNUSED_ ## x __attribute__((__unused__)) + +unsigned int debug; + +unsigned int max_cpu_num; +unsigned int present_num_cpus; + +cpu_set_t *cpuset_present; +cpu_set_t *cpuset_active; +size_t system_cpuset_size; + +char *proc_stat = "/proc/stat"; +char *proc_loadavg = "/proc/loadavg"; + +long user_tick_per_sec; +double poll_slow_sec; +double poll_norm_sec; +double poll_fast_sec; + +int exit_main_loop; +FILE *dfp; +char *output_path; + +/* + * cpuset_size_floor + * the CPU set will not shrink smaller than cpuset_size_floor + */ +int cpuset_size_floor = 1; + +/* + * cpuset_size_ceiling + * the CPU set will not grow no larger than cpuset_size_ceiling + */ +int cpuset_size_ceiling; + +#define HISTORY_LENGTH 10 + +struct cpuset_history { + double 
seconds; + unsigned cpuset_num_cpus; + unsigned long long system_idle_tick_sum; +} history[HISTORY_LENGTH]; + +unsigned int *set_size_choosen; +double *set_size_seconds; + +static void debug_dump_history(void) +{ + int i; + + for (i = 0; i < HISTORY_LENGTH; ++i) + { + fprintf(dfp, "history[%d] %d cpus %f sec %lld sum\n", + i, + history[i].cpuset_num_cpus, + history[i].seconds, + history[i].system_idle_tick_sum); + } +} +#ifdef NOTYET +/* + * up_slope_max -- how fast a set can grow per interval + * 1.00 means it can increase by up to 100% of its current size + * 0.50 means it can increase by up to 50% of its current size + * etc. + */ +double up_slope_max = 0.50; +#endif + +#ifdef NOTYET +/* + * up_cpus_max + * maximum number of CPUs to increase set per interval + * default = cpuset_size_ceiling/4; + */ +int up_cpus_max; +#endif + +#ifdef NOTYET +/* + * down_rate + * 0.50 means shrink at 50% of the way to ideal_set size on each interval + */ +double down_rate = 0.5; +#endif + + +/* + * get_next_polling_interval_sec() + * + * Policy decision: + * return ns for next polling interval + */ +static double get_next_polling_interval_sec(double average_utilization, int set_size) +{ + + /* + * If max set size, then poll normally, + * no matter the utilization. We can't grow more, + * and we are not in a big rush to shrink. + */ + if (set_size == cpuset_size_ceiling) + return poll_norm_sec; + + /* + * If min set and very low utilization, poll slowly, + * to minimally disturb profoundly idle system. + */ + if ((set_size == cpuset_size_floor) && (average_utilization < 5)) + return poll_slow_sec; + + /* + * If high utilzation, poll quickly + * to promptly expand capacity with demand + */ + if (average_utilization > 80) + return poll_fast_sec; + + /* + * poll at moderate rate + */ + return poll_norm_sec; +} + +#ifdef NOTYET +/* + * Timer slack gives the kernel permission to coalesce our timeout + * with nearby system timers. 
System default is +/- 0.050 ms, + * which is much tighter than we need. Increase timer slack + * to 1/16th of the requested timeout duration, eg. + * + * 10 ms / 16 = +/- 0.625 ms + * 100 ms / 16 = +/- 6.25 ms + * 1,000 ms / 16 = +/- 62.5 ms + * 2,000 ms / 16 = +/- 125.0 ms + */ + +unsigned int debug_timer_slack_updates; + +unsigned long timer_slack_current_ns; +static void timer_slack_check(double sleep_sec) +{ + unsigned long timer_slack_desired_ns; + + timer_slack_desired_ns = (unsigned long long)sleep_sec / 16 * 1000000000; + + if (timer_slack_desired_ns == timer_slack_current_ns) + return; + + debug_timer_slack_updates++; + + if (prctl(PR_SET_TIMERSLACK, timer_slack_desired_ns) < 0) + err(-1, "PR_SET_TIMERSLACK"); + + timer_slack_current_ns = timer_slack_desired_ns; +} +#endif + +static double timestamp_sec(void) +{ + struct timespec ts; + + if (clock_gettime(CLOCK_REALTIME, &ts) < 0) + err(-1, "clock_gettime"); + + return ((double)ts.tv_sec + (double)ts.tv_nsec/1000000000); +} + +struct timespec ts_begin, ts_end; + +#define DEBUG_SLEEP_ARRAY_SIZE 12 + +unsigned int sleep_request_histogram[DEBUG_SLEEP_ARRAY_SIZE]; +unsigned int sleep_actual_histogram[DEBUG_SLEEP_ARRAY_SIZE]; + +unsigned int debug_sleep_count; +double debug_sleep_stats_start_time; + +static void debug_sleep_histogram_record(unsigned int *a, double interval) +{ + int i; + + for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) { + if (interval >= 2.0) { + a[i]++; + return; + } + interval *= 2; + } +} +static void debug_sleep_histograms_clear(void) +{ + int i; + + for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) { + sleep_request_histogram[i] = 0; + sleep_actual_histogram[i] = 0; + } + + debug_sleep_stats_start_time = timestamp_sec(); +} +static void debug_sleep_histograms_dump(void) +{ + int i; + double seconds = 2.0; + unsigned int request_count, actual_count; + double now, interval; + + request_count = actual_count = 0; + + fprintf(dfp, "Sleep Wake Seconds\n"); + for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; 
++i) { + request_count += sleep_request_histogram[i]; + actual_count += sleep_actual_histogram[i]; + fprintf(dfp, "%5d %4d >= %f\n", + sleep_request_histogram[i], + sleep_actual_histogram[i], seconds); + seconds /= 2; + } + now = timestamp_sec(); + interval = now - debug_sleep_stats_start_time; + fprintf(dfp, "%5d %4d Total, %f per second over %.2f seconds\n", + request_count, actual_count, + (double)request_count/(interval), interval); +} + +static void stats_clear(void) +{ + unsigned int cpus; + + for (cpus = 0; cpus <= present_num_cpus; ++cpus) { + set_size_choosen[cpus] = 0; + set_size_seconds[cpus] = 0.0; + } + debug_sleep_histograms_clear(); +} +static void stats_init(void) +{ + set_size_choosen = calloc(present_num_cpus + 1, sizeof(unsigned int)); + if (set_size_choosen == NULL) + err(1, "calloc(%d, %zd)", present_num_cpus + 1, sizeof(unsigned int)); + + set_size_seconds = calloc(present_num_cpus + 1, sizeof(double)); + if (set_size_seconds == NULL) + err(1, "calloc(%d, %zd)", present_num_cpus + 1, sizeof(double)); + + stats_clear(); +} + +static void stats_dump(void) +{ + unsigned int cpus; + + fprintf(dfp, "CPUs Used Duration[sec]\n"); + + for (cpus = 1; cpus <= present_num_cpus; ++cpus) { + fprintf(dfp, "%4d %4d %8.2f\n", + cpus, set_size_choosen[cpus], set_size_seconds[cpus]); + } +} + +static void dump_statistics(void) +{ + stats_dump(); + debug_sleep_histograms_dump(); + +#if NOTYET + fprintf(dfp, "%u%% timer slack updates (%u)\n", + 100 * debug_timer_slack_updates/debug_sleep_count, debug_timer_slack_updates); + + debug_timer_slack_updates = 0; + debug_sleep_count = 0; +#endif + stats_clear(); + debug_sleep_histograms_clear(); +} + +/* + * sleep_for_sec(sec) + * + * Sleep for 'ns' nanosec + * First allow adjustment to timer_slack + */ + +static void sleep_for_sec(double seconds) +{ + struct timespec ts; + + debug_sleep_count++; + debug_sleep_histogram_record(sleep_request_histogram, seconds); + +#if NOTYET + timer_slack_check(seconds); +#endif + + 
ts.tv_sec = (time_t)seconds; + + if (debug > 1) + fprintf(dfp, "input seconds %f -> tv seconds %lld\n", + seconds, (long long int)ts.tv_sec); + + seconds = seconds - ts.tv_sec; + ts.tv_nsec = (long)(seconds * 1000000000.0); + + if (debug > 1) + fprintf(dfp, "input seconds * 1B %f -> tv nsec %ld\n", + seconds * 1000000000, ts.tv_nsec); + + if (nanosleep(&ts, NULL) < 0) { + if (errno == EINTR) { + /* early return if signaled */ + return; + } + err(-1, "nanosleep"); + } +} +static void __attribute__((__noreturn__)) usage(FILE *out) +{ + fprintf(out, "shepherd 17.05.21a (C) 2017 Len Brown <len.brown@intel.com>\n"); + fprintf(out, "Usage: shepherd [options]\n"); + fprintf(out, "'man shepherd' for details\n"); + + exit(out == stderr ? EXIT_FAILURE: EXIT_SUCCESS); +} + +static void cmdline(int argc, char **argv) +{ + int c; + + static const struct option longopts[] = { + { "debug", 0, NULL, 'd' }, + { "help", 0, NULL, 'h' }, + { "output", 0, NULL, 'o' }, + }; + + while ((c = getopt_long_only(argc, argv, "+dho:", longopts, NULL)) != -1) { + switch (c) { + case 'd': + debug++; + break; + case 'h': + usage(stdout); + break; + case 'o': + output_path = optarg; + break; + default: + usage(stderr); + break; + } + } +} + +static void init_default_sample_rates(void) +{ + + double user_tick_sec; + + user_tick_per_sec = sysconf(_SC_CLK_TCK); + + if (user_tick_per_sec != 100) + warnx("User ticks %ld, expected 100", user_tick_per_sec); + + user_tick_sec = 1.0 / user_tick_per_sec; + + /* + * Nominal wakeup frequency is once per second + * 1.0 sec/sample = 1 sample/sec + */ + poll_norm_sec = 1; + + /* + * Slow wakeup frequency is used during profound idle + * and at max utilization of full set size + * 2.0 sec/sample = 0.5 sample/sec + */ + poll_slow_sec = 2; + + /* + * Fast wakeup frequency is used at high utilization + * Sample at 10 * user user tick rate, which means + * we will see at most 10 counter ticks/cpu/sample. 
+ * Since user_tick_sec = 10ms, + * sample rate = 10 * 10ms = 100ms = 10 samples/sec + */ + poll_fast_sec = user_tick_sec * 10; + +} + +static void defaults_init(void) +{ + init_default_sample_rates(); + + // set_size_ceiling = TBD; + +} +/* + * Open a file, and exit on failure + */ +static FILE *fopen_or_die(const char *path, const char *mode) +{ + FILE *filep = fopen(path, mode); + if (!filep) + err(1, "%s: open failed", path); + return filep; +} +/* + * run func(cpu) on every cpu in /proc/stat + * return sum of return values of func(cpu) + * else, if func() returns negative value, return that. + */ +static unsigned long long sum_for_all_proc_cpus(int (func)(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int), unsigned int arg) +{ + FILE *fp; + int cpu_num, user, nice, system, idle; + int retval; + unsigned long long sum = 0; + + fp = fopen_or_die(proc_stat, "r"); + + retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n"); + if (retval != 0) + err(1, "%s: failed to parse format", proc_stat); + + while (1) { + retval = fscanf(fp, "cpu%u %d %d %d %d %*d %*d %*d %*d %*d %*d\n", + &cpu_num, &user, &nice, &system, &idle); + if (retval != 5) + break; + + retval = func(arg, cpu_num, user, nice, system, idle); + if (retval < 0) { + fclose(fp); + return(retval); + } + sum += retval; + } + fclose(fp); + return sum; +} + +static unsigned int get_num_runnable_threads(void) +{ + FILE *fp; + int retval; + unsigned int num_threads; + + fp = fopen_or_die(proc_loadavg, "r"); + + retval = fscanf(fp, "%*f %*f %*f %ud/%*d %*d\n", &num_threads); + if (retval != 1) + err(1, "%s", proc_stat); + + if (debug > 2) + fprintf(dfp, "%d threads\n", num_threads); + fclose(fp); + + return num_threads; +} +#if NOTYET +static int for_all_cpuset_cpus(cpu_set_t *set, int (func)(unsigned int)) +{ + unsigned int cpu; + int retval; + + for (cpu = 0; cpu <= max_cpu_num; ++cpu) { + if (!CPU_ISSET_S(cpu, system_cpuset_size, set)) + continue; + retval = 
func(cpu); + if (retval) + return retval; + } + return 0; +} +#endif +/* + * cpu_count() + * update global max_cpu_num, return 1 (per cpu) + */ +static int cpu_count(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d)) +{ + if (cpu > max_cpu_num) + max_cpu_num = cpu; + + if (debug > 1) + fprintf(dfp, "cpu%d: %d\n", cpu, present_num_cpus); + return 1; +} + +static int cpu_mark_present(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d)) +{ + CPU_SET_S(cpu, system_cpuset_size, cpuset_present); + return 0; +} + +/* + * for given CPU, starting with INDEX 0 + * Find first /sys/devices/system/cpu/$CPU/cache/$INDEX/shared_cpu_list + * and return it in cpuset_cache_topology arg + * + * return 0 on success, non-0 on failure + */ +static int get_cache_shared_cpu_list(unsigned int cpu_num, cpu_set_t *cpuset_cache_topology) +{ + FILE *fp; + int index; + char buf[256]; + +int retval; + + for (index = 0; index < 10; ++index) { + char *s; + + sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d", cpu_num, index); + if (access(buf, R_OK) != 0) { + if (debug) + perror(buf); + return -1; + } + sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_list", cpu_num, index); + fp = fopen(buf, "r"); + if (fp == NULL) { + if (debug) perror(buf); + continue; + } + + if (fgets(buf, sizeof(buf), fp) == NULL) + err(1, "%s", buf); + + s = strchr(buf, '\n'); + *s = '\0'; + + if (debug > 1) + fprintf(dfp, "get_cache_shared_cpu_list(%d): read '%s'\n", cpu_num, buf); + retval = cpulist_parse(buf, cpuset_cache_topology, system_cpuset_size, 1); + return retval; + } + return -1; +} +/* + * cpus[i] is the order that CPUs are added to the set. + * This mapping is used to allocate cpus to the set in "topology order". + * But even for a linear mapping it is needed to deal with sparse numbering. 
+ */ +unsigned int *cpus; + +static void topology_init_cpu_order(void) +{ + int index; + unsigned int cpu_num; + cpu_set_t *cpuset_cache_topology, *cpuset_unprocessed; + + cpuset_cache_topology = CPU_ALLOC((max_cpu_num + 1)); + if (cpuset_cache_topology == NULL) + err(3, "CPU_ALLOC"); + + cpuset_unprocessed = CPU_ALLOC((max_cpu_num + 1)); + if (cpuset_unprocessed == NULL) + err(3, "CPU_ALLOC"); + + CPU_ZERO_S(system_cpuset_size, cpuset_unprocessed); + + CPU_OR_S(system_cpuset_size, cpuset_unprocessed, cpuset_unprocessed, cpuset_active); + + for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) { + unsigned int cache_cpu_num; + + if (!CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_unprocessed)) + continue; + + CPU_ZERO_S(system_cpuset_size, cpuset_cache_topology); + if (get_cache_shared_cpu_list(cpu_num, cpuset_cache_topology)) { + if (debug) fprintf(dfp, "BAILING to LINEAR\n"); + break; + } + for (cache_cpu_num = 0; cache_cpu_num <= max_cpu_num; ++cache_cpu_num) + { + if (!CPU_ISSET_S(cache_cpu_num, system_cpuset_size, cpuset_cache_topology)) + continue; + cpus[index] = cache_cpu_num; + if (debug) + fprintf(dfp, "%d: cpus[%d] = cpu%d\n", cpu_num, index, cache_cpu_num); + CPU_CLR_S(cache_cpu_num, system_cpuset_size, cpuset_unprocessed); + index++; + } + } + + CPU_FREE(cpuset_cache_topology); + CPU_FREE(cpuset_unprocessed); + + if (cpu_num == max_cpu_num + 1) + return; + + fprintf(dfp, "BACKUP plan: linear %d\n", cpu_num); + /* linear */ + for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) { + if (CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_active)) { + cpus[index] = cpu_num; + if (debug > 1) + fprintf(dfp, "cpus[%d] = cpu%d\n", index, cpu_num); + index++; + } + } +} + +static void topology_probe(void) +{ + present_num_cpus = 0; + max_cpu_num = 0; + present_num_cpus = sum_for_all_proc_cpus(cpu_count, 0); + cpuset_size_ceiling = present_num_cpus; + + if (debug > 1) + fprintf(dfp, "%d cpus, cpu%d is highest numbered\n", + present_num_cpus, 
max_cpu_num); + + /* + * Allocate the ordered CPU topology list + * indexed by size of active_set + * each entry containes cpu# which is last + * onlined to complete a set of that size + */ + cpus = calloc(present_num_cpus, sizeof(unsigned int)); + if (cpus == NULL) + err(1, "calloc(%d, %zd)", present_num_cpus, sizeof(unsigned int)); + + /* + * Allocate cpuset_present, initialize to all CPUs in system + */ + cpuset_present = CPU_ALLOC((max_cpu_num + 1)); + if (cpuset_present == NULL) + err(3, "CPU_ALLOC"); + system_cpuset_size = CPU_ALLOC_SIZE((max_cpu_num + 1)); + CPU_ZERO_S(system_cpuset_size, cpuset_present); + sum_for_all_proc_cpus(cpu_mark_present, 0); + + /* + * Allocate cpuset_active, initialize = cpuset_present + */ + cpuset_active = CPU_ALLOC((max_cpu_num + 1)); + if (cpuset_active == NULL) + err(3, "CPU_ALLOC"); + CPU_ZERO_S(system_cpuset_size, cpuset_active); + CPU_OR_S(system_cpuset_size, cpuset_active, cpuset_present, cpuset_active); + + /* + * TODO: make this 2 on HT boxes + */ + cpuset_size_floor = 1; + + topology_init_cpu_order(); +} + +static int cpu_record_utilization(UNUSED(index), unsigned int cpu, + UNUSED(a), UNUSED(b), UNUSED(c), unsigned int idle_count) +{ + if (debug > 1) + fprintf(dfp, "cpu%d idle %d\n", cpu, idle_count); + +// if (CPU_ISSET_S(cpu, system_cpuset_size, cpuset_active)) + return idle_count; +// else +// return 0; +} + +static void system_snapshot_utilization(unsigned int index) +{ + + history[index].seconds = timestamp_sec(); + if(debug > 1) + fprintf(dfp, "util timestamp %f\n", history[index].seconds); + + history[index].system_idle_tick_sum = + sum_for_all_proc_cpus(cpu_record_utilization, 0); + if (debug > 2) + debug_dump_history(); +} + +#if NOTYET +static int per_cpu_stub(unsigned int cpu) +{ + fprintf(dfp, "hello world %d\n", cpu); + return 0; +} +#endif +/* + * cpuset_calculate_utilization() + * + * return average_utilization of set during previous interval from 0.0 to 100.0 % + */ +double ema; + +static double 
cpuset_calculate_utilization(unsigned int util_index_prev, unsigned int util_index, unsigned int active_num_cpus) +{ + double idle_ticks; + double ticks_per_interval_per_cpu; + double delta_sec; + double average_utilization; + double total_possible_ticks; + double busy_ticks; + +#if NOYET + double ema_period_sec = 2.0; + double ema_weight; +#endif + + if (util_index == util_index_prev) + return -1; + + delta_sec = history[util_index].seconds - history[util_index_prev].seconds; + debug_sleep_histogram_record(sleep_actual_histogram, delta_sec); + + if (delta_sec <= 0) { + fprintf(dfp, "BUG: delta_sec %f\n", delta_sec); + delta_sec = 0.01; + } + set_size_choosen[active_num_cpus]++; + set_size_seconds[active_num_cpus] += delta_sec; + + if(debug > 1) + fprintf(dfp, "delta_sec %f = [%d] %f - [%d] %f\n", + delta_sec, util_index, history[util_index].seconds, + util_index_prev, history[util_index_prev].seconds); + + ticks_per_interval_per_cpu = user_tick_per_sec * delta_sec; + if(debug > 1) + fprintf(dfp, "ticks_per_interval_per_cpu %f = %ld * %f\n", + ticks_per_interval_per_cpu, user_tick_per_sec, delta_sec); + + idle_ticks = history[util_index].system_idle_tick_sum - + history[util_index_prev].system_idle_tick_sum; + if (debug > 1) + fprintf(dfp, "idle_ticks %f = %lld - %lld\n", + idle_ticks, history[util_index].system_idle_tick_sum, + history[util_index_prev].system_idle_tick_sum); + + total_possible_ticks = ticks_per_interval_per_cpu * present_num_cpus; + busy_ticks = total_possible_ticks - idle_ticks; + + if (debug > 1) + fprintf(dfp, "total_possible_ticks %f busy_ticks %f\n", + total_possible_ticks, busy_ticks); + + + average_utilization = 100 * busy_ticks / ticks_per_interval_per_cpu / active_num_cpus; + + if (average_utilization < 0) + average_utilization = 0; + + +#if NOTYET + /* + * Exponential Moving Average (EMA) + * + * EMA(t) = EMA(t-1) + EMA_WEIGHT * (SAMPLE(t) - EMA(t-1)) + * + * Larger EMA_WEIGHT favors SAMPLE(t) + * at the expense of EMA history, EMA(t-1) 
+ * + * An N-period EMA will forget 86% of history + * after N samples, and EMA_WEIGHT = 1/(N + 1). + * + * We make the policy decision of selecting ema_period_sec = 2.0. + * We use a different EMA_WEIGHT depending on the sample rate. + * + * EMA_WEIGHT(2.0 sec, 100 samples/sec) = 2/(200+1) = 0.00995 + * EMA_WEIGHT(2.0 sec, 10 samples/sec) = 2/(20+1) = 0.0952 + * EMA_WEIGHT(2.0 sec, 1 samples/sec) = 2/(2+1) = 0.666 + * EMA_WEIGHT(2.0 sec, .5 samples/sec) = 2/(1+1) = 1.0 + */ + ema_weight = 2 / (ema_period_sec * 1/delta_sec + 1); + ema = ema + ema_weight * average_utilization - ema_weight * ema; +#endif + + if(debug) { + char cpulist[256]; + + cpulist_create(cpulist, 256, cpuset_active, system_cpuset_size); + + fprintf(dfp, "%d cpus util%%: %.2f%% TGT: %.2f sec: %.4f set: %s\n", + active_num_cpus, average_utilization, + active_num_cpus * average_utilization / 100, + delta_sec, cpulist); + } + + return average_utilization; +} +/* + * cpuset_change_size(target_num, num_active) + * cpu[num_active] is the next available cpu + */ +static void cpuset_change_size(unsigned int target_num, unsigned int num_active) +{ + unsigned int i; + if (debug > 1) + fprintf(dfp, "ACTIVE %d TARGET: %d ", + num_active, target_num); + + if (target_num > num_active) { + if (debug > 1) fprintf(dfp, "ADD:"); + for (i = num_active; i < target_num; ++i) { + CPU_SET_S(cpus[i], system_cpuset_size, cpuset_active); + if(debug > 1) fprintf(dfp, " cpu%d", cpus[i]); + } + if (debug > 1) fprintf(dfp, "\n"); + } + if (target_num < num_active) { + if (debug > 1) fprintf(dfp, "SUB:"); + for (i = num_active - 1; i > target_num - 1; --i) { + CPU_CLR_S(cpus[i], system_cpuset_size, cpuset_active); + if(debug > 1) fprintf(dfp, " cpu%d", cpus[i]); + } + if (debug > 1) fprintf(dfp, "\n"); + } +} +static void do_binding(cpu_set_t *set) +{ + int retval; + char cpulist[256]; + FILE *fp; + + cpulist_create(cpulist, 256, set, system_cpuset_size); + fp = fopen_or_die("balloon/cpuset.cpus", "w"); + retval = 
fputs(cpulist, fp); + if (retval == EOF) + err(1, "failed to write %s to balloon/cpuset.cpus\n", cpulist); + +if (debug) fprintf(dfp, "balloon/cpuset.cpus %s\n", cpulist); + retval = fclose(fp); + if (retval) + err(1, "fcose balloon/cpuset.cpus"); +#if NOT_ANYMORE + { + char command[256]; + snprintf(command, 256, "taskset -cp %s all", cpulist); + retval = system(command); + if (debug > 1) + fprintf(dfp, "\t\t\t\t\tSYSTEM \"%s\" = %d\n", command, retval); + } +#endif +} + +/* + * cpuset_update() + * + * The "ideal" target set size is the size that could retire + * the previous load with all CPUs exactly 100% utilized. + * But the math rarely works out evenly, and we need to + * decided on an integer target number of CPUs in the set. + * + * Say utilization * active_num_cpus = 3.2? + * We can't allocate 0.2 of a CPU... + * + */ +static unsigned int cpuset_update(double avg_util, unsigned int active_num_cpus) +{ + double ideal_target_num_cpus; + double target_num_cpus; + double roundup; + int truncated_int; + + if (avg_util > 95) { + if (get_num_runnable_threads() > active_num_cpus + 2) { + target_num_cpus = active_num_cpus + 1; + goto done; + } + } + ideal_target_num_cpus = (avg_util * active_num_cpus)/100; + truncated_int = (unsigned int)ideal_target_num_cpus; + roundup = ideal_target_num_cpus - truncated_int; + if (roundup > 0.5) + roundup = 1; + else if (roundup > ideal_target_num_cpus/10) + roundup = 1; + + target_num_cpus = ideal_target_num_cpus + roundup; + + if (debug > 1) + fprintf(dfp, "ideal %f = %f * %d; target %d\n", + ideal_target_num_cpus, avg_util, active_num_cpus, + (unsigned int)target_num_cpus); + +#if 0 + ideal_set_size = util_sum / 100.0; + + if (ideal_set_size > (up_threshold * active_num_cpus)) { + /* + * Utilization is above up_threshold + * Increase set size + */ + + delta_set_size = active_num_cpus * up_slope; + if (delta_set_size < 1 ) + delta_set_size == 1; + + target_set_size = active_num_cpus + increase; + } else { + /* + * Utilization is 
below up_threshold + * Possibly decrease set size + */ + delta_set_size = active_num_cpus - ideal_set_size; + delta_set_size *= down_rate; + target_set_size = active_num_cpus - delta_set_size; + } +#endif + +done: + if (target_num_cpus > cpuset_size_ceiling) + target_num_cpus = cpuset_size_ceiling; + else if (target_num_cpus < cpuset_size_floor) + target_num_cpus = cpuset_size_floor; + + if ((unsigned int) target_num_cpus != active_num_cpus) { + cpuset_change_size(target_num_cpus, active_num_cpus); + do_binding(cpuset_active); + } + + return (unsigned int) target_num_cpus; +} +static void main_loop(void) +{ + int i; + unsigned int util_index_prev = 0; + unsigned int util_index = 1; + unsigned int active_num_cpus = present_num_cpus; // TBD + + syslog(LOG_NOTICE, "shepherd started."); + /* + * Initial snapshot + */ + system_snapshot_utilization(util_index_prev); + history[util_index_prev].cpuset_num_cpus = active_num_cpus; + sleep_for_sec(poll_fast_sec); + + /* + * run for(ever) + */ + for (i = 0; ; ++i) { + double avg_util; + + system_snapshot_utilization(util_index); + history[util_index].cpuset_num_cpus = active_num_cpus; + avg_util = cpuset_calculate_utilization(util_index_prev, util_index, active_num_cpus); + + if (exit_main_loop) + break; + + active_num_cpus = cpuset_update(avg_util, active_num_cpus); + + sleep_for_sec(get_next_polling_interval_sec(avg_util, active_num_cpus)); + + util_index_prev = util_index; + util_index += 1; + if (util_index >= HISTORY_LENGTH) + util_index = 0; + + } + syslog(LOG_NOTICE, "shepherd terminated."); +} +pid_t child_pid; + +/* + * nanny - babysit the child + * leave a note for the parrent main loop + * if the child exits + */ +static void nanny(__attribute__((unused))int signal) +{ + if (waitpid(child_pid, NULL, WNOHANG) == child_pid) + exit_main_loop = 1; +} +/* + * fork_it() + */ +static void fork_it(char **argv) +{ + + child_pid = fork(); + + if (!child_pid) { /* child */ + + if (!child_pid) { /* command child */ + 
execvp(argv[0], argv); + err(1, "%s", argv[0]); + } + } else { /* parent */ + signal(SIGCHLD, nanny); + } +} +static void signal_handler_stats(int signal) +{ + if (debug) + fprintf(dfp, "signal %d received\n", signal); + + dump_statistics(); +} + +static void signal_handler_end_it(int signal) +{ + if (debug) + fprintf(dfp, "caught signal %d\n", signal); + exit_main_loop = 1; +} + +static void signals_init(void) +{ + signal(SIGALRM, signal_handler_stats); + signal(SIGUSR1, signal_handler_stats); + signal(SIGINT, signal_handler_end_it); +} + +void copy_file_by_line(char *from_path, char *to_path) +{ + FILE *from_fp, *to_fp; + char line_buf[1024]; + char *s; + int retval; + + from_fp = fopen_or_die(from_path, "r"); + to_fp = fopen_or_die(to_path, "w"); + + while (fgets(line_buf, sizeof(line_buf), from_fp) != NULL) { + + if(debug > 1) fprintf(dfp, "copying: %s", line_buf); + + retval = fputs(line_buf, to_fp); + if (retval == EOF) + err(1, "fputs cpuset.mems"); + + rewind(to_fp); + } + + retval = fclose(from_fp); + if (retval) + err(1, "close %s", from_path); + + retval = fclose(to_fp); + if (retval) + err(1, "close %s", to_path); + + +} + +char cpuset_base[] = "/shepherd"; +char cpuset_balloon[] = "balloon"; + +static void cpuset_init(void) +{ + int retval; + + retval = access(cpuset_base, F_OK); + if (retval) { + if (debug) + fprintf(dfp, "making %s\n", cpuset_base); + retval = mkdir(cpuset_base, 0755); + if (retval) { + if (errno == EEXIST) + fprintf(dfp, "okay, %s is already there\n", cpuset_base); + else + err(1, "could not create '%s'", cpuset_base); + } + } + + retval = mount("dummy", cpuset_base, "cgroup", 0, "cpuset"); + if (retval) { + if (errno != EBUSY) + err(retval, "mount cpuset"); + if (debug) + fprintf(dfp, "cpuset already mounted\n"); + } + + retval = chdir(cpuset_base); + if (retval) + err(retval, "%s", cpuset_base); + + retval = mkdir(cpuset_balloon, 0755); + if (retval) { + if (errno == EEXIST) + fprintf(dfp, "okay, %s is already there\n", 
cpuset_balloon); + else + err(1, "could not create '%s'", cpuset_balloon); + } + + copy_file_by_line("cpuset.mems", "balloon/cpuset.mems"); + + copy_file_by_line("cpuset.cpus", "balloon/cpuset.cpus"); + + copy_file_by_line("tasks", "balloon/tasks"); + +} + +void daemonize(void) +{ + pid_t pid; + int i; + + pid = fork(); + if (pid < 0) + err(1, "fork"); + if (pid > 0) + exit(0); /* parent exits */ + + if (setsid() < 0) + err(1, "setsid"); + signal(SIGCHLD, SIG_IGN); + signal(SIGHUP, SIG_IGN); + + pid = fork(); + if (pid < 0) + err(2, "Fork"); + if (pid > 0) + exit(0); /* parent exits */ + + umask(0); + chdir("/"); + + for (i = sysconf(_SC_OPEN_MAX); i >= 0; i--) + close(i); +} + +void debug_init(void) +{ + if (output_path == NULL) { + dfp = stderr; + + //output_path = "/tmp/shepherd.log"; + } else { + dfp = fopen_or_die(output_path, "w"); + } +} + +int main(int argc, char **argv) +{ + + srandom((unsigned int)time(NULL)); + + defaults_init(); + + cmdline(argc, argv); + + debug_init(); + + if (!debug) + daemonize(); + + openlog("shepherd", LOG_PID, LOG_DAEMON); + + topology_probe(); + + cpuset_init(); + + stats_init(); + + if (argc - optind) + fork_it(argv + optind); + + signals_init(); + main_loop(); + + dump_statistics(); + do_binding(cpuset_present); /* restore system binding */ + return 0; +} + |