author	Len Brown <len.brown@intel.com>	2017-05-26 22:50:54 -0700
committer	Len Brown <len.brown@intel.com>	2017-05-26 22:50:54 -0700
commit	a34611e6b6897001c0ba0187a8737b13e4e7f7f7 (patch)
tree	048291c1cbde932581d6b7cf146499b3ba595c97
parent	a351e9b9fc24e982ec2f0e76379a49826036da12 (diff)
shepherd: Scheduler Helper Daemon
v1 - as shown at OSTS

Signed-off-by: Len Brown <len.brown@intel.com>
-rw-r--r--  tools/power/shepherd/Makefile      6
-rw-r--r--  tools/power/shepherd/cpuset.c    413
-rw-r--r--  tools/power/shepherd/shepherd.1   61
-rw-r--r--  tools/power/shepherd/shepherd.c 1201
4 files changed, 1681 insertions(+), 0 deletions(-)
diff --git a/tools/power/shepherd/Makefile b/tools/power/shepherd/Makefile
new file mode 100644
index 00000000000000..6a74f892b44e71
--- /dev/null
+++ b/tools/power/shepherd/Makefile
@@ -0,0 +1,6 @@
+
+shepherd: shepherd.c
+ cc -o shepherd shepherd.c
+# cc -o shepherd shepherd.c -lcpuset
+clean:
+ rm -f shepherd
diff --git a/tools/power/shepherd/cpuset.c b/tools/power/shepherd/cpuset.c
new file mode 100644
index 00000000000000..f875949c12bf78
--- /dev/null
+++ b/tools/power/shepherd/cpuset.c
@@ -0,0 +1,413 @@
+/*
+ * Terminology:
+ *
+ * cpuset - (libc) cpu_set_t data structure represents set of CPUs
+ * cpumask - string with hex mask (e.g. "0x00000001")
+ * cpulist - string with CPU ranges (e.g. "0-3,5,7,8")
+ *
+ * Based on code from taskset.c and Linux kernel.
+ *
+ * This file may be redistributed under the terms of the
+ * GNU Lesser General Public License.
+ *
+ * Copyright (C) 2010 Karel Zak <kzak@redhat.com>
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sched.h>
+#include <errno.h>
+#include <string.h>
+#include <ctype.h>
+#include <sys/syscall.h>
+
+#include "cpuset.h"
+#define cpuset_nbits(setsize) (8 * (setsize))
+
+#if 0
+#include "c.h"
+
+static inline int val_to_char(int v)
+{
+ if (v >= 0 && v < 10)
+ return '0' + v;
+ else if (v >= 10 && v < 16)
+ return ('a' - 10) + v;
+ else
+ return -1;
+}
+
+static inline int char_to_val(int c)
+{
+ int cl;
+
+ cl = tolower(c);
+ if (c >= '0' && c <= '9')
+ return c - '0';
+ else if (cl >= 'a' && cl <= 'f')
+ return cl + (10 - 'a');
+ else
+ return -1;
+}
+
+#endif
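+/*
+ * Return a pointer just past the next 'sep' in 'q', or NULL if none;
+ * e.g. nexttoken("0-3,5", ',') yields "5".
+ */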
+static const char *nexttoken(const char *q, int sep)
+{
+ if (q)
+ q = strchr(q, sep);
+ if (q)
+ q++;
+ return q;
+}
+
+#if 0
+/*
+ * Number of bits in a CPU bitmask on current system
+ */
+int get_max_number_of_cpus(void)
+{
+#ifdef SYS_sched_getaffinity
+ int n, cpus = 2048;
+ size_t setsize;
+ cpu_set_t *set = cpuset_alloc(cpus, &setsize, NULL);
+
+ if (!set)
+ return -1; /* error */
+
+ for (;;) {
+ CPU_ZERO_S(setsize, set);
+
+ /* the library version does not return size of cpumask_t */
+ n = syscall(SYS_sched_getaffinity, 0, setsize, set);
+
+ if (n < 0 && errno == EINVAL && cpus < 1024 * 1024) {
+ cpuset_free(set);
+ cpus *= 2;
+ set = cpuset_alloc(cpus, &setsize, NULL);
+ if (!set)
+ return -1; /* error */
+ continue;
+ }
+ cpuset_free(set);
+ return n * 8;
+ }
+#endif
+ return -1;
+}
+
+/*
+ * Allocates a new set for ncpus and returns size in bytes and size in bits
+ */
+cpu_set_t *cpuset_alloc(int ncpus, size_t *setsize, size_t *nbits)
+{
+ cpu_set_t *set = CPU_ALLOC(ncpus);
+
+ if (!set)
+ return NULL;
+ if (setsize)
+ *setsize = CPU_ALLOC_SIZE(ncpus);
+ if (nbits)
+ *nbits = cpuset_nbits(CPU_ALLOC_SIZE(ncpus));
+ return set;
+}
+
+void cpuset_free(cpu_set_t *set)
+{
+ CPU_FREE(set);
+}
+
+#if !HAVE_DECL_CPU_ALLOC
+/* Please, use CPU_COUNT_S() macro. This is fallback */
+int __cpuset_count_s(size_t setsize, const cpu_set_t *set)
+{
+ int s = 0;
+ const __cpu_mask *p = set->__bits;
+ const __cpu_mask *end = &set->__bits[setsize / sizeof (__cpu_mask)];
+
+ while (p < end) {
+ __cpu_mask l = *p++;
+
+ if (l == 0)
+ continue;
+# if LONG_BIT > 32
+ l = (l & 0x5555555555555555ul) + ((l >> 1) & 0x5555555555555555ul);
+ l = (l & 0x3333333333333333ul) + ((l >> 2) & 0x3333333333333333ul);
+ l = (l & 0x0f0f0f0f0f0f0f0ful) + ((l >> 4) & 0x0f0f0f0f0f0f0f0ful);
+ l = (l & 0x00ff00ff00ff00fful) + ((l >> 8) & 0x00ff00ff00ff00fful);
+ l = (l & 0x0000ffff0000fffful) + ((l >> 16) & 0x0000ffff0000fffful);
+ l = (l & 0x00000000fffffffful) + ((l >> 32) & 0x00000000fffffffful);
+# else
+ l = (l & 0x55555555ul) + ((l >> 1) & 0x55555555ul);
+ l = (l & 0x33333333ul) + ((l >> 2) & 0x33333333ul);
+ l = (l & 0x0f0f0f0ful) + ((l >> 4) & 0x0f0f0f0ful);
+ l = (l & 0x00ff00fful) + ((l >> 8) & 0x00ff00fful);
+ l = (l & 0x0000fffful) + ((l >> 16) & 0x0000fffful);
+# endif
+ s += l;
+ }
+ return s;
+}
+#endif
+
+#endif /* 0 */
+/*
+ * Returns human readable representation of the cpuset. The output format is
+ * a list of CPUs with ranges (for example, "0,1,3-9").
+ */
+char *cpulist_create(char *str, size_t len,
+ cpu_set_t *set, size_t setsize)
+{
+ size_t i;
+ char *ptr = str;
+ int entry_made = 0;
+ size_t max = cpuset_nbits(setsize);
+
+ for (i = 0; i < max; i++) {
+ if (CPU_ISSET_S(i, setsize, set)) {
+ int rlen;
+ size_t j, run = 0;
+ entry_made = 1;
+ for (j = i + 1; j < max; j++) {
+ if (CPU_ISSET_S(j, setsize, set))
+ run++;
+ else
+ break;
+ }
+ if (!run)
+ rlen = snprintf(ptr, len, "%zu,", i);
+ else if (run == 1) {
+ rlen = snprintf(ptr, len, "%zu,%zu,", i, i + 1);
+ i++;
+ } else {
+ rlen = snprintf(ptr, len, "%zu-%zu,", i, i + run);
+ i += run;
+ }
+ if (rlen < 0 || (size_t) rlen + 1 > len)
+ return NULL;
+ ptr += rlen;
+ if (rlen > 0 && len > (size_t) rlen)
+ len -= rlen;
+ else
+ len = 0;
+ }
+ }
+ ptr -= entry_made;
+ *ptr = '\0';
+
+ return str;
+}
+
+#if 0
+/*
+ * Returns string with CPU mask.
+ */
+char *cpumask_create(char *str, size_t len,
+ cpu_set_t *set, size_t setsize)
+{
+ char *ptr = str;
+ char *ret = NULL;
+ int cpu;
+
+ for (cpu = cpuset_nbits(setsize) - 4; cpu >= 0; cpu -= 4) {
+ char val = 0;
+
+ if (len == (size_t) (ptr - str))
+ break;
+
+ if (CPU_ISSET_S(cpu, setsize, set))
+ val |= 1;
+ if (CPU_ISSET_S(cpu + 1, setsize, set))
+ val |= 2;
+ if (CPU_ISSET_S(cpu + 2, setsize, set))
+ val |= 4;
+ if (CPU_ISSET_S(cpu + 3, setsize, set))
+ val |= 8;
+
+ if (!ret && val)
+ ret = ptr;
+ *ptr++ = val_to_char(val);
+ }
+ *ptr = '\0';
+ return ret ? ret : ptr - 1;
+}
+
+/*
+ * Parses string with CPUs mask.
+ */
+int cpumask_parse(const char *str, cpu_set_t *set, size_t setsize)
+{
+ int len = strlen(str);
+ const char *ptr = str + len - 1;
+ int cpu = 0;
+
+ /* skip 0x, it's all hex anyway */
+ if (len > 1 && !memcmp(str, "0x", 2L))
+ str += 2;
+
+ CPU_ZERO_S(setsize, set);
+
+ while (ptr >= str) {
+ char val;
+
+ /* cpu masks in /sys uses comma as a separator */
+ if (*ptr == ',')
+ ptr--;
+
+ val = char_to_val(*ptr);
+ if (val == (char) -1)
+ return -1;
+ if (val & 1)
+ CPU_SET_S(cpu, setsize, set);
+ if (val & 2)
+ CPU_SET_S(cpu + 1, setsize, set);
+ if (val & 4)
+ CPU_SET_S(cpu + 2, setsize, set);
+ if (val & 8)
+ CPU_SET_S(cpu + 3, setsize, set);
+ len--;
+ ptr--;
+ cpu += 4;
+ }
+
+ return 0;
+}
+
+#endif /* 0 */
+/*
+ * Parses string with list of CPU ranges.
+ * Returns 0 on success.
+ * Returns 1 on error.
+ * Returns 2 if 'fail' is set and a cpu number in the list doesn't fit
+ * into the cpu_set. If 'fail' is not set, cpu numbers that do not fit
+ * are ignored and 0 is returned instead.
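+ *
+ * The sysfs stride syntax "a-b:s" is also accepted;
+ * e.g. "0-7:2" selects cpus 0, 2, 4 and 6.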
+ */
+int cpulist_parse(const char *str, cpu_set_t *set, size_t setsize, int fail)
+{
+ size_t max = cpuset_nbits(setsize);
+ const char *p, *q;
+ int r = 0;
+
+ q = str;
+ CPU_ZERO_S(setsize, set);
+
+ while (p = q, q = nexttoken(q, ','), p) {
+ unsigned int a; /* beginning of range */
+ unsigned int b; /* end of range */
+ unsigned int s; /* stride */
+ const char *c1, *c2;
+ char c;
+
+ if ((r = sscanf(p, "%u%c", &a, &c)) < 1)
+ return 1;
+ b = a;
+ s = 1;
+
+ c1 = nexttoken(p, '-');
+ c2 = nexttoken(p, ',');
+ if (c1 != NULL && (c2 == NULL || c1 < c2)) {
+ if ((r = sscanf(c1, "%u%c", &b, &c)) < 1)
+ return 1;
+ c1 = nexttoken(c1, ':');
+ if (c1 != NULL && (c2 == NULL || c1 < c2)) {
+ if ((r = sscanf(c1, "%u%c", &s, &c)) < 1)
+ return 1;
+ if (s == 0)
+ return 1;
+ }
+ }
+
+ if (!(a <= b))
+ return 1;
+ while (a <= b) {
+ if (fail && (a >= max))
+ return 2;
+ CPU_SET_S(a, setsize, set);
+ a += s;
+ }
+ }
+
+ if (r == 2)
+ return 1;
+ return 0;
+}
+
+#if 0
+#ifdef TEST_PROGRAM
+
+#include <getopt.h>
+
+int main(int argc, char *argv[])
+{
+ cpu_set_t *set;
+ size_t setsize, buflen, nbits;
+ char *buf, *mask = NULL, *range = NULL;
+ int ncpus = 2048, rc, c;
+
+ static const struct option longopts[] = {
+ { "ncpus", 1, 0, 'n' },
+ { "mask", 1, 0, 'm' },
+ { "range", 1, 0, 'r' },
+ { NULL, 0, 0, 0 }
+ };
+
+ while ((c = getopt_long(argc, argv, "n:m:r:", longopts, NULL)) != -1) {
+ switch(c) {
+ case 'n':
+ ncpus = atoi(optarg);
+ break;
+ case 'm':
+ mask = strdup(optarg);
+ break;
+ case 'r':
+ range = strdup(optarg);
+ break;
+ default:
+ goto usage_err;
+ }
+ }
+
+ if (!mask && !range)
+ goto usage_err;
+
+ set = cpuset_alloc(ncpus, &setsize, &nbits);
+ if (!set)
+ err(EXIT_FAILURE, "failed to allocate cpu set");
+
+ /*
+ fprintf(stderr, "ncpus: %d, cpuset bits: %zd, cpuset bytes: %zd\n",
+ ncpus, nbits, setsize);
+ */
+
+ buflen = 7 * nbits;
+ buf = malloc(buflen);
+ if (!buf)
+ err(EXIT_FAILURE, "failed to allocate cpu set buffer");
+
+ if (mask)
+ rc = cpumask_parse(mask, set, setsize);
+ else
+ rc = cpulist_parse(range, set, setsize, 0);
+
+ if (rc)
+ errx(EXIT_FAILURE, "failed to parse string: %s", mask ? : range);
+
+ printf("%-15s = %15s ", mask ? : range,
+ cpumask_create(buf, buflen, set, setsize));
+ printf("[%s]\n", cpulist_create(buf, buflen, set, setsize));
+
+ free(buf);
+ free(mask);
+ free(range);
+ cpuset_free(set);
+
+ return EXIT_SUCCESS;
+
+usage_err:
+ fprintf(stderr,
+ "usage: %s [--ncpus <num>] --mask <mask> | --range <list>",
+ program_invocation_short_name);
+ exit(EXIT_FAILURE);
+}
+#endif
+#endif /* 0 */
diff --git a/tools/power/shepherd/shepherd.1 b/tools/power/shepherd/shepherd.1
new file mode 100644
index 00000000000000..f4591d9688cfd0
--- /dev/null
+++ b/tools/power/shepherd/shepherd.1
@@ -0,0 +1,61 @@
+.\" shepherd(1) manpage
+.\"
+.\" Copyright (C) 2017 Len Brown
+.\"
+.\" Released under GPLv2
+.\"
.TH SHEPHERD 1 "May 2017" "linux" "User Commands"
+.SH NAME
+shepherd \- ScHEduler helPER Daemon
+.SH SYNOPSIS
+.B shepherd
+[options]
+.IR command\ [ argument ...]
+.br
+.B shepherd
+[options]
+.SH DESCRIPTION
+.PP
+.B shepherd
implements system-wide power-aware scheduling from a user-space daemon,
using thread affinity.
+.SH OPTIONS
.TP
.BR \-d ,\ \-\-debug
Enable verbose debugging output; repeat for even more verbosity.
Also keeps shepherd in the foreground instead of daemonizing.
.TP
.BR \-o ,\ \-\-output\ \fIfile\fR
Write debugging output to
.I file
instead of standard error.
.TP
.BR \-h ,\ \-\-help
Display version and help text and exit.
+.SH USAGE
.PP
shepherd forks into a daemon, unless invoked with \-\-debug.
.PP
shepherd can also launch a command, and will exit when
that command exits:
.PP
.B shepherd
.IR command\ [ arguments ]
+.SH PERMISSIONS
+shepherd must run as root.
+.SH EXIT VALUE
+0 for success, non-zero for failure.
+.SH SEE ALSO
+.BR taskset (1),
+.BR sched_setaffinity (2),
+.BR sched_getaffinity (2)
+.SH AUTHOR
+Written by Len Brown
+.SH COPYRIGHT
+Copyright \(co 2017 Len Brown, Intel Corp.
+This is free software; see the source for copying conditions. There is NO
+warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+.SH AVAILABILITY
The shepherd command lives in the Linux kernel source tree under
tools/power/shepherd.
diff --git a/tools/power/shepherd/shepherd.c b/tools/power/shepherd/shepherd.c
new file mode 100644
index 00000000000000..cdf199ec188281
--- /dev/null
+++ b/tools/power/shepherd/shepherd.c
@@ -0,0 +1,1201 @@
+/*
+ * shepherd.c
+ * Scheduler Helper Daemon
+ *
+ * see shepherd.1 for details
+ *
+ * This program is free software, released under GPLv2
+ * Copyright (C) 2016 Len Brown, Intel Corp <len.brown@intel.com>
+ */
+
+#define _GNU_SOURCE /* See feature_test_macros(7) */
+#include <sched.h>
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <errno.h>
+#include <signal.h>
+#include <sched.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <time.h>
+#include <err.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <syslog.h>
+
+#include "cpuset.h"
+
+#include "cpuset.c"
+
+#define UNUSED(x) unsigned int UNUSED_ ## x __attribute__((__unused__))
+
+unsigned int debug;
+
+unsigned int max_cpu_num;
+unsigned int present_num_cpus;
+
+cpu_set_t *cpuset_present;
+cpu_set_t *cpuset_active;
+size_t system_cpuset_size;
+
+char *proc_stat = "/proc/stat";
+char *proc_loadavg = "/proc/loadavg";
+
+long user_tick_per_sec;
+double poll_slow_sec;
+double poll_norm_sec;
+double poll_fast_sec;
+
+int exit_main_loop;
+FILE *dfp;
+char *output_path;
+
+/*
+ * cpuset_size_floor
+ * the CPU set will not shrink below cpuset_size_floor
+ */
+int cpuset_size_floor = 1;
+
+/*
+ * cpuset_size_ceiling
+ * the CPU set will not grow larger than cpuset_size_ceiling
+ */
+int cpuset_size_ceiling;
+
+#define HISTORY_LENGTH 10
+
+struct cpuset_history {
+ double seconds;
+ unsigned cpuset_num_cpus;
+ unsigned long long system_idle_tick_sum;
+} history[HISTORY_LENGTH];
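+
+/*
+ * history[] is a ring buffer: main_loop() advances util_index modulo
+ * HISTORY_LENGTH, comparing each new sample against the previous one.
+ */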
+
+unsigned int *set_size_chosen;
+double *set_size_seconds;
+
+static void debug_dump_history(void)
+{
+ int i;
+
+ for (i = 0; i < HISTORY_LENGTH; ++i)
+ {
+ fprintf(dfp, "history[%d] %d cpus %f sec %lld sum\n",
+ i,
+ history[i].cpuset_num_cpus,
+ history[i].seconds,
+ history[i].system_idle_tick_sum);
+ }
+}
+#ifdef NOTYET
+/*
+ * up_slope_max -- how fast a set can grow per interval
+ * 1.00 means it can increase by up to 100% of its current size
+ * 0.50 means it can increase by up to 50% of its current size
+ * etc.
+ */
+double up_slope_max = 0.50;
+#endif
+
+#ifdef NOTYET
+/*
+ * up_cpus_max
+ * maximum number of CPUs to increase set per interval
+ * default = cpuset_size_ceiling/4;
+ */
+int up_cpus_max;
+#endif
+
+#ifdef NOTYET
+/*
+ * down_rate
+ * 0.50 means shrink at 50% of the way to ideal_set size on each interval
+ */
+double down_rate = 0.5;
+#endif
+
+
+/*
+ * get_next_polling_interval_sec()
+ *
+ * Policy decision:
+ * return seconds for next polling interval
+ */
+static double get_next_polling_interval_sec(double average_utilization, int set_size)
+{
+
+ /*
+ * If max set size, then poll normally,
+ * no matter the utilization. We can't grow more,
+ * and we are not in a big rush to shrink.
+ */
+ if (set_size == cpuset_size_ceiling)
+ return poll_norm_sec;
+
+ /*
+ * If min set and very low utilization, poll slowly,
+ * to minimally disturb profoundly idle system.
+ */
+ if ((set_size == cpuset_size_floor) && (average_utilization < 5))
+ return poll_slow_sec;
+
+ /*
+ * If high utilization, poll quickly
+ * to promptly expand capacity with demand
+ */
+ if (average_utilization > 80)
+ return poll_fast_sec;
+
+ /*
+ * poll at moderate rate
+ */
+ return poll_norm_sec;
+}
+
+#ifdef NOTYET
+/*
+ * Timer slack gives the kernel permission to coalesce our timeout
+ * with nearby system timers. System default is +/- 0.050 ms,
+ * which is much tighter than we need. Increase timer slack
+ * to 1/16th of the requested timeout duration, eg.
+ *
+ * 10 ms / 16 = +/- 0.625 ms
+ * 100 ms / 16 = +/- 6.25 ms
+ * 1,000 ms / 16 = +/- 62.5 ms
+ * 2,000 ms / 16 = +/- 125.0 ms
+ */
+
+unsigned int debug_timer_slack_updates;
+
+unsigned long timer_slack_current_ns;
+static void timer_slack_check(double sleep_sec)
+{
+ unsigned long timer_slack_desired_ns;
+
+ timer_slack_desired_ns = (unsigned long)(sleep_sec / 16 * 1000000000);
+
+ if (timer_slack_desired_ns == timer_slack_current_ns)
+ return;
+
+ debug_timer_slack_updates++;
+
+ if (prctl(PR_SET_TIMERSLACK, timer_slack_desired_ns) < 0)
+ err(-1, "PR_SET_TIMERSLACK");
+
+ timer_slack_current_ns = timer_slack_desired_ns;
+}
+#endif
+
+static double timestamp_sec(void)
+{
+ struct timespec ts;
+
+ if (clock_gettime(CLOCK_REALTIME, &ts) < 0)
+ err(-1, "clock_gettime");
+
+ return ((double)ts.tv_sec + (double)ts.tv_nsec/1000000000);
+}
+
+struct timespec ts_begin, ts_end;
+
+#define DEBUG_SLEEP_ARRAY_SIZE 12
+
+unsigned int sleep_request_histogram[DEBUG_SLEEP_ARRAY_SIZE];
+unsigned int sleep_actual_histogram[DEBUG_SLEEP_ARRAY_SIZE];
+
+unsigned int debug_sleep_count;
+double debug_sleep_stats_start_time;
+
+static void debug_sleep_histogram_record(unsigned int *a, double interval)
+{
+ int i;
+
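+ /*
+ * Bucket 0 counts intervals >= 2.0 sec and bucket i counts
+ * intervals >= 2.0/2^i sec; intervals shorter than the last
+ * bucket are silently dropped.
+ */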
+ for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
+ if (interval >= 2.0) {
+ a[i]++;
+ return;
+ }
+ interval *= 2;
+ }
+}
+static void debug_sleep_histograms_clear(void)
+{
+ int i;
+
+ for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
+ sleep_request_histogram[i] = 0;
+ sleep_actual_histogram[i] = 0;
+ }
+
+ debug_sleep_stats_start_time = timestamp_sec();
+}
+static void debug_sleep_histograms_dump(void)
+{
+ int i;
+ double seconds = 2.0;
+ unsigned int request_count, actual_count;
+ double now, interval;
+
+ request_count = actual_count = 0;
+
+ fprintf(dfp, "Sleep Wake Seconds\n");
+ for (i = 0; i < DEBUG_SLEEP_ARRAY_SIZE; ++i) {
+ request_count += sleep_request_histogram[i];
+ actual_count += sleep_actual_histogram[i];
+ fprintf(dfp, "%5d %4d >= %f\n",
+ sleep_request_histogram[i],
+ sleep_actual_histogram[i], seconds);
+ seconds /= 2;
+ }
+ now = timestamp_sec();
+ interval = now - debug_sleep_stats_start_time;
+ fprintf(dfp, "%5d %4d Total, %f per second over %.2f seconds\n",
+ request_count, actual_count,
+ (double)request_count/(interval), interval);
+}
+
+static void stats_clear(void)
+{
+ unsigned int cpus;
+
+ for (cpus = 0; cpus <= present_num_cpus; ++cpus) {
+ set_size_chosen[cpus] = 0;
+ set_size_seconds[cpus] = 0.0;
+ }
+ debug_sleep_histograms_clear();
+}
+static void stats_init(void)
+{
+ set_size_chosen = calloc(present_num_cpus + 1, sizeof(unsigned int));
+ if (set_size_chosen == NULL)
+ err(1, "calloc(%d, %zd)", present_num_cpus + 1, sizeof(unsigned int));
+
+ set_size_seconds = calloc(present_num_cpus + 1, sizeof(double));
+ if (set_size_seconds == NULL)
+ err(1, "calloc(%d, %zd)", present_num_cpus + 1, sizeof(double));
+
+ stats_clear();
+}
+
+static void stats_dump(void)
+{
+ unsigned int cpus;
+
+ fprintf(dfp, "CPUs Used Duration[sec]\n");
+
+ for (cpus = 1; cpus <= present_num_cpus; ++cpus) {
+ fprintf(dfp, "%4d %4d %8.2f\n",
+ cpus, set_size_chosen[cpus], set_size_seconds[cpus]);
+ }
+}
+
+static void dump_statistics(void)
+{
+ stats_dump();
+ debug_sleep_histograms_dump();
+
+#if NOTYET
+ fprintf(dfp, "%u%% timer slack updates (%u)\n",
+ 100 * debug_timer_slack_updates/debug_sleep_count, debug_timer_slack_updates);
+
+ debug_timer_slack_updates = 0;
+ debug_sleep_count = 0;
+#endif
+ stats_clear(); /* also clears the sleep histograms */
+}
+
+/*
+ * sleep_for_sec(seconds)
+ *
+ * Sleep for 'seconds' seconds.
+ * First allow adjustment to timer_slack.
+ */
+
+static void sleep_for_sec(double seconds)
+{
+ struct timespec ts;
+
+ debug_sleep_count++;
+ debug_sleep_histogram_record(sleep_request_histogram, seconds);
+
+#if NOTYET
+ timer_slack_check(seconds);
+#endif
+
+ ts.tv_sec = (time_t)seconds;
+
+ if (debug > 1)
+ fprintf(dfp, "input seconds %f -> tv seconds %lld\n",
+ seconds, (long long int)ts.tv_sec);
+
+ seconds = seconds - ts.tv_sec;
+ ts.tv_nsec = (long)(seconds * 1000000000.0);
+
+ if (debug > 1)
+ fprintf(dfp, "input seconds * 1B %f -> tv nsec %ld\n",
+ seconds * 1000000000, ts.tv_nsec);
+
+ if (nanosleep(&ts, NULL) < 0) {
+ if (errno == EINTR) {
+ /* early return if signaled */
+ return;
+ }
+ err(-1, "nanosleep");
+ }
+}
+static void __attribute__((__noreturn__)) usage(FILE *out)
+{
+ fprintf(out, "shepherd 17.05.21a (C) 2017 Len Brown <len.brown@intel.com>\n");
+ fprintf(out, "Usage: shepherd [options]\n");
+ fprintf(out, "'man shepherd' for details\n");
+
+ exit(out == stderr ? EXIT_FAILURE: EXIT_SUCCESS);
+}
+
+static void cmdline(int argc, char **argv)
+{
+ int c;
+
+ static const struct option longopts[] = {
+ { "debug", 0, NULL, 'd' },
+ { "help", 0, NULL, 'h' },
+ { "output", 0, NULL, 'o' },
+ };
+
+ while ((c = getopt_long_only(argc, argv, "+dho:", longopts, NULL)) != -1) {
+ switch (c) {
+ case 'd':
+ debug++;
+ break;
+ case 'h':
+ usage(stdout);
+ break;
+ case 'o':
+ output_path = optarg;
+ break;
+ default:
+ usage(stderr);
+ break;
+ }
+ }
+}
+
+static void init_default_sample_rates(void)
+{
+
+ double user_tick_sec;
+
+ user_tick_per_sec = sysconf(_SC_CLK_TCK);
+
+ if (user_tick_per_sec != 100)
+ warnx("User ticks %ld, expected 100", user_tick_per_sec);
+
+ user_tick_sec = 1.0 / user_tick_per_sec;
+
+ /*
+ * Nominal wakeup frequency is once per second
+ * 1.0 sec/sample = 1 sample/sec
+ */
+ poll_norm_sec = 1;
+
+ /*
+ * Slow wakeup frequency is used during profound idle
+ * and at max utilization of full set size
+ * 2.0 sec/sample = 0.5 sample/sec
+ */
+ poll_slow_sec = 2;
+
+ /*
+ * Fast wakeup frequency is used at high utilization.
+ * Sample at 10 * the user tick rate, which means
+ * we will see at most 10 counter ticks/cpu/sample.
+ * Since user_tick_sec = 10ms,
+ * sample rate = 10 * 10ms = 100ms = 10 samples/sec
+ */
+ poll_fast_sec = user_tick_sec * 10;
+
+}
+
+static void defaults_init(void)
+{
+ init_default_sample_rates();
+
+ // set_size_ceiling = TBD;
+
+}
+/*
+ * Open a file, and exit on failure
+ */
+static FILE *fopen_or_die(const char *path, const char *mode)
+{
+ FILE *filep = fopen(path, mode);
+ if (!filep)
+ err(1, "%s: open failed", path);
+ return filep;
+}
+/*
+ * Run func(cpu) on every cpu in /proc/stat.
+ * Return the sum of the return values of func(cpu),
+ * unless func() returns a negative value, in which case return that.
+ */
+static unsigned long long sum_for_all_proc_cpus(int (func)(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int), unsigned int arg)
+{
+ FILE *fp;
+ unsigned int cpu_num;
+ int user, nice, system, idle;
+ int retval;
+ unsigned long long sum = 0;
+
+ fp = fopen_or_die(proc_stat, "r");
+
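+ /*
+ * /proc/stat begins with an aggregate "cpu" line, followed by one
+ * "cpuN" line per cpu: user, nice, system, idle, iowait, irq,
+ * softirq, steal, guest, guest_nice tick counts.
+ */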
+ retval = fscanf(fp, "cpu %*d %*d %*d %*d %*d %*d %*d %*d %*d %*d\n");
+ if (retval != 0)
+ err(1, "%s: failed to parse format", proc_stat);
+
+ while (1) {
+ retval = fscanf(fp, "cpu%u %d %d %d %d %*d %*d %*d %*d %*d %*d\n",
+ &cpu_num, &user, &nice, &system, &idle);
+ if (retval != 5)
+ break;
+
+ retval = func(arg, cpu_num, user, nice, system, idle);
+ if (retval < 0) {
+ fclose(fp);
+ return(retval);
+ }
+ sum += retval;
+ }
+ fclose(fp);
+ return sum;
+}
+
+static unsigned int get_num_runnable_threads(void)
+{
+ FILE *fp;
+ int retval;
+ unsigned int num_threads;
+
+ fp = fopen_or_die(proc_loadavg, "r");
+
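+ /*
+ * /proc/loadavg looks like "0.20 0.18 0.12 1/807 11206";
+ * the number before the '/' counts runnable threads.
+ */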
+ retval = fscanf(fp, "%*f %*f %*f %ud/%*d %*d\n", &num_threads);
+ if (retval != 1)
+ err(1, "%s", proc_stat);
+
+ if (debug > 2)
+ fprintf(dfp, "%d threads\n", num_threads);
+ fclose(fp);
+
+ return num_threads;
+}
+#if NOTYET
+static int for_all_cpuset_cpus(cpu_set_t *set, int (func)(unsigned int))
+{
+ unsigned int cpu;
+ int retval;
+
+ for (cpu = 0; cpu <= max_cpu_num; ++cpu) {
+ if (!CPU_ISSET_S(cpu, system_cpuset_size, set))
+ continue;
+ retval = func(cpu);
+ if (retval)
+ return retval;
+ }
+ return 0;
+}
+#endif
+/*
+ * cpu_count()
+ * update global max_cpu_num, return 1 (per cpu)
+ */
+static int cpu_count(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
+{
+ if (cpu > max_cpu_num)
+ max_cpu_num = cpu;
+
+ if (debug > 1)
+ fprintf(dfp, "cpu%d: %d\n", cpu, present_num_cpus);
+ return 1;
+}
+
+static int cpu_mark_present(UNUSED(arg), unsigned int cpu, UNUSED(a), UNUSED(b), UNUSED(c), UNUSED(d))
+{
+ CPU_SET_S(cpu, system_cpuset_size, cpuset_present);
+ return 0;
+}
+
+/*
+ * for given CPU, starting with INDEX 0
+ * Find first /sys/devices/system/cpu/$CPU/cache/$INDEX/shared_cpu_list
+ * and return it in cpuset_cache_topology arg
+ *
+ * return 0 on success, non-0 on failure
+ */
+static int get_cache_shared_cpu_list(unsigned int cpu_num, cpu_set_t *cpuset_cache_topology)
+{
+ FILE *fp;
+ int index;
+ char buf[256];
+
+ int retval;
+
+ for (index = 0; index < 10; ++index) {
+ char *s;
+
+ sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d", cpu_num, index);
+ if (access(buf, R_OK) != 0) {
+ if (debug)
+ perror(buf);
+ return -1;
+ }
+ sprintf(buf, "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_list", cpu_num, index);
+ fp = fopen(buf, "r");
+ if (fp == NULL) {
+ if (debug) perror(buf);
+ continue;
+ }
+
+ if (fgets(buf, sizeof(buf), fp) == NULL)
+ err(1, "%s", buf);
+
+ s = strchr(buf, '\n');
+ if (s)
+ *s = '\0';
+
+ if (debug > 1)
+ fprintf(dfp, "get_cache_shared_cpu_list(%d): read '%s'\n", cpu_num, buf);
+ retval = cpulist_parse(buf, cpuset_cache_topology, system_cpuset_size, 1);
+ fclose(fp);
+ return retval;
+ }
+ return -1;
+}
+/*
+ * cpus[i] gives the order in which CPUs are added to the set.
+ * This mapping is used to allocate cpus to the set in "topology order".
+ * Even a linear mapping needs it, to cope with sparse CPU numbering.
+ */
+unsigned int *cpus;
+
+static void topology_init_cpu_order(void)
+{
+ int index;
+ unsigned int cpu_num;
+ cpu_set_t *cpuset_cache_topology, *cpuset_unprocessed;
+
+ cpuset_cache_topology = CPU_ALLOC((max_cpu_num + 1));
+ if (cpuset_cache_topology == NULL)
+ err(3, "CPU_ALLOC");
+
+ cpuset_unprocessed = CPU_ALLOC((max_cpu_num + 1));
+ if (cpuset_unprocessed == NULL)
+ err(3, "CPU_ALLOC");
+
+ CPU_ZERO_S(system_cpuset_size, cpuset_unprocessed);
+
+ CPU_OR_S(system_cpuset_size, cpuset_unprocessed, cpuset_unprocessed, cpuset_active);
+
+ for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) {
+ unsigned int cache_cpu_num;
+
+ if (!CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_unprocessed))
+ continue;
+
+ CPU_ZERO_S(system_cpuset_size, cpuset_cache_topology);
+ if (get_cache_shared_cpu_list(cpu_num, cpuset_cache_topology)) {
+ if (debug) fprintf(dfp, "BAILING to LINEAR\n");
+ break;
+ }
+ for (cache_cpu_num = 0; cache_cpu_num <= max_cpu_num; ++cache_cpu_num)
+ {
+ if (!CPU_ISSET_S(cache_cpu_num, system_cpuset_size, cpuset_cache_topology))
+ continue;
+ cpus[index] = cache_cpu_num;
+ if (debug)
+ fprintf(dfp, "%d: cpus[%d] = cpu%d\n", cpu_num, index, cache_cpu_num);
+ CPU_CLR_S(cache_cpu_num, system_cpuset_size, cpuset_unprocessed);
+ index++;
+ }
+ }
+
+ CPU_FREE(cpuset_cache_topology);
+ CPU_FREE(cpuset_unprocessed);
+
+ if (cpu_num == max_cpu_num + 1)
+ return;
+
+ fprintf(dfp, "BACKUP plan: linear %d\n", cpu_num);
+ /* linear */
+ for (index = 0, cpu_num = 0; cpu_num <= max_cpu_num; ++cpu_num) {
+ if (CPU_ISSET_S(cpu_num, system_cpuset_size, cpuset_active)) {
+ cpus[index] = cpu_num;
+ if (debug > 1)
+ fprintf(dfp, "cpus[%d] = cpu%d\n", index, cpu_num);
+ index++;
+ }
+ }
+}
+
+static void topology_probe(void)
+{
+ present_num_cpus = 0;
+ max_cpu_num = 0;
+ present_num_cpus = sum_for_all_proc_cpus(cpu_count, 0);
+ cpuset_size_ceiling = present_num_cpus;
+
+ if (debug > 1)
+ fprintf(dfp, "%d cpus, cpu%d is highest numbered\n",
+ present_num_cpus, max_cpu_num);
+
+ /*
+ * Allocate the ordered CPU topology list,
+ * indexed by size of the active set;
+ * each entry contains the cpu# that is added last
+ * to complete a set of that size.
+ */
+ cpus = calloc(present_num_cpus, sizeof(unsigned int));
+ if (cpus == NULL)
+ err(1, "calloc(%d, %zd)", present_num_cpus, sizeof(unsigned int));
+
+ /*
+ * Allocate cpuset_present, initialize to all CPUs in system
+ */
+ cpuset_present = CPU_ALLOC((max_cpu_num + 1));
+ if (cpuset_present == NULL)
+ err(3, "CPU_ALLOC");
+ system_cpuset_size = CPU_ALLOC_SIZE((max_cpu_num + 1));
+ CPU_ZERO_S(system_cpuset_size, cpuset_present);
+ sum_for_all_proc_cpus(cpu_mark_present, 0);
+
+ /*
+ * Allocate cpuset_active, initialize = cpuset_present
+ */
+ cpuset_active = CPU_ALLOC((max_cpu_num + 1));
+ if (cpuset_active == NULL)
+ err(3, "CPU_ALLOC");
+ CPU_ZERO_S(system_cpuset_size, cpuset_active);
+ CPU_OR_S(system_cpuset_size, cpuset_active, cpuset_present, cpuset_active);
+
+ /*
+ * TODO: make this 2 on HT boxes
+ */
+ cpuset_size_floor = 1;
+
+ topology_init_cpu_order();
+}
+
+static int cpu_record_utilization(UNUSED(index), unsigned int cpu,
+ UNUSED(a), UNUSED(b), UNUSED(c), unsigned int idle_count)
+{
+ if (debug > 1)
+ fprintf(dfp, "cpu%d idle %d\n", cpu, idle_count);
+
+// if (CPU_ISSET_S(cpu, system_cpuset_size, cpuset_active))
+ return idle_count;
+// else
+// return 0;
+}
+
+static void system_snapshot_utilization(unsigned int index)
+{
+
+ history[index].seconds = timestamp_sec();
+ if (debug > 1)
+ fprintf(dfp, "util timestamp %f\n", history[index].seconds);
+
+ history[index].system_idle_tick_sum =
+ sum_for_all_proc_cpus(cpu_record_utilization, 0);
+ if (debug > 2)
+ debug_dump_history();
+}
+
+#if NOTYET
+static int per_cpu_stub(unsigned int cpu)
+{
+ fprintf(dfp, "hello world %d\n", cpu);
+ return 0;
+}
+#endif
+/*
+ * cpuset_calculate_utilization()
+ *
+ * Return the average utilization of the set during the previous
+ * interval, from 0.0 to 100.0%.
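+ *
+ * Worked example: with 100 ticks/sec, a 1.0 sec interval, and 4 cpus
+ * present and active: total_possible_ticks = 400; if the idle sum
+ * advanced by 300, busy_ticks = 100 and utilization = 25%.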
+ */
+double ema;
+
+static double cpuset_calculate_utilization(unsigned int util_index_prev, unsigned int util_index, unsigned int active_num_cpus)
+{
+ double idle_ticks;
+ double ticks_per_interval_per_cpu;
+ double delta_sec;
+ double average_utilization;
+ double total_possible_ticks;
+ double busy_ticks;
+
+#if NOTYET
+ double ema_period_sec = 2.0;
+ double ema_weight;
+#endif
+
+ if (util_index == util_index_prev)
+ return -1;
+
+ delta_sec = history[util_index].seconds - history[util_index_prev].seconds;
+ debug_sleep_histogram_record(sleep_actual_histogram, delta_sec);
+
+ if (delta_sec <= 0) {
+ fprintf(dfp, "BUG: delta_sec %f\n", delta_sec);
+ delta_sec = 0.01;
+ }
+ set_size_chosen[active_num_cpus]++;
+ set_size_seconds[active_num_cpus] += delta_sec;
+
+ if (debug > 1)
+ fprintf(dfp, "delta_sec %f = [%d] %f - [%d] %f\n",
+ delta_sec, util_index, history[util_index].seconds,
+ util_index_prev, history[util_index_prev].seconds);
+
+ ticks_per_interval_per_cpu = user_tick_per_sec * delta_sec;
+ if (debug > 1)
+ fprintf(dfp, "ticks_per_interval_per_cpu %f = %ld * %f\n",
+ ticks_per_interval_per_cpu, user_tick_per_sec, delta_sec);
+
+ idle_ticks = history[util_index].system_idle_tick_sum -
+ history[util_index_prev].system_idle_tick_sum;
+ if (debug > 1)
+ fprintf(dfp, "idle_ticks %f = %lld - %lld\n",
+ idle_ticks, history[util_index].system_idle_tick_sum,
+ history[util_index_prev].system_idle_tick_sum);
+
+ total_possible_ticks = ticks_per_interval_per_cpu * present_num_cpus;
+ busy_ticks = total_possible_ticks - idle_ticks;
+
+ if (debug > 1)
+ fprintf(dfp, "total_possible_ticks %f busy_ticks %f\n",
+ total_possible_ticks, busy_ticks);
+
+
+ average_utilization = 100 * busy_ticks / ticks_per_interval_per_cpu / active_num_cpus;
+
+ if (average_utilization < 0)
+ average_utilization = 0;
+
+
+#if NOTYET
+ /*
+ * Exponential Moving Average (EMA)
+ *
+ * EMA(t) = EMA(t-1) + EMA_WEIGHT * (SAMPLE(t) - EMA(t-1))
+ *
+ * Larger EMA_WEIGHT favors SAMPLE(t)
+ * at the expense of EMA history, EMA(t-1)
+ *
+ * An N-period EMA will forget 86% of history
+ * after N samples, and EMA_WEIGHT = 1/(N + 1).
+ *
+ * We make the policy decision of selecting ema_period_sec = 2.0.
+ * We use a different EMA_WEIGHT depending on the sample rate.
+ *
+ * EMA_WEIGHT(2.0 sec, 100 samples/sec) = 2/(200+1) = 0.00995
+ * EMA_WEIGHT(2.0 sec, 10 samples/sec) = 2/(20+1) = 0.0952
+ * EMA_WEIGHT(2.0 sec, 1 samples/sec) = 2/(2+1) = 0.666
+ * EMA_WEIGHT(2.0 sec, .5 samples/sec) = 2/(1+1) = 1.0
+ */
+ ema_weight = 2 / (ema_period_sec * 1/delta_sec + 1);
+ ema = ema + ema_weight * average_utilization - ema_weight * ema;
+#endif
+
+ if (debug) {
+ char cpulist[256];
+
+ cpulist_create(cpulist, 256, cpuset_active, system_cpuset_size);
+
+ fprintf(dfp, "%d cpus util%%: %.2f%% TGT: %.2f sec: %.4f set: %s\n",
+ active_num_cpus, average_utilization,
+ active_num_cpus * average_utilization / 100,
+ delta_sec, cpulist);
+ }
+
+ return average_utilization;
+}
+/*
+ * cpuset_change_size(target_num, num_active)
+ * cpu[num_active] is the next available cpu
+ */
+static void cpuset_change_size(unsigned int target_num, unsigned int num_active)
+{
+ unsigned int i;
+ if (debug > 1)
+ fprintf(dfp, "ACTIVE %d TARGET: %d ",
+ num_active, target_num);
+
+ if (target_num > num_active) {
+ if (debug > 1) fprintf(dfp, "ADD:");
+ for (i = num_active; i < target_num; ++i) {
+ CPU_SET_S(cpus[i], system_cpuset_size, cpuset_active);
+ if (debug > 1) fprintf(dfp, " cpu%d", cpus[i]);
+ }
+ if (debug > 1) fprintf(dfp, "\n");
+ }
+ if (target_num < num_active) {
+ if (debug > 1) fprintf(dfp, "SUB:");
+ for (i = num_active - 1; i > target_num - 1; --i) {
+ CPU_CLR_S(cpus[i], system_cpuset_size, cpuset_active);
+ if (debug > 1) fprintf(dfp, " cpu%d", cpus[i]);
+ }
+ if (debug > 1) fprintf(dfp, "\n");
+ }
+}
+static void do_binding(cpu_set_t *set)
+{
+ int retval;
+ char cpulist[256];
+ FILE *fp;
+
+ cpulist_create(cpulist, 256, set, system_cpuset_size);
+ fp = fopen_or_die("balloon/cpuset.cpus", "w");
+ retval = fputs(cpulist, fp);
+ if (retval == EOF)
+ err(1, "failed to write %s to balloon/cpuset.cpus\n", cpulist);
+
+if (debug) fprintf(dfp, "balloon/cpuset.cpus %s\n", cpulist);
+ retval = fclose(fp);
+ if (retval)
+ err(1, "fcose balloon/cpuset.cpus");
+#if NOT_ANYMORE
+ {
+ char command[256];
+ snprintf(command, 256, "taskset -cp %s all", cpulist);
+ retval = system(command);
+ if (debug > 1)
+ fprintf(dfp, "\t\t\t\t\tSYSTEM \"%s\" = %d\n", command, retval);
+ }
+#endif
+}
+
+/*
+ * cpuset_update()
+ *
+ * The "ideal" target set size is the size that could retire
+ * the previous load with all CPUs exactly 100% utilized.
+ * But the math rarely works out evenly, and we need to
+ * decide on an integer target number of CPUs for the set.
+ *
+ * Say utilization * active_num_cpus works out to 3.2;
+ * we can't allocate 0.2 of a CPU...
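+ *
+ * e.g. 80% utilization on 4 cpus gives an ideal size of 3.2; the 0.2
+ * fraction is below both 0.5 and ideal/10 (0.32), so the target
+ * truncates to 3 cpus.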
+ *
+ */
+static unsigned int cpuset_update(double avg_util, unsigned int active_num_cpus)
+{
+ double ideal_target_num_cpus;
+ double target_num_cpus;
+ double roundup;
+ int truncated_int;
+
+ if (avg_util > 95) {
+ if (get_num_runnable_threads() > active_num_cpus + 2) {
+ target_num_cpus = active_num_cpus + 1;
+ goto done;
+ }
+ }
+ ideal_target_num_cpus = (avg_util * active_num_cpus)/100;
+ truncated_int = (unsigned int)ideal_target_num_cpus;
+ roundup = ideal_target_num_cpus - truncated_int;
+ if (roundup > 0.5)
+ roundup = 1;
+ else if (roundup > ideal_target_num_cpus/10)
+ roundup = 1;
+
+ target_num_cpus = ideal_target_num_cpus + roundup;
+
+ if (debug > 1)
+ fprintf(dfp, "ideal %f = %f * %d; target %d\n",
+ ideal_target_num_cpus, avg_util, active_num_cpus,
+ (unsigned int)target_num_cpus);
+
+#if 0
+ ideal_set_size = util_sum / 100.0;
+
+ if (ideal_set_size > (up_threshold * active_num_cpus)) {
+ /*
+ * Utilization is above up_threshold
+ * Increase set size
+ */
+
+ delta_set_size = active_num_cpus * up_slope;
+ if (delta_set_size < 1)
+ delta_set_size = 1;
+
+ target_set_size = active_num_cpus + delta_set_size;
+ } else {
+ /*
+ * Utilization is below up_threshold
+ * Possibly decrease set size
+ */
+ delta_set_size = active_num_cpus - ideal_set_size;
+ delta_set_size *= down_rate;
+ target_set_size = active_num_cpus - delta_set_size;
+ }
+#endif
+
+done:
+ if (target_num_cpus > cpuset_size_ceiling)
+ target_num_cpus = cpuset_size_ceiling;
+ else if (target_num_cpus < cpuset_size_floor)
+ target_num_cpus = cpuset_size_floor;
+
+ if ((unsigned int) target_num_cpus != active_num_cpus) {
+ cpuset_change_size(target_num_cpus, active_num_cpus);
+ do_binding(cpuset_active);
+ }
+
+ return (unsigned int) target_num_cpus;
+}
+static void main_loop(void)
+{
+ int i;
+ unsigned int util_index_prev = 0;
+ unsigned int util_index = 1;
+ unsigned int active_num_cpus = present_num_cpus; // TBD
+
+ syslog(LOG_NOTICE, "shepherd started.");
+ /*
+ * Initial snapshot
+ */
+ system_snapshot_utilization(util_index_prev);
+ history[util_index_prev].cpuset_num_cpus = active_num_cpus;
+ sleep_for_sec(poll_fast_sec);
+
+ /*
+ * run for(ever)
+ */
+ for (i = 0; ; ++i) {
+ double avg_util;
+
+ system_snapshot_utilization(util_index);
+ history[util_index].cpuset_num_cpus = active_num_cpus;
+ avg_util = cpuset_calculate_utilization(util_index_prev, util_index, active_num_cpus);
+
+ if (exit_main_loop)
+ break;
+
+ active_num_cpus = cpuset_update(avg_util, active_num_cpus);
+
+ sleep_for_sec(get_next_polling_interval_sec(avg_util, active_num_cpus));
+
+ util_index_prev = util_index;
+ util_index += 1;
+ if (util_index >= HISTORY_LENGTH)
+ util_index = 0;
+
+ }
+ syslog(LOG_NOTICE, "shepherd terminated.");
+}
+pid_t child_pid;
+
+/*
+ * nanny - babysit the child
+ * leave a note for the parent's main loop
+ * if the child exits
+ */
+static void nanny(__attribute__((unused))int signal)
+{
+ if (waitpid(child_pid, NULL, WNOHANG) == child_pid)
+ exit_main_loop = 1;
+}
+/*
+ * fork_it() - fork and exec the given command; nanny() reaps it
+ */
+static void fork_it(char **argv)
+{
+
+ child_pid = fork();
+ if (child_pid < 0)
+ err(1, "fork");
+
+ if (!child_pid) { /* child: run the command */
+ execvp(argv[0], argv);
+ err(1, "%s", argv[0]);
+ } else { /* parent */
+ signal(SIGCHLD, nanny);
+ }
+}
+static void signal_handler_stats(int signal)
+{
+ if (debug)
+ fprintf(dfp, "signal %d received\n", signal);
+
+ dump_statistics();
+}
+
+static void signal_handler_end_it(int signal)
+{
+ if (debug)
+ fprintf(dfp, "caught signal %d\n", signal);
+ exit_main_loop = 1;
+}
+
+static void signals_init(void)
+{
+ signal(SIGALRM, signal_handler_stats);
+ signal(SIGUSR1, signal_handler_stats);
+ signal(SIGINT, signal_handler_end_it);
+}
+
+void copy_file_by_line(char *from_path, char *to_path)
+{
+ FILE *from_fp, *to_fp;
+ char line_buf[1024];
+ char *s;
+ int retval;
+
+ from_fp = fopen_or_die(from_path, "r");
+ to_fp = fopen_or_die(to_path, "w");
+
+ while (fgets(line_buf, sizeof(line_buf), from_fp) != NULL) {
+
+ if (debug > 1) fprintf(dfp, "copying: %s", line_buf);
+
+ retval = fputs(line_buf, to_fp);
+ if (retval == EOF)
+ err(1, "fputs cpuset.mems");
+
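+ /*
+ * Rewind so each line lands as its own write at offset 0;
+ * presumably because cgroup control files (cpuset.mems, tasks)
+ * take one record per write.
+ */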
+ rewind(to_fp);
+ }
+
+ retval = fclose(from_fp);
+ if (retval)
+ err(1, "close %s", from_path);
+
+ retval = fclose(to_fp);
+ if (retval)
+ err(1, "close %s", to_path);
+}
+
+char cpuset_base[] = "/shepherd";
+char cpuset_balloon[] = "balloon";
+
+static void cpuset_init(void)
+{
+ int retval;
+
+ retval = access(cpuset_base, F_OK);
+ if (retval) {
+ if (debug)
+ fprintf(dfp, "making %s\n", cpuset_base);
+ retval = mkdir(cpuset_base, 0755);
+ if (retval) {
+ if (errno == EEXIST)
+ fprintf(dfp, "okay, %s is already there\n", cpuset_base);
+ else
+ err(1, "could not create '%s'", cpuset_base);
+ }
+ }
+
+ retval = mount("dummy", cpuset_base, "cgroup", 0, "cpuset");
+ if (retval) {
+ if (errno != EBUSY)
+ err(retval, "mount cpuset");
+ if (debug)
+ fprintf(dfp, "cpuset already mounted\n");
+ }
+
+ retval = chdir(cpuset_base);
+ if (retval)
+ err(retval, "%s", cpuset_base);
+
+ retval = mkdir(cpuset_balloon, 0755);
+ if (retval) {
+ if (errno == EEXIST)
+ fprintf(dfp, "okay, %s is already there\n", cpuset_balloon);
+ else
+ err(1, "could not create '%s'", cpuset_balloon);
+ }
+
+ copy_file_by_line("cpuset.mems", "balloon/cpuset.mems");
+
+ copy_file_by_line("cpuset.cpus", "balloon/cpuset.cpus");
+
+ copy_file_by_line("tasks", "balloon/tasks");
+
+}
+
+void daemonize(void)
+{
+ pid_t pid;
+ int i;
+
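+ /*
+ * Classic double fork: fork + setsid() detaches from the
+ * controlling terminal; the second fork ensures the daemon
+ * is not a session leader and cannot reacquire a terminal.
+ */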
+ pid = fork();
+ if (pid < 0)
+ err(1, "fork");
+ if (pid > 0)
+ exit(0); /* parent exits */
+
+ if (setsid() < 0)
+ err(1, "setsid");
+ signal(SIGCHLD, SIG_IGN);
+ signal(SIGHUP, SIG_IGN);
+
+ pid = fork();
+ if (pid < 0)
+ err(2, "Fork");
+ if (pid > 0)
+ exit(0); /* parent exits */
+
+ umask(0);
+ chdir("/");
+
+ for (i = sysconf(_SC_OPEN_MAX); i >= 0; i--)
+ close(i);
+}
+
+void debug_init(void)
+{
+ if (output_path == NULL) {
+ dfp = stderr;
+
+ //output_path = "/tmp/shepherd.log";
+ } else {
+ dfp = fopen_or_die(output_path, "w");
+ }
+}
+
+int main(int argc, char **argv)
+{
+
+ srandom((unsigned int)time(NULL));
+
+ defaults_init();
+
+ cmdline(argc, argv);
+
+ debug_init();
+
+ if (!debug)
+ daemonize();
+
+ openlog("shepherd", LOG_PID, LOG_DAEMON);
+
+ topology_probe();
+
+ cpuset_init();
+
+ stats_init();
+
+ if (argc - optind)
+ fork_it(argv + optind);
+
+ signals_init();
+ main_loop();
+
+ dump_statistics();
+ do_binding(cpuset_present); /* restore system binding */
+ return 0;
+}
+