author     Matt Fleming <matt.fleming@intel.com>  2014-06-25 20:05:17 +0100
committer  Matt Fleming <matt.fleming@intel.com>  2014-06-26 12:58:36 +0100
commit     837cf8134c2314c9c049f9a20c493199472fa572
tree       61c5d5df95982e6613f5f05ab60992b6159e897a
parent     b335c1ae08043c3f55d60d8d3a3a263758c43e81
download   linux-experimental/cqm/mfleming.tar.gz

perf/x86/intel/qos: Support per-task events (experimental/cqm/mfleming)
Add support for task events as well as system-wide events. This change
has a big impact on the way that we gather L3 cache occupancy values in
intel_qos_event_read().

Currently, for system-wide (per-cpu) events we defer processing to
userland which knows how to discard all but one per-cpu result per
socket using the 'readers' cpumask.

Things aren't so simple for task events because we need to do the value
aggregation ourselves. To do this, we cache the L3 occupancy value for
the current socket in intel_qos_event_read() and calculate the total by
summing all the previously cached values for all other sockets.

Ideally we'd do a cross-CPU call in intel_qos_event_read() to read the
instantaneous value for all other sockets instead of relying on the
cached (stale) copy, but that's not possible because we execute with
interrupts disabled.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
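For illustration only, here is a minimal C sketch of the cache-and-sum scheme described above; it is not the kernel code, and the type and helper names (task_occupancy, nr_packages, cur_index) are hypothetical stand-ins for the driver's per-package counter array:

/*
 * Minimal sketch of the cache-and-sum scheme -- not the kernel code.
 * One cached occupancy value is kept per package; a read refreshes the
 * slot for the local package and sums the (possibly stale) rest.
 */
#include <stdint.h>

struct task_occupancy {
	uint64_t *package_count;	/* last value read on each package */
	int nr_packages;
};

uint64_t task_occupancy_read(struct task_occupancy *t, int cur_index,
			     uint64_t cur_val)
{
	uint64_t total = 0;
	int i;

	/* Refresh the slot for the package we are currently running on. */
	t->package_count[cur_index] = cur_val;

	/* Other packages contribute their last cached value. */
	for (i = 0; i < t->nr_packages; i++)
		total += t->package_count[i];

	return total;
}

The real intel_qos_event_read() in the diff below does the same thing, except that the index of the current package is derived by walking qos_cpumask and comparing topology_physical_package_id() values.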
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_qos.c  | 81
-rw-r--r--  include/linux/perf_event.h                  |  1
2 files changed, 66 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_qos.c b/arch/x86/kernel/cpu/perf_event_intel_qos.c
index a0ffd6e33c064e..be8171139e50e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_qos.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_qos.c
@@ -286,7 +286,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
* If we're part of a group, we use the group's RMID.
*/
static int intel_qos_setup_event(struct perf_event *event,
- struct perf_event **group)
+ struct perf_event **group, int cpu)
{
struct perf_event *iter;
int rmid;
@@ -295,6 +295,8 @@ static int intel_qos_setup_event(struct perf_event *event,
if (__match_event(iter, event)) {
/* All tasks in a group share an RMID */
event->hw.qos_rmid = iter->hw.qos_rmid;
+ event->hw.qos_package_count =
+ iter->hw.qos_package_count;
*group = iter;
return 0;
}
@@ -308,12 +310,31 @@ static int intel_qos_setup_event(struct perf_event *event,
return rmid;
event->hw.qos_rmid = rmid;
+
+ /*
+ * For a task event we need counters for each package so that we
+ * can cache the last read value.
+ */
+ if (cpu == -1) {
+ u64 *counts;
+
+ counts = kzalloc(sizeof(u64) *
+ cpumask_weight(&qos_cpumask), GFP_KERNEL);
+ if (!counts) {
+ __put_rmid(rmid);
+ return -ENOMEM;
+ }
+
+ event->hw.qos_package_count = counts;
+ }
+
return 0;
}
static void intel_qos_event_read(struct perf_event *event)
{
unsigned long rmid = event->hw.qos_rmid;
+ int i, index, phys_id;
u64 val;
val = __rmid_read(rmid);
@@ -326,7 +347,47 @@ static void intel_qos_event_read(struct perf_event *event)
val *= qos_l3_scale; /* cachelines -> bytes */
- local64_set(&event->count, val);
+ /*
+ * If this event is per-cpu then we don't need to do any
+ * aggregation in the kernel, it's all done in userland.
+ */
+ if (event->cpu != -1) {
+ local64_set(&event->count, val);
+ return;
+ }
+
+ /*
+ * OK, we've got a task event, recompute the total occupancy.
+ *
+ * There is a race window here because we're using stale
+ * occupancy values since we're not able to do a cross-CPU
+ * (socket) call to do the occupancy read because we're
+ * executing with interrupts disabled.
+ *
+ * In an ideal world we'd do a smp_call_function_single() to
+ * read the other sockets' instantaneous values because it may
+ * have changed (reduced) since we last updated ->hw.qos_package_count[].
+ *
+ * If these values prove to be wildly inaccurate we may want to
+ * consider installing a per-socket hrtimer to refresh the
+ * values periodically.
+ */
+ local64_set(&event->count, 0);
+
+ phys_id = topology_physical_package_id(smp_processor_id());
+ index = 0;
+
+ /* Convert phys_id to hw->qos_package_count index */
+ for_each_cpu(i, &qos_cpumask) {
+ if (phys_id == topology_physical_package_id(i))
+ break;
+ index++;
+ }
+
+ event->hw.qos_package_count[index] = val;
+
+ for (i = 0; i < cpumask_weight(&qos_cpumask); i++)
+ local64_add(event->hw.qos_package_count[i], &event->count);
}
static void intel_qos_event_start(struct perf_event *event, int mode)
@@ -452,16 +513,6 @@ static struct pmu intel_qos_pmu;
/*
* Takes non-sampling task,cgroup or machine wide events.
- *
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_qos_event_read().
*/
static int intel_qos_event_init(struct perf_event *event)
{
@@ -474,9 +525,6 @@ static int intel_qos_event_init(struct perf_event *event)
if (event->attr.config & ~QOS_EVENT_MASK)
return -EINVAL;
- if (event->cpu == -1)
- return -EINVAL;
-
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
@@ -495,7 +543,8 @@ static int intel_qos_event_init(struct perf_event *event)
mutex_lock(&cache_mutex);
- err = intel_qos_setup_event(event, &group); /* will also set rmid */
+ /* Will also set rmid */
+ err = intel_qos_setup_event(event, &group, event->cpu);
if (err)
goto out;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index a48e01ed601977..4e78ca51139635 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -133,6 +133,7 @@ struct hw_perf_event {
struct list_head qos_events_entry;
struct list_head qos_groups_entry;
struct list_head qos_group_entry;
+ u64 *qos_package_count;
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */
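
As a hedged usage sketch (not part of the patch), the program below opens a per-task occupancy event from user space, exercising the cpu == -1 path this change enables. The sysfs PMU name ("intel_qos") and the occupancy event config value (1) are assumptions not taken from this diff; read the real values from /sys/bus/event_source/devices/ on a running system.

/*
 * Hedged usage sketch: open a per-task event on the QoS PMU and read
 * the aggregated L3 occupancy. PMU name and config value are assumed.
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int perf_event_open(struct perf_event_attr *attr, pid_t pid,
			   int cpu, int group_fd, unsigned long flags)
{
	return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(int argc, char **argv)
{
	struct perf_event_attr attr;
	uint64_t count;
	int fd, type;
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	/* The PMU's dynamic type id; the sysfs name is an assumption. */
	f = fopen("/sys/bus/event_source/devices/intel_qos/type", "r");
	if (!f || fscanf(f, "%d", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.type = type;
	attr.size = sizeof(attr);
	attr.config = 1;	/* assumed event id for L3 occupancy */

	/* pid != -1, cpu == -1: the per-task case this patch enables. */
	fd = perf_event_open(&attr, (pid_t)atoi(argv[1]), -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}

	if (read(fd, &count, sizeof(count)) != sizeof(count))
		return 1;

	printf("L3 occupancy: %llu bytes\n", (unsigned long long)count);
	close(fd);
	return 0;
}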