author     Matt Fleming <matt.fleming@intel.com>  2014-06-25 20:05:17 +0100
committer  Matt Fleming <matt.fleming@intel.com>  2014-06-26 14:39:59 +0100
commit     2047b68340eb0f32745be78fee80af92d8aa347a (patch)
tree       721b31062ab334ffab234d4cfacd3fd71c58dc1b
parent     1f4ffb0cd36c50ca7bdfb008d0bbb60855d73f56 (diff)
download   linux-2047b68340eb0f32745be78fee80af92d8aa347a.tar.gz
perf/x86/intel/qos: Support per-task events
Add support for task events as well as system-wide events. This change has a big impact on the way that we gather L3 cache occupancy values in intel_qos_event_read().

Currently, for system-wide (per-cpu) events we defer processing to userland, which knows how to discard all but one per-cpu result per socket using the 'readers' cpumask.

Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we cache the L3 occupancy value for the current socket in intel_qos_event_read() and calculate the total by summing it with the previously cached values for the other sockets.

Ideally we'd do a cross-CPU call in intel_qos_event_read() to read the instantaneous value for all other sockets instead of relying on the cached (stale) copy, but that's not possible because we execute with interrupts disabled.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
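As an illustration of the aggregation scheme described above, here is a minimal user-space sketch (not the kernel implementation): only the local socket's cached value is refreshed on each read, the other sockets keep their last-seen (possibly stale) values, and the total is the sum over all cached entries. NR_PACKAGES, cached_occupancy[], local_package_id() and read_local_occupancy() are illustrative stand-ins, not names from the patch.

    #include <stdint.h>
    #include <stdio.h>

    #define NR_PACKAGES 4   /* assumed socket count for this sketch */

    /* Last occupancy value seen on each package (socket), in bytes. */
    static uint64_t cached_occupancy[NR_PACKAGES];

    /* Placeholder helpers; stand-ins for the RMID/MSR machinery in the patch. */
    static int local_package_id(void) { return 0; }
    static uint64_t read_local_occupancy(void) { return 4096; }

    /*
     * Recompute the task-wide total the way the commit message describes:
     * refresh only the local socket's cached value (a cross-socket read is
     * not possible with interrupts disabled), then sum the cached values
     * for every socket, stale or not.
     */
    static uint64_t total_occupancy(void)
    {
            uint64_t total = 0;
            int i;

            cached_occupancy[local_package_id()] = read_local_occupancy();

            for (i = 0; i < NR_PACKAGES; i++)
                    total += cached_occupancy[i];

            return total;
    }

    int main(void)
    {
            printf("total occupancy: %llu bytes\n",
                   (unsigned long long)total_occupancy());
            return 0;
    }

The trade-off the commit message calls out applies equally here: remote entries can over- or under-state occupancy until their socket next refreshes them.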
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel_qos.c  82
-rw-r--r--  include/linux/perf_event.h                   1
2 files changed, 66 insertions, 17 deletions
diff --git a/arch/x86/kernel/cpu/perf_event_intel_qos.c b/arch/x86/kernel/cpu/perf_event_intel_qos.c
index a3a7690ccec1e3..a02886cff68bdc 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_qos.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_qos.c
@@ -230,7 +230,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
* If we're part of a group, we use the group's RMID.
*/
static int intel_qos_setup_event(struct perf_event *event,
- struct perf_event **group)
+ struct perf_event **group, int cpu)
{
struct perf_event *iter;
int rmid;
@@ -239,6 +239,8 @@ static int intel_qos_setup_event(struct perf_event *event,
if (__match_event(iter, event)) {
/* All tasks in a group share an RMID */
event->hw.qos_rmid = iter->hw.qos_rmid;
+ event->hw.qos_package_count =
+ iter->hw.qos_package_count;
*group = iter;
return 0;
}
@@ -252,12 +254,31 @@ static int intel_qos_setup_event(struct perf_event *event,
return rmid;
event->hw.qos_rmid = rmid;
+
+ /*
+ * For a task event we need counters for each package so that we
+ * can cache the last read value.
+ */
+ if (cpu == -1) {
+ u64 *counts;
+
+ counts = kzalloc(sizeof(u64) *
+ cpumask_weight(&qos_cpumask), GFP_KERNEL);
+ if (!counts) {
+ __put_rmid(rmid);
+ return -ENOMEM;
+ }
+
+ event->hw.qos_package_count = counts;
+ }
+
return 0;
}
static void intel_qos_event_read(struct perf_event *event)
{
unsigned long rmid = event->hw.qos_rmid;
+ int i, index, phys_id;
u64 val;
val = __rmid_read(rmid);
@@ -270,7 +291,47 @@ static void intel_qos_event_read(struct perf_event *event)
val *= qos_l3_scale; /* cachelines -> bytes */
- local64_set(&event->count, val);
+ /*
+ * If this event is per-cpu then we don't need to do any
+ * aggregation in the kernel, it's all done in userland.
+ */
+ if (event->cpu != -1) {
+ local64_set(&event->count, val);
+ return;
+ }
+
+ /*
+ * OK, we've got a task event, recompute the total occupancy.
+ *
+ * There is a race window here because we're using stale
+ * occupancy values: we're not able to do a cross-CPU
+ * (socket) call to read the current values because we're
+ * executing with interrupts disabled.
+ *
+ * In an ideal world we'd do a smp_call_function_single() to
+ * read the other sockets' instantaneous values because they may
+ * have changed (reduced) since we last updated ->hw.qos_package_count[].
+ *
+ * If these values prove to be wildly inaccurate we may want to
+ * consider installing a per-socket hrtimer to refresh the
+ * values periodically.
+ */
+ local64_set(&event->count, 0);
+
+ phys_id = topology_physical_package_id(smp_processor_id());
+ index = 0;
+
+ /* Convert phys_id to hw->qos_package_count index */
+ for_each_cpu(i, &qos_cpumask) {
+ if (phys_id == topology_physical_package_id(i))
+ break;
+ index++;
+ }
+
+ event->hw.qos_package_count[index] = val;
+
+ for (i = 0; i < cpumask_weight(&qos_cpumask); i++)
+ local64_add(event->hw.qos_package_count[i], &event->count);
}
static void intel_qos_event_start(struct perf_event *event, int mode)
@@ -394,17 +455,6 @@ static void intel_qos_event_destroy(struct perf_event *event)
static struct pmu intel_qos_pmu;
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would only get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. For more info,
- * see intel_qos_event_read().
- */
static int intel_qos_event_init(struct perf_event *event)
{
struct perf_event *group = NULL;
@@ -416,9 +466,6 @@ static int intel_qos_event_init(struct perf_event *event)
if (event->attr.config & ~QOS_EVENT_MASK)
return -EINVAL;
- if (event->cpu == -1)
- return -EINVAL;
-
/* unsupported modes and filters */
if (event->attr.exclude_user ||
event->attr.exclude_kernel ||
@@ -437,7 +484,8 @@ static int intel_qos_event_init(struct perf_event *event)
mutex_lock(&cache_mutex);
- err = intel_qos_setup_event(event, &group); /* will also set rmid */
+ /* Will also set rmid */
+ err = intel_qos_setup_event(event, &group, event->cpu);
if (err)
goto out;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2fef5d7bf21c81..747532903231d8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -132,6 +132,7 @@ struct hw_perf_event {
struct list_head qos_events_entry;
struct list_head qos_groups_entry;
struct list_head qos_group_entry;
+ u64 *qos_package_count;
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
struct { /* breakpoint */