author		Andrea Arcangeli <aarcange@redhat.com>	2019-01-04 19:55:11 -0500
committer	Andrea Arcangeli <aarcange@redhat.com>	2023-11-11 22:03:37 -0500
commit		aaf521f4156d748379aca0e048cb0174d7d66706 (patch)
tree		839014c33989048229f0f040295ef5da08ccfc5e
parent		0a7da10edccd2b757c1e779da55880cdc58b6d60 (diff)
download	aaf521f4156d748379aca0e048cb0174d7d66706.tar.gz
sched/fair: skip select_idle_sibling() in presence of sync wakeups
__wake_up_sync() gives a very explicit hint to the scheduler that the
current task will immediately go to sleep and won't be running again
until after the woken task has started running. This is common behavior
for message passing through pipes or local sockets (AF_UNIX or through
the loopback interface).

The scheduler does everything right up to the point it calls
select_idle_sibling(). Up to that point the CPU selected for the task
that received the sync wakeup could very well be the local CPU; that
way the sync-woken task starts running immediately after the current
task goes to sleep, without requiring a remote CPU wakeup. However,
when select_idle_sibling() is called (especially with SCHED_MC=y), if
there is at least one idle core in the same package the sync-woken task
is forcibly woken to run on a different idle core, destroying the
"sync" information and all the work done up to that point.

Without this patch, such a workload ends up with two different CPUs at
~50% utilization each, and the __wake_up_sync() hint doesn't provide
much of a benefit compared to a regular non-sync wakeup. With this
patch a single CPU is used at 100% utilization, which increases
performance for these common workloads.

With 112 threads, 28 cores per socket and 2 sockets (2 NUMA nodes):

$ stress-ng --timeout 300 --times --verify --metrics-brief --sockabuse `nproc`

before this patch:

stressor    bogo ops  real time  usr time  sys time   bogo ops/s     bogo ops/s
                         (secs)    (secs)    (secs)  (real time) (usr+sys time)
sockabuse    3242769     300.00    211.08   2494.69     10809.22        1198.46
sockabuse    3243719     300.00    209.97   2491.47     10812.39        1200.74

after this patch:

sockabuse    3821766     300.00    266.75   3069.13     12739.21        1145.65
sockabuse    3758514     300.00    264.16   3043.67     12528.37        1136.25

v2: improved for large CPU systems after feedback from "kernel test robot".

Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
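For context only (this program is not part of the patch; the payload
size and iteration count are arbitrary choices for illustration): the
message-passing pattern the commit message refers to can be reproduced
with a minimal AF_UNIX ping-pong, where the writer blocks reading the
reply immediately after each write, so it always sleeps right after
waking its peer.

/*
 * Illustrative sketch, not part of the patch: each write is immediately
 * followed by a blocking read of the reply, so the writer goes to sleep
 * right after waking its peer -- the case the sync-wakeup hint targets.
 */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	int sv[2];
	char buf[64];

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv)) {
		perror("socketpair");
		return 1;
	}

	if (fork() == 0) {
		/* child: echo every message straight back */
		close(sv[0]);
		for (;;) {
			ssize_t n = read(sv[1], buf, sizeof(buf));
			if (n <= 0)
				return 0;
			write(sv[1], buf, n);
		}
	}

	close(sv[1]);
	for (int i = 0; i < 1000000; i++) {
		write(sv[0], "ping", 4);	/* wake the child ...        */
		read(sv[0], buf, sizeof(buf));	/* ... and immediately sleep */
	}
	close(sv[0]);				/* child sees EOF and exits  */
	wait(NULL);
	return 0;
}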
-rw-r--r--	kernel/sched/fair.c	18
1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 646a6ae4b25090..08fe579850b694 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -716,7 +716,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
#include "pelt.h"
#ifdef CONFIG_SMP
-static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
+static int select_idle_sibling(struct task_struct *p, int prev_cpu,
+ int this_cpu, int target, int sync);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);
@@ -6613,7 +6614,8 @@ static inline bool asym_fits_cpu(unsigned long util,
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
-static int select_idle_sibling(struct task_struct *p, int prev, int target)
+static int select_idle_sibling(struct task_struct *p, int prev, int this_cpu,
+			       int target, int sync)
 {
 	bool has_idle_core = false;
 	struct sched_domain *sd;
@@ -6658,8 +6660,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (is_per_cpu_kthread(current) &&
 	    in_task() &&
-	    prev == smp_processor_id() &&
-	    this_rq()->nr_running <= 1 &&
+	    prev == this_cpu &&
+	    cpu_rq(this_cpu)->nr_running <= 1 &&
 	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
 		return prev;
 	}
@@ -6710,6 +6712,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		}
 	}
+	/* skip idle balancing if wake_affine_idle selected this_cpu */
+	if (sync && target == this_cpu && cpu_rq(this_cpu)->nr_running == 1 &&
+	    asym_fits_cpu(task_util, util_min, util_max, target))
+		return target;
+
 	i = select_idle_cpu(p, sd, has_idle_core, target);
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
@@ -7206,7 +7213,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
 	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
 		/* Fast path */
-		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+		new_cpu = select_idle_sibling(p, prev_cpu, cpu, new_cpu, sync);
+
 	}
 	rcu_read_unlock();