From: Nick Piggin

Move the balancing fields into struct sched_domain, so we can get more
useful results on systems with multiple domains (eg SMT+SMP, CMP+NUMA,
SMP+NUMA, etc).

Signed-off-by: Nick Piggin
Signed-off-by: Andrew Morton
---

 25-akpm/include/linux/sched.h |   13 ++++
 25-akpm/kernel/sched.c        |  114 ++++++++++++++++++------------------------
 2 files changed, 62 insertions(+), 65 deletions(-)

diff -puN include/linux/sched.h~sched-rework-schedstats include/linux/sched.h
--- 25/include/linux/sched.h~sched-rework-schedstats	2005-02-24 19:54:46.000000000 -0800
+++ 25-akpm/include/linux/sched.h	2005-02-24 19:54:46.000000000 -0800
@@ -500,17 +500,26 @@ struct sched_domain {
 	/* load_balance() stats */
 	unsigned long lb_cnt[MAX_IDLE_TYPES];
 	unsigned long lb_failed[MAX_IDLE_TYPES];
+	unsigned long lb_balanced[MAX_IDLE_TYPES];
 	unsigned long lb_imbalance[MAX_IDLE_TYPES];
+	unsigned long lb_gained[MAX_IDLE_TYPES];
+	unsigned long lb_hot_gained[MAX_IDLE_TYPES];
 	unsigned long lb_nobusyg[MAX_IDLE_TYPES];
 	unsigned long lb_nobusyq[MAX_IDLE_TYPES];
 
+	/* Active load balancing */
+	unsigned long alb_cnt;
+	unsigned long alb_failed;
+	unsigned long alb_pushed;
+
 	/* sched_balance_exec() stats */
 	unsigned long sbe_attempts;
 	unsigned long sbe_pushed;
 
 	/* try_to_wake_up() stats */
-	unsigned long ttwu_wake_affine;
-	unsigned long ttwu_wake_balance;
+	unsigned long ttwu_wake_remote;
+	unsigned long ttwu_move_affine;
+	unsigned long ttwu_move_balance;
 #endif
 };

diff -puN kernel/sched.c~sched-rework-schedstats kernel/sched.c
--- 25/kernel/sched.c~sched-rework-schedstats	2005-02-24 19:54:46.000000000 -0800
+++ 25-akpm/kernel/sched.c	2005-02-24 19:54:46.000000000 -0800
@@ -248,35 +248,13 @@ struct runqueue {
 	unsigned long yld_cnt;
 
 	/* schedule() stats */
-	unsigned long sched_noswitch;
 	unsigned long sched_switch;
 	unsigned long sched_cnt;
 	unsigned long sched_goidle;
 
-	/* pull_task() stats */
-	unsigned long pt_gained[MAX_IDLE_TYPES];
-	unsigned long pt_lost[MAX_IDLE_TYPES];
-
-	/* active_load_balance() stats */
-	unsigned long alb_cnt;
-	unsigned long alb_lost;
-	unsigned long alb_gained;
-	unsigned long alb_failed;
-
 	/* try_to_wake_up() stats */
 	unsigned long ttwu_cnt;
-	unsigned long ttwu_attempts;
-	unsigned long ttwu_moved;
-
-	/* wake_up_new_task() stats */
-	unsigned long wunt_cnt;
-	unsigned long wunt_moved;
-
-	/* sched_migrate_task() stats */
-	unsigned long smt_cnt;
-
-	/* sched_balance_exec() stats */
-	unsigned long sbe_cnt;
+	unsigned long ttwu_local;
 #endif
 };

@@ -331,7 +309,7 @@ static inline void task_rq_unlock(runque
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 10
+#define SCHEDSTAT_VERSION 11
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {

@@ -349,22 +327,14 @@ static int show_schedstat(struct seq_fil
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
-		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu "
-		    "%lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
+		    "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
 		    cpu, rq->yld_both_empty,
-		    rq->yld_act_empty, rq->yld_exp_empty,
-		    rq->yld_cnt, rq->sched_noswitch,
+		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
 		    rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
-		    rq->alb_cnt, rq->alb_gained, rq->alb_lost,
-		    rq->alb_failed,
-		    rq->ttwu_cnt, rq->ttwu_moved, rq->ttwu_attempts,
-		    rq->wunt_cnt, rq->wunt_moved,
-		    rq->smt_cnt, rq->sbe_cnt, rq->rq_sched_info.cpu_time,
+		    rq->ttwu_cnt, rq->ttwu_local,
+		    rq->rq_sched_info.cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
 
-		for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++)
-			seq_printf(seq, " %lu %lu", rq->pt_gained[itype],
-						    rq->pt_lost[itype]);
 		seq_printf(seq, "\n");
 
 #ifdef CONFIG_SMP

@@ -375,17 +345,21 @@ static int show_schedstat(struct seq_fil
 			cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
 			seq_printf(seq, "domain%d %s", dcnt++, mask_str);
 			for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
-					itype++) {
-				seq_printf(seq, " %lu %lu %lu %lu %lu",
+					itype++) {
+				seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu",
 				    sd->lb_cnt[itype],
+				    sd->lb_balanced[itype],
 				    sd->lb_failed[itype],
 				    sd->lb_imbalance[itype],
+				    sd->lb_gained[itype],
+				    sd->lb_hot_gained[itype],
 				    sd->lb_nobusyq[itype],
 				    sd->lb_nobusyg[itype]);
 			}
-			seq_printf(seq, " %lu %lu %lu %lu\n",
+			seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+			    sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
 			    sd->sbe_pushed, sd->sbe_attempts,
-			    sd->ttwu_wake_affine, sd->ttwu_wake_balance);
+			    sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
 		}
 #endif
 	}

@@ -998,7 +972,6 @@ static int try_to_wake_up(task_t * p, un
 #endif
 
 	rq = task_rq_lock(p, &flags);
-	schedstat_inc(rq, ttwu_cnt);
 	old_state = p->state;
 	if (!(old_state & state))
 		goto out;

@@ -1013,8 +986,21 @@ static int try_to_wake_up(task_t * p, un
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 
-	new_cpu = cpu;
+#ifdef CONFIG_SCHEDSTATS
+	schedstat_inc(rq, ttwu_cnt);
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+	} else {
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+	}
+#endif
 
+	new_cpu = cpu;
 	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
 		goto out_set_cpu;

@@ -1053,7 +1039,7 @@ static int try_to_wake_up(task_t * p, un
 			 * in this domain.
 			 */
 			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_wake_affine);
+				schedstat_inc(sd, ttwu_move_affine);
 				goto out_set_cpu;
 			}
 		} else if ((sd->flags & SD_WAKE_BALANCE) &&

@@ -1063,7 +1049,7 @@ static int try_to_wake_up(task_t * p, un
 			 * an imbalance.
 			 */
 			if (cpu_isset(cpu, sd->span)) {
-				schedstat_inc(sd, ttwu_wake_balance);
+				schedstat_inc(sd, ttwu_move_balance);
 				goto out_set_cpu;
 			}
 		}

@@ -1071,10 +1057,8 @@ static int try_to_wake_up(task_t * p, un
 	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
 out_set_cpu:
-	schedstat_inc(rq, ttwu_attempts);
 	new_cpu = wake_idle(new_cpu, p);
 	if (new_cpu != cpu) {
-		schedstat_inc(rq, ttwu_moved);
 		set_task_cpu(p, new_cpu);
 		task_rq_unlock(rq, &flags);
 		/* might preempt at this point */

@@ -1217,7 +1201,6 @@ void fastcall wake_up_new_task(task_t *
 
 	BUG_ON(p->state != TASK_RUNNING);
 
-	schedstat_inc(rq, wunt_cnt);
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks

@@ -1269,7 +1252,6 @@ void fastcall wake_up_new_task(task_t *
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);
 
-		schedstat_inc(rq, wunt_moved);
 		/*
 		 * Parent and child are on different CPUs, now get the
 		 * parent runqueue to update the parent's ->sleep_avg:

@@ -1573,7 +1555,6 @@ static void sched_migrate_task(task_t *p
 	    || unlikely(cpu_is_offline(dest_cpu)))
 		goto out;
 
-	schedstat_inc(rq, smt_cnt);
 	/* force the process onto the specified CPU */
 	if (migrate_task(p, dest_cpu, &req)) {
 		/* Need to wait for migration thread (might exit: take ref). */

@@ -1601,7 +1582,6 @@ void sched_exec(void)
 	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
 
-	schedstat_inc(this_rq(), sbe_cnt);
 	/* Prefer the current CPU if there's only this task running */
 	if (this_rq()->nr_running <= 1)
 		goto out;

@@ -1746,13 +1726,10 @@ skip_queue:
 		goto skip_bitmap;
 	}
 
-	/*
-	 * Right now, this is the only place pull_task() is called,
-	 * so we can safely collect pull_task() stats here rather than
-	 * inside pull_task().
-	 */
-	schedstat_inc(this_rq, pt_gained[idle]);
-	schedstat_inc(busiest, pt_lost[idle]);
+#ifdef CONFIG_SCHEDSTATS
+	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+		schedstat_inc(sd, lb_hot_gained[idle]);
+#endif
 
 	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
 	pulled++;

@@ -1768,6 +1745,14 @@ out:
 	*all_pinned = 0;
 	if (unlikely(pinned >= max_nr_move) && pulled == 0)
 		*all_pinned = 1;
+
+	/*
+	 * Right now, this is the only place pull_task() is called,
+	 * so we can safely collect pull_task() stats here rather than
+	 * inside pull_task().
+	 */
+	schedstat_add(sd, lb_gained[idle], pulled);
+
 	return pulled;
 }

@@ -2033,6 +2018,8 @@ static int load_balance(int this_cpu, ru
 out_balanced:
 	spin_unlock(&this_rq->lock);
 
+	schedstat_inc(sd, lb_balanced[idle]);
+
 	/* tune up the balancing interval */
 	if (sd->balance_interval < sd->max_interval)
 		sd->balance_interval *= 2;

@@ -2058,12 +2045,14 @@ static int load_balance_newidle(int this
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
 	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
 	if (!group) {
+		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 		schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
 		goto out;
 	}
 
 	busiest = find_busiest_queue(group);
 	if (!busiest || busiest == this_rq) {
+		schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out;
 	}

@@ -2117,7 +2106,6 @@ static void active_load_balance(runqueue
 	cpumask_t visited_cpus;
 	int cpu;
 
-	schedstat_inc(busiest_rq, alb_cnt);
 	/*
 	 * Search for suitable CPUs to push tasks to in successively higher
 	 * domains with SD_LOAD_BALANCE set.
 	 */

@@ -2128,6 +2116,8 @@ static void active_load_balance(runqueue
 			/* no more domains to search */
 			break;
 
+		schedstat_inc(sd, alb_cnt);
+
 		cpu_group = sd->groups;
 		do {
 			for_each_cpu_mask(cpu, cpu_group->cpumask) {

@@ -2153,10 +2143,9 @@ static void active_load_balance(runqueue
 				double_lock_balance(busiest_rq, target_rq);
 				if (move_tasks(target_rq, cpu, busiest_rq,
 						1, sd, SCHED_IDLE, &all_pinned)) {
-					schedstat_inc(busiest_rq, alb_lost);
-					schedstat_inc(target_rq, alb_gained);
+					schedstat_inc(sd, alb_pushed);
 				} else {
-					schedstat_inc(busiest_rq, alb_failed);
+					schedstat_inc(sd, alb_failed);
 				}
 				spin_unlock(&target_rq->lock);

@@ -2747,8 +2736,7 @@ go_idle:
 		array = rq->active;
 		rq->expired_timestamp = 0;
 		rq->best_expired_prio = MAX_PRIO;
-	} else
-		schedstat_inc(rq, sched_noswitch);
+	}
 
 	idx = sched_find_first_bit(array->bitmap);
 	queue = array->queue + idx;
_
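
For tools parsing /proc/schedstat: reading the new format strings above off,
a version-11 per-CPU line carries twelve counters, all on a single line
(wrapped here for readability):

	cpu<N> yld_both_empty yld_act_empty yld_exp_empty yld_cnt
	       sched_switch sched_cnt sched_goidle ttwu_cnt ttwu_local
	       cpu_time run_delay pcnt

Each domain line then carries eight counters per idle type (lb_cnt,
lb_balanced, lb_failed, lb_imbalance, lb_gained, lb_hot_gained, lb_nobusyq,
lb_nobusyg), followed by alb_cnt, alb_failed, alb_pushed, sbe_pushed,
sbe_attempts, ttwu_wake_remote, ttwu_move_affine and ttwu_move_balance.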
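
For readers tracing the accounting: schedstat_inc() and schedstat_add()
accept any stats-carrying object as their first argument, which is why the
counters can migrate from the runqueue into the sched_domain without the
helpers changing. A minimal sketch of the wrappers (paraphrased from
kernel/sched.c of this era; the argument name "obj" is illustrative):

	#ifdef CONFIG_SCHEDSTATS
	/* Bump or accumulate a counter field on rq, sd, or similar. */
	# define schedstat_inc(obj, field)	do { (obj)->field++; } while (0)
	# define schedstat_add(obj, field, amt) \
			do { (obj)->field += (amt); } while (0)
	#else
	/* CONFIG_SCHEDSTATS off: all accounting compiles away. */
	# define schedstat_inc(obj, field)	do { } while (0)
	# define schedstat_add(obj, field, amt)	do { } while (0)
	#endif

This is also why try_to_wake_up() and move_tasks() grow explicit
#ifdef CONFIG_SCHEDSTATS blocks above: those sites need real statements
(a domain walk, a task_hot() test), not just a counter bump, so they
cannot hide behind the no-op macros.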