From: Ingo Molnar

This patch implements RLIMIT_RT_CPU: the maximum amount of CPU time RT
tasks may use, in percent.  Defaults to 80%.

Properties of the RT-CPU rlimit:

 - if there's idle time in the system then RT tasks will be allowed to
   use more than the limit.

 - if an RT task goes above the limit all the time then there is no
   guarantee that exactly the limit will be allowed for it.  (i.e. you
   should set the limit to somewhat above the real needs of the RT task
   in question.)

 - a zero RLIMIT_RT_CPU value means unlimited CPU time for that RT
   task.

 - a nonzero RLIMIT_RT_CPU value also has the effect of allowing the
   use of RT priorities by nonprivileged users.

 - on SMP the limit is measured and enforced per-CPU.

 - runtime overhead is minimal, especially if the limit is set to 0.

 - the CPU-use measurement code has a 'memory' of roughly 300 msecs,
   i.e. if an RT task runs 100 msecs nonstop then it will increase its
   measured CPU use by about 30%.  This should be fast enough for the
   limit to be imperceptible to users, but slow enough to allow
   occasional longer timeslices for RT tasks.

I've uploaded a simple utility to set the RT_CPU rlimit, called
execrtlim:

	http://redhat.com/~mingo/rt-limit-patches/

execrtlim can be used to test the rlimit, e.g.:

	./execrtlim 10 10 /bin/bash

will spawn a new shell with RLIMIT_RT_CPU curr/max set to 10%/10%.

On older kernels the utility prints:

	$ ./execrtlim 10 10 /bin/bash
	execrtlim: kernel does not support RLIMIT_RT_CPU.
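For illustration, a minimal user-space sketch of this usage pattern
follows: lower RLIMIT_RT_CPU for the current process, then switch to
SCHED_FIFO, which the patch permits for unprivileged users as long as
the rlimit is nonzero.  This is not the execrtlim source - the program,
the priority value and the RLIMIT_RT_CPU fallback definition (13, taken
from this patch, since libc headers will not know the symbol yet) are
illustrative only:

	#include <stdio.h>
	#include <string.h>
	#include <errno.h>
	#include <sched.h>
	#include <sys/resource.h>

	#ifndef RLIMIT_RT_CPU
	# define RLIMIT_RT_CPU 13	/* value used by this patch */
	#endif

	int main(void)
	{
		struct rlimit rlim;
		struct sched_param param;

		/* RLIMIT_RT_CPU curr/max: 10%/10% of CPU time for RT tasks */
		rlim.rlim_cur = 10;
		rlim.rlim_max = 10;

		/* older kernels reject the unknown resource with EINVAL: */
		if (setrlimit(RLIMIT_RT_CPU, &rlim) < 0) {
			fprintf(stderr, "kernel does not support RLIMIT_RT_CPU: %s\n",
				strerror(errno));
			return 1;
		}

		/* a nonzero RLIMIT_RT_CPU waives the CAP_SYS_NICE requirement: */
		param.sched_priority = 1;
		if (sched_setscheduler(0, SCHED_FIFO, &param) < 0) {
			perror("sched_setscheduler");
			return 1;
		}

		printf("running SCHED_FIFO under a 10%% RT-CPU limit\n");
		return 0;
	}

The nonzero-rlimit rule the sketch relies on is exactly what the
sched_setscheduler() hunk below checks before waiving CAP_SYS_NICE.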
%llu\n", + rt_avg = rt_cpu_average(i); + seq_printf(p, "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %lu\n", i, (unsigned long long)cputime64_to_clock_t(user), (unsigned long long)cputime64_to_clock_t(nice), @@ -375,7 +379,8 @@ static int show_stat(struct seq_file *p, (unsigned long long)cputime64_to_clock_t(iowait), (unsigned long long)cputime64_to_clock_t(irq), (unsigned long long)cputime64_to_clock_t(softirq), - (unsigned long long)cputime64_to_clock_t(steal)); + (unsigned long long)cputime64_to_clock_t(steal), + rt_avg); } seq_printf(p, "intr %llu", (unsigned long long)sum); diff -puN include/asm-generic/resource.h~rlimit_rt_cpu include/asm-generic/resource.h --- 25/include/asm-generic/resource.h~rlimit_rt_cpu 2005-01-26 17:17:19.762526384 -0800 +++ 25-akpm/include/asm-generic/resource.h 2005-01-26 17:17:19.778523952 -0800 @@ -21,7 +21,27 @@ #define RLIMIT_SIGPENDING 11 /* max number of pending signals */ #define RLIMIT_MSGQUEUE 12 /* maximum bytes in POSIX mqueues */ -#define RLIM_NLIMITS 13 +/* + * RLIMIT_RT_CPU - the maximum amount of CPU time an RT task + * may use, in percent. Defaults to 80%. + * + * - if there's idle time in the system then RT tasks will be + * allowed to use more than the limit. + * + * - if an RT task goes above the limit all the time then there + * is no guarantee that exactly the limit will be allowed for + * it. (i.e. you should set the limit to somewhat above the real + * needs of the RT task in question.) + * + * - a zero RLIMIT_RT_CPU value means unlimited CPU time to that + * RT task. + * + * - a nonzero rt_cpu_limit value also has the effect of allowing + * the use of RT priorities to nonprivileged users. + */ +#define RLIMIT_RT_CPU 13 + +#define RLIM_NLIMITS 14 #endif /* @@ -53,6 +73,7 @@ [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ [RLIMIT_SIGPENDING] = { MAX_SIGPENDING, MAX_SIGPENDING }, \ [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ + [RLIMIT_RT_CPU] = { 80, 100 }, \ } #endif /* __KERNEL__ */ diff -puN include/linux/sched.h~rlimit_rt_cpu include/linux/sched.h --- 25/include/linux/sched.h~rlimit_rt_cpu 2005-01-26 17:17:19.764526080 -0800 +++ 25-akpm/include/linux/sched.h 2005-01-26 17:17:19.779523800 -0800 @@ -95,6 +95,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); +extern unsigned long rt_cpu_average(int cpu); #include #include diff -puN kernel/sched.c~rlimit_rt_cpu kernel/sched.c --- 25/kernel/sched.c~rlimit_rt_cpu 2005-01-26 17:17:19.766525776 -0800 +++ 25-akpm/kernel/sched.c 2005-01-26 17:17:19.777524104 -0800 @@ -225,6 +225,11 @@ struct runqueue { prio_array_t *active, *expired, arrays[2]; int best_expired_prio; atomic_t nr_iowait; + /* + * Short-term CPU usage history/load-average of all RT tasks + * on this CPU, linearly scaled to between 0...HZ: + */ + unsigned long rt_cpu_avg; #ifdef CONFIG_SMP struct sched_domain *sd; @@ -299,6 +304,37 @@ static DEFINE_PER_CPU(struct runqueue, r # define task_running(rq, p) ((rq)->curr == (p)) #endif +#if MAX_PRIO != 140 +# error update the BITMAP_SCHED_OTHER definitions! 
diff -puN kernel/sched.c~rlimit_rt_cpu kernel/sched.c
--- 25/kernel/sched.c~rlimit_rt_cpu	2005-01-26 17:17:19.766525776 -0800
+++ 25-akpm/kernel/sched.c	2005-01-26 17:17:19.777524104 -0800
@@ -225,6 +225,11 @@ struct runqueue {
 	prio_array_t *active, *expired, arrays[2];
 	int best_expired_prio;
 	atomic_t nr_iowait;
+	/*
+	 * Short-term CPU usage history/load-average of all RT tasks
+	 * on this CPU, linearly scaled to between 0...HZ:
+	 */
+	unsigned long rt_cpu_avg;
 
 #ifdef CONFIG_SMP
 	struct sched_domain *sd;
@@ -299,6 +304,37 @@ static DEFINE_PER_CPU(struct runqueue, r
 # define task_running(rq, p)		((rq)->curr == (p))
 #endif
 
+#if MAX_PRIO != 140
+# error update the BITMAP_SCHED_OTHER definitions!
+#endif
+
+#if (BITS_PER_LONG == 32)
+/*
+ * b[3] is bits 96..127, of which bits 4..31 are SCHED_OTHER: 28 bits
+ * b[4] is bits 128..140, of which bits 0..11 are SCHED_OTHER: 12 bits
+ */
+# define BITMAP_SCHED_OTHER(b)				\
+		((b[3] & 0xfffffff0UL) ||		\
+		 (b[4] & 0x00000fffUL))
+#else
+/*
+ * b[1] is bits 64..127, of which bits 36..63 are SCHED_OTHER: 28 bits
+ * b[2] is bits 128..140, of which bits 0..11 are SCHED_OTHER: 12 bits
+ */
+# define BITMAP_SCHED_OTHER(b)				\
+		((b[1] & 0xfffffff000000000UL) ||	\
+		 (b[2] & 0x0000000000000fffUL))
+#endif
+
+/*
+ * non_rt_tasks() - are there any non-RT tasks running in the runqueue?
+ */
+static inline int non_rt_tasks(runqueue_t *rq)
+{
+	return BITMAP_SCHED_OTHER(rq->active->bitmap) ||
+		BITMAP_SCHED_OTHER(rq->expired->bitmap);
+}
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -770,6 +806,52 @@ static void deactivate_task(struct task_
 	p->array = NULL;
 }
 
+#define WEIGHT (HZ/10)
+
+/*
+ * Here we maintain the ->rt_cpu_avg load-average.
+ *
+ * It is implemented as a decaying average, where ->rt_cpu_avg
+ * moves in the range of 0 ... HZ, and where the 100% -> 5% decay
+ * time (which is HZ-independent) is ~300 msecs:
+ */
+static inline void rt_cpu_avg_inc(runqueue_t *rq)
+{
+	rq->rt_cpu_avg = (rq->rt_cpu_avg*(WEIGHT-1) + HZ)/WEIGHT;
+}
+
+static inline void rt_cpu_avg_dec(runqueue_t *rq)
+{
+	rq->rt_cpu_avg = rq->rt_cpu_avg*(WEIGHT-1)/WEIGHT;
+}
+
+#undef WEIGHT
+
+static int rt_task_over_cpu_limit(struct task_struct *p, runqueue_t *rq)
+{
+	unsigned long limit;
+
+	/*
+	 * An RT task needs delaying if there are any non-RT tasks
+	 * running on this CPU, and if the current RT CPU usage value
+	 * violates the task's RT CPU rlimit:
+	 */
+	if (!rt_task(p) || !non_rt_tasks(rq))
+		return 0;
+
+	limit = p->signal->rlim[RLIMIT_RT_CPU].rlim_cur;
+	if (!limit || (rq->rt_cpu_avg <= limit * HZ / 100))
+		return 0;
+
+	/*
+	 * Delay the task by queueing it into the expired array:
+	 */
+	dequeue_task(p, p->array);
+	enqueue_task(p, rq->expired);
+
+	return 1;
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -1445,6 +1527,16 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
+/*
+ * rt_cpu_average - the total load from RT tasks on this CPU
+ *
+ * returns in units of 0.1%. (i.e. for 51.5% it returns 515)
+ */
+unsigned long rt_cpu_average(int cpu)
+{
+	return cpu_rq(cpu)->rt_cpu_avg * 1000 / HZ;
+}
+
 #ifdef CONFIG_SMP
 
 /*
@@ -2370,6 +2462,7 @@ void scheduler_tick(void)
 	rq->timestamp_last_tick = now;
 
 	if (p == rq->idle) {
+		rt_cpu_avg_dec(rq); /* rq not locked - not a problem */
 		if (wake_priority_sleeper(rq))
 			goto out;
 		rebalance_tick(cpu, rq, SCHED_IDLE);
@@ -2391,6 +2484,17 @@
 	 */
 	if (rt_task(p)) {
 		/*
+		 * Maintain the CPU-average of RT tasks and if there's an
+		 * RT-CPU rlimit for the current task and the limit has
+		 * been violated then put the current task into the
+		 * expired array:
+		 */
+		rt_cpu_avg_inc(rq);
+		if (rt_task_over_cpu_limit(p, rq)) {
+			set_tsk_need_resched(p);
+			goto out_unlock;
+		}
+		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
 		 */
@@ -2404,6 +2508,7 @@
 		}
 		goto out_unlock;
 	}
+	rt_cpu_avg_dec(rq);
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
@@ -2714,6 +2819,7 @@ go_idle:
 			goto go_idle;
 	}
 
+pick_next_task:
 	array = rq->active;
 	if (unlikely(!array->nr_active)) {
 		/*
@@ -2732,6 +2838,13 @@ go_idle:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
+	/*
+	 * If the RT task would violate the RT CPU use rlimit
+	 * then delay the task and pick the next one:
+	 */
+	if (rt_task_over_cpu_limit(next, rq))
+		goto pick_next_task;
+
 	if (!rt_task(next) && next->activated > 0) {
 		unsigned long long delta = now - next->timestamp;
 
@@ -3383,8 +3496,14 @@ recheck:
 	if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
 		return -EINVAL;
 
-	if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
-	    !capable(CAP_SYS_NICE))
+	/*
+	 * If the RT CPU limit has been set then it activates a
+	 * mechanism that makes RT tasks deadlock-safe - thus
+	 * ordinary users may specify RT priorities too:
+	 */
+	if (!p->signal->rlim[RLIMIT_RT_CPU].rlim_cur &&
+	    (policy == SCHED_FIFO || policy == SCHED_RR) &&
+	    !capable(CAP_SYS_NICE))
 		return -EPERM;
 	if ((current->euid != p->euid) && (current->euid != p->uid) &&
 	    !capable(CAP_SYS_NICE))
diff -puN kernel/sys.c~rlimit_rt_cpu kernel/sys.c
--- 25/kernel/sys.c~rlimit_rt_cpu	2005-01-26 17:17:19.768525472 -0800
+++ 25-akpm/kernel/sys.c	2005-01-26 17:17:19.778523952 -0800
@@ -1494,6 +1494,16 @@ asmlinkage long sys_setrlimit(unsigned i
 		return -EPERM;
 	if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > NR_OPEN)
 		return -EPERM;
+	/*
+	 * Special security rule for RT_CPU: if a task sets its rlimit
+	 * back to 0 then drop any RT priority it might have:
+	 */
+	if (resource == RLIMIT_RT_CPU && !capable(CAP_SYS_NICE) &&
+	    (!new_rlim.rlim_cur || !new_rlim.rlim_max)) {
+		struct sched_param param = { .sched_priority = 0 };
+
+		sched_setscheduler(current, SCHED_NORMAL, &param);
+	}
 
 	retval = security_task_setrlimit(resource, &new_rlim);
 	if (retval)
_