From: Rusty Russell

Ingo recently accidentally broke CPU hotplug by enabling preemption around
release_task(), which can be called on the current task if the parent isn't
interested in waiting for it. The problem is that the task can be preempted
and then the CPU can go down: the task is not in the task list any more, and
so it won't get migrated after the CPU goes down. It stays on the down CPU,
which triggers a BUG_ON.

We have had previous problems with tasks releasing themselves: oprofile has a
comment about it, and we had the case of trying to deliver SIGXCPU in the
timer tick to the current task which had called release_task().

This patch shuffles the self-reaping off to finish_task_switch, so there's
never a running task which isn't in the task list, except idle threads.

Signed-off-by: Rusty Russell (authored)
Signed-off-by: Andrew Morton
---

 25-akpm/include/linux/sched.h | 1 +
 25-akpm/kernel/exit.c | 28 +++++++---------------------
 25-akpm/kernel/sched.c | 5 ++++-
 3 files changed, 12 insertions(+), 22 deletions(-)

diff -puN include/linux/sched.h~dont-sleep-after-were-out-of-task-list include/linux/sched.h
--- 25/include/linux/sched.h~dont-sleep-after-were-out-of-task-list 2004-08-17 23:56:04.674166560 -0700
+++ 25-akpm/include/linux/sched.h 2004-08-17 23:56:04.681165496 -0700
@@ -565,6 +565,7 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_STARTING 0x00000002 /* being created */
 #define PF_EXITING 0x00000004 /* getting shut down */
 #define PF_DEAD 0x00000008 /* Dead */
+#define PF_SELFREAP 0x00000010 /* Never a zombie, must be released */
 #define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
 #define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
 #define PF_DUMPCORE 0x00000200 /* dumped core */
diff -puN kernel/exit.c~dont-sleep-after-were-out-of-task-list kernel/exit.c
--- 25/kernel/exit.c~dont-sleep-after-were-out-of-task-list 2004-08-17 23:56:04.675166408 -0700
+++ 25-akpm/kernel/exit.c 2004-08-17 23:56:04.682165344 -0700
@@ -755,8 +755,8 @@ static void 
exit_notify(struct task_stru state = TASK_ZOMBIE; if (tsk->exit_signal == -1 && tsk->ptrace == 0) state = TASK_DEAD; - else - tsk->state = state; + tsk->state = state; + /* * Clear these here so that update_process_times() won't try to deliver * itimer, profile or rlimit signals to this task while it is in late exit. @@ -765,14 +765,6 @@ static void exit_notify(struct task_stru tsk->it_prof_value = 0; tsk->rlim[RLIMIT_CPU].rlim_cur = RLIM_INFINITY; - /* - * Get a reference to it so that we can set the state - * as the last step. The state-setting only matters if the - * current task is releasing itself, to trigger the final - * put_task_struct() in finish_task_switch(). (thread self-reap) - */ - get_task_struct(tsk); - write_unlock_irq(&tasklist_lock); list_for_each_safe(_p, _n, &ptrace_dead) { @@ -781,18 +773,12 @@ static void exit_notify(struct task_stru release_task(t); } - /* If the process is dead, release it - nobody will wait for it */ - if (state == TASK_DEAD) { - release_task(tsk); - write_lock_irq(&tasklist_lock); - tsk->state = state; - _raw_write_unlock(&tasklist_lock); - local_irq_enable(); - } else - preempt_disable(); - + preempt_disable(); + /* PF_DEAD says drop ref after we schedule. */ tsk->flags |= PF_DEAD; - put_task_struct(tsk); + /* PF_SELFREAP says there's no parent to wait4() for us. */ + if (state == TASK_DEAD) + tsk->flags |= PF_SELFREAP; } asmlinkage NORET_TYPE void do_exit(long code) diff -puN kernel/sched.c~dont-sleep-after-were-out-of-task-list kernel/sched.c --- 25/kernel/sched.c~dont-sleep-after-were-out-of-task-list 2004-08-17 23:56:04.677166104 -0700 +++ 25-akpm/kernel/sched.c 2004-08-17 23:56:04.685164888 -0700 @@ -1484,8 +1484,11 @@ static void finish_task_switch(task_t *p finish_arch_switch(rq, prev); if (mm) mmdrop(mm); - if (unlikely(prev_task_flags & PF_DEAD)) + if (unlikely(prev_task_flags & PF_DEAD)) { + if (prev_task_flags & PF_SELFREAP) + release_task(prev); put_task_struct(prev); + } } /** _