From: Ingo Molnar This is the current remove-BKL patch. I test-booted it on x86 and x86_64, trying every conceivable combination of SMP, PREEMPT and PREEMPT_BKL. All other architectures should compile as well. (Most of the testing was done with the zaphod patch undone, but it applies cleanly on vanilla -mm3 as well and should work fine.) This is the debugging-enabled variant of the patch, which has two main debugging features: - debug potentially illegal smp_processor_id() use. This has caught a number of real bugs - e.g. look at the printk.c fix in the patch. - make it possible to enable/disable the BKL via a .config option. If this goes upstream we don't want this of course, but for now it gives people a chance to find out whether any particular problem was caused by this patch. This patch has one important fix over the previous BKL patch: on PREEMPT kernels, if we preempted BKL-using code then the code still auto-dropped the BKL by mistake. This caused a number of breakages for testers, which went away once this bug was fixed. Also, the debugging mechanism has been improved a lot relative to the previous BKL patch. It would be nice to test-drive this in -mm. There will likely be some more smp_processor_id() false positives, but they are 1) harmless and 2) easy to fix up. We may also find more real smp_processor_id()-related breakages. The most noteworthy fact is that no BKL-using code has been found yet that relies on smp_processor_id(), which is promising from a compatibility POV. 
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- 25-akpm/arch/i386/Kconfig | 11 + 25-akpm/arch/i386/kernel/traps.c | 2 25-akpm/arch/i386/lib/delay.c | 2 25-akpm/arch/sh/lib/delay.c | 2 25-akpm/arch/sparc64/lib/delay.c | 2 25-akpm/arch/x86_64/Kconfig | 11 + 25-akpm/arch/x86_64/lib/delay.c | 2 25-akpm/include/asm-i386/smp.h | 2 25-akpm/include/asm-x86_64/smp.h | 2 25-akpm/include/linux/hardirq.h | 8 - 25-akpm/include/linux/smp.h | 21 ++- 25-akpm/include/linux/smp_lock.h | 51 ------- 25-akpm/include/net/route.h | 2 25-akpm/include/net/snmp.h | 14 +- 25-akpm/kernel/module.c | 2 25-akpm/kernel/printk.c | 3 25-akpm/kernel/sched.c | 262 +++++++++++++++++++++++++++++++++++---- 25-akpm/kernel/stop_machine.c | 4 18 files changed, 310 insertions(+), 93 deletions(-) diff -puN arch/i386/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore arch/i386/Kconfig --- 25/arch/i386/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.605814632 -0700 +++ 25-akpm/arch/i386/Kconfig 2004-10-21 14:54:33.632810528 -0700 @@ -513,6 +513,17 @@ config PREEMPT Say Y here if you are building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on PREEMPT + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. 
+ config X86_UP_APIC bool "Local APIC support on uniprocessors" if !SMP depends on !(X86_VISWS || X86_VOYAGER) diff -puN arch/i386/kernel/traps.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/i386/kernel/traps.c --- 25/arch/i386/kernel/traps.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.606814480 -0700 +++ 25-akpm/arch/i386/kernel/traps.c 2004-10-21 14:54:33.633810376 -0700 @@ -339,7 +339,7 @@ void die(const char * str, struct pt_reg }; static int die_counter; - if (die.lock_owner != smp_processor_id()) { + if (die.lock_owner != _smp_processor_id()) { console_verbose(); spin_lock_irq(&die.lock); die.lock_owner = smp_processor_id(); diff -puN arch/i386/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/i386/lib/delay.c --- 25/arch/i386/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.607814328 -0700 +++ 25-akpm/arch/i386/lib/delay.c 2004-10-21 14:54:33.633810376 -0700 @@ -34,7 +34,7 @@ inline void __const_udelay(unsigned long xloops *= 4; __asm__("mull %0" :"=d" (xloops), "=&a" (d0) - :"1" (xloops),"0" (current_cpu_data.loops_per_jiffy * (HZ/4))); + :"1" (xloops),"0" (cpu_data[_smp_processor_id()].loops_per_jiffy * (HZ/4))); __delay(++xloops); } diff -puN arch/sh/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/sh/lib/delay.c --- 25/arch/sh/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.609814024 -0700 +++ 25-akpm/arch/sh/lib/delay.c 2004-10-21 14:54:33.634810224 -0700 @@ -24,7 +24,7 @@ inline void __const_udelay(unsigned long __asm__("dmulu.l %0, %2\n\t" "sts mach, %0" : "=r" (xloops) - : "0" (xloops), "r" (current_cpu_data.loops_per_jiffy) + : "0" (xloops), "r" (cpu_data[_smp_processor_id()].loops_per_jiffy) : "macl", "mach"); __delay(xloops * HZ); } diff -puN arch/sparc64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/sparc64/lib/delay.c --- 25/arch/sparc64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 
14:54:33.610813872 -0700 +++ 25-akpm/arch/sparc64/lib/delay.c 2004-10-21 14:54:33.634810224 -0700 @@ -31,7 +31,7 @@ void __const_udelay(unsigned long n) { n *= 4; - n *= (cpu_data(smp_processor_id()).udelay_val * (HZ/4)); + n *= (cpu_data(_smp_processor_id()).udelay_val * (HZ/4)); n >>= 32; __delay(n + 1); diff -puN arch/x86_64/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore arch/x86_64/Kconfig --- 25/arch/x86_64/Kconfig~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.611813720 -0700 +++ 25-akpm/arch/x86_64/Kconfig 2004-10-21 14:54:33.635810072 -0700 @@ -244,6 +244,17 @@ config PREEMPT Say Y here if you are feeling brave and building a kernel for a desktop, embedded or real-time system. Say N if you are unsure. +config PREEMPT_BKL + bool "Preempt The Big Kernel Lock" + depends on PREEMPT + default y + help + This option reduces the latency of the kernel by making the + big kernel lock preemptible. + + Say Y here if you are building a kernel for a desktop system. + Say N if you are unsure. 
+ config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" depends on SMP diff -puN arch/x86_64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore arch/x86_64/lib/delay.c --- 25/arch/x86_64/lib/delay.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.613813416 -0700 +++ 25-akpm/arch/x86_64/lib/delay.c 2004-10-21 14:54:33.635810072 -0700 @@ -34,7 +34,7 @@ void __delay(unsigned long loops) inline void __const_udelay(unsigned long xloops) { - __delay(((xloops * current_cpu_data.loops_per_jiffy) >> 32) * HZ); + __delay(((xloops * cpu_data[_smp_processor_id()].loops_per_jiffy) >> 32) * HZ); } void __udelay(unsigned long usecs) diff -puN include/asm-i386/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/asm-i386/smp.h --- 25/include/asm-i386/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.614813264 -0700 +++ 25-akpm/include/asm-i386/smp.h 2004-10-21 14:54:33.635810072 -0700 @@ -50,7 +50,7 @@ extern u8 x86_cpu_to_apicid[]; * from the initial startup. We map APIC_BASE very early in page_setup(), * so this is correct in the x86 case. 
*/ -#define smp_processor_id() (current_thread_info()->cpu) +#define __smp_processor_id() (current_thread_info()->cpu) extern cpumask_t cpu_callout_map; #define cpu_possible_map cpu_callout_map diff -puN include/asm-x86_64/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/asm-x86_64/smp.h --- 25/include/asm-x86_64/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.615813112 -0700 +++ 25-akpm/include/asm-x86_64/smp.h 2004-10-21 14:54:33.636809920 -0700 @@ -66,7 +66,7 @@ static inline int num_booting_cpus(void) return cpus_weight(cpu_callout_map); } -#define smp_processor_id() read_pda(cpunumber) +#define __smp_processor_id() read_pda(cpunumber) extern __inline int hard_smp_processor_id(void) { diff -puN include/linux/hardirq.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/hardirq.h --- 25/include/linux/hardirq.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.616812960 -0700 +++ 25-akpm/include/linux/hardirq.h 2004-10-21 14:54:33.636809920 -0700 @@ -61,12 +61,16 @@ #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) # define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked()) +#else +# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) +#endif + +#ifdef CONFIG_PREEMPT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else -# define in_atomic() ((preempt_count() & ~PREEMPT_ACTIVE) != 0) # define preemptible() 0 # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET #endif diff -puN include/linux/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/smp.h --- 25/include/linux/smp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.618812656 -0700 +++ 25-akpm/include/linux/smp.h 2004-10-21 14:54:33.636809920 -0700 @@ -95,8 +95,10 @@ void smp_prepare_boot_cpu(void); /* * These macros fold the SMP 
functionality into a single CPU system */ - -#define smp_processor_id() 0 + +#if !defined(__smp_processor_id) || !defined(CONFIG_PREEMPT) +# define smp_processor_id() 0 +#endif #define hard_smp_processor_id() 0 #define smp_threads_ready 1 #define smp_call_function(func,info,retry,wait) ({ 0; }) @@ -107,6 +109,21 @@ static inline void smp_send_reschedule(i #endif /* !SMP */ +#ifdef __smp_processor_id +# ifdef CONFIG_PREEMPT + /* + * temporary debugging check detecting places that use + * smp_processor_id() in a potentially unsafe way: + */ + extern unsigned int smp_processor_id(void); +# else +# define smp_processor_id() __smp_processor_id() +# endif +# define _smp_processor_id() __smp_processor_id() +#else +# define _smp_processor_id() smp_processor_id() +#endif + #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() #define put_cpu_no_resched() preempt_enable_no_resched() diff -puN include/linux/smp_lock.h~remove-the-bkl-by-turning-it-into-a-semaphore include/linux/smp_lock.h --- 25/include/linux/smp_lock.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.619812504 -0700 +++ 25-akpm/include/linux/smp_lock.h 2004-10-21 14:54:33.637809768 -0700 @@ -7,59 +7,14 @@ #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) -extern spinlock_t kernel_flag; - -#define kernel_locked() (current->lock_depth >= 0) - -#define get_kernel_lock() spin_lock(&kernel_flag) -#define put_kernel_lock() spin_unlock(&kernel_flag) - -/* - * Release global kernel lock. - */ -static inline void release_kernel_lock(struct task_struct *task) -{ - if (unlikely(task->lock_depth >= 0)) - put_kernel_lock(); -} - -/* - * Re-acquire the kernel lock - */ -static inline void reacquire_kernel_lock(struct task_struct *task) -{ - if (unlikely(task->lock_depth >= 0)) - get_kernel_lock(); -} - -/* - * Getting the big kernel lock. - * - * This cannot happen asynchronously, - * so we only need to worry about other - * CPU's. 
- */ -static inline void lock_kernel(void) -{ - int depth = current->lock_depth+1; - if (likely(!depth)) - get_kernel_lock(); - current->lock_depth = depth; -} - -static inline void unlock_kernel(void) -{ - BUG_ON(current->lock_depth < 0); - if (likely(--current->lock_depth < 0)) - put_kernel_lock(); -} +extern int kernel_locked(void); +extern void lock_kernel(void); +extern void unlock_kernel(void); #else #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) -#define release_kernel_lock(task) do { } while(0) -#define reacquire_kernel_lock(task) do { } while(0) #define kernel_locked() 1 #endif /* CONFIG_SMP || CONFIG_PREEMPT */ diff -puN include/net/route.h~remove-the-bkl-by-turning-it-into-a-semaphore include/net/route.h --- 25/include/net/route.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.620812352 -0700 +++ 25-akpm/include/net/route.h 2004-10-21 14:54:33.637809768 -0700 @@ -105,7 +105,7 @@ struct rt_cache_stat extern struct rt_cache_stat *rt_cache_stat; #define RT_CACHE_STAT_INC(field) \ - (per_cpu_ptr(rt_cache_stat, smp_processor_id())->field++) + (per_cpu_ptr(rt_cache_stat, _smp_processor_id())->field++) extern struct ip_rt_acct *ip_rt_acct; diff -puN include/net/snmp.h~remove-the-bkl-by-turning-it-into-a-semaphore include/net/snmp.h --- 25/include/net/snmp.h~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.622812048 -0700 +++ 25-akpm/include/net/snmp.h 2004-10-21 14:54:33.638809616 -0700 @@ -128,18 +128,18 @@ struct linux_mib { #define SNMP_STAT_USRPTR(name) (name[1]) #define SNMP_INC_STATS_BH(mib, field) \ - (per_cpu_ptr(mib[0], smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field]++) #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \ - (per_cpu_ptr(mib[0], smp_processor_id())->mibs[field + (offset)]++) + (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field + (offset)]++) #define SNMP_INC_STATS_USER(mib, field) \ - (per_cpu_ptr(mib[1], 
smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field]++) #define SNMP_INC_STATS(mib, field) \ - (per_cpu_ptr(mib[!in_softirq()], smp_processor_id())->mibs[field]++) + (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]++) #define SNMP_DEC_STATS(mib, field) \ - (per_cpu_ptr(mib[!in_softirq()], smp_processor_id())->mibs[field]--) + (per_cpu_ptr(mib[!in_softirq()], _smp_processor_id())->mibs[field]--) #define SNMP_ADD_STATS_BH(mib, field, addend) \ - (per_cpu_ptr(mib[0], smp_processor_id())->mibs[field] += addend) + (per_cpu_ptr(mib[0], _smp_processor_id())->mibs[field] += addend) #define SNMP_ADD_STATS_USER(mib, field, addend) \ - (per_cpu_ptr(mib[1], smp_processor_id())->mibs[field] += addend) + (per_cpu_ptr(mib[1], _smp_processor_id())->mibs[field] += addend) #endif diff -puN kernel/module.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/module.c --- 25/kernel/module.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.623811896 -0700 +++ 25-akpm/kernel/module.c 2004-10-21 14:54:33.639809464 -0700 @@ -395,7 +395,7 @@ static void module_unload_init(struct mo for (i = 0; i < NR_CPUS; i++) local_set(&mod->ref[i].count, 0); /* Hold reference count during initialization. */ - local_set(&mod->ref[smp_processor_id()].count, 1); + local_set(&mod->ref[_smp_processor_id()].count, 1); /* Backwards compatibility macros put refcount during init. 
*/ mod->waiter = current; } diff -puN kernel/printk.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/printk.c --- 25/kernel/printk.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.625811592 -0700 +++ 25-akpm/kernel/printk.c 2004-10-21 14:54:33.640809312 -0700 @@ -645,8 +645,9 @@ void release_console_sem(void) _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock_irqrestore(&logbuf_lock, flags); + spin_unlock(&logbuf_lock); call_console_drivers(_con_start, _log_end); + local_irq_restore(flags); } console_locked = 0; console_may_schedule = 0; diff -puN kernel/sched.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/sched.c --- 25/kernel/sched.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.626811440 -0700 +++ 25-akpm/kernel/sched.c 2004-10-21 14:54:33.645808552 -0700 @@ -2479,6 +2479,219 @@ static inline int dependent_sleeper(int } #endif +#if defined(CONFIG_PREEMPT) && defined(__smp_processor_id) +/* + * Debugging check. 
+ */ +unsigned int smp_processor_id(void) +{ + unsigned long preempt_count = preempt_count(); + int this_cpu = __smp_processor_id(); + cpumask_t this_mask; + + if (likely(preempt_count)) + goto out; + + if (irqs_disabled()) + goto out; + + /* + * Kernel threads bound to a single CPU can safely use + * smp_processor_id(): + */ + this_mask = cpumask_of_cpu(this_cpu); + + if (cpus_equal(current->cpus_allowed, this_mask)) + goto out; + + /* + * It is valid to assume CPU-locality during early bootup: + */ + if (system_state != SYSTEM_RUNNING) + goto out; + + /* + * Avoid recursion: + */ + preempt_disable(); + + if (!printk_ratelimit()) + goto out_enable; + + printk(KERN_ERR "using smp_processor_id() in preemptible code: %s/%d\n", + current->comm, current->pid); + dump_stack(); + +out_enable: + preempt_enable_no_resched(); +out: + return this_cpu; +} + +EXPORT_SYMBOL(smp_processor_id); + +#endif + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) + +#ifdef CONFIG_PREEMPT_BKL +/* + * The 'big kernel semaphore' + * + * This mutex is taken and released recursively by lock_kernel() + * and unlock_kernel(). It is transparently dropped and reaquired + * over schedule(). It is used to protect legacy code that hasn't + * been migrated to a proper locking design yet. + * + * Note: code locked by this semaphore will only be serialized against + * other code using the same locking facility. The code guarantees that + * the task remains on the same CPU. + * + * Don't use in new code. + */ +static __cacheline_aligned_in_smp DECLARE_MUTEX(kernel_sem); + +int kernel_locked(void) +{ + return current->lock_depth >= 0; +} + +EXPORT_SYMBOL(kernel_locked); + +/* + * Release global kernel semaphore: + */ +static inline void release_kernel_sem(struct task_struct *task) +{ + if (unlikely(task->lock_depth >= 0)) + up(&kernel_sem); +} + +/* + * Re-acquire the kernel semaphore. + * + * This function is called with preemption off. 
+ * + * We are executing in schedule() so the code must be extremely careful + * about recursion, both due to the down() and due to the enabling of + * preemption. schedule() will re-check the preemption flag after + * reacquiring the semaphore. + */ +static inline void reacquire_kernel_sem(struct task_struct *task) +{ + int saved_lock_depth = task->lock_depth; + + if (likely(saved_lock_depth < 0)) + return; + + task->lock_depth = -1; + preempt_enable_no_resched(); + + down(&kernel_sem); + + preempt_disable(); + task->lock_depth = saved_lock_depth; +} + +/* + * Getting the big kernel semaphore. + */ +void lock_kernel(void) +{ + struct task_struct *task = current; + int depth = task->lock_depth + 1; + + if (likely(!depth)) + /* + * No recursion worries - we set up lock_depth _after_ + */ + down(&kernel_sem); + + task->lock_depth = depth; +} + +EXPORT_SYMBOL(lock_kernel); + +void unlock_kernel(void) +{ + struct task_struct *task = current; + + BUG_ON(task->lock_depth < 0); + + if (likely(--task->lock_depth < 0)) + up(&kernel_sem); +} + +EXPORT_SYMBOL(unlock_kernel); + +#else + +static spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED; + +int kernel_locked(void) +{ + return current->lock_depth >= 0; +} + +EXPORT_SYMBOL(kernel_locked); + +#define get_kernel_lock() spin_lock(&kernel_flag) +#define put_kernel_lock() spin_unlock(&kernel_flag) + +/* + * Release global kernel lock. + */ +static inline void release_kernel_sem(struct task_struct *task) +{ + if (unlikely(task->lock_depth >= 0)) + put_kernel_lock(); +} + +/* + * Re-acquire the kernel lock + */ +static inline void reacquire_kernel_sem(struct task_struct *task) +{ + if (unlikely(task->lock_depth >= 0)) + get_kernel_lock(); +} + +/* + * Getting the big kernel lock. + * + * This cannot happen asynchronously, + * so we only need to worry about other + * CPU's. 
+ */ +void lock_kernel(void) +{ + int depth = current->lock_depth+1; + if (likely(!depth)) + get_kernel_lock(); + current->lock_depth = depth; +} + +EXPORT_SYMBOL(lock_kernel); + +void unlock_kernel(void) +{ + BUG_ON(current->lock_depth < 0); + if (likely(--current->lock_depth < 0)) + put_kernel_lock(); +} + +EXPORT_SYMBOL(unlock_kernel); + +#endif + +#else + +static inline void release_kernel_sem(struct task_struct *task) { } +static inline void reacquire_kernel_sem(struct task_struct *task) { } + +#endif + + /* * schedule() is the main scheduler function. */ @@ -2522,7 +2735,7 @@ need_resched: dump_stack(); } - release_kernel_lock(prev); + release_kernel_sem(prev); schedstat_inc(rq, sched_cnt); now = sched_clock(); if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) @@ -2645,7 +2858,7 @@ switch_tasks: } else spin_unlock_irq(&rq->lock); - reacquire_kernel_lock(current); + reacquire_kernel_sem(current); preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -2662,6 +2875,12 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); +#ifdef CONFIG_PREEMPT_BKL + struct task_struct *task = current; + int saved_lock_depth; +#endif + + /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -2671,9 +2890,21 @@ asmlinkage void __sched preempt_schedule return; need_resched: - ti->preempt_count = PREEMPT_ACTIVE; + preempt_count() += PREEMPT_ACTIVE; + /* + * We keep the big kernel semaphore locked, but we + * clear ->lock_depth so that schedule() doesnt + * auto-release the semaphore: + */ +#ifdef CONFIG_PREEMPT_BKL + saved_lock_depth = task->lock_depth; + task->lock_depth = -1; +#endif schedule(); - ti->preempt_count = 0; +#ifdef CONFIG_PREEMPT_BKL + task->lock_depth = saved_lock_depth; +#endif + preempt_count() -= PREEMPT_ACTIVE; /* we could miss a preemption opportunity between schedule and now */ barrier(); @@ -3411,6 +3642,8 @@ asmlinkage 
long sys_sched_yield(void) static inline void __cond_resched(void) { + if (preempt_count() & PREEMPT_ACTIVE) + return; do { preempt_count() += PREEMPT_ACTIVE; schedule(); @@ -3498,7 +3731,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = this_rq(); + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3509,7 +3742,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = this_rq(); + struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); @@ -3718,7 +3951,7 @@ void __devinit init_idle(task_t *idle, i spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ -#ifdef CONFIG_PREEMPT +#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) idle->thread_info->preempt_count = (idle->lock_depth >= 0); #else idle->thread_info->preempt_count = 0; @@ -4123,21 +4356,6 @@ int __init migration_init(void) } #endif -/* - * The 'big kernel lock' - * - * This spinlock is taken and released recursively by lock_kernel() - * and unlock_kernel(). It is transparently dropped and reaquired - * over schedule(). It is used to protect legacy code that hasn't - * been migrated to a proper locking design yet. - * - * Don't use in new code. - * - * Note: spinlock debugging needs this even on !CONFIG_SMP. - */ -spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -EXPORT_SYMBOL(kernel_flag); - #ifdef CONFIG_SMP /* * Attach the domain 'sd' to 'cpu' as its base domain. 
Callers must diff -puN kernel/stop_machine.c~remove-the-bkl-by-turning-it-into-a-semaphore kernel/stop_machine.c --- 25/kernel/stop_machine.c~remove-the-bkl-by-turning-it-into-a-semaphore 2004-10-21 14:54:33.628811136 -0700 +++ 25-akpm/kernel/stop_machine.c 2004-10-21 14:54:33.646808400 -0700 @@ -90,7 +90,7 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { - if (i == smp_processor_id()) + if (i == _smp_processor_id()) continue; ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); if (ret < 0) @@ -172,7 +172,7 @@ struct task_struct *__stop_machine_run(i /* If they don't care which CPU fn runs on, bind to any online one. */ if (cpu == NR_CPUS) - cpu = smp_processor_id(); + cpu = _smp_processor_id(); p = kthread_create(do_stop, &smdata, "kstopmachine"); if (!IS_ERR(p)) { _