From 6d83365a4baf2b72119fbc7cb8769ec35591bb0c Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Fri, 3 Jul 2009 08:29:19 -0500
Subject: [PATCH] revert preempt BKL revert

revert commit 37ffffaf131b6620d27af5a1477f6db507718018 in tip.

[ basically, this is the -R of 8e3e076c5a78519a9f64cd384e8f18bc21882ce0 ]

While we understand that a preemptible BKL is not a brilliant idea in the
first place, we do not have the developer capacity to fix all the BKL
leftovers right away. For PREEMPT-RT we rely on the preemptible BKL for
now. We are still looking into removing the BKL completely; it is high on
our todo list.

Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
Signed-off-by: Paul Gortmaker
---
 include/linux/hardirq.h |   18 +++----
 kernel/sched.c          |   27 ++++++++++--
 lib/kernel_lock.c       |  109 +++++++++++++---------------------------------
 3 files changed, 62 insertions(+), 92 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d5b3876..5d79504 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -92,14 +92,6 @@
  */
 #define in_nmi()	(preempt_count() & NMI_MASK)
 
-#if defined(CONFIG_PREEMPT)
-# define PREEMPT_INATOMIC_BASE kernel_locked()
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_INATOMIC_BASE 0
-# define PREEMPT_CHECK_OFFSET 0
-#endif
-
 /*
  * Are we running in atomic context? WARNING: this macro cannot
  * always detect atomic context; in particular, it cannot know about
@@ -107,11 +99,17 @@
  * used in the general case to determine whether sleeping is possible.
  * Do not use in_atomic() in driver code.
  */
-#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_INATOMIC_BASE)
+#define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
+
+#ifdef CONFIG_PREEMPT
+# define PREEMPT_CHECK_OFFSET 1
+#else
+# define PREEMPT_CHECK_OFFSET 0
+#endif
 
 /*
  * Check whether we were atomic before we did preempt_disable():
- * (used by the scheduler, *after* releasing the kernel lock)
+ * (used by the scheduler)
  */
 #define in_atomic_preempt_off() \
 		((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
diff --git a/kernel/sched.c b/kernel/sched.c
index 22c233e..1c38248 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3856,6 +3856,8 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
+	struct task_struct *task = current;
+	int saved_lock_depth;
 
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -3866,7 +3868,16 @@ asmlinkage void __sched preempt_schedule(void)
 
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
+
+		/*
+		 * We keep the big kernel semaphore locked, but we
+		 * clear ->lock_depth so that schedule() doesnt
+		 * auto-release the semaphore:
+		 */
+		saved_lock_depth = task->lock_depth;
+		task->lock_depth = -1;
 		schedule();
+		task->lock_depth = saved_lock_depth;
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -3887,15 +3898,26 @@ EXPORT_SYMBOL(preempt_schedule);
 asmlinkage void __sched preempt_schedule_irq(void)
 {
 	struct thread_info *ti = current_thread_info();
+	struct task_struct *task = current;
+	int saved_lock_depth;
 
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
 	do {
 		add_preempt_count(PREEMPT_ACTIVE);
+
+		/*
+		 * We keep the big kernel semaphore locked, but we
+		 * clear ->lock_depth so that schedule() doesnt
+		 * auto-release the semaphore:
+		 */
+		saved_lock_depth = task->lock_depth;
+		task->lock_depth = -1;
 		local_irq_enable();
 		schedule();
 		local_irq_disable();
+		task->lock_depth = saved_lock_depth;
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 		/*
@@ -5281,11 +5303,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
-	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
 	task_thread_info(idle)->preempt_count = 0;
-#endif
+
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 5354922..2c9b548 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -14,107 +14,56 @@
 #include
 
 /*
- * The 'big kernel lock'
+ * The 'big kernel semaphore'
  *
- * This spinlock is taken and released recursively by lock_kernel()
+ * This mutex is taken and released recursively by lock_kernel()
  * and unlock_kernel(). It is transparently dropped and reacquired
  * over schedule(). It is used to protect legacy code that hasn't
  * been migrated to a proper locking design yet.
  *
+ * Note: code locked by this semaphore will only be serialized against
+ * other code using the same locking facility. The code guarantees that
+ * the task remains on the same CPU.
+ *
  * Don't use in new code.
  */
-static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(kernel_flag);
-
+static struct semaphore kernel_sem;
 
 /*
- * Acquire/release the underlying lock from the scheduler.
+ * Re-acquire the kernel semaphore.
  *
- * This is called with preemption disabled, and should
- * return an error value if it cannot get the lock and
- * TIF_NEED_RESCHED gets set.
+ * This function is called with preemption off.
  *
- * If it successfully gets the lock, it should increment
- * the preemption count like any spinlock does.
- *
- * (This works on UP too - do_raw_spin_trylock will never
- * return false in that case)
+ * We are executing in schedule() so the code must be extremely careful
+ * about recursion, both due to the down() and due to the enabling of
+ * preemption. schedule() will re-check the preemption flag after
+ * reacquiring the semaphore.
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
-	while (!do_raw_spin_trylock(&kernel_flag)) {
-		if (need_resched())
-			return -EAGAIN;
-		cpu_relax();
-	}
-	preempt_disable();
-	return 0;
-}
+	int saved_lock_depth = current->lock_depth;
 
-void __lockfunc __release_kernel_lock(void)
-{
-	do_raw_spin_unlock(&kernel_flag);
-	__preempt_enable_no_resched();
-}
+	BUG_ON(saved_lock_depth < 0);
 
-/*
- * These are the BKL spinlocks - we try to be polite about preemption.
- * If SMP is not on (ie UP preemption), this all goes away because the
- * do_raw_spin_trylock() will always succeed.
- */
-#ifdef CONFIG_PREEMPT
-static inline void __lock_kernel(void)
-{
-	preempt_disable();
-	if (unlikely(!do_raw_spin_trylock(&kernel_flag))) {
-		/*
-		 * If preemption was disabled even before this
-		 * was called, there's nothing we can be polite
-		 * about - just spin.
-		 */
-		if (preempt_count() > 1) {
-			do_raw_spin_lock(&kernel_flag);
-			return;
-		}
+	current->lock_depth = -1;
+	local_irq_enable();
 
-		/*
-		 * Otherwise, let's wait for the kernel lock
-		 * with preemption enabled..
-		 */
-		do {
-			preempt_enable();
-			while (raw_spin_is_locked(&kernel_flag))
-				cpu_relax();
-			preempt_disable();
-		} while (!do_raw_spin_trylock(&kernel_flag));
-	}
-}
+	down(&kernel_sem);
 
-#else
+	preempt_disable();
+	local_irq_disable();
+	current->lock_depth = saved_lock_depth;
 
-/*
- * Non-preemption case - just get the spinlock
- */
-static inline void __lock_kernel(void)
-{
-	do_raw_spin_lock(&kernel_flag);
+	return 0;
 }
-#endif
 
-static inline void __unlock_kernel(void)
+void __lockfunc __release_kernel_lock(void)
 {
-	/*
-	 * the BKL is not covered by lockdep, so we open-code the
-	 * unlocking sequence (and thus avoid the dep-chain ops):
-	 */
-	do_raw_spin_unlock(&kernel_flag);
-	preempt_enable();
+	up(&kernel_sem);
 }
 
 /*
- * Getting the big kernel lock.
- *
- * This cannot happen asynchronously, so we only need to
- * worry about other CPU's.
+ * Getting the big kernel semaphore.
  */
 void __lockfunc _lock_kernel(const char *func, const char *file, int line)
 {
@@ -124,7 +73,10 @@ void __lockfunc _lock_kernel(const char *func, const char *file, int line)
 
 	if (likely(!depth)) {
 		might_sleep();
-		__lock_kernel();
+		/*
+		 * No recursion worries - we set up lock_depth _after_
+		 */
+		down(&kernel_sem);
 	}
 	current->lock_depth = depth;
 }
@@ -132,8 +84,9 @@ void __lockfunc _lock_kernel(const char *func, const char *file, int line)
 void __lockfunc _unlock_kernel(const char *func, const char *file, int line)
 {
 	BUG_ON(current->lock_depth < 0);
+
 	if (likely(--current->lock_depth < 0))
-		__unlock_kernel();
+		up(&kernel_sem);
 
 	trace_unlock_kernel(func, file, line);
 }
-- 
1.7.0.4
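
[ Editor's note, not part of the patch above: the locking convention this
  revert restores is that only the outermost lock_kernel() actually takes
  kernel_sem, while nested calls merely bump current->lock_depth, and
  unlock_kernel() releases the semaphore only once the depth drops below
  zero again. The sketch below models that convention in plain userspace C,
  with a pthread mutex standing in for kernel_sem and a thread-local
  counter standing in for current->lock_depth; the names bkl_sem and
  lock_depth and the whole harness are illustrative assumptions, not
  kernel code. ]

/*
 * Userspace model of the recursive lock_kernel()/unlock_kernel()
 * convention: bkl_sem plays the role of kernel_sem, lock_depth plays
 * the role of current->lock_depth (-1 means "not held").
 */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t bkl_sem = PTHREAD_MUTEX_INITIALIZER;
static __thread int lock_depth = -1;

static void lock_kernel(void)
{
	int depth = lock_depth + 1;

	if (depth == 0)			/* outermost call takes the lock */
		pthread_mutex_lock(&bkl_sem);
	lock_depth = depth;		/* nested calls only record the depth */
}

static void unlock_kernel(void)
{
	assert(lock_depth >= 0);
	if (--lock_depth < 0)		/* outermost unlock releases it */
		pthread_mutex_unlock(&bkl_sem);
}

int main(void)
{
	lock_kernel();
	lock_kernel();			/* nested: depth goes to 1 */
	printf("depth while nested: %d\n", lock_depth);
	unlock_kernel();
	unlock_kernel();		/* depth drops to -1: mutex released */
	printf("depth after release: %d\n", lock_depth);
	return 0;
}

[ Built with "cc -pthread", the two printf calls report 1 and -1,
  mirroring how _unlock_kernel() in the patch only calls up(&kernel_sem)
  once the depth has dropped below zero. ]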