
From: Ingo Molnar <mingo@elte.hu>

While looking at HT scheduler bug reports and boot failures I discovered a bad
assumption in most of the HT scheduling code: that resched_task() can be
called without holding that task's runqueue lock.

This is most definitely not valid - doing it without locking can lead to the
task on that CPU exiting, and to this CPU corrupting the (ex-)task's
thread_info struct.  It can also lead to HT-wakeup races with task switching
on that other CPU (this_CPU marking the wrong task on that_CPU as
need_resched - resulting in e.g. idle wakeups not working).

The attached patch fixes it all up. Changes:

- resched_task() needs to touch the task, so the runqueue lock of that task's
  CPU must be held: resched_task() now enforces this rule.

- wake_priority_sleeper() was called without holding the runqueue lock.

- wake_sleeping_dependent() needs to hold the runqueue locks of all siblings
  (typically 2).  The effects of this ripple back into schedule() as well - in
  the non-SMT case the code gets compiled out, so it's fine.  (A minimal
  lock-ordering sketch follows this list.)

- dependent_sleeper() needs the runqueue locks too - and it's slightly harder
  because it wants to know the 'next task' info, which might change during the
  lock drop/reacquire.  Ripple effect on schedule() => compiled out on
  non-SMT, so fine.

- resched_task() was disabling preemption for no good reason - all paths
  that called this function had either a spinlock held or irqs disabled.
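
Purely for illustration and not part of the patch: below is a minimal
user-space sketch (the names toy_rq and dependent_check are made up, and
pthread mutexes stand in for the runqueue locks) of the locking rule the two
functions now follow - drop our own lock, take all locks in a fixed
(CPU-index) order so two CPUs can never deadlock against each other, and
revalidate anything that may have changed while we were unlocked.

/* Illustrative only - this is NOT kernel code and not part of the patch. */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 2

struct toy_rq {				/* hypothetical stand-in for runqueue_t */
	pthread_mutex_t lock;
	int nr_running;
};

static struct toy_rq rq[NR_CPUS] = {
	{ PTHREAD_MUTEX_INITIALIZER, 1 },
	{ PTHREAD_MUTEX_INITIALIZER, 0 },
};

/*
 * Called with rq[this_cpu].lock held.  Drop it, take every lock in fixed
 * (CPU-index) order, recheck state that may have changed while we were
 * unlocked, then return with only our own lock still held.
 */
static int dependent_check(int this_cpu)
{
	int i, still_has_work;

	pthread_mutex_unlock(&rq[this_cpu].lock);

	for (i = 0; i < NR_CPUS; i++)
		pthread_mutex_lock(&rq[i].lock);

	/* re-establish state - it might have changed during the unlocked window */
	still_has_work = rq[this_cpu].nr_running > 0;

	for (i = 0; i < NR_CPUS; i++)
		if (i != this_cpu)
			pthread_mutex_unlock(&rq[i].lock);

	return still_has_work;
}

int main(void)
{
	pthread_mutex_lock(&rq[0].lock);
	printf("cpu0 still has work: %d\n", dependent_check(0));
	pthread_mutex_unlock(&rq[0].lock);
	return 0;
}

Build with e.g. 'gcc -pthread'.  The shape - unlock, lock all in order,
recheck state, exit with our own lock still held - is the same one the patch
uses in wake_sleeping_dependent() and dependent_sleeper().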

Compiled & booted on x86 SMP and UP, with and without SMT.  Booted the SMT
kernel on a real SMP+HT box as well.  (The unpatched kernel wouldn't even boot
with the new resched_task() assert in place.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/kernel/sched.c |  112 ++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 84 insertions(+), 28 deletions(-)

diff -puN kernel/sched.c~sched-smt-fixes kernel/sched.c
--- 25/kernel/sched.c~sched-smt-fixes	Fri Aug 20 14:50:54 2004
+++ 25-akpm/kernel/sched.c	Fri Aug 20 14:50:54 2004
@@ -917,7 +917,8 @@ static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
-	preempt_disable();
+	BUG_ON(!spin_is_locked(&task_rq(p)->lock));
+
 	/* minimise the chance of sending an interrupt to poll_idle() */
 	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
@@ -925,7 +926,6 @@ static void resched_task(task_t *p)
 
 	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 		smp_send_reschedule(task_cpu(p));
-	preempt_enable();
 }
 #else
 static inline void resched_task(task_t *p)
@@ -2403,8 +2403,10 @@ void scheduler_tick(int user_ticks, int 
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
+		spin_lock(&rq->lock);
 		if (wake_priority_sleeper(rq))
-			goto out;
+			goto out_unlock;
+		spin_unlock(&rq->lock);
 		rebalance_tick(cpu, rq, IDLE);
 		return;
 	}
@@ -2493,23 +2495,34 @@ out:
 }
 
 #ifdef CONFIG_SCHED_SMT
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-	int i;
-	struct sched_domain *sd = rq->sd;
+	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
+	int i;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return;
 
+	/*
+	 * Unlock the current runqueue because we have to lock in
+	 * CPU order to avoid deadlocks. Caller knows that we might
+	 * unlock. We keep IRQs disabled.
+	 */
+	spin_unlock(&this_rq->lock);
+
 	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq;
 
-		if (i == cpu)
-			continue;
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	/*
+	 * We clear this CPU from the mask. This both simplifies the
+	 * inner loop and keeps this_rq locked when we exit:
+	 */
+	cpu_clear(this_cpu, sibling_map);
 
-		smt_rq = cpu_rq(i);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
 
 		/*
 		 * If an SMT sibling task is sleeping due to priority
@@ -2518,27 +2531,53 @@ static inline void wake_sleeping_depende
 		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
 			resched_task(smt_rq->idle);
 	}
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	/*
+	 * We exit with this_cpu's rq still held and IRQs
+	 * still disabled:
+	 */
 }
 
-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = rq->sd;
+	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
+	prio_array_t *array;
 	int ret = 0, i;
+	task_t *p;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
 
+	/*
+	 * The same locking rules and details apply as for
+	 * wake_sleeping_dependent():
+	 */
+	spin_unlock(&this_rq->lock);
 	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq;
-		task_t *smt_curr;
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	cpu_clear(this_cpu, sibling_map);
+
+	/*
+	 * Establish next task to be run - it might have gone away because
+	 * we released the runqueue lock above:
+	 */
+	if (!this_rq->nr_running)
+		goto out_unlock;
+	array = this_rq->active;
+	if (!array->nr_active)
+		array = this_rq->expired;
+	BUG_ON(!array->nr_active);
 
-		if (i == cpu)
-			continue;
+	p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next,
+		task_t, run_list);
 
-		smt_rq = cpu_rq(i);
-		smt_curr = smt_rq->curr;
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+		task_t *smt_curr = smt_rq->curr;
 
 		/*
 		 * If a user task with lower static priority than the
@@ -2564,14 +2603,17 @@ static inline int dependent_sleeper(int 
 			(smt_curr == smt_rq->idle && smt_rq->nr_running))
 				resched_task(smt_curr);
 	}
+out_unlock:
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 }
 
-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	return 0;
 }
@@ -2651,13 +2693,33 @@ need_resched:
 
 	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
+go_idle:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
 			rq->expired_timestamp = 0;
 			wake_sleeping_dependent(cpu, rq);
+			/*
+			 * wake_sleeping_dependent() might have released
+			 * the runqueue, so break out if we got new
+			 * tasks meanwhile:
+			 */
+			if (!rq->nr_running)
+				goto switch_tasks;
+		}
+	} else {
+		if (dependent_sleeper(cpu, rq)) {
+			schedstat_inc(rq, sched_goidle);
+			next = rq->idle;
 			goto switch_tasks;
 		}
+		/*
+		 * dependent_sleeper() releases and reacquires the runqueue
+		 * lock, hence go into the idle loop if the rq went
+		 * empty meanwhile:
+		 */
+		if (unlikely(!rq->nr_running))
+			goto go_idle;
 	}
 
 	array = rq->active;
@@ -2678,12 +2740,6 @@ need_resched:
 	queue = array->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
-	if (dependent_sleeper(cpu, rq, next)) {
-		schedstat_inc(rq, sched_goidle);
-		next = rq->idle;
-		goto switch_tasks;
-	}
-
 	if (!rt_task(next) && next->activated > 0) {
 		unsigned long long delta = now - next->timestamp;
 
_
