
From: Andi Kleen <ak@suse.de>

Fix SMP race in NMI watchdog on i386/x86-64

Fix a long-standing SMP setup race in the NMI watchdog.  The watchdog would
tick from very early on and check whether all CPUs were increasing their timer
interrupt counts.  For that it would check cpu_online_map.  If a CPU took too
long to boot, the watchdog would trigger prematurely because that CPU had not
yet increased its timer count.

The fix is to check cpu_callin_map instead of cpu_online_map, because the
former is only set once a CPU has started its timer interrupt.

I fixed it on i386 and x86-64. 

Description of the problem from Manpreet Singh. Thanks.

Cc: <manpreet@fabric7.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/arch/i386/kernel/nmi.c       |    2 +-
 25-akpm/arch/i386/kernel/smpboot.c   |    2 +-
 25-akpm/arch/x86_64/kernel/nmi.c     |    4 +++-
 25-akpm/arch/x86_64/kernel/smpboot.c |    2 +-
 25-akpm/include/asm-i386/smp.h       |    1 +
 25-akpm/include/asm-x86_64/smp.h     |    1 +
 6 files changed, 8 insertions(+), 4 deletions(-)

diff -puN arch/i386/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/i386/kernel/nmi.c
--- 25/arch/i386/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race	2005-01-16 00:38:11.646059216 -0800
+++ 25-akpm/arch/i386/kernel/nmi.c	2005-01-16 00:38:11.658057392 -0800
@@ -117,7 +117,7 @@ int __init check_nmi_watchdog (void)
 	/* FIXME: Only boot CPU is online at this stage.  Check CPUs
            as they come up. */
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		if (!cpu_online(cpu))
+		if (!cpu_isset(cpu, cpu_callin_map))
 			continue;
 		if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
 			printk("CPU#%d: NMI appears to be stuck!\n", cpu);
diff -puN arch/i386/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/i386/kernel/smpboot.c
--- 25/arch/i386/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race	2005-01-16 00:38:11.648058912 -0800
+++ 25-akpm/arch/i386/kernel/smpboot.c	2005-01-16 00:38:11.659057240 -0800
@@ -67,7 +67,7 @@ EXPORT_SYMBOL(phys_proc_id);
 /* bitmap of online cpus */
 cpumask_t cpu_online_map;
 
-static cpumask_t cpu_callin_map;
+cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 static cpumask_t smp_commenced_mask;
 
diff -puN arch/x86_64/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/x86_64/kernel/nmi.c
--- 25/arch/x86_64/kernel/nmi.c~i386-x86-64-fix-smp-nmi-watchdog-race	2005-01-16 00:38:11.650058608 -0800
+++ 25-akpm/arch/x86_64/kernel/nmi.c	2005-01-16 00:38:11.660057088 -0800
@@ -130,7 +130,9 @@ int __init check_nmi_watchdog (void)
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-		if (!cpu_online(cpu))
+		/* Check cpu_callin_map here because that is set
+		   after the timer is started. */
+		if (!cpu_isset(cpu, cpu_callin_map))
 			continue;
 		if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
 			printk("CPU#%d: NMI appears to be stuck (%d)!\n", 
diff -puN arch/x86_64/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race arch/x86_64/kernel/smpboot.c
--- 25/arch/x86_64/kernel/smpboot.c~i386-x86-64-fix-smp-nmi-watchdog-race	2005-01-16 00:38:11.651058456 -0800
+++ 25-akpm/arch/x86_64/kernel/smpboot.c	2005-01-16 00:38:11.661056936 -0800
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(phys_proc_id);
 /* Bitmask of currently online CPUs */
 cpumask_t cpu_online_map;
 
-static cpumask_t cpu_callin_map;
+cpumask_t cpu_callin_map;
 cpumask_t cpu_callout_map;
 static cpumask_t smp_commenced_mask;
 
diff -puN include/asm-i386/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race include/asm-i386/smp.h
--- 25/include/asm-i386/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race	2005-01-16 00:38:11.652058304 -0800
+++ 25-akpm/include/asm-i386/smp.h	2005-01-16 00:38:11.660057088 -0800
@@ -53,6 +53,7 @@ extern u8 x86_cpu_to_apicid[];
 #define __smp_processor_id() (current_thread_info()->cpu)
 
 extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_callin_map;
 #define cpu_possible_map cpu_callout_map
 
 /* We don't mark CPUs online until __cpu_up(), so we need another measure */
diff -puN include/asm-x86_64/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race include/asm-x86_64/smp.h
--- 25/include/asm-x86_64/smp.h~i386-x86-64-fix-smp-nmi-watchdog-race	2005-01-16 00:38:11.654058000 -0800
+++ 25-akpm/include/asm-x86_64/smp.h	2005-01-16 00:38:11.657057544 -0800
@@ -59,6 +59,7 @@ extern u8 phys_proc_id[NR_CPUS];
  */
 
 extern cpumask_t cpu_callout_map;
+extern cpumask_t cpu_callin_map;
 #define cpu_possible_map cpu_callout_map
 
 static inline int num_booting_cpus(void)
_
