From: Andi Kleen <ak@muc.de>

I ported the Athlon prefetch fix to 2.6.0test6mm2 now.

I tested that it handles user space prefetch faults correctly.

This is Jamie's "mutilated" version with segment/limit checking
and some additional changes:

- Handle the mmap_sem deadlock in the way Jamie proposed: just check the
  address for >= TASK_SIZE and if true never take the semaphore

- Do the same for 4/4 (untested). 

  On some thought 4/4 is actually easy because it should never do user
  exceptions from kernel space - its *_user functions all call
  handle_mm_fault() directly.  This means if an exception came from
  ring 0 it can only be a bug, a prefetch fault or a vmalloc fault.  None
  of these require taking mmap_sem.

  I also fixed a bug in the process - the 4/4 code didn't handle lazy
  vmalloc SMP faults correctly IMHO.  Note I didn't test if it works.

- Port LDT checking to -mm*.  This is a bit ugly.  The 4/4 patch allows
  LDT pages to be in highmem and require a kmap.  I put that code into #if
  1.  For submission to Linus all #if 1 code should be removed.

- Added a printk for now just for testing (should be also removed, but
  doesn't harm right now) 

- Removed the #ifdefs. The code is now always compiled in.

- Removed the eip==addr check.

- Some other minor cleanup.


 arch/i386/mm/fault.c         |  204 ++++++++++++++++++++++++++++++++++++++++---
 include/asm-i386/processor.h |    6 -
 2 files changed, 196 insertions(+), 14 deletions(-)

diff -puN arch/i386/mm/fault.c~athlon-prefetch-handling arch/i386/mm/fault.c
--- 25/arch/i386/mm/fault.c~athlon-prefetch-handling	2003-10-04 02:39:10.000000000 -0700
+++ 25-akpm/arch/i386/mm/fault.c	2003-10-04 02:39:10.000000000 -0700
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/tty.h>
 #include <linux/vt_kern.h>		/* For unblank_screen() */
+#include <linux/highmem.h>
 #include <linux/module.h>
 
 #include <asm/system.h>
@@ -55,6 +56,161 @@ void bust_spinlocks(int yes)
 	console_loglevel = loglevel_save;
 }
 
+/*
+ * Return EIP plus the CS segment base.  The segment limit is also
+ * adjusted, clamped to the kernel/user address space (whichever is
+ * appropriate), and returned in *eip_limit.
+ *
+ * The segment is checked, because it might have been changed by another
+ * task between the original faulting instruction and here.
+ *
+ * If CS is no longer a valid code segment, or if EIP is beyond the
+ * limit, or if it is a kernel address when CS is not a kernel segment,
+ * then the returned value will be greater than *eip_limit.
+ */
+static inline unsigned long get_segment_eip(struct pt_regs *regs,
+					    unsigned long *eip_limit)
+{
+	unsigned long eip = regs->eip;
+	unsigned seg = regs->xcs & 0xffff;
+	u32 seg_ar, seg_limit, base, *desc;
+
+	/* The standard kernel/user address space limit. */
+	*eip_limit = (seg & 3) ? USER_DS.seg : KERNEL_DS.seg;
+
+	/* Unlikely, but must come before segment checks. */
+	if (unlikely((regs->eflags & VM_MASK) != 0))
+		return eip + (seg << 4);
+
+	/* By far the commonest cases. */
+	if (likely(seg == __USER_CS || seg == __KERNEL_CS))
+		return eip;
+
+	/* Check the segment exists, is within the current LDT/GDT size,
+	   that kernel/user (ring 0..3) has the appropriate privilege,
+	   that it's a code segment, and get the limit. */
+	__asm__ ("larl %3,%0; lsll %3,%1"
+		 : "=&r" (seg_ar), "=r" (seg_limit) : "0" (0), "rm" (seg));
+	if ((~seg_ar & 0x9800) || eip > seg_limit) {
+		*eip_limit = 0;
+		return 1;	 /* So that returned eip > *eip_limit. */
+	}
+
+	/* Get the GDT/LDT descriptor base.
+	   When you look for races in this code remember that
+	   LDT and other horrors are only used in user space. */
+	if (seg & (1<<2)) {
+		/* Must lock the LDT while reading it. */
+		down(&current->mm->context.sem);
+#if 1
+		/* horrible hack for 4/4 kernels (LDT pages may be highmem).
+		   The TLB flush is mindlessly copied from the read_ldt code.
+		   kmap() takes the struct page; kunmap() below must too. */
+		__flush_tlb_global();
+		desc = kmap(current->mm->context.ldt_pages[(seg & ~7)/PAGE_SIZE]);
+		desc = (void *)desc + ((seg & ~7) % PAGE_SIZE);
+#else
+		desc = current->mm->context.ldt;
+		desc = (void *)desc + (seg & ~7);
+#endif
+	} else {
+		/* Must disable preemption while reading the GDT. */
+		desc = (u32 *)&cpu_gdt_table[get_cpu()];
+		desc = (void *)desc + (seg & ~7);
+	}
+	base = (desc[0] >> 16) |
+		((desc[1] & 0xff) << 16) |
+		(desc[1] & 0xff000000);
+	if (seg & (1<<2)) {
+#if 1
+		kunmap(current->mm->context.ldt_pages[(seg & ~7)/PAGE_SIZE]);
+#endif
+		up(&current->mm->context.sem);
+	} else
+		put_cpu();
+
+	/* Adjust EIP and segment limit, and clamp at the kernel limit.
+	   It's legitimate for segments to wrap at 0xffffffff. */
+	seg_limit += base;
+	if (seg_limit < *eip_limit && seg_limit >= base)
+		*eip_limit = seg_limit;
+	return eip + base;
+}
+
+/*
+ * Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
+ * Return 1 if the faulting instruction is a prefetch (0F 0D / 0F 18), else 0.
+ */
+static int __is_prefetch(struct pt_regs *regs, unsigned long addr)
+{
+	unsigned long limit;
+	unsigned long instr = get_segment_eip (regs, &limit);
+	int scan_more = 1;
+	int prefetch = 0;
+	int i;
+
+	for (i = 0; scan_more && i < 15; i++) {
+		unsigned char opcode;
+		unsigned char instr_hi;
+		unsigned char instr_lo;
+
+		if (instr > limit)
+			break;	/* ran past the limit from get_segment_eip */
+		if (__get_user(opcode, (unsigned char *) instr))
+			break;	/* could not read the opcode byte */
+
+		instr_hi = opcode & 0xf0;
+		instr_lo = opcode & 0x0f;
+		instr++;	/* instr now points at the following byte */
+
+		switch (instr_hi) {
+		case 0x20:
+		case 0x30:
+			/* Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes. */
+			scan_more = ((instr_lo & 7) == 0x6);
+			break;
+
+		case 0x60:
+			/* 0x64 thru 0x67 are valid prefixes in all modes. */
+			scan_more = (instr_lo & 0xC) == 0x4;
+			break;
+		case 0xF0:
+			/* 0xF0, 0xF2, and 0xF3 are valid prefixes in all modes. */
+			scan_more = !instr_lo || (instr_lo>>1) == 1;
+			break;
+		case 0x00:
+			/* Prefetch instruction is 0x0F0D or 0x0F18 */
+			scan_more = 0;
+			if (instr > limit)
+				break;
+			if (__get_user(opcode, (unsigned char *) instr))
+				break;
+			prefetch = (instr_lo == 0xF) &&
+				(opcode == 0x0D || opcode == 0x18);
+			break;
+		default:
+			scan_more = 0;
+			break;
+		}
+	}
+
+#if 1
+	if (prefetch)
+		printk("prefetch handled at %lx eip %lx instr %lx cs %x\n",
+		       addr, regs->eip, instr, regs->xcs);
+#endif
+
+	return prefetch;
+}
+
+static inline int is_prefetch(struct pt_regs *regs, unsigned long addr)
+{
+	if (unlikely(boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
+		     boot_cpu_data.x86 >= 6))
+		return __is_prefetch(regs, addr);
+	return 0;	/* not AMD with x86 >= 6: skip the opcode scan */
+}
+
 asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
 
 /*
@@ -86,6 +242,8 @@ asmlinkage void do_page_fault(struct pt_
 
 	tsk = current;
 
+	info.si_code = SEGV_MAPERR;
+
 	/*
 	 * We fault-in kernel-space virtual memory on-demand. The
 	 * 'reference' page table is init_mm.pgd.
@@ -99,18 +257,26 @@ asmlinkage void do_page_fault(struct pt_
 	 * (error_code & 4) == 0, and that the fault was not a
 	 * protection error (error_code & 1) == 0.
 	 */
-	if (address >= TASK_SIZE && !(error_code & 5))
+#ifdef CONFIG_X86_4G
+	/* On 4/4 all kernel faults are either bugs, vmalloc or prefetch */
+	if (unlikely((regs->xcs & 3) == 0))
 		goto vmalloc_fault;
+#else
+	if (unlikely(address >= TASK_SIZE)) {
+		if (!(error_code & 5))
+			goto vmalloc_fault;
+		goto bad_area_nosemaphore;
+	}
+#endif
 
 	mm = tsk->mm;
-	info.si_code = SEGV_MAPERR;
 
 	/*
 	 * If we're in an interrupt, have no user context or are running in an
 	 * atomic region then we must not take the fault..
 	 */
 	if (in_atomic() || !mm)
-		goto no_context;
+		goto bad_area_nosemaphore;
 
 	down_read(&mm->mmap_sem);
 
@@ -198,8 +364,16 @@ good_area:
 bad_area:
 	up_read(&mm->mmap_sem);
 
+bad_area_nosemaphore:
 	/* User mode accesses just cause a SIGSEGV */
 	if (error_code & 4) {
+		/*
+		 * Valid to do another page fault here because this one came
+		 * from user space.
+		 */
+		if (is_prefetch(regs, address))
+			return;
+
 		tsk->thread.cr2 = address;
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_no = 14;
@@ -232,6 +406,14 @@ no_context:
 	if (fixup_exception(regs))
 		return;
 
+	/*
+	 * Valid to do another page fault here, because if this fault
+	 * had been triggered by is_prefetch fixup_exception would have
+	 * handled it.
+	 */
+ 	if (is_prefetch(regs, address))
+ 		return;
+
 /*
  * Oops. The kernel tried to access some bad page. We'll have to
  * terminate things with extreme prejudice.
@@ -292,10 +474,14 @@ out_of_memory:
 do_sigbus:
 	up_read(&mm->mmap_sem);
 
-	/*
-	 * Send a sigbus, regardless of whether we were in kernel
-	 * or user mode.
-	 */
+	/* Kernel mode? Handle exceptions or die */
+	if (!(error_code & 4))
+		goto no_context;
+
+	/* User space => ok to do another page fault */
+	if (is_prefetch(regs, address))
+		return;
+
 	tsk->thread.cr2 = address;
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 14;
@@ -304,10 +490,6 @@ do_sigbus:
 	info.si_code = BUS_ADRERR;
 	info.si_addr = (void *)address;
 	force_sig_info(SIGBUS, &info, tsk);
-
-	/* Kernel mode? Handle exceptions or die */
-	if (!(error_code & 4))
-		goto no_context;
 	return;
 
 vmalloc_fault:
diff -puN include/asm-i386/processor.h~athlon-prefetch-handling include/asm-i386/processor.h
--- 25/include/asm-i386/processor.h~athlon-prefetch-handling	2003-10-04 02:39:10.000000000 -0700
+++ 25-akpm/include/asm-i386/processor.h	2003-10-04 02:39:10.000000000 -0700
@@ -588,12 +588,12 @@ static inline void rep_nop(void)
 
 /* Prefetch instructions for Pentium III and AMD Athlon */
 /* It's not worth to care about 3dnow! prefetches for the K6
-   because they are microcoded there and very slow. */
+   because they are microcoded there and very slow.
+   However we don't do prefetches for pre XP Athlons currently.
+   That should be fixed. */
 #define ARCH_HAS_PREFETCH
 extern inline void prefetch(const void *x)
 {
-	if (cpu_data[0].x86_vendor == X86_VENDOR_AMD)
-		return;		/* Some athlons fault if the address is bad */
 	alternative_input(ASM_NOP4,
 			  "prefetchnta (%1)",
 			  X86_FEATURE_XMM,

_