
From: Andi Kleen <ak@suse.de>

This is an experimental patch that needs a lot more benchmarking before it
can be put into mainline, mainly to check that it causes no performance
regressions on x86-64 machines with >4GB of memory.  I wouldn't put it into
mainline anytime soon; it's really only for testing.

Add 4GB DMA32 zone

Add a new 4GB GFP_DMA32 zone between the GFP_DMA and GFP_NORMAL zones.  This
mainly helps graphics drivers that really need a lot of memory below the 4GB
boundary.  Previously they could only use IOMMU+16MB GFP_DMA, which was not
enough memory.

This is done for x86-64.  For other architectures that don't set up this new
zone, nothing changes.

It may make sense to rename IA64's GFP_DMA to GFP_DMA32 for better source
code compatibility, though.  I didn't do this; so far only x86-64 is
affected.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/arch/x86_64/mm/init.c      |   48 +++++++++++++++++++++++++------------
 25-akpm/arch/x86_64/mm/numa.c      |   11 --------
 25-akpm/include/asm-x86_64/dma.h   |   11 ++++++--
 25-akpm/include/asm-x86_64/proto.h |    2 +
 25-akpm/include/linux/gfp.h        |    5 +++
 25-akpm/include/linux/mmzone.h     |   14 ++++++----
 25-akpm/mm/page_alloc.c            |   16 +++++++++---
 7 files changed, 69 insertions(+), 38 deletions(-)

diff -puN arch/x86_64/mm/init.c~x86_64-experimental-4gb-dma-zone arch/x86_64/mm/init.c
--- 25/arch/x86_64/mm/init.c~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.607420952 -0800
+++ 25-akpm/arch/x86_64/mm/init.c	2004-11-28 01:56:45.620418976 -0800
@@ -325,24 +325,42 @@ void zap_low_mappings(void)
 	flush_tlb_all();
 }
 
+/*
+ * Compute the DMA, DMA32 and NORMAL page counts for the node spanning
+ * [start_pfn, end_pfn); other z[] entries are zeroed.  Zone limits are
+ * clamped to end_pfn so a node ending below MAX_DMA_PFN cannot wrap.
+ */
+__init void
+size_zones(unsigned long *z, unsigned long start_pfn, unsigned long end_pfn)
+{
+	unsigned long dma_pfn = MAX_DMA_PFN, dma32_pfn = MAX_DMA32_PFN;
+	unsigned long missing = 0;
+	int i;
+
+	if (dma_pfn > end_pfn)
+		dma_pfn = end_pfn;
+	if (dma32_pfn > end_pfn)
+		dma32_pfn = end_pfn;
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		z[i] = 0;
+	z[ZONE_DMA] = start_pfn < dma_pfn ? dma_pfn - start_pfn : 0;
+	z[ZONE_DMA32] = start_pfn < dma32_pfn ? dma32_pfn - start_pfn : 0;
+	z[ZONE_NORMAL] = end_pfn - start_pfn;
+	/* The sizes are cumulative so far: remove lower zones from higher ones. */
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (z[i])
+			z[i] -= missing;
+		printk("--- zone %d: %lx pages\n", i, z[i]);
+		missing += z[i];
+	}
+}
+
 #ifndef CONFIG_DISCONTIGMEM
 void __init paging_init(void)
 {
-	{
-		unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
-		unsigned int max_dma;
-
-		max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
-
-		if (end_pfn < max_dma)
-			zones_size[ZONE_DMA] = end_pfn;
-		else {
-			zones_size[ZONE_DMA] = max_dma;
-			zones_size[ZONE_NORMAL] = end_pfn - max_dma;
-		}
-		free_area_init(zones_size);
-	}
-	return;
+	unsigned long zones_size[MAX_NR_ZONES];
+	size_zones(zones_size, 0, end_pfn);
+	free_area_init(zones_size);
 }
 #endif
 
diff -puN arch/x86_64/mm/numa.c~x86_64-experimental-4gb-dma-zone arch/x86_64/mm/numa.c
--- 25/arch/x86_64/mm/numa.c~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.608420800 -0800
+++ 25-akpm/arch/x86_64/mm/numa.c	2004-11-28 01:56:45.620418976 -0800
@@ -121,7 +121,6 @@ void __init setup_node_zones(int nodeid)
 { 
 	unsigned long start_pfn, end_pfn; 
 	unsigned long zones[MAX_NR_ZONES];
-	unsigned long dma_end_pfn;
 
 	memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES); 
 
@@ -130,15 +129,7 @@ void __init setup_node_zones(int nodeid)
 
 	Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
 	
-	/* All nodes > 0 have a zero length zone DMA */ 
-	dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT; 
-	if (start_pfn < dma_end_pfn) { 
-		zones[ZONE_DMA] = dma_end_pfn - start_pfn;
-		zones[ZONE_NORMAL] = end_pfn - dma_end_pfn; 
-	} else { 
-		zones[ZONE_NORMAL] = end_pfn - start_pfn; 
-	} 
-    
+  	size_zones(zones, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, NULL); 
 } 
diff -puN include/asm-x86_64/dma.h~x86_64-experimental-4gb-dma-zone include/asm-x86_64/dma.h
--- 25/include/asm-x86_64/dma.h~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.609420648 -0800
+++ 25-akpm/include/asm-x86_64/dma.h	2004-11-28 01:56:45.621418824 -0800
@@ -72,8 +72,15 @@
 
 #define MAX_DMA_CHANNELS	8
 
-/* The maximum address that we can perform a DMA transfer to on this platform */
-#define MAX_DMA_ADDRESS      (PAGE_OFFSET+0x1000000)
+
+/* 16MB ISA DMA zone */
+#define MAX_DMA_PFN   ((16*1024*1024) >> PAGE_SHIFT)
+
+/* 4GB broken PCI/AGP hardware bus master zone */
+#define MAX_DMA32_PFN ((4UL*1024*1024*1024) >> PAGE_SHIFT)
+
+/* Compat define for old dma zone */
+#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
 
 /* 8237 DMA controllers */
 #define IO_DMA1_BASE	0x00	/* 8 bit slave DMA, channels 0..3 */
diff -puN include/asm-x86_64/proto.h~x86_64-experimental-4gb-dma-zone include/asm-x86_64/proto.h
--- 25/include/asm-x86_64/proto.h~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.611420344 -0800
+++ 25-akpm/include/asm-x86_64/proto.h	2004-11-28 01:56:45.621418824 -0800
@@ -16,6 +16,8 @@ extern void early_idt_handler(void);
 
 extern void mcheck_init(struct cpuinfo_x86 *c);
 extern void init_memory_mapping(unsigned long start, unsigned long end);
+extern void size_zones(unsigned long *z, unsigned long start_pfn,
+		       unsigned long end_pfn);
 
 extern void system_call(void); 
 extern int kernel_syscall(void);
diff -puN include/linux/gfp.h~x86_64-experimental-4gb-dma-zone include/linux/gfp.h
--- 25/include/linux/gfp.h~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.612420192 -0800
+++ 25-akpm/include/linux/gfp.h	2004-11-28 01:56:45.622418672 -0800
@@ -11,9 +11,10 @@ struct vm_area_struct;
 /*
  * GFP bitmasks..
  */
-/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */
+/* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low three bits) */
 #define __GFP_DMA	0x01
 #define __GFP_HIGHMEM	0x02
+#define __GFP_DMA32	0x04
 
 /*
  * Action modifiers - doesn't change the zoning
@@ -58,6 +59,8 @@ struct vm_area_struct;
 
 #define GFP_DMA		__GFP_DMA
 
+/* 4GB DMA on some platforms */
+#define GFP_DMA32	__GFP_DMA32
 
 /*
  * There is only one page-allocator function, and two main namespaces to
diff -puN include/linux/mmzone.h~x86_64-experimental-4gb-dma-zone include/linux/mmzone.h
--- 25/include/linux/mmzone.h~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.614419888 -0800
+++ 25-akpm/include/linux/mmzone.h	2004-11-28 01:56:45.623418520 -0800
@@ -64,11 +64,12 @@ struct per_cpu_pageset {
 } ____cacheline_aligned_in_smp;
 
 #define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
+#define ZONE_DMA32		1
+#define ZONE_NORMAL		2
+#define ZONE_HIGHMEM		3
 
-#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
+#define MAX_NR_ZONES		4	/* Sync this with ZONES_SHIFT */
+#define ZONES_SHIFT		3	/* ceil(log2(MAX_NR_ZONES)) */
 
 
 /*
@@ -84,7 +85,7 @@ struct per_cpu_pageset {
  * be 8 (2 ** 3) zonelists.  GFP_ZONETYPES defines the number of possible
  * combinations of zone modifiers in "zone modifier space".
  */
-#define GFP_ZONEMASK	0x03
+#define GFP_ZONEMASK	0x07
 /*
  * As an optimisation any zone modifier bits which are only valid when
  * no other zone modifier bits are set (loners) should be placed in
@@ -104,6 +105,7 @@ struct per_cpu_pageset {
  * into multiple physical zones. On a PC we have 3 zones:
  *
  * ZONE_DMA	  < 16 MB	ISA DMA capable memory
+ * ZONE_DMA32	     0 MB 	Empty
  * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
  * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
  */
@@ -409,7 +411,7 @@ extern struct pglist_data contig_page_da
 #endif
 
 /* There are currently 3 zones: DMA, Normal & Highmem, thus we need 2 bits */
-#define MAX_ZONES_SHIFT		2
+#define MAX_ZONES_SHIFT		3
 
 #if ZONES_SHIFT > MAX_ZONES_SHIFT
 #error ZONES_SHIFT > MAX_ZONES_SHIFT
diff -puN mm/page_alloc.c~x86_64-experimental-4gb-dma-zone mm/page_alloc.c
--- 25/mm/page_alloc.c~x86_64-experimental-4gb-dma-zone	2004-11-28 01:56:45.616419584 -0800
+++ 25-akpm/mm/page_alloc.c	2004-11-28 01:56:45.626418064 -0800
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(nr_swap_pages);
 struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)];
 EXPORT_SYMBOL(zone_table);
 
-static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
+static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
 
 unsigned long __initdata nr_kernel_pages;
@@ -1200,6 +1200,10 @@ static int __init build_zonelists_node(p
 		zone = pgdat->node_zones + ZONE_NORMAL;
 		if (zone->present_pages)
 			zonelist->zones[j++] = zone;
+	case ZONE_DMA32:
+		zone = pgdat->node_zones + ZONE_DMA32;
+		if (zone->present_pages)
+			zonelist->zones[j++] = zone;
 	case ZONE_DMA:
 		zone = pgdat->node_zones + ZONE_DMA;
 		if (zone->present_pages)
@@ -1310,6 +1314,8 @@ static void __init build_zonelists(pg_da
 			k = ZONE_NORMAL;
 			if (i & __GFP_HIGHMEM)
 				k = ZONE_HIGHMEM;
+			if (i & __GFP_DMA32)
+				k = ZONE_DMA32;
 			if (i & __GFP_DMA)
 				k = ZONE_DMA;
 
@@ -1336,6 +1342,8 @@ static void __init build_zonelists(pg_da
 		k = ZONE_NORMAL;
 		if (i & __GFP_HIGHMEM)
 			k = ZONE_HIGHMEM;
+		if (i & __GFP_DMA32)
+			k = ZONE_DMA32;
 		if (i & __GFP_DMA)
 			k = ZONE_DMA;
 
@@ -1537,7 +1545,7 @@ static void __init free_area_init_core(s
 		if (zholes_size)
 			realsize -= zholes_size[j];
 
-		if (j == ZONE_DMA || j == ZONE_NORMAL)
+		if (j == ZONE_DMA || j == ZONE_NORMAL || j == ZONE_DMA32)
 			nr_kernel_pages += realsize;
 		nr_all_pages += realsize;
 
@@ -1895,12 +1903,12 @@ static void setup_per_zone_protection(vo
 
 		/*
 		 * For each of the different allocation types:
-		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+		 * GFP_DMA -> GFP_DMA32 -> GFP_KERNEL -> GFP_HIGHMEM
 		 */
 		for (i = 0; i < GFP_ZONETYPES; i++) {
 			/*
 			 * For each of the zones:
-			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA32 -> ZONE_DMA
 			 */
 			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
 				zone = &zones[j];
_
