
From: Ingo Molnar <mingo@elte.hu>

 - enable the 'prot' parameter for shared-writable mappings (the ones
   which are the primary target of fremap), without breaking up the
   vma. Cost: the max # of swapfiles drops from 32 to 16 (could be
   fixed on PAE if we want).

 - added MAP_INHERIT: this cleanly implements the old
   sys_remap_file_pages() semantics, and might be useful on its own as
   well.
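
   (for the record, with MAP_INHERIT the old semantics become a
   one-line wrapper - this is exactly what old_remap_file_pages() in
   the patch below does:)

	/* old semantics: prot is ignored, the vma's protection
	 * bits are inherited: */
	return __remap_file_pages(current->mm, start, size, PROT_NONE,
						pgoff, flags | MAP_INHERIT);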

it adds a new syscall because the old sys_remap_file_pages() syscall was
messed up in an unfixable way :-( [prot == 0 means PROT_NONE, and now we
need that value.] But the patch is ABI-compatible: it keeps the old
syscall and wraps it cleanly internally.
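
to illustrate the new API, here's a minimal user-space sketch - the
__NR_ define and the file name are hypothetical (take whatever number
the new entry.S slot ends up with on your kernel; glibc has no wrapper
for it yet):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>

	/* hypothetical - the slot appended at the end of entry.S: */
	#define __NR_new_remap_file_pages 274

	int main(void)
	{
		char *win;
		int fd = open("bigfile", O_RDWR);

		if (fd < 0)
			return 1;
		/* shared-writable window - the primary fremap target: */
		win = mmap(NULL, 4*4096, PROT_READ|PROT_WRITE,
				MAP_SHARED, fd, 0);
		if (win == MAP_FAILED)
			return 1;

		/* map file page 100 at the window's first page,
		 * read-only, without splitting the vma: */
		if (syscall(__NR_new_remap_file_pages, win, 4096UL,
				PROT_READ, 100UL, 0UL) < 0)
			perror("new remap_file_pages");
		return 0;
	}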

patch applies, compiles & boots cleanly on SMP x86. Non-x86
architectures won't compile; they need to do this:

 - add MAP_INHERIT

 - add pte_to_pgprot, and change pte_to_pgoff if the new bit comes out
   of the offset bits.

 - change pgoff_to_pte to pgoff_prot_to_pte.

 - chop a bit off __swp_type() if there's no more space.

it should be quite straightforward for them, but needs to be tested.
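
for reference, my reading of the resulting i386 2-level file-pte
layout (per the pgtable-2level.h hunk below):

	bit  0		_PAGE_PRESENT	(clear - pte not present)
	bit  1		_PAGE_RW	(new: saved protection bit)
	bits 2-5	pgoff, low 4 bits
	bit  6		_PAGE_FILE
	bit  7		_PAGE_PROTNONE	(saved protection bit)
	bits 8-31	pgoff, high 24 bits	(28 offset bits total)

other architectures have different free bits, hence the pte_to_pgoff()
and __swp_type() adjustments above.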



 25-akpm/arch/i386/kernel/entry.S          |    3 
 25-akpm/include/asm-i386/mman.h           |    1 
 25-akpm/include/asm-i386/pgtable-2level.h |   15 ++--
 25-akpm/include/asm-i386/pgtable-3level.h |   11 ++-
 25-akpm/include/linux/mm.h                |    2 
 25-akpm/mm/fremap.c                       |  105 +++++++++++++++++++-----------
 25-akpm/mm/memory.c                       |    4 -
 25-akpm/mm/mmap.c                         |    8 +-
 25-akpm/mm/rmap.c                         |    7 +-
 9 files changed, 105 insertions(+), 51 deletions(-)

diff -puN arch/i386/kernel/entry.S~remap_file_pages-prot-2.6.1-H2 arch/i386/kernel/entry.S
--- 25/arch/i386/kernel/entry.S~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/arch/i386/kernel/entry.S	Mon Jan  5 14:49:48 2004
@@ -891,7 +891,7 @@ ENTRY(sys_call_table)
 	.long sys_epoll_create
 	.long sys_epoll_ctl	/* 255 */
 	.long sys_epoll_wait
- 	.long sys_remap_file_pages
+ 	.long old_remap_file_pages
  	.long sys_set_tid_address
  	.long sys_timer_create
  	.long sys_timer_settime		/* 260 */
@@ -908,5 +908,6 @@ ENTRY(sys_call_table)
 	.long sys_utimes
  	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+ 	.long sys_remap_file_pages
 
 syscall_table_size=(.-sys_call_table)
diff -puN include/asm-i386/mman.h~remap_file_pages-prot-2.6.1-H2 include/asm-i386/mman.h
--- 25/include/asm-i386/mman.h~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/include/asm-i386/mman.h	Mon Jan  5 14:49:48 2004
@@ -22,6 +22,7 @@
 #define MAP_NORESERVE	0x4000		/* don't check for reservations */
 #define MAP_POPULATE	0x8000		/* populate (prefault) pagetables */
 #define MAP_NONBLOCK	0x10000		/* do not block on IO */
+#define MAP_INHERIT	0x20000		/* inherit the protection bits of the underlying vma */
 
 #define MS_ASYNC	1		/* sync memory asynchronously */
 #define MS_INVALIDATE	2		/* invalidate the caches */
diff -puN include/asm-i386/pgtable-2level.h~remap_file_pages-prot-2.6.1-H2 include/asm-i386/pgtable-2level.h
--- 25/include/asm-i386/pgtable-2level.h~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/include/asm-i386/pgtable-2level.h	Mon Jan  5 14:49:48 2004
@@ -64,15 +64,20 @@ static inline pmd_t * pmd_offset(pgd_t *
 #define pfn_pmd(pfn, prot)	__pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
 
 /*
- * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
+ * Bits 0, 1, 6 and 7 are taken, split up the 28 bits of offset
  * into this range:
  */
-#define PTE_FILE_MAX_BITS	29
+#define PTE_FILE_MAX_BITS	28
 
 #define pte_to_pgoff(pte) \
-	((((pte).pte_low >> 1) & 0x1f ) + (((pte).pte_low >> 8) << 5 ))
+	((((pte).pte_low >> 2) & 0xf ) + (((pte).pte_low >> 8) << 4 ))
+#define pte_to_pgprot(pte) \
+	__pgprot(((pte).pte_low & (_PAGE_RW | _PAGE_PROTNONE)) \
+		| (((pte).pte_low & _PAGE_PROTNONE) ? 0 : \
+			(_PAGE_USER | _PAGE_PRESENT)) | _PAGE_ACCESSED)
 
-#define pgoff_to_pte(off) \
-	((pte_t) { (((off) & 0x1f) << 1) + (((off) >> 5) << 8) + _PAGE_FILE })
+#define pgoff_prot_to_pte(off, prot) \
+	((pte_t) { (((off) & 0xf) << 2) + (((off) >> 4) << 8) + \
+	 (pgprot_val(prot) & (_PAGE_RW | _PAGE_PROTNONE)) + _PAGE_FILE })
 
 #endif /* _I386_PGTABLE_2LEVEL_H */
diff -puN include/asm-i386/pgtable-3level.h~remap_file_pages-prot-2.6.1-H2 include/asm-i386/pgtable-3level.h
--- 25/include/asm-i386/pgtable-3level.h~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/include/asm-i386/pgtable-3level.h	Mon Jan  5 14:49:48 2004
@@ -120,7 +120,16 @@ static inline pmd_t pfn_pmd(unsigned lon
  * put the 32 bits of offset into the high part.
  */
 #define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
+
+#define pte_to_pgprot(pte) \
+	__pgprot(((pte).pte_low & (_PAGE_RW | _PAGE_PROTNONE)) \
+		| (((pte).pte_low & _PAGE_PROTNONE) ? 0 : \
+			(_PAGE_USER | _PAGE_PRESENT)) | _PAGE_ACCESSED)
+
+#define pgoff_prot_to_pte(off, prot) \
+	((pte_t) { _PAGE_FILE + \
+		(pgprot_val(prot) & (_PAGE_RW | _PAGE_PROTNONE)), (off) })
+
 #define PTE_FILE_MAX_BITS       32
 
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff -puN include/linux/mm.h~remap_file_pages-prot-2.6.1-H2 include/linux/mm.h
--- 25/include/linux/mm.h~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/include/linux/mm.h	Mon Jan  5 14:49:48 2004
@@ -445,7 +445,7 @@ extern int install_file_pte(struct mm_st
 extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
 extern int make_pages_present(unsigned long addr, unsigned long end);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
-extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock);
+extern long __remap_file_pages(struct mm_struct *mm, unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags);
 extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
 void put_dirty_page(struct task_struct *tsk, struct page *page,
 			unsigned long address, pgprot_t prot);
diff -puN mm/fremap.c~remap_file_pages-prot-2.6.1-H2 mm/fremap.c
--- 25/mm/fremap.c~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/mm/fremap.c	Mon Jan  5 14:49:48 2004
@@ -152,7 +152,7 @@ int install_file_pte(struct mm_struct *m
 
 	flush = zap_pte(mm, vma, addr, pte);
 
-	set_pte(pte, pgoff_to_pte(pgoff));
+	set_pte(pte, pgoff_prot_to_pte(pgoff, pgprot));
 	pte_val = *pte;
 	pte_unmap(pte);
 	if (flush)
@@ -174,27 +174,22 @@ err_unlock:
  * @size: size of the remapped virtual memory range
  * @prot: new protection bits of the range
  * @pgoff: to be mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
+ * @flags: bits MAP_INHERIT or MAP_NONBLOCK - the latter will cause no IO.
  *
  * this syscall works purely via pagetables, so it's the most efficient
  * way to map the same (large) file into a given virtual window. Unlike
  * mmap()/mremap() it does not create any new vmas. The new mappings are
  * also safe across swapout.
- *
- * NOTE: the 'prot' parameter right now is ignored, and the vma's default
- * protection is used. Arbitrary protections might be implemented in the
- * future.
  */
-long sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long __prot, unsigned long pgoff, unsigned long flags)
+long __remap_file_pages(struct mm_struct *mm, unsigned long start,
+	unsigned long size, unsigned long prot,
+	unsigned long pgoff, unsigned long flags)
 {
-	struct mm_struct *mm = current->mm;
+	pgprot_t pgprot = protection_map[calc_vm_prot_bits(prot) | VM_SHARED];
 	unsigned long end = start + size;
 	struct vm_area_struct *vma;
 	int err = -EINVAL;
 
-	if (__prot)
-		return err;
 	/*
 	 * Sanitize the syscall parameters:
 	 */
@@ -214,37 +209,71 @@ long sys_remap_file_pages(unsigned long 
 	/* We need down_write() to change vma->vm_flags. */
 	down_write(&mm->mmap_sem);
 	vma = find_vma(mm, start);
-
 	/*
-	 * Make sure the vma is shared, that it supports prefaulting,
-	 * and that the remapped range is valid and fully within
-	 * the single existing vma:
+	 * Make sure the permissions are right, that the vma is shared
+	 * (or linearly remapped - i.e. prefaulted), that it supports
+	 * prefaulting, and that the remapped range is valid and fully
+	 * within the single existing vma:
 	 */
-	if (vma && (vma->vm_flags & VM_SHARED) &&
-		vma->vm_ops && vma->vm_ops->populate &&
-			end > start && start >= vma->vm_start &&
-				end <= vma->vm_end) {
-
-		/* Must set VM_NONLINEAR before any pages are populated. */
-		if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
-			vma->vm_flags |= VM_NONLINEAR;
-
-		/* ->populate can take a long time, so downgrade the lock. */
-		downgrade_write(&mm->mmap_sem);
-		err = vma->vm_ops->populate(vma, start, size,
-					    vma->vm_page_prot,
-					    pgoff, flags & MAP_NONBLOCK);
-
-		/*
-		 * We can't clear VM_NONLINEAR because we'd have to do
-		 * it after ->populate completes, and that would prevent
-		 * downgrading the lock.  (Locks can't be upgraded).
-		 */
-		up_read(&mm->mmap_sem);
-	} else {
-		up_write(&mm->mmap_sem);
+	if (!vma)
+		goto out_unlock;
+	if (unlikely(flags & MAP_INHERIT))
+		pgprot = vma->vm_page_prot;
+	else {
+		err = -EPERM;
+		if (((prot & PROT_READ) && !(vma->vm_flags & VM_MAYREAD)))
+			goto out_unlock;
+		if (((prot & PROT_WRITE) && !(vma->vm_flags & VM_MAYWRITE)))
+			goto out_unlock;
+		if (((prot & PROT_EXEC) && !(vma->vm_flags & VM_MAYEXEC)))
+			goto out_unlock;
 	}
 
+	if (!vma->vm_ops || !vma->vm_ops->populate || end <= start ||
+				start < vma->vm_start || end > vma->vm_end)
+		goto out_unlock;
+
+	if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) {
+		if (!(vma->vm_flags & VM_SHARED))
+			goto out_unlock;
+		vma->vm_flags |= VM_NONLINEAR;
+	}
+
+	/*
+	 * ->populate can take a long time, so downgrade the lock:
+	 */
+	downgrade_write(&mm->mmap_sem);
+	err = vma->vm_ops->populate(vma, start, size,
+				pgprot, pgoff, flags & MAP_NONBLOCK);
+
+	/*
+	 * We can't clear VM_NONLINEAR because we'd have to do
+	 * it after ->populate completes, and that would prevent
+	 * downgrading the lock.  (Locks can't be upgraded).
+	 */
+	up_read(&mm->mmap_sem);
 	return err;
+
+out_unlock:
+	up_write(&mm->mmap_sem);
+	return err;
+}
+
+long sys_remap_file_pages(unsigned long start, unsigned long size,
+	unsigned long prot, unsigned long pgoff, unsigned long flags)
+{
+	return __remap_file_pages(current->mm, start, size, prot, pgoff, flags);
 }
 
+/*
+ * old_remap_file_pages - the old API. Implies MAP_INHERIT.
+ */
+long old_remap_file_pages(unsigned long start, unsigned long size,
+	unsigned long __prot, unsigned long pgoff, unsigned long flags)
+{
+	if (__prot)
+		return -EINVAL;
+
+	return __remap_file_pages(current->mm, start, size, PROT_NONE,
+						pgoff, flags | MAP_INHERIT);
+}
diff -puN mm/memory.c~remap_file_pages-prot-2.6.1-H2 mm/memory.c
--- 25/mm/memory.c~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/mm/memory.c	Mon Jan  5 14:49:48 2004
@@ -1522,6 +1522,7 @@ static int do_file_page(struct mm_struct
 	unsigned long address, int write_access, pte_t *pte, pmd_t *pmd)
 {
 	unsigned long pgoff;
+	pgprot_t pgprot;
 	int err;
 
 	BUG_ON(!vma->vm_ops || !vma->vm_ops->nopage);
@@ -1536,11 +1537,12 @@ static int do_file_page(struct mm_struct
 	}
 
 	pgoff = pte_to_pgoff(*pte);
+	pgprot = pte_to_pgprot(*pte);
 
 	pte_unmap(pte);
 	spin_unlock(&mm->page_table_lock);
 
-	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0);
+	err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, pgprot, pgoff, 0);
 	if (err == -ENOMEM)
 		return VM_FAULT_OOM;
 	if (err)
diff -puN mm/mmap.c~remap_file_pages-prot-2.6.1-H2 mm/mmap.c
--- 25/mm/mmap.c~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/mm/mmap.c	Mon Jan  5 14:49:48 2004
@@ -690,8 +690,12 @@ out:	
 	}
 	if (flags & MAP_POPULATE) {
 		up_write(&mm->mmap_sem);
-		sys_remap_file_pages(addr, len, 0,
-					pgoff, flags & MAP_NONBLOCK);
+		/*
+		 * remap_file_pages() works even if the mapping is private,
+		 * in the linearly-mapped case:
+		 */
+		__remap_file_pages(mm, addr, len, PROT_NONE, pgoff,
+					MAP_INHERIT | (flags & MAP_NONBLOCK));
 		down_write(&mm->mmap_sem);
 	}
 	return addr;
diff -puN mm/rmap.c~remap_file_pages-prot-2.6.1-H2 mm/rmap.c
--- 25/mm/rmap.c~remap_file_pages-prot-2.6.1-H2	Mon Jan  5 14:49:48 2004
+++ 25-akpm/mm/rmap.c	Mon Jan  5 14:49:48 2004
@@ -343,6 +343,7 @@ static int try_to_unmap_one(struct page 
 		BUG_ON(pte_file(*ptep));
 	} else {
 		unsigned long pgidx;
+		pgprot_t pgprot = pte_to_pgprot(pte);
 		/*
 		 * If a nonlinear mapping then store the file page offset
 		 * in the pte.
@@ -350,8 +351,10 @@ static int try_to_unmap_one(struct page 
 		pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
 		pgidx += vma->vm_pgoff;
 		pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
-		if (page->index != pgidx) {
-			set_pte(ptep, pgoff_to_pte(page->index));
+		if (page->index != pgidx ||
+			pgprot_val(pgprot) != pgprot_val(vma->vm_page_prot)) {
+
+			set_pte(ptep, pgoff_prot_to_pte(page->index, pgprot));
 			BUG_ON(!pte_file(*ptep));
 		}
 	}

_
