
From: Trond Myklebust <trond.myklebust@fys.uio.no>

Hi Andrew,

  Mind if I send this in to Linus? On my test setup it makes the
difference between an immediate hang when I run iozone with more than
one thread on a 1GB file, and being able to run sensibly with 8 or
more threads.

In short, the patch teaches the VM about NFS unstable writes: pages
that have been sent to the server but not yet committed to stable
storage are accounted in a new nr_unstable counter, and the dirty
page throttling in balance_dirty_pages() now keys off nr_dirty +
nr_unstable.

Please note the slight modification to nfs_writepages w.r.t. older
versions of this patch.



 fs/nfs/write.c             |   12 +++++++++++-
 include/linux/page-flags.h |    3 +++
 mm/page-writeback.c        |   42 ++++++++++++++++++++++++++++--------------
 mm/page_alloc.c            |    5 ++++-
 4 files changed, 46 insertions(+), 16 deletions(-)

diff -puN fs/nfs/write.c~nfs-resource-management fs/nfs/write.c
--- 25/fs/nfs/write.c~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/fs/nfs/write.c	2003-04-13 12:45:29.000000000 -0700
@@ -283,8 +283,14 @@ nfs_writepages(struct address_space *map
 	err = nfs_flush_file(inode, NULL, 0, 0, 0);
 	if (err < 0)
 		goto out;
-	if (is_sync)
+	if (wbc->sync_mode == WB_SYNC_HOLD)
+		goto out;
+	if (is_sync && wbc->sync_mode == WB_SYNC_ALL) {
 		err = nfs_wb_all(inode);
+	} else
+		nfs_commit_file(inode, NULL, 0, 0, 0);
+	/* Avoid races. Tell upstream we've done all we were told to do */
+	wbc->nr_to_write = 0;
 out:
 	return err;
 }
@@ -372,6 +378,7 @@ nfs_mark_request_dirty(struct nfs_page *
 	nfs_list_add_request(req, &nfsi->dirty);
 	nfsi->ndirty++;
 	spin_unlock(&nfs_wreq_lock);
+	inc_page_state(nr_dirty);
 	mark_inode_dirty(inode);
 }
 
@@ -399,6 +406,7 @@ nfs_mark_request_commit(struct nfs_page 
 	nfs_list_add_request(req, &nfsi->commit);
 	nfsi->ncommit++;
 	spin_unlock(&nfs_wreq_lock);
+	inc_page_state(nr_unstable);
 	mark_inode_dirty(inode);
 }
 #endif
@@ -466,6 +474,7 @@ nfs_scan_dirty(struct inode *inode, stru
 	int	res;
 	res = nfs_scan_list(&nfsi->dirty, dst, file, idx_start, npages);
 	nfsi->ndirty -= res;
+	sub_page_state(nr_dirty,res);
 	if ((nfsi->ndirty == 0) != list_empty(&nfsi->dirty))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ndirty.\n");
 	return res;
@@ -490,6 +499,7 @@ nfs_scan_commit(struct inode *inode, str
 	int	res;
 	res = nfs_scan_list(&nfsi->commit, dst, file, idx_start, npages);
 	nfsi->ncommit -= res;
+	sub_page_state(nr_unstable,res);
 	if ((nfsi->ncommit == 0) != list_empty(&nfsi->commit))
 		printk(KERN_ERR "NFS: desynchronized value of nfs_i.ncommit.\n");
 	return res;
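
For readers following the accounting: each inc_page_state() above is
paired with a sub_page_state() in the corresponding scan function, so
a request is counted in nr_dirty while it sits on the dirty list and
in nr_unstable while it awaits a COMMIT. A toy userspace model of
that lifecycle (the function names here are invented for
illustration, and the transitions are heavily simplified):

#include <assert.h>
#include <stdio.h>

/* Toy model: an NFS write request is counted in nr_dirty while it
 * waits to be sent, and in nr_unstable once the server has taken it
 * into its write cache but not yet committed it to disk. */
static unsigned long nr_dirty, nr_unstable;

static void mark_request_dirty(void)	/* cf. nfs_mark_request_dirty() */
{
	nr_dirty++;
}

static void write_to_server(void)	/* cf. nfs_scan_dirty() + WRITE rpc */
{
	nr_dirty--;
	nr_unstable++;			/* cf. nfs_mark_request_commit() */
}

static void commit_request(void)	/* cf. nfs_scan_commit() + COMMIT rpc */
{
	nr_unstable--;
}

int main(void)
{
	mark_request_dirty();
	assert(nr_dirty == 1 && nr_unstable == 0);
	write_to_server();
	assert(nr_dirty == 0 && nr_unstable == 1);
	commit_request();
	assert(nr_dirty == 0 && nr_unstable == 0);
	printf("request committed; both counters back to zero\n");
	return 0;
}
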
diff -puN include/linux/page-flags.h~nfs-resource-management include/linux/page-flags.h
--- 25/include/linux/page-flags.h~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/include/linux/page-flags.h	2003-04-13 12:45:29.000000000 -0700
@@ -75,6 +75,7 @@
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
 
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -82,6 +83,7 @@
 struct page_state {
 	unsigned long nr_dirty;		/* Dirty writeable pages */
 	unsigned long nr_writeback;	/* Pages under writeback */
+	unsigned long nr_unstable;	/* NFS unstable pages */
 	unsigned long nr_page_table_pages;/* Pages used for pagetables */
 	unsigned long nr_mapped;	/* mapped into pagetables */
 	unsigned long nr_slab;		/* In slab */
@@ -130,6 +132,7 @@ extern void get_full_page_state(struct p
 
 #define inc_page_state(member)	mod_page_state(member, 1UL)
 #define dec_page_state(member)	mod_page_state(member, 0UL - 1)
+#define sub_page_state(member,delta) mod_page_state(member, 0UL - (delta))
 
 
 /*
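
As with dec_page_state(), the new sub_page_state() leans on unsigned
wraparound: mod_page_state() only ever adds, so subtraction is
spelled as adding 0UL - delta, which is exact for unsigned longs. A
minimal standalone sketch of the idiom (plain userspace C, not the
kernel macro itself):

#include <assert.h>
#include <stdio.h>

/* Sketch of the mod_page_state() idiom: the helper only ever adds,
 * so subtraction is expressed as adding 0UL - delta; unsigned
 * wraparound makes that exact. */
static unsigned long counter;

static void mod_page_state(unsigned long delta)
{
	counter += delta;
}

int main(void)
{
	mod_page_state(1UL);		/* inc_page_state()       */
	mod_page_state(5UL);
	mod_page_state(0UL - 4);	/* sub_page_state(..., 4) */
	mod_page_state(0UL - 1);	/* dec_page_state()       */
	assert(counter == 1);
	printf("counter = %lu\n", counter);
	return 0;
}
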
diff -puN mm/page_alloc.c~nfs-resource-management mm/page_alloc.c
--- 25/mm/page_alloc.c~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-04-13 12:45:29.000000000 -0700
@@ -941,11 +941,13 @@ void show_free_areas(void)
 		K(nr_free_pages()),
 		K(nr_free_highpages()));
 
-	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu free:%u\n",
+	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
+		"unstable:%lu free:%u\n",
 		active,
 		inactive,
 		ps.nr_dirty,
 		ps.nr_writeback,
+		ps.nr_unstable,
 		nr_free_pages());
 
 	for_each_zone(zone) {
@@ -1444,6 +1446,7 @@ struct seq_operations fragmentation_op =
 static char *vmstat_text[] = {
 	"nr_dirty",
 	"nr_writeback",
+	"nr_unstable",
 	"nr_page_table_pages",
 	"nr_mapped",
 	"nr_slab",
diff -puN mm/page-writeback.c~nfs-resource-management mm/page-writeback.c
--- 25/mm/page-writeback.c~nfs-resource-management	2003-04-13 12:45:29.000000000 -0700
+++ 25-akpm/mm/page-writeback.c	2003-04-13 12:45:29.000000000 -0700
@@ -138,6 +138,7 @@ get_dirty_limits(struct page_state *ps, 
 void balance_dirty_pages(struct address_space *mapping)
 {
 	struct page_state ps;
+	long nr_reclaimable;
 	long background_thresh;
 	long dirty_thresh;
 	unsigned long pages_written = 0;
@@ -145,8 +146,7 @@ void balance_dirty_pages(struct address_
 
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
-	get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-	while (ps.nr_dirty + ps.nr_writeback > dirty_thresh) {
+	for (;;) {
 		struct writeback_control wbc = {
 			.bdi		= bdi,
 			.sync_mode	= WB_SYNC_NONE,
@@ -154,24 +154,37 @@ void balance_dirty_pages(struct address_
 			.nr_to_write	= write_chunk,
 		};
 
+		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
+		nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+		if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+			break;
+
 		dirty_exceeded = 1;
 
-		if (ps.nr_dirty)
+		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+		 * Unstable writes are a feature of certain networked
+		 * filesystems (i.e. NFS) in which data may have been
+		 * written to the server's write cache, but has not yet
+		 * been flushed to permanent storage.
+		 */
+		if (nr_reclaimable) {
 			writeback_inodes(&wbc);
-
-		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
-			break;
-		pages_written += write_chunk - wbc.nr_to_write;
-		if (pages_written >= write_chunk)
-			break;		/* We've done our duty */
+			get_dirty_limits(&ps, &background_thresh,
+					&dirty_thresh);
+			nr_reclaimable = ps.nr_dirty + ps.nr_unstable;
+			if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
+				break;
+			pages_written += write_chunk - wbc.nr_to_write;
+			if (pages_written >= write_chunk)
+				break;		/* We've done our duty */
+		}
 		blk_congestion_wait(WRITE, HZ/10);
 	}
 
-	if (ps.nr_dirty + ps.nr_writeback <= dirty_thresh)
+	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;
 
-	if (!writeback_in_progress(bdi) && ps.nr_dirty > background_thresh)
+	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -231,7 +244,8 @@ static void background_writeout(unsigned
 		long dirty_thresh;
 
 		get_dirty_limits(&ps, &background_thresh, &dirty_thresh);
-		if (ps.nr_dirty < background_thresh && min_pages <= 0)
+		if (ps.nr_dirty + ps.nr_unstable < background_thresh
+				&& min_pages <= 0)
 			break;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
@@ -302,7 +316,7 @@ static void wb_kupdate(unsigned long arg
 	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
 	start_jif = jiffies;
 	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
-	nr_to_write = ps.nr_dirty;
+	nr_to_write = ps.nr_dirty + ps.nr_unstable;
 	while (nr_to_write > 0) {
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
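
To summarise the page-writeback.c rework: the limits are resampled at
the top of every iteration, and the throttle now keys off
nr_reclaimable = nr_dirty + nr_unstable rather than nr_dirty alone,
so unstable NFS pages count against the dirty threshold too. A rough
userspace model of the new loop (writeback_inodes() is a fake
stand-in and the congestion wait is elided):

#include <stdio.h>

#define WRITE_CHUNK	32
#define DIRTY_THRESH	100

static unsigned long nr_dirty = 90, nr_unstable = 40, nr_writeback;

/* Fake writeback: commit unstable pages first, then write dirty
 * ones.  Returns the number of pages "written". */
static unsigned long writeback_inodes(unsigned long chunk)
{
	unsigned long n = chunk;

	while (n && nr_unstable) { nr_unstable--; n--; }
	while (n && nr_dirty)    { nr_dirty--; n--; }
	return chunk - n;
}

static void balance_dirty_pages(void)
{
	unsigned long pages_written = 0;

	for (;;) {
		unsigned long nr_reclaimable = nr_dirty + nr_unstable;

		if (nr_reclaimable + nr_writeback <= DIRTY_THRESH)
			break;
		if (nr_reclaimable) {
			pages_written += writeback_inodes(WRITE_CHUNK);
			nr_reclaimable = nr_dirty + nr_unstable;
			if (nr_reclaimable + nr_writeback <= DIRTY_THRESH)
				break;
			if (pages_written >= WRITE_CHUNK)
				break;	/* we've done our duty */
		}
		/* the kernel would blk_congestion_wait(WRITE, HZ/10) here */
	}
}

int main(void)
{
	printf("before: dirty=%lu unstable=%lu\n", nr_dirty, nr_unstable);
	balance_dirty_pages();
	printf("after:  dirty=%lu unstable=%lu\n", nr_dirty, nr_unstable);
	return 0;
}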

_
