From: Ram Pai <linuxram@us.ibm.com>

Currently the readahead code tends to read one more page than it should with
seeky database-style loads.  This was to prevent bogus readahead triggering
when we step into the last page of the current window.

The patch removes that workaround and fixes up the suboptimal logic instead.


With regard to the "rounding errors" mentioned in this patch, Ram provided
the following description:

  Say the i/o size is 20 pages.

  Our algorithm starts with an initial average i/o size of 'ra_pages/2',
  which is typically 16.

  Now every time we take an average, the 'average' progresses as follows:
  (16+20)/2=18
  (18+20)/2=19
  (19+20)/2=19
  (19+20)/2=19.....
  and the rounding error means it never reaches 20.


Benchmarking sitrep:

			IOZONE 

	run on a nfs mounted filesystem:
	client machine 2proc, 733MHz, 2GB memory
	server machine 8proc, 700MHz, 8GB memory

./iozone -c -t1 -s 4096m -r 128k


---------------------------------------------------------
|		| throughput |	throughput | throughput |
|		| KB/sec     |	KB/sec     | KB/sec     |
|		| 266	     |	266+patch  | 2.4.20     |
---------------------------------------------------------
|sequential read| 11697.55   |	11700.98   | 10846.87   |
| 		|	     |             |            |
|re-read	| 11698.39   |	11691.84   | 10865.39   |
|		|	     |             |            |
|reverse read	| 20002.71   |	20099.86   | 10340.34   |
|               |            |             |            |
|stride read	| 13813.01   |	13850.28   | 10193.87   |
|		|	     |             |            |
|random read	| 19705.06   |	19978.00   | 10839.57   |
|               |            |             |            |
|random mix	| 28465.68   |	29964.38   | 10779.17   |
|		|	     |             |            |
|pread		| 11692.95   |	11697.29   | 10863.56   |
---------------------------------------------------------


			SYSBENCH

	run on machine 2proc, 733MHz, 256MB memory


---------------------------------------------------------
|		| 266	     |	266+patch  | 2.4.21     |
---------------------------------------------------------
|time spent     | 79.6253    |	79.8176    | 73.2605sec |
| 		|	     |             |            |
|Mb/sec		| 1.959Mb/sec|	1.954Mb/sec| 2.129Mb/sec|
|		|	     |             |            |
|requests/sec 	| 125.59     |	125.29     | 136.54	|
|               |            |             |            |
|no of Reads 	| 6001       |	6001	   | 6008	|
|		|	     |             |            |
|no of Writes 	| 3999	     |	3999       | 3995	|
|               |            |             |            |
---------------------------------------------------------


---

 25-akpm/mm/readahead.c |   46 +++++++++++++++++++++++-----------------------
 1 files changed, 23 insertions(+), 23 deletions(-)

diff -puN mm/readahead.c~seeky-readahead-speedups mm/readahead.c
--- 25/mm/readahead.c~seeky-readahead-speedups	Mon May 17 13:59:13 2004
+++ 25-akpm/mm/readahead.c	Mon May 17 13:59:13 2004
@@ -353,7 +353,7 @@ page_cache_readahead(struct address_spac
 	unsigned orig_next_size;
 	unsigned actual;
 	int first_access=0;
-	unsigned long preoffset=0;
+	unsigned long average;
 
 	/*
 	 * Here we detect the case where the application is performing
@@ -394,10 +394,17 @@ page_cache_readahead(struct address_spac
 		if (ra->serial_cnt <= (max * 2))
 			ra->serial_cnt++;
 	} else {
-		ra->average = (ra->average + ra->serial_cnt) / 2;
+		/*
+		 * to avoid rounding errors, ensure that 'average'
+		 * tends towards the value of ra->serial_cnt.
+		 */
+		average = ra->average;
+		if (average < ra->serial_cnt) {
+			average++;
+		}
+		ra->average = (average + ra->serial_cnt) / 2;
 		ra->serial_cnt = 1;
 	}
-	preoffset = ra->prev_page;
 	ra->prev_page = offset;
 
 	if (offset >= ra->start && offset <= (ra->start + ra->size)) {
@@ -457,18 +464,13 @@ do_io:
 		 * ahead window and get some I/O underway for the new
 		 * current window.
 		 */
-		if (!first_access && preoffset >= ra->start &&
-				preoffset < (ra->start + ra->size)) {
-			 /* Heuristic:  If 'n' pages were
-			  * accessed in the current window, there
-			  * is a high probability that around 'n' pages
-			  * shall be used in the next current window.
-			  *
-			  * To minimize lazy-readahead triggered
-			  * in the next current window, read in
-			  * an extra page.
+		if (!first_access) {
+			 /* Heuristic: there is a high probability
+			  * that around  ra->average number of
+			  * pages shall be accessed in the next
+			  * current window.
 			  */
-			ra->next_size = preoffset - ra->start + 2;
+			ra->next_size = min(ra->average , (unsigned long)max);
 		}
 		ra->start = offset;
 		ra->size = ra->next_size;
@@ -492,21 +494,19 @@ do_io:
 		 */
 		if (ra->ahead_start == 0) {
 			/*
-			 * if the average io-size is less than maximum
+			 * If the average io-size is more than maximum
 			 * readahead size of the file the io pattern is
 			 * sequential. Hence  bring in the readahead window
 			 * immediately.
-			 * Else the i/o pattern is random. Bring
-			 * in the readahead window only if the last page of
-			 * the current window is accessed (lazy readahead).
+			 * If the average io-size is less than maximum
+			 * readahead size of the file the io pattern is
+			 * random. Hence don't bother to readahead.
 			 */
-			unsigned long average = ra->average;
-
+			average = ra->average;
 			if (ra->serial_cnt > average)
-				average = (ra->serial_cnt + ra->average) / 2;
+				average = (ra->serial_cnt + ra->average + 1) / 2;
 
-			if ((average >= max) || (offset == (ra->start +
-							ra->size - 1))) {
+			if (average > max) {
 				ra->ahead_start = ra->start + ra->size;
 				ra->ahead_size = ra->next_size;
 				actual = do_page_cache_readahead(mapping, filp,

_