<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">
From: Nick Piggin &lt;nickpiggin@yahoo.com.au&gt;

Previously the -&gt;protection[] logic was broken.  It was difficult to follow
and basically didn't use the asynchronous reclaim watermarks (pages_min,
pages_low, pages_high) properly.

Now use -&gt;protection *only* for lower-zone protection.  So the allocator
now explicitly uses the -&gt;pages_low, -&gt;pages_min watermarks and adds
-&gt;protection on top of that, instead of trying to use -&gt;protection for
everything.

Pages are allocated down to (-&gt;pages_low + -&gt;protection).  Once this is
reached, kswapd, the background reclaimer, is started; after this, we can
allocate down to (-&gt;pages_min + -&gt;protection) without blocking.  The memory
below pages_min is reserved for __GFP_HIGH and PF_MEMALLOC allocations.
kswapd attempts to reclaim memory until -&gt;pages_high is reached.

Signed-off-by: Nick Piggin &lt;nickpiggin@yahoo.com.au&gt;
Signed-off-by: Andrew Morton &lt;akpm@osdl.org&gt;
---

 25-akpm/mm/page_alloc.c |  119 +++++++++++++++++++++---------------------------
 1 files changed, 53 insertions(+), 66 deletions(-)

diff -puN mm/page_alloc.c~alloc-pages-watermark-fixes mm/page_alloc.c
--- 25/mm/page_alloc.c~alloc-pages-watermark-fixes	2004-08-16 23:42:38.250558072 -0700
+++ 25-akpm/mm/page_alloc.c	2004-08-16 23:42:38.255557312 -0700
@@ -602,7 +602,7 @@ __alloc_pages(unsigned int gfp_mask, uns
 {
 	const int wait = gfp_mask &amp; __GFP_WAIT;
 	unsigned long min;
-	struct zone **zones;
+	struct zone **zones, *z;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
@@ -613,72 +613,56 @@ __alloc_pages(unsigned int gfp_mask, uns
 	might_sleep_if(wait);
 
 	zones = zonelist-&gt;zones;  /* the list of zones suitable for gfp_mask */
-	if (zones[0] == NULL)     /* no zones in the zonelist */
+
+	if (unlikely(zones[0] == NULL)) {
+		/* Should this ever happen?? */
 		return NULL;
+	}
 
 	alloc_type = zone_idx(zones[0]);
 
 	/* Go through the zonelist once, looking for a zone with enough free */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z-&gt;pages_low + (1&lt;&lt;order) + z-&gt;protection[alloc_type];
 
-		min = (1&lt;&lt;order) + z-&gt;protection[alloc_type];
-
-		/*
-		 * We let real-time tasks dip their real-time paws a little
-		 * deeper into reserves.
-		 */
-		if (rt_task(p))
-			min -= z-&gt;pages_low &gt;&gt; 1;
+		if (z-&gt;free_pages &lt; min)
+			continue;
 
-		if (z-&gt;free_pages &gt;= min ||
-				(!wait &amp;&amp; z-&gt;free_pages &gt;= z-&gt;pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
-	/* we're somewhat low on memory, failed to find what we needed */
-	for (i = 0; zones[i] != NULL; i++)
-		wakeup_kswapd(zones[i]);
-
-	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
-
-		min = (1&lt;&lt;order) + z-&gt;protection[alloc_type];
+	for (i = 0; (z = zones[i]) != NULL; i++)
+		wakeup_kswapd(z);
 
+	/*
+	 * Go through the zonelist again. Let __GFP_HIGH and allocations
+	 * coming from realtime tasks to go deeper into reserves
+	 */
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z-&gt;pages_min;
 		if (gfp_mask &amp; __GFP_HIGH)
-			min -= z-&gt;pages_low &gt;&gt; 2;
-		if (rt_task(p))
-			min -= z-&gt;pages_low &gt;&gt; 1;
+			min -= min&gt;&gt;1;
+		if (unlikely(rt_task(p)) &amp;&amp; !in_interrupt())
+			min -= min&gt;&gt;2;
+		min += (1&lt;&lt;order) + z-&gt;protection[alloc_type];
 
-		if (z-&gt;free_pages &gt;= min ||
-				(!wait &amp;&amp; z-&gt;free_pages &gt;= z-&gt;pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
-	}
+		if (z-&gt;free_pages &lt; min)
+			continue;
 
-	/* here we're in the low on memory slow path */
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
+	}
 
-rebalance:
+	/* This allocation should allow future memory freeing. */
 	if ((p-&gt;flags &amp; (PF_MEMALLOC | PF_MEMDIE)) &amp;&amp; !in_interrupt()) {
 		/* go through the zonelist yet again, ignoring mins */
-		for (i = 0; zones[i] != NULL; i++) {
-			struct zone *z = zones[i];
-
+		for (i = 0; (z = zones[i]) != NULL; i++) {
 			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
-				zone_statistics(zonelist, z);
+			if (page)
 				goto got_pg;
-			}
 		}
 		goto nopage;
 	}
@@ -687,6 +671,8 @@ rebalance:
 	if (!wait)
 		goto nopage;
 
+rebalance:
+	/* We now go into synchronous reclaim */
 	p-&gt;flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p-&gt;reclaim_state = &amp;reclaim_state;
@@ -697,27 +683,28 @@ rebalance:
 	p-&gt;flags &amp;= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	for (i = 0; zones[i] != NULL; i++) {
-		struct zone *z = zones[i];
+	for (i = 0; (z = zones[i]) != NULL; i++) {
+		min = z-&gt;pages_min;
+		if (gfp_mask &amp; __GFP_HIGH)
+			min -= min&gt;&gt;1;
+		if (unlikely(rt_task(p)) &amp;&amp; !in_interrupt())
+			min -= min&gt;&gt;2;
+		min += (1&lt;&lt;order) + z-&gt;protection[alloc_type];
 
-		min = (1UL &lt;&lt; order) + z-&gt;protection[alloc_type];
+		if (z-&gt;free_pages &lt; min)
+			continue;
 
-		if (z-&gt;free_pages &gt;= min ||
-				(!wait &amp;&amp; z-&gt;free_pages &gt;= z-&gt;pages_high)) {
-			page = buffered_rmqueue(z, order, gfp_mask);
-			if (page) {
- 				zone_statistics(zonelist, z);
-				goto got_pg;
-			}
-		}
+		page = buffered_rmqueue(z, order, gfp_mask);
+		if (page)
+			goto got_pg;
 	}
 
 	/*
 	 * Don't let big-order allocations loop unless the caller explicitly
 	 * requests that.  Wait for some write requests to complete then retry.
 	 *
-	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL, but that
-	 * may not be true in other implementations.
+	 * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
+	 * &lt;= 3, but that may not be true in other implementations.
 	 */
 	do_retry = 0;
 	if (!(gfp_mask &amp; __GFP_NORETRY)) {
@@ -740,6 +727,7 @@ nopage:
 	}
 	return NULL;
 got_pg:
+	zone_statistics(zonelist, z);
 	kernel_map_pages(page, 1 &lt;&lt; order, 1);
 	return page;
 }
@@ -1859,11 +1847,11 @@ static void setup_per_zone_protection(vo
 				 * We never protect zones that don't have memory
 				 * in them (j&gt;max_zone) or zones that aren't in
 				 * the zonelists for a certain type of
-				 * allocation (j&gt;i).  We have to assign these to
-				 * zero because the lower zones take
+				 * allocation (j&gt;=i).  We have to assign these
+				 * to zero because the lower zones take
 				 * contributions from the higher zones.
 				 */
-				if (j &gt; max_zone || j &gt; i) {
+				if (j &gt; max_zone || j &gt;= i) {
 					zone-&gt;protection[i] = 0;
 					continue;
 				}
@@ -1872,7 +1860,6 @@ static void setup_per_zone_protection(vo
 				 */
 				zone-&gt;protection[i] = higherzone_val(zone,
 								max_zone, i);
-				zone-&gt;protection[i] += zone-&gt;pages_low;
 			}
 		}
 	}
_
</pre></body></html>