From: Matthew Dobson <colpatch@us.ibm.com>




 Documentation/kernel-parameters.txt |    2 
 Documentation/sysctl/vm.txt         |    9 +++
 include/linux/mmzone.h              |   19 +++++++
 include/linux/sysctl.h              |    1 
 init/main.c                         |    1 
 kernel/sysctl.c                     |   11 ++++
 mm/page_alloc.c                     |   89 +++++++++++++++++++++++++-----------
 7 files changed, 104 insertions(+), 28 deletions(-)

diff -puN Documentation/kernel-parameters.txt~min_free_kbytes Documentation/kernel-parameters.txt
--- 25/Documentation/kernel-parameters.txt~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/Documentation/kernel-parameters.txt	2003-06-03 20:32:25.000000000 -0700
@@ -540,8 +540,6 @@ running once the system is up.
 			[KNL,ACPI] Mark specific memory as reserved.
 			Region of memory to be used, from ss to ss+nn.
 
-	memfrac=	[KNL]
-
 	meye=		[HW] Set MotionEye Camera parameters
 			See Documentation/video4linux/meye.txt.
 
diff -puN Documentation/sysctl/vm.txt~min_free_kbytes Documentation/sysctl/vm.txt
--- 25/Documentation/sysctl/vm.txt~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/Documentation/sysctl/vm.txt	2003-06-03 20:32:25.000000000 -0700
@@ -22,6 +22,7 @@ Currently, these files are in /proc/sys/
 - dirty_background_ratio
 - dirty_expire_centisecs
 - dirty_writeback_centisecs
+- min_free_kbytes
 
 ==============================================================
 
@@ -74,3 +75,11 @@ The number of pages the kernel reads in 
 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense
 for swap because we only cluster swap data in 32-page groups.
 
+==============================================================
+
+min_free_kbytes:
+
+This is used to force the Linux VM to keep a minimum number
+of kilobytes free.  The VM uses this number to compute a pages_min
+value for each lowmem zone in the system.  Each lowmem zone gets
+a number of reserved free pages in proportion to its size.
diff -puN include/linux/mmzone.h~min_free_kbytes include/linux/mmzone.h
--- 25/include/linux/mmzone.h~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/include/linux/mmzone.h	2003-06-03 20:32:25.000000000 -0700
@@ -249,6 +249,25 @@ static inline struct zone *next_zone(str
 #define for_each_zone(zone) \
 	for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone))
 
+/**
+ * is_highmem - report whether a zone is its node's ZONE_HIGHMEM entry.
+ *              Returns non-zero on a match, zero otherwise.  Exists so
+ *              general code can avoid naming ZONE_* constants directly.
+ * @zone: pointer to the struct zone to test
+ */
+static inline int is_highmem(struct zone *zone)
+{
+	return zone == zone->zone_pgdat->node_zones + ZONE_HIGHMEM;
+}
+
+/* These two functions are used to set up the per-zone pages_min values */
+struct ctl_table;
+struct file;
+int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
+					  void *, size_t *);
+extern void setup_per_zone_pages_min(void);
+
+
 #ifdef CONFIG_NUMA
 #define MAX_NR_MEMBLKS	BITS_PER_LONG /* Max number of Memory Blocks */
 #else /* !CONFIG_NUMA */
diff -puN include/linux/sysctl.h~min_free_kbytes include/linux/sysctl.h
--- 25/include/linux/sysctl.h~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/include/linux/sysctl.h	2003-06-03 20:32:25.000000000 -0700
@@ -156,6 +156,7 @@ enum
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
 	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
+	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 };
 
 
diff -puN init/main.c~min_free_kbytes init/main.c
--- 25/init/main.c~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/init/main.c	2003-06-03 20:32:25.000000000 -0700
@@ -390,6 +390,7 @@ asmlinkage void __init start_kernel(void
 	lock_kernel();
 	printk(linux_banner);
 	setup_arch(&command_line);
+	setup_per_zone_pages_min();
 	setup_per_cpu_areas();
 
 	/*
diff -puN kernel/sysctl.c~min_free_kbytes kernel/sysctl.c
--- 25/kernel/sysctl.c~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/kernel/sysctl.c	2003-06-03 20:34:13.000000000 -0700
@@ -57,6 +57,7 @@ extern char core_pattern[];
 extern int cad_pid;
 extern int pid_max;
 extern int sysctl_lower_zone_protection;
+extern int min_free_kbytes;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -661,6 +662,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= VM_MIN_FREE_KBYTES,
+		.procname	= "min_free_kbytes",
+		.data		= &min_free_kbytes,
+		.maxlen		= sizeof(min_free_kbytes),
+		.mode		= 0644,
+		.proc_handler	= &min_free_kbytes_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
diff -puN mm/page_alloc.c~min_free_kbytes mm/page_alloc.c
--- 25/mm/page_alloc.c~min_free_kbytes	2003-06-03 20:32:25.000000000 -0700
+++ 25-akpm/mm/page_alloc.c	2003-06-03 20:32:25.000000000 -0700
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/notifier.h>
 #include <linux/topology.h>
+#include <linux/sysctl.h>
 
 #include <asm/tlbflush.h>
 
@@ -49,9 +50,7 @@ struct zone *zone_table[MAX_NR_ZONES*MAX
 EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
-static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
-static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20 , 20, 20, };
-static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, };
+int min_free_kbytes = 1024;
 
 /*
  * Temporary debugging check for pages not lying within a given zone.
@@ -1212,7 +1211,6 @@ static void __init free_area_init_core(s
 	
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long mask;
 		unsigned long size, realsize;
 		unsigned long batch;
 
@@ -1286,15 +1284,6 @@ static void __init free_area_init_core(s
 
 		pgdat->nr_zones = j+1;
 
-		mask = (realsize / zone_balance_ratio[j]);
-		if (mask < zone_balance_min[j])
-			mask = zone_balance_min[j];
-		else if (mask > zone_balance_max[j])
-			mask = zone_balance_max[j];
-		zone->pages_min = mask;
-		zone->pages_low = mask*2;
-		zone->pages_high = mask*3;
-
 		zone->zone_mem_map = lmem_map;
 		zone->zone_start_pfn = zone_start_pfn;
 
@@ -1379,19 +1368,6 @@ void __init free_area_init(unsigned long
 }
 #endif
 
-static int __init setup_mem_frac(char *str)
-{
-	int j = 0;
-
-	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
-	printk("setup_mem_frac: ");
-	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
-	printk("\n");
-	return 1;
-}
-
-__setup("memfrac=", setup_mem_frac);
-
 #ifdef CONFIG_PROC_FS
 
 #include <linux/seq_file.h>
@@ -1568,3 +1544,64 @@ void __init page_alloc_init(void)
 	init_page_alloc_cpu(smp_processor_id());
 	register_cpu_notifier(&page_alloc_nb);
 }
+
+/*
+ * setup_per_zone_pages_min - recompute every zone's pages_{min,low,high}
+ *	from the current min_free_kbytes.  Lowmem zones split the reserve in
+ *	proportion to their present_pages; highmem zones get a small clamped
+ *	value instead.  pages_low and pages_high are 2x and 3x pages_min.
+ *	Each zone is updated while holding its lru_lock (irqsave variant).
+ */
+void setup_per_zone_pages_min(void)
+{
+	unsigned long reserve = min_free_kbytes >> (PAGE_SHIFT - 10);
+	unsigned long total_lowmem = 0;
+	unsigned long irq_flags;
+	struct zone *z;
+
+	/* Sum present_pages over all zones that are not highmem */
+	for_each_zone(z) {
+		if (is_highmem(z))
+			continue;
+		total_lowmem += z->present_pages;
+	}
+
+	for_each_zone(z) {
+		spin_lock_irqsave(&z->lru_lock, irq_flags);
+		if (!is_highmem(z)) {
+			/* lowmem: share of the reserve proportional to size */
+			z->pages_min = (reserve * z->present_pages) /
+					total_lowmem;
+		} else {
+			/*
+			 * Highmem often needs no real reserve, but these
+			 * values also batch up page reclaim, so raise to at
+			 * least SWAP_CLUSTER_MAX and then cap at 128.
+			 */
+			int hi_min = z->present_pages / 1024;
+
+			if (hi_min < SWAP_CLUSTER_MAX)
+				hi_min = SWAP_CLUSTER_MAX;
+			if (hi_min > 128)
+				hi_min = 128;
+			z->pages_min = hi_min;
+		}
+
+		z->pages_low = z->pages_min * 2;
+		z->pages_high = z->pages_min * 3;
+		spin_unlock_irqrestore(&z->lru_lock, irq_flags);
+	}
+}
+
+/*
+ * min_free_kbytes_sysctl_handler - wrapper around proc_dointvec_minmax()
+ *	that recomputes pages_{min,low,high} only after an error-free write.
+ */
+int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
+		struct file *file, void *buffer, size_t *length)
+{
+	int rc = proc_dointvec_minmax(table, write, file, buffer, length);
+	if (!rc && write)
+		setup_per_zone_pages_min();
+	return rc;
+}

_