Return-Path: <mbligh@w-mbligh>
X-Sieve: cmu-sieve 2.0
Return-path: <agl@us.ibm.com>
Envelope-to: mbligh@localhost
Delivery-date: Wed, 17 Mar 2004 14:24:41 -0800
Received: from w-mbligh.beaverton.ibm.com
	([127.0.0.1] helo=mail.aracnet.com ident=mbligh)
	by w-mbligh.beaverton.ibm.com with esmtp (Exim 3.35 #1 (Debian))
	id 1B3jSq-0002xN-00
	for <mbligh@localhost>; Wed, 17 Mar 2004 14:24:40 -0800
Received: from psmtp.com (exprod5mx105.postini.com [12.158.34.61])
	by citrine.spiritone.com (8.12.10/8.12.8) with SMTP id i2HMP9HT029115
	for <mbligh@aracnet.com>; Wed, 17 Mar 2004 14:25:09 -0800
Delivered-To: <mbligh@aracnet.com>
Received: from source ([32.97.182.106]) by exprod5mx105.postini.com ([12.158.34.245]) with SMTP;
	Wed, 17 Mar 2004 17:21:22 EST
Received: from northrelay02.pok.ibm.com (northrelay02.pok.ibm.com [9.56.224.150])
	by e6.ny.us.ibm.com (8.12.10/8.12.2) with ESMTP id i2HMLH4i534814
	for <mbligh@aracnet.com>; Wed, 17 Mar 2004 17:21:17 -0500
Received: from DYN317989BLD.beaverton.ibm.com (d01av02.pok.ibm.com [9.56.224.216])
	by northrelay02.pok.ibm.com (8.12.10/NCO/VER6.6) with ESMTP id i2HMLFEW128918
	for <mbligh@aracnet.com>; Wed, 17 Mar 2004 17:21:16 -0500
Subject: 2.6.4-mjb1 : 780-hugetlb_dyn_as
From: Adam Litke <agl@us.ibm.com>
To: Martin Bligh <mbligh@aracnet.com>
Content-Type: text/plain
Organization: IBM
Message-Id: <1079561743.5231.4.camel@agtpad>
Mime-Version: 1.0
X-Mailer: Ximian Evolution 1.4.5 
Date: Wed, 17 Mar 2004 14:15:43 -0800
Content-Transfer-Encoding: 7bit
X-Accept: 2.6 or must-fix


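This patch makes the 32-bit hugetlb window on ppc64 dynamic: instead of the
fixed 2-3G range, the window's end is derived from the process' stack rlimit
(segment aligned), and grow_hugetlb_region() moves the base downward on demand,
bounded by TASK_HPAGE_BASE_32.  Below is a minimal userspace-style sketch of the
end-address calculation only; it copies the kernel constants from the patch, and
the 256MB stack limit is just an example, not anything the patch mandates.

	/* Illustrative only: mirrors get_hugetlb_area_end() with copies of the
	 * kernel constants; the 256MB stack limit is an arbitrary example. */
	#include <stdio.h>

	#define TASK_HPAGE_END_32  0xf0000000UL          /* include/asm-ppc64/page.h */
	#define MM_SEGMENT_SIZE    (1UL << 28)            /* include/asm-ppc64/mmu.h */
	#define ALIGN_DOWN(x, sz)  ((x) & ~((sz) - 1))    /* same effect as _ALIGN_DOWN */

	int main(void)
	{
		unsigned long rlim_stack = 256UL << 20;   /* e.g. ulimit -s 262144 */
		unsigned long stack_end = 0xffffffffUL - rlim_stack;

		if (stack_end > TASK_HPAGE_END_32)
			stack_end = TASK_HPAGE_END_32;

		/* Prints e0000000: the window ends at the segment boundary below
		 * the lowest address the stack is allowed to reach. */
		printf("hugetlb_end = %lx\n", ALIGN_DOWN(stack_end, MM_SEGMENT_SIZE));
		return 0;
	}
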
diff -upN reference/arch/ppc64/kernel/setup.c current/arch/ppc64/kernel/setup.c
--- reference/arch/ppc64/kernel/setup.c	2004-03-30 10:13:21.000000000 -0800
+++ current/arch/ppc64/kernel/setup.c	2004-03-31 12:32:13.000000000 -0800
@@ -602,6 +602,10 @@ void __init setup_arch(char **cmdline_p)
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
 	init_mm.brk = klimit;
+#ifdef CONFIG_HUGETLBFS
+	init_mm.context.hugetlb_end = TASK_HPAGE_END_32;
+	init_mm.context.hugetlb_base = TASK_HPAGE_END_32;
+#endif
 	
 	/* Save unparsed command line copy for /proc/cmdline */
 	strlcpy(saved_command_line, cmd_line, sizeof(saved_command_line));
diff -upN reference/arch/ppc64/mm/hugetlbpage.c current/arch/ppc64/mm/hugetlbpage.c
--- reference/arch/ppc64/mm/hugetlbpage.c	2004-03-31 12:32:13.000000000 -0800
+++ current/arch/ppc64/mm/hugetlbpage.c	2004-03-31 12:32:13.000000000 -0800
@@ -22,6 +22,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
+#include <asm/mmu.h>
 #include <asm/machdep.h>
 #include <asm/cputable.h>
 #include <asm/tlb.h>
@@ -236,6 +237,23 @@ static void do_slbia(void *unused)
 	asm volatile ("isync; slbia; isync":::"memory");
 }
 
+/* Returns the correct ending address for a process' hugetlb region */
+static inline unsigned long 
+get_hugetlb_area_end(struct task_struct *task)
+{
+	u32 stack_end, default_end = TASK_HPAGE_END_32;
+
+	/*
+	 * We use rlim_cur so that unprivileged applications can signal
+	 * our code using ulimit.
+	 */
+	stack_end = 0xffffffff - task->rlim[RLIMIT_STACK].rlim_cur;
+	stack_end = min(default_end, stack_end);
+
+	/* Boundary must be segment aligned */
+	return _ALIGN_DOWN(stack_end, MM_SEGMENT_SIZE);
+}
+
 /* Activate the low hpage region for 32bit processes.  mmap_sem must
  * be held*/
 static int open_32bit_htlbpage_range(struct mm_struct *mm)
@@ -246,15 +264,19 @@ static int open_32bit_htlbpage_range(str
 	if (mm->context.low_hpages)
 		return 0; /* The window is already open */
 	
+	/* Set up the area boundaries */
+	mm->context.hugetlb_end = get_hugetlb_area_end(current);
+	mm->context.hugetlb_base = mm->context.hugetlb_end;
+	
 	/* Check no VMAs are in the region */
-	vma = find_vma(mm, TASK_HPAGE_BASE_32);
+	vma = find_vma(mm, mm->context.hugetlb_base);
 
-	if (vma && (vma->vm_start < TASK_HPAGE_END_32))
+	if (vma && (vma->vm_start < mm->context.hugetlb_end))
 		return -EBUSY;
 
 	/* Clean up any leftover PTE pages in the region */
 	spin_lock(&mm->page_table_lock);
-	for (addr = TASK_HPAGE_BASE_32; addr < TASK_HPAGE_END_32;
+	for (addr = mm->context.hugetlb_base; addr < mm->context.hugetlb_end;
 	     addr += PMD_SIZE) {
 		pgd_t *pgd = pgd_offset(mm, addr);
 		pmd_t *pmd = pmd_offset(pgd, addr);
@@ -586,8 +608,8 @@ full_search:
 		}
 		if (!vma || addr + len <= vma->vm_start) {
 			if (is_hugepage_only_range(addr, len)) {
-				if (addr < TASK_HPAGE_END_32)
-					addr = TASK_HPAGE_END_32;
+				if (addr < mm->context.hugetlb_end)
+					addr = mm->context.hugetlb_end;
 				else
 					addr = TASK_HPAGE_END;
 
@@ -603,6 +625,32 @@ full_search:
 	}
 }
 
+unsigned long grow_hugetlb_region(unsigned long hpage_base, unsigned long len)
+{
+	struct vm_area_struct *vma = NULL;
+	unsigned long new_base, vma_start = hpage_base;
+
+	vma = find_vma(current->mm, vma_start);
+	vma_start = (vma && vma->vm_start < current->mm->context.hugetlb_end) ? 
+		vma->vm_start : current->mm->context.hugetlb_end;
+	printk(KERN_DEBUG "First vma in hugetlb region starts at: %lx\n", vma_start);
+
+	new_base = _ALIGN_DOWN(vma_start - len, MM_SEGMENT_SIZE);
+	if (new_base < TASK_HPAGE_BASE_32)
+		return -ENOMEM;
+
+	printk(KERN_DEBUG "Trying to move hugetlb_base down to: %lx\n", new_base);
+	vma = find_vma(current->mm, new_base);
+	if (vma && vma->vm_start < hpage_base) {
+		printk(KERN_DEBUG "Found vma at %lx, aborting\n", vma->vm_start);
+		return -ENOMEM;
+	}
+
+	current->mm->context.hugetlb_base = new_base;
+	printk(KERN_DEBUG "Region clear, returning area at: %lx\n", vma_start - len);
+	return vma_start - len;
+}
+
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
@@ -623,8 +671,8 @@ unsigned long hugetlb_get_unmapped_area(
 		if (err)
 			return err; /* Should this just be EINVAL? */
 
-		base = TASK_HPAGE_BASE_32;
-		end = TASK_HPAGE_END_32;
+		base = current->mm->context.hugetlb_base;
+		end = current->mm->context.hugetlb_end;
 	} else {
 		base = TASK_HPAGE_BASE;
 		end = TASK_HPAGE_END;
@@ -637,6 +685,10 @@ unsigned long hugetlb_get_unmapped_area(
 	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
 		/* At this point:  (!vma || addr < vma->vm_end). */
 		if (addr + len > end) {
+			/* Try to get the space by expanding the hugetlb region */
+			addr = grow_hugetlb_region(base, len);
+			if ((long)addr > 0)
+				return addr;
 			if (test_thread_flag(TIF_32BIT))
 				close_32bit_htlbpage_range(current->mm);
 			return -ENOMEM;
diff -upN reference/include/asm-ppc64/mmu.h current/include/asm-ppc64/mmu.h
--- reference/include/asm-ppc64/mmu.h	2004-03-11 14:35:23.000000000 -0800
+++ current/include/asm-ppc64/mmu.h	2004-03-31 12:32:13.000000000 -0800
@@ -24,6 +24,8 @@ typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
 	int low_hpages;
+	unsigned long hugetlb_base;
+	unsigned long hugetlb_end;
 #endif
 } mm_context_t;
 
@@ -191,6 +193,8 @@ void create_valid_hpte( unsigned long sl
 
 #define LARGE_PAGE_SHIFT 24
 
+#define MM_SEGMENT_SIZE (1UL << 28)
+
 static inline unsigned long hpt_hash(unsigned long vpn, int large)
 {
 	unsigned long vsid;
diff -upN reference/include/asm-ppc64/page.h current/include/asm-ppc64/page.h
--- reference/include/asm-ppc64/page.h	2004-03-11 14:35:23.000000000 -0800
+++ current/include/asm-ppc64/page.h	2004-03-31 12:32:13.000000000 -0800
@@ -33,15 +33,21 @@
 #define TASK_HPAGE_BASE 	(0x0000010000000000UL)
 #define TASK_HPAGE_END 	(0x0000018000000000UL)
 
-/* For 32-bit processes the hugepage range is 2-3G */
-#define TASK_HPAGE_BASE_32	(0x80000000UL)
-#define TASK_HPAGE_END_32	(0xc0000000UL)
+/*
+ * We have much greater contention for segments in a
+ * 32-bit address space.  Therefore, the region reserved
+ * for huge pages is dynamically resized.  These values
+ * define the maximum range allowed for huge pages.
+ */
+#define TASK_HPAGE_BASE_32    (0x40000000UL)
+#define TASK_HPAGE_END_32     (0xf0000000UL)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(addr, len) \
 	( ((addr > (TASK_HPAGE_BASE-len)) && (addr < TASK_HPAGE_END)) || \
 	  (current->mm->context.low_hpages && \
-	   (addr > (TASK_HPAGE_BASE_32-len)) && (addr < TASK_HPAGE_END_32)) )
+	   (addr > (current->mm->context.hugetlb_base-len)) && \
+	   (addr < current->mm->context.hugetlb_end)) )
 #define hugetlb_free_pgtables free_pgtables
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
@@ -49,7 +55,7 @@
 	((cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) && \
 	 ((((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
 	  ((context).low_hpages && \
-	   (((addr) >= TASK_HPAGE_BASE_32) && ((addr) < TASK_HPAGE_END_32)))))
+	   (((addr) >= context.hugetlb_base) && ((addr) < context.hugetlb_end)))))
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
diff -upN reference/include/linux/sched.h current/include/linux/sched.h
--- reference/include/linux/sched.h	2004-03-31 12:32:13.000000000 -0800
+++ current/include/linux/sched.h	2004-03-31 12:32:13.000000000 -0800
@@ -500,6 +500,7 @@ struct task_struct {
 
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
+	unsigned long fault_count;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
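
Since get_hugetlb_area_end() reads rlim_cur, a 32-bit process can lower its own
stack limit before its first huge page mapping to leave more of the address
space available to the hugetlb window.  A rough userspace sketch of that usage
follows; the hugetlbfs mount point /mnt/huge and the sizes are assumptions for
illustration, not something this patch defines.

	/* Example only: shrink RLIMIT_STACK, then map one 16MB huge page from an
	 * assumed hugetlbfs mount at /mnt/huge. */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/resource.h>

	#define HPAGE_SIZE (16UL << 20)   /* ppc64 large pages: LARGE_PAGE_SHIFT 24 */

	int main(void)
	{
		struct rlimit rl = { 8UL << 20, 8UL << 20 };   /* 8MB stack for this example */
		void *p;
		int fd;

		/* Lowering the limit needs no privilege */
		if (setrlimit(RLIMIT_STACK, &rl))
			perror("setrlimit");

		fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* The kernel picks an address inside the hugetlb window it sized
		 * from the (now smaller) stack limit. */
		p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			perror("mmap");
		else
			printf("huge page mapped at %p\n", p);

		close(fd);
		return 0;
	}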