From: Rusty Russell <rusty@rustcorp.com.au>

OK, this does the *minimum* required to support DEFINE_PER_CPU inside
modules.  If we decide to change kmalloc_percpu later, great, we can turf
this out.

Basically, it over-allocates the per-cpu area at boot to at least
PERCPU_ENOUGH_ROOM if CONFIG_MODULES=y (arch-overridable, 32k by default:
I have only 5700 bytes of percpu data in my kernel here, so that leaves
plenty of room), and a simple first-fit allocator in module.c dishes it
out to modules as they load.

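With this in place a module can declare per-cpu data exactly like the core
kernel does.  A minimal, hypothetical example module (illustration only,
not part of the patch; "hits" and count_hit() are made-up names):

#include <linux/module.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, hits);

static void count_hit(void)
{
	/* get_cpu_var() disables preemption and hands back this
	   CPU's copy as an lvalue; put_cpu_var() re-enables. */
	get_cpu_var(hits)++;
	put_cpu_var(hits);
}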

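For reference, the new allocator in module.c tracks the whole area as an
array of block sizes, a negative size meaning "in use".  A sketch of the
bookkeeping with illustrative numbers only (assuming the 5700 bytes of
core percpu data above, 128-byte SMP_CACHE_BYTES, and the default 32k
PERCPU_ENOUGH_ROOM):

	/* After boot (percpu_modinit): core data used (5700 rounded
	   up to 5760), the remaining 27008 bytes one free block. */
	pcpu_size[] = { -5760, 27008 }

	/* Load a module with 256 bytes of percpu data: split_block()
	   carves it off the free block, percpu_modalloc() marks it used. */
	pcpu_size[] = { -5760, -256, 26752 }

	/* Unload it: percpu_modfree() flips the sign back and merges
	   it with the following free block. */
	pcpu_size[] = { -5760, 27008 }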

 arch/ia64/kernel/module.c    |   10 ++
 include/asm-generic/percpu.h |   13 +-
 include/asm-ia64/percpu.h    |    5 -
 include/linux/module.h       |    3 
 include/linux/percpu.h       |    6 +
 init/main.c                  |    8 +
 kernel/module.c              |  202 +++++++++++++++++++++++++++++++++++++++++--
 7 files changed, 229 insertions(+), 18 deletions(-)

diff -puN arch/ia64/kernel/module.c~DEFINE_PERCPU-in-modules arch/ia64/kernel/module.c
--- 25/arch/ia64/kernel/module.c~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/arch/ia64/kernel/module.c	2003-05-26 01:43:46.000000000 -0700
@@ -887,3 +887,13 @@ module_arch_cleanup (struct module *mod)
 	if (mod->arch.unwind)
 		unw_remove_unwind_table(mod->arch.unw_table);
 }
+
+#ifdef CONFIG_SMP
+void percpu_modcopy(void *pcpudst, const void *src, unsigned long size)
+{
+	unsigned int i;
+	for (i = 0; i < NR_CPUS; i++)
+		if (cpu_possible(i))
+			memcpy(pcpudst + __per_cpu_offset[i], src, size);
+}
+#endif /* CONFIG_SMP */
diff -puN include/asm-generic/percpu.h~DEFINE_PERCPU-in-modules include/asm-generic/percpu.h
--- 25/include/asm-generic/percpu.h~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/include/asm-generic/percpu.h	2003-05-26 01:43:46.000000000 -0700
@@ -8,22 +8,25 @@
 extern unsigned long __per_cpu_offset[NR_CPUS];
 
 /* Separate out the type, so (int[3], foo) works. */
-#ifndef MODULE
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
-#endif
 
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
 #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
 
+static inline void percpu_modcopy(void *pcpudst, const void *src,
+				  unsigned long size)
+{
+	unsigned int i;
+	for (i = 0; i < NR_CPUS; i++)
+		if (cpu_possible(i))
+			memcpy(pcpudst + __per_cpu_offset[i], src, size);
+}
 #else /* ! SMP */
 
-/* Can't define per-cpu variables in modules.  Sorry --RR */
-#ifndef MODULE
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) name##__per_cpu
-#endif
 
 #define per_cpu(var, cpu)			((void)cpu, var##__per_cpu)
 #define __get_cpu_var(var)			var##__per_cpu
diff -puN include/asm-ia64/percpu.h~DEFINE_PERCPU-in-modules include/asm-ia64/percpu.h
--- 25/include/asm-ia64/percpu.h~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/include/asm-ia64/percpu.h	2003-05-26 01:43:46.000000000 -0700
@@ -8,6 +8,7 @@
  * Copyright (C) 2002-2003 Hewlett-Packard Co
  *	David Mosberger-Tang <davidm@hpl.hp.com>
  */
+#define PERCPU_ENOUGH_ROOM PERCPU_PAGE_SIZE
 
 #ifdef __ASSEMBLY__
 
@@ -19,15 +20,15 @@
 
 extern unsigned long __per_cpu_offset[NR_CPUS];
 
-#ifndef MODULE
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) name##__per_cpu
-#endif
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) name##__per_cpu
 
 #define __get_cpu_var(var)	(var##__per_cpu)
 #ifdef CONFIG_SMP
 # define per_cpu(var, cpu)	(*RELOC_HIDE(&var##__per_cpu, __per_cpu_offset[cpu]))
+
+extern void percpu_modcopy(void *pcpudst, const void *src, unsigned long size);
 #else
 # define per_cpu(var, cpu)	((void)cpu, __get_cpu_var(var))
 #endif
diff -puN include/linux/module.h~DEFINE_PERCPU-in-modules include/linux/module.h
--- 25/include/linux/module.h~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/include/linux/module.h	2003-05-26 01:43:46.000000000 -0700
@@ -247,6 +247,9 @@ struct module
 	char *strtab;
 #endif
 
+	/* Per-cpu data. */
+	void *percpu;
+
 	/* The command line arguments (may be mangled).  People like
 	   keeping pointers to this stuff */
 	char *args;
diff -puN include/linux/percpu.h~DEFINE_PERCPU-in-modules include/linux/percpu.h
--- 25/include/linux/percpu.h~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/include/linux/percpu.h	2003-05-26 01:44:32.000000000 -0700
@@ -2,9 +2,15 @@
 #define __LINUX_PERCPU_H
 #include <linux/spinlock.h> /* For preempt_disable() */
 #include <linux/slab.h> /* For kmalloc() */
+#include <linux/smp.h>
 #include <linux/string.h> /* For memset() */
 #include <asm/percpu.h>
 
+/* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
+#ifndef PERCPU_ENOUGH_ROOM
+#define PERCPU_ENOUGH_ROOM 32768
+#endif
+
 /* Must be an lvalue. */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
diff -puN init/main.c~DEFINE_PERCPU-in-modules init/main.c
--- 25/init/main.c~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/init/main.c	2003-05-26 01:43:46.000000000 -0700
@@ -318,14 +318,16 @@ static void __init setup_per_cpu_areas(v
 
 	/* Copy section for each CPU (we discard the original) */
 	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
-	if (!size)
-		return;
+#ifdef CONFIG_MODULES
+	if (size < PERCPU_ENOUGH_ROOM)
+		size = PERCPU_ENOUGH_ROOM;
+#endif
 
 	ptr = alloc_bootmem(size * NR_CPUS);
 
 	for (i = 0; i < NR_CPUS; i++, ptr += size) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, size);
+		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 	}
 }
 #endif /* !__GENERIC_PER_CPU */
diff -puN kernel/module.c~DEFINE_PERCPU-in-modules kernel/module.c
--- 25/kernel/module.c~DEFINE_PERCPU-in-modules	2003-05-26 01:43:46.000000000 -0700
+++ 25-akpm/kernel/module.c	2003-05-26 01:43:46.000000000 -0700
@@ -205,6 +205,167 @@ static struct module *find_module(const 
 	return NULL;
 }
 
+#ifdef CONFIG_SMP
+/* Number of blocks used and allocated. */
+static unsigned int pcpu_num_used, pcpu_num_allocated;
+/* Size of each block.  -ve means used. */
+static int *pcpu_size;
+
+static int split_block(unsigned int i, unsigned short size)
+{
+	/* Reallocation required? */
+	if (pcpu_num_used + 1 > pcpu_num_allocated) {
+		int *new = kmalloc(sizeof(new[0]) * pcpu_num_allocated*2,
+				   GFP_KERNEL);
+		if (!new)
+			return 0;
+
+		memcpy(new, pcpu_size, sizeof(new[0])*pcpu_num_allocated);
+		pcpu_num_allocated *= 2;
+		kfree(pcpu_size);
+		pcpu_size = new;
+	}
+
+	/* Insert a new subblock */
+	memmove(&pcpu_size[i+1], &pcpu_size[i],
+		sizeof(pcpu_size[0]) * (pcpu_num_used - i));
+	pcpu_num_used++;
+
+	pcpu_size[i+1] -= size;
+	pcpu_size[i] = size;
+	return 1;
+}
+
+static inline unsigned int block_size(int val)
+{
+	if (val < 0)
+		return -val;
+	return val;
+}
+
+/* Created by linker magic */
+extern char __per_cpu_start[], __per_cpu_end[];
+
+static void *percpu_modalloc(unsigned long size, unsigned long align)
+{
+	unsigned long extra;
+	unsigned int i;
+	void *ptr;
+
+	BUG_ON(align > SMP_CACHE_BYTES);
+
+	ptr = __per_cpu_start;
+	for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+		/* Extra for alignment requirement. */
+		extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
+		BUG_ON(i == 0 && extra != 0);
+
+		if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
+			continue;
+
+		/* Transfer extra to previous block. */
+		if (pcpu_size[i-1] < 0)
+			pcpu_size[i-1] -= extra;
+		else
+			pcpu_size[i-1] += extra;
+		pcpu_size[i] -= extra;
+		ptr += extra;
+
+		/* Split block if warranted */
+		if (pcpu_size[i] - size > sizeof(unsigned long))
+			if (!split_block(i, size))
+				return NULL;
+
+		/* Mark allocated */
+		pcpu_size[i] = -pcpu_size[i];
+		return ptr;
+	}
+
+	printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
+	       size);
+	return NULL;
+}
+
+static void percpu_modfree(void *freeme)
+{
+	unsigned int i;
+	void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+
+	/* First entry is core kernel percpu data. */
+	for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
+		if (ptr == freeme) {
+			pcpu_size[i] = -pcpu_size[i];
+			goto free;
+		}
+	}
+	BUG();
+
+ free:
+	/* Merge with previous? */
+	if (pcpu_size[i-1] >= 0) {
+		pcpu_size[i-1] += pcpu_size[i];
+		pcpu_num_used--;
+		memmove(&pcpu_size[i], &pcpu_size[i+1],
+			(pcpu_num_used - i) * sizeof(pcpu_size[0]));
+		i--;
+	}
+	/* Merge with next? */
+	if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
+		pcpu_size[i] += pcpu_size[i+1];
+		pcpu_num_used--;
+		memmove(&pcpu_size[i+1], &pcpu_size[i+2],
+			(pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
+	}
+}
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+				 Elf_Shdr *sechdrs,
+				 const char *secstrings)
+{
+	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static int percpu_modinit(void)
+{
+	pcpu_num_used = 2;
+	pcpu_num_allocated = 2;
+	pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
+			    GFP_KERNEL);
+	/* Static in-kernel percpu data (used). */
+	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+	/* Free room. */
+	pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
+	if (pcpu_size[1] < 0) {
+		printk(KERN_ERR "No per-cpu room for modules.\n");
+		pcpu_num_used = 1;
+	}
+
+	return 0;
+}	
+__initcall(percpu_modinit);
+#else /* ... !CONFIG_SMP */
+static inline void *percpu_modalloc(unsigned long size, unsigned long align)
+{
+	return NULL;
+}
+static inline void percpu_modfree(void *pcpuptr)
+{
+	BUG();
+}
+static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
+					Elf_Shdr *sechdrs,
+					const char *secstrings)
+{
+	return 0;
+}
+static inline void percpu_modcopy(void *pcpudst, const void *src,
+				  unsigned long size)
+{
+	/* pcpusec should be 0, and size of that section should be 0. */
+	BUG_ON(size != 0);
+}
+#endif /* CONFIG_SMP */
+
 #ifdef CONFIG_MODULE_UNLOAD
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
@@ -913,6 +1074,8 @@ static void free_module(struct module *m
 	/* This may be NULL, but that's OK */
 	module_free(mod, mod->module_init);
 	kfree(mod->args);
+	if (mod->percpu)
+		percpu_modfree(mod->percpu);
 
 	/* Finally, free the core (containing the module structure) */
 	module_free(mod, mod->module_core);
@@ -939,10 +1102,11 @@ static int simplify_symbols(Elf_Shdr *se
 			    unsigned int symindex,
 			    const char *strtab,
 			    unsigned int versindex,
+			    unsigned int pcpuindex,
 			    struct module *mod)
 {
 	Elf_Sym *sym = (void *)sechdrs[symindex].sh_addr;
-	
+	unsigned long secbase;
 	unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
 	int ret = 0;
 
@@ -979,10 +1143,12 @@ static int simplify_symbols(Elf_Shdr *se
 			break;
 
 		default:
-			sym[i].st_value 
-				= (unsigned long)
-				(sechdrs[sym[i].st_shndx].sh_addr
-				 + sym[i].st_value);
+			/* Divert to percpu allocation if a percpu var. */
+			if (sym[i].st_shndx == pcpuindex)
+				secbase = (unsigned long)mod->percpu;
+			else
+				secbase = sechdrs[sym[i].st_shndx].sh_addr;
+			sym[i].st_value += secbase;
 			break;
 		}
 	}
@@ -1119,7 +1285,7 @@ static struct module *load_module(void _
 	char *secstrings, *args, *modmagic, *strtab = NULL;
 	unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
 		exportindex, modindex, obsparmindex, infoindex, gplindex,
-		crcindex, gplcrcindex, versindex;
+		crcindex, gplcrcindex, versindex, pcpuindex;
 	long arglen;
 	struct module *mod;
 	long err = 0;
@@ -1194,6 +1360,7 @@ static struct module *load_module(void _
 	obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
 	versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
 	infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
+	pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
 
 	/* Don't keep modinfo section */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1250,6 +1417,17 @@ static struct module *load_module(void _
 	if (err < 0)
 		goto free_mod;
 
+	if (pcpuindex) {
+		/* We have a special allocation for this section. */
+		mod->percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
+					      sechdrs[pcpuindex].sh_addralign);
+		if (!mod->percpu) {
+			err = -ENOMEM;
+			goto free_mod;
+		}
+		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
+	}
+
 	/* Determine total sizes, and put offsets in sh_entsize.  For now
 	   this is done generically; there doesn't appear to be any
 	   special cases for the architectures. */
@@ -1259,7 +1437,7 @@ static struct module *load_module(void _
 	ptr = module_alloc(mod->core_size);
 	if (!ptr) {
 		err = -ENOMEM;
-		goto free_mod;
+		goto free_percpu;
 	}
 	memset(ptr, 0, mod->core_size);
 	mod->module_core = ptr;
@@ -1303,7 +1481,8 @@ static struct module *load_module(void _
 	set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
 
 	/* Fix up syms, so that st_value is a pointer to location. */
-	err = simplify_symbols(sechdrs, symindex, strtab, versindex, mod);
+	err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
+			       mod);
 	if (err < 0)
 		goto cleanup;
 
@@ -1342,6 +1521,10 @@ static struct module *load_module(void _
 			goto cleanup;
 	}
 
+	/* Finally, copy percpu area over. */
+	percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
+		       sechdrs[pcpuindex].sh_size);
+
 #ifdef CONFIG_KALLSYMS
 	mod->symtab = (void *)sechdrs[symindex].sh_addr;
 	mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
@@ -1383,6 +1566,9 @@ static struct module *load_module(void _
 	module_free(mod, mod->module_init);
  free_core:
 	module_free(mod, mod->module_core);
+ free_percpu:
+	if (mod->percpu)
+		percpu_modfree(mod->percpu);
  free_mod:
 	kfree(args);
  free_hdr:
