LKML Archive on lore.kernel.org
From: Andi Kleen <ak@suse.de>
To: Amul Shah <amul.shah@unisys.com>, Andi Kleen <ak@suse.de>,
	Rohit Seth <rohitseth@google.com>,
	patches@x86-64.org, linux-kernel@vger.kernel.org
Subject: [PATCH 2.6.21 review I] [2/25] x86_64: Make the NUMA hash function nodemap allocation
Date: Sat, 10 Feb 2007 12:50:14 +0100 (CET)
Message-ID: <20070210115014.484C513DBF@wotan.suse.de>
In-Reply-To: <200702101250.142420000@suse.de>


From: Amul Shah <amul.shah@unisys.com>
Remove the statically allocated memory-to-NUMA-node hash map in favor of a
dynamically allocated, cache-aligned memory-to-node hash map.

This patch has the nice side effect of allowing the hash map to grow on
systems with large amounts of memory (256GB - 1TB) that also suffer from
having a small PCI space tacked onto the boot node (somewhere between 192MB
and 512MB on the ES7000).
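
For reference, the map this patch makes dynamic backs the shift-and-index
lookup in phys_to_nid() (see the mmzone.h hunk below).  A minimal userspace
sketch of that lookup; the shift and map contents here are made-up example
values, not taken from a real machine:

	#include <stdio.h>

	static int memnode_shift = 30;				/* example: 1GB granularity */
	static unsigned char memnodemap[] = { 0, 0, 1, 1 };	/* example: 4GB, two nodes */

	/* mirrors phys_to_nid(): index the map by (addr >> shift) */
	static int phys_to_nid(unsigned long addr)
	{
		return memnodemap[addr >> memnode_shift];
	}

	int main(void)
	{
		/* 0x80000000 >> 30 == 2, so this prints node 1 */
		printf("node of 0x80000000 = %d\n", phys_to_nid(0x80000000UL));
		return 0;
	}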

Signed-off-by: Amul Shah <amul.shah@unisys.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Cc: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---
Updated patch to fix a bug that Andi Kleen found for platforms that
don't support NUMA (or "numa=off").
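
As background: with this patch the hash shift is no longer found by trial
population; it is the lowest set bit across all node start/end addresses
(extract_lsb_from_nodes() in the numa.c hunk), and the map size follows from
the highest end address.  A standalone sketch of that computation, using
made-up 1GB-aligned node boundaries and __builtin_ctzl() in place of the
kernel's find_first_bit():

	#include <stdio.h>

	struct bootnode { unsigned long start, end; };

	int main(void)
	{
		/* hypothetical layout: two 1GB nodes */
		struct bootnode nodes[] = {
			{ 0x00000000UL, 0x40000000UL },	/* node 0: 0 - 1GB   */
			{ 0x40000000UL, 0x80000000UL },	/* node 1: 1GB - 2GB */
		};
		unsigned long bitfield = 0, memtop = 0;
		int i, shift;

		for (i = 0; i < 2; i++) {
			if (nodes[i].start >= nodes[i].end)
				continue;
			bitfield |= nodes[i].start | nodes[i].end;
			if (nodes[i].end > memtop)
				memtop = nodes[i].end;
		}
		shift = __builtin_ctzl(bitfield);	/* lowest set bit, as find_first_bit() returns */
		/* prints "shift=30 mapsize=3": only a few map bytes are needed here */
		printf("shift=%d mapsize=%lu\n", shift, (memtop >> shift) + 1);
		return 0;
	}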

---
 arch/x86_64/kernel/e820.c   |    7 ++++
 arch/x86_64/kernel/setup.c  |    5 ++
 arch/x86_64/mm/numa.c       |   74 ++++++++++++++++++++++++++++++++++++++------
 include/asm-x86_64/e820.h   |    1 
 include/asm-x86_64/mmzone.h |   13 ++++---
 5 files changed, 85 insertions(+), 15 deletions(-)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -83,6 +83,13 @@ static inline int bad_addr(unsigned long
 		return 1;
 	}
 
+#ifdef CONFIG_NUMA
+	/* NUMA memory to node map */
+	if (last >= nodemap_addr && addr < nodemap_addr + nodemap_size) {
+		*addrp = nodemap_addr + nodemap_size;
+		return 1;
+	}
+#endif
 	/* XXX ramdisk image here? */ 
 	return 0;
 } 
Index: linux/arch/x86_64/kernel/setup.c
===================================================================
--- linux.orig/arch/x86_64/kernel/setup.c
+++ linux/arch/x86_64/kernel/setup.c
@@ -444,6 +444,11 @@ void __init setup_arch(char **cmdline_p)
 	/* reserve ebda region */
 	if (ebda_addr)
 		reserve_bootmem_generic(ebda_addr, ebda_size);
+#ifdef CONFIG_NUMA
+	/* reserve nodemap region */
+	if (nodemap_addr)
+		reserve_bootmem_generic(nodemap_addr, nodemap_size);
+#endif
 
 #ifdef CONFIG_SMP
 	/*
Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -36,6 +36,8 @@ unsigned char apicid_to_node[MAX_LOCAL_A
 cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
 
 int numa_off __initdata;
+unsigned long __initdata nodemap_addr;
+unsigned long __initdata nodemap_size;
 
 
 /*
@@ -52,34 +54,87 @@ populate_memnodemap(const struct bootnod
 	int res = -1;
 	unsigned long addr, end;
 
-	if (shift >= 64)
-		return -1;
-	memset(memnodemap, 0xff, sizeof(memnodemap));
+	memset(memnodemap, 0xff, memnodemapsize);
 	for (i = 0; i < numnodes; i++) {
 		addr = nodes[i].start;
 		end = nodes[i].end;
 		if (addr >= end)
 			continue;
-		if ((end >> shift) >= NODEMAPSIZE)
+		if ((end >> shift) >= memnodemapsize)
 			return 0;
 		do {
 			if (memnodemap[addr >> shift] != 0xff)
 				return -1;
 			memnodemap[addr >> shift] = i;
-                       addr += (1UL << shift);
+			addr += (1UL << shift);
 		} while (addr < end);
 		res = 1;
 	} 
 	return res;
 }
 
-int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+static int __init allocate_cachealigned_memnodemap(void)
+{
+	unsigned long pad, pad_addr;
+
+	memnodemap = memnode.embedded_map;
+	if (memnodemapsize <= 48) {
+		printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+		       nodemap_addr, nodemap_addr + nodemap_size);
+		return 0;
+	}
+
+	pad = L1_CACHE_BYTES - 1;
+	pad_addr = 0x8000;
+	nodemap_size = pad + memnodemapsize;
+	nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
+				      nodemap_size);
+	if (nodemap_addr == -1UL) {
+		printk(KERN_ERR
+		       "NUMA: Unable to allocate Memory to Node hash map\n");
+		nodemap_addr = nodemap_size = 0;
+		return -1;
+	}
+	pad_addr = (nodemap_addr + pad) & ~pad;
+	memnodemap = phys_to_virt(pad_addr);
+
+	printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+	       nodemap_addr, nodemap_addr + nodemap_size);
+	return 0;
+}
+
+/*
+ * The LSB of all start and end addresses in the node map is the value of the
+ * maximum possible shift.
+ */
+static int __init
+extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
 {
-	int shift = 20;
+	int i;
+	unsigned long start, end;
+	unsigned long bitfield = 0, memtop = 0;
 
-	while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
-		shift++;
+	for (i = 0; i < numnodes; i++) {
+		start = nodes[i].start;
+		end = nodes[i].end;
+		if (start >= end)
+			continue;
+		bitfield |= start | end;
+		if (end > memtop)
+			memtop = end;
+	}
+	i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+	memnodemapsize = (memtop >> i)+1;
+	return i;
+}
+
+int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
+{
+	int shift;
 
+	shift = extract_lsb_from_nodes(nodes, numnodes);
+	if (allocate_cachealigned_memnodemap())
+		return -1;
 	printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
 		shift);
 
@@ -290,6 +345,7 @@ void __init numa_initmem_init(unsigned l
 	       end_pfn << PAGE_SHIFT); 
 		/* setup dummy node covering all memory */ 
 	memnode_shift = 63; 
+	memnodemap = memnode.embedded_map;
 	memnodemap[0] = 0;
 	nodes_clear(node_online_map);
 	node_set_online(0);
Index: linux/include/asm-x86_64/e820.h
===================================================================
--- linux.orig/include/asm-x86_64/e820.h
+++ linux/include/asm-x86_64/e820.h
@@ -56,6 +56,7 @@ extern void finish_e820_parsing(void);
 extern struct e820map e820;
 
 extern unsigned ebda_addr, ebda_size;
+extern unsigned long nodemap_addr, nodemap_size;
 #endif/*!__ASSEMBLY__*/
 
 #endif/*__E820_HEADER*/
Index: linux/include/asm-x86_64/mmzone.h
===================================================================
--- linux.orig/include/asm-x86_64/mmzone.h
+++ linux/include/asm-x86_64/mmzone.h
@@ -11,24 +11,25 @@
 
 #include <asm/smp.h>
 
-/* Should really switch to dynamic allocation at some point */
-#define NODEMAPSIZE 0x4fff
-
 /* Simple perfect hash to map physical addresses to node numbers */
 struct memnode {
 	int shift;
-	u8 map[NODEMAPSIZE];
-} ____cacheline_aligned;
+	unsigned int mapsize;
+	u8 *map;
+	u8 embedded_map[64-16];
+} ____cacheline_aligned; /* total size = 64 bytes */
 extern struct memnode memnode;
 #define memnode_shift memnode.shift
 #define memnodemap memnode.map
+#define memnodemapsize memnode.mapsize
 
 extern struct pglist_data *node_data[];
 
 static inline __attribute__((pure)) int phys_to_nid(unsigned long addr) 
 { 
 	unsigned nid; 
-	VIRTUAL_BUG_ON((addr >> memnode_shift) >= NODEMAPSIZE);
+	VIRTUAL_BUG_ON(!memnodemap);
+	VIRTUAL_BUG_ON((addr >> memnode_shift) >= memnodemapsize);
 	nid = memnodemap[addr >> memnode_shift]; 
 	VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]); 
 	return nid; 
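
A final note on the cache alignment mentioned above:
allocate_cachealigned_memnodemap() over-allocates by pad = L1_CACHE_BYTES - 1
bytes and then rounds the start up with (addr + pad) & ~pad.  A tiny
standalone illustration of that round-up, assuming 64-byte cache lines:

	#include <stdio.h>

	int main(void)
	{
		unsigned long pad = 64 - 1;	/* L1_CACHE_BYTES - 1, assuming 64-byte lines */
		unsigned long addr = 0x12345UL;	/* arbitrary unaligned start address */
		unsigned long aligned = (addr + pad) & ~pad;

		/* prints "0x12345 -> 0x12380": rounded up to the next 64-byte boundary */
		printf("%#lx -> %#lx\n", addr, aligned);
		return 0;
	}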
