LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Andi Kleen <ak@suse.de>
To: Rohit Seth <rohitseth@google.com>, Andi Kleen <ak@suse.de>,
	patches@x86-64.org, linux-kernel@vger.kernel.org
Subject: [PATCH x86 for review II] [7/39] x86_64: Fix fake numa for x86_64 machines with big IO hole
Date: Mon, 12 Feb 2007 08:37:53 +0100 (CET)	[thread overview]
Message-ID: <20070212073753.6A8B013D7F@wotan.suse.de> (raw)
In-Reply-To: <20070212837.963446000@suse.de>


From: Rohit Seth <rohitseth@google.com>

This patch fixes booting with numa=fake=X on the kernel command line on
x86_64 machines that have a big IO hole.  When calculating the size of each
node we now take the total hole size in that range into account.

Previously some nodes contained nothing but IO holes, which caused kernel
boot problems.  We now use FAKE_NODE_MIN_SIZE (64MB) as the minimum amount
of memory that any node must have.  We reduce the number of allocated nodes
if the number of nodes specified on the kernel command line would result in
any node getting less memory than FAKE_NODE_MIN_SIZE.

This change allows the extra memory to be handed out in FAKE_NODE_MIN_SIZE
granules and distributed uniformly among as many nodes (called big nodes) as
possible.

[akpm@osdl.org: build fix]
Signed-off-by: David Rientjes <reintjes@google.com>
Signed-off-by: Paul Menage <menage@google.com>
Signed-off-by: Rohit Seth <rohitseth@google.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Andi Kleen <ak@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 arch/x86_64/kernel/e820.c   |   31 ++++++++++++
 arch/x86_64/mm/numa.c       |  110 ++++++++++++++++++++++++++++++++++++++------
 include/asm-x86_64/e820.h   |    1 
 include/asm-x86_64/mmzone.h |    5 ++
 4 files changed, 133 insertions(+), 14 deletions(-)

Index: linux/arch/x86_64/kernel/e820.c
===================================================================
--- linux.orig/arch/x86_64/kernel/e820.c
+++ linux/arch/x86_64/kernel/e820.c
@@ -191,6 +191,37 @@ unsigned long __init e820_end_of_ram(voi
 }
 
 /*
+ * Find the hole size in the range.
+ */
+unsigned long __init e820_hole_size(unsigned long start, unsigned long end)
+{
+	unsigned long ram = 0;
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+		unsigned long last, addr;
+
+		if (ei->type != E820_RAM ||
+		    ei->addr+ei->size <= start ||
+		    ei->addr >= end)
+			continue;
+
+		addr = round_up(ei->addr, PAGE_SIZE);
+		if (addr < start)
+			addr = start;
+
+		last = round_down(ei->addr + ei->size, PAGE_SIZE);
+		if (last >= end)
+			last = end;
+
+		if (last > addr)
+			ram += last - addr;
+	}
+	return ((end - start) - ram);
+}
+
+/*
  * Mark e820 reserved areas as busy for the resource manager.
  */
 void __init e820_reserve_resources(void)
Index: linux/arch/x86_64/mm/numa.c
===================================================================
--- linux.orig/arch/x86_64/mm/numa.c
+++ linux/arch/x86_64/mm/numa.c
@@ -272,31 +272,113 @@ void __init numa_init_array(void)
 }
 
 #ifdef CONFIG_NUMA_EMU
+/* Numa emulation */
 int numa_fake __initdata = 0;
 
-/* Numa emulation */
+/*
+ * This function is used to find out if the start and end correspond to
+ * different zones.
+ */
+int zone_cross_over(unsigned long start, unsigned long end)
+{
+	if ((start < (MAX_DMA32_PFN << PAGE_SHIFT)) &&
+			(end >= (MAX_DMA32_PFN << PAGE_SHIFT)))
+		return 1;
+	return 0;
+}
+
 static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
 {
- 	int i;
+ 	int i, big;
  	struct bootnode nodes[MAX_NUMNODES];
- 	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
+ 	unsigned long sz, old_sz;
+	unsigned long hole_size;
+	unsigned long start, end;
+	unsigned long max_addr = (end_pfn << PAGE_SHIFT);
+
+	start = (start_pfn << PAGE_SHIFT);
+	hole_size = e820_hole_size(start, max_addr);
+	sz = (max_addr - start - hole_size) / numa_fake;
 
  	/* Kludge needed for the hash function */
- 	if (hweight64(sz) > 1) {
- 		unsigned long x = 1;
- 		while ((x << 1) < sz)
- 			x <<= 1;
- 		if (x < sz/2)
- 			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
- 		sz = x;
- 	}
 
+	old_sz = sz;
+	/*
+	 * Round down to the nearest FAKE_NODE_MIN_SIZE.
+	 */
+	sz &= FAKE_NODE_MIN_HASH_MASK;
+
+	/*
+	 * We ensure that each node is at least 64MB big.  Smaller than this
+	 * size can cause VM hiccups.
+	 */
+	if (sz == 0) {
+		printk(KERN_INFO "Not enough memory for %d nodes.  Reducing "
+				"the number of nodes\n", numa_fake);
+		numa_fake = (max_addr - start - hole_size) / FAKE_NODE_MIN_SIZE;
+		printk(KERN_INFO "Number of fake nodes will be = %d\n",
+				numa_fake);
+		sz = FAKE_NODE_MIN_SIZE;
+	}
+	/*
+	 * Find out how many nodes can get an extra FAKE_NODE_MIN_SIZE granule.
+	 * This logic ensures the extra memory gets distributed among as many
+	 * nodes as possible (as compared to one single node getting all that
+	 * extra memory).
+	 */
+	big = ((old_sz - sz) * numa_fake) / FAKE_NODE_MIN_SIZE;
+	printk(KERN_INFO "Fake node Size: %luMB hole_size: %luMB big nodes: "
+			"%d\n",
+			(sz >> 20), (hole_size >> 20), big);
  	memset(&nodes,0,sizeof(nodes));
+	end = start;
  	for (i = 0; i < numa_fake; i++) {
- 		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
+		/*
+		 * In case we are not able to allocate enough memory for all
+		 * the nodes, we reduce the number of fake nodes.
+		 */
+		if (end >= max_addr) {
+			numa_fake = i - 1;
+			break;
+		}
+ 		start = nodes[i].start = end;
+		/*
+		 * Final node can have all the remaining memory.
+		 */
  		if (i == numa_fake-1)
- 			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
- 		nodes[i].end = nodes[i].start + sz;
+ 			sz = max_addr - start;
+ 		end = nodes[i].start + sz;
+		/*
+		 * The first "big" nodes each get an extra granule.
+		 */
+		if (i < big)
+			end += FAKE_NODE_MIN_SIZE;
+		/*
+		 * Iterate over the range to ensure that this node gets at
+		 * least sz amount of RAM (excluding holes)
+		 */
+		while ((end - start - e820_hole_size(start, end)) < sz) {
+			end += FAKE_NODE_MIN_SIZE;
+			if (end >= max_addr)
+				break;
+		}
+		/*
+		 * Look at the next node to make sure there is some real memory
+		 * to map.  Bad things happen when the only memory present
+		 * in a zone on a fake node is IO hole.
+		 */
+		while (e820_hole_size(end, end + FAKE_NODE_MIN_SIZE) > 0) {
+			if (zone_cross_over(start, end + sz)) {
+				end = (MAX_DMA32_PFN << PAGE_SHIFT);
+				break;
+			}
+			if (end >= max_addr)
+				break;
+			end += FAKE_NODE_MIN_SIZE;
+		}
+		if (end > max_addr)
+			end = max_addr;
+		nodes[i].end = end;
  		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
  		       i,
  		       nodes[i].start, nodes[i].end,
Index: linux/include/asm-x86_64/e820.h
===================================================================
--- linux.orig/include/asm-x86_64/e820.h
+++ linux/include/asm-x86_64/e820.h
@@ -46,6 +46,7 @@ extern void e820_mark_nosave_regions(voi
 extern void e820_print_map(char *who);
 extern int e820_any_mapped(unsigned long start, unsigned long end, unsigned type);
 extern int e820_all_mapped(unsigned long start, unsigned long end, unsigned type);
+extern unsigned long e820_hole_size(unsigned long start, unsigned long end);
 
 extern void e820_setup_gap(void);
 extern void e820_register_active_regions(int nid,
Index: linux/include/asm-x86_64/mmzone.h
===================================================================
--- linux.orig/include/asm-x86_64/mmzone.h
+++ linux/include/asm-x86_64/mmzone.h
@@ -47,5 +47,10 @@ static inline __attribute__((pure)) int 
 extern int pfn_valid(unsigned long pfn);
 #endif
 
+#ifdef CONFIG_NUMA_EMU
+#define FAKE_NODE_MIN_SIZE	(64*1024*1024)
+#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1ul))
+#endif
+
 #endif
 #endif

  parent reply	other threads:[~2007-02-12  7:47 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2007-02-12  7:37 [PATCH x86 for review II] [1/39] i386: move startup_32() in text.head section Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [2/39] x86_64: Break init() in two parts to avoid MODPOST warnings Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [3/39] i386: arch/i386/kernel/cpu/mcheck/mce.c should #include <asm/mce.h> Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [4/39] i386: add idle notifier Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [5/39] i386: improve sched_clock() on i686 Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [6/39] i386: romsignature/checksum cleanup Andi Kleen
2007-02-12  7:37 ` Andi Kleen [this message]
2007-02-12  7:37 ` [PATCH x86 for review II] [8/39] x86_64: Remove fastcall references in x86_64 code Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [9/39] x86_64: Use constant instead of raw number in x86_64 ioperm.c Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [10/39] x86_64: Handle 32 bit PerfMon Counter writes cleanly in x86_64 nmi_watchdog Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [11/39] i386: Handle 32 bit PerfMon Counter writes cleanly in i386 nmi_watchdog Andi Kleen
2007-02-12  7:37 ` [PATCH x86 for review II] [12/39] i386: Handle 32 bit PerfMon Counter writes cleanly in oprofile Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [13/39] i386: CONFIG_PHYSICAL_ALIGN limited to 4M? Andi Kleen
2007-02-13  6:36   ` Rene Herman
2007-02-12  7:38 ` [PATCH x86 for review II] [14/39] x86_64: cleanup Doc/x86_64/ files Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [15/39] x86_64: list x86_64 quilt tree Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [16/39] x86: simplify notify_page_fault() Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [17/39] x86_64: Tighten mce_amd driver MSR reads Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [18/39] x86_64: Allow to run a program when a machine check event is detected Andi Kleen
2007-02-12  7:54   ` Oliver Neukum
2007-02-12  8:04     ` Andi Kleen
2007-02-12  8:11       ` Bauke Jan Douma
2007-02-12 15:05       ` [patches] " Pavel Machek
2007-02-12  7:38 ` [PATCH x86 for review II] [19/39] x86_64: remove get_pmd() Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [20/39] i386: Small cleanup to TLB flush code Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [21/39] i386: rdmsr_on_cpu, wrmsr_on_cpu Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [22/39] x86_64: Kconfig typos Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [23/39] i386: use smp_call_function_single() Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [24/39] " Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [25/39] x86_64: Fix preprocessor condition Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [26/39] i386: fix 32-bit ioctls on x64_32 Andi Kleen
2007-02-12 13:24   ` Giuliano Procida
2007-02-12 22:28     ` Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [27/39] i386: APM on i386 Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [28/39] i386: fix size_or_mask and size_and_mask Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [29/39] x86_64: - Ignore long SMI interrupts in clock calibration code - update 1 Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [30/39] x86_64: Check return value of putreg in PTRACE_SETREGS Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [31/39] x86_64: Unexport __supported_pte_mask Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [32/39] x86_64: x86_64 - Fix FS/GS registers for VT execution Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [33/39] x86_64: Fix off by one error in IOMMU boundary checking Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [34/39] i386: Use stack arguments for calling into EFI Andi Kleen
2007-02-12 19:45   ` Frédéric RISS
2007-02-12  7:38 ` [PATCH x86 for review II] [35/39] x86_64: Don't reserve ROMs Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [36/39] x86_64: define dma noncoherent API functions Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [37/39] x86_64: robustify bad_dma_address handling Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [38/39] x86: fix laptop bootup hang in init_acpi() Andi Kleen
2007-02-12  7:38 ` [PATCH x86 for review II] [39/39] i386: All Transmeta CPUs have constant TSCs Andi Kleen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20070212073753.6A8B013D7F@wotan.suse.de \
    --to=ak@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=patches@x86-64.org \
    --cc=rohitseth@google.com \
    --subject='Re: [PATCH x86 for review II] [7/39] x86_64: Fix fake numa for x86_64 machines with big IO hole' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).