From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S965406AbXAaPTH (ORCPT ); Wed, 31 Jan 2007 10:19:07 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S965273AbXAaPSp (ORCPT ); Wed, 31 Jan 2007 10:18:45 -0500 Received: from smtp-out.google.com ([216.239.45.13]:56133 "EHLO smtp-out.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S965406AbXAaPSf (ORCPT ); Wed, 31 Jan 2007 10:18:35 -0500 DomainKey-Signature: a=rsa-sha1; s=beta; d=google.com; c=nofws; q=dns; h=received:date:from:x-x-sender:to:cc:subject:in-reply-to: message-id:references:mime-version:content-type; b=T4XezYi1/gYGaotbKnmE4F5WCCBDFkZ+Fh8jj+7JjWQQLbuC8+NR8qU4OQY6W71cE 3JNSxS6C79a1Yd+Ps+W7g== Date: Wed, 31 Jan 2007 07:18:19 -0800 (PST) From: David Rientjes X-X-Sender: rientjes@chino.kir.corp.google.com To: Andrew Morton cc: Andi Kleen , Rohit Seth , linux-kernel@vger.kernel.org Subject: [patch -mm 5/7] x86_64: map fake nodes to real nodes In-Reply-To: Message-ID: References: MIME-Version: 1.0 Content-Type: TEXT/PLAIN; charset=US-ASCII Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org Exports the struct bootnode array globally so that the physical mapping can be saved when NUMA emulation is used. This is then copied and stored for later reference so that there exists a mapping between fake nodes and the real nodes they reside on through the get_phys_node() function. physical_node_map is a new struct bootnode array that is used to save the physical mapping in the emulation case. There is no effect when CONFIG_NUMA_EMU is disabled or numa=fake=off. The emulation case is handled after K8 and ACPI so that the physical mapping can be saved later. __node_distance() is modified to use the physical node that corresponds to the fake node for measurement. 
Cc: Andi Kleen Signed-off-by: Rohit Seth Signed-off-by: David Rientjes --- arch/x86_64/mm/k8topology.c | 23 +++++--- arch/x86_64/mm/numa.c | 113 +++++++++++++++++++++++++++-------------- arch/x86_64/mm/srat.c | 9 +++- include/asm-x86_64/numa.h | 4 +- include/asm-x86_64/proto.h | 2 +- include/asm-x86_64/topology.h | 1 + 6 files changed, 100 insertions(+), 52 deletions(-) diff --git a/arch/x86_64/mm/k8topology.c b/arch/x86_64/mm/k8topology.c index b5b8dba..bcad062 100644 --- a/arch/x86_64/mm/k8topology.c +++ b/arch/x86_64/mm/k8topology.c @@ -40,10 +40,9 @@ static __init int find_northbridge(void) return -1; } -int __init k8_scan_nodes(unsigned long start, unsigned long end) +int __init k8_scan_nodes(unsigned long start, unsigned long end, int fake) { unsigned long prevbase; - struct bootnode nodes[8]; int nodeid, i, nb; unsigned char nodeids[8]; int found = 0; @@ -161,19 +160,25 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end) if (!found) return -1; - memnode_shift = compute_hash_shift(nodes, 8); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + if (!fake) { + memnode_shift = compute_hash_shift(8); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. 
" + "Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", + memnode_shift); + } for (i = 0; i < 8; i++) { if (nodes[i].start != nodes[i].end) { nodeid = nodeids[i]; apicid_to_node[nodeid << dualcore] = i; apicid_to_node[(nodeid << dualcore) + dualcore] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); + if (!fake) + setup_node_bootmem(i, nodes[i].start, + nodes[i].end); } } diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 015271a..97b7be4 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -34,6 +34,7 @@ unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE }; cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly; +struct bootnode nodes[MAX_NUMNODES] __read_mostly; int numa_off __initdata; unsigned long __initdata nodemap_addr; @@ -47,8 +48,7 @@ unsigned long __initdata nodemap_size; * 0 if memnodmap[] too small (of shift too small) * -1 if node overlap or lost ram (shift too big) */ -static int __init -populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift) +static int __init populate_memnodemap(int numnodes, int shift) { int i; int res = -1; @@ -104,8 +104,7 @@ static int __init allocate_cachealigned_memnodemap(void) * The LSB of all start and end addresses in the node map is the value of the * maximum possible shift. 
*/ -static int __init -extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) +static int __init extract_lsb_from_nodes(int numnodes) { int i, nodes_used = 0; unsigned long start, end; @@ -129,17 +128,17 @@ extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes) return i; } -int __init compute_hash_shift(struct bootnode *nodes, int numnodes) +int __init compute_hash_shift(int numnodes) { int shift; - shift = extract_lsb_from_nodes(nodes, numnodes); + shift = extract_lsb_from_nodes(numnodes); if (allocate_cachealigned_memnodemap()) return -1; printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift); - if (populate_memnodemap(nodes, numnodes, shift) != 1) { + if (populate_memnodemap(numnodes, shift) != 1) { printk(KERN_INFO "Your memory is not aligned you need to rebuild your kernel " "with a bigger NODEMAPSIZE shift=%d\n", @@ -279,7 +278,37 @@ void __init numa_init_array(void) #define E820_ADDR_HOLE_SIZE(start, end) \ (e820_hole_size((start) >> PAGE_SHIFT, (end) >> PAGE_SHIFT) << \ PAGE_SHIFT) + +static struct bootnode physical_node_map[MAX_NUMNODES]; char *cmdline __initdata; +int numa_emu; + +/* + * Returns the physical NUMA node that fake node nid resides on. If NUMA + * emulation is disabled, then this is the same as nid. + */ +int get_phys_node(int nid) +{ + pg_data_t *pgdat; + u64 node_start_addr; + unsigned int i; + int ret = 0; + + if (!numa_emu) + return nid; + + pgdat = NODE_DATA(nid); + node_start_addr = pgdat->node_start_pfn << PAGE_SHIFT; + + for (i = 0; i < MAX_NUMNODES; i++) + if (node_start_addr >= physical_node_map[i].start && + node_start_addr < physical_node_map[i].end) { + ret = i; + break; + } + + return ret; +} /* * Setups up nid to range from addr to addr + size. If the end boundary is @@ -287,8 +316,7 @@ char *cmdline __initdata; * if there is additional memory left for allocation past addr and -1 otherwise. * addr is adjusted to be at the end of the node. 
*/ -static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, - u64 size, u64 max_addr) +static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr) { int ret = 0; nodes[nid].start = *addr; @@ -310,8 +338,7 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, * is the number of nodes split up and addr is adjusted to be at the end of the * last node allocated. */ -static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, +static int __init split_nodes_equally(u64 *addr, u64 max_addr, int node_start, int num_nodes) { unsigned int big; @@ -358,7 +385,7 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, break; } } - if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) + if (setup_node_range(i, addr, end - *addr, max_addr) < 0) break; } return i - node_start + 1; @@ -369,12 +396,12 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, * always assigned to a final node and can be asymmetric. Returns the number of * nodes split. 
*/ -static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, - u64 max_addr, int node_start, u64 size) +static int __init split_nodes_by_size(u64 *addr, u64 max_addr, int node_start, + u64 size) { int i = node_start; size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; - while (!setup_node_range(i++, nodes, addr, size, max_addr)) + while (!setup_node_range(i++, addr, size, max_addr)) ; return i - node_start; } @@ -385,7 +412,6 @@ static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, */ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) { - struct bootnode nodes[MAX_NUMNODES]; u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; int num_nodes = 0; @@ -395,13 +421,18 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) u64 size; int i; + /* + * Map the existing real NUMA toplogy to physical_node_map before the + * information is cleared. + */ + memcpy(physical_node_map, nodes, sizeof(nodes)); memset(&nodes, 0, sizeof(nodes)); /* * If the numa=fake command-line is just a single number N, split the * system RAM into N fake nodes. 
*/ if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { - num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, + num_nodes = split_nodes_equally(&addr, max_addr, 0, simple_strtol(cmdline, NULL, 0)); if (num_nodes < 0) return num_nodes; @@ -429,8 +460,8 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; if (size) for (i = 0; i < coeff; i++, num_nodes++) - if (setup_node_range(num_nodes, nodes, - &addr, size, max_addr) < 0) + if (setup_node_range(num_nodes, &addr, + size, max_addr) < 0) goto done; if (!*cmdline) break; @@ -446,7 +477,7 @@ done: if (addr < max_addr) { if (coeff_flag && coeff < 0) { /* Split remaining nodes into num-sized chunks */ - num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes += split_nodes_by_size(&addr, max_addr, num_nodes, num); goto out; } @@ -455,7 +486,7 @@ done: /* Split remaining nodes into coeff chunks */ if (coeff <= 0) break; - num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes += split_nodes_equally(&addr, max_addr, num_nodes, coeff); break; case ',': @@ -463,13 +494,13 @@ done: break; default: /* Give one final node */ - setup_node_range(num_nodes, nodes, &addr, - max_addr - addr, max_addr); + setup_node_range(num_nodes, &addr, max_addr - addr, + max_addr); num_nodes++; } } out: - memnode_shift = compute_hash_shift(nodes, num_nodes); + memnode_shift = compute_hash_shift(num_nodes); if (memnode_shift < 0) { memnode_shift = 0; printk(KERN_ERR "No NUMA hash function found. 
NUMA emulation " @@ -489,30 +520,36 @@ out: void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn) { + unsigned long start_addr = start_pfn << PAGE_SHIFT; + unsigned long end_addr = end_pfn << PAGE_SHIFT; int i; -#ifdef CONFIG_NUMA_EMU - if (cmdline && !numa_emulation(start_pfn, end_pfn)) - return; -#endif - #ifdef CONFIG_ACPI_NUMA - if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, - end_pfn << PAGE_SHIFT)) + if (!numa_off && !cmdline && !acpi_scan_nodes(start_addr, end_addr)) return; #endif #ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<localities * node_to_pxm(a); diff --git a/include/asm-x86_64/numa.h b/include/asm-x86_64/numa.h index 933ff11..111b72c 100644 --- a/include/asm-x86_64/numa.h +++ b/include/asm-x86_64/numa.h @@ -6,8 +6,8 @@ struct bootnode { u64 start,end; }; - -extern int compute_hash_shift(struct bootnode *nodes, int numnodes); +extern struct bootnode nodes[MAX_NUMNODES]; +extern int compute_hash_shift(int numnodes); #define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 4760b5b..6411fd9 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -51,7 +51,7 @@ extern void early_printk(const char *fmt, ...) __attribute__((format(printf,1,2) extern void early_identify_cpu(struct cpuinfo_x86 *c); -extern int k8_scan_nodes(unsigned long start, unsigned long end); +extern int k8_scan_nodes(unsigned long start, unsigned long end, int fake); extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn); extern unsigned long numa_free_all_bootmem(void); diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h index 440b0ca..8270b0d 100644 --- a/include/asm-x86_64/topology.h +++ b/include/asm-x86_64/topology.h @@ -68,5 +68,6 @@ extern int __node_distance(int, int); #include extern cpumask_t cpu_coregroup_map(int cpu); +extern int get_phys_node(int nid); #endif