LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 2/2] x86_64: make early_node_mem return align address
[not found] <200801290053.45776.yinghai.lu@sun.com>
@ 2008-01-29 9:05 ` Yinghai Lu
2008-01-29 9:33 ` Andi Kleen
2008-01-29 18:08 ` Yinghai Lu
2008-01-29 9:05 ` [PATCH 1/2] print out node_data addr and bootmap_start addr Yinghai Lu
1 sibling, 2 replies; 10+ messages in thread
From: Yinghai Lu @ 2008-01-29 9:05 UTC (permalink / raw)
To: Ingo Molnar, Christoph Lameter; +Cc: Andrew Morton, Andi Kleen, linux-kernel
[PATCH 2/2] x86_64: make early_node_mem return align address
Boot oops when the system has 64G or 128G of memory installed:
Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711()
BUG: unable to handle kernel NULL pointer dereference at 000000000000005f
IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f
PGD 0
Oops: 0000 [1] SMP
CPU 0
Modules linked in:
Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6
RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f
RSP: 0000:ffff810824c57e60 EFLAGS: 00010246
RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08
RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460
RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80
R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c
R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee
FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000
CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000)
Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000
00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff
0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5
Call Trace:
[<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a
[<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34
[<ffffffff80bc34a5>] ? sctp_init+0xef/0x711
[<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1
[<ffffffff8020ccf8>] ? child_rip+0xa/0x12
[<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1
[<ffffffff8020ccee>] ? child_rip+0x0/0x12
Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0
RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f
RSP <ffff810824c57e60>
CR2: 000000000000005f
---[ end trace 02c2d78def82877a ]---
Kernel panic - not syncing: Attempted to kill init!
It turns out that some variables near the end of bss are already corrupted.
In System.map we have:
ffffffff80d40420 b rsi_table
ffffffff80d40620 B krb5_seq_lock
ffffffff80d40628 b i.20437
ffffffff80d40630 b xprt_rdma_inline_write_padding
ffffffff80d40638 b sunrpc_table_header
ffffffff80d40640 b zero
ffffffff80d40644 b min_memreg
ffffffff80d40648 b rpcrdma_tk_lock_g
ffffffff80d40650 B sctp_assocs_id_lock
ffffffff80d40658 B proc_net_sctp
ffffffff80d40660 B sctp_assocs_id
ffffffff80d40680 B sysctl_sctp_mem
ffffffff80d40690 B sysctl_sctp_rmem
ffffffff80d406a0 B sysctl_sctp_wmem
ffffffff80d406b0 b sctp_ctl_socket
ffffffff80d406b8 b sctp_pf_inet6_specific
ffffffff80d406c0 b sctp_pf_inet_specific
ffffffff80d406c8 b sctp_af_v4_specific
ffffffff80d406d0 b sctp_af_v6_specific
ffffffff80d406d8 b sctp_rand.33270
ffffffff80d406dc b sctp_memory_pressure
ffffffff80d406e0 b sctp_sockets_allocated
ffffffff80d406e4 b sctp_memory_allocated
ffffffff80d406e8 b sctp_sysctl_header
ffffffff80d406f0 b zero
ffffffff80d406f4 A __bss_stop
ffffffff80d406f4 A _end
and setup_node_bootmem() will use that same page, 0xd40000, for the bootmap:
Bootmem setup node 0 0000000000000000-0000000828000000
NODE_DATA [000000000008a485 - 0000000000091484]
bootmap [0000000000d406f4 - 0000000000e456f3] pages 105
Bootmem setup node 1 0000000828000000-0000001028000000
NODE_DATA [0000000828000000 - 0000000828006fff]
bootmap [0000000828007000 - 0000000828106fff] pages 100
Bootmem setup node 2 0000001028000000-0000001828000000
NODE_DATA [0000001028000000 - 0000001028006fff]
bootmap [0000001028007000 - 0000001028106fff] pages 100
Bootmem setup node 3 0000001828000000-0000002028000000
NODE_DATA [0000001828000000 - 0000001828006fff]
bootmap [0000001828007000 - 0000001828106fff] pages 100
Actually, setup_node_bootmem() wants NODE_DATA to be aligned to ZONE_ALIGN (1<<(11+12)),
with the bootmap placed after it at a page-aligned address.
This patch updates early_node_mem() and find_e820_area() to take an alignment parameter,
so that the range they find includes the extra room needed for alignment.
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Index: linux-2.6/arch/x86/kernel/e820_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/e820_64.c
+++ linux-2.6/arch/x86/kernel/e820_64.c
@@ -169,9 +169,10 @@ int __init e820_all_mapped(unsigned long
* Find a free area in a specific range.
*/
unsigned long __init find_e820_area(unsigned long start, unsigned long end,
- unsigned size)
+ unsigned size, unsigned long align)
{
int i;
+ unsigned long mask = ~(align - 1);
for (i = 0; i < e820.nr_map; i++) {
struct e820entry *ei = &e820.map[i];
@@ -185,7 +186,7 @@ unsigned long __init find_e820_area(unsi
continue;
while (bad_addr(&addr, size) && addr+size <= ei->addr+ei->size)
;
- last = PAGE_ALIGN(addr) + size;
+ last = ((addr + align - 1) & mask) + size;
if (last > ei->addr + ei->size)
continue;
if (last > end)
Index: linux-2.6/arch/x86/kernel/setup_64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup_64.c
+++ linux-2.6/arch/x86/kernel/setup_64.c
@@ -182,7 +182,8 @@ contig_initmem_init(unsigned long start_
unsigned long bootmap_size, bootmap;
bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
- bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size);
+ bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
+ PAGE_SIZE);
if (bootmap == -1L)
panic("Cannot find bootmem map of size %ld\n", bootmap_size);
bootmap_size = init_bootmem(bootmap >> PAGE_SHIFT, end_pfn);
Index: linux-2.6/arch/x86/mm/init_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/init_64.c
+++ linux-2.6/arch/x86/mm/init_64.c
@@ -354,7 +354,7 @@ static void __init find_early_table_spac
* need roughly 0.5KB per GB.
*/
start = 0x8000;
- table_start = find_e820_area(start, end, tables);
+ table_start = find_e820_area(start, end, tables, PAGE_SIZE);
if (table_start == -1UL)
panic("Cannot find space for the kernel page tables");
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -94,7 +94,7 @@ static int __init allocate_cachealigned_
pad_addr = 0x8000;
nodemap_size = pad + memnodemapsize;
nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
- nodemap_size);
+ nodemap_size, PAGE_SIZE);
if (nodemap_addr == -1UL) {
printk(KERN_ERR
"NUMA: Unable to allocate Memory to Node hash map\n");
@@ -164,13 +164,16 @@ int early_pfn_to_nid(unsigned long pfn)
}
static void * __init early_node_mem(int nodeid, unsigned long start,
- unsigned long end, unsigned long size)
+ unsigned long end, unsigned long size,
+ unsigned long align)
{
- unsigned long mem = find_e820_area(start, end, size);
+ unsigned long mem = find_e820_area(start, end, size, align);
void *ptr;
- if (mem != -1L)
+ if (mem != -1L) {
+ mem = round_up(mem, align);
return __va(mem);
+ }
ptr = __alloc_bootmem_nopanic(size,
SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
if (ptr == NULL) {
@@ -198,7 +201,8 @@ void __init setup_node_bootmem(int nodei
start_pfn = start >> PAGE_SHIFT;
end_pfn = end >> PAGE_SHIFT;
- node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
+ node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
+ ZONE_ALIGN);
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
@@ -213,7 +217,7 @@ void __init setup_node_bootmem(int nodei
bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
bootmap = early_node_mem(nodeid, bootmap_start, end,
- bootmap_pages<<PAGE_SHIFT);
+ bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
if (bootmap == NULL) {
if (nodedata_phys < start || nodedata_phys >= end)
free_bootmem((unsigned long)node_data[nodeid],
Index: linux-2.6/include/asm-x86/e820_64.h
===================================================================
--- linux-2.6.orig/include/asm-x86/e820_64.h
+++ linux-2.6/include/asm-x86/e820_64.h
@@ -15,7 +15,7 @@
#ifndef __ASSEMBLY__
extern unsigned long find_e820_area(unsigned long start, unsigned long end,
- unsigned size);
+ unsigned size, unsigned long align);
extern void add_memory_region(unsigned long start, unsigned long size,
int type);
extern void setup_memory_region(void);
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH 1/2] print out node_data addr and bootmap_start addr
[not found] <200801290053.45776.yinghai.lu@sun.com>
2008-01-29 9:05 ` [PATCH 2/2] x86_64: make early_node_mem return align address Yinghai Lu
@ 2008-01-29 9:05 ` Yinghai Lu
[not found] ` <20080201170908.GB2159@elte.hu>
1 sibling, 1 reply; 10+ messages in thread
From: Yinghai Lu @ 2008-01-29 9:05 UTC (permalink / raw)
To: Ingo Molnar, Christoph Lameter; +Cc: Andrew Morton, Andi Kleen, linux-kernel
[PATCH 1/2] print out node_data addr and bootmap_start addr
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
Index: linux-2.6/arch/x86/mm/numa_64.c
===================================================================
--- linux-2.6.orig/arch/x86/mm/numa_64.c
+++ linux-2.6/arch/x86/mm/numa_64.c
@@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodei
if (node_data[nodeid] == NULL)
return;
nodedata_phys = __pa(node_data[nodeid]);
+ printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
+ nodedata_phys + pgdat_size - 1);
memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
@@ -221,12 +223,15 @@ void __init setup_node_bootmem(int nodei
return;
}
bootmap_start = __pa(bootmap);
- Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
bootmap_start >> PAGE_SHIFT,
start_pfn, end_pfn);
+ printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
+ bootmap_start, bootmap_start + bootmap_size - 1,
+ bootmap_pages);
+
free_bootmem_with_active_regions(nodeid, end);
reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] x86_64: make early_node_mem return align address
2008-01-29 9:05 ` [PATCH 2/2] x86_64: make early_node_mem return align address Yinghai Lu
@ 2008-01-29 9:33 ` Andi Kleen
2008-01-29 17:41 ` Yinghai Lu
2008-01-29 18:08 ` Yinghai Lu
1 sibling, 1 reply; 10+ messages in thread
From: Andi Kleen @ 2008-01-29 9:33 UTC (permalink / raw)
To: Yinghai Lu; +Cc: Ingo Molnar, Christoph Lameter, Andrew Morton, linux-kernel
On Tuesday 29 January 2008 10:05, Yinghai Lu wrote:
> [PATCH 2/2] x86_64: make early_node_mem return align address
>
> boot oops when system get 64g or 128g installed
Probably it should just use reserve_early(). Does this patch work?
The alignment change is needed at some point too, but only to
relax the alignment to not force all early allocations to be page
padded.
-Andi
---
Use early reservation for early node data
Signed-off-by: Andi Kleen <ak@suse.de>
Index: linux/arch/x86/mm/numa_64.c
===================================================================
--- linux.orig/arch/x86/mm/numa_64.c
+++ linux/arch/x86/mm/numa_64.c
@@ -169,8 +169,10 @@ static void * __init early_node_mem(int
unsigned long mem = find_e820_area(start, end, size);
void *ptr;
- if (mem != -1L)
+ if (mem != -1L) {
+ reserve_early(mem, mem + size);
return __va(mem);
+ }
ptr = __alloc_bootmem_nopanic(size,
SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
if (ptr == NULL) {
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] x86_64: make early_node_mem return align address
2008-01-29 9:33 ` Andi Kleen
@ 2008-01-29 17:41 ` Yinghai Lu
2008-01-30 2:55 ` Andi Kleen
0 siblings, 1 reply; 10+ messages in thread
From: Yinghai Lu @ 2008-01-29 17:41 UTC (permalink / raw)
To: Andi Kleen; +Cc: Ingo Molnar, Christoph Lameter, Andrew Morton, linux-kernel
On Tuesday 29 January 2008 01:33:29 am Andi Kleen wrote:
> On Tuesday 29 January 2008 10:05, Yinghai Lu wrote:
> > [PATCH 2/2] x86_64: make early_node_mem return align address
> >
> > boot oops when system get 64g or 128g installed
>
> Probably it should just use reserve_early(). Does this patch work?
>
> The alignment change is needed at some point too, but only to
> relax the alignment to not force all early allocations to be page
> padded.
No, my patch doesn't force all early allocations to be page padded.
In find_e820_area(), I just replaced PAGE_ALIGN with alignment to the new align parameter.
Only early_node_mem() is made to return aligned data, because the code seems to want that and assumes it.
I think your patch will hit an early panic about overlap between bss and bootmem,
like on the 256G machine, where bss overlapped with the early page table.
So we could change
- node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
+ node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
+ ZONE_ALIGN);
===>
- node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
+ node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
+ PAGE_SIZE);
or
- if (mem != -1L)
+ if (mem != -1L) {
+ mem = round_up(mem, PAGE_SIZE);
return __va(mem);
+ }
YH
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] x86_64: make early_node_mem return align address
2008-01-29 9:05 ` [PATCH 2/2] x86_64: make early_node_mem return align address Yinghai Lu
2008-01-29 9:33 ` Andi Kleen
@ 2008-01-29 18:08 ` Yinghai Lu
1 sibling, 0 replies; 10+ messages in thread
From: Yinghai Lu @ 2008-01-29 18:08 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Christoph Lameter, Andrew Morton, Andi Kleen, linux-kernel
On Tuesday 29 January 2008 01:05:03 am Yinghai Lu wrote:
> [PATCH 2/2] x86_64: make early_node_mem return align address
>
> boot oops when system get 64g or 128g installed
>
> Calling initcall 0xffffffff80bc33b6: sctp_init+0x0/0x711()
> BUG: unable to handle kernel NULL pointer dereference at 000000000000005f
> IP: [<ffffffff802bfe55>] proc_register+0xe7/0x10f
> PGD 0
> Oops: 0000 [1] SMP
> CPU 0
> Modules linked in:
> Pid: 1, comm: swapper Not tainted 2.6.24-smp-g5a514e21-dirty #6
> RIP: 0010:[<ffffffff802bfe55>] [<ffffffff802bfe55>] proc_register+0xe7/0x10f
> RSP: 0000:ffff810824c57e60 EFLAGS: 00010246
> RAX: 000000000000d7d7 RBX: ffff811024c5fa80 RCX: ffff810824c57e08
> RDX: 0000000000000000 RSI: 0000000000000195 RDI: ffffffff80cc2460
> RBP: ffffffffffffffff R08: 0000000000000000 R09: ffff811024c5fa80
> R10: 0000000000000000 R11: 0000000000000002 R12: ffff810824c57e6c
> R13: 0000000000000000 R14: ffff810824c57ee0 R15: 00000006abd25bee
> FS: 0000000000000000(0000) GS:ffffffff80b4d000(0000) knlGS:0000000000000000
> CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b
> CR2: 000000000000005f CR3: 0000000000201000 CR4: 00000000000006e0
> DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
> Process swapper (pid: 1, threadinfo ffff810824c56000, task ffff812024c52000)
> Stack: ffffffff80a57348 0000019500000000 ffff811024c5fa80 0000000000000000
> 00000000ffffff97 ffffffff802bfef0 0000000000000000 ffffffffffffffff
> 0000000000000000 ffffffff80bc3b4b ffff810824c57ee0 ffffffff80bc34a5
> Call Trace:
> [<ffffffff802bfef0>] ? create_proc_entry+0x73/0x8a
> [<ffffffff80bc3b4b>] ? sctp_snmp_proc_init+0x1c/0x34
> [<ffffffff80bc34a5>] ? sctp_init+0xef/0x711
> [<ffffffff80b976e3>] ? kernel_init+0x175/0x2e1
> [<ffffffff8020ccf8>] ? child_rip+0xa/0x12
> [<ffffffff80b9756e>] ? kernel_init+0x0/0x2e1
> [<ffffffff8020ccee>] ? child_rip+0x0/0x12
>
>
> Code: 1e 48 83 7b 38 00 75 08 48 c7 43 38 f0 e8 82 80 48 83 7b 30 00 75 08 48 c7 43 30 d0 e9 82 80 48 c7 c7 60 24 cc 80 e8 bd 5a 54 00 <48> 8b 45 60 48 89 6b 58 48 89 5d 60 48 89 43 50 fe 05 f5 25 a0
> RIP [<ffffffff802bfe55>] proc_register+0xe7/0x10f
> RSP <ffff810824c57e60>
> CR2: 000000000000005f
> ---[ end trace 02c2d78def82877a ]---
> Kernel panic - not syncing: Attempted to kill init!
>
> it turns out some variables near end of bss is corrupted already.
>
> in System.map we have
> ffffffff80d40420 b rsi_table
> ffffffff80d40620 B krb5_seq_lock
> ffffffff80d40628 b i.20437
> ffffffff80d40630 b xprt_rdma_inline_write_padding
> ffffffff80d40638 b sunrpc_table_header
> ffffffff80d40640 b zero
> ffffffff80d40644 b min_memreg
> ffffffff80d40648 b rpcrdma_tk_lock_g
> ffffffff80d40650 B sctp_assocs_id_lock
> ffffffff80d40658 B proc_net_sctp
> ffffffff80d40660 B sctp_assocs_id
> ffffffff80d40680 B sysctl_sctp_mem
> ffffffff80d40690 B sysctl_sctp_rmem
> ffffffff80d406a0 B sysctl_sctp_wmem
> ffffffff80d406b0 b sctp_ctl_socket
> ffffffff80d406b8 b sctp_pf_inet6_specific
> ffffffff80d406c0 b sctp_pf_inet_specific
> ffffffff80d406c8 b sctp_af_v4_specific
> ffffffff80d406d0 b sctp_af_v6_specific
> ffffffff80d406d8 b sctp_rand.33270
> ffffffff80d406dc b sctp_memory_pressure
> ffffffff80d406e0 b sctp_sockets_allocated
> ffffffff80d406e4 b sctp_memory_allocated
> ffffffff80d406e8 b sctp_sysctl_header
> ffffffff80d406f0 b zero
> ffffffff80d406f4 A __bss_stop
> ffffffff80d406f4 A _end
>
> and setup_node_bootmem() will use that page 0xd40000 for bootmap
> Bootmem setup node 0 0000000000000000-0000000828000000
> NODE_DATA [000000000008a485 - 0000000000091484]
> bootmap [0000000000d406f4 - 0000000000e456f3] pages 105
> Bootmem setup node 1 0000000828000000-0000001028000000
> NODE_DATA [0000000828000000 - 0000000828006fff]
> bootmap [0000000828007000 - 0000000828106fff] pages 100
> Bootmem setup node 2 0000001028000000-0000001828000000
> NODE_DATA [0000001028000000 - 0000001028006fff]
> bootmap [0000001028007000 - 0000001028106fff] pages 100
> Bootmem setup node 3 0000001828000000-0000002028000000
> NODE_DATA [0000001828000000 - 0000001828006fff]
> bootmap [0000001828007000 - 0000001828106fff] pages 100
>
> actually, setup_node_bootmem hope to make NODE_DATA to be ZONE_ALIGN (1<<(11+12)),
> and bootmap will after that in PAGE.
>
> the patch update early_node_mem, and find_e820_mem to make sure we can extra range
> for alignment.
>
> Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
>
please discard this one. I will have a new one.
Thanks
Yinghai Lu
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] x86_64: make early_node_mem return align address
2008-01-29 17:41 ` Yinghai Lu
@ 2008-01-30 2:55 ` Andi Kleen
2008-01-30 3:24 ` Yinghai Lu
0 siblings, 1 reply; 10+ messages in thread
From: Andi Kleen @ 2008-01-30 2:55 UTC (permalink / raw)
To: Yinghai Lu; +Cc: Ingo Molnar, Christoph Lameter, Andrew Morton, linux-kernel
On Tuesday 29 January 2008 18:41, Yinghai Lu wrote:
> On Tuesday 29 January 2008 01:33:29 am Andi Kleen wrote:
> > On Tuesday 29 January 2008 10:05, Yinghai Lu wrote:
> > > [PATCH 2/2] x86_64: make early_node_mem return align address
> > >
> > > boot oops when system get 64g or 128g installed
> >
> > Probably it should just use reserve_early(). Does this patch work?
> >
> > The alignment change is needed at some point too, but only to
> > relax the alignment to not force all early allocations to be page
> > padded.
>
> No, my patch doesn't force all early allocations to be page padded.
> for find_e820_mem, i just change PAGE_ALIGN to be aligned align
> parameter....
They are already all PAGE_ALIGN()ed (which is too strict, but needs
some care to fix properly), but your patch uses it the wrong way.
The PAGE_ALIGNment was added some time ago to avoid such
overlapping, but it should not actually be needed for that anymore.
>
> only make early_node_mem have aligned data. because it seems it like
> to...and assume that.
Using alignment doesn't seem the correct way to avoid overlapping.
If there is still overlap then some reservation needs to be extended.
> I think your patch will get early panic about overlap between bss and
> bootmem... like the 256g machine, bss is overlapped with early page
> table...
Well did you test it?
bss should have been reserved by this line in head64.c
reserve_early(__pa_symbol(&_text), __pa_symbol(&_end));
(in git-x86). In earlier kernels it was checked for explicitly by the e820
allocator.
-Andi
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH 2/2] x86_64: make early_node_mem return align address
2008-01-30 2:55 ` Andi Kleen
@ 2008-01-30 3:24 ` Yinghai Lu
0 siblings, 0 replies; 10+ messages in thread
From: Yinghai Lu @ 2008-01-30 3:24 UTC (permalink / raw)
To: Andi Kleen; +Cc: Ingo Molnar, Christoph Lameter, Andrew Morton, linux-kernel
On Tuesday 29 January 2008 06:55:45 pm Andi Kleen wrote:
> On Tuesday 29 January 2008 18:41, Yinghai Lu wrote:
> > On Tuesday 29 January 2008 01:33:29 am Andi Kleen wrote:
> > > On Tuesday 29 January 2008 10:05, Yinghai Lu wrote:
> > > > [PATCH 2/2] x86_64: make early_node_mem return align address
> > > >
> > > > boot oops when system get 64g or 128g installed
> > >
> > > Probably it should just use reserve_early(). Does this patch work?
> > >
> > > The alignment change is needed at some point too, but only to
> > > relax the alignment to not force all early allocations to be page
> > > padded.
> >
> > No, my patch doesn't force all early allocations to be page padded.
> > for find_e820_mem, i just change PAGE_ALIGN to be aligned align
> > parameter....
>
> They are already all PAGE_ALIGN()ed (which is too strict, but needs
> some care to fix properly), but your patch uses it the wrong way.
> The PAGE_ALIGNment was added some time ago to avoid such over
> lapping, but it should not actually be needed for that anymore.
>
> >
> > only make early_node_mem have aligned data. because it seems it like
> > to...and assume that.
>
> Using alignment doesn't seem the correct way to avoid overlapping.
>
> If there is still overlap then some reservation needs to be extended.
>
> > I think your patch will get early panic about overlap between bss and
> > bootmem... like the 256g machine, bss is overlapped with early page
> > table...
>
> Well did you test it?
>
> bss should have been reserved by this line in head64.c
>
> reserve_early(__pa_symbol(&_text), __pa_symbol(&_end));
>
> (in git-x86). In earlier kernels it was checked for explicitely by the e820
> allocator.
No early panic, but the end of bss still gets corrupted,
because bootmap_start is only used as a page frame number (bootmap_start >> PAGE_SHIFT), so the bootmap begins at the start of that page and overlaps the bss tail page.
YH
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] x86_64: mark x86_cpu_to_node_map_init to __initdata like other xx_init
[not found] ` <20080201170908.GB2159@elte.hu>
@ 2008-02-01 21:29 ` Yinghai Lu
0 siblings, 0 replies; 10+ messages in thread
From: Yinghai Lu @ 2008-02-01 21:29 UTC (permalink / raw)
To: Ingo Molnar; +Cc: linux-kernel
[PATCH] x86_64: mark x86_cpu_to_node_map_init to __initdata like other xx_init
x86_cpu_to_apicid_init and x86_bios_cpu_apicid_init are defined with __initdata.
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f0e5cab..d7af3fd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -31,7 +31,7 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES];
struct memnode memnode;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
+int x86_cpu_to_node_map_init[NR_CPUS] __initdata = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
void *x86_cpu_to_node_map_early_ptr;
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index 8af05a9..d3340de 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -35,7 +35,7 @@ extern int cpu_to_node_map[];
#else
DECLARE_PER_CPU(int, x86_cpu_to_node_map);
-extern int x86_cpu_to_node_map_init[];
+extern int __initdata x86_cpu_to_node_map_init[];
extern void *x86_cpu_to_node_map_early_ptr;
/* Returns the number of the current Node. */
#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id()))
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] x86_64: mark x86_cpu_to_node_map_init to __initdata like other xx_init
2008-01-28 9:16 Yinghai Lu
@ 2008-01-28 10:34 ` Ingo Molnar
0 siblings, 0 replies; 10+ messages in thread
From: Ingo Molnar @ 2008-01-28 10:34 UTC (permalink / raw)
To: Yinghai Lu
Cc: Mike Travis, Christoph Lameter, Linux Kernel Mailing List, Sam Ravnborg
* Yinghai Lu <Yinghai.Lu@Sun.COM> wrote:
> -int x86_cpu_to_node_map_init[NR_CPUS] = {
> +int x86_cpu_to_node_map_init[NR_CPUS] __initdata = {
> [0 ... NR_CPUS-1] = NUMA_NO_NODE
> };
i remember some linker warning here. While this array should indeed only
be used in early init, that decision is dynamic and our linker warnings
do not notice it. There's a special marker for such cases:
__initdata_refok. But ... i'm slightly nervous about turning off a vital
warning like that.
Sam, it would be nice to have a DEBUG_INITDATA mode of operation: in
this case free_initmem() would not truly free those pages but would
unmap them via:
kernel_map_pages(page, nrpages, 0);
could be made dependent on DEBUG_PAGEALLOC.
If this debugging is enabled then if any code references it, we get a
hard page fault. If we had a debug mode like that then bugs in this area
would not go unnoticed.
Or perhaps just make this part of normal DEBUG_PAGEALLOC. Like the patch
below on top of latest x86.git. Hm?
Ingo
------------------->
Subject: x86: init memory debugging
From: Ingo Molnar <mingo@elte.hu>
debug incorrect/late access to init memory, by permanently unmapping
the init memory ranges. Depends on CONFIG_DEBUG_PAGEALLOC=y.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
arch/x86/mm/init_32.c | 12 ++++++++++++
arch/x86/mm/init_64.c | 11 +++++++++++
2 files changed, 23 insertions(+)
Index: linux-x86.q/arch/x86/mm/init_32.c
===================================================================
--- linux-x86.q.orig/arch/x86/mm/init_32.c
+++ linux-x86.q/arch/x86/mm/init_32.c
@@ -794,6 +794,18 @@ void free_init_pages(char *what, unsigne
unsigned long addr;
/*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+ return;
+#endif
+ set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
+ /*
* We just marked the kernel text read only above, now that
* we are going to free part of that, we need to make that
* writeable first.
Index: linux-x86.q/arch/x86/mm/init_64.c
===================================================================
--- linux-x86.q.orig/arch/x86/mm/init_64.c
+++ linux-x86.q/arch/x86/mm/init_64.c
@@ -580,6 +580,17 @@ void free_init_pages(char *what, unsigne
if (begin >= end)
return;
+ /*
+ * If debugging page accesses then do not free this memory but
+ * mark them not present - any buggy init-section access will
+ * create a kernel page fault:
+ */
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
+ begin, PAGE_ALIGN(end));
+ set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
+ return;
+#endif
printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
for (addr = begin; addr < end; addr += PAGE_SIZE) {
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] x86_64: mark x86_cpu_to_node_map_init to __initdata like other xx_init
@ 2008-01-28 9:16 Yinghai Lu
2008-01-28 10:34 ` Ingo Molnar
0 siblings, 1 reply; 10+ messages in thread
From: Yinghai Lu @ 2008-01-28 9:16 UTC (permalink / raw)
To: Ingo Molnar; +Cc: Mike Travis, Christoph Lameter, Linux Kernel Mailing List
[PATCH] x86_64: mark x86_cpu_to_node_map_init to __initdata like other xx_init
Signed-off-by: Yinghai Lu <yinghai.lu@sun.com>
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index f0e5cab..d7af3fd 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -31,7 +31,7 @@ bootmem_data_t plat_node_bdata[MAX_NUMNODES];
struct memnode memnode;
-int x86_cpu_to_node_map_init[NR_CPUS] = {
+int x86_cpu_to_node_map_init[NR_CPUS] __initdata = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
void *x86_cpu_to_node_map_early_ptr;
diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h
index 8af05a9..d3340de 100644
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -35,7 +35,7 @@ extern int cpu_to_node_map[];
#else
DECLARE_PER_CPU(int, x86_cpu_to_node_map);
-extern int x86_cpu_to_node_map_init[];
+extern int __initdata x86_cpu_to_node_map_init[];
extern void *x86_cpu_to_node_map_early_ptr;
/* Returns the number of the current Node. */
#define numa_node_id() (early_cpu_to_node(raw_smp_processor_id()))
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2008-02-01 21:22 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <200801290053.45776.yinghai.lu@sun.com>
2008-01-29 9:05 ` [PATCH 2/2] x86_64: make early_node_mem return align address Yinghai Lu
2008-01-29 9:33 ` Andi Kleen
2008-01-29 17:41 ` Yinghai Lu
2008-01-30 2:55 ` Andi Kleen
2008-01-30 3:24 ` Yinghai Lu
2008-01-29 18:08 ` Yinghai Lu
2008-01-29 9:05 ` [PATCH 1/2] print out node_data addr and bootmap_start addr Yinghai Lu
[not found] ` <20080201170908.GB2159@elte.hu>
2008-02-01 21:29 ` [PATCH] x86_64: mark x86_cpu_to_node_map_init to __initdata like other xx_init Yinghai Lu
2008-01-28 9:16 Yinghai Lu
2008-01-28 10:34 ` Ingo Molnar
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).