* [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
@ 2008-01-13 18:34 ` travis
2008-01-14 12:23 ` Mel Gorman
2008-01-14 18:10 ` Jan Engelhardt
2008-01-13 18:34 ` [PATCH 02/10] x86: Change size of node ids " travis
` (9 subsequent siblings)
10 siblings, 2 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:34 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: big_apicids --]
[-- Type: text/plain, Size: 6602 bytes --]
Change the size of APICIDs from u8 to u16. This partially
supports the new x2apic mode that will be present on future
processor chips. (Chips actually support 32-bit APICIDs, but that
change is more intrusive. Supporting 16-bit is sufficient for now).
Signed-off-by: Jack Steiner <steiner@sgi.com>
I've included just the partial change from u8 to u16 apicids. The
remaining x2apic changes will be in a separate patch.
In addition, the fake_node_to_pxm_map[] and fake_apicid_to_node[]
tables have been moved from local data to the __initdata section
reducing stack pressure when MAX_NUMNODES and MAX_LOCAL_APIC are
increased in size.
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/genapic_64.c | 4 ++--
arch/x86/kernel/mpparse_64.c | 4 ++--
arch/x86/kernel/smpboot_64.c | 2 +-
arch/x86/mm/numa_64.c | 2 +-
arch/x86/mm/srat_64.c | 22 +++++++++++++---------
include/asm-x86/processor.h | 14 +++++++-------
include/asm-x86/smp_64.h | 8 ++++----
7 files changed, 30 insertions(+), 26 deletions(-)
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,10 +32,10 @@
* array during this time. Is it zeroed when the per_cpu
* data area is removed.
*/
-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
+u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
= { [0 ... NR_CPUS-1] = BAD_APICID };
void *x86_cpu_to_apicid_ptr;
-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
struct genapic __read_mostly *genapic = &apic_flat;
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -67,7 +67,7 @@ unsigned disabled_cpus __cpuinitdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u16 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
/*
@@ -132,7 +132,7 @@ static void __cpuinit MP_processor_info(
* area is created.
*/
if (x86_cpu_to_apicid_ptr) {
- u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
+ u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
x86_cpu_to_apicid[cpu] = m->mpc_apicid;
} else {
per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -65,7 +65,7 @@ int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID;
+DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -627,7 +627,7 @@ void __init init_cpu_to_node(void)
int i;
for (i = 0; i < NR_CPUS; i++) {
- u8 apicid = x86_cpu_to_apicid_init[i];
+ u16 apicid = x86_cpu_to_apicid_init[i];
if (apicid == BAD_APICID)
continue;
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -130,6 +130,9 @@ void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
int pxm, node;
+ int apic_id;
+
+ apic_id = pa->apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,10 +148,10 @@ acpi_numa_processor_affinity_init(struct
bad_srat();
return;
}
- apicid_to_node[pa->apic_id] = node;
+ apicid_to_node[apic_id] = node;
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
- pxm, pa->apic_id, node);
+ pxm, apic_id, node);
}
int update_end_of_memory(unsigned long end) {return -1;}
@@ -343,7 +346,8 @@ int __init acpi_scan_nodes(unsigned long
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
cutoff_node(i, start, end);
- if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
+ /* ZZZ why was this needed. At least add a comment */
+ if (nodes[i].end && (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
unparse_node(i);
node_set_offline(i);
}
@@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long
}
#ifdef CONFIG_NUMA_EMU
+static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
+ [0 ... MAX_NUMNODES-1] = PXM_INVAL
+};
+static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
static int __init find_node_by_addr(unsigned long addr)
{
int ret = NUMA_NO_NODE;
@@ -414,12 +424,6 @@ static int __init find_node_by_addr(unsi
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- int fake_node_to_pxm_map[MAX_NUMNODES] = {
- [0 ... MAX_NUMNODES-1] = PXM_INVAL
- };
- unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
- [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
- };
printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
"topology.\n");
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -86,14 +86,14 @@ struct cpuinfo_x86 {
#ifdef CONFIG_SMP
cpumask_t llc_shared_map; /* cpus sharing the last level cache */
#endif
- unsigned char x86_max_cores; /* cpuid returned max cores value */
- unsigned char apicid;
- unsigned short x86_clflush_size;
+ u16 x86_max_cores; /* cpuid returned max cores value */
+ u16 apicid;
+ u16 x86_clflush_size;
#ifdef CONFIG_SMP
- unsigned char booted_cores; /* number of cores as seen by OS */
- __u8 phys_proc_id; /* Physical processor id. */
- __u8 cpu_core_id; /* Core id */
- __u8 cpu_index; /* index into per_cpu list */
+ u16 booted_cores; /* number of cores as seen by OS */
+ u16 phys_proc_id; /* Physical processor id. */
+ u16 cpu_core_id; /* Core id */
+ u16 cpu_index; /* index into per_cpu list */
#endif
} __attribute__((__aligned__(SMP_CACHE_BYTES)));
--- a/include/asm-x86/smp_64.h
+++ b/include/asm-x86/smp_64.h
@@ -26,14 +26,14 @@ extern void unlock_ipi_call_lock(void);
extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait);
-extern u8 __initdata x86_cpu_to_apicid_init[];
+extern u16 __initdata x86_cpu_to_apicid_init[];
extern void *x86_cpu_to_apicid_ptr;
-extern u8 bios_cpu_apicid[];
+extern u16 bios_cpu_apicid[];
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
-DECLARE_PER_CPU(u8, cpu_llc_id);
-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
static inline int cpu_present_to_apicid(int mps_cpu)
{
--
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-13 18:34 ` [PATCH 01/10] x86: Change size of APICIDs from u8 to u16 travis
@ 2008-01-14 12:23 ` Mel Gorman
2008-01-14 18:13 ` Mike Travis
2008-01-14 19:26 ` Mike Travis
2008-01-14 18:10 ` Jan Engelhardt
1 sibling, 2 replies; 33+ messages in thread
From: Mel Gorman @ 2008-01-14 12:23 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
On (13/01/08 10:34), travis@sgi.com didst pronounce:
> Change the size of APICIDs from u8 to u16. This partially
> supports the new x2apic mode that will be present on future
> processor chips. (Chips actually support 32-bit APICIDs, but that
> change is more intrusive. Supporting 16-bit is sufficient for now).
>
> Signed-off-by: Jack Steiner <steiner@sgi.com>
>
> I've included just the partial change from u8 to u16 apicids. The
> remaining x2apic changes will be in a separate patch.
>
> In addition, the fake_node_to_pxm_map[] and fake_apicid_to_node[]
> tables have been moved from local data to the __initdata section
> reducing stack pressure when MAX_NUMNODES and MAX_LOCAL_APIC are
> increased in size.
>
Does this make a difference to inter-node effects?
> Signed-off-by: Mike Travis <travis@sgi.com>
> Reviewed-by: Christoph Lameter <clameter@sgi.com>
> ---
> arch/x86/kernel/genapic_64.c | 4 ++--
> arch/x86/kernel/mpparse_64.c | 4 ++--
> arch/x86/kernel/smpboot_64.c | 2 +-
> arch/x86/mm/numa_64.c | 2 +-
> arch/x86/mm/srat_64.c | 22 +++++++++++++---------
> include/asm-x86/processor.h | 14 +++++++-------
> include/asm-x86/smp_64.h | 8 ++++----
> 7 files changed, 30 insertions(+), 26 deletions(-)
>
> --- a/arch/x86/kernel/genapic_64.c
> +++ b/arch/x86/kernel/genapic_64.c
> @@ -32,10 +32,10 @@
> * array during this time. Is it zeroed when the per_cpu
> * data area is removed.
> */
> -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
> +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
> = { [0 ... NR_CPUS-1] = BAD_APICID };
> void *x86_cpu_to_apicid_ptr;
> -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
> +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
> EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
>
> struct genapic __read_mostly *genapic = &apic_flat;
> --- a/arch/x86/kernel/mpparse_64.c
> +++ b/arch/x86/kernel/mpparse_64.c
> @@ -67,7 +67,7 @@ unsigned disabled_cpus __cpuinitdata;
> /* Bitmask of physically existing CPUs */
> physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
>
> -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
> +u16 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
>
>
> /*
> @@ -132,7 +132,7 @@ static void __cpuinit MP_processor_info(
> * area is created.
> */
> if (x86_cpu_to_apicid_ptr) {
> - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
> + u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
> x86_cpu_to_apicid[cpu] = m->mpc_apicid;
> } else {
> per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
> --- a/arch/x86/kernel/smpboot_64.c
> +++ b/arch/x86/kernel/smpboot_64.c
> @@ -65,7 +65,7 @@ int smp_num_siblings = 1;
> EXPORT_SYMBOL(smp_num_siblings);
>
> /* Last level cache ID of each logical CPU */
> -DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID;
> +DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
>
> /* Bitmask of currently online CPUs */
> cpumask_t cpu_online_map __read_mostly;
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -627,7 +627,7 @@ void __init init_cpu_to_node(void)
> int i;
>
> for (i = 0; i < NR_CPUS; i++) {
> - u8 apicid = x86_cpu_to_apicid_init[i];
> + u16 apicid = x86_cpu_to_apicid_init[i];
>
> if (apicid == BAD_APICID)
> continue;
> --- a/arch/x86/mm/srat_64.c
> +++ b/arch/x86/mm/srat_64.c
> @@ -130,6 +130,9 @@ void __init
> acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
> {
> int pxm, node;
> + int apic_id;
> +
> + apic_id = pa->apic_id;
> if (srat_disabled())
> return;
> if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
> @@ -145,10 +148,10 @@ acpi_numa_processor_affinity_init(struct
> bad_srat();
> return;
> }
> - apicid_to_node[pa->apic_id] = node;
> + apicid_to_node[apic_id] = node;
> acpi_numa = 1;
> printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
> - pxm, pa->apic_id, node);
> + pxm, apic_id, node);
> }
>
> int update_end_of_memory(unsigned long end) {return -1;}
> @@ -343,7 +346,8 @@ int __init acpi_scan_nodes(unsigned long
> /* First clean up the node list */
> for (i = 0; i < MAX_NUMNODES; i++) {
> cutoff_node(i, start, end);
> - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
> + /* ZZZ why was this needed. At least add a comment */
> + if (nodes[i].end && (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
Care to actually add a comment? This looks like a note to yourself that
got missed.
> unparse_node(i);
> node_set_offline(i);
> }
> @@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long
> }
>
> #ifdef CONFIG_NUMA_EMU
> +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
> + [0 ... MAX_NUMNODES-1] = PXM_INVAL
> +};
> +static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
> + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
> +};
> static int __init find_node_by_addr(unsigned long addr)
> {
> int ret = NUMA_NO_NODE;
> @@ -414,12 +424,6 @@ static int __init find_node_by_addr(unsi
> void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
> {
> int i, j;
> - int fake_node_to_pxm_map[MAX_NUMNODES] = {
> - [0 ... MAX_NUMNODES-1] = PXM_INVAL
> - };
> - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
> - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
> - };
>
> printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
> "topology.\n");
> --- a/include/asm-x86/processor.h
> +++ b/include/asm-x86/processor.h
> @@ -86,14 +86,14 @@ struct cpuinfo_x86 {
> #ifdef CONFIG_SMP
> cpumask_t llc_shared_map; /* cpus sharing the last level cache */
> #endif
> - unsigned char x86_max_cores; /* cpuid returned max cores value */
> - unsigned char apicid;
> - unsigned short x86_clflush_size;
> + u16 x86_max_cores; /* cpuid returned max cores value */
> + u16 apicid;
> + u16 x86_clflush_size;
> #ifdef CONFIG_SMP
> - unsigned char booted_cores; /* number of cores as seen by OS */
> - __u8 phys_proc_id; /* Physical processor id. */
> - __u8 cpu_core_id; /* Core id */
> - __u8 cpu_index; /* index into per_cpu list */
> + u16 booted_cores; /* number of cores as seen by OS */
> + u16 phys_proc_id; /* Physical processor id. */
> + u16 cpu_core_id; /* Core id */
> + u16 cpu_index; /* index into per_cpu list */
> #endif
> } __attribute__((__aligned__(SMP_CACHE_BYTES)));
>
> --- a/include/asm-x86/smp_64.h
> +++ b/include/asm-x86/smp_64.h
> @@ -26,14 +26,14 @@ extern void unlock_ipi_call_lock(void);
> extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
> void *info, int wait);
>
> -extern u8 __initdata x86_cpu_to_apicid_init[];
> +extern u16 __initdata x86_cpu_to_apicid_init[];
> extern void *x86_cpu_to_apicid_ptr;
> -extern u8 bios_cpu_apicid[];
> +extern u16 bios_cpu_apicid[];
>
> DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
> DECLARE_PER_CPU(cpumask_t, cpu_core_map);
> -DECLARE_PER_CPU(u8, cpu_llc_id);
> -DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
> +DECLARE_PER_CPU(u16, cpu_llc_id);
> +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
>
> static inline int cpu_present_to_apicid(int mps_cpu)
> {
>
> --
>
--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-14 12:23 ` Mel Gorman
@ 2008-01-14 18:13 ` Mike Travis
2008-01-14 19:26 ` Mike Travis
1 sibling, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 18:13 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
Mel Gorman wrote:
> On (13/01/08 10:34), travis@sgi.com didst pronounce:
>> Change the size of APICIDs from u8 to u16. This partially
>> supports the new x2apic mode that will be present on future
>> processor chips. (Chips actually support 32-bit APICIDs, but that
>> change is more intrusive. Supporting 16-bit is sufficient for now).
>>
>> Signed-off-by: Jack Steiner <steiner@sgi.com>
>>
>> I've included just the partial change from u8 to u16 apicids. The
>> remaining x2apic changes will be in a separate patch.
>>
>> In addition, the fake_node_to_pxm_map[] and fake_apicid_to_node[]
>> tables have been moved from local data to the __initdata section
>> reducing stack pressure when MAX_NUMNODES and MAX_LOCAL_APIC are
>> increased in size.
>>
>
> Does this make a difference to inter-node effects?
Are you asking about the movement of the fake arrays? Since these
are used once and then discarded, it shouldn't have any effect.
If you are asking about the general increase to 16 bits, I don't
think we have much choice. The hardware for 16-bit (and, as mentioned,
32-bit) APICIDs is coming. I'm trying to minimize traffic between
node 0 and the other nodes as much as possible. AFAICT the apic id is
used mostly for sending IPIs, either to cpus within the node or to
remote cpus. Using the remote cpu's own node memory to query its
apicid seems reasonable? And, of course, having all the data about
one's own node local to itself seems a big win as well.
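A minimal sketch of that access pattern, assuming the per-CPU
x86_cpu_to_apicid from patch 01; the helper name is invented purely
for illustration and is not part of the series:

	/*
	 * Illustrative only: with x86_cpu_to_apicid in per-CPU data, an
	 * IPI sender reads the destination's APIC ID from that CPU's
	 * node-local per-CPU area instead of a shared NR_CPUS array
	 * sitting on node 0.
	 */
	static inline u16 apicid_of_cpu(int cpu)	/* hypothetical helper */
	{
		return per_cpu(x86_cpu_to_apicid, cpu);
	}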
Thanks,
Mike
>
>> Signed-off-by: Mike Travis <travis@sgi.com>
>> Reviewed-by: Christoph Lameter <clameter@sgi.com>
>> ---
>> arch/x86/kernel/genapic_64.c | 4 ++--
>> arch/x86/kernel/mpparse_64.c | 4 ++--
>> arch/x86/kernel/smpboot_64.c | 2 +-
>> arch/x86/mm/numa_64.c | 2 +-
>> arch/x86/mm/srat_64.c | 22 +++++++++++++---------
>> include/asm-x86/processor.h | 14 +++++++-------
>> include/asm-x86/smp_64.h | 8 ++++----
>> 7 files changed, 30 insertions(+), 26 deletions(-)
>>
>> --- a/arch/x86/kernel/genapic_64.c
>> +++ b/arch/x86/kernel/genapic_64.c
>> @@ -32,10 +32,10 @@
>> * array during this time. Is it zeroed when the per_cpu
>> * data area is removed.
>> */
>> -u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
>> +u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
>> = { [0 ... NR_CPUS-1] = BAD_APICID };
>> void *x86_cpu_to_apicid_ptr;
>> -DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
>> +DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
>> EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
>>
>> struct genapic __read_mostly *genapic = &apic_flat;
>> --- a/arch/x86/kernel/mpparse_64.c
>> +++ b/arch/x86/kernel/mpparse_64.c
>> @@ -67,7 +67,7 @@ unsigned disabled_cpus __cpuinitdata;
>> /* Bitmask of physically existing CPUs */
>> physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
>>
>> -u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
>> +u16 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
>>
>>
>> /*
>> @@ -132,7 +132,7 @@ static void __cpuinit MP_processor_info(
>> * area is created.
>> */
>> if (x86_cpu_to_apicid_ptr) {
>> - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
>> + u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
>> x86_cpu_to_apicid[cpu] = m->mpc_apicid;
>> } else {
>> per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
>> --- a/arch/x86/kernel/smpboot_64.c
>> +++ b/arch/x86/kernel/smpboot_64.c
>> @@ -65,7 +65,7 @@ int smp_num_siblings = 1;
>> EXPORT_SYMBOL(smp_num_siblings);
>>
>> /* Last level cache ID of each logical CPU */
>> -DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID;
>> +DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;
>>
>> /* Bitmask of currently online CPUs */
>> cpumask_t cpu_online_map __read_mostly;
>> --- a/arch/x86/mm/numa_64.c
>> +++ b/arch/x86/mm/numa_64.c
>> @@ -627,7 +627,7 @@ void __init init_cpu_to_node(void)
>> int i;
>>
>> for (i = 0; i < NR_CPUS; i++) {
>> - u8 apicid = x86_cpu_to_apicid_init[i];
>> + u16 apicid = x86_cpu_to_apicid_init[i];
>>
>> if (apicid == BAD_APICID)
>> continue;
>> --- a/arch/x86/mm/srat_64.c
>> +++ b/arch/x86/mm/srat_64.c
>> @@ -130,6 +130,9 @@ void __init
>> acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
>> {
>> int pxm, node;
>> + int apic_id;
>> +
>> + apic_id = pa->apic_id;
>> if (srat_disabled())
>> return;
>> if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
>> @@ -145,10 +148,10 @@ acpi_numa_processor_affinity_init(struct
>> bad_srat();
>> return;
>> }
>> - apicid_to_node[pa->apic_id] = node;
>> + apicid_to_node[apic_id] = node;
>> acpi_numa = 1;
>> printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
>> - pxm, pa->apic_id, node);
>> + pxm, apic_id, node);
>> }
>>
>> int update_end_of_memory(unsigned long end) {return -1;}
>> @@ -343,7 +346,8 @@ int __init acpi_scan_nodes(unsigned long
>> /* First clean up the node list */
>> for (i = 0; i < MAX_NUMNODES; i++) {
>> cutoff_node(i, start, end);
>> - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
>> + /* ZZZ why was this needed. At least add a comment */
>> + if (nodes[i].end && (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
>
> Care to actually add a comment? This looks like a note to yourself that
> got missed.
>
>> unparse_node(i);
>> node_set_offline(i);
>> }
>> @@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long
>> }
>>
>> #ifdef CONFIG_NUMA_EMU
>> +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
>> + [0 ... MAX_NUMNODES-1] = PXM_INVAL
>> +};
>> +static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
>> + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>> +};
>> static int __init find_node_by_addr(unsigned long addr)
>> {
>> int ret = NUMA_NO_NODE;
>> @@ -414,12 +424,6 @@ static int __init find_node_by_addr(unsi
>> void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
>> {
>> int i, j;
>> - int fake_node_to_pxm_map[MAX_NUMNODES] = {
>> - [0 ... MAX_NUMNODES-1] = PXM_INVAL
>> - };
>> - unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
>> - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>> - };
>>
>> printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
>> "topology.\n");
>> --- a/include/asm-x86/processor.h
>> +++ b/include/asm-x86/processor.h
>> @@ -86,14 +86,14 @@ struct cpuinfo_x86 {
>> #ifdef CONFIG_SMP
>> cpumask_t llc_shared_map; /* cpus sharing the last level cache */
>> #endif
>> - unsigned char x86_max_cores; /* cpuid returned max cores value */
>> - unsigned char apicid;
>> - unsigned short x86_clflush_size;
>> + u16 x86_max_cores; /* cpuid returned max cores value */
>> + u16 apicid;
>> + u16 x86_clflush_size;
>> #ifdef CONFIG_SMP
>> - unsigned char booted_cores; /* number of cores as seen by OS */
>> - __u8 phys_proc_id; /* Physical processor id. */
>> - __u8 cpu_core_id; /* Core id */
>> - __u8 cpu_index; /* index into per_cpu list */
>> + u16 booted_cores; /* number of cores as seen by OS */
>> + u16 phys_proc_id; /* Physical processor id. */
>> + u16 cpu_core_id; /* Core id */
>> + u16 cpu_index; /* index into per_cpu list */
>> #endif
>> } __attribute__((__aligned__(SMP_CACHE_BYTES)));
>>
>> --- a/include/asm-x86/smp_64.h
>> +++ b/include/asm-x86/smp_64.h
>> @@ -26,14 +26,14 @@ extern void unlock_ipi_call_lock(void);
>> extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
>> void *info, int wait);
>>
>> -extern u8 __initdata x86_cpu_to_apicid_init[];
>> +extern u16 __initdata x86_cpu_to_apicid_init[];
>> extern void *x86_cpu_to_apicid_ptr;
>> -extern u8 bios_cpu_apicid[];
>> +extern u16 bios_cpu_apicid[];
>>
>> DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
>> DECLARE_PER_CPU(cpumask_t, cpu_core_map);
>> -DECLARE_PER_CPU(u8, cpu_llc_id);
>> -DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
>> +DECLARE_PER_CPU(u16, cpu_llc_id);
>> +DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
>>
>> static inline int cpu_present_to_apicid(int mps_cpu)
>> {
>>
>> --
>>
>
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-14 12:23 ` Mel Gorman
2008-01-14 18:13 ` Mike Travis
@ 2008-01-14 19:26 ` Mike Travis
1 sibling, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 19:26 UTC (permalink / raw)
To: Mel Gorman
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
Mel Gorman wrote:
> On (13/01/08 10:34), travis@sgi.com didst pronounce:
...
>> int update_end_of_memory(unsigned long end) {return -1;}
>> @@ -343,7 +346,8 @@ int __init acpi_scan_nodes(unsigned long
>> /* First clean up the node list */
>> for (i = 0; i < MAX_NUMNODES; i++) {
>> cutoff_node(i, start, end);
>> - if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
>> + /* ZZZ why was this needed. At least add a comment */
>> + if (nodes[i].end && (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
>
> Care to actually add a comment? This looks like a note to yourself that
> got missed.
Oops, sorry, I missed this the first time.
Actually, that was a note from someone else and I didn't address it.
(Weirdly, I had removed it, but some quilt refresh demon brought it back.) ;-)
We found this error in testing with a virtual BIOS, but I don't think we
ever figured out whether it was a bug in our BIOS or a genuine error.
But in any case, I'll fix it.
Thanks,
Mike
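A sketch of how the missing comment might read, purely for illustration;
the rationale restates the virtual-BIOS observation above and is an
assumption, not the fix that was eventually posted:

	/*
	 * Sketch only: skip slots with no end address so the
	 * NODE_MIN_SIZE check does not act on nodes that were never
	 * populated; a case first seen when testing with a virtual
	 * BIOS, as noted above.
	 */
	if (nodes[i].end && (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
		unparse_node(i);
		node_set_offline(i);
	}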
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-13 18:34 ` [PATCH 01/10] x86: Change size of APICIDs from u8 to u16 travis
2008-01-14 12:23 ` Mel Gorman
@ 2008-01-14 18:10 ` Jan Engelhardt
2008-01-14 18:22 ` Mike Travis
2008-01-14 18:32 ` Mike Travis
1 sibling, 2 replies; 33+ messages in thread
From: Jan Engelhardt @ 2008-01-14 18:10 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
On Jan 13 2008 10:34, travis@sgi.com wrote:
>--- a/arch/x86/kernel/mpparse_64.c
>+++ b/arch/x86/kernel/mpparse_64.c
>@@ -132,7 +132,7 @@ static void __cpuinit MP_processor_info(
> * area is created.
> */
> if (x86_cpu_to_apicid_ptr) {
>- u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
>+ u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
> x86_cpu_to_apicid[cpu] = m->mpc_apicid;
> } else {
> per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
You can do away with the cast while modifying this line.
>--- a/arch/x86/mm/srat_64.c
>+++ b/arch/x86/mm/srat_64.c
>@@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long
> }
>
> #ifdef CONFIG_NUMA_EMU
>+static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
>+ [0 ... MAX_NUMNODES-1] = PXM_INVAL
>+};
>+static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
>+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>+};
> static int __init find_node_by_addr(unsigned long addr)
> {
> int ret = NUMA_NO_NODE;
No u8/u16 here?
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-14 18:10 ` Jan Engelhardt
@ 2008-01-14 18:22 ` Mike Travis
2008-01-14 18:32 ` Mike Travis
1 sibling, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 18:22 UTC (permalink / raw)
To: Jan Engelhardt
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
Jan Engelhardt wrote:
> On Jan 13 2008 10:34, travis@sgi.com wrote:
>> --- a/arch/x86/kernel/mpparse_64.c
>> +++ b/arch/x86/kernel/mpparse_64.c
>> @@ -132,7 +132,7 @@ static void __cpuinit MP_processor_info(
>> * area is created.
>> */
>> if (x86_cpu_to_apicid_ptr) {
>> - u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
>> + u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
>> x86_cpu_to_apicid[cpu] = m->mpc_apicid;
>> } else {
>> per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
>
> You can do away with the cast while modifying this line.
Thanks! For some reason I had problems with the 'inter-section'
referencing and this slipped in while addressing that problem.
>
>> --- a/arch/x86/mm/srat_64.c
>> +++ b/arch/x86/mm/srat_64.c
>> @@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long
>> }
>>
>> #ifdef CONFIG_NUMA_EMU
>> +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
>> + [0 ... MAX_NUMNODES-1] = PXM_INVAL
>> +};
>> +static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
>> + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>> +};
>> static int __init find_node_by_addr(unsigned long addr)
>> {
>> int ret = NUMA_NO_NODE;
>
> No u8/u16 here?
Good point.
Thanks,
Mike
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-14 18:10 ` Jan Engelhardt
2008-01-14 18:22 ` Mike Travis
@ 2008-01-14 18:32 ` Mike Travis
2008-01-14 19:16 ` Christoph Lameter
1 sibling, 1 reply; 33+ messages in thread
From: Mike Travis @ 2008-01-14 18:32 UTC (permalink / raw)
To: Jan Engelhardt
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
Jan Engelhardt wrote:
...
>> --- a/arch/x86/mm/srat_64.c
>> +++ b/arch/x86/mm/srat_64.c
>> @@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long
>> }
>>
>> #ifdef CONFIG_NUMA_EMU
>> +static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
>> + [0 ... MAX_NUMNODES-1] = PXM_INVAL
>> +};
>> +static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
>> + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
>> +};
>> static int __init find_node_by_addr(unsigned long addr)
>> {
>> int ret = NUMA_NO_NODE;
>
> No u8/u16 here?
I see the mistake in the node array. But AFAICT, pxm is the proximity
domain of a node and cannot be greater than the number of nodes, yes?
(Or can it be an arbitrary value for which 32 bits is necessary?)
I ask because the real node_to_pxm_map is already 32 bits.
Thanks,
Mike
* Re: [PATCH 01/10] x86: Change size of APICIDs from u8 to u16
2008-01-14 18:32 ` Mike Travis
@ 2008-01-14 19:16 ` Christoph Lameter
0 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2008-01-14 19:16 UTC (permalink / raw)
To: Mike Travis
Cc: Jan Engelhardt, Andrew Morton, Andi Kleen, mingo, Jack Steiner,
linux-mm, linux-kernel
On Mon, 14 Jan 2008, Mike Travis wrote:
> I see the mistake in the node array. But AFAICT, pxm is the proximity
> between nodes and cannot be expressed as greater than the number of
> nodes, yes? (Or can it be arbitrarily expressed where 32 bits is
> necessary?) I ask this because the real node_to_pxm_map is already
> 32 bits.
Well I think local variables that contain a node can be int without a
problem because that is what the core used to store node ids.
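A short sketch of the convention being described, using the u16 arrays
from this series; the lookup function is invented purely for
illustration:

	/* Bulk storage stays narrow; locals holding a node id stay plain int. */
	static u16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
		[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
	};

	static void __init mark_node_online(int apicid)	/* hypothetical */
	{
		int node = fake_apicid_to_node[apicid];	/* int is fine here */

		if (node != NUMA_NO_NODE)
			node_set_online(node);
	}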
* [PATCH 02/10] x86: Change size of node ids from u8 to u16
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
2008-01-13 18:34 ` [PATCH 01/10] x86: Change size of APICIDs from u8 to u16 travis
@ 2008-01-13 18:34 ` travis
2008-01-13 20:01 ` Eric Dumazet
2008-01-13 18:34 ` [PATCH 03/10] x86: Change NR_CPUS arrays in powernow-k8 travis
` (8 subsequent siblings)
10 siblings, 1 reply; 33+ messages in thread
From: travis @ 2008-01-13 18:34 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: big_nodeids --]
[-- Type: text/plain, Size: 2947 bytes --]
Change the size of node ids from 8 bits to 16 bits to
accommodate more than 256 nodes.
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/mm/numa_64.c | 9 ++++++---
arch/x86/mm/srat_64.c | 2 +-
include/asm-x86/numa_64.h | 4 ++--
include/asm-x86/topology.h | 2 +-
4 files changed, 10 insertions(+), 7 deletions(-)
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -11,6 +11,7 @@
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>
+#include <linux/sched.h>
#include <asm/e820.h>
#include <asm/proto.h>
@@ -30,12 +31,12 @@ bootmem_data_t plat_node_bdata[MAX_NUMNO
struct memnode memnode;
-int cpu_to_node_map[NR_CPUS] __read_mostly = {
+u16 cpu_to_node_map[NR_CPUS] __read_mostly = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
EXPORT_SYMBOL(cpu_to_node_map);
-unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
+u16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
@@ -544,7 +545,9 @@ void __init numa_initmem_init(unsigned l
node_set(0, node_possible_map);
for (i = 0; i < NR_CPUS; i++)
numa_set_node(i, 0);
- node_to_cpumask_map[0] = cpumask_of_cpu(0);
+ /* we can't use cpumask_of_cpu() yet */
+ memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
+ cpu_set(0, node_to_cpumask_map[0]);
e820_register_active_regions(0, start_pfn, end_pfn);
setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -391,7 +391,7 @@ int __init acpi_scan_nodes(unsigned long
static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
[0 ... MAX_NUMNODES-1] = PXM_INVAL
};
-static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+static u16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
static int __init find_node_by_addr(unsigned long addr)
--- a/include/asm-x86/numa_64.h
+++ b/include/asm-x86/numa_64.h
@@ -20,7 +20,7 @@ extern void numa_set_node(int cpu, int n
extern void srat_reserve_add_area(int nodeid);
extern int hotadd_percent;
-extern unsigned char apicid_to_node[MAX_LOCAL_APIC];
+extern u16 apicid_to_node[MAX_LOCAL_APIC];
extern void numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn);
extern unsigned long numa_free_all_bootmem(void);
@@ -40,6 +40,6 @@ static inline void clear_node_cpumask(in
#define clear_node_cpumask(cpu) do {} while (0)
#endif
-#define NUMA_NO_NODE 0xff
+#define NUMA_NO_NODE 0xffff
#endif
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -30,7 +30,7 @@
#include <asm/mpspec.h>
/* Mappings between logical cpu number and node number */
-extern int cpu_to_node_map[];
+extern u16 cpu_to_node_map[];
extern cpumask_t node_to_cpumask_map[];
/* Returns the number of the node containing CPU 'cpu' */
--
* Re: [PATCH 02/10] x86: Change size of node ids from u8 to u16
2008-01-13 18:34 ` [PATCH 02/10] x86: Change size of node ids " travis
@ 2008-01-13 20:01 ` Eric Dumazet
0 siblings, 0 replies; 33+ messages in thread
From: Eric Dumazet @ 2008-01-13 20:01 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
travis@sgi.com wrote:
> Change the size of node ids from 8 bits to 16 bits to
> accommodate more than 256 nodes.
>
> Signed-off-by: Mike Travis <travis@sgi.com>
> Reviewed-by: Christoph Lameter <clameter@sgi.com>
> ---
> arch/x86/mm/numa_64.c | 9 ++++++---
> arch/x86/mm/srat_64.c | 2 +-
> include/asm-x86/numa_64.h | 4 ++--
> include/asm-x86/topology.h | 2 +-
> 4 files changed, 10 insertions(+), 7 deletions(-)
So, you think some machine is going to have more than 256 nodes?
If so, you probably need to change 'struct memnode' too
(include/asm-x86/mmzone_64.h).
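For reference, a rough sketch of the lookup being pointed at here; the
struct layout is recalled from the 2.6.24-era include/asm-x86/mmzone_64.h
and may not be exact:

	/*
	 * Approximate sketch: the memnode map stores one node id per
	 * physical-address chunk as a u8, so it too caps out at 255
	 * nodes and would need widening alongside these changes.
	 */
	struct memnode {
		int shift;	/* log2 of bytes covered per map entry */
		u8 *map;	/* chunk index -> node id, 8 bits wide */
		/* ... */
	};
	extern struct memnode memnode;

	static inline int phys_to_nid(unsigned long addr)
	{
		return memnode.map[addr >> memnode.shift];
	}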
* [PATCH 03/10] x86: Change NR_CPUS arrays in powernow-k8
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
2008-01-13 18:34 ` [PATCH 01/10] x86: Change size of APICIDs from u8 to u16 travis
2008-01-13 18:34 ` [PATCH 02/10] x86: Change size of node ids " travis
@ 2008-01-13 18:34 ` travis
2008-01-13 18:34 ` [PATCH 04/10] x86: Change NR_CPUS arrays in intel_cacheinfo travis
` (7 subsequent siblings)
10 siblings, 0 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:34 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: NR_CPUS-arrays-in-powernow-k8 --]
[-- Type: text/plain, Size: 2218 bytes --]
Change the following static arrays sized by NR_CPUS to
per_cpu data variables:
powernow_k8_data *powernow_data[NR_CPUS];
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 12 ++++++------
1 file changed, 6 insertions(+), 6 deletions(-)
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -53,7 +53,7 @@
/* serialize freq changes */
static DEFINE_MUTEX(fidvid_mutex);
-static struct powernow_k8_data *powernow_data[NR_CPUS];
+static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
static int cpu_family = CPU_OPTERON;
@@ -1052,7 +1052,7 @@ static int transition_frequency_pstate(s
static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation)
{
cpumask_t oldmask = CPU_MASK_ALL;
- struct powernow_k8_data *data = powernow_data[pol->cpu];
+ struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
u32 checkfid;
u32 checkvid;
unsigned int newstate;
@@ -1128,7 +1128,7 @@ err_out:
/* Driver entry point to verify the policy and range of frequencies */
static int powernowk8_verify(struct cpufreq_policy *pol)
{
- struct powernow_k8_data *data = powernow_data[pol->cpu];
+ struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
if (!data)
return -EINVAL;
@@ -1233,7 +1233,7 @@ static int __cpuinit powernowk8_cpu_init
dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
data->currfid, data->currvid);
- powernow_data[pol->cpu] = data;
+ per_cpu(powernow_data, pol->cpu) = data;
return 0;
@@ -1247,7 +1247,7 @@ err_out:
static int __devexit powernowk8_cpu_exit (struct cpufreq_policy *pol)
{
- struct powernow_k8_data *data = powernow_data[pol->cpu];
+ struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
if (!data)
return -EINVAL;
@@ -1268,7 +1268,7 @@ static unsigned int powernowk8_get (unsi
cpumask_t oldmask = current->cpus_allowed;
unsigned int khz = 0;
- data = powernow_data[first_cpu(per_cpu(cpu_core_map, cpu))];
+ data = per_cpu(powernow_data, first_cpu(per_cpu(cpu_core_map, cpu)));
if (!data)
return -EINVAL;
--
* [PATCH 04/10] x86: Change NR_CPUS arrays in intel_cacheinfo
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (2 preceding siblings ...)
2008-01-13 18:34 ` [PATCH 03/10] x86: Change NR_CPUS arrays in powernow-k8 travis
@ 2008-01-13 18:34 ` travis
2008-01-13 18:34 ` [PATCH 05/10] x86: Change NR_CPUS arrays in smpboot_64 travis
` (6 subsequent siblings)
10 siblings, 0 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:34 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: NR_CPUS-arrays-in-intel_cacheinfo --]
[-- Type: text/plain, Size: 5841 bytes --]
Change the following static arrays sized by NR_CPUS to
per_cpu data variables:
_cpuid4_info *cpuid4_info[NR_CPUS];
_index_kobject *index_kobject[NR_CPUS];
kobject * cache_kobject[NR_CPUS];
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/cpu/intel_cacheinfo.c | 55 +++++++++++++++++-----------------
1 file changed, 29 insertions(+), 26 deletions(-)
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -451,8 +451,8 @@ unsigned int __cpuinit init_intel_cachei
}
/* pointer to _cpuid4_info array (for each cache leaf) */
-static struct _cpuid4_info *cpuid4_info[NR_CPUS];
-#define CPUID4_INFO_IDX(x,y) (&((cpuid4_info[x])[y]))
+static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
+#define CPUID4_INFO_IDX(x,y) (&((per_cpu(cpuid4_info, x))[y]))
#ifdef CONFIG_SMP
static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -474,7 +474,7 @@ static void __cpuinit cache_shared_cpu_m
if (cpu_data(i).apicid >> index_msb ==
c->apicid >> index_msb) {
cpu_set(i, this_leaf->shared_cpu_map);
- if (i != cpu && cpuid4_info[i]) {
+ if (i != cpu && per_cpu(cpuid4_info, i)) {
sibling_leaf = CPUID4_INFO_IDX(i, index);
cpu_set(cpu, sibling_leaf->shared_cpu_map);
}
@@ -505,8 +505,8 @@ static void __cpuinit free_cache_attribu
for (i = 0; i < num_cache_leaves; i++)
cache_remove_shared_cpu_map(cpu, i);
- kfree(cpuid4_info[cpu]);
- cpuid4_info[cpu] = NULL;
+ kfree(per_cpu(cpuid4_info, cpu));
+ per_cpu(cpuid4_info, cpu) = NULL;
}
static int __cpuinit detect_cache_attributes(unsigned int cpu)
@@ -519,9 +519,9 @@ static int __cpuinit detect_cache_attrib
if (num_cache_leaves == 0)
return -ENOENT;
- cpuid4_info[cpu] = kzalloc(
+ per_cpu(cpuid4_info, cpu) = kzalloc(
sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (cpuid4_info[cpu] == NULL)
+ if (per_cpu(cpuid4_info, cpu) == NULL)
return -ENOMEM;
oldmask = current->cpus_allowed;
@@ -546,8 +546,8 @@ static int __cpuinit detect_cache_attrib
out:
if (retval) {
- kfree(cpuid4_info[cpu]);
- cpuid4_info[cpu] = NULL;
+ kfree(per_cpu(cpuid4_info, cpu));
+ per_cpu(cpuid4_info, cpu) = NULL;
}
return retval;
@@ -561,7 +561,7 @@ out:
extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
/* pointer to kobject for cpuX/cache */
-static struct kobject * cache_kobject[NR_CPUS];
+static DEFINE_PER_CPU(struct kobject *, cache_kobject);
struct _index_kobject {
struct kobject kobj;
@@ -570,8 +570,8 @@ struct _index_kobject {
};
/* pointer to array of kobjects for cpuX/cache/indexY */
-static struct _index_kobject *index_kobject[NR_CPUS];
-#define INDEX_KOBJECT_PTR(x,y) (&((index_kobject[x])[y]))
+static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
+#define INDEX_KOBJECT_PTR(x,y) (&((per_cpu(index_kobject, x))[y]))
#define show_one_plus(file_name, object, val) \
static ssize_t show_##file_name \
@@ -684,10 +684,10 @@ static struct kobj_type ktype_percpu_ent
static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
{
- kfree(cache_kobject[cpu]);
- kfree(index_kobject[cpu]);
- cache_kobject[cpu] = NULL;
- index_kobject[cpu] = NULL;
+ kfree(per_cpu(cache_kobject, cpu));
+ kfree(per_cpu(index_kobject, cpu));
+ per_cpu(cache_kobject, cpu) = NULL;
+ per_cpu(index_kobject, cpu) = NULL;
free_cache_attributes(cpu);
}
@@ -703,13 +703,14 @@ static int __cpuinit cpuid4_cache_sysfs_
return err;
/* Allocate all required memory */
- cache_kobject[cpu] = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(cache_kobject[cpu] == NULL))
+ per_cpu(cache_kobject, cpu) =
+ kzalloc(sizeof(struct kobject), GFP_KERNEL);
+ if (unlikely(per_cpu(cache_kobject, cpu) == NULL))
goto err_out;
- index_kobject[cpu] = kzalloc(
+ per_cpu(index_kobject, cpu) = kzalloc(
sizeof(struct _index_kobject ) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(index_kobject[cpu] == NULL))
+ if (unlikely(per_cpu(index_kobject, cpu) == NULL))
goto err_out;
return 0;
@@ -733,7 +734,8 @@ static int __cpuinit cache_add_dev(struc
if (unlikely(retval < 0))
return retval;
- retval = kobject_init_and_add(cache_kobject[cpu], &ktype_percpu_entry,
+ retval = kobject_init_and_add(per_cpu(cache_kobject, cpu),
+ &ktype_percpu_entry,
&sys_dev->kobj, "%s", "cache");
if (retval < 0) {
cpuid4_cache_sysfs_exit(cpu);
@@ -745,13 +747,14 @@ static int __cpuinit cache_add_dev(struc
this_object->cpu = cpu;
this_object->index = i;
retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache, cache_kobject[cpu],
+ &ktype_cache,
+ per_cpu(cache_kobject, cpu),
"index%1lu", i);
if (unlikely(retval)) {
for (j = 0; j < i; j++) {
kobject_put(&(INDEX_KOBJECT_PTR(cpu,j)->kobj));
}
- kobject_put(cache_kobject[cpu]);
+ kobject_put(per_cpu(cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
break;
}
@@ -760,7 +763,7 @@ static int __cpuinit cache_add_dev(struc
if (!retval)
cpu_set(cpu, cache_dev_map);
- kobject_uevent(cache_kobject[cpu], KOBJ_ADD);
+ kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
return retval;
}
@@ -769,7 +772,7 @@ static void __cpuinit cache_remove_dev(s
unsigned int cpu = sys_dev->id;
unsigned long i;
- if (cpuid4_info[cpu] == NULL)
+ if (per_cpu(cpuid4_info, cpu) == NULL)
return;
if (!cpu_isset(cpu, cache_dev_map))
return;
@@ -777,7 +780,7 @@ static void __cpuinit cache_remove_dev(s
for (i = 0; i < num_cache_leaves; i++)
kobject_put(&(INDEX_KOBJECT_PTR(cpu,i)->kobj));
- kobject_put(cache_kobject[cpu]);
+ kobject_put(per_cpu(cache_kobject, cpu));
cpuid4_cache_sysfs_exit(cpu);
}
--
* [PATCH 05/10] x86: Change NR_CPUS arrays in smpboot_64
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (3 preceding siblings ...)
2008-01-13 18:34 ` [PATCH 04/10] x86: Change NR_CPUS arrays in intel_cacheinfo travis
@ 2008-01-13 18:34 ` travis
2008-01-13 18:34 ` [PATCH 06/10] x86: Change NR_CPUS arrays in topology travis
` (5 subsequent siblings)
10 siblings, 0 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:34 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: NR_CPUS-arrays-in-smpboot_64 --]
[-- Type: text/plain, Size: 1287 bytes --]
Change the following static arrays sized by NR_CPUS to
per_cpu data variables:
task_struct *idle_thread_array[NR_CPUS];
This is only done when CONFIG_HOTPLUG_CPU is defined;
otherwise, the array is discarded after initialization
anyway.
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/smpboot_64.c | 12 +++++++++++-
1 file changed, 11 insertions(+), 1 deletion(-)
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -111,10 +111,20 @@ DEFINE_PER_CPU(int, cpu_state) = { 0 };
* a new thread. Also avoids complicated thread destroy functionality
* for idle threads.
*/
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Needed only for CONFIG_HOTPLUG_CPU because __cpuinitdata is
+ * removed after init for !CONFIG_HOTPLUG_CPU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
+#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
+#define set_idle_for_cpu(x,p) (per_cpu(idle_thread_array, x) = (p))
+#else
struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
-
#define get_idle_for_cpu(x) (idle_thread_array[(x)])
#define set_idle_for_cpu(x,p) (idle_thread_array[(x)] = (p))
+#endif
+
/*
* Currently trivial. Write the real->protected mode
--
* [PATCH 06/10] x86: Change NR_CPUS arrays in topology
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (4 preceding siblings ...)
2008-01-13 18:34 ` [PATCH 05/10] x86: Change NR_CPUS arrays in smpboot_64 travis
@ 2008-01-13 18:34 ` travis
2008-01-14 18:25 ` Jan Engelhardt
2008-01-13 18:35 ` [PATCH 07/10] x86: Cleanup x86_cpu_to_apicid references travis
` (4 subsequent siblings)
10 siblings, 1 reply; 33+ messages in thread
From: travis @ 2008-01-13 18:34 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: NR_CPUS-arrays-in-topology --]
[-- Type: text/plain, Size: 1470 bytes --]
Change the following static arrays sized by NR_CPUS to
per_cpu data variables:
i386_cpu cpu_devices[NR_CPUS];
(And change the struct name to x86_cpu.)
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/topology.c | 8 ++++----
include/asm-x86/cpu.h | 2 +-
2 files changed, 5 insertions(+), 5 deletions(-)
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -31,7 +31,7 @@
#include <linux/mmzone.h>
#include <asm/cpu.h>
-static struct i386_cpu cpu_devices[NR_CPUS];
+static DEFINE_PER_CPU(struct x86_cpu, cpu_devices);
int __cpuinit arch_register_cpu(int num)
{
@@ -46,16 +46,16 @@ int __cpuinit arch_register_cpu(int num)
*/
#ifdef CONFIG_HOTPLUG_CPU
if (num)
- cpu_devices[num].cpu.hotpluggable = 1;
+ per_cpu(cpu_devices, num).cpu.hotpluggable = 1;
#endif
- return register_cpu(&cpu_devices[num].cpu, num);
+ return register_cpu(&per_cpu(cpu_devices, num).cpu, num);
}
#ifdef CONFIG_HOTPLUG_CPU
void arch_unregister_cpu(int num)
{
- return unregister_cpu(&cpu_devices[num].cpu);
+ return unregister_cpu(&per_cpu(cpu_devices, num).cpu);
}
EXPORT_SYMBOL(arch_register_cpu);
EXPORT_SYMBOL(arch_unregister_cpu);
--- a/include/asm-x86/cpu.h
+++ b/include/asm-x86/cpu.h
@@ -7,7 +7,7 @@
#include <linux/nodemask.h>
#include <linux/percpu.h>
-struct i386_cpu {
+struct x86_cpu {
struct cpu cpu;
};
extern int arch_register_cpu(int num);
--
* Re: [PATCH 06/10] x86: Change NR_CPUS arrays in topology
2008-01-13 18:34 ` [PATCH 06/10] x86: Change NR_CPUS arrays in topology travis
@ 2008-01-14 18:25 ` Jan Engelhardt
2008-01-14 19:08 ` Mike Travis
0 siblings, 1 reply; 33+ messages in thread
From: Jan Engelhardt @ 2008-01-14 18:25 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
On Jan 13 2008 10:34, travis@sgi.com wrote:
>+++ b/include/asm-x86/cpu.h
>@@ -7,7 +7,7 @@
> #include <linux/nodemask.h>
> #include <linux/percpu.h>
>
>-struct i386_cpu {
>+struct x86_cpu {
> struct cpu cpu;
> };
> extern int arch_register_cpu(int num);
Is not struct x86_cpu kinda redundant here if it only wraps around
one member?
* Re: [PATCH 06/10] x86: Change NR_CPUS arrays in topology
2008-01-14 18:25 ` Jan Engelhardt
@ 2008-01-14 19:08 ` Mike Travis
0 siblings, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 19:08 UTC (permalink / raw)
To: Jan Engelhardt
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
Jan Engelhardt wrote:
> On Jan 13 2008 10:34, travis@sgi.com wrote:
>> +++ b/include/asm-x86/cpu.h
>> @@ -7,7 +7,7 @@
>> #include <linux/nodemask.h>
>> #include <linux/percpu.h>
>>
>> -struct i386_cpu {
>> +struct x86_cpu {
>> struct cpu cpu;
>> };
>> extern int arch_register_cpu(int num);
>
> Is not struct x86_cpu kinda redundant here if it only wraps around
> one member?
Looking at it, I think the x86 arch-specific include file
is just wrapping the generic struct cpu (instead of, say, a
different one)...?
Thanks,
Mike
* [PATCH 07/10] x86: Cleanup x86_cpu_to_apicid references
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (5 preceding siblings ...)
2008-01-13 18:34 ` [PATCH 06/10] x86: Change NR_CPUS arrays in topology travis
@ 2008-01-13 18:35 ` travis
2008-01-13 18:35 ` [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64 travis
` (3 subsequent siblings)
10 siblings, 0 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:35 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: cleanup-x86_cpu_to_apicid --]
[-- Type: text/plain, Size: 5148 bytes --]
Clean up references to x86_cpu_to_apicid. Removes extraneous
comments and standardizes on "x86_*_early_ptr" for the early
kernel init references.
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/genapic_64.c | 11 ++---------
arch/x86/kernel/mpparse_64.c | 11 +++--------
arch/x86/kernel/setup_64.c | 2 +-
arch/x86/kernel/smpboot_32.c | 9 ++-------
arch/x86/kernel/smpboot_64.c | 16 +++++++++-------
include/asm-x86/smp_32.h | 2 +-
include/asm-x86/smp_64.h | 2 +-
7 files changed, 19 insertions(+), 34 deletions(-)
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -24,17 +24,10 @@
#include <acpi/acpi_bus.h>
#endif
-/*
- * which logical CPU number maps to which CPU (physical APIC ID)
- *
- * The following static array is used during kernel startup
- * and the x86_cpu_to_apicid_ptr contains the address of the
- * array during this time. Is it zeroed when the per_cpu
- * data area is removed.
- */
+/* which logical CPU number maps to which CPU (physical APIC ID) */
u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
= { [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_ptr;
+void *x86_cpu_to_apicid_early_ptr;
DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -125,14 +125,9 @@ static void __cpuinit MP_processor_info(
cpu = 0;
}
bios_cpu_apicid[cpu] = m->mpc_apicid;
- /*
- * We get called early in the the start_kernel initialization
- * process when the per_cpu data area is not yet setup, so we
- * use a static array that is removed after the per_cpu data
- * area is created.
- */
- if (x86_cpu_to_apicid_ptr) {
- u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
+ /* are we being called early in kernel startup? */
+ if (x86_cpu_to_apicid_early_ptr) {
+ u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_early_ptr;
x86_cpu_to_apicid[cpu] = m->mpc_apicid;
} else {
per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -373,7 +373,7 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_SMP
/* setup to use the static apicid table during kernel startup */
- x86_cpu_to_apicid_ptr = (void *)&x86_cpu_to_apicid_init;
+ x86_cpu_to_apicid_early_ptr = (void *)&x86_cpu_to_apicid_init;
#endif
#ifdef CONFIG_ACPI
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot_32.c
@@ -91,15 +91,10 @@ static cpumask_t smp_commenced_mask;
DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
EXPORT_PER_CPU_SYMBOL(cpu_info);
-/*
- * The following static array is used during kernel startup
- * and the x86_cpu_to_apicid_ptr contains the address of the
- * array during this time. Is it zeroed when the per_cpu
- * data area is removed.
- */
+/* which logical CPU number maps to which CPU (physical APIC ID) */
u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata =
{ [0 ... NR_CPUS-1] = BAD_APICID };
-void *x86_cpu_to_apicid_ptr;
+void *x86_cpu_to_apicid_early_ptr;
DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -852,23 +852,25 @@ static int __init smp_sanity_check(unsig
}
/*
- * Copy apicid's found by MP_processor_info from initial array to the per cpu
- * data area. The x86_cpu_to_apicid_init array is then expendable and the
- * x86_cpu_to_apicid_ptr is zeroed indicating that the static array is no
- * longer available.
+ * Copy data used in early init routines from the initial arrays to the
+ * per cpu data areas. These arrays then become expendable and the
+ * *_ptrs are zeroed indicating that the static arrays are gone.
*/
void __init smp_set_apicids(void)
{
int cpu;
- for_each_cpu_mask(cpu, cpu_possible_map) {
+ for_each_possible_cpu(cpu) {
if (per_cpu_offset(cpu))
per_cpu(x86_cpu_to_apicid, cpu) =
x86_cpu_to_apicid_init[cpu];
+ else
+ printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
+ cpu);
}
- /* indicate the static array will be going away soon */
- x86_cpu_to_apicid_ptr = NULL;
+ /* indicate the early static arrays are gone */
+ x86_cpu_to_apicid_early_ptr = NULL;
}
static void __init smp_cpu_index_default(void)
--- a/include/asm-x86/smp_32.h
+++ b/include/asm-x86/smp_32.h
@@ -30,7 +30,7 @@ extern void (*mtrr_hook) (void);
extern void zap_low_mappings (void);
extern u8 __initdata x86_cpu_to_apicid_init[];
-extern void *x86_cpu_to_apicid_ptr;
+extern void *x86_cpu_to_apicid_early_ptr;
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
--- a/include/asm-x86/smp_64.h
+++ b/include/asm-x86/smp_64.h
@@ -27,7 +27,7 @@ extern int smp_call_function_mask(cpumas
void *info, int wait);
extern u16 __initdata x86_cpu_to_apicid_init[];
-extern void *x86_cpu_to_apicid_ptr;
+extern void *x86_cpu_to_apicid_early_ptr;
extern u16 bios_cpu_apicid[];
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
--
* [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (6 preceding siblings ...)
2008-01-13 18:35 ` [PATCH 07/10] x86: Cleanup x86_cpu_to_apicid references travis
@ 2008-01-13 18:35 ` travis
2008-01-14 11:14 ` Ingo Molnar
2008-01-14 18:14 ` Jan Engelhardt
2008-01-13 18:35 ` [PATCH 09/10] x86: Change NR_CPUS arrays in acpi-cpufreq travis
` (2 subsequent siblings)
10 siblings, 2 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:35 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: NR_CPUS-arrays-in-numa_64 --]
[-- Type: text/plain, Size: 4714 bytes --]
Change the following static arrays sized by NR_CPUS to
per_cpu data variables:
char cpu_to_node_map[NR_CPUS];
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/setup_64.c | 4 +++-
arch/x86/kernel/smpboot_64.c | 6 +++++-
arch/x86/mm/numa_64.c | 20 ++++++++++++++++----
include/asm-x86/numa_64.h | 2 --
include/asm-x86/topology.h | 15 +++++++++++++--
net/sunrpc/svc.c | 1 +
6 files changed, 38 insertions(+), 10 deletions(-)
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -63,6 +63,7 @@
#include <asm/cacheflush.h>
#include <asm/mce.h>
#include <asm/ds.h>
+#include <asm/topology.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -372,8 +373,9 @@ void __init setup_arch(char **cmdline_p)
io_delay_init();
#ifdef CONFIG_SMP
- /* setup to use the static apicid table during kernel startup */
+ /* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)&x86_cpu_to_apicid_init;
+ x86_cpu_to_node_map_early_ptr = (void *)&x86_cpu_to_node_map_init;
#endif
#ifdef CONFIG_ACPI
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -861,9 +861,12 @@ void __init smp_set_apicids(void)
int cpu;
for_each_possible_cpu(cpu) {
- if (per_cpu_offset(cpu))
+ if (per_cpu_offset(cpu)) {
per_cpu(x86_cpu_to_apicid, cpu) =
x86_cpu_to_apicid_init[cpu];
+ per_cpu(x86_cpu_to_node_map, cpu) =
+ x86_cpu_to_node_map_init[cpu];
+ }
else
printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
cpu);
@@ -871,6 +874,7 @@ void __init smp_set_apicids(void)
/* indicate the early static arrays are gone */
x86_cpu_to_apicid_early_ptr = NULL;
+ x86_cpu_to_node_map_early_ptr = NULL;
}
static void __init smp_cpu_index_default(void)
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -31,10 +31,14 @@ bootmem_data_t plat_node_bdata[MAX_NUMNO
struct memnode memnode;
-u16 cpu_to_node_map[NR_CPUS] __read_mostly = {
+u16 x86_cpu_to_node_map_init[NR_CPUS] __initdata = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
-EXPORT_SYMBOL(cpu_to_node_map);
+void *x86_cpu_to_node_map_early_ptr;
+EXPORT_SYMBOL(x86_cpu_to_node_map_init);
+EXPORT_SYMBOL(x86_cpu_to_node_map_early_ptr);
+DEFINE_PER_CPU(u16, x86_cpu_to_node_map) = NUMA_NO_NODE;
+EXPORT_PER_CPU_SYMBOL(x86_cpu_to_node_map);
u16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
@@ -545,7 +549,7 @@ void __init numa_initmem_init(unsigned l
node_set(0, node_possible_map);
for (i = 0; i < NR_CPUS; i++)
numa_set_node(i, 0);
- /* we can't use cpumask_of_cpu() yet */
+ /* cpumask_of_cpu() may not be available during early startup */
memset(&node_to_cpumask_map[0], 0, sizeof(node_to_cpumask_map[0]));
cpu_set(0, node_to_cpumask_map[0]);
e820_register_active_regions(0, start_pfn, end_pfn);
@@ -559,8 +563,16 @@ __cpuinit void numa_add_cpu(int cpu)
void __cpuinit numa_set_node(int cpu, int node)
{
+ u16 *cpu_to_node_map = (u16 *)x86_cpu_to_node_map_early_ptr;
+
cpu_pda(cpu)->nodenumber = node;
- cpu_to_node_map[cpu] = node;
+
+ if(cpu_to_node_map)
+ cpu_to_node_map[cpu] = node;
+ else if(per_cpu_offset(cpu))
+ per_cpu(x86_cpu_to_node_map, cpu) = node;
+ else
+ Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu);
}
unsigned long __init numa_free_all_bootmem(void)
--- a/include/asm-x86/numa_64.h
+++ b/include/asm-x86/numa_64.h
@@ -40,6 +40,4 @@ static inline void clear_node_cpumask(in
#define clear_node_cpumask(cpu) do {} while (0)
#endif
-#define NUMA_NO_NODE 0xffff
-
#endif
--- a/include/asm-x86/topology.h
+++ b/include/asm-x86/topology.h
@@ -30,13 +30,24 @@
#include <asm/mpspec.h>
/* Mappings between logical cpu number and node number */
-extern u16 cpu_to_node_map[];
+DECLARE_PER_CPU(u16, x86_cpu_to_node_map);
+extern u16 __initdata x86_cpu_to_node_map_init[];
+extern void *x86_cpu_to_node_map_early_ptr;
extern cpumask_t node_to_cpumask_map[];
+#define NUMA_NO_NODE ((u16)(~0))
+
/* Returns the number of the node containing CPU 'cpu' */
static inline int cpu_to_node(int cpu)
{
- return cpu_to_node_map[cpu];
+ u16 *cpu_to_node_map = (u16 *)x86_cpu_to_node_map_early_ptr;
+
+ if (cpu_to_node_map)
+ return cpu_to_node_map[cpu];
+ else if(per_cpu_offset(cpu))
+ return per_cpu(x86_cpu_to_node_map, cpu);
+ else
+ return NUMA_NO_NODE;
}
/*
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -18,6 +18,7 @@
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/module.h>
+#include <linux/sched.h>
#include <linux/sunrpc/types.h>
#include <linux/sunrpc/xdr.h>
--
* Re: [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64
2008-01-13 18:35 ` [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64 travis
@ 2008-01-14 11:14 ` Ingo Molnar
2008-01-14 17:17 ` Mike Travis
2008-01-14 18:14 ` Jan Engelhardt
1 sibling, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2008-01-14 11:14 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, Christoph Lameter, Jack Steiner,
linux-mm, linux-kernel
* travis@sgi.com <travis@sgi.com> wrote:
> Change the following static arrays sized by NR_CPUS to
> per_cpu data variables:
>
> char cpu_to_node_map[NR_CPUS];
x86.git randconfig testing found the !NUMA build bugs below.
Ingo
--------------->
---
arch/x86/kernel/setup_64.c | 2 ++
arch/x86/kernel/smpboot_64.c | 4 ++++
2 files changed, 6 insertions(+)
Index: linux/arch/x86/kernel/setup_64.c
===================================================================
--- linux.orig/arch/x86/kernel/setup_64.c
+++ linux/arch/x86/kernel/setup_64.c
@@ -379,7 +379,9 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_SMP
/* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)&x86_cpu_to_apicid_init;
+#ifdef CONFIG_NUMA
x86_cpu_to_node_map_early_ptr = (void *)&x86_cpu_to_node_map_init;
+#endif
x86_bios_cpu_apicid_early_ptr = (void *)&x86_bios_cpu_apicid_init;
#endif
Index: linux/arch/x86/kernel/smpboot_64.c
===================================================================
--- linux.orig/arch/x86/kernel/smpboot_64.c
+++ linux/arch/x86/kernel/smpboot_64.c
@@ -864,8 +864,10 @@ void __init smp_set_apicids(void)
if (per_cpu_offset(cpu)) {
per_cpu(x86_cpu_to_apicid, cpu) =
x86_cpu_to_apicid_init[cpu];
+#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
x86_cpu_to_node_map_init[cpu];
+#endif
per_cpu(x86_bios_cpu_apicid, cpu) =
x86_bios_cpu_apicid_init[cpu];
}
@@ -876,7 +878,9 @@ void __init smp_set_apicids(void)
/* indicate the early static arrays are gone */
x86_cpu_to_apicid_early_ptr = NULL;
+#ifdef CONFIG_NUMA
x86_cpu_to_node_map_early_ptr = NULL;
+#endif
x86_bios_cpu_apicid_early_ptr = NULL;
}
* Re: [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64
2008-01-14 11:14 ` Ingo Molnar
@ 2008-01-14 17:17 ` Mike Travis
0 siblings, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 17:17 UTC (permalink / raw)
To: Ingo Molnar
Cc: Andrew Morton, Andi Kleen, Christoph Lameter, Jack Steiner,
linux-mm, linux-kernel
Ingo Molnar wrote:
> * travis@sgi.com <travis@sgi.com> wrote:
>
>> Change the following static arrays sized by NR_CPUS to
>> per_cpu data variables:
>>
>> char cpu_to_node_map[NR_CPUS];
>
> x86.git randconfig testing found the !NUMA build bugs below.
>
> Ingo
Thanks! I'll add this in.
Mike
>
> --------------->
> ---
> arch/x86/kernel/setup_64.c | 2 ++
> arch/x86/kernel/smpboot_64.c | 4 ++++
> 2 files changed, 6 insertions(+)
>
> Index: linux/arch/x86/kernel/setup_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/setup_64.c
> +++ linux/arch/x86/kernel/setup_64.c
> @@ -379,7 +379,9 @@ void __init setup_arch(char **cmdline_p)
> #ifdef CONFIG_SMP
> /* setup to use the early static init tables during kernel startup */
> x86_cpu_to_apicid_early_ptr = (void *)&x86_cpu_to_apicid_init;
> +#ifdef CONFIG_NUMA
> x86_cpu_to_node_map_early_ptr = (void *)&x86_cpu_to_node_map_init;
> +#endif
> x86_bios_cpu_apicid_early_ptr = (void *)&x86_bios_cpu_apicid_init;
> #endif
>
> Index: linux/arch/x86/kernel/smpboot_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/smpboot_64.c
> +++ linux/arch/x86/kernel/smpboot_64.c
> @@ -864,8 +864,10 @@ void __init smp_set_apicids(void)
> if (per_cpu_offset(cpu)) {
> per_cpu(x86_cpu_to_apicid, cpu) =
> x86_cpu_to_apicid_init[cpu];
> +#ifdef CONFIG_NUMA
> per_cpu(x86_cpu_to_node_map, cpu) =
> x86_cpu_to_node_map_init[cpu];
> +#endif
> per_cpu(x86_bios_cpu_apicid, cpu) =
> x86_bios_cpu_apicid_init[cpu];
> }
> @@ -876,7 +878,9 @@ void __init smp_set_apicids(void)
>
> /* indicate the early static arrays are gone */
> x86_cpu_to_apicid_early_ptr = NULL;
> +#ifdef CONFIG_NUMA
> x86_cpu_to_node_map_early_ptr = NULL;
> +#endif
> x86_bios_cpu_apicid_early_ptr = NULL;
> }
>
* Re: [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64
2008-01-13 18:35 ` [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64 travis
2008-01-14 11:14 ` Ingo Molnar
@ 2008-01-14 18:14 ` Jan Engelhardt
1 sibling, 0 replies; 33+ messages in thread
From: Jan Engelhardt @ 2008-01-14 18:14 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, mingo, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
On Jan 13 2008 10:35, travis@sgi.com wrote:
>--- a/arch/x86/kernel/setup_64.c
>+++ b/arch/x86/kernel/setup_64.c
>@@ -372,8 +373,9 @@ void __init setup_arch(char **cmdline_p)
> io_delay_init();
>
> #ifdef CONFIG_SMP
>- /* setup to use the static apicid table during kernel startup */
>+ /* setup to use the early static init tables during kernel startup */
> x86_cpu_to_apicid_early_ptr = (void *)&x86_cpu_to_apicid_init;
>+ x86_cpu_to_node_map_early_ptr = (void *)&x86_cpu_to_node_map_init;
> #endif
>
> #ifdef CONFIG_ACPI
Please do not add unnecessary casts.
>--- a/arch/x86/kernel/smpboot_64.c
>+++ b/arch/x86/kernel/smpboot_64.c
>@@ -559,8 +563,16 @@ __cpuinit void numa_add_cpu(int cpu)
>
> void __cpuinit numa_set_node(int cpu, int node)
> {
>+ u16 *cpu_to_node_map = (u16 *)x86_cpu_to_node_map_early_ptr;
>+
^
> static inline int cpu_to_node(int cpu)
> {
>- return cpu_to_node_map[cpu];
>+ u16 *cpu_to_node_map = (u16 *)x86_cpu_to_node_map_early_ptr;
^
* [PATCH 09/10] x86: Change NR_CPUS arrays in acpi-cpufreq
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (7 preceding siblings ...)
2008-01-13 18:35 ` [PATCH 08/10] x86: Change NR_CPUS arrays in numa_64 travis
@ 2008-01-13 18:35 ` travis
2008-01-13 18:35 ` [PATCH 10/10] x86: Change bios_cpu_apicid to percpu data variable travis
2008-01-14 8:14 ` [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs Ingo Molnar
10 siblings, 0 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:35 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: NR_CPUS-arrays-in-acpi-cpufreq --]
[-- Type: text/plain, Size: 3914 bytes --]
Change the following static arrays sized by NR_CPUS to
per_cpu data variables:
acpi_cpufreq_data *drv_data[NR_CPUS]
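As a side note on the idiom (sketch only; the example_* names are
invented, not the driver's real symbols): DEFINE_PER_CPU() takes the
element type and the variable name as separate arguments, so a per-CPU
pointer keeps the '*' with the type, and every access goes through
per_cpu().

#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/errno.h>

struct example_data {
        unsigned int max_freq;
};

/* One pointer per CPU instead of a static NR_CPUS-sized pointer array. */
static DEFINE_PER_CPU(struct example_data *, example_drv_data);

static int example_cpu_init(unsigned int cpu)
{
        struct example_data *data = kzalloc(sizeof(*data), GFP_KERNEL);

        if (!data)
                return -ENOMEM;
        per_cpu(example_drv_data, cpu) = data;
        return 0;
}

static void example_cpu_exit(unsigned int cpu)
{
        kfree(per_cpu(example_drv_data, cpu));
        per_cpu(example_drv_data, cpu) = NULL;
}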
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -67,7 +67,8 @@ struct acpi_cpufreq_data {
unsigned int cpu_feature;
};
-static struct acpi_cpufreq_data *drv_data[NR_CPUS];
+static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
+
/* acpi_perf_data is a pointer to percpu data. */
static struct acpi_processor_performance *acpi_perf_data;
@@ -218,14 +219,14 @@ static u32 get_cur_val(cpumask_t mask)
if (unlikely(cpus_empty(mask)))
return 0;
- switch (drv_data[first_cpu(mask)]->cpu_feature) {
+ switch (per_cpu(drv_data, first_cpu(mask))->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
break;
case SYSTEM_IO_CAPABLE:
cmd.type = SYSTEM_IO_CAPABLE;
- perf = drv_data[first_cpu(mask)]->acpi_data;
+ perf = per_cpu(drv_data, first_cpu(mask))->acpi_data;
cmd.addr.io.port = perf->control_register.address;
cmd.addr.io.bit_width = perf->control_register.bit_width;
break;
@@ -325,7 +326,7 @@ static unsigned int get_measured_perf(un
#endif
- retval = drv_data[cpu]->max_freq * perf_percent / 100;
+ retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100;
put_cpu();
set_cpus_allowed(current, saved_mask);
@@ -336,7 +337,7 @@ static unsigned int get_measured_perf(un
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
- struct acpi_cpufreq_data *data = drv_data[cpu];
+ struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu);
unsigned int freq;
dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
@@ -370,7 +371,7 @@ static unsigned int check_freqs(cpumask_
static int acpi_cpufreq_target(struct cpufreq_policy *policy,
unsigned int target_freq, unsigned int relation)
{
- struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+ struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
struct acpi_processor_performance *perf;
struct cpufreq_freqs freqs;
cpumask_t online_policy_cpus;
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cp
static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
{
- struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+ struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
dprintk("acpi_cpufreq_verify\n");
@@ -570,7 +571,7 @@ static int acpi_cpufreq_cpu_init(struct
return -ENOMEM;
data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
- drv_data[cpu] = data;
+ per_cpu(drv_data, cpu) = data;
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -714,20 +715,20 @@ err_unreg:
acpi_processor_unregister_performance(perf, cpu);
err_free:
kfree(data);
- drv_data[cpu] = NULL;
+ per_cpu(drv_data, cpu) = NULL;
return result;
}
static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
{
- struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+ struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
dprintk("acpi_cpufreq_cpu_exit\n");
if (data) {
cpufreq_frequency_table_put_attr(policy->cpu);
- drv_data[policy->cpu] = NULL;
+ per_cpu(drv_data, policy->cpu) = NULL;
acpi_processor_unregister_performance(data->acpi_data,
policy->cpu);
kfree(data);
@@ -738,7 +739,7 @@ static int acpi_cpufreq_cpu_exit(struct
static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
{
- struct acpi_cpufreq_data *data = drv_data[policy->cpu];
+ struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu);
dprintk("acpi_cpufreq_resume\n");
--
* [PATCH 10/10] x86: Change bios_cpu_apicid to percpu data variable
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (8 preceding siblings ...)
2008-01-13 18:35 ` [PATCH 09/10] x86: Change NR_CPUS arrays in acpi-cpufreq travis
@ 2008-01-13 18:35 ` travis
2008-01-14 8:14 ` [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs Ingo Molnar
10 siblings, 0 replies; 33+ messages in thread
From: travis @ 2008-01-13 18:35 UTC (permalink / raw)
To: Andrew Morton, Andi Kleen, mingo
Cc: Christoph Lameter, Jack Steiner, linux-mm, linux-kernel
[-- Attachment #1: change-bios_cpu_apicid-to-percpu --]
[-- Type: text/plain, Size: 5244 bytes --]
Change the static bios_cpu_apicid array to a per_cpu data variable.
This includes a static array used during initialization, similar to
the way x86_cpu_to_apicid[] is handled.
There is one early use of bios_cpu_apicid in apic_is_clustered_box().
The other reference, in cpu_present_to_apicid(), is called after
smp_set_apicids() has set up the percpu version of bios_cpu_apicid.
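The early-versus-late split described above is the subtle part of the
pattern: a reader that may run before the per-CPU areas are set up has
to go through the early pointer, while later readers use the per-CPU
copy. A rough sketch of that read path (the example_* names and the
0xFFFF sentinel are illustrative only, not the kernel's symbols):

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/cpumask.h>

#define EXAMPLE_BAD_APICID      ((u16)0xFFFF)

void *example_bios_apicid_early_ptr;    /* NULLed once smp_set_apicids() has run */
DEFINE_PER_CPU(u16, example_bios_apicid) = EXAMPLE_BAD_APICID;

static u16 example_get_bios_apicid(int cpu)
{
        if (example_bios_apicid_early_ptr) {
                /* early boot: still reading the static __initdata table */
                u16 *map = example_bios_apicid_early_ptr;
                return map[cpu];
        }
        if (cpu_present(cpu))
                return per_cpu(example_bios_apicid, cpu);
        return EXAMPLE_BAD_APICID;
}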
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Christoph Lameter <clameter@sgi.com>
---
arch/x86/kernel/apic_64.c | 16 ++++++++++++++--
arch/x86/kernel/mpparse_64.c | 17 ++++++++++++-----
arch/x86/kernel/setup_64.c | 1 +
arch/x86/kernel/smpboot_64.c | 3 +++
include/asm-x86/smp_64.h | 8 +++++---
5 files changed, 35 insertions(+), 10 deletions(-)
--- a/arch/x86/kernel/apic_64.c
+++ b/arch/x86/kernel/apic_64.c
@@ -1155,14 +1155,26 @@ __cpuinit int apic_is_clustered_box(void
bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
for (i = 0; i < NR_CPUS; i++) {
- id = bios_cpu_apicid[i];
+ /* are we being called early in kernel startup? */
+ if (x86_bios_cpu_apicid_early_ptr) {
+ id = ((u16 *)x86_bios_cpu_apicid_early_ptr)[i];
+ }
+ else if (i < nr_cpu_ids) {
+ if (cpu_present(i))
+ id = per_cpu(x86_bios_cpu_apicid, i);
+ else
+ continue;
+ }
+ else
+ break;
+
if (id != BAD_APICID)
__set_bit(APIC_CLUSTERID(id), clustermap);
}
/* Problem: Partially populated chassis may not have CPUs in some of
* the APIC clusters they have been allocated. Only present CPUs have
- * bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
+ * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. Since
* clusters are allocated sequentially, count zeros only if they are
* bounded by ones.
*/
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -67,7 +67,11 @@ unsigned disabled_cpus __cpuinitdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;
-u16 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u16 x86_bios_cpu_apicid_init[NR_CPUS] __initdata
+ = { [0 ... NR_CPUS-1] = BAD_APICID };
+void *x86_bios_cpu_apicid_early_ptr;
+DEFINE_PER_CPU(u16, x86_bios_cpu_apicid) = BAD_APICID;
+EXPORT_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
/*
@@ -118,19 +122,22 @@ static void __cpuinit MP_processor_info(
physid_set(m->mpc_apicid, phys_cpu_present_map);
if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
/*
- * bios_cpu_apicid is required to have processors listed
+ * x86_bios_cpu_apicid is required to have processors listed
* in same order as logical cpu numbers. Hence the first
* entry is BSP, and so on.
*/
cpu = 0;
}
- bios_cpu_apicid[cpu] = m->mpc_apicid;
/* are we being called early in kernel startup? */
if (x86_cpu_to_apicid_early_ptr) {
- u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_early_ptr;
- x86_cpu_to_apicid[cpu] = m->mpc_apicid;
+ u16 *cpu_to_apicid = (u16 *)x86_cpu_to_apicid_early_ptr;
+ u16 *bios_cpu_apicid = (u16 *)x86_bios_cpu_apicid_early_ptr;
+
+ cpu_to_apicid[cpu] = m->mpc_apicid;
+ bios_cpu_apicid[cpu] = m->mpc_apicid;
} else {
per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
+ per_cpu(x86_bios_cpu_apicid, cpu) = m->mpc_apicid;
}
cpu_set(cpu, cpu_possible_map);
--- a/arch/x86/kernel/setup_64.c
+++ b/arch/x86/kernel/setup_64.c
@@ -376,6 +376,7 @@ void __init setup_arch(char **cmdline_p)
/* setup to use the early static init tables during kernel startup */
x86_cpu_to_apicid_early_ptr = (void *)&x86_cpu_to_apicid_init;
x86_cpu_to_node_map_early_ptr = (void *)&x86_cpu_to_node_map_init;
+ x86_bios_cpu_apicid_early_ptr = (void *)&x86_bios_cpu_apicid_init;
#endif
#ifdef CONFIG_ACPI
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -866,6 +866,8 @@ void __init smp_set_apicids(void)
x86_cpu_to_apicid_init[cpu];
per_cpu(x86_cpu_to_node_map, cpu) =
x86_cpu_to_node_map_init[cpu];
+ per_cpu(x86_bios_cpu_apicid, cpu) =
+ x86_bios_cpu_apicid_init[cpu];
}
else
printk(KERN_NOTICE "per_cpu_offset zero for cpu %d\n",
@@ -875,6 +877,7 @@ void __init smp_set_apicids(void)
/* indicate the early static arrays are gone */
x86_cpu_to_apicid_early_ptr = NULL;
x86_cpu_to_node_map_early_ptr = NULL;
+ x86_bios_cpu_apicid_early_ptr = NULL;
}
static void __init smp_cpu_index_default(void)
--- a/include/asm-x86/smp_64.h
+++ b/include/asm-x86/smp_64.h
@@ -27,18 +27,20 @@ extern int smp_call_function_mask(cpumas
void *info, int wait);
extern u16 __initdata x86_cpu_to_apicid_init[];
+extern u16 __initdata x86_bios_cpu_apicid_init[];
extern void *x86_cpu_to_apicid_early_ptr;
-extern u16 bios_cpu_apicid[];
+extern void *x86_bios_cpu_apicid_early_ptr;
DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
DECLARE_PER_CPU(u16, cpu_llc_id);
DECLARE_PER_CPU(u16, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, x86_bios_cpu_apicid);
static inline int cpu_present_to_apicid(int mps_cpu)
{
- if (mps_cpu < NR_CPUS)
- return (int)bios_cpu_apicid[mps_cpu];
+ if (cpu_present(mps_cpu))
+ return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
else
return BAD_APICID;
}
--
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-13 18:34 [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs travis
` (9 preceding siblings ...)
2008-01-13 18:35 ` [PATCH 10/10] x86: Change bios_cpu_apicid to percpu data variable travis
@ 2008-01-14 8:14 ` Ingo Molnar
2008-01-14 9:00 ` Ingo Molnar
2008-01-14 10:04 ` Andi Kleen
10 siblings, 2 replies; 33+ messages in thread
From: Ingo Molnar @ 2008-01-14 8:14 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, Christoph Lameter, Jack Steiner,
linux-mm, linux-kernel
* travis@sgi.com <travis@sgi.com> wrote:
> This patchset addresses the kernel bloat that occurs when NR_CPUS is
> increased. The memory numbers below are with NR_CPUS = 1024 which I've
> been testing (4 and 32 real processors, the rest "possible" using the
> additional_cpus start option.) These changes are all specific to the
> x86 architecture; non-arch-specific changes will follow.
thanks, i'll try this patchset in x86.git.
> 32cpus 1kcpus-before 1kcpus-after
> 7172678 Total +23314404 Total -147590 Total
1kcpus-after means it's +23314404-147590, i.e. +23166814? (i.e. a 0.6%
reduction of the bloat?)
i.e. we've got ~22K bloat per CPU - which is not bad, but because it's a
static component, it hurts smaller boxes. For distributors to enable
CONFIG_NR_CPUS=1024 by default i guess that bloat has to drop below 1-2K
per CPU :-/ [that would still mean 1-2MB total bloat but that's much
more acceptable than 23MB]
Ingo
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 8:14 ` [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs Ingo Molnar
@ 2008-01-14 9:00 ` Ingo Molnar
2008-01-14 17:52 ` Mike Travis
2008-01-14 10:04 ` Andi Kleen
1 sibling, 1 reply; 33+ messages in thread
From: Ingo Molnar @ 2008-01-14 9:00 UTC (permalink / raw)
To: travis
Cc: Andrew Morton, Andi Kleen, Christoph Lameter, Jack Steiner,
linux-mm, linux-kernel
* Ingo Molnar <mingo@elte.hu> wrote:
> > 32cpus 1kcpus-before 1kcpus-after
> > 7172678 Total +23314404 Total -147590 Total
>
> 1kcpus-after means it's +23314404-147590, i.e. +23166814? (i.e. a 0.6%
> reduction of the bloat?)
or if it's relative to 32cpus then that's an excellent result :)
Ingo
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 9:00 ` Ingo Molnar
@ 2008-01-14 17:52 ` Mike Travis
0 siblings, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 17:52 UTC (permalink / raw)
To: Ingo Molnar
Cc: Andrew Morton, Andi Kleen, Christoph Lameter, Jack Steiner,
linux-mm, linux-kernel
Ingo Molnar wrote:
> * Ingo Molnar <mingo@elte.hu> wrote:
>
>>> 32cpus 1kcpus-before 1kcpus-after
>>> 7172678 Total +23314404 Total -147590 Total
>> 1kcpus-after means it's +23314404-147590, i.e. +23166814? (i.e. a 0.6%
>> reduction of the bloat?)
>
> or if it's relative to 32cpus then that's an excellent result :)
>
> Ingo
Nope, it's a cumulative thing.
> allsizes -w 72 32cpus 1kcpus-after
32cpus 1kcpus-after
228 .altinstr_replacemen +0 .altinstr_replacemen
1219 .altinstructions +0 .altinstructions
717512 .bss +1395328 .bss
61374 .comment +0 .comment
16 .con_initcall.init +0 .con_initcall.init
425256 .data +19200 .data
178688 .data.cacheline_alig +12898304 .data.cacheline_alig
8192 .data.init_task +0 .data.init_task
4096 .data.page_aligned +0 .data.page_aligned
27008 .data.percpu +128896 .data.percpu
43904 .data.read_mostly +8703776 .data.read_mostly
4 .data_nosave +0 .data_nosave
5141 .exit.text +8 .exit.text
138480 .init.data +4608 .init.data
133 .init.ramfs +1 .init.ramfs
3192 .init.setup +0 .init.setup
159754 .init.text +904 .init.text
2288 .initcall.init +0 .initcall.init
8 .jiffies +0 .jiffies
4512 .pci_fixup +0 .pci_fixup
1314438 .rodata +760 .rodata
36552 .smp_locks +256 .smp_locks
3971848 .text +14773 .text
3368 .vdso +0 .vdso
4 .vgetcpu_mode +0 .vgetcpu_mode
218 .vsyscall_0 +0 .vsyscall_0
52 .vsyscall_1 +0 .vsyscall_1
91 .vsyscall_2 +0 .vsyscall_2
8 .vsyscall_3 +0 .vsyscall_3
54 .vsyscall_fn +0 .vsyscall_fn
80 .vsyscall_gtod_data +0 .vsyscall_gtod_data
39480 __bug_table +0 __bug_table
16320 __ex_table +0 __ex_table
9160 __param +0 __param
7172678 Total +23166814 Total
My goal is to move 90+% of the wasted, unused memory to either
the percpu area or the initdata section. The four fronts are:
NR_CPUS arrays, cpumask_t usages, a more efficient cpu_alloc/percpu
area, and a (relatively small) redesign of the irq system. (The
node and apicid arrays are related to the NR_CPUS arrays.)
The irq structs are particularly bad because they use NR_CPUS**2
arrays and the irq vars use 22588416 bytes (74%) of the total
30339492 bytes of memory:
7172678 Total 30339492 Total
> datasizes -w 72 32cpus 1kcpus-before
32cpus 1kcpus-before
262144 BSS __log_buf 12681216 CALNDA irq_desc
163840 CALNDA irq_desc 8718336 RMDATA irq_cfg
131072 BSS entries 528384 BSS irq_lists
76800 INITDA early_node_map 396288 BSS irq_2_pin
30720 RMDATA irq_cfg 264192 BSS irq_timer_state
29440 BSS ide_hwifs 262144 BSS __log_buf
24576 BSS boot_exception_ 132168 PERCPU per_cpu__kstat
20480 BSS irq_lists 131072 BSS entries
18840 DATA ioctl_start 131072 BSS boot_pageset
16384 BSS boot_cpu_stack 131072 CALNDA boot_cpu_pda
15360 BSS irq_2_pin 98304 BSS cpu_devices
14677 DATA bnx2_CP_b06FwTe 76800 INITDA early_node_map
I'm still working on a tool to analyze runtime usage of kernel
memory.
And I'm very open to any and all suggestions... ;-)
Thanks,
Mike
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 8:14 ` [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs Ingo Molnar
2008-01-14 9:00 ` Ingo Molnar
@ 2008-01-14 10:04 ` Andi Kleen
2008-01-14 10:11 ` Ingo Molnar
1 sibling, 1 reply; 33+ messages in thread
From: Andi Kleen @ 2008-01-14 10:04 UTC (permalink / raw)
To: Ingo Molnar
Cc: travis, Andrew Morton, Christoph Lameter, Jack Steiner, linux-mm,
linux-kernel
> i.e. we've got ~22K bloat per CPU - which is not bad, but because it's a
> static component, it hurts smaller boxes. For distributors to enable
> CONFIG_NR_CPUS=1024 by default i guess that bloat has to drop below 1-2K
> per CPU :-/ [that would still mean 1-2MB total bloat but that's much
> more acceptable than 23MB]
Even 1-2MB overhead would be too much for distributors I think. Ideally
there must be near-zero overhead for possible CPUs (and I see no reason in
principle why this is not possible). Worst case a low few hundred KB, but
even that would be a lot.
There are the cpusets which get passed around, but these are only one bit per
possible CPU.
-Andi
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 10:04 ` Andi Kleen
@ 2008-01-14 10:11 ` Ingo Molnar
2008-01-14 11:30 ` Andi Kleen
2008-01-14 18:00 ` Mike Travis
0 siblings, 2 replies; 33+ messages in thread
From: Ingo Molnar @ 2008-01-14 10:11 UTC (permalink / raw)
To: Andi Kleen
Cc: travis, Andrew Morton, Christoph Lameter, Jack Steiner, linux-mm,
linux-kernel
* Andi Kleen <ak@suse.de> wrote:
> > i.e. we've got ~22K bloat per CPU - which is not bad, but because
> > it's a static component, it hurts smaller boxes. For distributors to
> > enable CONFIG_NR_CPUS=1024 by default i guess that bloat has to drop
> > below 1-2K per CPU :-/ [that would still mean 1-2MB total bloat but
> > that's much more acceptable than 23MB]
>
> Even 1-2MB overhead would be too much for distributors I think.
> Ideally there must be near-zero overhead for possible CPUs (and I see
> no reason in principle why this is not possible). Worst case a low few
> hundred KB, but even that would be a lot.
i think this patchset already gives a net win, by moving stuff from
NR_CPUS arrays into per_cpu area. (Travis please confirm that this is
indeed what the numbers show)
The (total-)size of the per-cpu area(s) grows linearly with the number
of CPUs, so we'll have the expected near-zero overhead on 4-8-16-32 CPUs
and the expected larger total overhead on 1024 CPUs.
Ingo
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 10:11 ` Ingo Molnar
@ 2008-01-14 11:30 ` Andi Kleen
2008-01-16 7:34 ` Nick Piggin
2008-01-14 18:00 ` Mike Travis
1 sibling, 1 reply; 33+ messages in thread
From: Andi Kleen @ 2008-01-14 11:30 UTC (permalink / raw)
To: Ingo Molnar
Cc: travis, Andrew Morton, Christoph Lameter, Jack Steiner, linux-mm,
linux-kernel
> i think this patchset already gives a net win, by moving stuff from
> NR_CPUS arrays into per_cpu area. (Travis please confirm that this is
> indeed what the numbers show)
Yes that is what his patchkit does, although I'm not sure he has addressed all NR_CPUS
pigs yet. The basic idea came out of some discussions we had at kernel summit on this
topic. It's definitely a step in the right direction.
Another problem is that NR_IRQS currently scales with NR_CPUS, which is wrong too
(e.g. a hyperthreaded quad core/socket does not need 8 times as many
external interrupts as a single core/socket). And there are unfortunately a few
drivers that declare NR_IRQS-sized arrays.
In general there are more scaling problems like this (e.g. it also doesn't
make sense to create a kernel thread for each CPU thread).
At some point we might need to separate CONFIG_NR_CPUS into a
CONFIG_NR_SOCKETS / CONFIG_NR_CPUS to address this, although full dynamic
scaling without configuration is best of course.
All can just be addressed step by step of course.
-Andi
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 11:30 ` Andi Kleen
@ 2008-01-16 7:34 ` Nick Piggin
2008-01-16 18:07 ` Christoph Lameter
0 siblings, 1 reply; 33+ messages in thread
From: Nick Piggin @ 2008-01-16 7:34 UTC (permalink / raw)
To: Andi Kleen
Cc: Ingo Molnar, travis, Andrew Morton, Christoph Lameter,
Jack Steiner, linux-mm, linux-kernel
On Monday 14 January 2008 22:30, Andi Kleen wrote:
> In general there are more scaling problems like this (e.g. it also doesn't
> make sense to create a kernel thread for each CPU thread).
I think in a lot of ways, per-CPU kernel threads scale OK. At least
they should mostly be dynamic, so they don't require overhead on
smaller systems. On larger systems, I don't know if there are too
many kernel problems with all those threads (except that userspace
tools sometimes don't report them well).
And I think making them per-CPU can be much easier than tuning some
arbitrary algorithm to get a mix between parallelism and footprint.
For example, I'm finding that it might actually be worthwhile to move
some per-node and dynamically-controlled thread creation over to the
basic per-CPU scheme because of differences in topologies...
Anyway, that's just an aside.
Oh, just while I remember it: something funny is that MAX_NUMNODES
can be bigger than NR_CPUS on x86. I guess one can have CPUless nodes,
but wouldn't it make sense to have an upper bound of NR_CPUS by default?
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-16 7:34 ` Nick Piggin
@ 2008-01-16 18:07 ` Christoph Lameter
0 siblings, 0 replies; 33+ messages in thread
From: Christoph Lameter @ 2008-01-16 18:07 UTC (permalink / raw)
To: Nick Piggin
Cc: Andi Kleen, Ingo Molnar, travis, Andrew Morton, Jack Steiner,
linux-mm, linux-kernel
On Wed, 16 Jan 2008, Nick Piggin wrote:
> Oh, just while I remember it also, something funny is that MAX_NUMNODES
> can be bigger than NR_CPUS on x86. I guess one can have CPUless nodes,
> but wouldn't it make sense to have an upper bound of NR_CPUS by default?
There are special configurations that some customers want which involve
huge amounts of memory and just a few processors. In that case the number
of nodes becomes larger than the number of processors.
* Re: [PATCH 00/10] x86: Reduce memory and intra-node effects with large count NR_CPUs
2008-01-14 10:11 ` Ingo Molnar
2008-01-14 11:30 ` Andi Kleen
@ 2008-01-14 18:00 ` Mike Travis
1 sibling, 0 replies; 33+ messages in thread
From: Mike Travis @ 2008-01-14 18:00 UTC (permalink / raw)
To: Ingo Molnar
Cc: Andi Kleen, Andrew Morton, Christoph Lameter, Jack Steiner,
linux-mm, linux-kernel
Ingo Molnar wrote:
> * Andi Kleen <ak@suse.de> wrote:
>
>>> i.e. we've got ~22K bloat per CPU - which is not bad, but because
>>> it's a static component, it hurts smaller boxes. For distributors to
>>> enable CONFIG_NR_CPUS=1024 by default i guess that bloat has to drop
>>> below 1-2K per CPU :-/ [that would still mean 1-2MB total bloat but
>>> that's much more acceptable than 23MB]
>> Even 1-2MB overhead would be too much for distributors I think.
>> Ideally there must be near zero overhead for possible CPUs (and I see
>> no principle reason why this is not possible) Worst case a low few
>> hundred KBs, but even that would be much.
>
> i think this patchset already gives a net win, by moving stuff from
> NR_CPUS arrays into per_cpu area. (Travis please confirm that this is
> indeed what the numbers show)
>
> The (total-)size of the per-cpu area(s) grows linearly with the number
> of CPUs, so we'll have the expected near-zero overhead on 4-8-16-32 CPUs
> and the expected larger total overhead on 1024 CPUs.
>
> Ingo
Yes, and it's just the first step. Ideally, *no* extra memory is
used merely by specifying NR_CPUS = <whatever>; the extra memory only
comes into play when CPUs are actually "possible/probable". This means
that almost all of the data needs to be in the percpu area (compacted
as much as possible) or in the initdata section and discarded after use.
And Andi is right: the distributors will not default NR_CPUS to a large
value unless there is zero or very little overhead. And since so much
depends on using standard configurations (certifications, etc.), we cannot
depend on a special build.
Thanks,
Mike