LKML Archive on lore.kernel.org
* [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
@ 2011-02-03 21:03 Cyrill Gorcunov
  2011-02-14 11:45 ` Ingo Molnar
  0 siblings, 1 reply; 6+ messages in thread
From: Cyrill Gorcunov @ 2011-02-03 21:03 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Suresh Siddha, Yinghai Lu, Thomas Gleixner, H. Peter Anvin, lkml

In the case of x2apic cluster mode we can group IPI register writes based
on the cluster group instead of individual per-cpu destination messages.
This reduces the apic register writes and the number of IPI messages
(in the best case we can reduce it by a factor of 16).

With this change, a microbenchmark measuring the cost of flush_tlb_others(),
with the flush tlb IPI being sent from a cpu in socket-1 to all the logical
cpus in socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket),
is 3x better now (compared to the former 'send one-by-one' algorithm).

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
 arch/x86/include/asm/apic.h           |    2
 arch/x86/kernel/apic/probe_64.c       |    4
 arch/x86/kernel/apic/x2apic_cluster.c |  169 +++++++++++++++++++++++++---------
 3 files changed, 131 insertions(+), 44 deletions(-)

Index: tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/probe_64.c
+++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
@@ -71,7 +71,9 @@ void __init default_setup_apic_routing(v
 #endif

 	if (apic == &apic_flat && num_possible_cpus() > 8)
-			apic = &apic_physflat;
+		apic = &apic_physflat;
+	else if (apic == &apic_x2apic_cluster)
+		x2apic_init_cpu_notifier();

 	printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);

Index: tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
===================================================================
--- tip-linux-2.6.orig/arch/x86/kernel/apic/x2apic_cluster.c
+++ tip-linux-2.6/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,12 +5,15 @@
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/dmar.h>
+#include <linux/cpu.h>

 #include <asm/smp.h>
 #include <asm/apic.h>
 #include <asm/ipi.h>

 static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
+static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);

 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
@@ -48,24 +51,53 @@ static void
 	native_x2apic_icr_write(cfg, apicid);
 }

-/*
- * for now, we send the IPI's one by one in the cpumask.
- * TBD: Based on the cpu mask, we can send the IPI's to the cluster group
- * at once. We have 16 cpu's in a cluster. This will minimize IPI register
- * writes.
- */
-static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+static inline u32 x2apic_cluster(int cpu)
 {
-	unsigned long query_cpu;
+	return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
+}
+
+static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector,
+				   int exclude_self)
+{
+	unsigned long cpu;
 	unsigned long flags;
+	struct cpumask *cpus_in_cluster_ptr, *ipi_mask_ptr;
+	u32 dest, this_cpu;

 	x2apic_wrmsr_fence();

 	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		__x2apic_send_IPI_dest(
-			per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-			vector, apic->dest_logical);
+	this_cpu = smp_processor_id();
+
+	/*
+	 * we are to modify mask, so we need an own copy
+	 * and be sure it's manipulated with irq off
+	 */
+	ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
+	cpumask_copy(ipi_mask_ptr, mask);
+
+	/*
+	 * the idea is to send one IPI per cluster
+	 */
+	for_each_cpu(cpu, ipi_mask_ptr) {
+		unsigned long i;
+		dest = 0;
+		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
+
+		/* only cpus in cluster involved */
+		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr)
+			if (!exclude_self || i != this_cpu)
+				dest |= per_cpu(x86_cpu_to_logical_apicid, i);
+
+		if (!dest)
+			continue;
+
+		__x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
+		/*
+		 * cluster sibling cpus should be discarded now so
+		 * we do not send them an IPI a second time
+		 */
+		cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
 	}
 	local_irq_restore(flags);
 }
@@ -73,45 +105,22 @@ static void x2apic_send_IPI_mask(const s
 static void
  x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
-
-	local_irq_save(flags);
-	for_each_cpu(query_cpu, mask) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+	__x2apic_send_IPI_mask(mask, vector, 1);
 }

 static void x2apic_send_IPI_allbutself(int vector)
 {
-	unsigned long this_cpu = smp_processor_id();
-	unsigned long query_cpu;
-	unsigned long flags;
-
-	x2apic_wrmsr_fence();
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 1);
+}

-	local_irq_save(flags);
-	for_each_online_cpu(query_cpu) {
-		if (query_cpu == this_cpu)
-			continue;
-		__x2apic_send_IPI_dest(
-				per_cpu(x86_cpu_to_logical_apicid, query_cpu),
-				vector, apic->dest_logical);
-	}
-	local_irq_restore(flags);
+static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
+{
+	__x2apic_send_IPI_mask(mask, vector, 0);
 }

 static void x2apic_send_IPI_all(int vector)
 {
-	x2apic_send_IPI_mask(cpu_online_mask, vector);
+	__x2apic_send_IPI_mask(cpu_online_mask, vector, 0);
 }

 static int x2apic_apic_id_registered(void)
@@ -151,6 +160,34 @@ x2apic_cpu_mask_to_apicid_and(const stru
 	return per_cpu(x86_cpu_to_logical_apicid, cpu);
 }

+#define x2apic_propagate_cpu_cluster_status_online(cpu)		\
+	x2apic_propagate_cpu_cluster_status(cpu, 1)
+
+#define x2apic_propagate_cpu_cluster_status_offline(cpu)	\
+	x2apic_propagate_cpu_cluster_status(cpu, 0)
+
+/* kind of 'fill cluster cpu siblings' map */
+static void x2apic_propagate_cpu_cluster_status(int this_cpu, int online)
+{
+	int cpu;
+
+	if (online) {
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+	} else {
+		for_each_online_cpu(cpu) {
+			if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
+				continue;
+			__cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
+			__cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		}
+	}
+}
+
 static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
 {
 	unsigned int id;
@@ -184,8 +221,54 @@ static void init_x2apic_ldr(void)
 	per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
 }

-struct apic apic_x2apic_cluster = {
+static int __cpuinit
+cluster_setup(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+			free_cpumask_var(per_cpu(ipi_mask, cpu));
+			err = -ENOMEM;
+		}
+		break;
+	case CPU_ONLINE:
+		x2apic_propagate_cpu_cluster_status_online(cpu);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+		x2apic_propagate_cpu_cluster_status_offline(cpu);
+		free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+		free_cpumask_var(per_cpu(ipi_mask, cpu));
+		break;
+	}
+
+	return notifier_from_errno(err);
+}
+
+static struct notifier_block __refdata x2apic_cpu_notifier =
+{
+	.notifier_call = cluster_setup,
+};
+
+void x2apic_init_cpu_notifier(void)
+{
+	int cpu = smp_processor_id();

+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
+	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	register_hotcpu_notifier(&x2apic_cpu_notifier);
+}
+
+struct apic apic_x2apic_cluster = {
 	.name				= "cluster x2apic",
 	.probe				= NULL,
 	.acpi_madt_oem_check		= x2apic_acpi_madt_oem_check,
Index: tip-linux-2.6/arch/x86/include/asm/apic.h
===================================================================
--- tip-linux-2.6.orig/arch/x86/include/asm/apic.h
+++ tip-linux-2.6/arch/x86/include/asm/apic.h
@@ -179,6 +179,8 @@ extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
+extern void x2apic_init_cpu_notifier(void);
+
 static inline int x2apic_enabled(void)
 {
 	u64 msr;


* Re: [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
  2011-02-03 21:03 [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups Cyrill Gorcunov
@ 2011-02-14 11:45 ` Ingo Molnar
  2011-02-14 15:10   ` Cyrill Gorcunov
  0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2011-02-14 11:45 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Suresh Siddha, Yinghai Lu, Thomas Gleixner, H. Peter Anvin, lkml


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> In the case of x2apic cluster mode we can group IPI register writes based on the
> cluster group instead of individual per-cpu destination messages. This reduces the
> apic register writes and the number of IPI messages (in the best case we
> can reduce it by a factor of 16).
> 
> With this change, a microbenchmark measuring the cost of flush_tlb_others(), with
> the flush tlb IPI being sent from a cpu in socket-1 to all the logical cpus in
> socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket), is 3x
> better now (compared to the former 'send one-by-one' algorithm).

Pretty nice!

I have a few structural and nitpicking comments:

> +++ tip-linux-2.6/arch/x86/kernel/apic/probe_64.c
> @@ -71,7 +71,9 @@ void __init default_setup_apic_routing(v
>  #endif
> 
>  	if (apic == &apic_flat && num_possible_cpus() > 8)
> -			apic = &apic_physflat;
> +		apic = &apic_physflat;
> +	else if (apic == &apic_x2apic_cluster)
> +		x2apic_init_cpu_notifier();

This should be encapsulated more cleanly - this higher layer does not care what the
x2apic code tries to initialize.

So i think the cleanest method would be to clean up default_setup_apic_routing() and 
get rid of the CONFIG_X86_X2APIC (and UV and vsmp) mess from there.

We want some sort of proper driver init sequence for the various APIC drivers, which
the default_setup_apic_routing() code would simply iterate, getting a pointer to a
'struct apic' as a result. This would work like all other driver init sequences do -
no ugly 'glue' like probe_64.c would be needed.
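
Something like this (completely untested sketch, just to illustrate the direction;
the 'apic_drivers' table and its contents are made up for illustration) - the x2apic
cluster driver could then do its notifier setup from its own ->probe() callback:

	/* Hypothetical driver table, probed in priority order: */
	static struct apic *apic_drivers[] __initdata = {
		&apic_x2apic_cluster,
		&apic_x2apic_phys,
		&apic_physflat,
		&apic_flat,
	};

	void __init default_setup_apic_routing(void)
	{
		int i;

		/* Pick the first driver whose ->probe() says it can run: */
		for (i = 0; i < ARRAY_SIZE(apic_drivers); i++) {
			if (apic_drivers[i]->probe && apic_drivers[i]->probe()) {
				apic = apic_drivers[i];
				break;
			}
		}

		printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
	}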


> +static void __x2apic_send_IPI_mask(const struct cpumask *mask, int vector,
> +				   int exclude_self)

Please if possible don't break function prototypes in the middle of the argument 
list. There are more natural places to break the line:

static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int exclude_self)

But it can be on a single line as well (up to 90-100 cols is fine).

> +{
> +	unsigned long cpu;
>  	unsigned long flags;
> +	struct cpumask *cpus_in_cluster_ptr, *ipi_mask_ptr;
> +	u32 dest, this_cpu;

Ok, this is a pet peeve of mine: look how inconsistent this variable definition 
block has become. This would be cleaner:

	struct cpumask *cpus_in_cluster_ptr, *ipi_mask_ptr;
	unsigned long cpu, flags;
	u32 dest, this_cpu;

> +	/*
> +	 * we are to modify mask, so we need an own copy
> +	 * and be sure it's manipulated with irq off
> +	 */

Most comments in this file capitalize the first word in a comment block.

(this holds for a number of similar cases as well in later portions of your patch)

> +	for_each_cpu(cpu, ipi_mask_ptr) {
> +		unsigned long i;
> +		dest = 0;

Please separate variable definitions from the first statement in a more visible manner.

> +		cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
> +
> +		/* only cpus in cluster involved */
> +		for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr)
> +			if (!exclude_self || i != this_cpu)
> +				dest |= per_cpu(x86_cpu_to_logical_apicid, i);

Please use curly braces in blocks that are longer than a single line.
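
I.e. (just the braces added to the quoted loop above):

	for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
		if (!exclude_self || i != this_cpu)
			dest |= per_cpu(x86_cpu_to_logical_apicid, i);
	}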

> +#define x2apic_propagate_cpu_cluster_status_online(cpu)		\
> +	x2apic_propagate_cpu_cluster_status(cpu, 1)
> +
> +#define x2apic_propagate_cpu_cluster_status_offline(cpu)	\
> +	x2apic_propagate_cpu_cluster_status(cpu, 0)

Is there any particular reason why we use preprocessor technology here, instead of 
the fine C language that .c files are generally written in?

Enumerating 0/1 symbolically would be cleaner than introducing two smaller helper
functions in any case.
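
I.e. something like this (names made up):

	enum {
		X2APIC_CLUSTER_CPU_OFFLINE	= 0,
		X2APIC_CLUSTER_CPU_ONLINE	= 1,
	};

	...
	case CPU_ONLINE:
		x2apic_propagate_cpu_cluster_status(cpu, X2APIC_CLUSTER_CPU_ONLINE);
		break;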

> +static int __cpuinit
> +cluster_setup(struct notifier_block *nfb, unsigned long action, void *hcpu)
> +{
> +	unsigned int cpu = (unsigned long)hcpu;
> +	int err = 0;
> +
> +	switch (action) {
> +	case CPU_UP_PREPARE:
> +		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
> +		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
> +		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
> +			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
> +			free_cpumask_var(per_cpu(ipi_mask, cpu));
> +			err = -ENOMEM;
> +		}

zalloc_cpumask_var() has a return code that could be used for a slightly more 
standard deinit/teardown sequence.
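
Something like this (untested sketch):

	case CPU_UP_PREPARE:
		if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) {
			err = -ENOMEM;
			break;
		}
		if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) {
			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
			err = -ENOMEM;
		}
		break;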

> +void x2apic_init_cpu_notifier(void)
> +{
> +	int cpu = smp_processor_id();
> 
> +	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
> +	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
> +	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));

Such a BUG_ON() is not particularly user friendly - and this could trigger during 
CPU hotplug events, i.e. while the system is fully booted up, right?

Thanks,

	Ingo


* Re: [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
  2011-02-14 11:45 ` Ingo Molnar
@ 2011-02-14 15:10   ` Cyrill Gorcunov
  2011-02-15  3:22     ` Ingo Molnar
  0 siblings, 1 reply; 6+ messages in thread
From: Cyrill Gorcunov @ 2011-02-14 15:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Suresh Siddha, Yinghai Lu, Thomas Gleixner, H. Peter Anvin, lkml

On 02/14/2011 02:45 PM, Ingo Molnar wrote:
>
> * Cyrill Gorcunov<gorcunov@gmail.com>  wrote:
>
>> In the case of x2apic cluster mode we can group IPI register writes based on the
>> cluster group instead of individual per-cpu destiantion messages. This reduces the
>> apic register writes and reduces the amount of IPI messages (in the best case we
>> can reduce it by a factor of 16).
>>
>> With this change, microbenchmark measuring the cost of flush_tlb_others(), with
>> the flush tlb IPI being sent from a cpu in the socket-1 to all the logical cpus in
>> socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket) is 3x
>> times better now (compared to the former 'send one-by-one' algorithm).
>
> Pretty nice!
>
> I have a few structural and nitpicking comments:

Thanks a lot for review, Ingo! I'll address all the nits during this week.

...
>
>> +void x2apic_init_cpu_notifier(void)
>> +{
>> +	int cpu = smp_processor_id();
>>
>> +	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
>> +	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
>> +	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
>
> Such a BUG_ON() is not particularly user friendly - and this could trigger during
> CPU hotplug events, i.e. while the system is fully booted up, right?
>
> Thanks,
>
> 	Ingo

Yup, it's not that friendly, but it's called during system bootup;
hotplug events are handled by

+static int __cpuinit
+cluster_setup(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	int err = 0;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
+		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
+			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
+			free_cpumask_var(per_cpu(ipi_mask, cpu));
+			err = -ENOMEM;
+		}
+		break;

so it returns -ENOMEM on failure. And btw, I just noticed that we forgot to put
x2apic_init_cpu_notifier into the __init section.

Or am I missing something?

-- 
     Cyrill


* Re: [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
  2011-02-14 15:10   ` Cyrill Gorcunov
@ 2011-02-15  3:22     ` Ingo Molnar
  2011-02-15  8:39       ` Cyrill Gorcunov
  0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2011-02-15  3:22 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Suresh Siddha, Yinghai Lu, Thomas Gleixner, H. Peter Anvin, lkml


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On 02/14/2011 02:45 PM, Ingo Molnar wrote:
> >
> >* Cyrill Gorcunov<gorcunov@gmail.com>  wrote:
> >
> >>In the case of x2apic cluster mode we can group IPI register writes based on the
> >>cluster group instead of individual per-cpu destiantion messages. This reduces the
> >>apic register writes and reduces the amount of IPI messages (in the best case we
> >>can reduce it by a factor of 16).
> >>
> >>With this change, microbenchmark measuring the cost of flush_tlb_others(), with
> >>the flush tlb IPI being sent from a cpu in the socket-1 to all the logical cpus in
> >>socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket) is 3x
> >>times better now (compared to the former 'send one-by-one' algorithm).
> >
> >Pretty nice!
> >
> >I have a few structural and nitpicking comments:
> 
> Thanks a lot for review, Ingo! I'll address all the nits during this week.
> 
> ...
> >
> >>+void x2apic_init_cpu_notifier(void)
> >>+{
> >>+	int cpu = smp_processor_id();
> >>
> >>+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
> >>+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
> >>+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
> >
> >Such a BUG_ON() is not particularly user friendly - and this could trigger during
> >CPU hotplug events, i.e. while the system is fully booted up, right?
> >
> >Thanks,
> >
> >	Ingo
> 
> Yup is not that much friendly but it's called during system bootup,
> hotplug events are handled by
> 
> +static int __cpuinit
> +cluster_setup(struct notifier_block *nfb, unsigned long action, void *hcpu)
> +{
> +	unsigned int cpu = (unsigned long)hcpu;
> +	int err = 0;
> +
> +	switch (action) {
> +	case CPU_UP_PREPARE:
> +		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
> +		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
> +		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
> +			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
> +			free_cpumask_var(per_cpu(ipi_mask, cpu));
> +			err = -ENOMEM;
> +		}
> +		break;
> 
> so it returns -ENOMEM if failed. And btw just noted that we forgot to make
> x2apic_init_cpu_notifier being in __init section.
> 
> Or I miss something?

Is there no GFP_NOFAIL or GFP_FAIL_ON_PANIC variant that could be used for the
'must not fail' property of the boot-time allocation?

Thanks,

	Ingo


* Re: [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
  2011-02-15  3:22     ` Ingo Molnar
@ 2011-02-15  8:39       ` Cyrill Gorcunov
  2011-02-16  9:23         ` Ingo Molnar
  0 siblings, 1 reply; 6+ messages in thread
From: Cyrill Gorcunov @ 2011-02-15  8:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Suresh Siddha, Yinghai Lu, Thomas Gleixner, H. Peter Anvin, lkml

On 2/15/11, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
>
>> On 02/14/2011 02:45 PM, Ingo Molnar wrote:
>> >
>> >* Cyrill Gorcunov<gorcunov@gmail.com>  wrote:
>> >
>> >>In the case of x2apic cluster mode we can group IPI register writes
>> >> based on the
>> >>cluster group instead of individual per-cpu destiantion messages. This
>> >> reduces the
>> >>apic register writes and reduces the amount of IPI messages (in the best
>> >> case we
>> >>can reduce it by a factor of 16).
>> >>
>> >>With this change, microbenchmark measuring the cost of
>> >> flush_tlb_others(), with
>> >>the flush tlb IPI being sent from a cpu in the socket-1 to all the
>> >> logical cpus in
>> >>socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket)
>> >> is 3x
>> >>times better now (compared to the former 'send one-by-one' algorithm).
>> >
>> >Pretty nice!
>> >
>> >I have a few structural and nitpicking comments:
>>
>> Thanks a lot for review, Ingo! I'll address all the nits during this week.
>>
>> ...
>> >
>> >>+void x2apic_init_cpu_notifier(void)
>> >>+{
>> >>+	int cpu = smp_processor_id();
>> >>
>> >>+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
>> >>+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
>> >>+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
>> >
>> >Such a BUG_ON() is not particularly user friendly - and this could
>> > trigger during
>> >CPU hotplug events, i.e. while the system is fully booted up, right?
>> >
>> >Thanks,
>> >
>> >	Ingo
>>
>> Yup is not that much friendly but it's called during system bootup,
>> hotplug events are handled by
>>
>> +static int __cpuinit
>> +cluster_setup(struct notifier_block *nfb, unsigned long action, void
>> *hcpu)
>> +{
>> +	unsigned int cpu = (unsigned long)hcpu;
>> +	int err = 0;
>> +
>> +	switch (action) {
>> +	case CPU_UP_PREPARE:
>> +		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
>> +		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
>> +		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
>> +			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
>> +			free_cpumask_var(per_cpu(ipi_mask, cpu));
>> +			err = -ENOMEM;
>> +		}
>> +		break;
>>
>> so it returns -ENOMEM if failed. And btw just noted that we forgot to make
>> x2apic_init_cpu_notifier being in __init section.
>>
>> Or I miss something?
>
> Is there no GFP_NOFAIL or GFP_FAIL_ON_PANIC variant that could be used the
> 'must not
> fail' property of the boot-time allocation?
>
> Thanks,
>
> 	Ingo
>

Unless I'm missing something obvious, we can set GFP_NOFAIL, which would end
up in an endless loop if the allocation fails (slab should already be running
at this point of boot). Another option might be to switch to
no-apic mode if there is not enough memory to allocate these masks
(though I guess if an allocation fails at this point we are likely to fail
in further kernel allocations anyway).


* Re: [RFC 1/2 -tip/master] x86, x2apic: minimize IPI register writes using cluster groups
  2011-02-15  8:39       ` Cyrill Gorcunov
@ 2011-02-16  9:23         ` Ingo Molnar
  0 siblings, 0 replies; 6+ messages in thread
From: Ingo Molnar @ 2011-02-16  9:23 UTC (permalink / raw)
  To: Cyrill Gorcunov
  Cc: Suresh Siddha, Yinghai Lu, Thomas Gleixner, H. Peter Anvin, lkml


* Cyrill Gorcunov <gorcunov@gmail.com> wrote:

> On 2/15/11, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > * Cyrill Gorcunov <gorcunov@gmail.com> wrote:
> >
> >> On 02/14/2011 02:45 PM, Ingo Molnar wrote:
> >> >
> >> >* Cyrill Gorcunov<gorcunov@gmail.com>  wrote:
> >> >
> >> >>In the case of x2apic cluster mode we can group IPI register writes
> >> >> based on the
> >> >>cluster group instead of individual per-cpu destiantion messages. This
> >> >> reduces the
> >> >>apic register writes and reduces the amount of IPI messages (in the best
> >> >> case we
> >> >>can reduce it by a factor of 16).
> >> >>
> >> >>With this change, microbenchmark measuring the cost of
> >> >> flush_tlb_others(), with
> >> >>the flush tlb IPI being sent from a cpu in the socket-1 to all the
> >> >> logical cpus in
> >> >>socket-2 (on a Westmere-EX system that has 20 logical cpus in a socket)
> >> >> is 3x
> >> >>times better now (compared to the former 'send one-by-one' algorithm).
> >> >
> >> >Pretty nice!
> >> >
> >> >I have a few structural and nitpicking comments:
> >>
> >> Thanks a lot for review, Ingo! I'll address all the nits during this week.
> >>
> >> ...
> >> >
> >> >>+void x2apic_init_cpu_notifier(void)
> >> >>+{
> >> >>+	int cpu = smp_processor_id();
> >> >>
> >> >>+	zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
> >> >>+	zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
> >> >>+	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
> >> >
> >> >Such a BUG_ON() is not particularly user friendly - and this could
> >> > trigger during
> >> >CPU hotplug events, i.e. while the system is fully booted up, right?
> >> >
> >> >Thanks,
> >> >
> >> >	Ingo
> >>
> >> Yup is not that much friendly but it's called during system bootup,
> >> hotplug events are handled by
> >>
> >> +static int __cpuinit
> >> +cluster_setup(struct notifier_block *nfb, unsigned long action, void
> >> *hcpu)
> >> +{
> >> +	unsigned int cpu = (unsigned long)hcpu;
> >> +	int err = 0;
> >> +
> >> +	switch (action) {
> >> +	case CPU_UP_PREPARE:
> >> +		zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
> >> +		zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
> >> +		if (!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)) {
> >> +			free_cpumask_var(per_cpu(cpus_in_cluster, cpu));
> >> +			free_cpumask_var(per_cpu(ipi_mask, cpu));
> >> +			err = -ENOMEM;
> >> +		}
> >> +		break;
> >>
> >> so it returns -ENOMEM if failed. And btw just noted that we forgot to make
> >> x2apic_init_cpu_notifier being in __init section.
> >>
> >> Or I miss something?
> >
> > Is there no GFP_NOFAIL or GFP_FAIL_ON_PANIC variant that could be used the
> > 'must not
> > fail' property of the boot-time allocation?
> >
> > Thanks,
> >
> > 	Ingo
> >
> 
> If only i'm not missing something obvious we can set GFP_NOFAIL ending
> in endless loop if allocation failed (slab should be already running
> at this point of boot). Probably another option might be to switch to
> no-apic mode if there is no enough memory to allocate this masks
> (though i guess if allocation failed at this point we likely to fail
> in further allocations in kernel anyway)

Ok, if it's boot time only then it's no big deal.

Thanks,

	Ingo

