LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] arm: perf: Directly handle SMP platforms with one SPI
@ 2014-11-21 14:53 Daniel Thompson
  2014-11-26 16:59 ` Daniel Thompson
                   ` (5 more replies)
  0 siblings, 6 replies; 20+ messages in thread
From: Daniel Thompson @ 2014-11-21 14:53 UTC (permalink / raw)
  To: Russell King, Will Deacon
  Cc: Daniel Thompson, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, patches, linaro-kernel, John Stultz, Sumit Semwal

Some ARM platforms mux the PMU interrupt of every core into a single
SPI. On such platforms if the PMU of any core except 0 raises an interrupt
then it cannot be serviced and eventually, if you are lucky, the spurious
irq detection might forcefully disable the interrupt.

On these SoCs it is not possible to determine which core raised the
interrupt so workaround this issue by queuing irqwork on the other
cores whenever the primary interrupt handler is unable to service the
interrupt.

The u8500 platform has an alternative workaround that dynamically alters
the affinity of the PMU interrupt. This workaround logic is no longer
required so the original code is removed as is the hook it relied upon.

Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    Thanks to Lucas Stach, Russell King and Thomas Gleixner for critiquing
    an older, completely different way to tackle the same problem.
    

 arch/arm/include/asm/pmu.h       | 10 +++++
 arch/arm/kernel/perf_event.c     | 11 ++---
 arch/arm/kernel/perf_event_cpu.c | 94 ++++++++++++++++++++++++++++++++++++++++
 arch/arm/mach-ux500/cpu-db8500.c | 29 -------------
 4 files changed, 107 insertions(+), 37 deletions(-)

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index 0b648c541293..36472c3cc283 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -81,6 +81,12 @@ struct pmu_hw_events {
 	raw_spinlock_t		pmu_lock;
 };

+struct arm_pmu_work {
+	struct irq_work work;
+	struct arm_pmu *arm_pmu;
+	atomic_t ret;
+};
+
 struct arm_pmu {
 	struct pmu	pmu;
 	cpumask_t	active_irqs;
@@ -101,6 +107,7 @@ struct arm_pmu {
 	void		(*reset)(void *);
 	int		(*request_irq)(struct arm_pmu *, irq_handler_t handler);
 	void		(*free_irq)(struct arm_pmu *);
+	irqreturn_t	(*handle_irq_none)(struct arm_pmu *);
 	int		(*map_event)(struct perf_event *event);
 	int		num_events;
 	atomic_t	active_events;
@@ -108,6 +115,9 @@ struct arm_pmu {
 	u64		max_period;
 	struct platform_device	*plat_device;
 	struct pmu_hw_events	*(*get_hw_events)(void);
+	int             single_irq;
+	struct arm_pmu_work __percpu *work;
+	atomic_t        remaining_work;
 };

 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index b50a770f8c99..0792c913b9bb 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -306,22 +306,17 @@ validate_group(struct perf_event *event)
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;

 	if (irq_is_percpu(irq))
 		dev = *(void **)dev;
 	armpmu = dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);

 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, dev, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, dev);
+	ret = armpmu->handle_irq(irq, dev);
+	if (ret == IRQ_NONE && armpmu->handle_irq_none)
+		ret = armpmu->handle_irq_none(dev);
 	finish_clock = sched_clock();

 	perf_sample_event_took(finish_clock - start_clock);
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index eb2c4d55666b..e7153dc3b489 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -88,6 +88,75 @@ static void cpu_pmu_disable_percpu_irq(void *data)
 	disable_percpu_irq(irq);
 }

+/*
+ * Workaround logic that is distributed to all cores if the PMU has only
+ * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
+ * job is to try to service the interrupt on the current CPU. It will
+ * also enable the IRQ again if all the other CPUs have already tried to
+ * service it.
+ */
+static void cpu_pmu_do_percpu_work(struct irq_work *w)
+{
+	struct arm_pmu_work *work = container_of(w, struct arm_pmu_work, work);
+	struct arm_pmu *cpu_pmu = work->arm_pmu;
+
+	atomic_set(&work->ret,
+		   cpu_pmu->handle_irq(cpu_pmu->single_irq, cpu_pmu));
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_work))
+		enable_irq(cpu_pmu->single_irq);
+}
+
+/*
+ * This callback, which is enabled only on SMP platforms that are
+ * running with a single IRQ, is called when the PMU handler running in
+ * the current CPU cannot service the interrupt.
+ *
+ * It will disable the interrupt and distribute irqwork to all other
+ * processors in the system. Hopefully one of them will clear the
+ * interrupt...
+ */
+static irqreturn_t cpu_pmu_handle_irq_none(struct arm_pmu *cpu_pmu)
+{
+	int num_online = num_online_cpus();
+	irqreturn_t ret = IRQ_NONE;
+	int cpu, cret;
+
+	if (num_online <= 1)
+		return IRQ_NONE;
+
+	disable_irq_nosync(cpu_pmu->single_irq);
+	atomic_add(num_online, &cpu_pmu->remaining_work);
+	smp_mb__after_atomic();
+
+	for_each_online_cpu(cpu) {
+		struct arm_pmu_work *work = per_cpu_ptr(cpu_pmu->work, cpu);
+
+		if (cpu == smp_processor_id())
+			continue;
+
+		/*
+		 * We can be extremely relaxed about memory ordering
+		 * here. All we are doing is gathering information
+		 * about the past to help us give a return value that
+		 * will keep the spurious interrupt detector both happy
+		 * *and* functional. We are not shared so we can
+		 * tolerate the occasional spurious IRQ_HANDLED.
+		 */
+		cret = atomic_read(&work->ret);
+		if (cret != IRQ_NONE)
+			ret = cret;
+
+		if (!irq_work_queue_on(&work->work, cpu))
+			atomic_dec(&cpu_pmu->remaining_work);
+	}
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_work))
+		enable_irq(cpu_pmu->single_irq);
+
+	return ret;
+}
+
 static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 {
 	int i, irq, irqs;
@@ -107,6 +176,9 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 			if (irq >= 0)
 				free_irq(irq, cpu_pmu);
 		}
+
+		cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
+		free_percpu(cpu_pmu->work);
 	}
 }

@@ -162,6 +234,28 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)

 			cpumask_set_cpu(i, &cpu_pmu->active_irqs);
 		}
+
+		/*
+		 * If we are running SMP and have only one interrupt source
+		 * then get ready to share that single irq among the cores.
+		 */
+		if (nr_cpu_ids > 1 && irqs == 1) {
+			cpu_pmu->single_irq = platform_get_irq(pmu_device, 0);
+			cpu_pmu->work = alloc_percpu(struct arm_pmu_work);
+			if (!cpu_pmu->work) {
+				pr_err("no memory for shared IRQ workaround\n");
+				return -ENOMEM;
+			}
+
+			for_each_possible_cpu(i) {
+				struct arm_pmu_work *w =
+				    per_cpu_ptr(cpu_pmu->work, i);
+				init_irq_work(&w->work, cpu_pmu_do_percpu_work);
+				w->arm_pmu = cpu_pmu;
+			}
+
+			cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
+		}
 	}

 	return 0;
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 6f63954c8bde..917774999c5c 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -12,8 +12,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/mfd/abx500/ab8500.h>
@@ -23,7 +21,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/random.h>

-#include <asm/pmu.h>
 #include <asm/mach/map.h>

 #include "setup.h"
@@ -99,30 +96,6 @@ static void __init u8500_map_io(void)
 		iotable_init(u8500_io_desc, ARRAY_SIZE(u8500_io_desc));
 }

-/*
- * The PMU IRQ lines of two cores are wired together into a single interrupt.
- * Bounce the interrupt to the other core if it's not ours.
- */
-static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
-{
-	irqreturn_t ret = handler(irq, dev);
-	int other = !smp_processor_id();
-
-	if (ret == IRQ_NONE && cpu_online(other))
-		irq_set_affinity(irq, cpumask_of(other));
-
-	/*
-	 * We should be able to get away with the amount of IRQ_NONEs we give,
-	 * while still having the spurious IRQ detection code kick in if the
-	 * interrupt really starts hitting spuriously.
-	 */
-	return ret;
-}
-
-static struct arm_pmu_platdata db8500_pmu_platdata = {
-	.handle_irq		= db8500_pmu_handler,
-};
-
 static const char *db8500_read_soc_id(void)
 {
 	void __iomem *uid = __io_address(U8500_BB_UID_BASE);
@@ -143,8 +116,6 @@ static struct device * __init db8500_soc_device_init(void)
 }

 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
-	/* Requires call-back bindings. */
-	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &db8500_pmu_platdata),
 	/* Requires DMA bindings. */
 	OF_DEV_AUXDATA("stericsson,ux500-msp-i2s", 0x80123000,
 		       "ux500-msp-i2s.0", &msp0_platform_data),
--
1.9.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
@ 2014-11-26 16:59 ` Daniel Thompson
  2014-12-02 14:49   ` Mark Rutland
  2014-12-02 13:22 ` Linus Walleij
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Daniel Thompson @ 2014-11-26 16:59 UTC (permalink / raw)
  To: Russell King, Will Deacon
  Cc: Daniel Thompson, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, patches, linaro-kernel, John Stultz, Sumit Semwal

Some ARM platforms mux the PMU interrupt of every core into a single
SPI. On such platforms if the PMU of any core except 0 raises an interrupt
then it cannot be serviced and eventually, if you are lucky, the spurious
irq detection might forcefully disable the interrupt.

On these SoCs it is not possible to determine which core raised the
interrupt so workaround this issue by queuing irqwork on the other
cores whenever the primary interrupt handler is unable to service the
interrupt.

The u8500 platform has an alternative workaround that dynamically alters
the affinity of the PMU interrupt. This workaround logic is no longer
required so the original code is removed as is the hook it relied upon.

Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    v2:
     * Fixed build problems on systems without SMP.
    
    v1:
     * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
       critiquing an older, completely different way to tackle the
       same problem.
    

 arch/arm/include/asm/pmu.h       |  12 ++++
 arch/arm/kernel/perf_event.c     |  13 ++--
 arch/arm/kernel/perf_event_cpu.c | 126 +++++++++++++++++++++++++++++++++++++++
 arch/arm/mach-ux500/cpu-db8500.c |  29 ---------
 4 files changed, 143 insertions(+), 37 deletions(-)

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index 0b648c541293..771201ff0988 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -81,6 +81,12 @@ struct pmu_hw_events {
 	raw_spinlock_t		pmu_lock;
 };

+struct arm_pmu_work {
+	struct irq_work work;
+	struct arm_pmu *arm_pmu;
+	atomic_t ret;
+};
+
 struct arm_pmu {
 	struct pmu	pmu;
 	cpumask_t	active_irqs;
@@ -108,6 +114,12 @@ struct arm_pmu {
 	u64		max_period;
 	struct platform_device	*plat_device;
 	struct pmu_hw_events	*(*get_hw_events)(void);
+#ifdef CONFIG_SMP
+	irqreturn_t	(*handle_irq_none)(struct arm_pmu *);
+	int             single_irq;
+	struct arm_pmu_work __percpu *work;
+	atomic_t        remaining_work;
+#endif
 };

 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index b50a770f8c99..ba67d6309e1e 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -306,22 +306,19 @@ validate_group(struct perf_event *event)
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;

 	if (irq_is_percpu(irq))
 		dev = *(void **)dev;
 	armpmu = dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);

 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, dev, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, dev);
+	ret = armpmu->handle_irq(irq, dev);
+#ifdef CONFIG_SMP
+	if (ret == IRQ_NONE && armpmu->handle_irq_none)
+		ret = armpmu->handle_irq_none(dev);
+#endif
 	finish_clock = sched_clock();

 	perf_sample_event_took(finish_clock - start_clock);
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index eb2c4d55666b..5605d4a4c01f 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -88,6 +88,120 @@ static void cpu_pmu_disable_percpu_irq(void *data)
 	disable_percpu_irq(irq);
 }

+#ifdef CONFIG_SMP
+
+/*
+ * Workaround logic that is distributed to all cores if the PMU has only
+ * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
+ * job is to try to service the interrupt on the current CPU. It will
+ * also enable the IRQ again if all the other CPUs have already tried to
+ * service it.
+ */
+static void cpu_pmu_do_percpu_work(struct irq_work *w)
+{
+	struct arm_pmu_work *work = container_of(w, struct arm_pmu_work, work);
+	struct arm_pmu *cpu_pmu = work->arm_pmu;
+
+	atomic_set(&work->ret,
+		   cpu_pmu->handle_irq(cpu_pmu->single_irq, cpu_pmu));
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_work))
+		enable_irq(cpu_pmu->single_irq);
+}
+
+/*
+ * This callback, which is enabled only on SMP platforms that are
+ * running with a single IRQ, is called when the PMU handler running in
+ * the current CPU cannot service the interrupt.
+ *
+ * It will disable the interrupt and distribute irqwork to all other
+ * processors in the system. Hopefully one of them will clear the
+ * interrupt...
+ */
+static irqreturn_t cpu_pmu_handle_irq_none(struct arm_pmu *cpu_pmu)
+{
+	int num_online = num_online_cpus();
+	irqreturn_t ret = IRQ_NONE;
+	int cpu, cret;
+
+	if (num_online <= 1)
+		return IRQ_NONE;
+
+	disable_irq_nosync(cpu_pmu->single_irq);
+	atomic_add(num_online, &cpu_pmu->remaining_work);
+	smp_mb__after_atomic();
+
+	for_each_online_cpu(cpu) {
+		struct arm_pmu_work *work = per_cpu_ptr(cpu_pmu->work, cpu);
+
+		if (cpu == smp_processor_id())
+			continue;
+
+		/*
+		 * We can be extremely relaxed about memory ordering
+		 * here. All we are doing is gathering information
+		 * about the past to help us give a return value that
+		 * will keep the spurious interrupt detector both happy
+		 * *and* functional. We are not shared so we can
+		 * tolerate the occasional spurious IRQ_HANDLED.
+		 */
+		cret = atomic_read(&work->ret);
+		if (cret != IRQ_NONE)
+			ret = cret;
+
+		if (!irq_work_queue_on(&work->work, cpu))
+			atomic_dec(&cpu_pmu->remaining_work);
+	}
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_work))
+		enable_irq(cpu_pmu->single_irq);
+
+	return ret;
+}
+
+static int cpu_pmu_single_irq_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	int cpu;
+
+	cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
+	cpu_pmu->single_irq = platform_get_irq(pmu_device, 0);
+
+	cpu_pmu->work = alloc_percpu(struct arm_pmu_work);
+	if (!cpu_pmu->work) {
+		pr_err("no memory for shared IRQ workaround\n");
+		return -ENOMEM;
+	}
+
+	for_each_possible_cpu(cpu) {
+		struct arm_pmu_work *w = per_cpu_ptr(cpu_pmu->work, cpu);
+
+		init_irq_work(&w->work, cpu_pmu_do_percpu_work);
+		w->arm_pmu = cpu_pmu;
+	}
+
+	return 0;
+}
+
+static void cpu_pmu_single_irq_workaround_term(struct arm_pmu *cpu_pmu)
+{
+	cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
+	free_percpu(cpu_pmu->work);
+}
+
+#else /* CONFIG_SMP */
+
+static int cpu_pmu_single_irq_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	return 0;
+}
+
+static void cpu_pmu_single_irq_workaround_term(struct arm_pmu *cpu_pmu)
+{
+}
+
+#endif /* CONFIG_SMP */
+
 static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 {
 	int i, irq, irqs;
@@ -107,6 +221,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 			if (irq >= 0)
 				free_irq(irq, cpu_pmu);
 		}
+
+		cpu_pmu_single_irq_workaround_term(cpu_pmu);
 	}
 }

@@ -162,6 +278,16 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)

 			cpumask_set_cpu(i, &cpu_pmu->active_irqs);
 		}
+
+		/*
+		 * If we are running SMP and have only one interrupt source
+		 * then get ready to share that single irq among the cores.
+		 */
+		if (nr_cpu_ids > 1 && irqs == 1) {
+			err = cpu_pmu_single_irq_workaround_init(cpu_pmu);
+			if (err)
+				return err;
+		}
 	}

 	return 0;
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 6f63954c8bde..917774999c5c 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -12,8 +12,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/mfd/abx500/ab8500.h>
@@ -23,7 +21,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/random.h>

-#include <asm/pmu.h>
 #include <asm/mach/map.h>

 #include "setup.h"
@@ -99,30 +96,6 @@ static void __init u8500_map_io(void)
 		iotable_init(u8500_io_desc, ARRAY_SIZE(u8500_io_desc));
 }

-/*
- * The PMU IRQ lines of two cores are wired together into a single interrupt.
- * Bounce the interrupt to the other core if it's not ours.
- */
-static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
-{
-	irqreturn_t ret = handler(irq, dev);
-	int other = !smp_processor_id();
-
-	if (ret == IRQ_NONE && cpu_online(other))
-		irq_set_affinity(irq, cpumask_of(other));
-
-	/*
-	 * We should be able to get away with the amount of IRQ_NONEs we give,
-	 * while still having the spurious IRQ detection code kick in if the
-	 * interrupt really starts hitting spuriously.
-	 */
-	return ret;
-}
-
-static struct arm_pmu_platdata db8500_pmu_platdata = {
-	.handle_irq		= db8500_pmu_handler,
-};
-
 static const char *db8500_read_soc_id(void)
 {
 	void __iomem *uid = __io_address(U8500_BB_UID_BASE);
@@ -143,8 +116,6 @@ static struct device * __init db8500_soc_device_init(void)
 }

 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
-	/* Requires call-back bindings. */
-	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &db8500_pmu_platdata),
 	/* Requires DMA bindings. */
 	OF_DEV_AUXDATA("stericsson,ux500-msp-i2s", 0x80123000,
 		       "ux500-msp-i2s.0", &msp0_platform_data),
--
1.9.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
  2014-11-26 16:59 ` Daniel Thompson
@ 2014-12-02 13:22 ` Linus Walleij
  2015-01-07 12:28 ` [PATCH v3] " Daniel Thompson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 20+ messages in thread
From: Linus Walleij @ 2014-12-02 13:22 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, Will Deacon, linux-arm-kernel, linux-kernel,
	Shawn Guo, Sascha Hauer, Peter Zijlstra, Paul Mackerras,
	Ingo Molnar, Arnaldo Carvalho de Melo, Thomas Gleixner,
	Lucas Stach, Patch Tracking, linaro-kernel, John Stultz,
	Sumit Semwal

On Fri, Nov 21, 2014 at 3:53 PM, Daniel Thompson
<daniel.thompson@linaro.org> wrote:

> Some ARM platforms mux the PMU interrupt of every core into a single
> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
> then it cannot be serviced and eventually, if you are lucky, the spurious
> irq detection might forcefully disable the interrupt.
>
> On these SoCs it is not possible to determine which core raised the
> interrupt so workaround this issue by queuing irqwork on the other
> cores whenever the primary interrupt handler is unable to service the
> interrupt.
>
> The u8500 platform has an alternative workaround that dynamically alters
> the affinity of the PMU interrupt. This workaround logic is no longer
> required so the original code is removed as is the hook it relied upon.
>
> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).
>
> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>

After som pain I managed to compile and test this with perf.
Works like a charm.

Tested-by: Linus Walleij <linus.walleij@linaro.org>

Yours,
Linus Walleij

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-26 16:59 ` Daniel Thompson
@ 2014-12-02 14:49   ` Mark Rutland
  2014-12-02 21:33     ` Daniel Thompson
  0 siblings, 1 reply; 20+ messages in thread
From: Mark Rutland @ 2014-12-02 14:49 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, Will Deacon, linaro-kernel, Peter Zijlstra,
	patches, Linus Walleij, linux-kernel, Arnaldo Carvalho de Melo,
	Ingo Molnar, Paul Mackerras, Sascha Hauer, John Stultz,
	Thomas Gleixner, Shawn Guo, Sumit Semwal, linux-arm-kernel,
	Lucas Stach

Hi Daniel,

On Wed, Nov 26, 2014 at 04:59:07PM +0000, Daniel Thompson wrote:
> Some ARM platforms mux the PMU interrupt of every core into a single
> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
> then it cannot be serviced and eventually, if you are lucky, the spurious
> irq detection might forcefully disable the interrupt.
> 
> On these SoCs it is not possible to determine which core raised the
> interrupt so workaround this issue by queuing irqwork on the other
> cores whenever the primary interrupt handler is unable to service the
> interrupt.
> 
> The u8500 platform has an alternative workaround that dynamically alters
> the affinity of the PMU interrupt. This workaround logic is no longer
> required so the original code is removed as is the hook it relied upon.

I agree that the workaround for this design "feature" should live in the
architectural perf code rather than in board files.

I had intended to implement that atop of my heterogeneous CPUs series
[1] (so as to only target the subset of cores for a given PMU), but I
only had an affinity bouncing implementation, and the general approach
looks like it has the potential to be far better in terms of latency.

Hwoever, I have some concerns with the implementation below.

> 
> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).
> 
> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
> ---
> 
> Notes:
>     v2:
>      * Fixed build problems on systems without SMP.
> 
>     v1:
>      * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
>        critiquing an older, completely different way to tackle the
>        same problem.
> 
> 
>  arch/arm/include/asm/pmu.h       |  12 ++++
>  arch/arm/kernel/perf_event.c     |  13 ++--
>  arch/arm/kernel/perf_event_cpu.c | 126 +++++++++++++++++++++++++++++++++++++++
>  arch/arm/mach-ux500/cpu-db8500.c |  29 ---------
>  4 files changed, 143 insertions(+), 37 deletions(-)
> 
> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> index 0b648c541293..771201ff0988 100644
> --- a/arch/arm/include/asm/pmu.h
> +++ b/arch/arm/include/asm/pmu.h
> @@ -81,6 +81,12 @@ struct pmu_hw_events {
>         raw_spinlock_t          pmu_lock;
>  };
> 
> +struct arm_pmu_work {
> +       struct irq_work work;
> +       struct arm_pmu *arm_pmu;
> +       atomic_t ret;
> +};
> +
>  struct arm_pmu {
>         struct pmu      pmu;
>         cpumask_t       active_irqs;
> @@ -108,6 +114,12 @@ struct arm_pmu {
>         u64             max_period;
>         struct platform_device  *plat_device;
>         struct pmu_hw_events    *(*get_hw_events)(void);
> +#ifdef CONFIG_SMP
> +       irqreturn_t     (*handle_irq_none)(struct arm_pmu *);

I don't think this needs to live on the struct arm_pmu; we're unlikely
to need a different callback given this is already generic to CPUs, so
we can just call it directly.

Once Will's arm-perf-3.19 tag hits mainline (with my perf cleanups
series) the CCI will be decoupled [2] and the arm_pmu framework will
only be used by CPU PMUs.

> +       int             single_irq;

With my heterogeneous PMUs series you shouldn't need this, there will be
an irq_map with a single entry instead.

> +       struct arm_pmu_work __percpu *work;

In my cleanups series I folded all the percpu data into the
pmu_hw_events. It would be nice to fold this too if possible, so as to
make allocation and freeing simpler. We already have a percpu_pmu
pointer with my series applied, so we don't need the arm_pmu pointer.

If it feels like we're straying too far from hw events data we can
rename the structure to pmu_cpu_data or something like that.

> +       atomic_t        remaining_work;
> +#endif
>  };
> 
>  #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
> index b50a770f8c99..ba67d6309e1e 100644
> --- a/arch/arm/kernel/perf_event.c
> +++ b/arch/arm/kernel/perf_event.c
> @@ -306,22 +306,19 @@ validate_group(struct perf_event *event)
>  static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>  {
>         struct arm_pmu *armpmu;
> -       struct platform_device *plat_device;
> -       struct arm_pmu_platdata *plat;
>         int ret;
>         u64 start_clock, finish_clock;
> 
>         if (irq_is_percpu(irq))
>                 dev = *(void **)dev;
>         armpmu = dev;
> -       plat_device = armpmu->plat_device;
> -       plat = dev_get_platdata(&plat_device->dev);
> 
>         start_clock = sched_clock();
> -       if (plat && plat->handle_irq)
> -               ret = plat->handle_irq(irq, dev, armpmu->handle_irq);
> -       else
> -               ret = armpmu->handle_irq(irq, dev);
> +       ret = armpmu->handle_irq(irq, dev);
> +#ifdef CONFIG_SMP
> +       if (ret == IRQ_NONE && armpmu->handle_irq_none)
> +               ret = armpmu->handle_irq_none(dev);
> +#endif
>         finish_clock = sched_clock();
> 
>         perf_sample_event_took(finish_clock - start_clock);
> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
> index eb2c4d55666b..5605d4a4c01f 100644
> --- a/arch/arm/kernel/perf_event_cpu.c
> +++ b/arch/arm/kernel/perf_event_cpu.c
> @@ -88,6 +88,120 @@ static void cpu_pmu_disable_percpu_irq(void *data)
>         disable_percpu_irq(irq);
>  }
> 
> +#ifdef CONFIG_SMP
> +
> +/*
> + * Workaround logic that is distributed to all cores if the PMU has only
> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
> + * job is to try to service the interrupt on the current CPU. It will
> + * also enable the IRQ again if all the other CPUs have already tried to
> + * service it.
> + */
> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
> +{
> +       struct arm_pmu_work *work = container_of(w, struct arm_pmu_work, work);
> +       struct arm_pmu *cpu_pmu = work->arm_pmu;
> +
> +       atomic_set(&work->ret,
> +                  cpu_pmu->handle_irq(cpu_pmu->single_irq, cpu_pmu));
> +
> +       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
> +               enable_irq(cpu_pmu->single_irq);
> +}
> +
> +/*
> + * This callback, which is enabled only on SMP platforms that are
> + * running with a single IRQ, is called when the PMU handler running in
> + * the current CPU cannot service the interrupt.
> + *
> + * It will disable the interrupt and distribute irqwork to all other
> + * processors in the system. Hopefully one of them will clear the
> + * interrupt...
> + */
> +static irqreturn_t cpu_pmu_handle_irq_none(struct arm_pmu *cpu_pmu)
> +{
> +       int num_online = num_online_cpus();
> +       irqreturn_t ret = IRQ_NONE;
> +       int cpu, cret;

s/cret/cpu_ret/ -- it's slightly more typing but easier to read IMO.

> +
> +       if (num_online <= 1)
> +               return IRQ_NONE;

Surely num_online can never be below one here?

We didn't get the online CPUs, so can't other CPUs may shut down between
the call to num_online_cpus() and portions of the below code?

Could that prevent us from re-enabling the interrupt? Or are we
protected from that someehow?

> +
> +       disable_irq_nosync(cpu_pmu->single_irq);
> +       atomic_add(num_online, &cpu_pmu->remaining_work);
> +       smp_mb__after_atomic();
> +
> +       for_each_online_cpu(cpu) {
> +               struct arm_pmu_work *work = per_cpu_ptr(cpu_pmu->work, cpu);
> +
> +               if (cpu == smp_processor_id())
> +                       continue;
> +
> +               /*
> +                * We can be extremely relaxed about memory ordering
> +                * here. All we are doing is gathering information
> +                * about the past to help us give a return value that
> +                * will keep the spurious interrupt detector both happy
> +                * *and* functional. We are not shared so we can
> +                * tolerate the occasional spurious IRQ_HANDLED.
> +                */
> +               cret = atomic_read(&work->ret);
> +               if (cret != IRQ_NONE)
> +                       ret = cret;

I'm a little confused as to what's going on here. Why are we checking
the return code for the work triggered by the _previous_ interrupt
before queueing up the next work item?

As far as I can tell, work_ret was never initialised, so what does this
do for the first round?

> +
> +               if (!irq_work_queue_on(&work->work, cpu))
> +                       atomic_dec(&cpu_pmu->remaining_work);

What context does queued work run in? Will we sample the expected set of
registers (e.g. can we sample userspace)?

> +       }
> +
> +       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
> +               enable_irq(cpu_pmu->single_irq);

If we initialise remaining work to the number of CPUs minus one, we can
drop this last bit, and another CPU will always re-enable the interrupt.

> +
> +       return ret;
> +}
> +
> +static int cpu_pmu_single_irq_workaround_init(struct arm_pmu *cpu_pmu)
> +{
> +       struct platform_device *pmu_device = cpu_pmu->plat_device;
> +       int cpu;
> +
> +       cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
> +       cpu_pmu->single_irq = platform_get_irq(pmu_device, 0);
> +
> +       cpu_pmu->work = alloc_percpu(struct arm_pmu_work);
> +       if (!cpu_pmu->work) {
> +               pr_err("no memory for shared IRQ workaround\n");
> +               return -ENOMEM;
> +       }
> +
> +       for_each_possible_cpu(cpu) {
> +               struct arm_pmu_work *w = per_cpu_ptr(cpu_pmu->work, cpu);
> +
> +               init_irq_work(&w->work, cpu_pmu_do_percpu_work);
> +               w->arm_pmu = cpu_pmu;
> +       }
> +
> +       return 0;
> +}
> +
> +static void cpu_pmu_single_irq_workaround_term(struct arm_pmu *cpu_pmu)
> +{
> +       cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;

Huh? We set that up identically in cpu_pmu_single_irq_workaround_init.

I think the handle_irq_none member should go.

> +       free_percpu(cpu_pmu->work);
> +}
> +
> +#else /* CONFIG_SMP */
> +
> +static int cpu_pmu_single_irq_workaround_init(struct arm_pmu *cpu_pmu)
> +{
> +       return 0;
> +}
> +
> +static void cpu_pmu_single_irq_workaround_term(struct arm_pmu *cpu_pmu)
> +{
> +}
> +
> +#endif /* CONFIG_SMP */
> +
>  static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
>  {
>         int i, irq, irqs;
> @@ -107,6 +221,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
>                         if (irq >= 0)
>                                 free_irq(irq, cpu_pmu);
>                 }
> +
> +               cpu_pmu_single_irq_workaround_term(cpu_pmu);
>         }
>  }
> 
> @@ -162,6 +278,16 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
> 
>                         cpumask_set_cpu(i, &cpu_pmu->active_irqs);
>                 }
> +
> +               /*
> +                * If we are running SMP and have only one interrupt source
> +                * then get ready to share that single irq among the cores.
> +                */
> +               if (nr_cpu_ids > 1 && irqs == 1) {
> +                       err = cpu_pmu_single_irq_workaround_init(cpu_pmu);
> +                       if (err)
> +                               return err;
> +               }

What does this do for systems using PPIs, where there's a single percpu
interrupt (e.g. Krait)?

Thanks,
Mark.

[1] http://lists.infradead.org/pipermail/linux-arm-kernel/2014-November/300748.html
[2] https://git.kernel.org/cgit/linux/kernel/git/will/linux.git/tag/?h=perf/updates&id=arm-perf-3.19
[3] https://git.kernel.org/cgit/linux/kernel/git/will/linux.git/commit/?h=perf/updates&id=c6f85cb4305bd80658d19f7b097a7c36ef9912e2

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH] arm: perf: Directly handle SMP platforms with one SPI
  2014-12-02 14:49   ` Mark Rutland
@ 2014-12-02 21:33     ` Daniel Thompson
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Thompson @ 2014-12-02 21:33 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Russell King, Will Deacon, linaro-kernel, Peter Zijlstra,
	patches, Linus Walleij, linux-kernel, Arnaldo Carvalho de Melo,
	Ingo Molnar, Paul Mackerras, Sascha Hauer, John Stultz,
	Thomas Gleixner, Shawn Guo, Sumit Semwal, linux-arm-kernel,
	Lucas Stach

On 02/12/14 14:49, Mark Rutland wrote:
> Hi Daniel,
> 
> On Wed, Nov 26, 2014 at 04:59:07PM +0000, Daniel Thompson wrote:
>> Some ARM platforms mux the PMU interrupt of every core into a single
>> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
>> then it cannot be serviced and eventually, if you are lucky, the spurious
>> irq detection might forcefully disable the interrupt.
>>
>> On these SoCs it is not possible to determine which core raised the
>> interrupt so workaround this issue by queuing irqwork on the other
>> cores whenever the primary interrupt handler is unable to service the
>> interrupt.
>>
>> The u8500 platform has an alternative workaround that dynamically alters
>> the affinity of the PMU interrupt. This workaround logic is no longer
>> required so the original code is removed as is the hook it relied upon.
> 
> I agree that the workaround for this design "feature" should live in the
> architectural perf code rather than in board files.
> 
> I had intended to implement that atop of my heterogeneous CPUs series
> [1] (so as to only target the subset of cores for a given PMU), but I
> only had an affinity bouncing implementation, and the general approach
> looks like it has the potential to be far better in terms of latency.
> 
> Hwoever, I have some concerns with the implementation below.
> 
>>
>> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).
>>
>> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
>> ---
>>
>> Notes:
>>     v2:
>>      * Fixed build problems on systems without SMP.
>>
>>     v1:
>>      * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
>>        critiquing an older, completely different way to tackle the
>>        same problem.
>>
>>
>>  arch/arm/include/asm/pmu.h       |  12 ++++
>>  arch/arm/kernel/perf_event.c     |  13 ++--
>>  arch/arm/kernel/perf_event_cpu.c | 126 +++++++++++++++++++++++++++++++++++++++
>>  arch/arm/mach-ux500/cpu-db8500.c |  29 ---------
>>  4 files changed, 143 insertions(+), 37 deletions(-)
>>
>> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
>> index 0b648c541293..771201ff0988 100644
>> --- a/arch/arm/include/asm/pmu.h
>> +++ b/arch/arm/include/asm/pmu.h
>> @@ -81,6 +81,12 @@ struct pmu_hw_events {
>>         raw_spinlock_t          pmu_lock;
>>  };
>>
>> +struct arm_pmu_work {
>> +       struct irq_work work;
>> +       struct arm_pmu *arm_pmu;
>> +       atomic_t ret;
>> +};
>> +
>>  struct arm_pmu {
>>         struct pmu      pmu;
>>         cpumask_t       active_irqs;
>> @@ -108,6 +114,12 @@ struct arm_pmu {
>>         u64             max_period;
>>         struct platform_device  *plat_device;
>>         struct pmu_hw_events    *(*get_hw_events)(void);
>> +#ifdef CONFIG_SMP
>> +       irqreturn_t     (*handle_irq_none)(struct arm_pmu *);
> 
> I don't think this needs to live on the struct arm_pmu; we're unlikely
> to need a different callback given this is already generic to CPUs, so
> we can just call it directly.
> 
> Once Will's arm-perf-3.19 tag hits mainline (with my perf cleanups
> series) the CCI will be decoupled [2] and the arm_pmu framework will
> only be used by CPU PMUs.

I didn't much like this either but it allowed all the workaround code to
be in the same place (and to easily decide whether to deploy the
workaround).

I will take a closer look at the new code.


>> +       int             single_irq;
> 
> With my heterogeneous PMUs series you shouldn't need this, there will be
> an irq_map with a single entry instead.

Ok.


>> +       struct arm_pmu_work __percpu *work;
> 
> In my cleanups series I folded all the percpu data into the
> pmu_hw_events. It would be nice to fold this too if possible, so as to
> make allocation and freeing simpler. We already have a percpu_pmu
> pointer with my series applied, so we don't need the arm_pmu pointer.

I'll have a look. All this codes needs is for an irq_work structure and
arm_pmu pointer to be part of the same structure (for container_of).


> If it feels like we're straying too far from hw events data we can
> rename the structure to pmu_cpu_data or something like that.
> 
>> +       atomic_t        remaining_work;
>> +#endif
>>  };
>>
>>  #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
>> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
>> index b50a770f8c99..ba67d6309e1e 100644
>> --- a/arch/arm/kernel/perf_event.c
>> +++ b/arch/arm/kernel/perf_event.c
>> @@ -306,22 +306,19 @@ validate_group(struct perf_event *event)
>>  static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>>  {
>>         struct arm_pmu *armpmu;
>> -       struct platform_device *plat_device;
>> -       struct arm_pmu_platdata *plat;
>>         int ret;
>>         u64 start_clock, finish_clock;
>>
>>         if (irq_is_percpu(irq))
>>                 dev = *(void **)dev;
>>         armpmu = dev;
>> -       plat_device = armpmu->plat_device;
>> -       plat = dev_get_platdata(&plat_device->dev);
>>
>>         start_clock = sched_clock();
>> -       if (plat && plat->handle_irq)
>> -               ret = plat->handle_irq(irq, dev, armpmu->handle_irq);
>> -       else
>> -               ret = armpmu->handle_irq(irq, dev);
>> +       ret = armpmu->handle_irq(irq, dev);
>> +#ifdef CONFIG_SMP
>> +       if (ret == IRQ_NONE && armpmu->handle_irq_none)
>> +               ret = armpmu->handle_irq_none(dev);
>> +#endif
>>         finish_clock = sched_clock();
>>
>>         perf_sample_event_took(finish_clock - start_clock);
>> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
>> index eb2c4d55666b..5605d4a4c01f 100644
>> --- a/arch/arm/kernel/perf_event_cpu.c
>> +++ b/arch/arm/kernel/perf_event_cpu.c
>> @@ -88,6 +88,120 @@ static void cpu_pmu_disable_percpu_irq(void *data)
>>         disable_percpu_irq(irq);
>>  }
>>
>> +#ifdef CONFIG_SMP
>> +
>> +/*
>> + * Workaround logic that is distributed to all cores if the PMU has only
>> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
>> + * job is to try to service the interrupt on the current CPU. It will
>> + * also enable the IRQ again if all the other CPUs have already tried to
>> + * service it.
>> + */
>> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
>> +{
>> +       struct arm_pmu_work *work = container_of(w, struct arm_pmu_work, work);
>> +       struct arm_pmu *cpu_pmu = work->arm_pmu;
>> +
>> +       atomic_set(&work->ret,
>> +                  cpu_pmu->handle_irq(cpu_pmu->single_irq, cpu_pmu));
>> +
>> +       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
>> +               enable_irq(cpu_pmu->single_irq);
>> +}
>> +
>> +/*
>> + * This callback, which is enabled only on SMP platforms that are
>> + * running with a single IRQ, is called when the PMU handler running in
>> + * the current CPU cannot service the interrupt.
>> + *
>> + * It will disable the interrupt and distribute irqwork to all other
>> + * processors in the system. Hopefully one of them will clear the
>> + * interrupt...
>> + */
>> +static irqreturn_t cpu_pmu_handle_irq_none(struct arm_pmu *cpu_pmu)
>> +{
>> +       int num_online = num_online_cpus();
>> +       irqreturn_t ret = IRQ_NONE;
>> +       int cpu, cret;
> 
> s/cret/cpu_ret/ -- it's slightly more typing but easier to read IMO.

Ok.

>> +
>> +       if (num_online <= 1)
>> +               return IRQ_NONE;
> 
> Surely num_online can never be below one here?

You are right. This check will probably never fire and, even if it did,
the code below will (mostly) cope.

> We didn't get the online CPUs, so can't other CPUs may shut down between
> the call to num_online_cpus() and portions of the below code?
>
> Could that prevent us from re-enabling the interrupt? Or are we
> protected from that someehow?

We can't get_online_cpus() since we are running in an interrupt handler.

Still looking at the least racy way to handle that...

We could copy the online mask and use that to manipulate the atomic and
iterate over the map. That would make it impossible for the workaround
function to mismanage remaining_work although we do rely on the
irq_work_queue_on() actually resulting in the work being done. It looks
like that code relies on winning a race with the CPU_DYING notifier
chain to achieve that but I'm not 100% sure.


>> +
>> +       disable_irq_nosync(cpu_pmu->single_irq);
>> +       atomic_add(num_online, &cpu_pmu->remaining_work);
>> +       smp_mb__after_atomic();
>> +
>> +       for_each_online_cpu(cpu) {
>> +               struct arm_pmu_work *work = per_cpu_ptr(cpu_pmu->work, cpu);
>> +
>> +               if (cpu == smp_processor_id())
>> +                       continue;
>> +
>> +               /*
>> +                * We can be extremely relaxed about memory ordering
>> +                * here. All we are doing is gathering information
>> +                * about the past to help us give a return value that
>> +                * will keep the spurious interrupt detector both happy
>> +                * *and* functional. We are not shared so we can
>> +                * tolerate the occasional spurious IRQ_HANDLED.
>> +                */
>> +               cret = atomic_read(&work->ret);
>> +               if (cret != IRQ_NONE)
>> +                       ret = cret;
> 
> I'm a little confused as to what's going on here. Why are we checking
> the return code for the work triggered by the _previous_ interrupt
> before queueing up the next work item?

The workaround exits immediately without waiting for the results from
other CPUs. That means we can't figure out whether to return IRQ_NONE or
IRQ_HANDLED.

We could unconditionally return IRQ_HANDLED but this would de-fang the
spurious interrupt detector so instead we use the previous values to
make an informed guess about the likelihood of the interrupt being real.
This code will not trigger the spurious interrupt detector in normal
cases but will if the interrupt really did get wedged due to a bad DT or
something like that.

Should I expand the comment or just return IRQ_HANDLED in all cases?


> As far as I can tell, work_ret was never initialised, so what does this
> do for the first round?

Sends back a stupid value. I'll fix this.

>> +
>> +               if (!irq_work_queue_on(&work->work, cpu))
>> +                       atomic_dec(&cpu_pmu->remaining_work);
> 
> What context does queued work run in? Will we sample the expected set of
> registers (e.g. can we sample userspace)?

hardirq, sampling userspace should not be a problem (and was working
fine in my tests).

>> +       }
>> +
>> +       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
>> +               enable_irq(cpu_pmu->single_irq);
> 
> If we initialise remaining work to the number of CPUs minus one, we can
> drop this last bit, and another CPU will always re-enable the interrupt.

My code originally did that but if we do that then we have to add an
atomic_dec_and_test()/enable_irq() into the main loop. I didn't really
like that because it was harder to scan for mismatched up disable/enables.

I've no idea how the extra atomic manipulation trades off against the
extra memory barriers if we use atomic_dec_and_test() in the loop.

> 
>> +
>> +       return ret;
>> +}
>> +
>> +static int cpu_pmu_single_irq_workaround_init(struct arm_pmu *cpu_pmu)
>> +{
>> +       struct platform_device *pmu_device = cpu_pmu->plat_device;
>> +       int cpu;
>> +
>> +       cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
>> +       cpu_pmu->single_irq = platform_get_irq(pmu_device, 0);
>> +
>> +       cpu_pmu->work = alloc_percpu(struct arm_pmu_work);
>> +       if (!cpu_pmu->work) {
>> +               pr_err("no memory for shared IRQ workaround\n");
>> +               return -ENOMEM;
>> +       }
>> +
>> +       for_each_possible_cpu(cpu) {
>> +               struct arm_pmu_work *w = per_cpu_ptr(cpu_pmu->work, cpu);
>> +
>> +               init_irq_work(&w->work, cpu_pmu_do_percpu_work);
>> +               w->arm_pmu = cpu_pmu;
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static void cpu_pmu_single_irq_workaround_term(struct arm_pmu *cpu_pmu)
>> +{
>> +       cpu_pmu->handle_irq_none = cpu_pmu_handle_irq_none;
> 
> Huh? We set that up identically in cpu_pmu_single_irq_workaround_init.
> 
> I think the handle_irq_none member should go.

Should have been NULL. However if we remove handle_irq_none then there's
no problem here.

> 
>> +       free_percpu(cpu_pmu->work);
>> +}
>> +
>> +#else /* CONFIG_SMP */
>> +
>> +static int cpu_pmu_single_irq_workaround_init(struct arm_pmu *cpu_pmu)
>> +{
>> +       return 0;
>> +}
>> +
>> +static void cpu_pmu_single_irq_workaround_term(struct arm_pmu *cpu_pmu)
>> +{
>> +}
>> +
>> +#endif /* CONFIG_SMP */
>> +
>>  static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
>>  {
>>         int i, irq, irqs;
>> @@ -107,6 +221,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
>>                         if (irq >= 0)
>>                                 free_irq(irq, cpu_pmu);
>>                 }
>> +
>> +               cpu_pmu_single_irq_workaround_term(cpu_pmu);
>>         }
>>  }
>>
>> @@ -162,6 +278,16 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)
>>
>>                         cpumask_set_cpu(i, &cpu_pmu->active_irqs);
>>                 }
>> +
>> +               /*
>> +                * If we are running SMP and have only one interrupt source
>> +                * then get ready to share that single irq among the cores.
>> +                */
>> +               if (nr_cpu_ids > 1 && irqs == 1) {
>> +                       err = cpu_pmu_single_irq_workaround_init(cpu_pmu);
>> +                       if (err)
>> +                               return err;
>> +               }
> 
> What does this do for systems using PPIs, where there's a single percpu
> interrupt (e.g. Krait)?

This code is not reached on such a system (code here is added to the
else clause of the "am I using PPIs" test).

> 
> Thanks,
> Mark.
> 
> [1] http://lists.infradead.org/pipermail/linux-arm-kernel/2014-November/300748.html
> [2] https://git.kernel.org/cgit/linux/kernel/git/will/linux.git/tag/?h=perf/updates&id=arm-perf-3.19
> [3] https://git.kernel.org/cgit/linux/kernel/git/will/linux.git/commit/?h=perf/updates&id=c6f85cb4305bd80658d19f7b097a7c36ef9912e2
> 


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
  2014-11-26 16:59 ` Daniel Thompson
  2014-12-02 13:22 ` Linus Walleij
@ 2015-01-07 12:28 ` Daniel Thompson
  2015-01-08 17:30   ` Will Deacon
  2015-01-09 16:16 ` [PATCH v4] " Daniel Thompson
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: Daniel Thompson @ 2015-01-07 12:28 UTC (permalink / raw)
  To: Russell King, Will Deacon
  Cc: Daniel Thompson, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, patches, linaro-kernel, John Stultz, Sumit Semwal

Some ARM platforms mux the PMU interrupt of every core into a single
SPI. On such platforms if the PMU of any core except 0 raises an interrupt
then it cannot be serviced and eventually, if you are lucky, the spurious
irq detection might forcefully disable the interrupt.

On these SoCs it is not possible to determine which core raised the
interrupt so workaround this issue by queuing irqwork on the other
cores whenever the primary interrupt handler is unable to service the
interrupt.

The u8500 platform has an alternative workaround that dynamically alters
the affinity of the PMU interrupt. This workaround logic is no longer
required so the original code is removed as is the hook it relied upon.

Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    v2 was tested on u8500 (thanks to Linus Walleij). v3 doesn't change
    anything conceptual but the changes were just enough for me not to
    preserve the Tested-By:.
    
    v3:
     * Removed function pointer indirection when deploying workaround code
       and reorganise the code accordingly (Mark Rutland).
     * Move the workaround state tracking into the existing percpu data
       structure (Mark Rutland).
     * Renamed cret to percpu_ret and rewrote the comment describing the
       purpose of this variable (Mark Rutland).
     * Copy the cpu_online_mask and use that to act on a consistent set of
       cpus throughout the workaround (Mark Rutland).
     * Changed "single_irq" to "muxed_spi" to more explicitly describe
       the problem.
    
    v2:
     * Fixed build problems on systems without SMP.
    
    v1:
     * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
       critiquing an older, completely different way to tackle the
       same problem.
    

 arch/arm/include/asm/pmu.h       |  17 ++++++
 arch/arm/kernel/perf_event.c     |   9 +--
 arch/arm/kernel/perf_event_cpu.c | 122 +++++++++++++++++++++++++++++++++++++++
 arch/arm/kernel/perf_event_v7.c  |   2 +-
 arch/arm/mach-ux500/cpu-db8500.c |  29 ----------
 5 files changed, 141 insertions(+), 38 deletions(-)

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index b1596bd59129..295e762d5116 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -87,6 +87,19 @@ struct pmu_hw_events {
 	 * already have to allocate this struct per cpu.
 	 */
 	struct arm_pmu		*percpu_pmu;
+
+#ifdef CONFIG_SMP
+	/*
+	 * This is used to schedule workaround logic on platforms where all
+	 * the PMUs are attached to a single SPI.
+	 */
+	struct irq_work work;
+
+	/*
+	 * Used to track state when deploying above workaround.
+	 */
+	atomic_t work_ret;
+#endif
 };

 struct arm_pmu {
@@ -117,6 +130,10 @@ struct arm_pmu {
 	struct platform_device	*plat_device;
 	struct pmu_hw_events	__percpu *hw_events;
 	struct notifier_block	hotplug_nb;
+#ifdef CONFIG_SMP
+	int             muxed_spi_workaround_irq;
+	atomic_t        remaining_work;
+#endif
 };

 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index f7c65adaa428..e5c537b57f94 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -299,8 +299,6 @@ validate_group(struct perf_event *event)
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;

@@ -311,14 +309,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 	 * dereference.
 	 */
 	armpmu = *(void **)dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);

 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, armpmu);
+	ret = armpmu->handle_irq(irq, armpmu);
 	finish_clock = sched_clock();

 	perf_sample_event_took(finish_clock - start_clock);
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index dd9acc95ebc0..3d51c5f442eb 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -59,6 +59,116 @@ int perf_num_counters(void)
 }
 EXPORT_SYMBOL_GPL(perf_num_counters);

+#ifdef CONFIG_SMP
+/*
+ * Workaround logic that is distributed to all cores if the PMU has only
+ * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
+ * job is to try to service the interrupt on the current CPU. It will
+ * also enable the IRQ again if all the other CPUs have already tried to
+ * service it.
+ */
+static void cpu_pmu_do_percpu_work(struct irq_work *w)
+{
+	struct pmu_hw_events *hw_events =
+	    container_of(w, struct pmu_hw_events, work);
+	struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
+
+	atomic_set(&hw_events->work_ret,
+		   cpu_pmu->handle_irq(0, cpu_pmu));
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_work))
+		enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+}
+
+/*
+ * Called when the main interrupt handler cannot determine the source
+ * of interrupt. It will deploy a workaround if we are running on an SMP
+ * platform with only a single muxed SPI.
+ *
+ * The workaround disables the interrupt and distributes irqwork to all
+ * other processors in the system. Hopefully one of them will clear the
+ * interrupt...
+ */
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	irqreturn_t ret = IRQ_NONE;
+	cpumask_t deploy_on_mask;
+	int cpu, work_ret;
+
+	if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
+		return IRQ_NONE;
+
+	disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
+
+	cpumask_copy(&deploy_on_mask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &deploy_on_mask);
+	atomic_add(cpumask_weight(&deploy_on_mask), &cpu_pmu->remaining_work);
+	smp_mb__after_atomic();
+
+	for_each_cpu(cpu, &deploy_on_mask) {
+		struct pmu_hw_events *hw_events =
+		    per_cpu_ptr(cpu_pmu->hw_events, cpu);
+
+		/*
+		 * The workaround code exits immediately without waiting to
+		 * see if the interrupt was handled by another CPU. This makes
+		 * it hard for us to decide between IRQ_HANDLED and IRQ_NONE.
+		 * However, the handler isn't shared so we don't have to worry
+		 * about being a good citizen, only about keeping the spurious
+		 * interrupt detector working. This allows us to return the
+		 * result of our *previous* attempt to deploy workaround.
+		 */
+		work_ret = atomic_read(&hw_events->work_ret);
+		if (work_ret != IRQ_NONE)
+			ret = work_ret;
+
+		if (!irq_work_queue_on(&hw_events->work, cpu))
+			if (atomic_dec_and_test(&cpu_pmu->remaining_work))
+				enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+	}
+
+	return ret;
+}
+
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct pmu_hw_events *hw_events =
+		    per_cpu_ptr(cpu_pmu->hw_events, cpu);
+
+		init_irq_work(&hw_events->work, cpu_pmu_do_percpu_work);
+		atomic_set(&hw_events->work_ret, IRQ_HANDLED);
+	}
+
+	aomic_set(cpu_pmu->remaining_work, 0);
+	cpu_pmu->muxed_spi_workaround_irq = platform_get_irq(pmu_device, 0);
+
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+	cpu_pmu->muxed_spi_workaround_irq = 0;
+}
+#else /* CONFIG_SMP */
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+}
+
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	return IRQ_NONE;
+}
+#endif /* CONFIG_SMP */
+
 /* Include the PMU-specific implementations. */
 #include "perf_event_xscale.c"
 #include "perf_event_v6.c"
@@ -98,6 +208,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 			if (irq >= 0)
 				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, i));
 		}
+
+		cpu_pmu_muxed_spi_workaround_term(cpu_pmu);
 	}
 }

@@ -155,6 +267,16 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)

 			cpumask_set_cpu(i, &cpu_pmu->active_irqs);
 		}
+
+		/*
+		 * If we are running SMP and have only one interrupt source
+		 * then get ready to share that single irq among the cores.
+		 */
+		if (nr_cpu_ids > 1 && irqs == 1) {
+			err = cpu_pmu_muxed_spi_workaround_init(cpu_pmu);
+			if (err)
+				return err;
+		}
 	}

 	return 0;
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 8993770c47de..0dd914c10803 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 * Did an overflow occur?
 	 */
 	if (!armv7_pmnc_has_overflowed(pmnc))
-		return IRQ_NONE;
+		return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);

 	/*
 	 * Handle the counter(s) overflow(s)
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 6f63954c8bde..917774999c5c 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -12,8 +12,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/mfd/abx500/ab8500.h>
@@ -23,7 +21,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/random.h>

-#include <asm/pmu.h>
 #include <asm/mach/map.h>

 #include "setup.h"
@@ -99,30 +96,6 @@ static void __init u8500_map_io(void)
 		iotable_init(u8500_io_desc, ARRAY_SIZE(u8500_io_desc));
 }

-/*
- * The PMU IRQ lines of two cores are wired together into a single interrupt.
- * Bounce the interrupt to the other core if it's not ours.
- */
-static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
-{
-	irqreturn_t ret = handler(irq, dev);
-	int other = !smp_processor_id();
-
-	if (ret == IRQ_NONE && cpu_online(other))
-		irq_set_affinity(irq, cpumask_of(other));
-
-	/*
-	 * We should be able to get away with the amount of IRQ_NONEs we give,
-	 * while still having the spurious IRQ detection code kick in if the
-	 * interrupt really starts hitting spuriously.
-	 */
-	return ret;
-}
-
-static struct arm_pmu_platdata db8500_pmu_platdata = {
-	.handle_irq		= db8500_pmu_handler,
-};
-
 static const char *db8500_read_soc_id(void)
 {
 	void __iomem *uid = __io_address(U8500_BB_UID_BASE);
@@ -143,8 +116,6 @@ static struct device * __init db8500_soc_device_init(void)
 }

 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
-	/* Requires call-back bindings. */
-	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &db8500_pmu_platdata),
 	/* Requires DMA bindings. */
 	OF_DEV_AUXDATA("stericsson,ux500-msp-i2s", 0x80123000,
 		       "ux500-msp-i2s.0", &msp0_platform_data),
--
1.9.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] arm: perf: Directly handle SMP platforms with one SPI
  2015-01-07 12:28 ` [PATCH v3] " Daniel Thompson
@ 2015-01-08 17:30   ` Will Deacon
  2015-01-09 14:23     ` Daniel Thompson
  0 siblings, 1 reply; 20+ messages in thread
From: Will Deacon @ 2015-01-08 17:30 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, patches, linaro-kernel, John Stultz, Sumit Semwal

Hi Daniel,

Some minor comments below...

On Wed, Jan 07, 2015 at 12:28:18PM +0000, Daniel Thompson wrote:
> Some ARM platforms mux the PMU interrupt of every core into a single
> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
> then it cannot be serviced and eventually, if you are lucky, the spurious
> irq detection might forcefully disable the interrupt.
> 
> On these SoCs it is not possible to determine which core raised the
> interrupt so workaround this issue by queuing irqwork on the other
> cores whenever the primary interrupt handler is unable to service the
> interrupt.
> 
> The u8500 platform has an alternative workaround that dynamically alters
> the affinity of the PMU interrupt. This workaround logic is no longer
> required so the original code is removed as is the hook it relied upon.
> 
> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).
> 
> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>

[...]

> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
> index dd9acc95ebc0..3d51c5f442eb 100644
> --- a/arch/arm/kernel/perf_event_cpu.c
> +++ b/arch/arm/kernel/perf_event_cpu.c
> @@ -59,6 +59,116 @@ int perf_num_counters(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_num_counters);
> 
> +#ifdef CONFIG_SMP
> +/*
> + * Workaround logic that is distributed to all cores if the PMU has only
> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
> + * job is to try to service the interrupt on the current CPU. It will
> + * also enable the IRQ again if all the other CPUs have already tried to
> + * service it.
> + */
> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
> +{
> +       struct pmu_hw_events *hw_events =
> +           container_of(w, struct pmu_hw_events, work);
> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
> +
> +       atomic_set(&hw_events->work_ret,
> +                  cpu_pmu->handle_irq(0, cpu_pmu));

Do you need a memory barrier here, or is that implued by enable_irq?

> +       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +}
> +
> +/*
> + * Called when the main interrupt handler cannot determine the source
> + * of interrupt. It will deploy a workaround if we are running on an SMP
> + * platform with only a single muxed SPI.
> + *
> + * The workaround disables the interrupt and distributes irqwork to all
> + * other processors in the system. Hopefully one of them will clear the
> + * interrupt...
> + */
> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
> +{
> +       irqreturn_t ret = IRQ_NONE;
> +       cpumask_t deploy_on_mask;
> +       int cpu, work_ret;
> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
> +               return IRQ_NONE;

return ret ?

> +
> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
> +
> +       cpumask_copy(&deploy_on_mask, cpu_online_mask);
> +       cpumask_clear_cpu(smp_processor_id(), &deploy_on_mask);
> +       atomic_add(cpumask_weight(&deploy_on_mask), &cpu_pmu->remaining_work);
> +       smp_mb__after_atomic();

What's this barrier needed for?

> +
> +       for_each_cpu(cpu, &deploy_on_mask) {

Why not for_each_online_cpu and then continue if cpu == smp_processor_id() ?
I assume the race against hotplug is benign, as the interrupt will no longer
be asserted to the GIC if the source CPU goes offline?

> +               struct pmu_hw_events *hw_events =
> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
> +
> +               /*
> +                * The workaround code exits immediately without waiting to
> +                * see if the interrupt was handled by another CPU. This makes
> +                * it hard for us to decide between IRQ_HANDLED and IRQ_NONE.
> +                * However, the handler isn't shared so we don't have to worry
> +                * about being a good citizen, only about keeping the spurious
> +                * interrupt detector working. This allows us to return the
> +                * result of our *previous* attempt to deploy workaround.
> +                */
> +               work_ret = atomic_read(&hw_events->work_ret);
> +               if (work_ret != IRQ_NONE)
> +                       ret = work_ret;

Is this actually necessary, or can we always return handled?

> +
> +               if (!irq_work_queue_on(&hw_events->work, cpu))
> +                       if (atomic_dec_and_test(&cpu_pmu->remaining_work))

I'm not convinced that we can't have old work racing on the remaining work
field with a subsequent interrupt.

> +                               enable_irq(cpu_pmu->muxed_spi_workaround_irq);

"This function (enable_irq) may be called from IRQ context only when
 desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !"

Can we guarantee that in the general case?

> +       }
> +
> +       return ret;
> +}
> +
> +static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
> +{
> +       struct platform_device *pmu_device = cpu_pmu->plat_device;
> +       int cpu;
> +
> +       for_each_possible_cpu(cpu) {
> +               struct pmu_hw_events *hw_events =
> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
> +
> +               init_irq_work(&hw_events->work, cpu_pmu_do_percpu_work);
> +               atomic_set(&hw_events->work_ret, IRQ_HANDLED);
> +       }
> +
> +       aomic_set(cpu_pmu->remaining_work, 0);

So you didn't even build this...

Will

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3] arm: perf: Directly handle SMP platforms with one SPI
  2015-01-08 17:30   ` Will Deacon
@ 2015-01-09 14:23     ` Daniel Thompson
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Thompson @ 2015-01-09 14:23 UTC (permalink / raw)
  To: Will Deacon
  Cc: Russell King, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, patches, linaro-kernel, John Stultz, Sumit Semwal

On 08/01/15 17:30, Will Deacon wrote:
> Hi Daniel,
> 
> Some minor comments below...
> 
> On Wed, Jan 07, 2015 at 12:28:18PM +0000, Daniel Thompson wrote:
>> Some ARM platforms mux the PMU interrupt of every core into a single
>> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
>> then it cannot be serviced and eventually, if you are lucky, the spurious
>> irq detection might forcefully disable the interrupt.
>>
>> On these SoCs it is not possible to determine which core raised the
>> interrupt so workaround this issue by queuing irqwork on the other
>> cores whenever the primary interrupt handler is unable to service the
>> interrupt.
>>
>> The u8500 platform has an alternative workaround that dynamically alters
>> the affinity of the PMU interrupt. This workaround logic is no longer
>> required so the original code is removed as is the hook it relied upon.
>>
>> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).
>>
>> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
> 
> [...]
> 
>> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
>> index dd9acc95ebc0..3d51c5f442eb 100644
>> --- a/arch/arm/kernel/perf_event_cpu.c
>> +++ b/arch/arm/kernel/perf_event_cpu.c
>> @@ -59,6 +59,116 @@ int perf_num_counters(void)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_num_counters);
>>
>> +#ifdef CONFIG_SMP
>> +/*
>> + * Workaround logic that is distributed to all cores if the PMU has only
>> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
>> + * job is to try to service the interrupt on the current CPU. It will
>> + * also enable the IRQ again if all the other CPUs have already tried to
>> + * service it.
>> + */
>> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
>> +{
>> +       struct pmu_hw_events *hw_events =
>> +           container_of(w, struct pmu_hw_events, work);
>> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
>> +
>> +       atomic_set(&hw_events->work_ret,
>> +                  cpu_pmu->handle_irq(0, cpu_pmu));
> 
> Do you need a memory barrier here, or is that implued by enable_irq?

We are more getting away without a memory barrier... the spurious
interrupt detector won't really mind if we see an out of date value
(since it can tolerate getting the value wrong sometimes).

I think we can moot this issue though by removing the code. See below...


>> +       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
>> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>> +}
>> +
>> +/*
>> + * Called when the main interrupt handler cannot determine the source
>> + * of interrupt. It will deploy a workaround if we are running on an SMP
>> + * platform with only a single muxed SPI.
>> + *
>> + * The workaround disables the interrupt and distributes irqwork to all
>> + * other processors in the system. Hopefully one of them will clear the
>> + * interrupt...
>> + */
>> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
>> +{
>> +       irqreturn_t ret = IRQ_NONE;
>> +       cpumask_t deploy_on_mask;
>> +       int cpu, work_ret;
>> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
>> +               return IRQ_NONE;
> 
> return ret ?

In general I prefer only to take return variables from variables on
control paths where the return value is not constant.

However this will also become moot if I remove the work_ret logic.


>> +
>> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
>> +
>> +       cpumask_copy(&deploy_on_mask, cpu_online_mask);
>> +       cpumask_clear_cpu(smp_processor_id(), &deploy_on_mask);
>> +       atomic_add(cpumask_weight(&deploy_on_mask), &cpu_pmu->remaining_work);
>> +       smp_mb__after_atomic();
> 
> What's this barrier needed for?

It pairs up with the implicit barrier in atomic_dec_and_test() and
ensures the increment happens before the decrement (and therefore that
the interrupt will be re-enabled).

It can be removed providing we can rely on there being an implicit
barrier in irq_work_queue_on(). Internally this function uses
arch_send_call_function_single_ipi() there is definitely a barrier for
that all current arm and arm64 platforms.

I will remove this.


>> +
>> +       for_each_cpu(cpu, &deploy_on_mask) {
> 
> Why not for_each_online_cpu and then continue if cpu == smp_processor_id() ?
> I  assume the race against hotplug is benign, as the interrupt will no longer
> be asserted to the GIC if the source CPU goes offline?

If cpu_online_mask changes after we have performed the atomic_add() but
before (or during) the loop then we would mis-manage the value of
remaining_work might fail to re-enable the interrupt.


>> +               struct pmu_hw_events *hw_events =
>> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
>> +
>> +               /*
>> +                * The workaround code exits immediately without waiting to
>> +                * see if the interrupt was handled by another CPU. This makes
>> +                * it hard for us to decide between IRQ_HANDLED and IRQ_NONE.
>> +                * However, the handler isn't shared so we don't have to worry
>> +                * about being a good citizen, only about keeping the spurious
>> +                * interrupt detector working. This allows us to return the
>> +                * result of our *previous* attempt to deploy workaround.
>> +                */
>> +               work_ret = atomic_read(&hw_events->work_ret);
>> +               if (work_ret != IRQ_NONE)
>> +                       ret = work_ret;
> 
> Is this actually necessary, or can we always return handled?

Ultimately it depends if we need the spurious interrupt detection logic
to work. The work_ret approach is rather nasty (and most of your review
comments are linked to it one way or another). Thus I think I'm OK to
remove this altogether; spurious interrupts are very unlikely for the
PMU IRQ.


>> +
>> +               if (!irq_work_queue_on(&hw_events->work, cpu))
>> +                       if (atomic_dec_and_test(&cpu_pmu->remaining_work))
> 
> I'm not convinced that we can't have old work racing on the remaining work
> field with a subsequent interrupt.

I don't think that can happen.

For the interrupt to be re-enabled all cores must the started executing
their irq_work handlers and called atomic_dec_and_test(). Even though
they may not have completed the pending flag is cleared before they are
called making it safe to re-trigger them.

In fact even if that were not the case the error path for
irq_work_queue_on() would resolve the problem for us (at the cost of
re-entering the interrupt handler during races).


>> +                               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> 
> "This function (enable_irq) may be called from IRQ context only when
>  desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !"
> 
> Can we guarantee that in the general case?

For the PMU I think we can.

The irqchips that use these callbacks are found in one of the following
directories:

  arch/mips
  drivers/base/regmap
  drivers/gpio
  drivers/mfd
  drivers/platform/x86

All of above would be a pretty astonishing way to hook up an intimate
peripheral like the PMU.

>> +       }
>> +
>> +       return ret;
>> +}
>> +
>> +static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
>> +{
>> +       struct platform_device *pmu_device = cpu_pmu->plat_device;
>> +       int cpu;
>> +
>> +       for_each_possible_cpu(cpu) {
>> +               struct pmu_hw_events *hw_events =
>> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
>> +
>> +               init_irq_work(&hw_events->work, cpu_pmu_do_percpu_work);
>> +               atomic_set(&hw_events->work_ret, IRQ_HANDLED);
>> +       }
>> +
>> +       aomic_set(cpu_pmu->remaining_work, 0);
> 
> So you didn't even build this...

Oh no.

I built... I found that typo... I fixed... I soak tested for an hour on
i.MX6 (because I had changed the function from which we deploy the
workaround since v2).

After all that overlooking regenerating the patch before posting it
really was rather foolish.

Sorry.

BTW I have just done a side by side diff with what I posted and what is
in my git repo. The only other difference between what I tested and what
I posted was a minor whitespace change.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v4] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
                   ` (2 preceding siblings ...)
  2015-01-07 12:28 ` [PATCH v3] " Daniel Thompson
@ 2015-01-09 16:16 ` Daniel Thompson
  2015-01-16 10:58   ` Will Deacon
  2015-01-20 12:25 ` [PATCH v5] " Daniel Thompson
  2015-03-04 13:25 ` [PATCH v6] " Daniel Thompson
  5 siblings, 1 reply; 20+ messages in thread
From: Daniel Thompson @ 2015-01-09 16:16 UTC (permalink / raw)
  To: Russell King, Will Deacon
  Cc: Daniel Thompson, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

Some ARM platforms mux the PMU interrupt of every core into a single
SPI. On such platforms if the PMU of any core except 0 raises an interrupt
then it cannot be serviced and eventually, if you are lucky, the spurious
irq detection might forcefully disable the interrupt.

On these SoCs it is not possible to determine which core raised the
interrupt so workaround this issue by queuing irqwork on the other
cores whenever the primary interrupt handler is unable to service the
interrupt.

The u8500 platform has an alternative workaround that dynamically alters
the affinity of the PMU interrupt. This workaround logic is no longer
required so the original code is removed as is the hook it relied upon.

Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    v2 was tested on u8500 (thanks to Linus Walleij). v4 doesn't change
    anything conceptual but the changes were sufficient for me not to
    preserve the Tested-By:.
    
    v4:
     * Ripped out the logic that tried to preserve the operation of the
       spurious interrupt detector. It was complex and not really needed
       (Will Deacon).
     * Removed a redundant memory barrier and added a comment explaining
       why it is not needed (Will Deacon).
     * Made fully safe w.r.t. hotplug by falling back to a work queue
       if there is a hotplug operation in flight when the PMU interrupt
       comes in (Will Deacon). The work queue code paths have been tested
       synthetically (by changing the if condition).
     * Posted the correct, as in compilable and tested, version of the code
       (Will Deacon).
    
    v3:
     * Removed function pointer indirection when deploying workaround code
       and reorganise the code accordingly (Mark Rutland).
     * Move the workaround state tracking into the existing percpu data
       structure (Mark Rutland).
     * Renamed cret to percpu_ret and rewrote the comment describing the
       purpose of this variable (Mark Rutland).
     * Copy the cpu_online_mask and use that to act on a consistent set of
       cpus throughout the workaround (Mark Rutland).
     * Changed "single_irq" to "muxed_spi" to more explicitly describe
       the problem.
    
    v2:
     * Fixed build problems on systems without SMP.
    
    v1:
     * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
       critiquing an older, completely different way to tackle the
       same problem.
    

 arch/arm/include/asm/pmu.h       |  13 ++++
 arch/arm/kernel/perf_event.c     |   9 +--
 arch/arm/kernel/perf_event_cpu.c | 148 +++++++++++++++++++++++++++++++++++++++
 arch/arm/kernel/perf_event_v7.c  |   2 +-
 arch/arm/mach-ux500/cpu-db8500.c |  29 --------
 5 files changed, 163 insertions(+), 38 deletions(-)

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index b1596bd59129..26c7d29c976d 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -87,6 +87,14 @@ struct pmu_hw_events {
 	 * already have to allocate this struct per cpu.
 	 */
 	struct arm_pmu		*percpu_pmu;
+
+#ifdef CONFIG_SMP
+	/*
+	 * This is used to schedule workaround logic on platforms where all
+	 * the PMUs are attached to a single SPI.
+	 */
+	struct irq_work work;
+#endif
 };

 struct arm_pmu {
@@ -117,6 +125,11 @@ struct arm_pmu {
 	struct platform_device	*plat_device;
 	struct pmu_hw_events	__percpu *hw_events;
 	struct notifier_block	hotplug_nb;
+#ifdef CONFIG_SMP
+	int			muxed_spi_workaround_irq;
+	struct work_struct	muxed_spi_workaround_work;
+	atomic_t		remaining_irq_work;
+#endif
 };

 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index f7c65adaa428..e5c537b57f94 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -299,8 +299,6 @@ validate_group(struct perf_event *event)
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;

@@ -311,14 +309,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 	 * dereference.
 	 */
 	armpmu = *(void **)dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);

 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, armpmu);
+	ret = armpmu->handle_irq(irq, armpmu);
 	finish_clock = sched_clock();

 	perf_sample_event_took(finish_clock - start_clock);
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index dd9acc95ebc0..76227484baa9 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -59,6 +59,142 @@ int perf_num_counters(void)
 }
 EXPORT_SYMBOL_GPL(perf_num_counters);

+#ifdef CONFIG_SMP
+/*
+ * Workaround logic that is distributed to all cores if the PMU has only
+ * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
+ * job is to try to service the interrupt on the current CPU. It will
+ * also enable the IRQ again if all the other CPUs have already tried to
+ * service it.
+ */
+static void cpu_pmu_do_percpu_work(struct irq_work *w)
+{
+	struct pmu_hw_events *hw_events =
+	    container_of(w, struct pmu_hw_events, work);
+	struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
+
+	/* Ignore the return code, we can do nothing useful with it */
+	cpu_pmu->handle_irq(0, cpu_pmu);
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
+		enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+}
+
+/*
+ * Issue work to the other CPUs. Must be called whilst we own the
+ * hotplug locks.
+ */
+static void cpu_pmu_queue_percpu_work(struct arm_pmu *cpu_pmu)
+{
+	int cpu;
+
+	atomic_add(num_online_cpus() - 1, &cpu_pmu->remaining_irq_work);
+
+	for_each_online_cpu(cpu) {
+		struct pmu_hw_events *hw_events =
+		    per_cpu_ptr(cpu_pmu->hw_events, cpu);
+
+		if (cpu == smp_processor_id())
+			continue;
+
+		/*
+		 * We assume that the IPI within irq_work_queue_on()
+		 * implies a full memory barrier making the value of
+		 * cpu_pmu->remaining_irq_work visible to the target.
+		 */
+		if (!irq_work_queue_on(&hw_events->work, cpu))
+			if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
+				enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+	}
+}
+
+void cpu_pmu_muxed_spi_workaround_worker(struct work_struct *work)
+{
+	struct arm_pmu *cpu_pmu =
+	    container_of(work, struct arm_pmu, muxed_spi_workaround_work);
+
+	get_online_cpus();
+	cpu_pmu_queue_percpu_work(cpu_pmu);
+	put_online_cpus();
+}
+
+/*
+ * Called when the main interrupt handler cannot determine the source
+ * of interrupt. It will deploy a workaround if we are running on an SMP
+ * platform with only a single muxed SPI.
+ *
+ * The workaround disables the interrupt and distributes irqwork to all
+ * other processors in the system. Hopefully one of them will clear the
+ * interrupt...
+ */
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+
+	if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
+		return IRQ_NONE;
+
+	disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
+
+	if (try_get_online_cpus()) {
+		cpu_pmu_queue_percpu_work(cpu_pmu);
+		put_online_cpus();
+	} else {
+		/*
+		 * There is a CPU hotplug operation in flight making it
+		 * unsafe for us to queue the percpu work. The PMU is
+		 * already silenced so we'll leave it like that and
+		 * schedule some work to tidy things up.
+		 *
+		 * Taking this code path should be very rare which is
+		 * good because the latencies involved here are way to
+		 * long for good profiling.
+		 */
+		schedule_work(&cpu_pmu->muxed_spi_workaround_work);
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct pmu_hw_events *hw_events =
+		    per_cpu_ptr(cpu_pmu->hw_events, cpu);
+
+		init_irq_work(&hw_events->work, cpu_pmu_do_percpu_work);
+	}
+
+	INIT_WORK(&cpu_pmu->muxed_spi_workaround_work,
+		  cpu_pmu_muxed_spi_workaround_worker);
+	atomic_set(&cpu_pmu->remaining_irq_work, 0);
+	cpu_pmu->muxed_spi_workaround_irq = platform_get_irq(pmu_device, 0);
+
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+	cpu_pmu->muxed_spi_workaround_irq = 0;
+}
+#else /* CONFIG_SMP */
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+}
+
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	return IRQ_NONE;
+}
+#endif /* CONFIG_SMP */
+
 /* Include the PMU-specific implementations. */
 #include "perf_event_xscale.c"
 #include "perf_event_v6.c"
@@ -98,6 +234,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 			if (irq >= 0)
 				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, i));
 		}
+
+		cpu_pmu_muxed_spi_workaround_term(cpu_pmu);
 	}
 }

@@ -155,6 +293,16 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)

 			cpumask_set_cpu(i, &cpu_pmu->active_irqs);
 		}
+
+		/*
+		 * If we are running SMP and have only one interrupt source
+		 * then get ready to share that single irq among the cores.
+		 */
+		if (nr_cpu_ids > 1 && irqs == 1) {
+			err = cpu_pmu_muxed_spi_workaround_init(cpu_pmu);
+			if (err)
+				return err;
+		}
 	}

 	return 0;
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 8993770c47de..0dd914c10803 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 * Did an overflow occur?
 	 */
 	if (!armv7_pmnc_has_overflowed(pmnc))
-		return IRQ_NONE;
+		return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);

 	/*
 	 * Handle the counter(s) overflow(s)
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 6f63954c8bde..917774999c5c 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -12,8 +12,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/mfd/abx500/ab8500.h>
@@ -23,7 +21,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/random.h>

-#include <asm/pmu.h>
 #include <asm/mach/map.h>

 #include "setup.h"
@@ -99,30 +96,6 @@ static void __init u8500_map_io(void)
 		iotable_init(u8500_io_desc, ARRAY_SIZE(u8500_io_desc));
 }

-/*
- * The PMU IRQ lines of two cores are wired together into a single interrupt.
- * Bounce the interrupt to the other core if it's not ours.
- */
-static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
-{
-	irqreturn_t ret = handler(irq, dev);
-	int other = !smp_processor_id();
-
-	if (ret == IRQ_NONE && cpu_online(other))
-		irq_set_affinity(irq, cpumask_of(other));
-
-	/*
-	 * We should be able to get away with the amount of IRQ_NONEs we give,
-	 * while still having the spurious IRQ detection code kick in if the
-	 * interrupt really starts hitting spuriously.
-	 */
-	return ret;
-}
-
-static struct arm_pmu_platdata db8500_pmu_platdata = {
-	.handle_irq		= db8500_pmu_handler,
-};
-
 static const char *db8500_read_soc_id(void)
 {
 	void __iomem *uid = __io_address(U8500_BB_UID_BASE);
@@ -143,8 +116,6 @@ static struct device * __init db8500_soc_device_init(void)
 }

 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
-	/* Requires call-back bindings. */
-	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &db8500_pmu_platdata),
 	/* Requires DMA bindings. */
 	OF_DEV_AUXDATA("stericsson,ux500-msp-i2s", 0x80123000,
 		       "ux500-msp-i2s.0", &msp0_platform_data),
--
1.9.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] arm: perf: Directly handle SMP platforms with one SPI
  2015-01-09 16:16 ` [PATCH v4] " Daniel Thompson
@ 2015-01-16 10:58   ` Will Deacon
  2015-01-16 11:28     ` Daniel Thompson
  0 siblings, 1 reply; 20+ messages in thread
From: Will Deacon @ 2015-01-16 10:58 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

On Fri, Jan 09, 2015 at 04:16:10PM +0000, Daniel Thompson wrote:
> Some ARM platforms mux the PMU interrupt of every core into a single
> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
> then it cannot be serviced and eventually, if you are lucky, the spurious
> irq detection might forcefully disable the interrupt.
> 
> On these SoCs it is not possible to determine which core raised the
> interrupt so workaround this issue by queuing irqwork on the other
> cores whenever the primary interrupt handler is unable to service the
> interrupt.
> 
> The u8500 platform has an alternative workaround that dynamically alters
> the affinity of the PMU interrupt. This workaround logic is no longer
> required so the original code is removed as is the hook it relied upon.
> 
> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).

[...]

> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
> index f7c65adaa428..e5c537b57f94 100644
> --- a/arch/arm/kernel/perf_event.c
> +++ b/arch/arm/kernel/perf_event.c
> @@ -299,8 +299,6 @@ validate_group(struct perf_event *event)
>  static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>  {
>         struct arm_pmu *armpmu;
> -       struct platform_device *plat_device;
> -       struct arm_pmu_platdata *plat;
>         int ret;
>         u64 start_clock, finish_clock;
> 
> @@ -311,14 +309,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>          * dereference.
>          */
>         armpmu = *(void **)dev;
> -       plat_device = armpmu->plat_device;
> -       plat = dev_get_platdata(&plat_device->dev);
> 
>         start_clock = sched_clock();
> -       if (plat && plat->handle_irq)
> -               ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
> -       else
> -               ret = armpmu->handle_irq(irq, armpmu);
> +       ret = armpmu->handle_irq(irq, armpmu);
>         finish_clock = sched_clock();
> 
>         perf_sample_event_took(finish_clock - start_clock);
> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
> index dd9acc95ebc0..76227484baa9 100644
> --- a/arch/arm/kernel/perf_event_cpu.c
> +++ b/arch/arm/kernel/perf_event_cpu.c
> @@ -59,6 +59,142 @@ int perf_num_counters(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_num_counters);
> 
> +#ifdef CONFIG_SMP
> +/*
> + * Workaround logic that is distributed to all cores if the PMU has only
> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
> + * job is to try to service the interrupt on the current CPU. It will
> + * also enable the IRQ again if all the other CPUs have already tried to
> + * service it.
> + */
> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
> +{
> +       struct pmu_hw_events *hw_events =
> +           container_of(w, struct pmu_hw_events, work);
> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
> +
> +       /* Ignore the return code, we can do nothing useful with it */
> +       cpu_pmu->handle_irq(0, cpu_pmu);
> +
> +       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +}
> +
> +/*
> + * Issue work to the other CPUs. Must be called whilst we own the
> + * hotplug locks.
> + */
> +static void cpu_pmu_queue_percpu_work(struct arm_pmu *cpu_pmu)
> +{
> +       int cpu;
> +
> +       atomic_add(num_online_cpus() - 1, &cpu_pmu->remaining_irq_work);
> +
> +       for_each_online_cpu(cpu) {
> +               struct pmu_hw_events *hw_events =
> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
> +
> +               if (cpu == smp_processor_id())
> +                       continue;
> +
> +               /*
> +                * We assume that the IPI within irq_work_queue_on()
> +                * implies a full memory barrier making the value of
> +                * cpu_pmu->remaining_irq_work visible to the target.
> +                */
> +               if (!irq_work_queue_on(&hw_events->work, cpu))
> +                       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
> +                               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +       }
> +}
> +
> +void cpu_pmu_muxed_spi_workaround_worker(struct work_struct *work)
> +{
> +       struct arm_pmu *cpu_pmu =
> +           container_of(work, struct arm_pmu, muxed_spi_workaround_work);
> +
> +       get_online_cpus();
> +       cpu_pmu_queue_percpu_work(cpu_pmu);
> +       put_online_cpus();
> +}
> +
> +/*
> + * Called when the main interrupt handler cannot determine the source
> + * of interrupt. It will deploy a workaround if we are running on an SMP
> + * platform with only a single muxed SPI.
> + *
> + * The workaround disables the interrupt and distributes irqwork to all
> + * other processors in the system. Hopefully one of them will clear the
> + * interrupt...
> + */
> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
> +{
> +
> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
> +               return IRQ_NONE;
> +
> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
> +
> +       if (try_get_online_cpus()) {

It's not safe to call this from interrupt context (it takes a mutex).

Can you try enabling a bunch of the debug options under "Kernel Hacking"
for things like detecting sleeping whilst atomic and then run Vince's perf
fuzzer to see what crops up please?

  https://github.com/deater/perf_event_tests/tree/master/fuzzer

Will

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v4] arm: perf: Directly handle SMP platforms with one SPI
  2015-01-16 10:58   ` Will Deacon
@ 2015-01-16 11:28     ` Daniel Thompson
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Thompson @ 2015-01-16 11:28 UTC (permalink / raw)
  To: Will Deacon
  Cc: Russell King, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

On 16/01/15 10:58, Will Deacon wrote:
> On Fri, Jan 09, 2015 at 04:16:10PM +0000, Daniel Thompson wrote:
>> Some ARM platforms mux the PMU interrupt of every core into a single
>> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
>> then it cannot be serviced and eventually, if you are lucky, the spurious
>> irq detection might forcefully disable the interrupt.
>>
>> On these SoCs it is not possible to determine which core raised the
>> interrupt so workaround this issue by queuing irqwork on the other
>> cores whenever the primary interrupt handler is unable to service the
>> interrupt.
>>
>> The u8500 platform has an alternative workaround that dynamically alters
>> the affinity of the PMU interrupt. This workaround logic is no longer
>> required so the original code is removed as is the hook it relied upon.
>>
>> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI).
> 
> [...]
> 
>> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
>> index f7c65adaa428..e5c537b57f94 100644
>> --- a/arch/arm/kernel/perf_event.c
>> +++ b/arch/arm/kernel/perf_event.c
>> @@ -299,8 +299,6 @@ validate_group(struct perf_event *event)
>>  static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>>  {
>>         struct arm_pmu *armpmu;
>> -       struct platform_device *plat_device;
>> -       struct arm_pmu_platdata *plat;
>>         int ret;
>>         u64 start_clock, finish_clock;
>>
>> @@ -311,14 +309,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>>          * dereference.
>>          */
>>         armpmu = *(void **)dev;
>> -       plat_device = armpmu->plat_device;
>> -       plat = dev_get_platdata(&plat_device->dev);
>>
>>         start_clock = sched_clock();
>> -       if (plat && plat->handle_irq)
>> -               ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
>> -       else
>> -               ret = armpmu->handle_irq(irq, armpmu);
>> +       ret = armpmu->handle_irq(irq, armpmu);
>>         finish_clock = sched_clock();
>>
>>         perf_sample_event_took(finish_clock - start_clock);
>> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
>> index dd9acc95ebc0..76227484baa9 100644
>> --- a/arch/arm/kernel/perf_event_cpu.c
>> +++ b/arch/arm/kernel/perf_event_cpu.c
>> @@ -59,6 +59,142 @@ int perf_num_counters(void)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_num_counters);
>>
>> +#ifdef CONFIG_SMP
>> +/*
>> + * Workaround logic that is distributed to all cores if the PMU has only
>> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
>> + * job is to try to service the interrupt on the current CPU. It will
>> + * also enable the IRQ again if all the other CPUs have already tried to
>> + * service it.
>> + */
>> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
>> +{
>> +       struct pmu_hw_events *hw_events =
>> +           container_of(w, struct pmu_hw_events, work);
>> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
>> +
>> +       /* Ignore the return code, we can do nothing useful with it */
>> +       cpu_pmu->handle_irq(0, cpu_pmu);
>> +
>> +       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
>> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>> +}
>> +
>> +/*
>> + * Issue work to the other CPUs. Must be called whilst we own the
>> + * hotplug locks.
>> + */
>> +static void cpu_pmu_queue_percpu_work(struct arm_pmu *cpu_pmu)
>> +{
>> +       int cpu;
>> +
>> +       atomic_add(num_online_cpus() - 1, &cpu_pmu->remaining_irq_work);
>> +
>> +       for_each_online_cpu(cpu) {
>> +               struct pmu_hw_events *hw_events =
>> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
>> +
>> +               if (cpu == smp_processor_id())
>> +                       continue;
>> +
>> +               /*
>> +                * We assume that the IPI within irq_work_queue_on()
>> +                * implies a full memory barrier making the value of
>> +                * cpu_pmu->remaining_irq_work visible to the target.
>> +                */
>> +               if (!irq_work_queue_on(&hw_events->work, cpu))
>> +                       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
>> +                               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>> +       }
>> +}
>> +
>> +void cpu_pmu_muxed_spi_workaround_worker(struct work_struct *work)
>> +{
>> +       struct arm_pmu *cpu_pmu =
>> +           container_of(work, struct arm_pmu, muxed_spi_workaround_work);
>> +
>> +       get_online_cpus();
>> +       cpu_pmu_queue_percpu_work(cpu_pmu);
>> +       put_online_cpus();
>> +}
>> +
>> +/*
>> + * Called when the main interrupt handler cannot determine the source
>> + * of interrupt. It will deploy a workaround if we are running on an SMP
>> + * platform with only a single muxed SPI.
>> + *
>> + * The workaround disables the interrupt and distributes irqwork to all
>> + * other processors in the system. Hopefully one of them will clear the
>> + * interrupt...
>> + */
>> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
>> +{
>> +
>> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
>> +               return IRQ_NONE;
>> +
>> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
>> +
>> +       if (try_get_online_cpus()) {
> 
> It's not safe to call this from interrupt context (it takes a mutex).

Ah... thanks.

I'll revisit the atomic counter approach again.


> Can you try enabling a bunch of the debug options under "Kernel Hacking"
> for things like detecting sleeping whilst atomic and then run Vince's perf
> fuzzer to see what crops up please?
> 
>   https://github.com/deater/perf_event_tests/tree/master/fuzzer

Will do.


Daniel.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v5] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
                   ` (3 preceding siblings ...)
  2015-01-09 16:16 ` [PATCH v4] " Daniel Thompson
@ 2015-01-20 12:25 ` Daniel Thompson
  2015-01-23 17:25   ` Mark Rutland
  2015-03-04 13:25 ` [PATCH v6] " Daniel Thompson
  5 siblings, 1 reply; 20+ messages in thread
From: Daniel Thompson @ 2015-01-20 12:25 UTC (permalink / raw)
  To: Russell King, Will Deacon
  Cc: Daniel Thompson, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

Some ARM platforms mux the PMU interrupt of every core into a single
SPI. On such platforms if the PMU of any core except 0 raises an interrupt
then it cannot be serviced and eventually, if you are lucky, the spurious
irq detection might forcefully disable the interrupt.

On these SoCs it is not possible to determine which core raised the
interrupt so workaround this issue by queuing irqwork on the other
cores whenever the primary interrupt handler is unable to service the
interrupt.

The u8500 platform has an alternative workaround that dynamically alters
the affinity of the PMU interrupt. This workaround logic is no longer
required so the original code is removed as is the hook it relied upon.

Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI)
using a simple soak, combined perf and CPU hotplug soak and using
perf fuzzer's fast_repro.sh.

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    v2 was tested on u8500 (thanks to Linus Walleij). v4 doesn't change
    anything conceptual but the changes were sufficient for me not to
    preserve the Tested-By:.
    
    v5:
     * Removed the work queue nonsense; being completely race-free requires
       us to take a mutex or avoid dispatch from interrupt (Will Deacon).
       Replacement code can potentially race with a CPU hot unplug however
       it is careful to minimise exposure, to mitigate harmful effects and
       has fairly prominent comments.
    
    v4:
     * Ripped out the logic that tried to preserve the operation of the
       spurious interrupt detector. It was complex and not really needed
       (Will Deacon).
     * Removed a redundant memory barrier and added a comment explaining
       why it is not needed (Will Deacon).
     * Made fully safe w.r.t. hotplug by falling back to a work queue
       if there is a hotplug operation in flight when the PMU interrupt
       comes in (Will Deacon). The work queue code paths have been tested
       synthetically (by changing the if condition).
     * Posted the correct, as in compilable and tested, version of the code
       (Will Deacon).
    
    v3:
     * Removed function pointer indirection when deploying workaround code
       and reorganise the code accordingly (Mark Rutland).
     * Move the workaround state tracking into the existing percpu data
       structure (Mark Rutland).
     * Renamed cret to percpu_ret and rewrote the comment describing the
       purpose of this variable (Mark Rutland).
     * Copy the cpu_online_mask and use that to act on a consistent set of
       cpus throughout the workaround (Mark Rutland).
     * Changed "single_irq" to "muxed_spi" to more explicitly describe
       the problem.
    
    v2:
     * Fixed build problems on systems without SMP.
    
    v1:
     * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
       critiquing an older, completely different way to tackle the
       same problem.
    

 arch/arm/include/asm/pmu.h       |  12 ++++
 arch/arm/kernel/perf_event.c     |   9 +--
 arch/arm/kernel/perf_event_cpu.c | 128 +++++++++++++++++++++++++++++++++++++++
 arch/arm/kernel/perf_event_v7.c  |   2 +-
 arch/arm/mach-ux500/cpu-db8500.c |  29 ---------
 5 files changed, 142 insertions(+), 38 deletions(-)

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index b1596bd59129..dfef7904b790 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -87,6 +87,14 @@ struct pmu_hw_events {
 	 * already have to allocate this struct per cpu.
 	 */
 	struct arm_pmu		*percpu_pmu;
+
+#ifdef CONFIG_SMP
+	/*
+	 * This is used to schedule workaround logic on platforms where all
+	 * the PMUs are attached to a single SPI.
+	 */
+	struct irq_work work;
+#endif
 };

 struct arm_pmu {
@@ -117,6 +125,10 @@ struct arm_pmu {
 	struct platform_device	*plat_device;
 	struct pmu_hw_events	__percpu *hw_events;
 	struct notifier_block	hotplug_nb;
+#ifdef CONFIG_SMP
+	int			muxed_spi_workaround_irq;
+	atomic_t		remaining_irq_work;
+#endif
 };

 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index f7c65adaa428..e5c537b57f94 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -299,8 +299,6 @@ validate_group(struct perf_event *event)
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;

@@ -311,14 +309,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 	 * dereference.
 	 */
 	armpmu = *(void **)dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);

 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, armpmu);
+	ret = armpmu->handle_irq(irq, armpmu);
 	finish_clock = sched_clock();

 	perf_sample_event_took(finish_clock - start_clock);
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index dd9acc95ebc0..63f7b19a5ebe 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -59,6 +59,119 @@ int perf_num_counters(void)
 }
 EXPORT_SYMBOL_GPL(perf_num_counters);

+#ifdef CONFIG_SMP
+/*
+ * Workaround logic that is distributed to all cores if the PMU has only
+ * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
+ * job is to try to service the interrupt on the current CPU. It will
+ * also enable the IRQ again if all the other CPUs have already tried to
+ * service it.
+ */
+static void cpu_pmu_do_percpu_work(struct irq_work *w)
+{
+	struct pmu_hw_events *hw_events =
+	    container_of(w, struct pmu_hw_events, work);
+	struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
+	int count;
+
+	/* Ignore the return code, we can do nothing useful with it */
+	cpu_pmu->handle_irq(0, cpu_pmu);
+
+	count = atomic_dec_return(&cpu_pmu->remaining_irq_work);
+	if (count == 0)
+		enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+
+	/*
+	 * Recover the count. We warn if we perform any recovery because this
+	 * code is expected to be unreachable except in the case were we lose
+	 * a race during CPU hot unplug (see cpu_pmu_handle_irq_none).
+	 */
+	if (WARN_ON(unlikely(count < 0)))
+		atomic_set(&cpu_pmu->remaining_irq_work, 0);
+}
+
+/*
+ * Called when the main interrupt handler cannot determine the source
+ * of interrupt. It will deploy a workaround if we are running on an SMP
+ * platform with only a single muxed SPI.
+ *
+ * The workaround disables the interrupt and distributes irqwork to all
+ * other processors in the system. Hopefully one of them will clear the
+ * interrupt...
+ */
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	int cpu, count = CONFIG_NR_CPUS;
+
+	if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
+		return IRQ_NONE;
+
+	disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
+
+	/*
+	 * No worker cpu will decrement remaining_irq_work to zero whilst
+	 * we have added CONFIG_NR_CPUS to it (because the current CPU will
+	 * not have work assigned to it)
+	 */
+	atomic_add(count, &cpu_pmu->remaining_irq_work);
+
+	for_each_online_cpu(cpu) {
+		struct pmu_hw_events *hw_events =
+		    per_cpu_ptr(cpu_pmu->hw_events, cpu);
+
+		if (cpu == smp_processor_id())
+			continue;
+
+		/*
+		 * There is a short race between reading the online cpu mask in
+		 * the loop logic above and dispatching work to it below. It
+		 * is unlikely we will lose the race (because the code path to
+		 * offline a CPU is relatively long). If we were to lose the
+		 * race then the interrupt will not be re-enabled and perf will
+		 * be broken until stopped and restarted. This is
+		 * not-a-good-thing (tm) but is not as bad as trying to
+		 * schedule a task to re-distribute the interrupt.
+		 */
+		if (irq_work_queue_on(&hw_events->work, cpu))
+			count--;
+	}
+
+	if (atomic_sub_and_test(count, &cpu_pmu->remaining_irq_work))
+		enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+
+	return IRQ_HANDLED;
+}
+
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+
+	atomic_set(&cpu_pmu->remaining_irq_work, 0);
+	cpu_pmu->muxed_spi_workaround_irq = platform_get_irq(pmu_device, 0);
+
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+	cpu_pmu->muxed_spi_workaround_irq = 0;
+}
+#else /* CONFIG_SMP */
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+}
+
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	return IRQ_NONE;
+}
+#endif /* CONFIG_SMP */
+
 /* Include the PMU-specific implementations. */
 #include "perf_event_xscale.c"
 #include "perf_event_v6.c"
@@ -98,6 +211,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 			if (irq >= 0)
 				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, i));
 		}
+
+		cpu_pmu_muxed_spi_workaround_term(cpu_pmu);
 	}
 }

@@ -155,6 +270,16 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)

 			cpumask_set_cpu(i, &cpu_pmu->active_irqs);
 		}
+
+		/*
+		 * If we are running SMP and have only one interrupt source
+		 * then get ready to share that single irq among the cores.
+		 */
+		if (nr_cpu_ids > 1 && irqs == 1) {
+			err = cpu_pmu_muxed_spi_workaround_init(cpu_pmu);
+			if (err)
+				return err;
+		}
 	}

 	return 0;
@@ -201,6 +326,9 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
 		struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu);
 		raw_spin_lock_init(&events->pmu_lock);
 		events->percpu_pmu = cpu_pmu;
+#ifdef CONFIG_SMP
+		init_irq_work(&events->work, cpu_pmu_do_percpu_work);
+#endif
 	}

 	cpu_pmu->hw_events	= cpu_hw_events;
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 8993770c47de..0dd914c10803 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 * Did an overflow occur?
 	 */
 	if (!armv7_pmnc_has_overflowed(pmnc))
-		return IRQ_NONE;
+		return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);

 	/*
 	 * Handle the counter(s) overflow(s)
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 6f63954c8bde..917774999c5c 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -12,8 +12,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/mfd/abx500/ab8500.h>
@@ -23,7 +21,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/random.h>

-#include <asm/pmu.h>
 #include <asm/mach/map.h>

 #include "setup.h"
@@ -99,30 +96,6 @@ static void __init u8500_map_io(void)
 		iotable_init(u8500_io_desc, ARRAY_SIZE(u8500_io_desc));
 }

-/*
- * The PMU IRQ lines of two cores are wired together into a single interrupt.
- * Bounce the interrupt to the other core if it's not ours.
- */
-static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
-{
-	irqreturn_t ret = handler(irq, dev);
-	int other = !smp_processor_id();
-
-	if (ret == IRQ_NONE && cpu_online(other))
-		irq_set_affinity(irq, cpumask_of(other));
-
-	/*
-	 * We should be able to get away with the amount of IRQ_NONEs we give,
-	 * while still having the spurious IRQ detection code kick in if the
-	 * interrupt really starts hitting spuriously.
-	 */
-	return ret;
-}
-
-static struct arm_pmu_platdata db8500_pmu_platdata = {
-	.handle_irq		= db8500_pmu_handler,
-};
-
 static const char *db8500_read_soc_id(void)
 {
 	void __iomem *uid = __io_address(U8500_BB_UID_BASE);
@@ -143,8 +116,6 @@ static struct device * __init db8500_soc_device_init(void)
 }

 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
-	/* Requires call-back bindings. */
-	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &db8500_pmu_platdata),
 	/* Requires DMA bindings. */
 	OF_DEV_AUXDATA("stericsson,ux500-msp-i2s", 0x80123000,
 		       "ux500-msp-i2s.0", &msp0_platform_data),
--
1.9.3


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v5] arm: perf: Directly handle SMP platforms with one SPI
  2015-01-20 12:25 ` [PATCH v5] " Daniel Thompson
@ 2015-01-23 17:25   ` Mark Rutland
  2015-02-24 16:11     ` Daniel Thompson
  0 siblings, 1 reply; 20+ messages in thread
From: Mark Rutland @ 2015-01-23 17:25 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, Will Deacon, linux-arm-kernel, linux-kernel,
	Shawn Guo, Sascha Hauer, Peter Zijlstra, Paul Mackerras,
	Ingo Molnar, Arnaldo Carvalho de Melo, Thomas Gleixner,
	Lucas Stach, Linus Walleij, patches, linaro-kernel, John Stultz,
	Sumit Semwal

Hi Daniel,

On Tue, Jan 20, 2015 at 12:25:35PM +0000, Daniel Thompson wrote:
> Some ARM platforms mux the PMU interrupt of every core into a single
> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
> then it cannot be serviced and eventually, if you are lucky, the spurious
> irq detection might forcefully disable the interrupt.
> 
> On these SoCs it is not possible to determine which core raised the
> interrupt so workaround this issue by queuing irqwork on the other
> cores whenever the primary interrupt handler is unable to service the
> interrupt.
> 
> The u8500 platform has an alternative workaround that dynamically alters
> the affinity of the PMU interrupt. This workaround logic is no longer
> required so the original code is removed as is the hook it relied upon.
> 
> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI)
> using a simple soak, combined perf and CPU hotplug soak and using
> perf fuzzer's fast_repro.sh.
> 
> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>

[...]

> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
> index dd9acc95ebc0..63f7b19a5ebe 100644
> --- a/arch/arm/kernel/perf_event_cpu.c
> +++ b/arch/arm/kernel/perf_event_cpu.c
> @@ -59,6 +59,119 @@ int perf_num_counters(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_num_counters);
> 
> +#ifdef CONFIG_SMP
> +/*
> + * Workaround logic that is distributed to all cores if the PMU has only
> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
> + * job is to try to service the interrupt on the current CPU. It will
> + * also enable the IRQ again if all the other CPUs have already tried to
> + * service it.
> + */
> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
> +{
> +       struct pmu_hw_events *hw_events =
> +           container_of(w, struct pmu_hw_events, work);
> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
> +       int count;
> +
> +       /* Ignore the return code, we can do nothing useful with it */
> +       cpu_pmu->handle_irq(0, cpu_pmu);
> +
> +       count = atomic_dec_return(&cpu_pmu->remaining_irq_work);
> +       if (count == 0)
> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +
> +       /*
> +        * Recover the count. We warn if we perform any recovery because this
> +        * code is expected to be unreachable except in the case were we lose
> +        * a race during CPU hot unplug (see cpu_pmu_handle_irq_none).
> +        */
> +       if (WARN_ON(unlikely(count < 0)))
> +               atomic_set(&cpu_pmu->remaining_irq_work, 0);

I'm not sure I follow. For this case to occur, we have to have raised
some work on a CPU that we don't think we've raised it on (so that we
perform a decrement both on said CPU and via 'count' in
cpu_pmu_handle_irq_none).

So in cpu_pmu_handle_irq_none we'd have to see the CPU as online, and
irq_work_queue_on would have to succeed yet return false. I can only see
that it can do the oppposite (more on the below).

What am I missing?

> +}
> +
> +/*
> + * Called when the main interrupt handler cannot determine the source
> + * of interrupt. It will deploy a workaround if we are running on an SMP
> + * platform with only a single muxed SPI.
> + *
> + * The workaround disables the interrupt and distributes irqwork to all
> + * other processors in the system. Hopefully one of them will clear the
> + * interrupt...
> + */
> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
> +{
> +       int cpu, count = CONFIG_NR_CPUS;

I don't think 'count' is a great name here, as it's really vague (and
due to that, confusing).

You seem to be using it to count the CPUs we don't queue work on.
perhaps 'remaining' or something to that effect would be better.

> +
> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
> +               return IRQ_NONE;
> +
> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
> +
> +       /*
> +        * No worker cpu will decrement remaining_irq_work to zero whilst
> +        * we have added CONFIG_NR_CPUS to it (because the current CPU will
> +        * not have work assigned to it)
> +        */
> +       atomic_add(count, &cpu_pmu->remaining_irq_work);
> +
> +       for_each_online_cpu(cpu) {

There's a fundamental race here, given CPU hotplug isn't inhibited. CPUs
can come up or go down (even repeatedly so), while we read stale values
from the cpu_online_mask.

> +               struct pmu_hw_events *hw_events =
> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
> +
> +               if (cpu == smp_processor_id())
> +                       continue;
> +
> +               /*
> +                * There is a short race between reading the online cpu mask in
> +                * the loop logic above and dispatching work to it below. It
> +                * is unlikely we will lose the race (because the code path to
> +                * offline a CPU is relatively long). If we were to lose the
> +                * race then the interrupt will not be re-enabled and perf will
> +                * be broken until stopped and restarted. This is
> +                * not-a-good-thing (tm) but is not as bad as trying to
> +                * schedule a task to re-distribute the interrupt.
> +                */
> +               if (irq_work_queue_on(&hw_events->work, cpu))
> +                       count--;

The return value of irq_work_queue_on also cannot be relied upon if
hotplug is not inhibited. A CPU can go offline concurrently with
irq_work_queue_on, perhaps just before it calls
arch_send_call_function_single_ipi then unconditionally returns true.

So in that case, cpu_pmu->remaining_irq_work will be stuck above zero,
because we're waiting for an offline CPU to decrement it. You're dead
even with the hack in cpu_pmu_do_percpu_work.

Thanks,
Mark.

> +       }
> +
> +       if (atomic_sub_and_test(count, &cpu_pmu->remaining_irq_work))
> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +
> +       return IRQ_HANDLED;
> +}

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: Re: [PATCH v5] arm: perf: Directly handle SMP platforms with one SPI
  2015-01-23 17:25   ` Mark Rutland
@ 2015-02-24 16:11     ` Daniel Thompson
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Thompson @ 2015-02-24 16:11 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Russell King, Will Deacon, linux-arm-kernel, linux-kernel, Shawn Guo

On 23/01/15 17:25, Mark Rutland wrote:
> Hi Daniel,
> 
> On Tue, Jan 20, 2015 at 12:25:35PM +0000, Daniel Thompson wrote:
>> Some ARM platforms mux the PMU interrupt of every core into a single
>> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
>> then it cannot be serviced and eventually, if you are lucky, the spurious
>> irq detection might forcefully disable the interrupt.
>>
>> On these SoCs it is not possible to determine which core raised the
>> interrupt so workaround this issue by queuing irqwork on the other
>> cores whenever the primary interrupt handler is unable to service the
>> interrupt.
>>
>> The u8500 platform has an alternative workaround that dynamically alters
>> the affinity of the PMU interrupt. This workaround logic is no longer
>> required so the original code is removed as is the hook it relied upon.
>>
>> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI)
>> using a simple soak, combined perf and CPU hotplug soak and using
>> perf fuzzer's fast_repro.sh.
>>
>> Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
> 
> [...]

Thanks for the review and sorry it has taken me so long to reply. Had to
focus on other things for a while.


>> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
>> index dd9acc95ebc0..63f7b19a5ebe 100644
>> --- a/arch/arm/kernel/perf_event_cpu.c
>> +++ b/arch/arm/kernel/perf_event_cpu.c
>> @@ -59,6 +59,119 @@ int perf_num_counters(void)
>>  }
>>  EXPORT_SYMBOL_GPL(perf_num_counters);
>>
>> +#ifdef CONFIG_SMP
>> +/*
>> + * Workaround logic that is distributed to all cores if the PMU has only
>> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
>> + * job is to try to service the interrupt on the current CPU. It will
>> + * also enable the IRQ again if all the other CPUs have already tried to
>> + * service it.
>> + */
>> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
>> +{
>> +       struct pmu_hw_events *hw_events =
>> +           container_of(w, struct pmu_hw_events, work);
>> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
>> +       int count;
>> +
>> +       /* Ignore the return code, we can do nothing useful with it */
>> +       cpu_pmu->handle_irq(0, cpu_pmu);
>> +
>> +       count = atomic_dec_return(&cpu_pmu->remaining_irq_work);
>> +       if (count == 0)
>> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>> +
>> +       /*
>> +        * Recover the count. We warn if we perform any recovery because this
>> +        * code is expected to be unreachable except in the case were we lose
>> +        * a race during CPU hot unplug (see cpu_pmu_handle_irq_none).
>> +        */
>> +       if (WARN_ON(unlikely(count < 0)))
>> +               atomic_set(&cpu_pmu->remaining_irq_work, 0);
> 
> I'm not sure I follow. For this case to occur, we have to have raised
> some work on a CPU that we don't think we've raised it on (so that we
> perform a decrement both on said CPU and via 'count' in
> cpu_pmu_handle_irq_none).
> 
> So in cpu_pmu_handle_irq_none we'd have to see the CPU as online, and
> irq_work_queue_on would have to succeed yet return false. I can only see
> that it can do the oppposite (more on the below).
> 
> What am I missing?

I'll answer this just in case you are interested... however I plan to
rewrite this code and remove the above.

It is (or at least would have been) possible to stop and restart perf
without waking the other CPU. This would reset the counter allowing perf
to run again but leaves the irqwork pending on the stopped CPU... and I
couldn't find anything in the stop/restart logic that would clear the
irqwork meaning the workaround logic would run when the CPU comes alive
again.


>> +}
>> +
>> +/*
>> + * Called when the main interrupt handler cannot determine the source
>> + * of interrupt. It will deploy a workaround if we are running on an SMP
>> + * platform with only a single muxed SPI.
>> + *
>> + * The workaround disables the interrupt and distributes irqwork to all
>> + * other processors in the system. Hopefully one of them will clear the
>> + * interrupt...
>> + */
>> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
>> +{
>> +       int cpu, count = CONFIG_NR_CPUS;
> 
> I don't think 'count' is a great name here, as it's really vague (and
> due to that, confusing).
> 
> You seem to be using it to count the CPUs we don't queue work on.
> perhaps 'remaining' or something to that effect would be better.

The whole concept of the weird "add a big number and then take it away
at the end" (which is why I didn't want to call it remaining) will be
removed in the next revision of the patch.


>> +
>> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
>> +               return IRQ_NONE;
>> +
>> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
>> +
>> +       /*
>> +        * No worker cpu will decrement remaining_irq_work to zero whilst
>> +        * we have added CONFIG_NR_CPUS to it (because the current CPU will
>> +        * not have work assigned to it)
>> +        */
>> +       atomic_add(count, &cpu_pmu->remaining_irq_work);
>> +
>> +       for_each_online_cpu(cpu) {
> 
> There's a fundamental race here, given CPU hotplug isn't inhibited. CPUs
> can come up or go down (even repeatedly so), while we read stale values
> from the cpu_online_mask.

Quite so.

The ambition of the silly workaround (your commands above *and* below)
was to make the race (almost) benign. It was, on reflection, not a good
way to try and solve the problem.

I think I've come up with a way to make this code properly safe even
without having to take the hotplug lock (because taking the hotplug lock
would require us to deploy the workaround from a task). It means adding
a little extra code to the hotplug notifier code but end the end it
should come out OK.


Daniel.

> 
>> +               struct pmu_hw_events *hw_events =
>> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
>> +
>> +               if (cpu == smp_processor_id())
>> +                       continue;
>> +
>> +               /*
>> +                * There is a short race between reading the online cpu mask in
>> +                * the loop logic above and dispatching work to it below. It
>> +                * is unlikely we will lose the race (because the code path to
>> +                * offline a CPU is relatively long). If we were to lose the
>> +                * race then the interrupt will not be re-enabled and perf will
>> +                * be broken until stopped and restarted. This is
>> +                * not-a-good-thing (tm) but is not as bad as trying to
>> +                * schedule a task to re-distribute the interrupt.
>> +                */
>> +               if (irq_work_queue_on(&hw_events->work, cpu))
>> +                       count--;
> 
> The return value of irq_work_queue_on also cannot be relied upon if
> hotplug is not inhibited. A CPU can go offline concurrently with
> irq_work_queue_on, perhaps just before it calls
> arch_send_call_function_single_ipi then unconditionally returns true.
> 
> So in that case, cpu_pmu->remaining_irq_work will be stuck above zero,
> because we're waiting for an offline CPU to decrement it. You're dead
> even with the hack in cpu_pmu_do_percpu_work.




^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v6] arm: perf: Directly handle SMP platforms with one SPI
  2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
                   ` (4 preceding siblings ...)
  2015-01-20 12:25 ` [PATCH v5] " Daniel Thompson
@ 2015-03-04 13:25 ` Daniel Thompson
  2015-03-31 16:20   ` Will Deacon
  2015-03-31 17:08   ` Mark Rutland
  5 siblings, 2 replies; 20+ messages in thread
From: Daniel Thompson @ 2015-03-04 13:25 UTC (permalink / raw)
  To: Russell King, Will Deacon
  Cc: Daniel Thompson, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

Some ARM platforms mux the PMU interrupt of every core into a single
SPI. On such platforms if the PMU of any core except 0 raises an interrupt
then it cannot be serviced and eventually, if you are lucky, the spurious
irq detection might forcefully disable the interrupt.

On these SoCs it is not possible to determine which core raised the
interrupt so workaround this issue by queuing irqwork on the other
cores whenever the primary interrupt handler is unable to service the
interrupt.

The u8500 platform has an alternative workaround that dynamically alters
the affinity of the PMU interrupt. This workaround logic is no longer
required so the original code is removed as is the hook it relied upon.

Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI)
using a simple soak, combined perf and CPU hotplug soak and using
perf fuzzer's fast_repro.sh.

Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
---

Notes:
    v2 was tested on u8500 (thanks to Linus Walleij). The latest patch
    doesn't change the nature of the workaround itself but there has been
    substantial churn in the logic to decide when it can safely be deployed.
    Thus the changes were sufficient for me not to
    preserve the Tested-By:.
    
    v6:
     * Redesigned the code that decides if it is safe to deploy the
       workaround (acting on the review by Mark Rutland). Code should no
       longer race during hot unplug; previous patches sought to make the
       hot unplug race benign and the old approach had flaws and even if
       it could be made correct was tortuously hard to review.
    
    v5:
     * Removed the work queue nonsense; being completely race-free requires
       us to take a mutex or avoid dispatch from interrupt (Will Deacon).
       Replacement code can potentially race with a CPU hot unplug however
       it is careful to minimise exposure, to mitigate harmful effects and
       has fairly prominent comments.
    
    v4:
     * Ripped out the logic that tried to preserve the operation of the
       spurious interrupt detector. It was complex and not really needed
       (Will Deacon).
     * Removed a redundant memory barrier and added a comment explaining
       why it is not needed (Will Deacon).
     * Made fully safe w.r.t. hotplug by falling back to a work queue
       if there is a hotplug operation in flight when the PMU interrupt
       comes in (Will Deacon). The work queue code paths have been tested
       synthetically (by changing the if condition).
     * Posted the correct, as in compilable and tested, version of the code
       (Will Deacon).
    
    v3:
     * Removed function pointer indirection when deploying workaround code
       and reorganise the code accordingly (Mark Rutland).
     * Move the workaround state tracking into the existing percpu data
       structure (Mark Rutland).
     * Renamed cret to percpu_ret and rewrote the comment describing the
       purpose of this variable (Mark Rutland).
     * Copy the cpu_online_mask and use that to act on a consistent set of
       cpus throughout the workaround (Mark Rutland).
     * Changed "single_irq" to "muxed_spi" to more explicitly describe
       the problem.
    
    v2:
     * Fixed build problems on systems without SMP.
    
    v1:
     * Thanks to Lucas Stach, Russell King and Thomas Gleixner for
       critiquing an older, completely different way to tackle the
       same problem.
    

 arch/arm/include/asm/pmu.h       |  12 +++
 arch/arm/kernel/perf_event.c     |   9 +-
 arch/arm/kernel/perf_event_cpu.c | 179 +++++++++++++++++++++++++++++++++++----
 arch/arm/kernel/perf_event_v7.c  |   2 +-
 arch/arm/mach-ux500/cpu-db8500.c |  29 -------
 5 files changed, 178 insertions(+), 53 deletions(-)

diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
index b1596bd59129..dfef7904b790 100644
--- a/arch/arm/include/asm/pmu.h
+++ b/arch/arm/include/asm/pmu.h
@@ -87,6 +87,14 @@ struct pmu_hw_events {
 	 * already have to allocate this struct per cpu.
 	 */
 	struct arm_pmu		*percpu_pmu;
+
+#ifdef CONFIG_SMP
+	/*
+	 * This is used to schedule workaround logic on platforms where all
+	 * the PMUs are attached to a single SPI.
+	 */
+	struct irq_work work;
+#endif
 };

 struct arm_pmu {
@@ -117,6 +125,10 @@ struct arm_pmu {
 	struct platform_device	*plat_device;
 	struct pmu_hw_events	__percpu *hw_events;
 	struct notifier_block	hotplug_nb;
+#ifdef CONFIG_SMP
+	int			muxed_spi_workaround_irq;
+	atomic_t		remaining_irq_work;
+#endif
 };

 #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
index 557e128e4df0..e3fc1a0ce969 100644
--- a/arch/arm/kernel/perf_event.c
+++ b/arch/arm/kernel/perf_event.c
@@ -305,8 +305,6 @@ validate_group(struct perf_event *event)
 static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 {
 	struct arm_pmu *armpmu;
-	struct platform_device *plat_device;
-	struct arm_pmu_platdata *plat;
 	int ret;
 	u64 start_clock, finish_clock;

@@ -317,14 +315,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
 	 * dereference.
 	 */
 	armpmu = *(void **)dev;
-	plat_device = armpmu->plat_device;
-	plat = dev_get_platdata(&plat_device->dev);

 	start_clock = sched_clock();
-	if (plat && plat->handle_irq)
-		ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
-	else
-		ret = armpmu->handle_irq(irq, armpmu);
+	ret = armpmu->handle_irq(irq, armpmu);
 	finish_clock = sched_clock();

 	perf_sample_event_took(finish_clock - start_clock);
diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
index 61b53c46edfa..d5bbd79abd4c 100644
--- a/arch/arm/kernel/perf_event_cpu.c
+++ b/arch/arm/kernel/perf_event_cpu.c
@@ -59,6 +59,116 @@ int perf_num_counters(void)
 }
 EXPORT_SYMBOL_GPL(perf_num_counters);

+#ifdef CONFIG_SMP
+
+static cpumask_t down_prepare_cpu_mask;
+static DEFINE_SPINLOCK(down_prepare_cpu_lock);
+
+/*
+ * Workaround logic that is distributed to all cores if the PMU has only
+ * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
+ * job is to try to service the interrupt on the current CPU. It will
+ * also enable the IRQ again if all the other CPUs have already tried to
+ * service it.
+ */
+static void cpu_pmu_do_percpu_work(struct irq_work *w)
+{
+	struct pmu_hw_events *hw_events =
+	    container_of(w, struct pmu_hw_events, work);
+	struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
+
+	/* Ignore the return code, we can do nothing useful with it */
+	(void) cpu_pmu->handle_irq(0, cpu_pmu);
+
+	if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
+		enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+}
+
+/*
+ * Workaround for systems where all PMU interrupts are targeting a
+ * single SPI.
+ *
+ * The workaround will disable the interrupt and distribute irqwork to all
+ * the other processors in the system. Hopefully one of them will clear the
+ * interrupt...
+ *
+ * The workaround is only deployed when all PMU interrupts are aimed
+ * at a single core. As a result the workaround is never re-entered
+ * making it safe for us to use static data to maintain state.
+ */
+static void cpu_pmu_deploy_muxed_spi_workaround(struct arm_pmu *cpu_pmu)
+{
+	static cpumask_t irqwork_mask;
+	int cpu;
+
+	disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
+	spin_lock(&down_prepare_cpu_lock);
+
+	/*
+	 * Combining cpu_online_mask and down_prepare_cpu_mask gives
+	 * us the CPUs that are currently online and cannot die until
+	 * we release down_prepare_cpu_lock.
+	 */
+	cpumask_andnot(&irqwork_mask, cpu_online_mask, &down_prepare_cpu_mask);
+	cpumask_clear_cpu(smp_processor_id(), &irqwork_mask);
+	atomic_add(cpumask_weight(&irqwork_mask), &cpu_pmu->remaining_irq_work);
+
+	for_each_cpu(cpu, &irqwork_mask) {
+		struct pmu_hw_events *hw_events =
+		    per_cpu_ptr(cpu_pmu->hw_events, cpu);
+
+		if (!irq_work_queue_on(&hw_events->work, cpu))
+			if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
+				enable_irq(cpu_pmu->muxed_spi_workaround_irq);
+	}
+
+	spin_unlock(&down_prepare_cpu_lock);
+}
+
+/*
+ * Called when the main interrupt handler cannot determine the source
+ * of interrupt. It will deploy a workaround if we are running on an SMP
+ * platform with only a single muxed SPI.
+ */
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
+		return IRQ_NONE;
+
+	cpu_pmu_deploy_muxed_spi_workaround(cpu_pmu);
+	return IRQ_HANDLED;
+}
+
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	struct platform_device *pmu_device = cpu_pmu->plat_device;
+
+	atomic_set(&cpu_pmu->remaining_irq_work, 0);
+	cpu_pmu->muxed_spi_workaround_irq = platform_get_irq(pmu_device, 0);
+
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+	cpu_pmu->muxed_spi_workaround_irq = 0;
+}
+#else /* CONFIG_SMP */
+static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
+{
+	return 0;
+}
+
+static void cpu_pmu_muxed_spi_workaround_term(struct arm_pmu *cpu_pmu)
+{
+}
+
+static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
+{
+	return IRQ_NONE;
+}
+#endif /* CONFIG_SMP */
+
 /* Include the PMU-specific implementations. */
 #include "perf_event_xscale.c"
 #include "perf_event_v6.c"
@@ -98,6 +208,8 @@ static void cpu_pmu_free_irq(struct arm_pmu *cpu_pmu)
 			if (irq >= 0)
 				free_irq(irq, per_cpu_ptr(&hw_events->percpu_pmu, i));
 		}
+
+		cpu_pmu_muxed_spi_workaround_term(cpu_pmu);
 	}
 }

@@ -155,31 +267,65 @@ static int cpu_pmu_request_irq(struct arm_pmu *cpu_pmu, irq_handler_t handler)

 			cpumask_set_cpu(i, &cpu_pmu->active_irqs);
 		}
+
+		/*
+		 * If we are running SMP and have only one interrupt source
+		 * then get ready to share that single irq among the cores.
+		 */
+		if (nr_cpu_ids > 1 && irqs == 1) {
+			err = cpu_pmu_muxed_spi_workaround_init(cpu_pmu);
+			if (err)
+				return err;
+		}
 	}

 	return 0;
 }

-/*
- * PMU hardware loses all context when a CPU goes offline.
- * When a CPU is hotplugged back in, since some hardware registers are
- * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
- * junk values out of them.
- */
 static int cpu_pmu_notify(struct notifier_block *b, unsigned long action,
 			  void *hcpu)
 {
 	struct arm_pmu *pmu = container_of(b, struct arm_pmu, hotplug_nb);
+	long cpu = (long)hcpu;
+	unsigned long flags;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	/*
+	 * PMU hardware loses all context when a CPU goes offline.
+	 * When a CPU is hotplugged back in, since some hardware registers are
+	 * UNKNOWN at reset, the PMU must be explicitly reset to avoid reading
+	 * junk values out of them.
+	 */
+	case CPU_STARTING:
+		if (pmu->reset)
+			pmu->reset(pmu);
+		else
+			return NOTIFY_DONE;
+
+		return NOTIFY_OK;
+
+#ifdef CONFIG_SMP
+	/*
+	 * The workaround logic in cpu_pmu_handle_irq_none requires us
+	 * to keep track of CPUs that may be about to die so we can
+	 * avoid sending them irqwork.
+	 */
+	case CPU_DOWN_PREPARE:
+		spin_lock_irqsave(&down_prepare_cpu_lock, flags);
+		cpumask_set_cpu(cpu, &down_prepare_cpu_mask);
+		spin_unlock_irqrestore(&down_prepare_cpu_lock, flags);
+		return NOTIFY_OK;
+
+	case CPU_DOWN_FAILED:
+	case CPU_DEAD:
+		spin_lock_irqsave(&down_prepare_cpu_lock, flags);
+		cpumask_clear_cpu(cpu, &down_prepare_cpu_mask);
+		spin_unlock_irqrestore(&down_prepare_cpu_lock, flags);
+		return NOTIFY_OK;
+#endif
+	}

-	if ((action & ~CPU_TASKS_FROZEN) != CPU_STARTING)
-		return NOTIFY_DONE;
-
-	if (pmu->reset)
-		pmu->reset(pmu);
-	else
-		return NOTIFY_DONE;
-
-	return NOTIFY_OK;
+	return NOTIFY_DONE;
 }

 static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
@@ -201,6 +347,9 @@ static int cpu_pmu_init(struct arm_pmu *cpu_pmu)
 		struct pmu_hw_events *events = per_cpu_ptr(cpu_hw_events, cpu);
 		raw_spin_lock_init(&events->pmu_lock);
 		events->percpu_pmu = cpu_pmu;
+#ifdef CONFIG_SMP
+		init_irq_work(&events->work, cpu_pmu_do_percpu_work);
+#endif
 	}

 	cpu_pmu->hw_events	= cpu_hw_events;
diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
index 8993770c47de..0dd914c10803 100644
--- a/arch/arm/kernel/perf_event_v7.c
+++ b/arch/arm/kernel/perf_event_v7.c
@@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
 	 * Did an overflow occur?
 	 */
 	if (!armv7_pmnc_has_overflowed(pmnc))
-		return IRQ_NONE;
+		return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);

 	/*
 	 * Handle the counter(s) overflow(s)
diff --git a/arch/arm/mach-ux500/cpu-db8500.c b/arch/arm/mach-ux500/cpu-db8500.c
index 6f63954c8bde..917774999c5c 100644
--- a/arch/arm/mach-ux500/cpu-db8500.c
+++ b/arch/arm/mach-ux500/cpu-db8500.c
@@ -12,8 +12,6 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/amba/bus.h>
-#include <linux/interrupt.h>
-#include <linux/irq.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
 #include <linux/mfd/abx500/ab8500.h>
@@ -23,7 +21,6 @@
 #include <linux/regulator/machine.h>
 #include <linux/random.h>

-#include <asm/pmu.h>
 #include <asm/mach/map.h>

 #include "setup.h"
@@ -99,30 +96,6 @@ static void __init u8500_map_io(void)
 		iotable_init(u8500_io_desc, ARRAY_SIZE(u8500_io_desc));
 }

-/*
- * The PMU IRQ lines of two cores are wired together into a single interrupt.
- * Bounce the interrupt to the other core if it's not ours.
- */
-static irqreturn_t db8500_pmu_handler(int irq, void *dev, irq_handler_t handler)
-{
-	irqreturn_t ret = handler(irq, dev);
-	int other = !smp_processor_id();
-
-	if (ret == IRQ_NONE && cpu_online(other))
-		irq_set_affinity(irq, cpumask_of(other));
-
-	/*
-	 * We should be able to get away with the amount of IRQ_NONEs we give,
-	 * while still having the spurious IRQ detection code kick in if the
-	 * interrupt really starts hitting spuriously.
-	 */
-	return ret;
-}
-
-static struct arm_pmu_platdata db8500_pmu_platdata = {
-	.handle_irq		= db8500_pmu_handler,
-};
-
 static const char *db8500_read_soc_id(void)
 {
 	void __iomem *uid = __io_address(U8500_BB_UID_BASE);
@@ -143,8 +116,6 @@ static struct device * __init db8500_soc_device_init(void)
 }

 static struct of_dev_auxdata u8500_auxdata_lookup[] __initdata = {
-	/* Requires call-back bindings. */
-	OF_DEV_AUXDATA("arm,cortex-a9-pmu", 0, "arm-pmu", &db8500_pmu_platdata),
 	/* Requires DMA bindings. */
 	OF_DEV_AUXDATA("stericsson,ux500-msp-i2s", 0x80123000,
 		       "ux500-msp-i2s.0", &msp0_platform_data),
--
2.1.0


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v6] arm: perf: Directly handle SMP platforms with one SPI
  2015-03-04 13:25 ` [PATCH v6] " Daniel Thompson
@ 2015-03-31 16:20   ` Will Deacon
  2015-04-02 15:49     ` Daniel Thompson
  2015-03-31 17:08   ` Mark Rutland
  1 sibling, 1 reply; 20+ messages in thread
From: Will Deacon @ 2015-03-31 16:20 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

Hi Daniel,

On Wed, Mar 04, 2015 at 01:25:45PM +0000, Daniel Thompson wrote:
> Some ARM platforms mux the PMU interrupt of every core into a single
> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
> then it cannot be serviced and eventually, if you are lucky, the spurious
> irq detection might forcefully disable the interrupt.
> 
> On these SoCs it is not possible to determine which core raised the
> interrupt so workaround this issue by queuing irqwork on the other
> cores whenever the primary interrupt handler is unable to service the
> interrupt.
> 
> The u8500 platform has an alternative workaround that dynamically alters
> the affinity of the PMU interrupt. This workaround logic is no longer
> required so the original code is removed as is the hook it relied upon.
> 
> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI)
> using a simple soak, combined perf and CPU hotplug soak and using
> perf fuzzer's fast_repro.sh.

[...]

> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> index b1596bd59129..dfef7904b790 100644
> --- a/arch/arm/include/asm/pmu.h
> +++ b/arch/arm/include/asm/pmu.h
> @@ -87,6 +87,14 @@ struct pmu_hw_events {
>          * already have to allocate this struct per cpu.
>          */
>         struct arm_pmu          *percpu_pmu;
> +
> +#ifdef CONFIG_SMP
> +       /*
> +        * This is used to schedule workaround logic on platforms where all
> +        * the PMUs are attached to a single SPI.
> +        */
> +       struct irq_work work;
> +#endif
>  };
> 
>  struct arm_pmu {
> @@ -117,6 +125,10 @@ struct arm_pmu {
>         struct platform_device  *plat_device;
>         struct pmu_hw_events    __percpu *hw_events;
>         struct notifier_block   hotplug_nb;
> +#ifdef CONFIG_SMP
> +       int                     muxed_spi_workaround_irq;
> +       atomic_t                remaining_irq_work;
> +#endif
>  };
> 
>  #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
> index 557e128e4df0..e3fc1a0ce969 100644
> --- a/arch/arm/kernel/perf_event.c
> +++ b/arch/arm/kernel/perf_event.c
> @@ -305,8 +305,6 @@ validate_group(struct perf_event *event)
>  static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>  {
>         struct arm_pmu *armpmu;
> -       struct platform_device *plat_device;
> -       struct arm_pmu_platdata *plat;
>         int ret;
>         u64 start_clock, finish_clock;
> 
> @@ -317,14 +315,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>          * dereference.
>          */
>         armpmu = *(void **)dev;
> -       plat_device = armpmu->plat_device;
> -       plat = dev_get_platdata(&plat_device->dev);
> 
>         start_clock = sched_clock();
> -       if (plat && plat->handle_irq)
> -               ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
> -       else
> -               ret = armpmu->handle_irq(irq, armpmu);
> +       ret = armpmu->handle_irq(irq, armpmu);
>         finish_clock = sched_clock();
> 
>         perf_sample_event_took(finish_clock - start_clock);
> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
> index 61b53c46edfa..d5bbd79abd4c 100644
> --- a/arch/arm/kernel/perf_event_cpu.c
> +++ b/arch/arm/kernel/perf_event_cpu.c
> @@ -59,6 +59,116 @@ int perf_num_counters(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_num_counters);
> 
> +#ifdef CONFIG_SMP
> +
> +static cpumask_t down_prepare_cpu_mask;
> +static DEFINE_SPINLOCK(down_prepare_cpu_lock);
> +
> +/*
> + * Workaround logic that is distributed to all cores if the PMU has only
> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
> + * job is to try to service the interrupt on the current CPU. It will
> + * also enable the IRQ again if all the other CPUs have already tried to
> + * service it.
> + */
> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
> +{
> +       struct pmu_hw_events *hw_events =
> +           container_of(w, struct pmu_hw_events, work);
> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
> +
> +       /* Ignore the return code, we can do nothing useful with it */
> +       (void) cpu_pmu->handle_irq(0, cpu_pmu);
> +
> +       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +}
> +
> +/*
> + * Workaround for systems where all PMU interrupts are targeting a
> + * single SPI.
> + *
> + * The workaround will disable the interrupt and distribute irqwork to all
> + * the other processors in the system. Hopefully one of them will clear the
> + * interrupt...
> + *
> + * The workaround is only deployed when all PMU interrupts are aimed
> + * at a single core. As a result the workaround is never re-entered
> + * making it safe for us to use static data to maintain state.
> + */
> +static void cpu_pmu_deploy_muxed_spi_workaround(struct arm_pmu *cpu_pmu)
> +{
> +       static cpumask_t irqwork_mask;
> +       int cpu;
> +
> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
> +       spin_lock(&down_prepare_cpu_lock);
> +
> +       /*
> +        * Combining cpu_online_mask and down_prepare_cpu_mask gives
> +        * us the CPUs that are currently online and cannot die until
> +        * we release down_prepare_cpu_lock.
> +        */
> +       cpumask_andnot(&irqwork_mask, cpu_online_mask, &down_prepare_cpu_mask);
> +       cpumask_clear_cpu(smp_processor_id(), &irqwork_mask);
> +       atomic_add(cpumask_weight(&irqwork_mask), &cpu_pmu->remaining_irq_work);

AFAICT, this is a hack to avoid get_online_cpus (which can sleep) from irq
context? Is there no way we can do try_get_online_cpus, and just return
IRQ_NONE if that fails? At some point, the hotplug operation will complete
and we'll be able to service the pending interrupt. I think that would allow
us to kill the down_prepare_cpu_lock.

Will

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v6] arm: perf: Directly handle SMP platforms with one SPI
  2015-03-04 13:25 ` [PATCH v6] " Daniel Thompson
  2015-03-31 16:20   ` Will Deacon
@ 2015-03-31 17:08   ` Mark Rutland
  2015-04-02 15:27     ` Daniel Thompson
  1 sibling, 1 reply; 20+ messages in thread
From: Mark Rutland @ 2015-03-31 17:08 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, Will Deacon, linux-arm-kernel, linux-kernel,
	Shawn Guo, Sascha Hauer, Peter Zijlstra, Paul Mackerras,
	Ingo Molnar, Arnaldo Carvalho de Melo, Thomas Gleixner,
	Lucas Stach, Linus Walleij, patches, linaro-kernel, John Stultz,
	Sumit Semwal

Hi Daniel,

I'd very much like to see us converge on a solution for this soon. The
existing hack is getting in the way of other rework of the arm/arm64
perf code.

I think the approach this patch takes should work, but there are some
parts that can be cleaned up (hopefully mostly cosmetic). Unfortunately
I don't seem to have a relevant platform for testing on.

[...]

> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
> index b1596bd59129..dfef7904b790 100644
> --- a/arch/arm/include/asm/pmu.h
> +++ b/arch/arm/include/asm/pmu.h
> @@ -87,6 +87,14 @@ struct pmu_hw_events {
>          * already have to allocate this struct per cpu.
>          */
>         struct arm_pmu          *percpu_pmu;
> +
> +#ifdef CONFIG_SMP
> +       /*
> +        * This is used to schedule workaround logic on platforms where all
> +        * the PMUs are attached to a single SPI.
> +        */
> +       struct irq_work work;
> +#endif
>  };
> 
>  struct arm_pmu {
> @@ -117,6 +125,10 @@ struct arm_pmu {
>         struct platform_device  *plat_device;
>         struct pmu_hw_events    __percpu *hw_events;
>         struct notifier_block   hotplug_nb;
> +#ifdef CONFIG_SMP
> +       int                     muxed_spi_workaround_irq;

There's nothing workaround specific about this IRQ; it's just the only
IRQ.

I think we should just pre-parse all the IRQs into a list at probe time,
regardless of SMP or the workaround. Then we can just grab the first
(and only) interrupt from the list in the workaround paths, and
otherwise just iterate over the list.

> +       atomic_t                remaining_irq_work;

Perhaps remaining_work_irqs? That would make it clear that this is a
counter rather than a boolean or enumeration. We could s/work/fake/ or
something to that effect.

> @@ -317,14 +315,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>          * dereference.
>          */
>         armpmu = *(void **)dev;
> -       plat_device = armpmu->plat_device;
> -       plat = dev_get_platdata(&plat_device->dev);
> 
>         start_clock = sched_clock();
> -       if (plat && plat->handle_irq)
> -               ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
> -       else
> -               ret = armpmu->handle_irq(irq, armpmu);
> +       ret = armpmu->handle_irq(irq, armpmu);
>         finish_clock = sched_clock();
> 
>         perf_sample_event_took(finish_clock - start_clock);

It's nice to see the plat stuff disappearing!

> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
> index 61b53c46edfa..d5bbd79abd4c 100644
> --- a/arch/arm/kernel/perf_event_cpu.c
> +++ b/arch/arm/kernel/perf_event_cpu.c
> @@ -59,6 +59,116 @@ int perf_num_counters(void)
>  }
>  EXPORT_SYMBOL_GPL(perf_num_counters);
> 
> +#ifdef CONFIG_SMP
> +
> +static cpumask_t down_prepare_cpu_mask;
> +static DEFINE_SPINLOCK(down_prepare_cpu_lock);

I think the names here are a little misleading, because we care about
the whole window from CPU_DOWN_PREPARE to CPU_DEAD (or DOWN_FAILED). I
think these would be clearer with s/down_prepare_cpu/dying_cpu/ (though
admittedly that could also be confused with CPU_DYING).

> +
> +/*
> + * Workaround logic that is distributed to all cores if the PMU has only
> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
> + * job is to try to service the interrupt on the current CPU. It will
> + * also enable the IRQ again if all the other CPUs have already tried to
> + * service it.
> + */
> +static void cpu_pmu_do_percpu_work(struct irq_work *w)

Perhaps do_muxed_irq_work?

> +{
> +       struct pmu_hw_events *hw_events =
> +           container_of(w, struct pmu_hw_events, work);
> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
> +
> +       /* Ignore the return code, we can do nothing useful with it */
> +       (void) cpu_pmu->handle_irq(0, cpu_pmu);

Nit: no space after a cast please.

Do we need the void cast here? Does your toolchain complain?

> +
> +       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);

I'm a little uneasy about calling enable_irq here, given we're in IRQ
context. While it doesn't look we'd be wired up through an irqchip with
irq_bus_lock or irq_bus_sync_unlock, I'm not sure how safe it is to rely
on that being the only thing that matters.

It would be nice to hear from someone familiar with the IRQ code on that
respect.

> +}
> +
> +/*
> + * Workaround for systems where all PMU interrupts are targeting a
> + * single SPI.
> + *
> + * The workaround will disable the interrupt and distribute irqwork to all
> + * the other processors in the system. Hopefully one of them will clear the
> + * interrupt...
> + *
> + * The workaround is only deployed when all PMU interrupts are aimed
> + * at a single core. As a result the workaround is never re-entered
> + * making it safe for us to use static data to maintain state.
> + */
> +static void cpu_pmu_deploy_muxed_spi_workaround(struct arm_pmu *cpu_pmu)

Perhaps distribute_muxed_irq?

> +{
> +       static cpumask_t irqwork_mask;
> +       int cpu;
> +
> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
> +       spin_lock(&down_prepare_cpu_lock);
> +
> +       /*
> +        * Combining cpu_online_mask and down_prepare_cpu_mask gives
> +        * us the CPUs that are currently online and cannot die until
> +        * we release down_prepare_cpu_lock.
> +        */
> +       cpumask_andnot(&irqwork_mask, cpu_online_mask, &down_prepare_cpu_mask);
> +       cpumask_clear_cpu(smp_processor_id(), &irqwork_mask);
> +       atomic_add(cpumask_weight(&irqwork_mask), &cpu_pmu->remaining_irq_work);
> +
> +       for_each_cpu(cpu, &irqwork_mask) {
> +               struct pmu_hw_events *hw_events =
> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
> +
> +               if (!irq_work_queue_on(&hw_events->work, cpu))
> +                       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
> +                               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
> +       }
> +
> +       spin_unlock(&down_prepare_cpu_lock);
> +}

I think this works, given the notifier logic and hotplug_cfd flushing
any pending irq_work items.

> +
> +/*
> + * Called when the main interrupt handler cannot determine the source
> + * of interrupt. It will deploy a workaround if we are running on an SMP
> + * platform with only a single muxed SPI.
> + */
> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
> +{
> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
> +               return IRQ_NONE;

This is somewhat opaque.

I'd rather just have a flag to determine when we need to do any special
handling for the muxed case (or better, swizzle the irq handler to a
wrapper that pings the other CPUs and calls the usual handler).

[...]

> +static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
> +{
> +       struct platform_device *pmu_device = cpu_pmu->plat_device;
> +
> +       atomic_set(&cpu_pmu->remaining_irq_work, 0);

Then we can move this atomic_set into the usual init path for SMP and
get rid of these init/term functions.

[...]

> diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
> index 8993770c47de..0dd914c10803 100644
> --- a/arch/arm/kernel/perf_event_v7.c
> +++ b/arch/arm/kernel/perf_event_v7.c
> @@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
>          * Did an overflow occur?
>          */
>         if (!armv7_pmnc_has_overflowed(pmnc))
> -               return IRQ_NONE;
> +               return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);

Won't this leave samples skewed towards the CPU the interrupt is affine
to? If you're counting something like cycles with a short enough period
(and therefore effectively always have something to handle on the local
CPU), we might never ping the other CPUs.

I think we always need to ping the other CPUs, regardless of whether
there was something to handle on this CPU.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v6] arm: perf: Directly handle SMP platforms with one SPI
  2015-03-31 17:08   ` Mark Rutland
@ 2015-04-02 15:27     ` Daniel Thompson
  2015-04-02 16:44       ` Mark Rutland
  0 siblings, 1 reply; 20+ messages in thread
From: Daniel Thompson @ 2015-04-02 15:27 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Russell King, Will Deacon, linux-arm-kernel, linux-kernel,
	Shawn Guo, Sascha Hauer, Peter Zijlstra, Paul Mackerras,
	Ingo Molnar, Arnaldo Carvalho de Melo, Thomas Gleixner,
	Lucas Stach, Linus Walleij, patches, linaro-kernel, John Stultz,
	Sumit Semwal

On 31/03/15 18:08, Mark Rutland wrote:
> Hi Daniel,
>
> I'd very much like to see us converge on a solution for this soon. The
> existing hack is getting in the way of other rework of the arm/arm64
> perf code.

I'd quite like to see this patch sorted out too (mostly because one o my 
"go to" devices still crashes horribly whenever I try to do any 
profiling on a mainline kernel).


> I think the approach this patch takes should work, but there are some
> parts that can be cleaned up (hopefully mostly cosmetic). Unfortunately
> I don't seem to have a relevant platform for testing on.
>
> [...]
>
>> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
>> index b1596bd59129..dfef7904b790 100644
>> --- a/arch/arm/include/asm/pmu.h
>> +++ b/arch/arm/include/asm/pmu.h
>> @@ -87,6 +87,14 @@ struct pmu_hw_events {
>>           * already have to allocate this struct per cpu.
>>           */
>>          struct arm_pmu          *percpu_pmu;
>> +
>> +#ifdef CONFIG_SMP
>> +       /*
>> +        * This is used to schedule workaround logic on platforms where all
>> +        * the PMUs are attached to a single SPI.
>> +        */
>> +       struct irq_work work;
>> +#endif
>>   };
>>
>>   struct arm_pmu {
>> @@ -117,6 +125,10 @@ struct arm_pmu {
>>          struct platform_device  *plat_device;
>>          struct pmu_hw_events    __percpu *hw_events;
>>          struct notifier_block   hotplug_nb;
>> +#ifdef CONFIG_SMP
>> +       int                     muxed_spi_workaround_irq;
>
> There's nothing workaround specific about this IRQ; it's just the only
> IRQ.
>
> I think we should just pre-parse all the IRQs into a list at probe time,
> regardless of SMP or the workaround. Then we can just grab the first
> (and only) interrupt from the list in the workaround paths, and
> otherwise just iterate over the list.

What other uses do you anticipate for such a list? I don't really 
understand why we would create a list when we only ever consume the 
first entry.

If there is a use for such information then why keep it as a list. 
Wouldn't the code be simpler if we add it as a field in the percpu data 
structure?

For PPIs all values would be the same, for SPIs all different, for 
broken SoCs all after [0] would be 0.


>> +       atomic_t                remaining_irq_work;
>
> Perhaps remaining_work_irqs? That would make it clear that this is a
> counter rather than a boolean or enumeration. We could s/work/fake/ or
> something to that effect.

I guess the thing we are actually counting is CPUs so 
remaining_cpus_with_irq_work would therefore be extremely descriptive.


>> @@ -317,14 +315,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>>           * dereference.
>>           */
>>          armpmu = *(void **)dev;
>> -       plat_device = armpmu->plat_device;
>> -       plat = dev_get_platdata(&plat_device->dev);
>>
>>          start_clock = sched_clock();
>> -       if (plat && plat->handle_irq)
>> -               ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
>> -       else
>> -               ret = armpmu->handle_irq(irq, armpmu);
>> +       ret = armpmu->handle_irq(irq, armpmu);
>>          finish_clock = sched_clock();
>>
>>          perf_sample_event_took(finish_clock - start_clock);
>
> It's nice to see the plat stuff disappearing!
>
>> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
>> index 61b53c46edfa..d5bbd79abd4c 100644
>> --- a/arch/arm/kernel/perf_event_cpu.c
>> +++ b/arch/arm/kernel/perf_event_cpu.c
>> @@ -59,6 +59,116 @@ int perf_num_counters(void)
>>   }
>>   EXPORT_SYMBOL_GPL(perf_num_counters);
>>
>> +#ifdef CONFIG_SMP
>> +
>> +static cpumask_t down_prepare_cpu_mask;
>> +static DEFINE_SPINLOCK(down_prepare_cpu_lock);
>
> I think the names here are a little misleading, because we care about
> the whole window from CPU_DOWN_PREPARE to CPU_DEAD (or DOWN_FAILED). I
> think these would be clearer with s/down_prepare_cpu/dying_cpu/ (though
> admittedly that could also be confused with CPU_DYING).

Fine. I'll change this.


>> +
>> +/*
>> + * Workaround logic that is distributed to all cores if the PMU has only
>> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
>> + * job is to try to service the interrupt on the current CPU. It will
>> + * also enable the IRQ again if all the other CPUs have already tried to
>> + * service it.
>> + */
>> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
>
> Perhaps do_muxed_irq_work?

Ok.


>> +{
>> +       struct pmu_hw_events *hw_events =
>> +           container_of(w, struct pmu_hw_events, work);
>> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
>> +
>> +       /* Ignore the return code, we can do nothing useful with it */
>> +       (void) cpu_pmu->handle_irq(0, cpu_pmu);
>
> Nit: no space after a cast please.
>
> Do we need the void cast here? Does your toolchain complain?

I've always considered casting to void to be an executable comment... In 
this case I added it out of habit rather than because the toolchain 
complained.


>> +
>> +       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
>> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>
> I'm a little uneasy about calling enable_irq here, given we're in IRQ
> context. While it doesn't look we'd be wired up through an irqchip with
> irq_bus_lock or irq_bus_sync_unlock, I'm not sure how safe it is to rely
> on that being the only thing that matters.
>
> It would be nice to hear from someone familiar with the IRQ code on that
> respect.

Sure. I've reviewed this as far as I can.

Probably the most significant difference between enable and disable 
paths is the retrigger logic. I can't see anything there that would 
cause a problem.


>> +}
>> +
>> +/*
>> + * Workaround for systems where all PMU interrupts are targeting a
>> + * single SPI.
>> + *
>> + * The workaround will disable the interrupt and distribute irqwork to all
>> + * the other processors in the system. Hopefully one of them will clear the
>> + * interrupt...
>> + *
>> + * The workaround is only deployed when all PMU interrupts are aimed
>> + * at a single core. As a result the workaround is never re-entered
>> + * making it safe for us to use static data to maintain state.
>> + */
>> +static void cpu_pmu_deploy_muxed_spi_workaround(struct arm_pmu *cpu_pmu)
>
> Perhaps distribute_muxed_irq?

Could do. Personally I prefer to be clear that this is deploying a 
workaround (i.e. implying it does something odd on a small number of 
SoCs) rather than this code being part of the normal case.


>> +{
>> +       static cpumask_t irqwork_mask;
>> +       int cpu;
>> +
>> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
>> +       spin_lock(&down_prepare_cpu_lock);
>> +
>> +       /*
>> +        * Combining cpu_online_mask and down_prepare_cpu_mask gives
>> +        * us the CPUs that are currently online and cannot die until
>> +        * we release down_prepare_cpu_lock.
>> +        */
>> +       cpumask_andnot(&irqwork_mask, cpu_online_mask, &down_prepare_cpu_mask);
>> +       cpumask_clear_cpu(smp_processor_id(), &irqwork_mask);
>> +       atomic_add(cpumask_weight(&irqwork_mask), &cpu_pmu->remaining_irq_work);
>> +
>> +       for_each_cpu(cpu, &irqwork_mask) {
>> +               struct pmu_hw_events *hw_events =
>> +                   per_cpu_ptr(cpu_pmu->hw_events, cpu);
>> +
>> +               if (!irq_work_queue_on(&hw_events->work, cpu))
>> +                       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
>> +                               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>> +       }
>> +
>> +       spin_unlock(&down_prepare_cpu_lock);
>> +}
>
> I think this works, given the notifier logic and hotplug_cfd flushing
> any pending irq_work items.

Certainly its a lot better than the benign race thing I was trying to do 
before!


>> +
>> +/*
>> + * Called when the main interrupt handler cannot determine the source
>> + * of interrupt. It will deploy a workaround if we are running on an SMP
>> + * platform with only a single muxed SPI.
>> + */
>> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
>> +{
>> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
>> +               return IRQ_NONE;
>
> This is somewhat opaque.
>
> I'd rather just have a flag to determine when we need to do any special
> handling for the muxed case (or better, swizzle the irq handler to a
> wrapper that pings the other CPUs and calls the usual handler).

Using a different handler for the workaround makes sense.

IIRC I avoided that previously because the way the code is currently 
split between perf_event.c and perf_event_cpu.c didn't make that very 
natural.

I'll look at this again.


> [...]
>
>> +static int cpu_pmu_muxed_spi_workaround_init(struct arm_pmu *cpu_pmu)
>> +{
>> +       struct platform_device *pmu_device = cpu_pmu->plat_device;
>> +
>> +       atomic_set(&cpu_pmu->remaining_irq_work, 0);
>
> Then we can move this atomic_set into the usual init path for SMP and
> get rid of these init/term functions.

Ok.


> [...]
>
>> diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
>> index 8993770c47de..0dd914c10803 100644
>> --- a/arch/arm/kernel/perf_event_v7.c
>> +++ b/arch/arm/kernel/perf_event_v7.c
>> @@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
>>           * Did an overflow occur?
>>           */
>>          if (!armv7_pmnc_has_overflowed(pmnc))
>> -               return IRQ_NONE;
>> +               return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);
>
> Won't this leave samples skewed towards the CPU the interrupt is affine
> to? If you're counting something like cycles with a short enough period
> (and therefore effectively always have something to handle on the local
> CPU), we might never ping the other CPUs.

Do we really care about fidelity of results when servicing a perf 
interrupt causes another perf interrupt to happen ;-) ? At this point 
perf has already stopped profiling anything useful.


> I think we always need to ping the other CPUs, regardless of whether
> there was something to handle on this CPU.

Ok.

If we make this unconditional then we can, as you suggest above 
somewhere, redistribute the interrupt before trying to handle it. The 
faster we can interrupt the other CPUs the higher the fidelity of the 
profile (less time for PC to change between perf event triggering and 
CPU reacting).


Daniel.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v6] arm: perf: Directly handle SMP platforms with one SPI
  2015-03-31 16:20   ` Will Deacon
@ 2015-04-02 15:49     ` Daniel Thompson
  0 siblings, 0 replies; 20+ messages in thread
From: Daniel Thompson @ 2015-04-02 15:49 UTC (permalink / raw)
  To: Will Deacon
  Cc: Russell King, linux-arm-kernel, linux-kernel, Shawn Guo,
	Sascha Hauer, Peter Zijlstra, Paul Mackerras, Ingo Molnar,
	Arnaldo Carvalho de Melo, Thomas Gleixner, Lucas Stach,
	Linus Walleij, Mark Rutland, patches, linaro-kernel, John Stultz,
	Sumit Semwal

On 31/03/15 17:20, Will Deacon wrote:
> Hi Daniel,
>
> On Wed, Mar 04, 2015 at 01:25:45PM +0000, Daniel Thompson wrote:
>> Some ARM platforms mux the PMU interrupt of every core into a single
>> SPI. On such platforms if the PMU of any core except 0 raises an interrupt
>> then it cannot be serviced and eventually, if you are lucky, the spurious
>> irq detection might forcefully disable the interrupt.
>>
>> On these SoCs it is not possible to determine which core raised the
>> interrupt so workaround this issue by queuing irqwork on the other
>> cores whenever the primary interrupt handler is unable to service the
>> interrupt.
>>
>> The u8500 platform has an alternative workaround that dynamically alters
>> the affinity of the PMU interrupt. This workaround logic is no longer
>> required so the original code is removed as is the hook it relied upon.
>>
>> Tested on imx6q (which has fours cores/PMUs all muxed to a single SPI)
>> using a simple soak, combined perf and CPU hotplug soak and using
>> perf fuzzer's fast_repro.sh.
>
> [...]
>
>> diff --git a/arch/arm/include/asm/pmu.h b/arch/arm/include/asm/pmu.h
>> index b1596bd59129..dfef7904b790 100644
>> --- a/arch/arm/include/asm/pmu.h
>> +++ b/arch/arm/include/asm/pmu.h
>> @@ -87,6 +87,14 @@ struct pmu_hw_events {
>>           * already have to allocate this struct per cpu.
>>           */
>>          struct arm_pmu          *percpu_pmu;
>> +
>> +#ifdef CONFIG_SMP
>> +       /*
>> +        * This is used to schedule workaround logic on platforms where all
>> +        * the PMUs are attached to a single SPI.
>> +        */
>> +       struct irq_work work;
>> +#endif
>>   };
>>
>>   struct arm_pmu {
>> @@ -117,6 +125,10 @@ struct arm_pmu {
>>          struct platform_device  *plat_device;
>>          struct pmu_hw_events    __percpu *hw_events;
>>          struct notifier_block   hotplug_nb;
>> +#ifdef CONFIG_SMP
>> +       int                     muxed_spi_workaround_irq;
>> +       atomic_t                remaining_irq_work;
>> +#endif
>>   };
>>
>>   #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
>> diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c
>> index 557e128e4df0..e3fc1a0ce969 100644
>> --- a/arch/arm/kernel/perf_event.c
>> +++ b/arch/arm/kernel/perf_event.c
>> @@ -305,8 +305,6 @@ validate_group(struct perf_event *event)
>>   static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>>   {
>>          struct arm_pmu *armpmu;
>> -       struct platform_device *plat_device;
>> -       struct arm_pmu_platdata *plat;
>>          int ret;
>>          u64 start_clock, finish_clock;
>>
>> @@ -317,14 +315,9 @@ static irqreturn_t armpmu_dispatch_irq(int irq, void *dev)
>>           * dereference.
>>           */
>>          armpmu = *(void **)dev;
>> -       plat_device = armpmu->plat_device;
>> -       plat = dev_get_platdata(&plat_device->dev);
>>
>>          start_clock = sched_clock();
>> -       if (plat && plat->handle_irq)
>> -               ret = plat->handle_irq(irq, armpmu, armpmu->handle_irq);
>> -       else
>> -               ret = armpmu->handle_irq(irq, armpmu);
>> +       ret = armpmu->handle_irq(irq, armpmu);
>>          finish_clock = sched_clock();
>>
>>          perf_sample_event_took(finish_clock - start_clock);
>> diff --git a/arch/arm/kernel/perf_event_cpu.c b/arch/arm/kernel/perf_event_cpu.c
>> index 61b53c46edfa..d5bbd79abd4c 100644
>> --- a/arch/arm/kernel/perf_event_cpu.c
>> +++ b/arch/arm/kernel/perf_event_cpu.c
>> @@ -59,6 +59,116 @@ int perf_num_counters(void)
>>   }
>>   EXPORT_SYMBOL_GPL(perf_num_counters);
>>
>> +#ifdef CONFIG_SMP
>> +
>> +static cpumask_t down_prepare_cpu_mask;
>> +static DEFINE_SPINLOCK(down_prepare_cpu_lock);
>> +
>> +/*
>> + * Workaround logic that is distributed to all cores if the PMU has only
>> + * a single IRQ and the CPU receiving that IRQ cannot handle it. Its
>> + * job is to try to service the interrupt on the current CPU. It will
>> + * also enable the IRQ again if all the other CPUs have already tried to
>> + * service it.
>> + */
>> +static void cpu_pmu_do_percpu_work(struct irq_work *w)
>> +{
>> +       struct pmu_hw_events *hw_events =
>> +           container_of(w, struct pmu_hw_events, work);
>> +       struct arm_pmu *cpu_pmu = hw_events->percpu_pmu;
>> +
>> +       /* Ignore the return code, we can do nothing useful with it */
>> +       (void) cpu_pmu->handle_irq(0, cpu_pmu);
>> +
>> +       if (atomic_dec_and_test(&cpu_pmu->remaining_irq_work))
>> +               enable_irq(cpu_pmu->muxed_spi_workaround_irq);
>> +}
>> +
>> +/*
>> + * Workaround for systems where all PMU interrupts are targeting a
>> + * single SPI.
>> + *
>> + * The workaround will disable the interrupt and distribute irqwork to all
>> + * the other processors in the system. Hopefully one of them will clear the
>> + * interrupt...
>> + *
>> + * The workaround is only deployed when all PMU interrupts are aimed
>> + * at a single core. As a result the workaround is never re-entered
>> + * making it safe for us to use static data to maintain state.
>> + */
>> +static void cpu_pmu_deploy_muxed_spi_workaround(struct arm_pmu *cpu_pmu)
>> +{
>> +       static cpumask_t irqwork_mask;
>> +       int cpu;
>> +
>> +       disable_irq_nosync(cpu_pmu->muxed_spi_workaround_irq);
>> +       spin_lock(&down_prepare_cpu_lock);
>> +
>> +       /*
>> +        * Combining cpu_online_mask and down_prepare_cpu_mask gives
>> +        * us the CPUs that are currently online and cannot die until
>> +        * we release down_prepare_cpu_lock.
>> +        */
>> +       cpumask_andnot(&irqwork_mask, cpu_online_mask, &down_prepare_cpu_mask);
>> +       cpumask_clear_cpu(smp_processor_id(), &irqwork_mask);
>> +       atomic_add(cpumask_weight(&irqwork_mask), &cpu_pmu->remaining_irq_work);
>
> AFAICT, this is a hack to avoid get_online_cpus (which can sleep) from irq
> context? Is there no way we can do try_get_online_cpus, and just return
> IRQ_NONE if that fails? At some point, the hotplug operation will complete
> and we'll be able to service the pending interrupt. I think that would allow
> us to kill the down_prepare_cpu_lock.

It certainly doesn't look easy but I will take a closer look...

 From the perf side its reasonably easy. I don't think we can return 
IRQ_NONE (we might own the h.p. lock on the CPU handling the interrupt) 
so we have to disable the interrupt. However we could probably arrange 
to re-enable it from the hotplug notifications when this is needed.

Unfortunately I'm not sure the cpu_hotplug side is so easy.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v6] arm: perf: Directly handle SMP platforms with one SPI
  2015-04-02 15:27     ` Daniel Thompson
@ 2015-04-02 16:44       ` Mark Rutland
  0 siblings, 0 replies; 20+ messages in thread
From: Mark Rutland @ 2015-04-02 16:44 UTC (permalink / raw)
  To: Daniel Thompson
  Cc: Russell King, Will Deacon, linux-arm-kernel, linux-kernel,
	Shawn Guo, Sascha Hauer, Peter Zijlstra, Paul Mackerras,
	Ingo Molnar, Arnaldo Carvalho de Melo, Thomas Gleixner,
	Lucas Stach, Linus Walleij, patches, linaro-kernel, John Stultz,
	Sumit Semwal

> >>   struct arm_pmu {
> >> @@ -117,6 +125,10 @@ struct arm_pmu {
> >>          struct platform_device  *plat_device;
> >>          struct pmu_hw_events    __percpu *hw_events;
> >>          struct notifier_block   hotplug_nb;
> >> +#ifdef CONFIG_SMP
> >> +       int                     muxed_spi_workaround_irq;
> >
> > There's nothing workaround specific about this IRQ; it's just the only
> > IRQ.
> >
> > I think we should just pre-parse all the IRQs into a list at probe time,
> > regardless of SMP or the workaround. Then we can just grab the first
> > (and only) interrupt from the list in the workaround paths, and
> > otherwise just iterate over the list.
> 
> What other uses do you anticipate for such a list? I don't really
> understand why we would create a list when we only ever consume the
> first entry.

It would go parallel to the affinity property Will recently added, and
would allow us to separate the DT parsing from the request/free paths,
making those far easier to understand (and likely a little faster too).

> If there is a use for such information then why keep it as a list.
> Wouldn't the code be simpler if we add it as a field in the percpu data
> structure?
> 
> For PPIs all values would be the same, for SPIs all different, for
> broken SoCs all after [0] would be 0.

That's a nice idea. That would make the affinity info implicit, and
could be far neater.

> >> +       atomic_t                remaining_irq_work;
> >
> > Perhaps remaining_work_irqs? That would make it clear that this is a
> > counter rather than a boolean or enumeration. We could s/work/fake/ or
> > something to that effect.
> 
> I guess the thing we are actually counting is CPUs so
> remaining_cpus_with_irq_work would therefore be extremely descriptive.

It would be nice if we could come up with something that was a little
more concise. Perhaps cpus_with_work would be good enough?

> >> +}
> >> +
> >> +/*
> >> + * Workaround for systems where all PMU interrupts are targeting a
> >> + * single SPI.
> >> + *
> >> + * The workaround will disable the interrupt and distribute irqwork to all
> >> + * the other processors in the system. Hopefully one of them will clear the
> >> + * interrupt...
> >> + *
> >> + * The workaround is only deployed when all PMU interrupts are aimed
> >> + * at a single core. As a result the workaround is never re-entered
> >> + * making it safe for us to use static data to maintain state.
> >> + */
> >> +static void cpu_pmu_deploy_muxed_spi_workaround(struct arm_pmu *cpu_pmu)
> >
> > Perhaps distribute_muxed_irq?
> 
> Could do. Personally I prefer to be clear that this is deploying a
> workaround (i.e. implying it does something odd on a small number of
> SoCs) rather than this code being part of the normal case.

I'd prefer the more succinct name. We already have the comment stating
that this is a workaround.

[...]

> >> +/*
> >> + * Called when the main interrupt handler cannot determine the source
> >> + * of interrupt. It will deploy a workaround if we are running on an SMP
> >> + * platform with only a single muxed SPI.
> >> + */
> >> +static irqreturn_t cpu_pmu_handle_irq_none(int irq_num, struct arm_pmu *cpu_pmu)
> >> +{
> >> +       if (irq_num != cpu_pmu->muxed_spi_workaround_irq)
> >> +               return IRQ_NONE;
> >
> > This is somewhat opaque.
> >
> > I'd rather just have a flag to determine when we need to do any special
> > handling for the muxed case (or better, swizzle the irq handler to a
> > wrapper that pings the other CPUs and calls the usual handler).
> 
> Using a different handler for the workaround makes sense.
> 
> IIRC I avoided that previously because the way the code is currently
> split between perf_event.c and perf_event_cpu.c didn't make that very
> natural.
> 
> I'll look at this again.

The split between the two is an unfortunate legacy from when it looked
like system PMUs could reuse the code. We recently decoupled the CCI PMU
from arm_pmu so that we could fuse the two layers and get rid of a load
of indirection.

I have some patches doing that. I'll try to rework them next week given
it sounds like they could be a useful base for this.


> > [...]
> >
> >> diff --git a/arch/arm/kernel/perf_event_v7.c b/arch/arm/kernel/perf_event_v7.c
> >> index 8993770c47de..0dd914c10803 100644
> >> --- a/arch/arm/kernel/perf_event_v7.c
> >> +++ b/arch/arm/kernel/perf_event_v7.c
> >> @@ -792,7 +792,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev)
> >>           * Did an overflow occur?
> >>           */
> >>          if (!armv7_pmnc_has_overflowed(pmnc))
> >> -               return IRQ_NONE;
> >> +               return cpu_pmu_handle_irq_none(irq_num, cpu_pmu);
> >
> > Won't this leave samples skewed towards the CPU the interrupt is affine
> > to? If you're counting something like cycles with a short enough period
> > (and therefore effectively always have something to handle on the local
> > CPU), we might never ping the other CPUs.
> 
> Do we really care about fidelity of results when servicing a perf
> interrupt causes another perf interrupt to happen ;-) ? At this point
> perf has already stopped profiling anything useful.

Fair point. I'll have to consider that for a bit...

> > I think we always need to ping the other CPUs, regardless of whether
> > there was something to handle on this CPU.
> 
> Ok.
> 
> If we make this unconditional then we can, as you suggest above
> somewhere, redistribute the interrupt before trying to handle it. The
> faster we can interrupt the other CPUs the higher the fidelity of the
> profile (less time for PC to change between perf event triggering and
> CPU reacting).

Yup!

Mark.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2015-04-02 16:44 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-21 14:53 [PATCH] arm: perf: Directly handle SMP platforms with one SPI Daniel Thompson
2014-11-26 16:59 ` Daniel Thompson
2014-12-02 14:49   ` Mark Rutland
2014-12-02 21:33     ` Daniel Thompson
2014-12-02 13:22 ` Linus Walleij
2015-01-07 12:28 ` [PATCH v3] " Daniel Thompson
2015-01-08 17:30   ` Will Deacon
2015-01-09 14:23     ` Daniel Thompson
2015-01-09 16:16 ` [PATCH v4] " Daniel Thompson
2015-01-16 10:58   ` Will Deacon
2015-01-16 11:28     ` Daniel Thompson
2015-01-20 12:25 ` [PATCH v5] " Daniel Thompson
2015-01-23 17:25   ` Mark Rutland
2015-02-24 16:11     ` Daniel Thompson
2015-03-04 13:25 ` [PATCH v6] " Daniel Thompson
2015-03-31 16:20   ` Will Deacon
2015-04-02 15:49     ` Daniel Thompson
2015-03-31 17:08   ` Mark Rutland
2015-04-02 15:27     ` Daniel Thompson
2015-04-02 16:44       ` Mark Rutland

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).