LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
From: Daniel Thompson <daniel.thompson@linaro.org>
To: Thomas Gleixner <tglx@linutronix.de>
Cc: Daniel Thompson <daniel.thompson@linaro.org>,
	Jason Cooper <jason@lakedaemon.net>,
	Russell King <linux@arm.linux.org.uk>,
	Will Deacon <will.deacon@arm.com>,
	Catalin Marinas <catalin.marinas@arm.com>,
	Marc Zyngier <marc.zyngier@arm.com>,
	Stephen Boyd <sboyd@codeaurora.org>,
	John Stultz <john.stultz@linaro.org>,
	Steven Rostedt <rostedt@goodmis.org>,
	linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, patches@linaro.org,
	linaro-kernel@lists.linaro.org,
	Sumit Semwal <sumit.semwal@linaro.org>,
	Dirk Behme <dirk.behme@de.bosch.com>,
	Daniel Drake <drake@endlessm.com>,
	Dmitry Pervushin <dpervushin@gmail.com>,
	Tim Sander <tim@krieglstein.org>
Subject: [PATCH 3.19-rc2 v15 4/8] sched_clock: Avoid deadlock during read from NMI
Date: Fri, 23 Jan 2015 14:22:28 +0000	[thread overview]
Message-ID: <1422022952-31552-5-git-send-email-daniel.thompson@linaro.org> (raw)
In-Reply-To: <1422022952-31552-1-git-send-email-daniel.thompson@linaro.org>

Currently it is possible for an NMI (or FIQ on ARM) to come in and
read sched_clock() whilst update_sched_clock() has locked the seqcount
for writing. This results in the NMI handler locking up when it calls
raw_read_seqcount_begin().

This patch fixes that problem by providing banked clock data in a
similar manner to Thomas Gleixner's 4396e058c52e("timekeeping: Provide
fast and NMI safe access to CLOCK_MONOTONIC").

Changing the mode of operation of the seqcount away from the traditional
LSB-means-interrupted-write to a banked approach also revealed a good deal
of "fake" write locking within sched_clock_register(). This is likely
to be a latent issue because sched_clock_register() is typically called
before we enable interrupts. Nevertheless the issue has been eliminated
by increasing the scope of the read locking performed by sched_clock().

Suggested-by: Stephen Boyd <sboyd@codeaurora.org>
Signed-off-by: Daniel Thompson <daniel.thompson@linaro.org>
Cc: Stephen Boyd <sboyd@codeaurora.org>
Cc: Russell King <linux@arm.linux.org.uk>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: John Stultz <john.stultz@linaro.org>
---
 kernel/time/sched_clock.c | 157 +++++++++++++++++++++++++++++-----------------
 1 file changed, 100 insertions(+), 57 deletions(-)

diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 01d2d15aa662..a2ea66944bc1 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -18,28 +18,28 @@
 #include <linux/seqlock.h>
 #include <linux/bitops.h>
 
-struct clock_data {
-	ktime_t wrap_kt;
+struct clock_data_banked {
 	u64 epoch_ns;
 	u64 epoch_cyc;
-	seqcount_t seq;
-	unsigned long rate;
+	u64 (*read_sched_clock)(void);
+	u64 sched_clock_mask;
 	u32 mult;
 	u32 shift;
 	bool suspended;
 };
 
+struct clock_data {
+	ktime_t wrap_kt;
+	seqcount_t seq;
+	unsigned long rate;
+	struct clock_data_banked bank[2];
+};
+
 static struct hrtimer sched_clock_timer;
 static int irqtime = -1;
 
 core_param(irqtime, irqtime, int, 0400);
 
-static struct clock_data cd = {
-	.mult	= NSEC_PER_SEC / HZ,
-};
-
-static u64 __read_mostly sched_clock_mask;
-
 static u64 notrace jiffy_sched_clock_read(void)
 {
 	/*
@@ -49,7 +49,14 @@ static u64 notrace jiffy_sched_clock_read(void)
 	return (u64)(jiffies - INITIAL_JIFFIES);
 }
 
-static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
+static struct clock_data cd = {
+	.bank = {
+		[0] = {
+			.mult	= NSEC_PER_SEC / HZ,
+			.read_sched_clock = jiffy_sched_clock_read,
+		},
+	},
+};
 
 static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 {
@@ -58,50 +65,82 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
 
 unsigned long long notrace sched_clock(void)
 {
-	u64 epoch_ns;
-	u64 epoch_cyc;
 	u64 cyc;
 	unsigned long seq;
-
-	if (cd.suspended)
-		return cd.epoch_ns;
+	struct clock_data_banked *b;
+	u64 res;
 
 	do {
-		seq = raw_read_seqcount_begin(&cd.seq);
-		epoch_cyc = cd.epoch_cyc;
-		epoch_ns = cd.epoch_ns;
+		seq = raw_read_seqcount(&cd.seq);
+		b = cd.bank + (seq & 1);
+		if (b->suspended) {
+			res = b->epoch_ns;
+		} else {
+			cyc = b->read_sched_clock();
+			cyc = (cyc - b->epoch_cyc) & b->sched_clock_mask;
+			res = b->epoch_ns + cyc_to_ns(cyc, b->mult, b->shift);
+		}
 	} while (read_seqcount_retry(&cd.seq, seq));
 
-	cyc = read_sched_clock();
-	cyc = (cyc - epoch_cyc) & sched_clock_mask;
-	return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift);
+	return res;
+}
+
+/*
+ * Start updating the banked clock data.
+ *
+ * sched_clock will never observe mis-matched data even if called from
+ * an NMI. We do this by maintaining an odd/even copy of the data and
+ * steering sched_clock to one or the other using a sequence counter.
+ * In order to preserve the data cache profile of sched_clock as much
+ * as possible the system reverts back to the even copy when the update
+ * completes; the odd copy is used *only* during an update.
+ *
+ * The caller is responsible for avoiding simultaneous updates.
+ */
+static struct clock_data_banked *update_bank_begin(void)
+{
+	/* update the backup (odd) bank and steer readers towards it */
+	memcpy(cd.bank + 1, cd.bank, sizeof(struct clock_data_banked));
+	raw_write_seqcount_latch(&cd.seq);
+
+	return cd.bank;
+}
+
+/*
+ * Finalize update of banked clock data.
+ *
+ * This is just a trivial switch back to the primary (even) copy.
+ */
+static void update_bank_end(void)
+{
+	raw_write_seqcount_latch(&cd.seq);
 }
 
 /*
  * Atomically update the sched_clock epoch.
  */
-static void notrace update_sched_clock(void)
+static void notrace update_sched_clock(bool suspended)
 {
-	unsigned long flags;
+	struct clock_data_banked *b;
 	u64 cyc;
 	u64 ns;
 
-	cyc = read_sched_clock();
-	ns = cd.epoch_ns +
-		cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
-
-	raw_local_irq_save(flags);
-	raw_write_seqcount_begin(&cd.seq);
-	cd.epoch_ns = ns;
-	cd.epoch_cyc = cyc;
-	raw_write_seqcount_end(&cd.seq);
-	raw_local_irq_restore(flags);
+	b = update_bank_begin();
+
+	cyc = b->read_sched_clock();
+	ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask,
+				     b->mult, b->shift);
+
+	b->epoch_ns = ns;
+	b->epoch_cyc = cyc;
+	b->suspended = suspended;
+
+	update_bank_end();
 }
 
 static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
 {
-	update_sched_clock();
+	update_sched_clock(false);
 	hrtimer_forward_now(hrt, cd.wrap_kt);
 	return HRTIMER_RESTART;
 }
@@ -111,9 +150,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 {
 	u64 res, wrap, new_mask, new_epoch, cyc, ns;
 	u32 new_mult, new_shift;
-	ktime_t new_wrap_kt;
 	unsigned long r;
 	char r_unit;
+	struct clock_data_banked *b;
 
 	if (cd.rate > rate)
 		return;
@@ -122,29 +161,30 @@ void __init sched_clock_register(u64 (*read)(void), int bits,
 
 	/* calculate the mult/shift to convert counter ticks to ns. */
 	clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600);
+	cd.rate = rate;
 
 	new_mask = CLOCKSOURCE_MASK(bits);
 
 	/* calculate how many ns until we wrap */
 	wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask);
-	new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+	cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
+
+	b = update_bank_begin();
 
 	/* update epoch for new counter and update epoch_ns from old counter*/
 	new_epoch = read();
-	cyc = read_sched_clock();
-	ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
-			  cd.mult, cd.shift);
+	cyc = b->read_sched_clock();
+	ns = b->epoch_ns + cyc_to_ns((cyc - b->epoch_cyc) & b->sched_clock_mask,
+				     b->mult, b->shift);
 
-	raw_write_seqcount_begin(&cd.seq);
-	read_sched_clock = read;
-	sched_clock_mask = new_mask;
-	cd.rate = rate;
-	cd.wrap_kt = new_wrap_kt;
-	cd.mult = new_mult;
-	cd.shift = new_shift;
-	cd.epoch_cyc = new_epoch;
-	cd.epoch_ns = ns;
-	raw_write_seqcount_end(&cd.seq);
+	b->read_sched_clock = read;
+	b->sched_clock_mask = new_mask;
+	b->mult = new_mult;
+	b->shift = new_shift;
+	b->epoch_cyc = new_epoch;
+	b->epoch_ns = ns;
+
+	update_bank_end();
 
 	r = rate;
 	if (r >= 4000000) {
@@ -175,10 +215,10 @@ void __init sched_clock_postinit(void)
 	 * If no sched_clock function has been provided at that point,
 	 * make it the final one one.
 	 */
-	if (read_sched_clock == jiffy_sched_clock_read)
+	if (cd.bank[0].read_sched_clock == jiffy_sched_clock_read)
 		sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
 
-	update_sched_clock();
+	update_sched_clock(false);
 
 	/*
 	 * Start the timer to keep sched_clock() properly updated and
@@ -191,17 +231,20 @@ void __init sched_clock_postinit(void)
 
 static int sched_clock_suspend(void)
 {
-	update_sched_clock();
+	update_sched_clock(true);
 	hrtimer_cancel(&sched_clock_timer);
-	cd.suspended = true;
 	return 0;
 }
 
 static void sched_clock_resume(void)
 {
-	cd.epoch_cyc = read_sched_clock();
+	struct clock_data_banked *b;
+
+	b = update_bank_begin();
+	b->epoch_cyc = b->read_sched_clock();
 	hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
-	cd.suspended = false;
+	b->suspended = false;
+	update_bank_end();
 }
 
 static struct syscore_ops sched_clock_ops = {
-- 
1.9.3


  parent reply	other threads:[~2015-01-23 14:24 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-01-23 14:22 [PATCH 3.19-rc2 v15 0/8] irq/arm: Implement arch_trigger_all_cpu_backtrace Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 1/8] irqchip: gic: Optimize locking in gic_raise_softirq Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 2/8] irqchip: gic: Make gic_raise_softirq FIQ-safe Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 3/8] irqchip: gic: Introduce plumbing for IPI FIQ Daniel Thompson
2015-01-23 14:22 ` Daniel Thompson [this message]
2015-01-24 22:40   ` [PATCH 3.19-rc2 v15 4/8] sched_clock: Avoid deadlock during read from NMI Thomas Gleixner
2015-01-26 20:28     ` Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 5/8] printk: Simple implementation for NMI backtracing Daniel Thompson
2015-01-24 21:44   ` Thomas Gleixner
2015-01-26 17:21     ` Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 6/8] x86/nmi: Use common printk functions Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 7/8] ARM: Add support for on-demand backtrace of other CPUs Daniel Thompson
2015-01-23 14:22 ` [PATCH 3.19-rc2 v15 8/8] ARM: Fix on-demand backtrace triggered by IRQ Daniel Thompson
2015-02-03 19:06 ` [PATCH 3.19-rc6 v16 0/6] irq/arm: Implement arch_trigger_all_cpu_backtrace Daniel Thompson
2015-02-03 19:06   ` [PATCH 3.19-rc6 v16 1/6] irqchip: gic: Optimize locking in gic_raise_softirq Daniel Thompson
2015-02-26 20:31     ` Nicolas Pitre
2015-02-26 21:05       ` Daniel Thompson
2015-02-26 21:33         ` Nicolas Pitre
2015-02-03 19:06   ` [PATCH 3.19-rc6 v16 2/6] irqchip: gic: Make gic_raise_softirq FIQ-safe Daniel Thompson
2015-02-26 20:33     ` Nicolas Pitre
2015-02-03 19:06   ` [PATCH 3.19-rc6 v16 3/6] irqchip: gic: Introduce plumbing for IPI FIQ Daniel Thompson
2015-02-03 19:06   ` [PATCH 3.19-rc6 v16 4/6] printk: Simple implementation for NMI backtracing Daniel Thompson
2015-02-03 19:06   ` [PATCH 3.19-rc6 v16 5/6] x86/nmi: Use common printk functions Daniel Thompson
2015-02-03 19:06   ` [PATCH 3.19-rc6 v16 6/6] ARM: Add support for on-demand backtrace of other CPUs Daniel Thompson
2015-03-04 10:12 ` [PATCH 4.0-rc1 v17 0/6] irq/arm: Implement arch_trigger_all_cpu_backtrace Daniel Thompson
2015-03-04 10:12   ` [PATCH 4.0-rc1 v17 1/6] irqchip: gic: Optimize locking in gic_raise_softirq Daniel Thompson
2015-03-04 10:12   ` [PATCH 4.0-rc1 v17 2/6] irqchip: gic: Make gic_raise_softirq FIQ-safe Daniel Thompson
2015-03-04 10:12   ` [PATCH 4.0-rc1 v17 3/6] irqchip: gic: Introduce plumbing for IPI FIQ Daniel Thompson
2015-03-04 10:12   ` [PATCH 4.0-rc1 v17 4/6] printk: Simple implementation for NMI backtracing Daniel Thompson
2015-03-04 16:13     ` Joe Perches
2015-03-04 16:20       ` Steven Rostedt
2015-03-04 16:33         ` Daniel Thompson
2015-03-04 17:21           ` Joe Perches
2015-03-05 12:11             ` Daniel Thompson
2015-03-04 10:12   ` [PATCH 4.0-rc1 v17 5/6] x86/nmi: Use common printk functions Daniel Thompson
2015-03-05  0:54     ` Ingo Molnar
2015-03-05 12:29       ` Daniel Thompson
2015-03-05 19:46         ` Ingo Molnar
2015-03-06 19:02           ` Daniel Thompson
2015-03-04 10:12   ` [PATCH 4.0-rc1 v17 6/6] ARM: Add support for on-demand backtrace of other CPUs Daniel Thompson
2015-03-12 13:39 ` [PATCH 4.0-rc2 v18 0/6] irq/arm: Implement arch_trigger_all_cpu_backtrace Daniel Thompson
2015-03-12 13:39   ` [PATCH 4.0-rc2 v18 1/6] irqchip: gic: Optimize locking in gic_raise_softirq Daniel Thompson
2015-03-12 13:39   ` [PATCH 4.0-rc2 v18 2/6] irqchip: gic: Make gic_raise_softirq FIQ-safe Daniel Thompson
2015-03-12 13:39   ` [PATCH 4.0-rc2 v18 3/6] irqchip: gic: Introduce plumbing for IPI FIQ Daniel Thompson
2015-03-12 13:39   ` [PATCH 4.0-rc2 v18 4/6] printk: Simple implementation for NMI backtracing Daniel Thompson
2015-03-12 13:39   ` [PATCH 4.0-rc2 v18 5/6] x86/nmi: Use common printk functions Daniel Thompson
2015-03-12 13:39   ` [PATCH 4.0-rc2 v18 6/6] ARM: Add support for on-demand backtrace of other CPUs Daniel Thompson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1422022952-31552-5-git-send-email-daniel.thompson@linaro.org \
    --to=daniel.thompson@linaro.org \
    --cc=catalin.marinas@arm.com \
    --cc=dirk.behme@de.bosch.com \
    --cc=dpervushin@gmail.com \
    --cc=drake@endlessm.com \
    --cc=jason@lakedaemon.net \
    --cc=john.stultz@linaro.org \
    --cc=linaro-kernel@lists.linaro.org \
    --cc=linux-arm-kernel@lists.infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux@arm.linux.org.uk \
    --cc=marc.zyngier@arm.com \
    --cc=patches@linaro.org \
    --cc=rostedt@goodmis.org \
    --cc=sboyd@codeaurora.org \
    --cc=sumit.semwal@linaro.org \
    --cc=tglx@linutronix.de \
    --cc=tim@krieglstein.org \
    --cc=will.deacon@arm.com \
    --subject='Re: [PATCH 3.19-rc2 v15 4/8] sched_clock: Avoid deadlock during read from NMI' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).