LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
@ 2008-08-21 23:43 Paul E. McKenney
  2008-08-22  4:37 ` Ingo Molnar
                   ` (3 more replies)
  0 siblings, 4 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-21 23:43 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt

Hello!

Experimental, not for inclusion.

Attached is a patch to Classic RCU that applies a hierarchy, greatly
reducing the contention on the top-level lock for large machines.
This passes mild rcutorture testing on x86 and ppc64, but is most
definitely not ready for inclusion.  It is OK for experimental work
assuming sufficiently brave experimenters.  See also Manfred Spraul's
patch at http://lkml.org/lkml/2008/8/21/336 (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but
are currently exploring different regions of the design space.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is no greater than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 32,768 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	The interface to dynticks is clumsy at best.  Improvements
	on the way.

o	CPU onlining and offlining is probably broken.  Will be tested.

o	The check-CPU-stalls code is busted.  Will be fixed.

o	There are probably hangs, rcutorture failures, &c.

o	There is not yet a human-readable design document.  Will be fixed.

o	The largest machine I can get my hands on at the moment only
	has 8 CPUs, which really doesn't stress this algorithm much.

If you want to use this against a Linus kernel, the following will work:

Start with 2.6.27-rc3.

Apply http://www.rdrop.com/users/paulmck/patches/paulmck-rcu.2008.08.20a.patch
which catches you up to a recent linux-2.6-tip tip/core/rcu commit.

Apply http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-hierRCU-6.patch
which gets you the current hierarchical RCU implementation.

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 include/linux/rcuclassic.h |  154 ++++--
 kernel/Kconfig.preempt     |   31 +
 kernel/Makefile            |    3 
 kernel/rcuclassic.c        | 1095 +++++++++++++++++++++++++++++----------------
 kernel/rcuclassic_trace.c  |  219 +++++++++
 5 files changed, 1076 insertions(+), 426 deletions(-)

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 1658995..97e646a 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -18,6 +18,7 @@
  * Copyright IBM Corporation, 2001
  *
  * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
  *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
@@ -26,8 +27,10 @@
  * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
- *
+ * 	Documentation/RCU
+ * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
+ * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
+ * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
  */
 
 #ifndef __LINUX_RCUCLASSIC_H
@@ -40,69 +43,136 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+/*
+ * Define the shape of the rcu_node hierarchy based on NR_CPUS and
+ * CONFIG_RCU_FANOUT.
+ */
 
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	long	cur;		/* Current batch number.                      */
-	long	completed;	/* Number of the last completed batch         */
-	long	pending;	/* Number of the last pending batch           */
-#ifdef CONFIG_DEBUG_RCU_STALL
-	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
-#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
-
-	int	signaled;
+#define MAX_RCU_LEVELS 3
+#if NR_CPUS <= CONFIG_RCU_FANOUT
+#define NUM_RCU_LEVELS 1
+#define NUM_RCU_LEVEL_1 1
+#define NUM_RCU_LEVEL_2 NR_CPUS
+#define NUM_RCU_LEVEL_3 0
+#define NUM_RCU_LEVEL_4 0
+#define NUM_RCU_NODES NUM_RCU_LEVEL_1
+#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
+#define NUM_RCU_LEVELS 2
+#define NUM_RCU_LEVEL_1 1
+#define NUM_RCU_LEVEL_2 \
+	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
+#define NUM_RCU_LEVEL_3 NR_CPUS
+#define NUM_RCU_LEVEL_4 0
+#define NUM_RCU_NODES \
+	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
+#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
+#define NUM_RCU_LEVELS 3
+#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
+#define NUM_RCU_LEVEL_1 1
+#define NUM_RCU_LEVEL_2 \
+	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
+#define NUM_RCU_LEVEL_3 \
+	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
+#define NUM_RCU_LEVEL_4 NR_CPUS
+#define NUM_RCU_NODES \
+	((NUM_RCU_LEVEL_1) + \
+	 (NUM_RCU_LEVEL_2) + \
+	 (NUM_RCU_LEVEL_3))
+#else
+#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif
 
-	spinlock_t	lock	____cacheline_internodealigned_in_smp;
-	cpumask_t	cpumask; /* CPUs that need to switch in order    */
-				 /* for current batch to proceed.        */
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
+				/*  order for current grace period to proceed.*/
+	unsigned long	qsmaskinit;
+				/* Per-GP initialization for qsmask.	      */
+	int	grplo;		/* lowest-numbered CPU or group here.	      */
+	int	grphi;		/* highest-numbered CPU or group here.	      */
+	char	grpnum;		/* CPU/group number for next level up.	      */
+	char	level;		/* root is at level 0.			      */
+	struct rcu_node *parent;
 } ____cacheline_internodealigned_in_smp;
 
-/* Is batch a before batch b ? */
-static inline int rcu_batch_before(long a, long b)
-{
-	return (a - b) < 0;
-}
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LEVELS];	/* Hierarchy levels. */
+	int levelcnt[MAX_RCU_LEVELS + 1];	/* # nodes in each level. */
+	int levelspread[NUM_RCU_LEVELS];	/* kids/node in each level. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	char	signaled ____cacheline_internodealigned_in_smp;
+						/* sent GP-kick IPIs? */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+};
 
-/* Is batch a after batch b ? */
-static inline int rcu_batch_after(long a, long b)
-{
-	return (a - b) > 0;
-}
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
 
 /* Per-CPU data for Read-Copy UPdate. */
 struct rcu_data {
-	/* 1) quiescent state handling : */
-	long		quiescbatch;     /* Batch # for grace period */
-	int		passed_quiesc;	 /* User-mode/idle loop etc. */
-	int		qs_pending;	 /* core waits for quiesc state */
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	int		passed_quiesc;	/* User-mode/idle loop etc. */
+	int		qs_pending;	/* Core waits for quiesc state. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
 
 	/* 2) batch handling */
 	/*
-	 * if nxtlist is not NULL, then:
-	 * batch:
-	 *	The batch # for the last entry of nxtlist
-	 * [*nxttail[1], NULL = *nxttail[2]):
-	 *	Entries that batch # <= batch
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[2], NULL = *nxttail[3]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[1], *nxttail[2]):
+	 *	Entries known to have arrived before current GP ended
 	 * [*nxttail[0], *nxttail[1]):
-	 *	Entries that batch # <= batch - 1
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
 	 * [nxtlist, *nxttail[0]):
-	 *	Entries that batch # <= batch - 2
+	 *	Entries that batch # <= ->completed
 	 *	The grace period for these entries has completed, and
 	 *	the other grace-period-completed entries may be moved
 	 *	here temporarily in rcu_process_callbacks().
 	 */
-	long  	       	batch;
 	struct rcu_head *nxtlist;
-	struct rcu_head **nxttail[3];
-	long            qlen; 	 	 /* # of queued callbacks */
-	struct rcu_head *donelist;
-	struct rcu_head **donetail;
-	long		blimit;		 /* Upper limit on a processed batch */
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long            qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
 	int cpu;
 	struct rcu_head barrier;
 };
 
+extern struct rcu_state rcu_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 /*
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03..43062bf 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -68,7 +68,6 @@ config PREEMPT_RCU
 
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
 	select DEBUG_FS
 	default y
 	help
@@ -77,3 +76,33 @@ config RCU_TRACE
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on CLASSIC_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable hierarchical RCU auto-balancing"
+	depends on CLASSIC_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This can be useful
+	  on systems with strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df..d838fbd 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -75,6 +75,9 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+ifeq ($(CONFIG_CLASSIC_RCU),y)
+obj-$(CONFIG_RCU_TRACE) += rcuclassic_trace.o
+endif
 ifeq ($(CONFIG_PREEMPT_RCU),y)
 obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
 endif
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 01e761a..5584b22 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -19,6 +19,7 @@
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
  *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
@@ -27,7 +28,10 @@
  * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
+ * 	Documentation/RCU
+ * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
+ * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
+ * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
  *
  */
 #include <linux/types.h>
@@ -56,172 +60,71 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LEVEL_1,  /* root of hierarchy. */ \
+		NUM_RCU_LEVEL_2, \
+		NUM_RCU_LEVEL_3, \
+		NUM_RCU_LEVEL_4, /* == MAX_RCU_LEVELS */ \
+	}, \
+	.signaled = 0, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+}
 
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
 #ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+static void force_quiescent_state(struct rcu_state *rsp)
 {
 	int cpu;
-	cpumask_t cpumask;
 	unsigned long flags;
 
 	set_need_resched();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (unlikely(!rcp->signaled)) {
-		rcp->signaled = 1;
+	if (!spin_trylock_irqsave(&rsp->onofflock, flags))
+		return;
+	if (unlikely(!rsp->signaled)) {
+		rsp->signaled = 1;
 		/*
-		 * Don't send IPI to itself. With irqs disabled,
-		 * rdp->cpu is the current cpu.
-		 *
-		 * cpu_online_map is updated by the _cpu_down()
-		 * using __stop_machine(). Since we're in irqs disabled
-		 * section, __stop_machine() is not exectuting, hence
-		 * the cpu_online_map is stable.
-		 *
-		 * However,  a cpu might have been offlined _just_ before
-		 * we disabled irqs while entering here.
-		 * And rcu subsystem might not yet have handled the CPU_DEAD
-		 * notification, leading to the offlined cpu's bit
-		 * being set in the rcp->cpumask.
-		 *
-		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
-		 * sending smp_reschedule() to an offlined CPU.
+		 * Don't send IPI to self or to CPU that has already
+		 * passed through a quiescent state.
+		 * @@@ check dyntick state.  also do this incrementally.
 		 */
-		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask_nr(cpu, cpumask)
-			smp_send_reschedule(cpu);
+		for_each_online_cpu(cpu) {
+			if (cpu == smp_processor_id())
+				continue;
+			if (per_cpu(rcu_data, cpu).qs_pending)
+				smp_send_reschedule(cpu);
+		}
 	}
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 #else
-static inline void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+static inline void force_quiescent_state(struct rcu_state *rsp)
 {
 	set_need_resched();
 }
 #endif
 
-static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
-		struct rcu_data *rdp)
-{
-	long batch;
-
-	head->next = NULL;
-	smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
-
-	/*
-	 * Determine the batch number of this callback.
-	 *
-	 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
-	 * local variable "batch" and emits codes like this:
-	 *	1) rdp->batch = rcp->cur + 1 # gets old value
-	 *	......
-	 *	2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
-	 * then [*nxttail[0], *nxttail[1]) may contain callbacks
-	 * that batch# = rdp->batch, see the comment of struct rcu_data.
-	 */
-	batch = ACCESS_ONCE(rcp->cur) + 1;
-
-	if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
-		/* process callbacks */
-		rdp->nxttail[0] = rdp->nxttail[1];
-		rdp->nxttail[1] = rdp->nxttail[2];
-		if (rcu_batch_after(batch - 1, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[2];
-	}
-
-	rdp->batch = batch;
-	*rdp->nxttail[2] = head;
-	rdp->nxttail[2] = &head->next;
-
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
-	}
-}
-
-/**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- */
-void call_rcu(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu);
-
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-
 /*
  * Return the number of RCU batches processed thus far.  Useful
  * for debug and statistics.
  */
 long rcu_batches_completed(void)
 {
-	return rcu_ctrlblk.completed;
+	return rcu_state.completed;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
@@ -231,7 +134,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
  */
 long rcu_batches_completed_bh(void)
 {
-	return rcu_bh_ctrlblk.completed;
+	return rcu_bh_state.completed;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 
@@ -241,57 +144,6 @@ static inline void raise_rcu_softirq(void)
 	raise_softirq(RCU_SOFTIRQ);
 }
 
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
-	struct rcu_head *next, *list;
-	int count = 0;
-
-	list = rdp->donelist;
-	while (list) {
-		next = list->next;
-		prefetch(next);
-		list->func(list);
-		list = next;
-		if (++count >= rdp->blimit)
-			break;
-	}
-	rdp->donelist = list;
-
-	local_irq_disable();
-	rdp->qlen -= count;
-	local_irq_enable();
-	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
-		rdp->blimit = blimit;
-
-	if (!rdp->donelist)
-		rdp->donetail = &rdp->donelist;
-	else
-		raise_rcu_softirq();
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- *   This is done by rcu_start_batch. The start is not broadcasted to
- *   all cpus, they must pick this up by comparing rcp->cur with
- *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- *   Since the start of the grace period is not broadcasted, at least two
- *   calls to rcu_check_quiescent_state are required:
- *   The first call just notices that a new grace period is running. The
- *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- *   the bitmap is empty, then the grace period is completed.
- *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- *   period (if necessary).
- */
-
 #ifdef CONFIG_DEBUG_RCU_STALL
 
 static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
@@ -359,78 +211,316 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 
 #else /* #ifdef CONFIG_DEBUG_RCU_STALL */
 
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+static inline void record_gp_check_time(struct rcu_state *rsp)
 {
 }
 
 static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 }
 
 #endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
 
 /*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static inline int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static inline int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static inline struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+void rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LEVELS - 1; i >= 0; i--) {
+		levelspread[i] = CONFIG_RCU_FANOUT;
+	}
+	
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+void rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LEVELS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+	
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * When a given CPU first becomes aware of a grace period, it knows
+ * that all of its pre-existing callbacks will be covered by the next
+ * grace period.
+ *
+ * Similarly, if a given CPU has not yet let RCU know that it passed
+ * through a quiescent state for the current grace period, then that
+ * CPU knows that all of its callbacks may safely be invoked at the
+ * end of the next grace period.
+ */
+static inline void
+rcu_next_callbacks_are_ready(struct rcu_data *rdp)
+{
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+}
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.
  */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	if (rcp->cur != rcp->pending &&
-			rcp->completed == rcp->cur) {
-		rcp->cur++;
-		record_gp_check_time(rcp);
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long iflg)
+{
+	unsigned long flags = iflg;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
 
 		/*
-		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
-		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rcp->cpumask, which will extend graceperiods
-		 * unnecessarily.
+		 * Either there is no need to detect any more grace periods
+		 * at the moment, or we are already in the process of
+		 * detecting one.  Either way, we should not start a new
+		 * RCU grace period, so drop the lock and return.
 		 */
-		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
 
-		rcp->signaled = 0;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize our local state. */
+
+	rsp->gpnum++;
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+
+	rcu_next_callbacks_are_ready(rdp);
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
 	}
+
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LEVELS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LEVELS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
 /*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.
  */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
-		/* batch completed ! */
-		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp);
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
 	}
+	local_irq_restore(flags);
 }
 
 /*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
+ * Record a quiescent state for the specified CPU.  Note that a CPU
+ * going offline counts as a quiescent state.
  */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
+static void cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
+	long mask;
+	struct rcu_node *rnp;
 
-	if (rdp->quiescbatch != rcp->cur) {
-		/* start new grace period: */
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
-		rdp->quiescbatch = rcp->cur;
-		return;
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	mask = 1L << (cpu - rnp->grplo);
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = 1L << rnp->grpnum;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
 	}
 
-	/* Grace period already completed for this cpu?
-	 * qs_pending is checked instead of the actual bitmap to avoid
-	 * cacheline trashing.
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rdp);
+	rcu_start_gp(rsp, rdp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
 	 */
 	if (!rdp->qs_pending)
 		return;
@@ -441,195 +531,253 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 	 */
 	if (!rdp->passed_quiesc)
 		return;
-	rdp->qs_pending = 0;
 
-	spin_lock_irqsave(&rcp->lock, flags);
 	/*
-	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
-	 * during cpu startup. Ignore the quiescent state.
+	 * Set up to process all currently pending callbacks at the end
+	 * of the next grace period, as these pending callbacks are
+	 * guaranteed to have been registered before the beginning of
+	 * the next grace period.  Then record the fact that this CPU
+	 * has done its part for the current grace period.
 	 */
-	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp);
-
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	rcu_next_callbacks_are_ready(rdp);
+	rdp->qs_pending = 0;
+	cpu_quiet(rdp->cpu, rsp, rdp);
 }
 
-
 #ifdef CONFIG_HOTPLUG_CPU
 
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
- */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail, long batch)
-{
-	if (list) {
-		local_irq_disable();
-		this_rdp->batch = batch;
-		*this_rdp->nxttail[2] = list;
-		this_rdp->nxttail[2] = tail;
-		local_irq_enable();
-	}
-}
 
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ *
+ * @cpu is the CPU going offline, @rsp the RCU flavor being updated,
+ * @rdp the outgoing CPU's rcu_data, and @rdp_me the rcu_data that
+ * adopts the outgoing CPU's callbacks and callback count.  The
+ * rsp->onofflock is held, with irqs disabled, for the whole operation.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp,
+			      struct rcu_data *rdp, struct rcu_data *rdp_me)
 {
+	int i;
 	unsigned long flags;
+	long mask;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	spin_lock(&rnp->lock);			/* irqs already disabled. */
+	mask = 1L << (cpu - rnp->grplo);
+	for (;;) {
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			/* Node still has other members, so stop here. */
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		/* Node is now empty, so clear its bit in its parent too. */
+		mask = 1L << rnp->grpnum;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		rnp = rnp->parent;
+		if (rnp == NULL)
+			break;
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+	}
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp);
 
 	/*
-	 * if the cpu going offline owns the grace period
-	 * we can block indefinitely waiting for it, so flush
-	 * it here
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 *
+	 * The callback count must move along with the callbacks:
+	 * rcu_do_batch() subtracts each invoked callback from ->qlen,
+	 * and __call_rcu() compares ->qlen against qhimark.
 	 */
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-	spin_unlock(&rcp->lock);
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
 
-	this_rdp->qlen += rdp->qlen;
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ *
+ * The running CPU's rcu_data structures are obtained with get_cpu_var()
+ * so that preemption stays disabled, and hence "the current CPU" stays
+ * the same CPU, until both flavors' callbacks have been adopted.
+ */
 static void rcu_offline_cpu(int cpu)
 {
-	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
-	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *rdp_me = &get_cpu_var(rcu_data);
+	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+	struct rcu_data *bh_rdp_me = &get_cpu_var(rcu_bh_data);
 
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
-					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
-					&per_cpu(rcu_bh_data, cpu));
+	__rcu_offline_cpu(cpu, &rcu_state, rdp, rdp_me);
+	__rcu_offline_cpu(cpu, &rcu_bh_state, bh_rdp, bh_rdp_me);
 	put_cpu_var(rcu_data);
 	put_cpu_var(rcu_bh_data);
 }
 
-#else
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_offline_cpu(int cpu)
+static inline void
+rcu_offline_cpu(int cpu)
 {
 }
 
-#endif
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 
 /*
- * This does the RCU processing work from softirq context.
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.
  */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_data *rdp)
 {
-	long completed_snap;
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
 
-	if (rdp->nxtlist) {
-		local_irq_disable();
-		completed_snap = ACCESS_ONCE(rcp->completed);
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
 
-		/*
-		 * move the other grace-period-completed entries to
-		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
-		else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
-			rdp->nxttail[0] = rdp->nxttail[1];
+	/*
+	 * Extract the list of ready callbacks, disabling to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
 
-		/*
-		 * the grace period for entries in
-		 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
-		 * move these entries to donelist
-		 */
-		if (rdp->nxttail[0] != &rdp->nxtlist) {
-			*rdp->donetail = rdp->nxtlist;
-			rdp->donetail = rdp->nxttail[0];
-			rdp->nxtlist = *rdp->nxttail[0];
-			*rdp->donetail = NULL;
-
-			if (rdp->nxttail[1] == rdp->nxttail[0])
-				rdp->nxttail[1] = &rdp->nxtlist;
-			if (rdp->nxttail[2] == rdp->nxttail[0])
-				rdp->nxttail[2] = &rdp->nxtlist;
-			rdp->nxttail[0] = &rdp->nxtlist;
-		}
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	/* Update count, and requeue any remaining callbacks. */
+	local_irq_save(flags);
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+	local_irq_restore(flags);
 
-		local_irq_enable();
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 
-		if (rcu_batch_after(rdp->batch, rcp->pending)) {
-			unsigned long flags;
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_rcu_softirq();
+}
 
-			/* and start it/schedule start if it's a new batch */
-			spin_lock_irqsave(&rcp->lock, flags);
-			if (rcu_batch_after(rdp->batch, rcp->pending)) {
-				rcp->pending = rdp->batch;
-				rcu_start_batch(rcp);
-			}
-			spin_unlock_irqrestore(&rcp->lock, flags);
-		}
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  In order: note any
+ * grace-period end, report this CPU's quiescent state, start a new
+ * grace period if this CPU needs one, then invoke ready callbacks.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, rdp, flags);  /* releases root rcu_node ->lock */
 	}
 
-	rcu_check_quiescent_state(rcp, rdp);
-	if (rdp->donelist)
-		rcu_do_batch(rdp);
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
 }
 
+/*
+ * Do softirq processing for the current CPU.
+ */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	/*
 	 * Memory references from any prior RCU read-side critical sections
-	 * executed by the interrupted code must be see before any RCU
+	 * executed by the interrupted code must be seen before any RCU
 	 * grace-period manupulations below.
 	 */
 
 	smp_mb(); /* See above block comment. */
 
-	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 
 	/*
 	 * Memory references from any later RCU read-side critical sections
-	 * executed by the interrupted code must be see after any RCU
+	 * executed by the interrupted code must be seen after any RCU
 	 * grace-period manupulations above.
 	 */
 
 	smp_mb(); /* See above block comment. */
 }
 
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp, rdp);
+	check_cpu_stall(rsp, rdp);
 
-	if (rdp->nxtlist) {
-		long completed_snap = ACCESS_ONCE(rcp->completed);
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
 
-		/*
-		 * This cpu has pending rcu entries and the grace period
-		 * for them has completed.
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			return 1;
-		if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
-				rdp->nxttail[0] != rdp->nxttail[1])
-			return 1;
-		if (rdp->nxttail[0] != &rdp->nxtlist)
-			return 1;
+	/* Does this CPU have finished callbacks to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
 
-		/*
-		 * This cpu has pending rcu entries and the new batch
-		 * for then hasn't been started nor scheduled start
-		 */
-		if (rcu_batch_after(rdp->batch, rcp->pending))
-			return 1;
-	}
+	/* Are there callbacks waiting for a GP that needs to be started? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
 
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed)
 		return 1;
 
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum)
 		return 1;
 
 	/* nothing to do */
@@ -643,8 +791,8 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
  */
 int rcu_pending(int cpu)
 {
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
 }
 
 /*
@@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
 
-	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
+	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
+	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
+	       rcu_pending(cpu);
 }
 
 /*
- * Top-level function driving RCU grace-period detection, normally
- * invoked from the scheduler-clock interrupt.  This function simply
- * increments counters that are read only from softirq by this same
- * CPU, so there are no memory barriers required.
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
  */
 void rcu_check_callbacks(int cpu, int user)
 {
@@ -707,20 +860,132 @@ void rcu_check_callbacks(int cpu, int user)
 	raise_rcu_softirq();
 }
 
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
+static void
+__call_rcu(struct rcu_head *head, struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	long flags;
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
 
-	spin_lock_irqsave(&rcp->lock, flags);
-	memset(rdp, 0, sizeof(*rdp));
-	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
-	rdp->donetail = &rdp->donelist;
-	rdp->quiescbatch = rcp->completed;
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rsp);
+	}
+}
+
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * The callback is queued on the calling CPU's rcu_data structure.
+ */
+void call_rcu(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	/* Irqs off: __call_rcu() updates this CPU's callback list. */
+	local_irq_save(flags);
+	__call_rcu(head, &rcu_state, &__get_cpu_var(rcu_data));
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/**
+ * call_rcu_bh - Queue an RCU callback for invocation after a quicker
+ * grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by rcu_read_lock() and
+ * rcu_read_unlock() if in interrupt context, or by rcu_read_lock_bh()
+ * and rcu_read_unlock_bh() if in process context. These may be nested.
+ */
+void call_rcu_bh(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	/* Irqs off: __call_rcu() updates this CPU's callback list. */
+	local_irq_save(flags);
+	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+	int i;
+	long mask;
+	struct rcu_node *rnp = rdp->mynode;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	spin_lock(&rnp->lock);		/* irqs already disabled. */
+	completed_snap = ACCESS_ONCE(rsp->completed);
+	rdp->completed = completed_snap;
+	rdp->gpnum = completed_snap;
+	rdp->passed_quiesc = 1;
 	rdp->qs_pending = 0;
-	rdp->cpu = cpu;
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
 	rdp->blimit = blimit;
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	rdp->cpu = cpu;
+
+	/* Add CPU to rcu_node bitmasks. */
+
+	mask = 1L << (cpu - rnp->grplo);
+	for (;;) {
+		rnp->qsmaskinit |= mask;
+		mask = 1L << rnp->grpnum;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+		if ((rnp == NULL) || !!(rnp->qsmaskinit & mask))
+			break;
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
@@ -728,11 +993,14 @@ static void __cpuinit rcu_online_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
 
-	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
-	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+	rcu_init_percpu_data(cpu, &rcu_state, rdp);
+	rcu_init_percpu_data(cpu, &rcu_bh_state, bh_rdp);
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
+/*
+ * Handle CPU online/offline notification events.
+ */
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
@@ -753,20 +1021,81 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ *
+ * Sets up the per-level pointers into the rcu_node array and then
+ * initializes every rcu_node: its lock, its (initially empty) masks,
+ * the CPU range it covers, and its linkage to its parent.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/*
+	 * Initialize the level-tracking arrays.  Each level's nodes start
+	 * immediately after the previous level's nodes; rsp->level[0] is
+	 * presumably set up before this function runs (not visible here).
+	 */
+
+	for (i = 1; i < NUM_RCU_LEVELS; i++) {
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	}
+	rcu_init_levelspread(rsp);
+
+	/*
+	 * Initialize the elements themselves, starting from the leaves.
+	 * Only leaf nodes keep a ->grplo to ->grphi CPU range, clamped to
+	 * the count for the level below; other levels have it zeroed.
+	 * ->grpnum is the node's bit position within its parent's masks.
+	 */
+
+	for (i = NUM_RCU_LEVELS - 1; i >= 0; i--) {
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->grplo = j * rsp->levelspread[i];
+			rnp->grphi = (j + 1) * rsp->levelspread[i] - 1;
+			if (rnp->grphi >= rsp->levelcnt[i + 1])
+				rnp->grphi = rsp->levelcnt[i + 1] - 1;
+			rnp->qsmaskinit = 0;
+			if (i != NUM_RCU_LEVELS - 1)
+				rnp->grplo = rnp->grphi = 0;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->parent = rsp->level[i - 1] + 
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ * Uses the caller's local variables i, j, and rnp.  The "while" skips
+ * any leaf nodes that cover no possible CPUs, as can happen when CPU
+ * numbering is sparse; a single-step "if" would leave such a CPU
+ * attached to a leaf whose ->grphi is below the CPU's number.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LEVELS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		while (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+	} \
+} while (0)
+
 static struct notifier_block __cpuinitdata rcu_nb = {
 	.notifier_call	= rcu_cpu_notify,
 };
 
-/*
- * Initializes rcu mechanism.  Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
 void __init __rcu_init(void)
 {
-	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
 	/* Register notifier for non-boot CPUs */
 	register_cpu_notifier(&rcu_nb);
 }
diff --git a/kernel/rcuclassic_trace.c b/kernel/rcuclassic_trace.c
new file mode 100644
index 0000000..b7df67f
--- /dev/null
+++ b/kernel/rcuclassic_trace.c
@@ -0,0 +1,219 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+
+/* Held by each debugfs read handler while it uses rcuclassic_trace_buf. */
+static struct mutex rcuclassic_trace_mutex;
+/* Scratch buffer that the read handlers format their output into. */
+static char *rcuclassic_trace_buf;
+/* Size in bytes of rcuclassic_trace_buf (despite the RCUPREEMPT name). */
+#define RCUPREEMPT_TRACE_BUF_SIZE 4096
+
+/*
+ * Format one CPU's rcu_data state as a single text line at buf, never
+ * storing at or beyond ebuf.  Returns the number of characters actually
+ * stored, not counting the terminating NUL.
+ *
+ * scnprintf() is used rather than snprintf() because the latter returns
+ * the length the output would have had if not truncated, which would let
+ * the running offset step past ebuf and make the next size negative.
+ */
+static int print_one_rcu_data(struct rcu_data *rdp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		"%3d completed=%ld gpnum=%ld passed_q: %d qs_pending: %d",
+		rdp->cpu,
+		rdp->completed, rdp->gpnum,
+		rdp->passed_quiesc, rdp->qs_pending);
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" qlen: %ld blimit: %ld\n", rdp->qlen, rdp->blimit);
+	return cnt;
+}
+
+/*
+ * Append one print_one_rcu_data() line for each online CPU's instance
+ * of the per-CPU variable "name".  The "buf" argument is advanced past
+ * the text generated, so it must be a modifiable char pointer; both
+ * "buf" and "ebuf" are evaluated once per CPU.
+ */
+#define PRINT_RCU_DATA(name, buf, ebuf) \
+	do { \
+		int _p_r_d_i; \
+		\
+		for_each_online_cpu(_p_r_d_i) \
+			(buf) += print_one_rcu_data(&per_cpu(name, _p_r_d_i), \
+						    buf, ebuf); \
+	} while (0)
+
+/*
+ * debugfs read handler for "rcudata": formats the per-CPU rcu_data of
+ * both RCU flavors into the shared trace buffer, then copies the
+ * requested part of that text to user space.  Returns whatever
+ * simple_read_from_buffer() returns.
+ *
+ * The headings are emitted with scnprintf() so that buf advances only
+ * by what was actually stored and therefore cannot pass ebuf.
+ */
+static ssize_t rcudata_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/*
+ * Format one rcu_state structure at buf, never storing at or beyond
+ * ebuf: a summary line followed by every rcu_node, with a line break
+ * wherever the node level changes.  Returns the number of characters
+ * actually stored, not counting the terminating NUL.
+ *
+ * scnprintf() is used rather than snprintf() so that cnt counts only
+ * what was stored; snprintf()'s would-be length could push the offset
+ * past ebuf once the output no longer fits.
+ */
+static int print_one_rcu_state(struct rcu_state *rsp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+	int level = 0;
+	struct rcu_node *rnp;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+			 "completed: %ld gpnum: %ld signaled: %d\n",
+			 rsp->completed, rsp->gpnum, rsp->signaled);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {
+			cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+			level = rnp->level;
+		}
+		cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+				 "%lx/%lx %d:%d ^%d    ",
+				 rnp->qsmask, rnp->qsmaskinit,
+				 rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+	return cnt;
+}
+
+/*
+ * debugfs read handler for "rcuhier": formats the rcu_node hierarchy of
+ * both RCU flavors into the shared trace buffer, then copies the
+ * requested part of that text to user space.  Returns whatever
+ * simple_read_from_buffer() returns.
+ *
+ * The headings are emitted with scnprintf() so that buf advances only
+ * by what was actually stored and therefore cannot pass ebuf.
+ */
+static ssize_t rcuhier_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	buf += print_one_rcu_state(&rcu_state, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	buf += print_one_rcu_state(&rcu_bh_state, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/*
+ * debugfs read handler for "rcugp": one line per RCU flavor giving the
+ * completed and current grace-period numbers, formatted into the shared
+ * trace buffer and then copied out to user space.
+ */
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	char *pos = rcuclassic_trace_buf;
+	char *end = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+	ssize_t nread;
+
+	mutex_lock(&rcuclassic_trace_mutex);
+
+	/* Classic RCU first, then the rcu_bh flavor. */
+	pos += snprintf(pos, end - pos, "rcu: completed=%ld  gpnum=%ld\n",
+			rcu_state.completed, rcu_state.gpnum);
+	pos += snprintf(pos, end - pos, "rcu_bh: completed=%ld  gpnum=%ld\n",
+			rcu_bh_state.completed, rcu_bh_state.gpnum);
+
+	nread = simple_read_from_buffer(buffer, count, ppos,
+					rcuclassic_trace_buf,
+					strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+
+	return nread;
+}
+
+/*
+ * File operations for the three debugfs files.  Each supplies only a
+ * ->read handler.
+ */
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.read = rcudata_read,
+};
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuhier_read,
+};
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,
+};
+
+static struct dentry *rcudir, *datadir, *hierdir, *gpdir;
+
+/*
+ * Create the "rcu" debugfs directory along with its "rcudata", "rcugp",
+ * and "rcuhier" files.  Returns 0 on success.  If anything cannot be
+ * created, removes whatever was created and returns -ENOMEM, following
+ * the 0/-errno convention expected of initialization functions.
+ */
+static int rcuclassic_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	/* hierdir is created last, so it never needs removing here. */
+	if (datadir)
+		debugfs_remove(datadir);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Module initialization: set up the mutex, allocate the shared trace
+ * buffer, and create the debugfs files.  Returns 0 on success or a
+ * negative errno on failure, in which case the buffer has been freed.
+ *
+ * A positive return value must be avoided: it does not count as a
+ * failure to load, yet the buffer would already be gone.
+ */
+static int __init rcuclassic_trace_init(void)
+{
+	int ret;
+
+	mutex_init(&rcuclassic_trace_mutex);
+	rcuclassic_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcuclassic_trace_buf)
+		return -ENOMEM;
+	ret = rcuclassic_debugfs_init();
+	if (ret) {
+		kfree(rcuclassic_trace_buf);
+		rcuclassic_trace_buf = NULL;
+		if (ret > 0)
+			ret = -ENOMEM;	/* Normalize to 0/-errno. */
+	}
+	return ret;
+}
+
+/*
+ * Module exit: remove the three debugfs files, then the directory that
+ * held them, and finally free the shared trace buffer.
+ */
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+	kfree(rcuclassic_trace_buf);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-21 23:43 [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation Paul E. McKenney
@ 2008-08-22  4:37 ` Ingo Molnar
  2008-08-22 13:47   ` Paul E. McKenney
  2008-08-22 23:29 ` Josh Triplett
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-08-22  4:37 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> +#define MAX_RCU_LEVELS 3
> +#if NR_CPUS <= CONFIG_RCU_FANOUT
> +#define NUM_RCU_LEVELS 1
> +#define NUM_RCU_LEVEL_1 1
> +#define NUM_RCU_LEVEL_2 NR_CPUS
> +#define NUM_RCU_LEVEL_3 0
> +#define NUM_RCU_LEVEL_4 0
> +#define NUM_RCU_NODES NUM_RCU_LEVEL_1
> +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> +#define NUM_RCU_LEVELS 2
> +#define NUM_RCU_LEVEL_1 1
> +#define NUM_RCU_LEVEL_2 \
> +	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
> +#define NUM_RCU_LEVEL_3 NR_CPUS
> +#define NUM_RCU_LEVEL_4 0
> +#define NUM_RCU_NODES \
> +	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
> +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> +#define NUM_RCU_LEVELS 3
> +#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
> +#define NUM_RCU_LEVEL_1 1
> +#define NUM_RCU_LEVEL_2 \
> +	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
> +#define NUM_RCU_LEVEL_3 \
> +	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
> +#define NUM_RCU_LEVEL_4 NR_CPUS
> +#define NUM_RCU_NODES \
> +	((NUM_RCU_LEVEL_1) + \
> +	 (NUM_RCU_LEVEL_2) + \
> +	 (NUM_RCU_LEVEL_3))
> +#else
> +#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> +#endif

just a quick stylistic suggestion: if feasible then such sizing ugliness 
should be hidden in a Kconfig file. (if Kconfig is capable enough for 
this that is)

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-22  4:37 ` Ingo Molnar
@ 2008-08-22 13:47   ` Paul E. McKenney
  2008-08-22 17:22     ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-22 13:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Fri, Aug 22, 2008 at 06:37:15AM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > +#define MAX_RCU_LEVELS 3
> > +#if NR_CPUS <= CONFIG_RCU_FANOUT
> > +#define NUM_RCU_LEVELS 1
> > +#define NUM_RCU_LEVEL_1 1
> > +#define NUM_RCU_LEVEL_2 NR_CPUS
> > +#define NUM_RCU_LEVEL_3 0
> > +#define NUM_RCU_LEVEL_4 0
> > +#define NUM_RCU_NODES NUM_RCU_LEVEL_1
> > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > +#define NUM_RCU_LEVELS 2
> > +#define NUM_RCU_LEVEL_1 1
> > +#define NUM_RCU_LEVEL_2 \
> > +	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
> > +#define NUM_RCU_LEVEL_3 NR_CPUS
> > +#define NUM_RCU_LEVEL_4 0
> > +#define NUM_RCU_NODES \
> > +	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
> > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > +#define NUM_RCU_LEVELS 3
> > +#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
> > +#define NUM_RCU_LEVEL_1 1
> > +#define NUM_RCU_LEVEL_2 \
> > +	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
> > +#define NUM_RCU_LEVEL_3 \
> > +	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
> > +#define NUM_RCU_LEVEL_4 NR_CPUS
> > +#define NUM_RCU_NODES \
> > +	((NUM_RCU_LEVEL_1) + \
> > +	 (NUM_RCU_LEVEL_2) + \
> > +	 (NUM_RCU_LEVEL_3))
> > +#else
> > +#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > +#endif
> 
> just a quick stylistic suggestion: if feasible then such sizing ugliness 
> should be hidden in a Kconfig file. (if Kconfig is capable enough for 
> this that is)

I have no idea if Kconfig can do it, but I will check.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-22 13:47   ` Paul E. McKenney
@ 2008-08-22 17:22     ` Paul E. McKenney
  2008-08-22 18:16       ` Josh Triplett
  2008-08-23 16:07       ` Ingo Molnar
  0 siblings, 2 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-22 17:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Fri, Aug 22, 2008 at 06:47:20AM -0700, Paul E. McKenney wrote:
> On Fri, Aug 22, 2008 at 06:37:15AM +0200, Ingo Molnar wrote:
> > 
> > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > 
> > > +#define MAX_RCU_LEVELS 3
> > > +#if NR_CPUS <= CONFIG_RCU_FANOUT
> > > +#define NUM_RCU_LEVELS 1
> > > +#define NUM_RCU_LEVEL_1 1
> > > +#define NUM_RCU_LEVEL_2 NR_CPUS
> > > +#define NUM_RCU_LEVEL_3 0
> > > +#define NUM_RCU_LEVEL_4 0
> > > +#define NUM_RCU_NODES NUM_RCU_LEVEL_1
> > > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > > +#define NUM_RCU_LEVELS 2
> > > +#define NUM_RCU_LEVEL_1 1
> > > +#define NUM_RCU_LEVEL_2 \
> > > +	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
> > > +#define NUM_RCU_LEVEL_3 NR_CPUS
> > > +#define NUM_RCU_LEVEL_4 0
> > > +#define NUM_RCU_NODES \
> > > +	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
> > > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > > +#define NUM_RCU_LEVELS 3
> > > +#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
> > > +#define NUM_RCU_LEVEL_1 1
> > > +#define NUM_RCU_LEVEL_2 \
> > > +	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
> > > +#define NUM_RCU_LEVEL_3 \
> > > +	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
> > > +#define NUM_RCU_LEVEL_4 NR_CPUS
> > > +#define NUM_RCU_NODES \
> > > +	((NUM_RCU_LEVEL_1) + \
> > > +	 (NUM_RCU_LEVEL_2) + \
> > > +	 (NUM_RCU_LEVEL_3))
> > > +#else
> > > +#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > > +#endif
> > 
> > just a quick stylistic suggestion: if feasible then such sizing ugliness 
> > should be hidden in a Kconfig file. (if Kconfig is capable enough for 
> > this that is)
> 
> I have no idea if Kconfig can do it, but I will check.

OK, Kconfig does not currently support arithmetic, based on zconf.y:

expr:	  symbol				{ $$ = expr_alloc_symbol($1); }
	| symbol T_EQUAL symbol			{ $$ = expr_alloc_comp(E_EQUAL, $1, $3); }
	| symbol T_UNEQUAL symbol		{ $$ = expr_alloc_comp(E_UNEQUAL, $1, $3); }
	| T_OPEN_PAREN expr T_CLOSE_PAREN	{ $$ = $2; }
	| T_NOT expr				{ $$ = expr_alloc_one(E_NOT, $2); }
	| expr T_OR expr			{ $$ = expr_alloc_two(E_OR, $1, $3); }
	| expr T_AND expr			{ $$ = expr_alloc_two(E_AND, $1, $3); }
;

All we currently get is basic comparison and logical operators.  It would
not be all -that- hard to add general arithmetic (famous last words),
but when I tried mapping out what the sizing code would look like in
such an augmented Kconfig, it was even uglier than the above.

So I took a hard look at the current mess, and prettied it as shown below.
Is this a sufficient improvement?

Another alternative I am considering is moving this to a separate
include file.

Thoughts?

							Thanx, Paul

/*
 * Shape of the rcu_node hierarchy, derived from NR_CPUS and
 * CONFIG_RCU_FANOUT.
 *
 * NUM_RCU_LVLS is the number of rcu_node levels in use (root included).
 * NUM_RCU_LVL_n is the number of entries at level n, where level 0 is
 * the root.  The level immediately below the last rcu_node level is
 * populated by the CPUs themselves, hence NR_CPUS entries; any levels
 * beyond that are unused and set to 0.
 *
 * At most MAX_RCU_LEVELS levels of rcu_node are supported, so the build
 * fails if NR_CPUS exceeds the cube of the fanout.
 */
#define MAX_RCU_LEVELS 3
#define RCU_FANOUT (CONFIG_RCU_FANOUT)
#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)

#if (NR_CPUS) <= RCU_FANOUT
   /* Single rcu_node: the root directly covers every CPU. */
#  define NUM_RCU_LVLS 1
#  define NUM_RCU_LVL_0 1
#  define NUM_RCU_LVL_1 (NR_CPUS)
#  define NUM_RCU_LVL_2 0
#  define NUM_RCU_LVL_3 0
#elif (NR_CPUS) <= RCU_FANOUT_SQ
   /* Root plus one level of leaf nodes (NR_CPUS / fanout, rounded up). */
#  define NUM_RCU_LVLS 2
#  define NUM_RCU_LVL_0 1
#  define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
#  define NUM_RCU_LVL_2 (NR_CPUS)
#  define NUM_RCU_LVL_3 0
#elif (NR_CPUS) <= RCU_FANOUT_CUBE
   /* Root, one interior level, and one level of leaf nodes. */
#  define NUM_RCU_LVLS 3
#  define NUM_RCU_LVL_0 1
#  define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
#  define NUM_RCU_LVL_2 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
#  define NUM_RCU_LVL_3 (NR_CPUS)
#else
#  error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
#endif /* #if (NR_CPUS) <= RCU_FANOUT */

/*
 * RCU_SUM adds up every level, including the bottom-most one that counts
 * CPUs rather than rcu_node structures; subtracting NR_CPUS therefore
 * leaves just the number of rcu_node structures.  NR_CPUS is parenthesized
 * so that the subtraction stays correct even if it expands to an expression.
 */
#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
#define NUM_RCU_NODES (RCU_SUM - (NR_CPUS))

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-22 17:22     ` Paul E. McKenney
@ 2008-08-22 18:16       ` Josh Triplett
  2008-08-23 16:07       ` Ingo Molnar
  1 sibling, 0 replies; 94+ messages in thread
From: Josh Triplett @ 2008-08-22 18:16 UTC (permalink / raw)
  To: paulmck
  Cc: Ingo Molnar, linux-kernel, cl, akpm, manfred, dipankar, schamp,
	niv, dvhltc, ego, laijs, rostedt

On Fri, 2008-08-22 at 10:22 -0700, Paul E. McKenney wrote:
> On Fri, Aug 22, 2008 at 06:47:20AM -0700, Paul E. McKenney wrote:
> > On Fri, Aug 22, 2008 at 06:37:15AM +0200, Ingo Molnar wrote:
> > > 
> > > * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> > > 
> > > > +#define MAX_RCU_LEVELS 3
> > > > +#if NR_CPUS <= CONFIG_RCU_FANOUT
> > > > +#define NUM_RCU_LEVELS 1
> > > > +#define NUM_RCU_LEVEL_1 1
> > > > +#define NUM_RCU_LEVEL_2 NR_CPUS
> > > > +#define NUM_RCU_LEVEL_3 0
> > > > +#define NUM_RCU_LEVEL_4 0
> > > > +#define NUM_RCU_NODES NUM_RCU_LEVEL_1
> > > > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > > > +#define NUM_RCU_LEVELS 2
> > > > +#define NUM_RCU_LEVEL_1 1
> > > > +#define NUM_RCU_LEVEL_2 \
> > > > +	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
> > > > +#define NUM_RCU_LEVEL_3 NR_CPUS
> > > > +#define NUM_RCU_LEVEL_4 0
> > > > +#define NUM_RCU_NODES \
> > > > +	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
> > > > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > > > +#define NUM_RCU_LEVELS 3
> > > > +#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
> > > > +#define NUM_RCU_LEVEL_1 1
> > > > +#define NUM_RCU_LEVEL_2 \
> > > > +	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
> > > > +#define NUM_RCU_LEVEL_3 \
> > > > +	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
> > > > +#define NUM_RCU_LEVEL_4 NR_CPUS
> > > > +#define NUM_RCU_NODES \
> > > > +	((NUM_RCU_LEVEL_1) + \
> > > > +	 (NUM_RCU_LEVEL_2) + \
> > > > +	 (NUM_RCU_LEVEL_3))
> > > > +#else
> > > > +#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > > > +#endif
> > > 
> > > just a quick stylistic suggestion: if feasible then such sizing ugliness 
> > > should be hidden in a Kconfig file. (if Kconfig is capable enough for 
> > > this that is)
> > 
> > I have no idea if Kconfig can do it, but I will check.
> 
> OK, Kconfig does not currently support arithmetic, based on zconf.y:
> 
> expr:	  symbol				{ $$ = expr_alloc_symbol($1); }
> 	| symbol T_EQUAL symbol			{ $$ = expr_alloc_comp(E_EQUAL, $1, $3); }
> 	| symbol T_UNEQUAL symbol		{ $$ = expr_alloc_comp(E_UNEQUAL, $1, $3); }
> 	| T_OPEN_PAREN expr T_CLOSE_PAREN	{ $$ = $2; }
> 	| T_NOT expr				{ $$ = expr_alloc_one(E_NOT, $2); }
> 	| expr T_OR expr			{ $$ = expr_alloc_two(E_OR, $1, $3); }
> 	| expr T_AND expr			{ $$ = expr_alloc_two(E_AND, $1, $3); }
> ;
> 
> All we currently get is basic comparison and logical operators.  It would
> not be all -that- hard to add general arithmetic (famous last words),
> but when I tried mapping out what the sizing code would look like in
> such an augmented Kconfig, it was even uglier than the above.

Makes sense.

> So I took a hard look at the current mess, and prettied it as shown below.
> Is this a sufficient improvement?

Looks significantly improved.

> Another alternative I am considering is moving this to a separate
> include file.

I personally don't think that would help.  The revised version seems
fine.

> #define MAX_RCU_LEVELS 3
> #define RCU_FANOUT (CONFIG_RCU_FANOUT)
> #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
> #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
> 
> #if (NR_CPUS) <= RCU_FANOUT
> #  define NUM_RCU_LVLS 1
> #  define NUM_RCU_LVL_0 1
> #  define NUM_RCU_LVL_1 (NR_CPUS)
> #  define NUM_RCU_LVL_2 0
> #  define NUM_RCU_LVL_3 0
> #elif (NR_CPUS) <= RCU_FANOUT_SQ
> #  define NUM_RCU_LVLS 2
> #  define NUM_RCU_LVL_0 1
> #  define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> #  define NUM_RCU_LVL_2 (NR_CPUS)
> #  define NUM_RCU_LVL_3 0
> #elif (NR_CPUS) <= RCU_FANOUT_CUBE
> #  define NUM_RCU_LVLS 3
> #  define NUM_RCU_LVL_0 1
> #  define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> #  define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> #  define NUM_RCU_LVL_3 NR_CPUS
> #else
> # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> #endif /* #if (NR_CPUS) <= RCU_FANOUT */
> 
> #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
> #define NUM_RCU_NODES (RCU_SUM - NR_CPUS)

- Josh Triplett



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-21 23:43 [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation Paul E. McKenney
  2008-08-22  4:37 ` Ingo Molnar
@ 2008-08-22 23:29 ` Josh Triplett
  2008-08-23  1:53   ` Paul E. McKenney
  2008-08-25 10:34   ` Peter Zijlstra
  2008-08-24  8:08 ` Manfred Spraul
  2008-08-25  0:07 ` [PATCH, RFC, tip/core/rcu] v2 " Paul E. McKenney
  3 siblings, 2 replies; 94+ messages in thread
From: Josh Triplett @ 2008-08-22 23:29 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> Hello!
> 
> Experimental, not for inclusion.
> 
> Attached is a patch to Classic RCU that applies a hierarchy, greatly
> reducing the contention on the top-level lock for large machines.
> This passes mild rcutorture testing on x86 and ppc64, but is most
> definitely not ready for inclusion.  It is OK for experimental work
> assuming sufficiently brave experimenters.  See also Manfred Spraul's
> patch at http://lkml.org/lkml/2008/8/21/336 (or his earlier work from
> 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
> We will converge onto a common patch in the fullness of time, but
> are currently exploring different regions of the design space.
> 
> This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
> of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
> 64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
> there is no hierarchy.  By default, the RCU initialization code will
> adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
> architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
> this balancing, allowing the hierarchy to be exactly aligned to the
> underlying hardware.  Up to two levels of hierarchy are permitted
> (in addition to the root node), allowing up to 16,384 CPUs on 32-bit
> systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
> am going to regret saying this, but this seems more than sufficient
> for the foreseeable future.  (Some architectures might wish to set
> CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
> If this becomes a real problem, additional levels can be added, but I
> doubt that it will make a significant difference on real hardware.)
> 
> In the common case, a given CPU will manipulate its private rcu_data
> structure and the rcu_node structure that it shares with its immediate
> neighbors.  This can reduce both lock and memory contention by multiple
> orders of magnitude, which should eliminate the need for the strange
> manipulations that are reported to be required when running Linux on
> very large systems.
> 
> Some shortcomings:
> 
> o	The interface to dynticks is clumsy at best.  Improvements
> 	on the way.
> 
> o	CPU onlining and offlining is probably broken.  Will be tested.
> 
> o	The check-CPU-stalls code is busted.  Will be fixed.
> 
> o	There are probably hangs, rcutorture failures, &c.
> 
> o	There is not yet a human-readable design document.  Will be fixed.
> 
> o	The largest machine I can get my hands on at the moment only
> 	has 8 CPUs, which really doesn't stress this algorithm much.
> 
> If you want to use this against a Linus kernel, the following will work:
> 
> Start with 2.6.27-rc3.
> 
> Apply http://www.rdrop.com/users/paulmck/patches/paulmck-rcu.2008.08.20a.patch
> which catches you up to a recent linux-2.6-tip tip/core/rcu commit.
> 
> Apply http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-hierRCU-6.patch
> which gets you the current hierarchical RCU implementation.
> 
> Thoughts?

Looks great.  A few comments below.

> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---
> 
>  include/linux/rcuclassic.h |  154 ++++--
>  kernel/Kconfig.preempt     |   31 +
>  kernel/Makefile            |    3 
>  kernel/rcuclassic.c        | 1095 +++++++++++++++++++++++++++++----------------
>  kernel/rcuclassic_trace.c  |  219 +++++++++
>  5 files changed, 1076 insertions(+), 426 deletions(-)
> 
> diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
> index 1658995..97e646a 100644
> --- a/include/linux/rcuclassic.h
> +++ b/include/linux/rcuclassic.h
> @@ -18,6 +18,7 @@
>   * Copyright IBM Corporation, 2001

", 2008", here and elsewhere?

>   * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
> @@ -26,8 +27,10 @@
>   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
>   *
>   * For detailed explanation of Read-Copy Update mechanism see -
> - * 		Documentation/RCU
> - *
> + * 	Documentation/RCU
> + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
>   */

Why put these references here rather than in Documentation/RCU?  It
seems easier to keep documentation up to date in one place.  If you
think these represent a good "getting started" set of documents, how
about a Documentation/RCU/ReadTheseFirst with links to them, or how
about linking to them from whatisRCU.txt?

>  #ifndef __LINUX_RCUCLASSIC_H
> @@ -40,69 +43,136 @@
>  #include <linux/cpumask.h>
>  #include <linux/seqlock.h>
> 
> +/*
> + * Define the shape of the rcu_node hierarchy based on NR_CPUS and
> + * CONFIG_RCU_FANOUT.
> + */
> 
> -/* Global control variables for rcupdate callback mechanism. */
> -struct rcu_ctrlblk {
> -	long	cur;		/* Current batch number.                      */
> -	long	completed;	/* Number of the last completed batch         */
> -	long	pending;	/* Number of the last pending batch           */
> -#ifdef CONFIG_DEBUG_RCU_STALL
> -	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
> -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
> -
> -	int	signaled;
> +#define MAX_RCU_LEVELS 3
> +#if NR_CPUS <= CONFIG_RCU_FANOUT
> +#define NUM_RCU_LEVELS 1
> +#define NUM_RCU_LEVEL_1 1
> +#define NUM_RCU_LEVEL_2 NR_CPUS
> +#define NUM_RCU_LEVEL_3 0
> +#define NUM_RCU_LEVEL_4 0
> +#define NUM_RCU_NODES NUM_RCU_LEVEL_1
> +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> +#define NUM_RCU_LEVELS 2
> +#define NUM_RCU_LEVEL_1 1
> +#define NUM_RCU_LEVEL_2 \
> +	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
> +#define NUM_RCU_LEVEL_3 NR_CPUS
> +#define NUM_RCU_LEVEL_4 0
> +#define NUM_RCU_NODES \
> +	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
> +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> +#define NUM_RCU_LEVELS 3
> +#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
> +#define NUM_RCU_LEVEL_1 1
> +#define NUM_RCU_LEVEL_2 \
> +	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
> +#define NUM_RCU_LEVEL_3 \
> +	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
> +#define NUM_RCU_LEVEL_4 NR_CPUS
> +#define NUM_RCU_NODES \
> +	((NUM_RCU_LEVEL_1) + \
> +	 (NUM_RCU_LEVEL_2) + \
> +	 (NUM_RCU_LEVEL_3))
> +#else
> +#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> +#endif

This should get replaced by the revised version you followed up with.

> -	spinlock_t	lock	____cacheline_internodealigned_in_smp;
> -	cpumask_t	cpumask; /* CPUs that need to switch in order    */
> -				 /* for current batch to proceed.        */
> +/*
> + * Definition for node within the RCU grace-period-detection hierarchy.
> + */
> +struct rcu_node {
> +	spinlock_t lock;
> +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
> +				/*  order for current grace period to proceed.*/
> +	unsigned long	qsmaskinit;
> +				/* Per-GP initialization for qsmask.	      */
> +	int	grplo;		/* lowest-numbered CPU or group here.	      */
> +	int	grphi;		/* highest-numbered CPU or group here.	      */
> +	char	grpnum;		/* CPU/group number for next level up.	      */
> +	char	level;		/* root is at level 0.			      */

These four fields should use sized types, and preferably unsigned types.

> +	struct rcu_node *parent;
>  } ____cacheline_internodealigned_in_smp;
> 
> -/* Is batch a before batch b ? */
> -static inline int rcu_batch_before(long a, long b)
> -{
> -	return (a - b) < 0;
> -}
> +/*
> + * RCU global state, including node hierarchy.  This hierarchy is
> + * represented in "heap" form in a dense array.  The root (first level)
> + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
> + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
> + * and the third level in ->node[m+1] and following (->node[m+1] referenced
> + * by ->level[2]).  The number of levels is determined by the number of
> + * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
> + * consisting of a single rcu_node.
> + */
> +struct rcu_state {
> +	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
> +	struct rcu_node *level[NUM_RCU_LEVELS];	/* Hierarchy levels. */
> +	int levelcnt[MAX_RCU_LEVELS + 1];	/* # nodes in each level. */
> +	int levelspread[NUM_RCU_LEVELS];	/* kids/node in each level. */

These two should use sized types.

> +
> +	/* The following fields are guarded by the root rcu_node's lock. */
> +
> +	char	signaled ____cacheline_internodealigned_in_smp;
> +						/* sent GP-kick IPIs? */

u8 or bool, depending on semantics.  If just a simple flag, how about
bool?

> +	long	gpnum;				/* Current gp number. */
> +	long	completed;			/* # of last completed gp. */
> +	spinlock_t onofflock;			/* exclude on/offline and */
> +						/*  starting new GP. */
> +};
> 
> -/* Is batch a after batch b ? */
> -static inline int rcu_batch_after(long a, long b)
> -{
> -	return (a - b) > 0;
> -}
> +#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
> +#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
> +#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
> +#define RCU_NEXT_TAIL		3
> +#define RCU_NEXT_SIZE		4
> 
>  /* Per-CPU data for Read-Copy UPdate. */
>  struct rcu_data {
> -	/* 1) quiescent state handling : */
> -	long		quiescbatch;     /* Batch # for grace period */
> -	int		passed_quiesc;	 /* User-mode/idle loop etc. */
> -	int		qs_pending;	 /* core waits for quiesc state */
> +	/* 1) quiescent-state and grace-period handling : */
> +	long		completed;	/* Track rsp->completed gp number */
> +					/*  in order to detect GP end. */
> +	long		gpnum;		/* Highest gp number that this CPU */
> +					/*  is aware of having started. */
> +	int		passed_quiesc;	/* User-mode/idle loop etc. */
> +	int		qs_pending;	/* Core waits for quiesc state. */

Looks like several whitespace changes occurred here; several of these
lines didn't actually change except in whitespace.

The same comment about sized types applies here, but these fields didn't
actually change in this patch.

> +	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
> 
>  	/* 2) batch handling */
>  	/*
> -	 * if nxtlist is not NULL, then:
> -	 * batch:
> -	 *	The batch # for the last entry of nxtlist
> -	 * [*nxttail[1], NULL = *nxttail[2]):
> -	 *	Entries that batch # <= batch
> +	 * If nxtlist is not NULL, it is partitioned as follows.
> +	 * Any of the partitions might be empty, in which case the
> +	 * pointer to that partition will be equal to the pointer for
> +	 * the following partition.  When the list is empty, all of
> +	 * the nxttail elements point to nxtlist, which is NULL.
> +	 *
> +	 * [*nxttail[2], NULL = *nxttail[3]):
> +	 *	Entries that might have arrived after current GP ended
> +	 * [*nxttail[1], *nxttail[2]):
> +	 *	Entries known to have arrived before current GP ended
>  	 * [*nxttail[0], *nxttail[1]):
> -	 *	Entries that batch # <= batch - 1
> +	 *	Entries that batch # <= ->completed - 1: waiting for current GP
>  	 * [nxtlist, *nxttail[0]):
> -	 *	Entries that batch # <= batch - 2
> +	 *	Entries that batch # <= ->completed
>  	 *	The grace period for these entries has completed, and
>  	 *	the other grace-period-completed entries may be moved
>  	 *	here temporarily in rcu_process_callbacks().
>  	 */
> -	long  	       	batch;
>  	struct rcu_head *nxtlist;
> -	struct rcu_head **nxttail[3];
> -	long            qlen; 	 	 /* # of queued callbacks */
> -	struct rcu_head *donelist;
> -	struct rcu_head **donetail;
> -	long		blimit;		 /* Upper limit on a processed batch */
> +	struct rcu_head **nxttail[RCU_NEXT_SIZE];
> +	long            qlen; 	 	/* # of queued callbacks */
> +	long		blimit;		/* Upper limit on a processed batch */

Some whitespace changes again here; several of these lines didn't change
except in whitespace.

>  	int cpu;
>  	struct rcu_head barrier;
>  };
> 
> +extern struct rcu_state rcu_state;
>  DECLARE_PER_CPU(struct rcu_data, rcu_data);
> +
> +extern struct rcu_state rcu_bh_state;
>  DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);

Why extern and in the header?  I don't see anything else using them.

>  /*
> diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> index 9fdba03..43062bf 100644
> --- a/kernel/Kconfig.preempt
> +++ b/kernel/Kconfig.preempt
> @@ -68,7 +68,6 @@ config PREEMPT_RCU
> 
>  config RCU_TRACE
>  	bool "Enable tracing for RCU - currently stats in debugfs"
> -	depends on PREEMPT_RCU

Might want to document in the commit message that you have tracing
information through RCU_TRACE, and that it applies to non-preemptible
RCU as well now.

>  	select DEBUG_FS
>  	default y
>  	help
> @@ -77,3 +76,33 @@ config RCU_TRACE
> 
>  	  Say Y here if you want to enable RCU tracing
>  	  Say N if you are unsure.
> +
> +config RCU_FANOUT
> +	int "Hierarchical RCU fanout value"
> +	range 2 64 if 64BIT
> +	range 2 32 if !64BIT
> +	depends on CLASSIC_RCU
> +	default 64 if 64BIT
> +	default 32 if !64BIT
> +	help
> +	  This option controls the fanout of hierarchical implementations
> +	  of RCU, allowing RCU to work efficiently on machines with
> +	  large numbers of CPUs.  This value must be at least the cube
> +	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
> +	  systems and up to 262,144 for 64-bit systems.
> +
> +	  Select a specific number if testing RCU itself.

...or if attempting to tune for a specific NUMA system.

> +	  Take the default if unsure.
> +
> +config RCU_FANOUT_EXACT
> +	bool "Disable hierarchical RCU auto-balancing"
> +	depends on CLASSIC_RCU
> +	default n
> +	help
> +	  This option forces use of the exact RCU_FANOUT value specified,
> +	  regardless of imbalances in the hierarchy.  This can be useful
> +	  on systems with strong NUMA behavior.
> +
> +	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.

You might want to give a specific example of a NUMA machine, the
appropriate value to use on that machine, and the result with and
without RCU_FANOUT_EXACT.

> +	  Say n if unsure.
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 4e1d7df..d838fbd 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -75,6 +75,9 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
>  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
>  obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
>  obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
> +ifeq ($(CONFIG_CLASSIC_RCU),y)
> +obj-$(CONFIG_RCU_TRACE) += rcuclassic_trace.o
> +endif
>  ifeq ($(CONFIG_PREEMPT_RCU),y)
>  obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
>  endif

It might actually make sense here to do this instead:

ifeq ($(CONFIG_RCU_TRACE),y)
obj-$(CONFIG_CLASSIC_RCU) += rcuclassic_trace.o
obj-$(CONFIG_PREEMPT_RCU) += rcupreempt_trace.o
endif

> diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
> index 01e761a..5584b22 100644
> --- a/kernel/rcuclassic.c
> +++ b/kernel/rcuclassic.c
> @@ -27,7 +28,10 @@
>   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
>   *
>   * For detailed explanation of Read-Copy Update mechanism see -
> - * 		Documentation/RCU
> + * 	Documentation/RCU
> + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)

Same comment as before; maintaining these in a single place seems
easier.

> +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
>  DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
> +
> +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
>  DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };

How about making these state structures static, along with removing the
extern in the header?

> -static int blimit = 10;
> -static int qhimark = 10000;
> -static int qlowmark = 100;
> +static int blimit = 10;		/* Maximum callbacks per softirq. */
> +static int qhimark = 10000;	/* If this many pending, ignore blimit. */
> +static int qlowmark = 100;	/* Once only this many pending, use blimit. */

Indentation mismatch on the comments?

>  #ifdef CONFIG_SMP
> -static void force_quiescent_state(struct rcu_data *rdp,
> -			struct rcu_ctrlblk *rcp)
> +static void force_quiescent_state(struct rcu_state *rsp)
>  {
>  	int cpu;
> -	cpumask_t cpumask;
>  	unsigned long flags;
> 
>  	set_need_resched();
> -	spin_lock_irqsave(&rcp->lock, flags);
> -	if (unlikely(!rcp->signaled)) {
> -		rcp->signaled = 1;
> +	if (!spin_trylock_irqsave(&rsp->onofflock, flags))
> +		return;

This seems to make force_quiescent_state rather less forceful.

> +/*
> + * Does the current CPU require a yet-as-unscheduled grace period?
> + */
> +static inline int
> +cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
> +{
> +	return *rdp->nxttail[RCU_DONE_TAIL] &&
> +	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
> +}

ACCESS_ONCE, like memory barriers, benefits from an accompanying
explanation.

> -#else
> +#else /* #ifdef CONFIG_HOTPLUG_CPU */
> 
> -static void rcu_offline_cpu(int cpu)
> +static inline void
> +rcu_offline_cpu(int cpu)
>  {
>  }

No need to explicitly say "inline"; GCC should do the right thing here.
The same comment applies in a couple of other places in your patch.


> @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
>  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
>  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> 
> -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> +	       rcu_pending(cpu);

!! seems unnecessary here.

> +void call_rcu_bh(struct rcu_head *head,
> +				void (*func)(struct rcu_head *rcu))
> +{
> +	unsigned long flags;
> +
> +	head->func = func;
> +	head->next = NULL;
> +	local_irq_save(flags);
> +	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
> +	local_irq_restore(flags);
> +}
> +EXPORT_SYMBOL_GPL(call_rcu_bh);

This comment applies to the original code, but:
You only call __call_rcu twice, in call_rcu and call_rcu_bh.  Both
times, you set head first, then wrap the call with local_irq_save.  How
about moving both into __call_rcu, making call_rcu and call_rcu_bh
one-liners?

> --- /dev/null
> +++ b/kernel/rcuclassic_trace.c

> +static struct mutex rcuclassic_trace_mutex;

static DEFINE_MUTEX(rcuclassic_trace_mutex);
Then you don't need mutex_init later in your init function.

> +static char *rcuclassic_trace_buf;
> +#define RCUPREEMPT_TRACE_BUF_SIZE 4096

Did you perhaps want PAGE_SIZE?

- Josh Triplett



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-22 23:29 ` Josh Triplett
@ 2008-08-23  1:53   ` Paul E. McKenney
  2008-08-25 22:02     ` Josh Triplett
  2008-08-25 10:34   ` Peter Zijlstra
  1 sibling, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-23  1:53 UTC (permalink / raw)
  To: Josh Triplett
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > Hello!
> > 
> > Experimental, not for inclusion.
> > 
> > Attached is a patch to Classic RCU that applies a hierarchy, greatly
> > reducing the contention on the top-level lock for large machines.
> > This passes mild rcutorture testing on x86 and ppc64, but is most
> > definitely not ready for inclusion.  It is OK for experimental work
> > assuming sufficiently brave experimenters.  See also Manfred Spraul's
> > patch at http://lkml.org/lkml/2008/8/21/336 (or his earlier work from
> > 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
> > We will converge onto a common patch in the fullness of time, but
> > are currently exploring different regions of the design space.
> > 
> > This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
> > of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
> > 64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
> > there is no hierarchy.  By default, the RCU initialization code will
> > adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
> > architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
> > this balancing, allowing the hierarchy to be exactly aligned to the
> > underlying hardware.  Up to two levels of hierarchy are permitted
> > (in addition to the root node), allowing up to 16,384 CPUs on 32-bit
> > systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
> > am going to regret saying this, but this seems more than sufficient
> > for the foreseeable future.  (Some architectures might wish to set
> > CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
> > If this becomes a real problem, additional levels can be added, but I
> > doubt that it will make a significant difference on real hardware.)
> > 
> > In the common case, a given CPU will manipulate its private rcu_data
> > structure and the rcu_node structure that it shares with its immediate
> > neighbors.  This can reduce both lock and memory contention by multiple
> > orders of magnitude, which should eliminate the need for the strange
> > manipulations that are reported to be required when running Linux on
> > very large systems.
> > 
> > Some shortcomings:
> > 
> > o	The interface to dynticks is clumsy at best.  Improvements
> > 	on the way.
> > 
> > o	CPU onlining and offlining is probably broken.  Will be tested.
> > 
> > o	The check-CPU-stalls code is busted.  Will be fixed.
> > 
> > o	There are probably hangs, rcutorture failures, &c.
> > 
> > o	There is not yet a human-readable design document.  Will be fixed.
> > 
> > o	The largest machine I can get my hands on at the moment only
> > 	has 8 CPUs, which really doesn't stress this algorithm much.
> > 
> > If you want to use this against a Linus kernel, the following will work:
> > 
> > Start with 2.6.27-rc3.
> > 
> > Apply http://www.rdrop.com/users/paulmck/patches/paulmck-rcu.2008.08.20a.patch
> > which catches you up to a recent linux-2.6-tip tip/core/rcu commit.
> > 
> > Apply http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-hierRCU-6.patch
> > which gets you the current hierarchical RCU implementation.
> > 
> > Thoughts?
> 
> Looks great.  A few comments below.

Thank you for reviewing this!

> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> > 
> >  include/linux/rcuclassic.h |  154 ++++--
> >  kernel/Kconfig.preempt     |   31 +
> >  kernel/Makefile            |    3 
> >  kernel/rcuclassic.c        | 1095 +++++++++++++++++++++++++++++----------------
> >  kernel/rcuclassic_trace.c  |  219 +++++++++
> >  5 files changed, 1076 insertions(+), 426 deletions(-)
> > 
> > diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
> > index 1658995..97e646a 100644
> > --- a/include/linux/rcuclassic.h
> > +++ b/include/linux/rcuclassic.h
> > @@ -18,6 +18,7 @@
> >   * Copyright IBM Corporation, 2001
> 
> ", 2008", here and elsewhere?

Hey, at least I got the first three digits right!  ;-)  Fixed.

> >   * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
> > @@ -26,8 +27,10 @@
> >   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> >   *
> >   * For detailed explanation of Read-Copy Update mechanism see -
> > - * 		Documentation/RCU
> > - *
> > + * 	Documentation/RCU
> > + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> > + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> > + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> >   */
> 
> Why put these references here rather than in Documentation/RCU?  It
> seems easier to keep documentation up to date in one place.  If you
> think these represent a good "getting started" set of documents, how
> about a Documentation/RCU/ReadTheseFirst with links to them, or how
> about linking to them from whatisRCU.txt?

OK, now just refers to Documentation/RCU.

> >  #ifndef __LINUX_RCUCLASSIC_H
> > @@ -40,69 +43,136 @@
> >  #include <linux/cpumask.h>
> >  #include <linux/seqlock.h>
> > 
> > +/*
> > + * Define the shape of the rcu_node hierarchy based on NR_CPUS and
> > + * CONFIG_RCU_FANOUT.
> > + */
> > 
> > -/* Global control variables for rcupdate callback mechanism. */
> > -struct rcu_ctrlblk {
> > -	long	cur;		/* Current batch number.                      */
> > -	long	completed;	/* Number of the last completed batch         */
> > -	long	pending;	/* Number of the last pending batch           */
> > -#ifdef CONFIG_DEBUG_RCU_STALL
> > -	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
> > -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
> > -
> > -	int	signaled;
> > +#define MAX_RCU_LEVELS 3
> > +#if NR_CPUS <= CONFIG_RCU_FANOUT
> > +#define NUM_RCU_LEVELS 1
> > +#define NUM_RCU_LEVEL_1 1
> > +#define NUM_RCU_LEVEL_2 NR_CPUS
> > +#define NUM_RCU_LEVEL_3 0
> > +#define NUM_RCU_LEVEL_4 0
> > +#define NUM_RCU_NODES NUM_RCU_LEVEL_1
> > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > +#define NUM_RCU_LEVELS 2
> > +#define NUM_RCU_LEVEL_1 1
> > +#define NUM_RCU_LEVEL_2 \
> > +	(((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT))
> > +#define NUM_RCU_LEVEL_3 NR_CPUS
> > +#define NUM_RCU_LEVEL_4 0
> > +#define NUM_RCU_NODES \
> > +	((NUM_RCU_LEVEL_1) + (NUM_RCU_LEVEL_2))
> > +#elif NR_CPUS <= CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT * CONFIG_RCU_FANOUT
> > +#define NUM_RCU_LEVELS 3
> > +#define RCU_FANOUT_SQ ((CONFIG_RCU_FANOUT) * (CONFIG_RCU_FANOUT))
> > +#define NUM_RCU_LEVEL_1 1
> > +#define NUM_RCU_LEVEL_2 \
> > +	(((NR_CPUS) + (RCU_FANOUT_SQ) - 1) / (RCU_FANOUT_SQ))
> > +#define NUM_RCU_LEVEL_3 \
> > +	((NR_CPUS) + (CONFIG_RCU_FANOUT) - 1) / (CONFIG_RCU_FANOUT)
> > +#define NUM_RCU_LEVEL_4 NR_CPUS
> > +#define NUM_RCU_NODES \
> > +	((NUM_RCU_LEVEL_1) + \
> > +	 (NUM_RCU_LEVEL_2) + \
> > +	 (NUM_RCU_LEVEL_3))
> > +#else
> > +#error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > +#endif
> 
> This should get replaced by the revised version you followed up with.

Yep, fixed.

> > -	spinlock_t	lock	____cacheline_internodealigned_in_smp;
> > -	cpumask_t	cpumask; /* CPUs that need to switch in order    */
> > -				 /* for current batch to proceed.        */
> > +/*
> > + * Definition for node within the RCU grace-period-detection hierarchy.
> > + */
> > +struct rcu_node {
> > +	spinlock_t lock;
> > +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
> > +				/*  order for current grace period to proceed.*/
> > +	unsigned long	qsmaskinit;
> > +				/* Per-GP initialization for qsmask.	      */
> > +	int	grplo;		/* lowest-numbered CPU or group here.	      */
> > +	int	grphi;		/* highest-numbered CPU or group here.	      */
> > +	char	grpnum;		/* CPU/group number for next level up.	      */
> > +	char	level;		/* root is at level 0.			      */
> 
> These four fields should use sized types, and preferably unsigned types.

OK for grpnum and level, but grphi and grplo need to be "int" to
match the various CPU-manipulation primitives.

> > +	struct rcu_node *parent;
> >  } ____cacheline_internodealigned_in_smp;
> > 
> > -/* Is batch a before batch b ? */
> > -static inline int rcu_batch_before(long a, long b)
> > -{
> > -	return (a - b) < 0;
> > -}
> > +/*
> > + * RCU global state, including node hierarchy.  This hierarchy is
> > + * represented in "heap" form in a dense array.  The root (first level)
> > + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
> > + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
> > + * and the third level in ->node[m+1] and following (->node[m+1] referenced
> > + * by ->level[2]).  The number of levels is determined by the number of
> > + * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
> > + * consisting of a single rcu_node.
> > + */
> > +struct rcu_state {
> > +	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
> > +	struct rcu_node *level[NUM_RCU_LEVELS];	/* Hierarchy levels. */
> > +	int levelcnt[MAX_RCU_LEVELS + 1];	/* # nodes in each level. */
> > +	int levelspread[NUM_RCU_LEVELS];	/* kids/node in each level. */
> 
> These two should use sized types.

Fair enough.  And can be 8 bits, for that matter.

> > +
> > +	/* The following fields are guarded by the root rcu_node's lock. */
> > +
> > +	char	signaled ____cacheline_internodealigned_in_smp;
> > +						/* sent GP-kick IPIs? */
> 
> u8 or bool, depending on semantics.  If just a simple flag, how about
> bool?

This will need to be a non-bool shortly.

OK, so what the heck -are- the official type names???  u8 seems
to be defined in a powerpc-specific file.  OK, it also appears in
include/asm-generic/int-l64.h.  s8, u8, s16, u16, s32, u32, s64, and
u64, then?

> > +	long	gpnum;				/* Current gp number. */
> > +	long	completed;			/* # of last completed gp. */
> > +	spinlock_t onofflock;			/* exclude on/offline and */
> > +						/*  starting new GP. */
> > +};
> > 
> > -/* Is batch a after batch b ? */
> > -static inline int rcu_batch_after(long a, long b)
> > -{
> > -	return (a - b) > 0;
> > -}
> > +#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
> > +#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
> > +#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
> > +#define RCU_NEXT_TAIL		3
> > +#define RCU_NEXT_SIZE		4
> > 
> >  /* Per-CPU data for Read-Copy UPdate. */
> >  struct rcu_data {
> > -	/* 1) quiescent state handling : */
> > -	long		quiescbatch;     /* Batch # for grace period */
> > -	int		passed_quiesc;	 /* User-mode/idle loop etc. */
> > -	int		qs_pending;	 /* core waits for quiesc state */
> > +	/* 1) quiescent-state and grace-period handling : */
> > +	long		completed;	/* Track rsp->completed gp number */
> > +					/*  in order to detect GP end. */
> > +	long		gpnum;		/* Highest gp number that this CPU */
> > +					/*  is aware of having started. */
> > +	int		passed_quiesc;	/* User-mode/idle loop etc. */
> > +	int		qs_pending;	/* Core waits for quiesc state. */
> 
> Looks like several whitespace changes occurred here; several of these
> lines didn't actually change except in whitespace.

Whitespace-only changes are kind of lost in the noise with this patch.
Best to apply it and inspect the result.

> The same comment about sized types applies here, but these fields didn't
> actually change in this patch.

Both can be bool.  The completed and gpnum fields can probably be s32,
but deferring that change.

> > +	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
> > 
> >  	/* 2) batch handling */
> >  	/*
> > -	 * if nxtlist is not NULL, then:
> > -	 * batch:
> > -	 *	The batch # for the last entry of nxtlist
> > -	 * [*nxttail[1], NULL = *nxttail[2]):
> > -	 *	Entries that batch # <= batch
> > +	 * If nxtlist is not NULL, it is partitioned as follows.
> > +	 * Any of the partitions might be empty, in which case the
> > +	 * pointer to that partition will be equal to the pointer for
> > +	 * the following partition.  When the list is empty, all of
> > +	 * the nxttail elements point to nxtlist, which is NULL.
> > +	 *
> > +	 * [*nxttail[2], NULL = *nxttail[3]):
> > +	 *	Entries that might have arrived after current GP ended
> > +	 * [*nxttail[1], *nxttail[2]):
> > +	 *	Entries known to have arrived before current GP ended
> >  	 * [*nxttail[0], *nxttail[1]):
> > -	 *	Entries that batch # <= batch - 1
> > +	 *	Entries that batch # <= ->completed - 1: waiting for current GP
> >  	 * [nxtlist, *nxttail[0]):
> > -	 *	Entries that batch # <= batch - 2
> > +	 *	Entries that batch # <= ->completed
> >  	 *	The grace period for these entries has completed, and
> >  	 *	the other grace-period-completed entries may be moved
> >  	 *	here temporarily in rcu_process_callbacks().
> >  	 */
> > -	long  	       	batch;
> >  	struct rcu_head *nxtlist;
> > -	struct rcu_head **nxttail[3];
> > -	long            qlen; 	 	 /* # of queued callbacks */
> > -	struct rcu_head *donelist;
> > -	struct rcu_head **donetail;
> > -	long		blimit;		 /* Upper limit on a processed batch */
> > +	struct rcu_head **nxttail[RCU_NEXT_SIZE];
> > +	long            qlen; 	 	/* # of queued callbacks */
> > +	long		blimit;		/* Upper limit on a processed batch */
> 
> Some whitespace changes again here; several of these lines didn't change
> except in whitespace.

And the string of spaces needs to become tabs while I am at it...

> >  	int cpu;
> >  	struct rcu_head barrier;
> >  };
> > 
> > +extern struct rcu_state rcu_state;
> >  DECLARE_PER_CPU(struct rcu_data, rcu_data);
> > +
> > +extern struct rcu_state rcu_bh_state;
> >  DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
> 
> Why extern and in the header?  I don't see anything else using them.

kernel/rcuclassic_trace.c, right?

> >  /*
> > diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
> > index 9fdba03..43062bf 100644
> > --- a/kernel/Kconfig.preempt
> > +++ b/kernel/Kconfig.preempt
> > @@ -68,7 +68,6 @@ config PREEMPT_RCU
> > 
> >  config RCU_TRACE
> >  	bool "Enable tracing for RCU - currently stats in debugfs"
> > -	depends on PREEMPT_RCU
> 
> Might want to document in the commit message that you have tracing
> information through RCU_TRACE, and that it applies to non-preemptible
> RCU as well now.

And in Documentation/RCU, for that matter...

> >  	select DEBUG_FS
> >  	default y
> >  	help
> > @@ -77,3 +76,33 @@ config RCU_TRACE
> > 
> >  	  Say Y here if you want to enable RCU tracing
> >  	  Say N if you are unsure.
> > +
> > +config RCU_FANOUT
> > +	int "Hierarchical RCU fanout value"
> > +	range 2 64 if 64BIT
> > +	range 2 32 if !64BIT
> > +	depends on CLASSIC_RCU
> > +	default 64 if 64BIT
> > +	default 32 if !64BIT
> > +	help
> > +	  This option controls the fanout of hierarchical implementations
> > +	  of RCU, allowing RCU to work efficiently on machines with
> > +	  large numbers of CPUs.  This value must be at least the cube
> > +	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
> > +	  systems and up to 262,144 for 64-bit systems.
> > +
> > +	  Select a specific number if testing RCU itself.
> 
> ...or if attempting to tune for a specific NUMA system.

Indeed.  But I need to see an actual example before I document it.
It would be easy to make things slower by following the NUMA hardware
layout.

> > +	  Take the default if unsure.
> > +
> > +config RCU_FANOUT_EXACT
> > +	bool "Disable hierarchical RCU auto-balancing"
> > +	depends on CLASSIC_RCU
> > +	default n
> > +	help
> > +	  This option forces use of the exact RCU_FANOUT value specified,
> > +	  regardless of imbalances in the hierarchy.  This can be useful
> > +	  on systems with strong NUMA behavior.
> > +
> > +	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
> 
> You might want to give a specific example of a NUMA machine, the
> appropriate value to use on that machine, and the result with and
> without RCU_FANOUT_EXACT.

Or change "can" to "might".  ;-)

> > +	  Say n if unsure.
> > diff --git a/kernel/Makefile b/kernel/Makefile
> > index 4e1d7df..d838fbd 100644
> > --- a/kernel/Makefile
> > +++ b/kernel/Makefile
> > @@ -75,6 +75,9 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
> >  obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
> >  obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
> >  obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
> > +ifeq ($(CONFIG_CLASSIC_RCU),y)
> > +obj-$(CONFIG_RCU_TRACE) += rcuclassic_trace.o
> > +endif
> >  ifeq ($(CONFIG_PREEMPT_RCU),y)
> >  obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
> >  endif
> 
> It might actually make sense here to do this instead:
> 
> ifeq ($(CONFIG_RCU_TRACE),y)
> obj-$(CONFIG_CLASSIC_RCU) += rcuclassic_trace.o
> obj-$(CONFIG_PREEMPT_RCU) += rcupreempt_trace.o
> endif

Excellent point!  Fixed.

> > diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
> > index 01e761a..5584b22 100644
> > --- a/kernel/rcuclassic.c
> > +++ b/kernel/rcuclassic.c
> > @@ -27,7 +28,10 @@
> >   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> >   *
> >   * For detailed explanation of Read-Copy Update mechanism see -
> > - * 		Documentation/RCU
> > + * 	Documentation/RCU
> > + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> > + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> > + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> 
> Same comment as before; maintaining these in a single place seems
> easier.

Now just does Documentation/RCU.

> > +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
> >  DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
> > +
> > +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
> >  DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
> 
> How about making these state structures static, along with removing the
> extern in the header?

No can do, as I need them in kernel/rcuclassic_trace.c.

> > -static int blimit = 10;
> > -static int qhimark = 10000;
> > -static int qlowmark = 100;
> > +static int blimit = 10;		/* Maximum callbacks per softirq. */
> > +static int qhimark = 10000;	/* If this many pending, ignore blimit. */
> > +static int qlowmark = 100;	/* Once only this many pending, use blimit. */
> 
> Indentation mismatch on the comments?

Looks fine in the source -- context diff-ism.

> >  #ifdef CONFIG_SMP
> > -static void force_quiescent_state(struct rcu_data *rdp,
> > -			struct rcu_ctrlblk *rcp)
> > +static void force_quiescent_state(struct rcu_state *rsp)
> >  {
> >  	int cpu;
> > -	cpumask_t cpumask;
> >  	unsigned long flags;
> > 
> >  	set_need_resched();
> > -	spin_lock_irqsave(&rcp->lock, flags);
> > -	if (unlikely(!rcp->signaled)) {
> > -		rcp->signaled = 1;
> > +	if (!spin_trylock_irqsave(&rsp->onofflock, flags))
> > +		return;
> 
> This seems to make force_quiescent_state rather less forceful.

It will try again on the next scheduling-clock interrupt.  The reason
I did this is because ->onofflock is a global lock acquired when
beginning a grace period or when onlining/offlining.  Can't let
force_quiescent_state() monopolize things, and would like to exclude
online/offline while doing force_quiescent_state().  Hence make
force_quiescent_state() back off if the lock is held.

There is probably a better way to do this...

> > +/*
> > + * Does the current CPU require a yet-as-unscheduled grace period?
> > + */
> > +static inline int
> > +cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
> > +{
> > +	return *rdp->nxttail[RCU_DONE_TAIL] &&
> > +	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
> > +}
> 
> ACCESS_ONCE, like memory barriers, benefits from an accompanying
> explanation.

OK.  These are accessed without holding the relevant lock, and I don't
want the compiler to refetch them.  (For example, if inlined into other
code touching these same variables.)  Yes, I am paranoid.

> > -#else
> > +#else /* #ifdef CONFIG_HOTPLUG_CPU */
> > 
> > -static void rcu_offline_cpu(int cpu)
> > +static inline void
> > +rcu_offline_cpu(int cpu)
> >  {
> >  }
> 
> No need to explicitly say "inline"; GCC should do the right thing here.
> Same comment applies a couple of other places in your patch.

OK, I will get rid of these.  You can do the other 26,000 of them.  ;-)

> > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > 
> > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > +	       rcu_pending(cpu);
> 
> !! seems unnecessary here.

Someone once told me why this was necessary, but I forget.  It was in the
original, and I didn't put it there.  Some weirdness about conversion
to 32-bit integer when the lower 32 bits of the pointer were zero or
some such.  For example, if your pointer value was 0x100000000,
conversion to int would give zero.

> > +void call_rcu_bh(struct rcu_head *head,
> > +				void (*func)(struct rcu_head *rcu))
> > +{
> > +	unsigned long flags;
> > +
> > +	head->func = func;
> > +	head->next = NULL;
> > +	local_irq_save(flags);
> > +	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
> > +	local_irq_restore(flags);
> > +}
> > +EXPORT_SYMBOL_GPL(call_rcu_bh);
> 
> This comment applies to the original code, but:
> You only call __call_rcu twice, in call_rcu and call_rcu_bh.  Both
> times, you set head first, then wrap the call with local_irq_save.  How
> about moving both into __call_rcu, making call_rcu and call_rcu_bh
> one-liners?

I can't pass "rcu_data" to a function (or at least I don't know how to
do so, short of passing __per_cpu_rcu_data and doing the per-CPU stuff
by hand).  I could make __call_rcu() be a macro, but that seemed more
ugly than it seemed worthwhile.

Is there some other approach that would work?

> > --- /dev/null
> > +++ b/kernel/rcuclassic_trace.c
> 
> > +static struct mutex rcuclassic_trace_mutex;
> 
> static DEFINE_MUTEX(rcuclassic_trace_mutex);
> Then you don't need mutex_init later in your init function.

Good point.

> > +static char *rcuclassic_trace_buf;
> > +#define RCUPREEMPT_TRACE_BUF_SIZE 4096
> 
> Did you perhaps want PAGE_SIZE?

I really want some way of gracefully handling arbitrarily long output
to debugfs.  I am sure that some such exists, but haven't found it.
What I do instead is to arbitrarily truncate output to 4096 bytes,
which will be stunningly less than useful on a 4,096-CPU machine.  :-/

Suggestions welcome!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-22 17:22     ` Paul E. McKenney
  2008-08-22 18:16       ` Josh Triplett
@ 2008-08-23 16:07       ` Ingo Molnar
  2008-08-24  2:44         ` Paul E. McKenney
  1 sibling, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-08-23 16:07 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> Is this a sufficient improvement?

yeah - looks much better. This was the block that meets the eye for the 
first time in the patch so it stuck out.

just one more small pet peeve of mine: please use vertical alignment too 
to improve readability. Instead of:

> #define MAX_RCU_LEVELS 3
> #define RCU_FANOUT (CONFIG_RCU_FANOUT)
> #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
> #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)

this looks a bit more structured IMO:

> #define MAX_RCU_LEVELS	3
> #define RCU_FANOUT		(CONFIG_RCU_FANOUT)
> #define RCU_FANOUT_SQ		(RCU_FANOUT * RCU_FANOUT)
> #define RCU_FANOUT_CUBE	(RCU_FANOUT_SQ * RCU_FANOUT)

maybe even this:

> #if (NR_CPUS) <= RCU_FANOUT
> #  define NUM_RCU_LVLS	1
> #  define NUM_RCU_LVL_0	1
> #  define NUM_RCU_LVL_1	(NR_CPUS)
> #  define NUM_RCU_LVL_2	0
> #  define NUM_RCU_LVL_3	0
> #elif (NR_CPUS) <= RCU_FANOUT_SQ
> #  define NUM_RCU_LVLS	2
> #  define NUM_RCU_LVL_0	1
> #  define NUM_RCU_LVL_1	(((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> #  define NUM_RCU_LVL_2	(NR_CPUS)
> #  define NUM_RCU_LVL_3	0
> #elif (NR_CPUS) <= RCU_FANOUT_CUBE
> #  define NUM_RCU_LVLS	3
> #  define NUM_RCU_LVL_0	1
> #  define NUM_RCU_LVL_1	(((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> #  define NUM_RCU_LVL_2	(((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> #  define NUM_RCU_LVL_3	NR_CPUS
> #else
> # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> #endif /* #if (NR_CPUS) <= RCU_FANOUT */

but no strong feelings on that one. (maybe inserting a space at the 
right places helps too, no need for a full tab)

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-23 16:07       ` Ingo Molnar
@ 2008-08-24  2:44         ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-24  2:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Sat, Aug 23, 2008 at 06:07:35PM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > Is this a sufficient improvement?
> 
> yeah - looks much better. This was the block that meets the eye for the 
> first time in the patch so it stuck out.
> 
> just one more small pet peeve of mine: please use vertical alignment too 
> to improve readability. Instead of:
> 
> > #define MAX_RCU_LEVELS 3
> > #define RCU_FANOUT (CONFIG_RCU_FANOUT)
> > #define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT)
> > #define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT)
> 
> this looks a bit more structured IMO:
> 
> > #define MAX_RCU_LEVELS	3
> > #define RCU_FANOUT		(CONFIG_RCU_FANOUT)
> > #define RCU_FANOUT_SQ		(RCU_FANOUT * RCU_FANOUT)
> > #define RCU_FANOUT_CUBE	(RCU_FANOUT_SQ * RCU_FANOUT)

Good point, fixed.

> maybe even this:
> 
> > #if (NR_CPUS) <= RCU_FANOUT
> > #  define NUM_RCU_LVLS	1
> > #  define NUM_RCU_LVL_0	1
> > #  define NUM_RCU_LVL_1	(NR_CPUS)
> > #  define NUM_RCU_LVL_2	0
> > #  define NUM_RCU_LVL_3	0
> > #elif (NR_CPUS) <= RCU_FANOUT_SQ
> > #  define NUM_RCU_LVLS	2
> > #  define NUM_RCU_LVL_0	1
> > #  define NUM_RCU_LVL_1	(((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> > #  define NUM_RCU_LVL_2	(NR_CPUS)
> > #  define NUM_RCU_LVL_3	0
> > #elif (NR_CPUS) <= RCU_FANOUT_CUBE
> > #  define NUM_RCU_LVLS	3
> > #  define NUM_RCU_LVL_0	1
> > #  define NUM_RCU_LVL_1	(((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> > #  define NUM_RCU_LVL_2	(((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> > #  define NUM_RCU_LVL_3	NR_CPUS
> > #else
> > # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > #endif /* #if (NR_CPUS) <= RCU_FANOUT */
> 
> but no strong feelings on that one. (maybe inserting a space at the 
> right places helps too, no need for a full tab)

Yep, just like you, spaced it just enough to keep the longest one from
running over one line.  ;-)

I left the definitions for RCU_SUM and NUM_RCU_NODES compact, though:

#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)

The other alternative would be to stack RCU_SUM as follows:

#define RCU_SUM		      (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + \
			       NUM_RCU_LVL_2 + NUM_RCU_LVL_3)

which seemed to me to add more ugly than enlightenment.

Testing is going well.  Having to occasionally restrain myself to keep
from going full-bore for 4096 CPU optimality -- but have to keep it
simple until/unless someone with that large of a machine shows where
improvements are needed.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-21 23:43 [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation Paul E. McKenney
  2008-08-22  4:37 ` Ingo Molnar
  2008-08-22 23:29 ` Josh Triplett
@ 2008-08-24  8:08 ` Manfred Spraul
  2008-08-24 16:32   ` Paul E. McKenney
  2008-08-25  0:07 ` [PATCH, RFC, tip/core/rcu] v2 " Paul E. McKenney
  3 siblings, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-08-24  8:08 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

Paul E. McKenney wrote:
> +/*
> + * Definition for node within the RCU grace-period-detection hierarchy.
> + */
> +struct rcu_node {
> +	spinlock_t lock;
> +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
> +				/*  order for current grace period to proceed.*/
> +	unsigned long	qsmaskinit;
> +				/* Per-GP initialization for qsmask.	      */
>   
I'm not sure if a bitmap is the right storage. If I understand the code 
correctly, it contains two pieces of information:
1) If the bitmap is clear, then all cpus have completed whatever they 
need to do.
A counter is more efficient than a bitmap. In particular, it would allow 
choosing the optimal fan-out, independent of 32/64 bits.
2) The information if the current cpu must do something to complete the 
current period.
This is local information; usually (always?) only the current cpu 
needs to know if it must do something.
But this doesn't need to be stored in a shared structure, the 
information could be stored in a per-cpu structure.

> +	/*
> +	 * Extract the list of ready callbacks, disabling to prevent
> +	 * races with call_rcu() from interrupt handlers.
> +	 */
> +	local_irq_save(flags);
> +	list = rdp->nxtlist;
> +	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
> +	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
> +	tail = rdp->nxttail[RCU_DONE_TAIL];
> +	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
> +		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
> +			rdp->nxttail[count] = &rdp->nxtlist;
> +	local_irq_restore(flags);
>   
Do you have a description of the events between call_rcu() and the rcu 
callback?
Is the following description correct?

__call_rcu() queues in RCU_NEXT_TAIL.
In the middle of the current grace period: rcu_check_quiescent_state() 
calls rcu_next_callbacks_are_ready().
Entry now in RCU_NEXT_READY_TAIL
** 0.5 cycles: wait until all cpus have completed the current cycle.
rcu_process_gp_end() moves from NEXT_READY_TAIL to WAIT_TAIL

** full grace period
rcu_process_gp_end() moves from WAIT_TAIL to DONE_TAIL
rcu_do_batch() finds the entries in DONE_TAIL and calls the callback.


> +/*
> + * Do softirq processing for the current CPU.
> + */
>  static void rcu_process_callbacks(struct softirq_action *unused)
>  {
>  	/*
>  	 * Memory references from any prior RCU read-side critical sections
> -	 * executed by the interrupted code must be see before any RCU
> +	 * executed by the interrupted code must be seen before any RCU
>  	 * grace-period manupulations below.
>  	 */
>  
>  	smp_mb(); /* See above block comment. */
>   
Why this mb()? There was a grace period between the last read side 
critical section that might have accessed the pointer.
The rcu internal code already does spin_lock()+spin_unlock(). Isn't that 
sufficient?

>  
> -	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
> -	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
> +	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
> +	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
>   
Have you considered merging RCU_DONE_TAIL for rcu_data and rcu_bh_data?

> +
> +/**
> + * call_rcu - Queue an RCU callback for invocation after a grace period.
> + * @head: structure to be used for queueing the RCU updates.
> + * @func: actual update function to be invoked after the grace period
> + *
> + * The update function will be invoked some time after a full grace
> + * period elapses, in other words after all currently executing RCU
> + * read-side critical sections have completed.  RCU read-side critical
> + * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
> + * and may be nested.
> + */
>   
The docbook entry is duplicated: They are in include/linux/rcupdate.h 
and here.
What about removing one of them?
I would go one step further:
Even add call_rcu_sched() into rcupdate.h. Add a Kconfig bool 
"RCU_NEEDS_SCHED" and automatically define either the extern or the #define.

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-24  8:08 ` Manfred Spraul
@ 2008-08-24 16:32   ` Paul E. McKenney
  2008-08-24 18:25     ` Manfred Spraul
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-24 16:32 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Sun, Aug 24, 2008 at 10:08:44AM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:

Thank you for looking this over!

>> + * Definition for node within the RCU grace-period-detection hierarchy.
>> + */
>> +struct rcu_node {
>> +	spinlock_t lock;
>> +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
>> +				/*  order for current grace period to proceed.*/
>> +	unsigned long	qsmaskinit;
>> +				/* Per-GP initialization for qsmask.	      */
>>   
> I'm not sure if a bitmap is the right storage. If I understand the code 
> correctly, it contains two information:
> 1) If the bitmap is clear, then all cpus have completed whatever they need 
> to do.
> A counter is more efficient than a bitmap. Especially: It would allow to 
> choose the optimal fan-out, independent from 32/64 bits.
> 2) The information if the current cpu must do something to complete the 
> current period.
> This is a local information, usually (always?) only the current cpu needs 
> to know if it must do something.
> But this doesn't need to be stored in a shared structure, the information 
> could be stored in a per-cpu structure.

I am using the bitmap in force_quiescent_state() to work out who to
check dynticks and who to send reschedule IPIs to.  I could scan all
of the per-CPU rcu_data structures, but am assuming that after a few
jiffies there would typically be relatively few CPUs still needing to do
a quiescent state.  Given this assumption, on systems with large numbers
of CPUs, scanning the bitmask greatly reduces the number of cache misses
compared to scanning the rcu_data structures.

>> +	/*
>> +	 * Extract the list of ready callbacks, disabling to prevent
>> +	 * races with call_rcu() from interrupt handlers.
>> +	 */
>> +	local_irq_save(flags);
>> +	list = rdp->nxtlist;
>> +	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
>> +	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
>> +	tail = rdp->nxttail[RCU_DONE_TAIL];
>> +	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
>> +		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
>> +			rdp->nxttail[count] = &rdp->nxtlist;
>> +	local_irq_restore(flags);
>>   
> Do you have a description of the events between call_rcu() and the rcu 
> callback?
> Is the following description correct?
>
> __call_rcu() queues in RCU_NEXT_TAIL.
> In the middle of the current grace period: rcu_check_quiescent_state() 
> calls rcu_next_callbacks_are_ready().
> Entry now in RCU_NEXT_READY_TAIL
> ** 0.5 cycles: wait until all cpus have completed the current cycle.
> rcu_process_gp_end() moves from NEXT_READY_TAIL to WAIT_TAIL
>
> ** full grace period
> rcu_process_gp_end() moves from WAIT_TAIL to DONE_TAIL
> rcu_do_batch() finds the entries in DONE_TAIL and calls the callback.

Yes, that is the sequence of events if grace periods are happening back to
back and if the current CPU has not yet passed through a quiescent state.
If RCU is idle, then there is no need to wait for a previous grace
period to complete.  If this CPU already passed through its quiescent
state for the first grace period, and is not the CPU that starts the next
grace period, then there will be an additional grace period to move from
RCU_NEXT_TAIL to RCU_NEXT_READY_TAIL.  If this CPU is the one starting
the next grace period, then all of its callbacks get advanced.

>> +/*
>> + * Do softirq processing for the current CPU.
>> + */
>>  static void rcu_process_callbacks(struct softirq_action *unused)
>>  {
>>  	/*
>>  	 * Memory references from any prior RCU read-side critical sections
>> -	 * executed by the interrupted code must be see before any RCU
>> +	 * executed by the interrupted code must be seen before any RCU
>>  	 * grace-period manupulations below.
>>  	 */
>>   	smp_mb(); /* See above block comment. */
>>   
> Why this mb()? There was a grace period between the last read side critical 
> section that might have accessed the pointer.
> The rcu internal code already does spin_lock()+spin_unlock(). Isn't that 
> sufficient?

The combination of spin_lock()+spin_unlock()+spin_lock()+spin_unlock()
would suffice.  But the pair of smp_mb()s suffices regardless of fast
paths through the rest of the mechanism.  Because rcu_process_callbacks()
is on the slow path, the trivial proof of ordering is more important than
the slight reduction in overhead.

>>  -	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
>> -	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
>> +	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
>> +	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
>>   
> Have you considered merging RCU_DONE_TAIL for rcu_data and rcu_bh_data?

I have (and am) considering this.  It would require an additional per-CPU
data structure, would require an additional check in rcu_pending(), and
require a bit more manipulation when callbacks become "done".  However,
it would simplify the requeuing code in rcu_do_batch() a bit.

One interesting side effect would be that blimit would apply globally
to both rcu and rcu_bh, and that a burst of (say) rcu callbacks could
delay rcu_bh callbacks.  I am not yet sure whether this is good or bad.

>> +
>> +/**
>> + * call_rcu - Queue an RCU callback for invocation after a grace period.
>> + * @head: structure to be used for queueing the RCU updates.
>> + * @func: actual update function to be invoked after the grace period
>> + *
>> + * The update function will be invoked some time after a full grace
>> + * period elapses, in other words after all currently executing RCU
>> + * read-side critical sections have completed.  RCU read-side critical
>> + * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
>> + * and may be nested.
>> + */
>>   
> The docbook entry is duplicated: They are in include/linux/rcupdate.h and 
> here.
> What about removing one of them?

Good point, done!

> I would go one step further:
> Even add call_rcu_sched() into rcupdate.h. Add a Kconfig bool 
> "RCU_NEEDS_SCHED" and automatically define either the extern or the 
> #define.

Another approach would be to have an rcupdate.h definition that
was a wrapper around __call_rcu_sched(), and put the docbook stuff in
rcupdate.h.  It would appear that docbook was not created with the idea
of alternative implementations in mind.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-24 16:32   ` Paul E. McKenney
@ 2008-08-24 18:25     ` Manfred Spraul
  2008-08-24 21:19       ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-08-24 18:25 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

Paul E. McKenney wrote:
>>> + */
>>> +struct rcu_node {
>>> +	spinlock_t lock;
>>> +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
>>> +				/*  order for current grace period to proceed.*/
>>> +	unsigned long	qsmaskinit;
>>> +				/* Per-GP initialization for qsmask.	      */
>>>   
>>>       
>> I'm not sure if a bitmap is the right storage. If I understand the code 
>> correctly, it contains two information:
>> 1) If the bitmap is clear, then all cpus have completed whatever they need 
>> to do.
>> A counter is more efficient than a bitmap. Especially: It would allow to 
>> choose the optimal fan-out, independent from 32/64 bits.
>> 2) The information if the current cpu must do something to complete the 
>> current period.
>> This is a local information, usually (always?) only the current cpu needs 
>> to know if it must do something.
>> But this doesn't need to be stored in a shared structure, the information 
>> could be stored in a per-cpu structure.
>>     
>
> I am using the bitmap in force_quiescent_state() to work out who to
> check dynticks and who to send reschedule IPIs to.  I could scan all
> of the per-CPU rcu_data structures, but am assuming that after a few
> jiffies there would typically be relatively few CPUs still needing to do
> a quiescent state.  Given this assumption, on systems with large numbers
> of CPUs, scanning the bitmask greatly reduces the number of cache misses
> compared to scanning the rcu_data structures.
>
>   
It's an optimization question: What is rarer? force_quiescent_state() or 
"normal" cpu_quiet calls.
You have optimized for force_quiescent_state(), I have optimized for 
"normal" cpu_quiet calls. [ok, I admit: force_quiescent_state() is still 
missing in my code].
Do you have any statistics?

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-24 18:25     ` Manfred Spraul
@ 2008-08-24 21:19       ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-24 21:19 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Sun, Aug 24, 2008 at 08:25:02PM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:
>>>> + */
>>>> +struct rcu_node {
>>>> +	spinlock_t lock;
>>>> +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
>>>> +				/*  order for current grace period to proceed.*/
>>>> +	unsigned long	qsmaskinit;
>>>> +				/* Per-GP initialization for qsmask.	      */
>>>>         
>>> I'm not sure if a bitmap is the right storage. If I understand the code 
>>> correctly, it contains two information:
>>> 1) If the bitmap is clear, then all cpus have completed whatever they 
>>> need to do.
>>> A counter is more efficient than a bitmap. Especially: It would allow to 
>>> choose the optimal fan-out, independent from 32/64 bits.
>>> 2) The information if the current cpu must do something to complete the 
>>> current period.
>>> This is a local information, usually (always?) only the current cpu needs 
>>> to know if it must do something.
>>> But this doesn't need to be stored in a shared structure, the information 
>>> could be stored in a per-cpu structure.
>>
>> I am using the bitmap in force_quiescent_state() to work out who to
>> check dynticks and who to send reschedule IPIs to.  I could scan all
>> of the per-CPU rcu_data structures, but am assuming that after a few
>> jiffies there would typically be relatively few CPUs still needing to do
>> a quiescent state.  Given this assumption, on systems with large numbers
>> of CPUs, scanning the bitmask greatly reduces the number of cache misses
>> compared to scanning the rcu_data structures.
>>   
> It's an optimization question: What is rarer? force_quiescent_state() or 
> "normal" cpu_quiet calls.
> You have optimized for force_quiescent_state(), I have optimized for 
> "normal" cpu_quiet calls. [ok, I admit: force_quiescent_state() is still 
> missing in my code].

;-)

> Do you have any statistics?

If the system is completely busy, then I would expect normal cpu_quiet()
calls to be more common.  But if the system were sized for peak
workload, it would spend a fair amount of time with many of the CPUs
idle.  Power-conservation measures would hopefully push the idleness
into single cores/dies/whatever which could then be powered down.

A large fraction of the systems I see have utilizations well under 50%.
And latency concerns would also focus on force_quiescent_state().

That said, I haven't had much to do with systems having more than 128
CPUs.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [PATCH, RFC, tip/core/rcu] v2 scalable classic RCU implementation
  2008-08-21 23:43 [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation Paul E. McKenney
                   ` (2 preceding siblings ...)
  2008-08-24  8:08 ` Manfred Spraul
@ 2008-08-25  0:07 ` Paul E. McKenney
  2008-08-30  0:49   ` [PATCH, RFC, tip/core/rcu] v3 " Paul E. McKenney
  3 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-25  0:07 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt

Hello!

Still experimental, not for inclusion.

Updates from earlier version:

o	Handles dyntick idle state, including interrupts and NMIs,
	but while allowing dyntick-idle CPUs to remain idle (in
	contrast to the previous version, which simply whacked all
	non-responding CPUs with a resched IPI).

o	Made force_quiescent_state() more intelligent, so that it
	no longer whacks all CPUs whether they need it or not.

o	Cleaned up cpp code that determines size and shape of the
	rcu_node hierarchy.

o	Added debugfs tracing capability (was in previous patch,
	but forgot to mention it).

Attached is an updated patch to Classic RCU that applies a hierarchy,
greatly reducing the contention on the top-level lock for large machines.
This passes mild rcutorture testing on x86 and ppc64, but is most
definitely not ready for inclusion.  It is OK for experimental work
assuming sufficiently brave experimenters.  See also Manfred Spraul's
patch at http://lkml.org/lkml/2008/8/21/336 (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).  We will
converge onto a common patch in the fullness of time, but are currently
exploring different regions of the design space.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS does not exceed CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 32,768 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	Entering and leaving dynticks idle mode is a quiescent state,
	but the current patch doesn't take advantage of this (noted
	by Manfred).  If there was an arch-independent in_nmi() or
	some such, it would also be possible to take advantage of
	interrupts from dyntick-idle mode -- and simplify the dynticks
	code interfacing to RCU.

o	CPU onlining and offlining is probably broken.  Testing in
	progress.

o	The check-CPU-stalls code is busted.  Will be fixed.

o	There are probably hangs, rcutorture failures, &c.

o	There is not yet a human-readable design document.  Will be fixed.

o	The largest machine I can get my hands on at the moment only
	has 8 CPUs, which really doesn't stress this algorithm much.

If you want to use this against a Linus kernel, the following will work:

Start with 2.6.27-rc3.

Apply http://www.rdrop.com/users/paulmck/patches/paulmck-rcu.2008.08.20a.patch
which catches you up to a recent linux-2.6-tip tip/core/rcu commit.

Apply http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-hierRCU-9.patch
which gets you the current hierarchical RCU implementation.

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 include/linux/hardirq.h    |    4 
 include/linux/rcuclassic.h |  221 +++++--
 kernel/Kconfig.preempt     |   32 +
 kernel/Makefile            |    5 
 kernel/rcuclassic.c        | 1374 ++++++++++++++++++++++++++++++++-------------
 kernel/rcuclassic_trace.c  |  218 +++++++
 6 files changed, 1421 insertions(+), 433 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..a776bf0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,13 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+#endif /* #if defined(CONFIG_NO_HZ) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 1658995..b21300f 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -15,19 +15,16 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright IBM Corporation, 2001
+ * Copyright IBM Corporation, 2008
  *
  * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
  *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
- *
+ * 	Documentation/RCU
  */
 
 #ifndef __LINUX_RCUCLASSIC_H
@@ -40,69 +37,159 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
 
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	long	cur;		/* Current batch number.                      */
-	long	completed;	/* Number of the last completed batch         */
-	long	pending;	/* Number of the last pending batch           */
-#ifdef CONFIG_DEBUG_RCU_STALL
-	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
-#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+#if (NR_CPUS) <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
 
-	int	signaled;
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
 
-	spinlock_t	lock	____cacheline_internodealigned_in_smp;
-	cpumask_t	cpumask; /* CPUs that need to switch in order    */
-				 /* for current batch to proceed.        */
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
+				/*  order for current grace period to proceed.*/
+	unsigned long	qsmaskinit;
+				/* Per-GP initialization for qsmask.	      */
+	int	grplo;		/* lowest-numbered CPU or group here.	      */
+	int	grphi;		/* highest-numbered CPU or group here.	      */
+	u8	grpnum;		/* CPU/group number for next level up.	      */
+	u8	level;		/* root is at level 0.			      */
+	struct rcu_node *parent;
 } ____cacheline_internodealigned_in_smp;
 
-/* Is batch a before batch b ? */
-static inline int rcu_batch_before(long a, long b)
-{
-	return (a - b) < 0;
-}
+/* Values for signaled field in struct rcu_state. */
+#define RCU_SAVE_DYNTICK	0	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		1	/* Need to force quiescent state. */
+#define RCU_SIGNALED		2	/* We have done all that we can. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
 
-/* Is batch a after batch b ? */
-static inline int rcu_batch_after(long a, long b)
-{
-	return (a - b) > 0;
-}
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u8 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
 
-/* Per-CPU data for Read-Copy UPdate. */
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* sent GP-kick IPIs? */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
 struct rcu_data {
-	/* 1) quiescent state handling : */
-	long		quiescbatch;     /* Batch # for grace period */
-	int		passed_quiesc;	 /* User-mode/idle loop etc. */
-	int		qs_pending;	 /* core waits for quiesc state */
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
 
 	/* 2) batch handling */
 	/*
-	 * if nxtlist is not NULL, then:
-	 * batch:
-	 *	The batch # for the last entry of nxtlist
-	 * [*nxttail[1], NULL = *nxttail[2]):
-	 *	Entries that batch # <= batch
-	 * [*nxttail[0], *nxttail[1]):
-	 *	Entries that batch # <= batch - 1
-	 * [nxtlist, *nxttail[0]):
-	 *	Entries that batch # <= batch - 2
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
 	 *	The grace period for these entries has completed, and
 	 *	the other grace-period-completed entries may be moved
 	 *	here temporarily in rcu_process_callbacks().
 	 */
-	long  	       	batch;
 	struct rcu_head *nxtlist;
-	struct rcu_head **nxttail[3];
-	long            qlen; 	 	 /* # of queued callbacks */
-	struct rcu_head *donelist;
-	struct rcu_head **donetail;
-	long		blimit;		 /* Upper limit on a processed batch */
-	int cpu;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+	/* 3) rcu-barrier functions */
 	struct rcu_head barrier;
+
+#ifdef CONFIG_NO_HZ
+	/* 4) dynticks interface (see http://lwn.net/Articles/279077/) */
+	int dynticks_nesting;		/* Track nesting level, sort of. */
+	int dynticks;			/* Even for dynticks-idle mode. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	int cpu;
 };
 
+extern struct rcu_state rcu_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 /*
@@ -115,11 +202,13 @@ static inline void rcu_qsctr_inc(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
 }
 static inline void rcu_bh_qsctr_inc(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
 }
 
 extern int rcu_pending(int cpu);
@@ -172,7 +261,41 @@ extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
 
+#ifdef CONFIG_NO_HZ
+
+/*
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ *
+ * @@@ note quiescent state???
+ */
+static inline void rcu_enter_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	__get_cpu_var(rcu_data).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_data).dynticks & 0x1, &rs);
+}
+
+/*
+ * Exit nohz mode.
+ */
+static inline void rcu_exit_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	__get_cpu_var(rcu_data).dynticks++;
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_data).dynticks & 0x1),
+				&rs);
+}
+
+#else /* CONFIG_NO_HZ */
 #define rcu_enter_nohz()	do { } while (0)
 #define rcu_exit_nohz()		do { } while (0)
+#endif /* CONFIG_NO_HZ */
 
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03..38a64ae 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -68,7 +68,6 @@ config PREEMPT_RCU
 
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
 	select DEBUG_FS
 	default y
 	help
@@ -77,3 +76,34 @@ config RCU_TRACE
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on CLASSIC_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable hierarchical RCU auto-balancing"
+	depends on CLASSIC_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df..b018f62 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -75,8 +75,9 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
+ifeq ($(CONFIG_RCU_TRACE),y)
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic_trace.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt_trace.o
 endif
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 01e761a..905018c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -15,20 +15,17 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright IBM Corporation, 2001
+ * Copyright IBM Corporation, 2008
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
  *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
- *
+ * 	Documentation/RCU
  */
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -56,164 +53,227 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+}
 
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+#ifdef CONFIG_NO_HZ
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
 {
-	int cpu;
-	cpumask_t cpumask;
-	unsigned long flags;
+	struct rcu_data *rdp = &__get_cpu_var(rcu_data);
 
-	set_need_resched();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (unlikely(!rcp->signaled)) {
-		rcp->signaled = 1;
+	if (rdp->dynticks_nesting)
+		rdp->dynticks_nesting++;
+
+	/*
+	 * Only update if we are coming from a stopped ticks mode
+	 * (rdp->dynticks is even).
+	 */
+	if (!in_interrupt() &&
+	    (rdp->dynticks & 0x1) == 0) {
 		/*
-		 * Don't send IPI to itself. With irqs disabled,
-		 * rdp->cpu is the current cpu.
-		 *
-		 * cpu_online_map is updated by the _cpu_down()
-		 * using __stop_machine(). Since we're in irqs disabled
-		 * section, __stop_machine() is not exectuting, hence
-		 * the cpu_online_map is stable.
+		 * The following might seem like we could have a race
+		 * with NMI/SMIs. But this really isn't a problem.
+		 * Here we do a read/modify/write, and the race happens
+		 * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI which is the most important
+		 * part.
 		 *
-		 * However,  a cpu might have been offlined _just_ before
-		 * we disabled irqs while entering here.
-		 * And rcu subsystem might not yet have handled the CPU_DEAD
-		 * notification, leading to the offlined cpu's bit
-		 * being set in the rcp->cpumask.
+		 * The only thing is that we would bring back the counter
+		 * to a position that it was in during the NMI/SMI.
+		 * But the zero bit would be set, so the rest of the
+		 * counter would again be ignored.
 		 *
-		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
-		 * sending smp_reschedule() to an offlined CPU.
+		 * On return from the IRQ, the counter may have the zero
+		 * bit be 0 and the counter the same as the return from
+		 * the NMI/SMI. If the state machine was so unlucky to
+		 * see that, it still doesn't matter, since all
+		 * RCU read-side critical sections on this CPU would
+		 * have already completed.
+		 */
+		rdp->dynticks++;
+		/*
+		 * The following memory barrier ensures that any RCU
+		 * read-side critical sections in the irq handler are
+		 * seen by other CPUs to follow the above increment to
+		 * rdp->dynticks. This is required in order for other CPUs
+		 * to correctly determine when it is safe to advance the
+		 * RCU grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		/*
+		 * Since we can't determine the dynamic tick mode from
+		 * the rdp->dynticks after this routine, we use a second
+		 * flag to acknowledge that we came from an idle state
+		 * with ticks stopped.
+		 */
+		rdp->dynticks_nesting++;
+		/*
+		 * If we take an NMI/SMI now, they will also increment
+		 * the dynticks_nesting counter, and will not update the
+		 * rdp->dynticks on exit. That is for this IRQ to do.
 		 */
-		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask_nr(cpu, cpumask)
-			smp_send_reschedule(cpu);
 	}
-	spin_unlock_irqrestore(&rcp->lock, flags);
 }
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
-{
-	set_need_resched();
-}
-#endif
 
-static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
-		struct rcu_data *rdp)
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
+ * to let the RCU handling be aware that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
 {
-	long batch;
-
-	head->next = NULL;
-	smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
+	struct rcu_data *rdp = &__get_cpu_var(rcu_data);
 
 	/*
-	 * Determine the batch number of this callback.
-	 *
-	 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
-	 * local variable "batch" and emits codes like this:
-	 *	1) rdp->batch = rcp->cur + 1 # gets old value
-	 *	......
-	 *	2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
-	 * then [*nxttail[0], *nxttail[1]) may contain callbacks
-	 * that batch# = rdp->batch, see the comment of struct rcu_data.
+	 * rdp->dynticks_nesting is set if we interrupted the CPU
+	 * when it was idle with ticks stopped.
+	 * Once this occurs, we keep track of interrupt nesting
+	 * because a NMI/SMI could also come in, and we still
+	 * only want the IRQ that started the increment of the
+	 * rdp->dynticks to be the one that modifies it on exit.
 	 */
-	batch = ACCESS_ONCE(rcp->cur) + 1;
-
-	if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
-		/* process callbacks */
-		rdp->nxttail[0] = rdp->nxttail[1];
-		rdp->nxttail[1] = rdp->nxttail[2];
-		if (rcu_batch_after(batch - 1, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[2];
-	}
+	if (rdp->dynticks_nesting) {
+		if (--rdp->dynticks_nesting)
+			return;
 
-	rdp->batch = batch;
-	*rdp->nxttail[2] = head;
-	rdp->nxttail[2] = &head->next;
+		/* This must match the interrupt nesting */
+		WARN_ON(in_interrupt());
 
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
+		/*
+		 * If an NMI/SMI happens now we are still
+		 * protected by the rdp->dynticks being odd.
+		 */
+
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_unlock() primitives in the irq handler
+		 * are seen by other CPUs to precede the following
+		 * increment to rdp->dynticks. This is required in
+		 * order for other CPUs to determine when it is safe
+		 * to advance the RCU grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		rdp->dynticks++;
+		WARN_ON(rdp->dynticks & 0x1);
 	}
 }
 
-/**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
  */
-void call_rcu(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
+static int dyntick_save_progress_counter(int cpu)
 {
-	unsigned long flags;
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);	/* cpu's, not ours. */
+	int snap;
 
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	local_irq_restore(flags);
+	snap = rdp->dynticks;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	return ((snap & 0x1) == 0);
 }
-EXPORT_SYMBOL_GPL(call_rcu);
 
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
+/*
+ * Snapshot the global completed counter so that later on it will be
+ * possible to tell which grace period any detected dyntick-idle
+ * quiescent states belong to.  The caller must hold the root rcu_node
+ * lock.
+ */
+static void dyntick_save_completed(struct rcu_state *rsp, long completed)
 {
-	unsigned long flags;
+	rsp->dynticks_completed = completed;
+}
 
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-	local_irq_restore(flags);
+/*
+ * Get the value previously saved by dyntick_save_completed().
+ */
+static long dyntick_get_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
 }
-EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through a dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(int cpu)
+{
+	long curr;
+	long snap;
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);	/* cpu's, not ours. */
+
+	curr = rdp->dynticks;
+	snap = rdp->dynticks_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr - snap) >= 2 || (curr & 0x1) == 0)
+		return 1;
+
+	/*
+	 * We need this CPU to either enter dynticks idle mode or pass
+	 * through a quiescent state.  Send it a reschedule IPI.
+	 */
+
+	if (cpu != smp_processor_id())
+		smp_send_reschedule(cpu);
+	else
+		set_need_resched();
+	return 0;
+}
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static int dyntick_save_progress_counter(int cpu) { return 0; }
+static int rcu_implicit_dynticks_qs(int cpu) { return 0; }
+# define dyntick_save_completed(rsp, completed) do { } while (0)
+# define dyntick_get_completed(rsp)		((rsp)->completed - 1)
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
 
 /*
  * Return the number of RCU batches processed thus far.  Useful
@@ -221,7 +281,7 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  */
 long rcu_batches_completed(void)
 {
-	return rcu_ctrlblk.completed;
+	return rcu_state.completed;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
@@ -231,70 +291,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
  */
 long rcu_batches_completed_bh(void)
 {
-	return rcu_bh_ctrlblk.completed;
+	return rcu_bh_state.completed;
 }
 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 
 /* Raises the softirq for processing rcu_callbacks. */
-static inline void raise_rcu_softirq(void)
+static void raise_rcu_softirq(void)
 {
 	raise_softirq(RCU_SOFTIRQ);
 }
 
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
-	struct rcu_head *next, *list;
-	int count = 0;
-
-	list = rdp->donelist;
-	while (list) {
-		next = list->next;
-		prefetch(next);
-		list->func(list);
-		list = next;
-		if (++count >= rdp->blimit)
-			break;
-	}
-	rdp->donelist = list;
-
-	local_irq_disable();
-	rdp->qlen -= count;
-	local_irq_enable();
-	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
-		rdp->blimit = blimit;
-
-	if (!rdp->donelist)
-		rdp->donetail = &rdp->donelist;
-	else
-		raise_rcu_softirq();
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- *   This is done by rcu_start_batch. The start is not broadcasted to
- *   all cpus, they must pick this up by comparing rcp->cur with
- *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- *   Since the start of the grace period is not broadcasted, at least two
- *   calls to rcu_check_quiescent_state are required:
- *   The first call just notices that a new grace period is running. The
- *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- *   the bitmap is empty, then the grace period is completed.
- *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- *   period (if necessary).
- */
-
 #ifdef CONFIG_DEBUG_RCU_STALL
 
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+static void record_gp_check_time(struct rcu_ctrlblk *rcp)
 {
 	rcp->gp_check = get_seconds() + 3;
 }
@@ -359,78 +368,354 @@ static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 
 #else /* #ifdef CONFIG_DEBUG_RCU_STALL */
 
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+static void record_gp_check_time(struct rcu_state *rsp)
 {
 }
 
-static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+static void
+check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 }
 
 #endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
 
 /*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+/* Exact variant: every level's spread is CONFIG_RCU_FANOUT, unbalanced. */
+void rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+	}
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+void rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+	
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * When a given CPU first becomes aware of a grace period, it knows
+ * that all of its pre-existing callbacks will be covered by the next
+ * grace period.
+ *
+ * Similarly, if a given CPU has not yet let RCU know that it passed
+ * through a quiescent state for the current grace period, then that
+ * CPU knows that all of its callbacks may safely be invoked at the
+ * end of the next grace period.
+ */
+static void
+rcu_next_callbacks_are_ready(struct rcu_data *rdp)
+{
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+}
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
  */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
+static void
+rcu_start_gp(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long iflg)
 {
-	if (rcp->cur != rcp->pending &&
-			rcp->completed == rcp->cur) {
-		rcp->cur++;
-		record_gp_check_time(rcp);
+	unsigned long flags = iflg;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
 
 		/*
-		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
-		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rcp->cpumask, which will extend graceperiods
-		 * unnecessarily.
+		 * Either there is no need to detect any more grace periods
+		 * at the moment, or we are already in the process of
+		 * detecting one.  Either way, we should not start a new
+		 * RCU grace period, so drop the lock and return.
 		 */
-		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
 
-		rcp->signaled = 0;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+
+	rsp->gpnum++;
+	rsp->signaled = RCU_SIGNAL_INIT;
+	dyntick_save_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+
+	rcu_next_callbacks_are_ready(rdp);
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
 	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
 /*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.
  */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
-		/* batch completed ! */
-		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp);
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
 	}
+	local_irq_restore(flags);
 }
 
 /*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
  */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
+static void
+cpu_quiet_msk(unsigned long mask,
+	      struct rcu_state *rsp, struct rcu_data *rdp, struct rcu_node *rnp,
+	      long *lastcomp, unsigned long flags)
 {
-	unsigned long flags;
+	if (lastcomp != NULL &&
+	    *lastcomp != ACCESS_ONCE(rsp->completed)) {
 
-	if (rdp->quiescbatch != rcp->cur) {
-		/* start new grace period: */
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
-		rdp->quiescbatch = rcp->cur;
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared below if this race occurred.
+		 */
+		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = 1UL << rnp->grpnum;	/* 1UL: no signed shift. */
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rdp);
+	rcu_start_gp(rsp, rdp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU.  Note that a CPU
+ * going offline counts as a quiescent state.  If invoking this on behalf
+ * of an online CPU (even if you are that CPU), lastcomp is used to make
+ * sure we are still in the grace period of interest.  We don't want to
+ * end the current grace period based on quiescent states detected in an
+ * earlier grace period!
+ *
+ * @@@@ make sure all callers are passing -their- rdp, not cpu's!!!
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;	/* matches cpu_quiet_msk()'s parameter type. */
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);	/* cpu_quiet_msk() unlocks. */
+	mask = 1UL << (cpu - rnp->grplo);	/* 1UL: no signed shift. */
+	cpu_quiet_msk(mask, rsp, rdp, rnp, lastcomp, flags);
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
 
-	/* Grace period already completed for this cpu?
-	 * qs_pending is checked instead of the actual bitmap to avoid
-	 * cacheline trashing.
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
 	 */
 	if (!rdp->qs_pending)
 		return;
@@ -441,195 +726,253 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 	 */
 	if (!rdp->passed_quiesc)
 		return;
-	rdp->qs_pending = 0;
 
-	spin_lock_irqsave(&rcp->lock, flags);
 	/*
-	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
-	 * during cpu startup. Ignore the quiescent state.
+	 * Set up to process all currently pending callbacks at the end
+	 * of the next grace period, as these pending callbacks are
+	 * guaranteed to have been registered before the beginning of
+	 * the next grace period.  Then record the fact that this CPU
+	 * has done its part for the current grace period.
 	 */
-	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp);
-
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	rcu_next_callbacks_are_ready(rdp);
+	rdp->qs_pending = 0;
+	cpu_quiet(rdp->cpu, rsp, rdp, &rdp->passed_quiesc_completed);
 }
 
-
 #ifdef CONFIG_HOTPLUG_CPU
 
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
  */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail, long batch)
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp,
+			      struct rcu_data *rdp, struct rcu_data *rdp_me)
 {
-	if (list) {
-		local_irq_disable();
-		this_rdp->batch = batch;
-		*this_rdp->nxttail[2] = list;
-		this_rdp->nxttail[2] = tail;
-		local_irq_enable();
+	int i;
+	unsigned long flags;
+	long mask;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	spin_lock(&rnp->lock);			/* irqs already disabled. */
+	mask = 1L << (cpu - rnp->grplo);
+	for (;;) {
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = 1L << rnp->grpnum;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		rnp = rnp->parent;
+		if (rnp == NULL)
+			break;
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
 	}
-}
 
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	unsigned long flags;
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
 
 	/*
-	 * if the cpu going offline owns the grace period
-	 * we can block indefinitely waiting for it, so flush
-	 * it here
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
 	 */
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-	spin_unlock(&rcp->lock);
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+	}
 
-	this_rdp->qlen += rdp->qlen;
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
 static void rcu_offline_cpu(int cpu)
 {
-	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
-	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *rdp_me = &__get_cpu_var(rcu_data);
+	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+	struct rcu_data *bh_rdp_me = &__get_cpu_var(rcu_bh_data);
 
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
-					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
-					&per_cpu(rcu_bh_data, cpu));
-	put_cpu_var(rcu_data);
-	put_cpu_var(rcu_bh_data);
+	__rcu_offline_cpu(cpu, &rcu_state, rdp, rdp_me);
+	__rcu_offline_cpu(cpu, &rcu_bh_state, bh_rdp, bh_rdp_me);
 }
 
-#else
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_offline_cpu(int cpu)
+static void
+rcu_offline_cpu(int cpu)
 {
 }
 
-#endif
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 
 /*
- * This does the RCU processing work from softirq context.
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.
  */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_data *rdp)
 {
-	long completed_snap;
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
 
-	if (rdp->nxtlist) {
-		local_irq_disable();
-		completed_snap = ACCESS_ONCE(rcp->completed);
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
 
-		/*
-		 * move the other grace-period-completed entries to
-		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
-		else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
-			rdp->nxttail[0] = rdp->nxttail[1];
+	/*
+	 * Extract the list of ready callbacks, disabling to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
 
-		/*
-		 * the grace period for entries in
-		 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
-		 * move these entries to donelist
-		 */
-		if (rdp->nxttail[0] != &rdp->nxtlist) {
-			*rdp->donetail = rdp->nxtlist;
-			rdp->donetail = rdp->nxttail[0];
-			rdp->nxtlist = *rdp->nxttail[0];
-			*rdp->donetail = NULL;
-
-			if (rdp->nxttail[1] == rdp->nxttail[0])
-				rdp->nxttail[1] = &rdp->nxtlist;
-			if (rdp->nxttail[2] == rdp->nxttail[0])
-				rdp->nxttail[2] = &rdp->nxtlist;
-			rdp->nxttail[0] = &rdp->nxtlist;
-		}
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	/* Update count, and requeue any remaining callbacks. */
+	local_irq_save(flags);
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+	local_irq_restore(flags);
 
-		local_irq_enable();
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
 
-		if (rcu_batch_after(rdp->batch, rcp->pending)) {
-			unsigned long flags;
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_rcu_softirq();
+}
 
-			/* and start it/schedule start if it's a new batch */
-			spin_lock_irqsave(&rcp->lock, flags);
-			if (rcu_batch_after(rdp->batch, rcp->pending)) {
-				rcp->pending = rdp->batch;
-				rcu_start_batch(rcp);
-			}
-			spin_unlock_irqrestore(&rcp->lock, flags);
-		}
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, rdp, flags);  /* releases root rnp->lock */
 	}
 
-	rcu_check_quiescent_state(rcp, rdp);
-	if (rdp->donelist)
-		rcu_do_batch(rdp);
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
 }
 
+/*
+ * Do softirq processing for the current CPU.
+ */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	/*
 	 * Memory references from any prior RCU read-side critical sections
-	 * executed by the interrupted code must be see before any RCU
+	 * executed by the interrupted code must be seen before any RCU
 	 * grace-period manupulations below.
 	 */
 
 	smp_mb(); /* See above block comment. */
 
-	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 
 	/*
 	 * Memory references from any later RCU read-side critical sections
-	 * executed by the interrupted code must be see after any RCU
+	 * executed by the interrupted code must be seen after any RCU
 	 * grace-period manupulations above.
 	 */
 
 	smp_mb(); /* See above block comment. */
 }
 
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp, rdp);
+	check_cpu_stall(rsp, rdp);
 
-	if (rdp->nxtlist) {
-		long completed_snap = ACCESS_ONCE(rcp->completed);
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
 
-		/*
-		 * This cpu has pending rcu entries and the grace period
-		 * for them has completed.
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			return 1;
-		if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
-				rdp->nxttail[0] != rdp->nxttail[1])
-			return 1;
-		if (rdp->nxttail[0] != &rdp->nxtlist)
-			return 1;
+	/* Does this CPU have finished callbacks to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
 
-		/*
-		 * This cpu has pending rcu entries and the new batch
-		 * for then hasn't been started nor scheduled start
-		 */
-		if (rcu_batch_after(rdp->batch, rcp->pending))
-			return 1;
-	}
+	/* Are there callbacks waiting for a GP that needs to be started? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
 
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
 		return 1;
 
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
 		return 1;
 
 	/* nothing to do */
@@ -643,8 +986,8 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
  */
 int rcu_pending(int cpu)
 {
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
 }
 
 /*
@@ -658,14 +1001,19 @@ int rcu_needs_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
 
-	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
+	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
+	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
+	       rcu_pending(cpu);
 }
 
 /*
- * Top-level function driving RCU grace-period detection, normally
- * invoked from the scheduler-clock interrupt.  This function simply
- * increments counters that are read only from softirq by this same
- * CPU, so there are no memory barriers required.
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
  */
 void rcu_check_callbacks(int cpu, int user)
 {
@@ -707,20 +1055,224 @@ void rcu_check_callbacks(int cpu, int user)
 	raise_rcu_softirq();
 }
 
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ *
+ * f() is called, with the leaf's ->lock held and interrupts disabled, for
+ * each CPU whose bit is still set in that leaf's ->qsmask.  CPUs for which
+ * it returns nonzero are gathered into one mask per leaf and reported
+ * together through cpu_quiet_msk().  A difference between rsp->completed
+ * and lastcomp is what is taken to mean that the grace period has ended.
+ *
+ * NOTE(review): rdp is always this CPU's rcu_data, even when rsp is
+ * &rcu_bh_state (reachable through call_rcu_bh() -> __call_rcu() ->
+ * force_quiescent_state()).  Confirm that passing cpu_quiet_msk() an rdp
+ * belonging to the other flavor is intended.
+ */
+static int
+rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int (*f)(int))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = &__get_cpu_var(rcu_data);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {	/* nothing pending in this leaf */
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		mask = 0;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0L && f(cpu))
+				mask |= bit;
+		}
+		if (mask != 0) {
+			cpu_quiet_msk(mask, rsp, rdp, rnp_cur,
+				      &lastcomp, flags);
+			continue;	/* no unlock: callee is expected to drop it */
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp)
 {
-	long flags;
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
 
-	spin_lock_irqsave(&rcp->lock, flags);
-	memset(rdp, 0, sizeof(*rdp));
-	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
-	rdp->donetail = &rdp->donelist;
-	rdp->quiescbatch = rcp->completed;
+	if (!spin_trylock_irqsave(&rsp->onofflock, flags))
+		return;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_SAVE_DYNTICK:
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_save_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggards. */
+		if (rcu_process_dyntick(rsp, dyntick_get_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Update state. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed)
+			rsp->signaled = RCU_SIGNALED;
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_SIGNALED:
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp)
+{
+	set_need_resched();	/* !CONFIG_SMP variant: no rcu_node scan at all */
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+static void
+__call_rcu(struct rcu_head *head, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 *
+	 * After that, head is appended at the RCU_NEXT_TAIL end of rdp's
+	 * callback list.  Once the queue length exceeds qhimark, the
+	 * per-pass invocation limit is lifted (rcu_do_batch() reinstates
+	 * it when the queue is back down to qlowmark) and quiescent states
+	 * are forced.  Both callers visible in this file (call_rcu() and
+	 * call_rcu_bh()) invoke this function with interrupts disabled.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rsp);
+	}
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ *
+ * @head: rcu_head to queue; its ->func and ->next fields are overwritten.
+ * @func: function stored in head->func, later called with head as its
+ *	  argument when the callback is invoked.
+ *
+ * The callback goes onto the current CPU's rcu_data list for rcu_state.
+ * Interrupts are disabled around the enqueue and restored afterwards.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	__call_rcu(head, &rcu_state, &__get_cpu_var(rcu_data));
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ *
+ * @head: rcu_head to queue; its ->func and ->next fields are overwritten.
+ * @func: function stored in head->func, later called with head as its
+ *	  argument when the callback is invoked.
+ *
+ * Same as call_rcu(), except that the callback goes onto the current CPU's
+ * rcu_bh_data list and is tracked by rcu_bh_state.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+	int i;
+	long mask;
+	struct rcu_node *rnp = rdp->mynode;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	spin_lock(&rnp->lock);		/* irqs already disabled. */
+	completed_snap = ACCESS_ONCE(rsp->completed); /* outside of lock */
+	rdp->completed = completed_snap;
+	rdp->gpnum = completed_snap;
+	rdp->passed_quiesc = 1;
 	rdp->qs_pending = 0;
-	rdp->cpu = cpu;
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
 	rdp->blimit = blimit;
-	spin_unlock_irqrestore(&rcp->lock, flags);
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = 1;
+	rdp->dynticks_nesting = 0;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+
+	/* Add CPU to rcu_node bitmasks. */
+
+	mask = 1L << (cpu - rnp->grplo);
+	for (;;) {
+		rnp->qsmaskinit |= mask;
+		mask = 1L << rnp->grpnum;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+		if ((rnp == NULL) || !!(rnp->qsmaskinit & mask))
+			break;
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
@@ -728,11 +1280,14 @@ static void __cpuinit rcu_online_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
 
-	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
-	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+	rcu_init_percpu_data(cpu, &rcu_state, rdp);
+	rcu_init_percpu_data(cpu, &rcu_bh_state, bh_rdp);
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
+/*
+ * Handle CPU online/offline notification events.
+ */
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
@@ -753,20 +1308,81 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ *
+ * First derives rsp->level[1..] from rsp->level[0] and rsp->levelcnt[]
+ * (level[0] is read but not set here -- presumably statically initialized,
+ * TODO confirm), then calls rcu_init_levelspread(), which is presumably
+ * what fills in rsp->levelspread[] (not shown here).
+ *
+ * Every rcu_node then gets its lock initialized, empty ->qsmask and
+ * ->qsmaskinit, its ->level, and its place in the tree: ->grpnum is the
+ * node's index within its parent and ->parent points at that parent (NULL
+ * and 0 for the root).  Only leaf nodes keep a ->grplo..->grphi range;
+ * the range computed for interior nodes is reset to 0..0.
+ *
+ * NOTE(review): ->levelcnt[i + 1] is read with i == NUM_RCU_LVLS - 1, so
+ * this assumes ->levelcnt[] has NUM_RCU_LVLS + 1 entries -- TODO confirm
+ * against the structure definition.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++) {
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	}
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->grplo = j * rsp->levelspread[i];
+			rnp->grphi = (j + 1) * rsp->levelspread[i] - 1;
+			if (rnp->grphi >= rsp->levelcnt[i + 1])
+				rnp->grphi = rsp->levelcnt[i + 1] - 1;
+			rnp->qsmaskinit = 0;
+			if (i != NUM_RCU_LVLS - 1)
+				rnp->grplo = rnp->grphi = 0;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->parent = rsp->level[i - 1] + 
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ *
+ * @rsp:      rcu_state whose leaf rcu_node array is walked.
+ * @rcu_data: name of the per-CPU rcu_data variable to fill in.
+ *
+ * Relies on (and clobbers) the caller's locals i, j, and rnp.  j only
+ * ever moves forward, so this depends on for_each_possible_cpu() visiting
+ * CPUs in increasing order.  The while loop lets j step over any number
+ * of leaves that contain no possible CPU; a single "if" would attach CPUs
+ * to the wrong leaf whenever the possible-CPU map has a hole wider than
+ * one leaf.  Assumes the last leaf's ->grphi covers the highest possible
+ * CPU, as set up by rcu_init_one().
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		while (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+	} \
+} while (0)
+
 static struct notifier_block __cpuinitdata rcu_nb = {
 	.notifier_call	= rcu_cpu_notify,
 };
 
-/*
- * Initializes rcu mechanism.  Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
 void __init __rcu_init(void)
 {
-	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
 	/* Register notifier for non-boot CPUs */
 	register_cpu_notifier(&rcu_nb);
 }
diff --git a/kernel/rcuclassic_trace.c b/kernel/rcuclassic_trace.c
new file mode 100644
index 0000000..100d757
--- /dev/null
+++ b/kernel/rcuclassic_trace.c
@@ -0,0 +1,218 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+
+static DEFINE_MUTEX(rcuclassic_trace_mutex);
+static char *rcuclassic_trace_buf;
+#define RCUPREEMPT_TRACE_BUF_SIZE 4096
+
+/*
+ * Format one CPU's rcu_data state as a single line of text.
+ *
+ * @rdp:  per-CPU structure to dump.
+ * @buf:  where to start writing.
+ * @ebuf: one past the last usable byte of the buffer.
+ *
+ * Returns the number of characters actually stored, not counting the
+ * trailing NUL.  scnprintf() is used rather than snprintf() because the
+ * latter returns the length the output would have had: once the buffer
+ * filled up, cnt would run past ebuf and the next "ebuf - &buf[cnt]"
+ * would wrap around to a huge size.
+ */
+static int print_one_rcu_data(struct rcu_data *rdp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		"%3d completed=%ld gpnum=%ld passed_q: %d qs_pending: %d",
+		rdp->cpu,
+		rdp->completed, rdp->gpnum,
+		rdp->passed_quiesc, rdp->qs_pending);
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" qlen: %ld blimit: %ld\n", rdp->qlen, rdp->blimit);
+	return cnt;
+}
+
+#define PRINT_RCU_DATA(name, buf, ebuf) \
+	do { \
+		int _p_r_d_i; /* CPU iterator; odd name presumably avoids clashes */ \
+		\
+		for_each_online_cpu(_p_r_d_i) /* one line per CPU, advancing buf */ \
+			(buf) += print_one_rcu_data(&per_cpu(name, _p_r_d_i), \
+						    buf, ebuf); \
+	} while (0)
+
+/*
+ * debugfs read handler for "rcudata": rebuilds the per-CPU dump for both
+ * RCU flavors in the shared trace buffer, then copies the requested window
+ * out to user space.
+ *
+ * rcuclassic_trace_mutex serializes users of the shared buffer.  The
+ * section headers are written with scnprintf() so that buf can never be
+ * advanced past ebuf; with snprintf() a nearly full buffer would push buf
+ * beyond ebuf and turn the following "ebuf - buf" into a huge size.
+ *
+ * Returns whatever simple_read_from_buffer() returns.
+ */
+static ssize_t rcudata_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/*
+ * Format one rcu_state and its rcu_node hierarchy as text.
+ *
+ * @rsp:  state structure to dump.
+ * @buf:  where to start writing.
+ * @ebuf: one past the last usable byte of the buffer.
+ *
+ * The first line carries ->completed, ->gpnum and ->signaled.  Each node
+ * then appears as "qsmask/qsmaskinit grplo:grphi ^grpnum", with a newline
+ * wherever ->level changes.
+ *
+ * Returns the number of characters actually stored, not counting the
+ * trailing NUL.  scnprintf() keeps cnt from running past ebuf when the
+ * output is truncated; snprintf() would report the untruncated length and
+ * make the next "ebuf - &buf[cnt]" wrap around to a huge size.
+ */
+static int print_one_rcu_state(struct rcu_state *rsp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+	int level = 0;
+	struct rcu_node *rnp;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+			 "completed: %ld gpnum: %ld signaled: %d\n",
+			 rsp->completed, rsp->gpnum, rsp->signaled);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {
+			cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+			level = rnp->level;
+		}
+		cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+				 "%lx/%lx %d:%d ^%d    ",
+				 rnp->qsmask, rnp->qsmaskinit,
+				 rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+	return cnt;
+}
+
+/*
+ * debugfs read handler for "rcuhier": rebuilds the rcu_node hierarchy dump
+ * for both RCU flavors in the shared trace buffer, then copies the
+ * requested window out to user space.
+ *
+ * rcuclassic_trace_mutex serializes users of the shared buffer.  The
+ * section headers are written with scnprintf() so that buf can never be
+ * advanced past ebuf; with snprintf() a nearly full buffer would push buf
+ * beyond ebuf and turn the following "ebuf - buf" into a huge size.
+ *
+ * Returns whatever simple_read_from_buffer() returns.
+ */
+static ssize_t rcuhier_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	buf += print_one_rcu_state(&rcu_state, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	buf += print_one_rcu_state(&rcu_bh_state, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/*
+ * debugfs read handler for "rcugp": reports the ->completed and ->gpnum
+ * counters of both RCU flavors.  The text is rebuilt in the shared trace
+ * buffer, under rcuclassic_trace_mutex, on every call, and the requested
+ * window is then copied out with simple_read_from_buffer().
+ */
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	char *pos = rcuclassic_trace_buf;
+	char *limit = pos + RCUPREEMPT_TRACE_BUF_SIZE;
+	ssize_t nread;
+
+	mutex_lock(&rcuclassic_trace_mutex);
+
+	pos += snprintf(pos, limit - pos, "rcu: completed=%ld  gpnum=%ld\n",
+			rcu_state.completed, rcu_state.gpnum);
+	pos += snprintf(pos, limit - pos, "rcu_bh: completed=%ld  gpnum=%ld\n",
+			rcu_bh_state.completed, rcu_bh_state.gpnum);
+
+	nread = simple_read_from_buffer(buffer, count, ppos,
+					rcuclassic_trace_buf,
+					strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+
+	return nread;
+}
+
+/*
+ * Read-only file operations for the three debugfs files created by
+ * rcuclassic_debugfs_init().  The tables are never modified after build
+ * time, so they are declared const.
+ */
+static const struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.read = rcudata_read,
+};
+
+static const struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuhier_read,
+};
+
+static const struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,
+};
+
+/* Dentries for the "rcu" debugfs directory and the three files inside it. */
+static struct dentry *rcudir, *datadir, *hierdir, *gpdir;
+
+/*
+ * Create the debugfs directory "rcu" and, inside it, the read-only files
+ * "rcudata", "rcugp" and "rcuhier".
+ *
+ * Returns 0 on success and -ENOMEM if any entry could not be created.  A
+ * negative errno is used because the result is passed straight through
+ * the module's init function.  On failure, everything created so far is
+ * removed and the saved dentry pointers are cleared so that none of them
+ * is left dangling.
+ */
+static int rcuclassic_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	/* hierdir is created last, so it cannot exist on this path. */
+	if (datadir)
+		debugfs_remove(datadir);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+	datadir = gpdir = hierdir = rcudir = NULL;
+out:
+	return -ENOMEM;
+}
+
+/*
+ * Module initialization: allocate the shared trace buffer, then create
+ * the debugfs files that expose it.
+ *
+ * Returns 0 on success or a negative errno.  Failures must not be
+ * reported as a positive value: a positive return from a module init
+ * function is not treated as a failed load, which would leave the module
+ * resident with its buffer missing or already freed, and the exit handler
+ * would later free it a second time.
+ */
+static int __init rcuclassic_trace_init(void)
+{
+	int ret;
+
+	rcuclassic_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcuclassic_trace_buf)
+		return -ENOMEM;
+	ret = rcuclassic_debugfs_init();
+	if (ret) {
+		kfree(rcuclassic_trace_buf);
+		rcuclassic_trace_buf = NULL;
+		/* Normalize any positive failure indication. */
+		return ret < 0 ? ret : -ENOMEM;
+	}
+	return 0;
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);	/* the directory goes last, after its files */
+	kfree(rcuclassic_trace_buf);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-22 23:29 ` Josh Triplett
  2008-08-23  1:53   ` Paul E. McKenney
@ 2008-08-25 10:34   ` Peter Zijlstra
  2008-08-25 15:16     ` Paul E. McKenney
  1 sibling, 1 reply; 94+ messages in thread
From: Peter Zijlstra @ 2008-08-25 10:34 UTC (permalink / raw)
  To: Josh Triplett
  Cc: paulmck, linux-kernel, cl, mingo, akpm, manfred, dipankar,
	schamp, niv, dvhltc, ego, laijs, rostedt

On Fri, 2008-08-22 at 16:29 -0700, Josh Triplett wrote:

> > @@ -26,8 +27,10 @@
> >   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> >   *
> >   * For detailed explanation of Read-Copy Update mechanism see -
> > - * 		Documentation/RCU
> > - *
> > + * 	Documentation/RCU
> > + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> > + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> > + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> >   */
> 
> Why put these references here rather than in Documentation/RCU?  It
> seems easier to keep documentation up to date in one place.  If you
> think these represent a good "getting started" set of documents, how
> about a Documentation/RCU/ReadTheseFirst with links to them, or how
> about linking to them from whatisRCU.txt?

I actually like in code comments and 'documentation' more than
Documentation/ stuff. Mostly because Documentation/ is:
 - far away from the code
 - therefore, more easily bitrotted
 - and easily forgotten




^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-25 10:34   ` Peter Zijlstra
@ 2008-08-25 15:16     ` Paul E. McKenney
  2008-08-25 15:26       ` Peter Zijlstra
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-25 15:16 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Triplett, linux-kernel, cl, mingo, akpm, manfred, dipankar,
	schamp, niv, dvhltc, ego, laijs, rostedt

On Mon, Aug 25, 2008 at 12:34:56PM +0200, Peter Zijlstra wrote:
> On Fri, 2008-08-22 at 16:29 -0700, Josh Triplett wrote:
> 
> > > @@ -26,8 +27,10 @@
> > >   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> > >   *
> > >   * For detailed explanation of Read-Copy Update mechanism see -
> > > - * 		Documentation/RCU
> > > - *
> > > + * 	Documentation/RCU
> > > + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> > > + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> > > + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> > >   */
> > 
> > Why put these references here rather than in Documentation/RCU?  It
> > seems easier to keep documentation up to date in one place.  If you
> > think these represent a good "getting started" set of documents, how
> > about a Documentation/RCU/ReadTheseFirst with links to them, or how
> > about linking to them from whatisRCU.txt?
> 
> I actually like in code comments and 'documentation' more than
> Documentation/ stuff. Mostly because Documentation/ is:
>  - far away from the code
>  - therefore, more easily bitrotted
>  - and easily forgotten

I know!!!

#ifdef JOSH_TRIPLETT
 * 	Documentation/RCU
 * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
 * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
 * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
#elif PETER_ZIJLSTRA
 * 	Documentation/RCU
#endif

(Sorry, couldn't resist!!!)

Seriously, I know where all the documentation is, as I wrote most of it.
These comments are for you guys.  So, any thoughts on how I should
resolve this?  My default is, as always, a coin flip.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-25 15:16     ` Paul E. McKenney
@ 2008-08-25 15:26       ` Peter Zijlstra
  2008-08-27 18:28         ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Peter Zijlstra @ 2008-08-25 15:26 UTC (permalink / raw)
  To: paulmck
  Cc: Josh Triplett, linux-kernel, cl, mingo, akpm, manfred, dipankar,
	schamp, niv, dvhltc, ego, laijs, rostedt

On Mon, 2008-08-25 at 08:16 -0700, Paul E. McKenney wrote:
> On Mon, Aug 25, 2008 at 12:34:56PM +0200, Peter Zijlstra wrote:
> > On Fri, 2008-08-22 at 16:29 -0700, Josh Triplett wrote:
> > 
> > > > @@ -26,8 +27,10 @@
> > > >   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> > > >   *
> > > >   * For detailed explanation of Read-Copy Update mechanism see -
> > > > - * 		Documentation/RCU
> > > > - *
> > > > + * 	Documentation/RCU
> > > > + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> > > > + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> > > > + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> > > >   */
> > > 
> > > Why put these references here rather than in Documentation/RCU?  It
> > > seems easier to keep documentation up to date in one place.  If you
> > > think these represent a good "getting started" set of documents, how
> > > about a Documentation/RCU/ReadTheseFirst with links to them, or how
> > > about linking to them from whatisRCU.txt?
> > 
> > I actually like in code comments and 'documentation' more than
> > Documentation/ stuff. Mostly because Documentation/ is:
> >  - far away from the code
> >  - therefore, more easily bitrotted
> >  - and easily forgotten
> 
> I know!!!
> 
> #ifdef JOSH_TRIPLETT
>  * 	Documentation/RCU
>  * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
>  * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
>  * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> #elif PETER_ZIJLSTRA
>  * 	Documentation/RCU
> #endif
> 
> (Sorry, couldn't resist!!!)

But but but, you got the cases the wrong way around.. ;-)

> Seriously, I know where all the documentation is, as I wrote most of it.
> These comments are for you guys.  So, any thoughts on how I should
> resolve this?  My default is, as always, a coin flip.  ;-)

I guess we could do the 'this is how the concept works and can be used
like so and so' documentation in Documentation/

And the stuff that says 'this code does like so and so, because blah'
should stay near the code.

And in any case of doubt - stay near the code :-)

I always view Documentation/ as end user stuff (be that a kernel
programmer that needs to learn a new API, or userland folks or people
wanting to know what a certain feature is about).


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-23  1:53   ` Paul E. McKenney
@ 2008-08-25 22:02     ` Josh Triplett
  2008-08-26 16:05       ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Josh Triplett @ 2008-08-25 22:02 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Fri, 2008-08-22 at 18:53 -0700, Paul E. McKenney wrote:
> On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> > On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > > -	spinlock_t	lock	____cacheline_internodealigned_in_smp;
> > > -	cpumask_t	cpumask; /* CPUs that need to switch in order    */
> > > -				 /* for current batch to proceed.        */
> > > +/*
> > > + * Definition for node within the RCU grace-period-detection hierarchy.
> > > + */
> > > +struct rcu_node {
> > > +	spinlock_t lock;
> > > +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
> > > +				/*  order for current grace period to proceed.*/
> > > +	unsigned long	qsmaskinit;
> > > +				/* Per-GP initialization for qsmask.	      */
> > > +	int	grplo;		/* lowest-numbered CPU or group here.	      */
> > > +	int	grphi;		/* highest-numbered CPU or group here.	      */
> > > +	char	grpnum;		/* CPU/group number for next level up.	      */
> > > +	char	level;		/* root is at level 0.			      */
> > 
> > These four fields should use sized types, and preferably unsigned types.
> 
> OK for grpnum and level, but grphi and grplo need to be "int" to
> match the various CPU-manipulation primitives.

Fair enough; the CPU-manipulation primitives do indeed use "int".  Odd
that they use a signed type.

> > > +	struct rcu_node *parent;
> > >  } ____cacheline_internodealigned_in_smp;
> > > 
> > > -/* Is batch a before batch b ? */
> > > -static inline int rcu_batch_before(long a, long b)
> > > -{
> > > -	return (a - b) < 0;
> > > -}
> > > +/*
> > > + * RCU global state, including node hierarchy.  This hierarchy is
> > > + * represented in "heap" form in a dense array.  The root (first level)
> > > + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
> > > + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
> > > + * and the third level in ->node[m+1] and following (->node[m+1] referenced
> > > + * by ->level[2]).  The number of levels is determined by the number of
> > > + * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
> > > + * consisting of a single rcu_node.
> > > + */
> > > +struct rcu_state {
> > > +	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
> > > +	struct rcu_node *level[NUM_RCU_LEVELS];	/* Hierarchy levels. */
> > > +	int levelcnt[MAX_RCU_LEVELS + 1];	/* # nodes in each level. */
> > > +	int levelspread[NUM_RCU_LEVELS];	/* kids/node in each level. */
> > 
> > These two should use sized types.
> 
> Fair enough.  And can be 8 bits, for that matter.

levelspread can, since it will never exceed 64, but levelcnt cannot.
That would lead to a bug on systems with more than 256 CPUs.

> > > +
> > > +	/* The following fields are guarded by the root rcu_node's lock. */
> > > +
> > > +	char	signaled ____cacheline_internodealigned_in_smp;
> > > +						/* sent GP-kick IPIs? */
> > 
> > u8 or bool, depending on semantics.  If just a simple flag, how about
> > bool?
> 
> This will need to be a non-bool shortly.

OK.

> OK, so what the heck -are- the official type names???  u8 seems
> to be defined in a powerpc-specific file.  OK, it also appears in
> include/asm-generic/int-l64.h.  s8, u8, s16, u16, s32, u32, s64, and
> u64, then?

Yes. {s,u}{8,16,32,64}, defined in include/asm-generic/int-{l,ll}64.h,
depending on architecture.

> > >  	int cpu;
> > >  	struct rcu_head barrier;
> > >  };
> > > 
> > > +extern struct rcu_state rcu_state;
> > >  DECLARE_PER_CPU(struct rcu_data, rcu_data);
> > > +
> > > +extern struct rcu_state rcu_bh_state;
> > >  DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
> > 
> > Why extern and in the header?  I don't see anything else using them.
> 
> kernel/rcuclassic_trace.c, right?

Hmmm, true.  Unfortunate, particularly if only for the benefit of
tracing code which doesn't even get compiled under normal circumstances.

> > >  	select DEBUG_FS
> > >  	default y
> > >  	help
> > > @@ -77,3 +76,33 @@ config RCU_TRACE
> > > 
> > >  	  Say Y here if you want to enable RCU tracing
> > >  	  Say N if you are unsure.
> > > +
> > > +config RCU_FANOUT
> > > +	int "Hierarchical RCU fanout value"
> > > +	range 2 64 if 64BIT
> > > +	range 2 32 if !64BIT
> > > +	depends on CLASSIC_RCU
> > > +	default 64 if 64BIT
> > > +	default 32 if !64BIT
> > > +	help
> > > +	  This option controls the fanout of hierarchical implementations
> > > +	  of RCU, allowing RCU to work efficiently on machines with
> > > +	  large numbers of CPUs.  This value must be at least the cube
> > > +	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
> > > +	  systems and up to 262,144 for 64-bit systems.
> > > +
> > > +	  Select a specific number if testing RCU itself.
> > 
> > ...or if attempting to tune for a specific NUMA system.
> 
> Indeed.  But I need to see an actual example before I document it.
> It would be easy to make things slower by following the NUMA hardware
> layout.

Fair enough.

> > > +	  Take the default if unsure.
> > > +
> > > +config RCU_FANOUT_EXACT
> > > +	bool "Disable hierarchical RCU auto-balancing"
> > > +	depends on CLASSIC_RCU
> > > +	default n
> > > +	help
> > > +	  This option forces use of the exact RCU_FANOUT value specified,
> > > +	  regardless of imbalances in the hierarchy.  This can be useful
> > > +	  on systems with strong NUMA behavior.
> > > +
> > > +	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
> > 
> > You might want to give a specific example of a NUMA machine, the
> > appropriate value to use on that machine, and the result with and
> > without RCU_FANOUT_EXACT.
> 
> Or change "can" to "might".  ;-)

:)

Right, my comment only applies if such an example actually exists. :)

> > > -static int blimit = 10;
> > > -static int qhimark = 10000;
> > > -static int qlowmark = 100;
> > > +static int blimit = 10;		/* Maximum callbacks per softirq. */
> > > +static int qhimark = 10000;	/* If this many pending, ignore blimit. */
> > > +static int qlowmark = 100;	/* Once only this many pending, use blimit. */
> > 
> > Indentation mismatch on the comments?
> 
> Looks fine in the source -- context diff-ism.

Sigh.  Yay for tabs.

> > >  #ifdef CONFIG_SMP
> > > -static void force_quiescent_state(struct rcu_data *rdp,
> > > -			struct rcu_ctrlblk *rcp)
> > > +static void force_quiescent_state(struct rcu_state *rsp)
> > >  {
> > >  	int cpu;
> > > -	cpumask_t cpumask;
> > >  	unsigned long flags;
> > > 
> > >  	set_need_resched();
> > > -	spin_lock_irqsave(&rcp->lock, flags);
> > > -	if (unlikely(!rcp->signaled)) {
> > > -		rcp->signaled = 1;
> > > +	if (!spin_trylock_irqsave(&rsp->onofflock, flags))
> > > +		return;
> > 
> > This seems to make force_quiescent_state rather less forceful.
> 
> It will try again on the next scheduling-clock interrupt.  The reason
> I did this is because ->onofflock is a global lock acquired when
> beginning a quiescent state or when onlining/offlining.  Can't let
> force_quiescent_state() monopolize things, and would like to exclude
> online/offline while doing force_quiescent_state().  Hence make
> force_quiescent_state() back off if the lock is held.
> 
> There is probably a better way to do this...

Primarily concerned about the possibility of perpetual failure.  Then
again, eventually a grace period will occur "naturally".  Just wondering
whether the inability to force might cause a problem.

> > > -#else
> > > +#else /* #ifdef CONFIG_HOTPLUG_CPU */
> > > 
> > > -static void rcu_offline_cpu(int cpu)
> > > +static inline void
> > > +rcu_offline_cpu(int cpu)
> > >  {
> > >  }
> > 
> > No need to explicitly say "inline"; GCC should do the right thing here.
> > Same comment applies a couple of other places in your patch.
> 
> OK, I will get rid of these.  You can do the other 26,000 of them.  ;-)

:)

> > > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> > >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> > >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > > 
> > > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > > +	       rcu_pending(cpu);
> > 
> > !! seems unnecessary here.
> 
> Someone once told me why this was necessary, but I forget.  It was in the
> original, and I didn't put it there.  Some weirdness about conversion
> to 32-bit integer when the lower 32 bits of the pointer was zero or
> some such.  So if your pointer value was 0x100000000, for example,
> so that conversion to int gives zero.

Good point!  That doesn't apply if you use ||, though.  If you just did
"return somepointer" that could potentially cause the problem you
describe.  In any case, it can't *hurt* to have it; GCC should do the
sane thing.

> > > +void call_rcu_bh(struct rcu_head *head,
> > > +				void (*func)(struct rcu_head *rcu))
> > > +{
> > > +	unsigned long flags;
> > > +
> > > +	head->func = func;
> > > +	head->next = NULL;
> > > +	local_irq_save(flags);
> > > +	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
> > > +	local_irq_restore(flags);
> > > +}
> > > +EXPORT_SYMBOL_GPL(call_rcu_bh);
> > 
> > This comment applies to the original code, but:
> > You only call __call_rcu twice, in call_rcu and call_rcu_bh.  Both
> > times, you set head first, then wrap the call with local_irq_save.  How
> > about moving both into __call_rcu, making call_rcu and call_rcu_bh
> > one-liners?
> 
> I can't pass "rcu_data" to a function (or at least I don't know how to
> do so, short of passing __per_cpu_rcu_data and doing the per-CPU stuff
> by hand).  I could make __call_rcu() be a macro, but that seemed more
> ugly than it seemed worthwhile.
> 
> Is there some other approach that would work?

Hmmm.  No, not that I know of.  Sigh.

> > > +static char *rcuclassic_trace_buf;
> > > +#define RCUPREEMPT_TRACE_BUF_SIZE 4096
> > 
> > Did you perhaps want PAGE_SIZE?
> 
> I really want some way of gracefully handling arbitrarily long output
> to debugfs.  I am sure that some such exists, but haven't found it.
> What I do instead is to arbitrarily truncate output to 4096 bytes,
> which will be stunningly less than useful on a 4,096-CPU machine.  :-/
> 
> Suggestions welcome!

I can see two possibilities, depending on how much complexity you want.

The complicated way: do one pass calling snprintf everywhere and adding
up the total length used, and if you run out of buffer space during that
pass, reallocate the buffer to at least the total length you accumulated.
Or something like that.

The simple hack:
#define RCUPREEMPT_TRACE_BUF_SIZE (NR_CPUS * something)

:)

- Josh Triplett



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-25 22:02     ` Josh Triplett
@ 2008-08-26 16:05       ` Paul E. McKenney
  2008-08-27  0:38         ` Josh Triplett
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-26 16:05 UTC (permalink / raw)
  To: Josh Triplett
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Mon, Aug 25, 2008 at 03:02:30PM -0700, Josh Triplett wrote:
> On Fri, 2008-08-22 at 18:53 -0700, Paul E. McKenney wrote:
> > On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> > > On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > > > -	spinlock_t	lock	____cacheline_internodealigned_in_smp;
> > > > -	cpumask_t	cpumask; /* CPUs that need to switch in order    */
> > > > -				 /* for current batch to proceed.        */
> > > > +/*
> > > > + * Definition for node within the RCU grace-period-detection hierarchy.
> > > > + */
> > > > +struct rcu_node {
> > > > +	spinlock_t lock;
> > > > +	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
> > > > +				/*  order for current grace period to proceed.*/
> > > > +	unsigned long	qsmaskinit;
> > > > +				/* Per-GP initialization for qsmask.	      */
> > > > +	int	grplo;		/* lowest-numbered CPU or group here.	      */
> > > > +	int	grphi;		/* highest-numbered CPU or group here.	      */
> > > > +	char	grpnum;		/* CPU/group number for next level up.	      */
> > > > +	char	level;		/* root is at level 0.			      */
> > > 
> > > These four fields should use sized types, and preferably unsigned types.
> > 
> > OK for grpnum and level, but grphi and grplo need to be "int" to
> > match the various CPU-manipulation primitives.
> 
> Fair enough; the CPU-manipulation primitives do indeed use "int".  Odd
> that they use a signed type.

It does allow use of -1 for "no particular CPU" or for error checking,
which can sometimes be useful.

> > > > +	struct rcu_node *parent;
> > > >  } ____cacheline_internodealigned_in_smp;
> > > > 
> > > > -/* Is batch a before batch b ? */
> > > > -static inline int rcu_batch_before(long a, long b)
> > > > -{
> > > > -	return (a - b) < 0;
> > > > -}
> > > > +/*
> > > > + * RCU global state, including node hierarchy.  This hierarchy is
> > > > + * represented in "heap" form in a dense array.  The root (first level)
> > > > + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
> > > > + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
> > > > + * and the third level in ->node[m+1] and following (->node[m+1] referenced
> > > > + * by ->level[2]).  The number of levels is determined by the number of
> > > > + * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
> > > > + * consisting of a single rcu_node.
> > > > + */
> > > > +struct rcu_state {
> > > > +	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
> > > > +	struct rcu_node *level[NUM_RCU_LEVELS];	/* Hierarchy levels. */
> > > > +	int levelcnt[MAX_RCU_LEVELS + 1];	/* # nodes in each level. */
> > > > +	int levelspread[NUM_RCU_LEVELS];	/* kids/node in each level. */
> > > 
> > > These two should use sized types.
> > 
> > Fair enough.  And can be 8 bits, for that matter.
> 
> levelspread can, since it will never exceed 64, but levelcnt cannot.
> That would lead to a bug on systems with more than 256 CPUs.

Good catch!!! Fixed.

> > > > +
> > > > +	/* The following fields are guarded by the root rcu_node's lock. */
> > > > +
> > > > +	char	signaled ____cacheline_internodealigned_in_smp;
> > > > +						/* sent GP-kick IPIs? */
> > > 
> > > u8 or bool, depending on semantics.  If just a simple flag, how about
> > > bool?
> > 
> > This will need to be a non-bool shortly.
> 
> OK.
> 
> > OK, so what the heck -are- the official type names???  u8 seems
> > to be defined in a powerpc-specific file.  OK, it also appears in
> > include/asm-generic/int-l64.h.  s8, u8, s16, u16, s32, u32, s64, and
> > u64, then?
> 
> Yes. {s,u}{8,16,32,64}, defined in include/asm-generic/int-{l,ll}64.h,
> depending on architecture.

Got it!

> > > >  	int cpu;
> > > >  	struct rcu_head barrier;
> > > >  };
> > > > 
> > > > +extern struct rcu_state rcu_state;
> > > >  DECLARE_PER_CPU(struct rcu_data, rcu_data);
> > > > +
> > > > +extern struct rcu_state rcu_bh_state;
> > > >  DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
> > > 
> > > Why extern and in the header?  I don't see anything else using them.
> > 
> > kernel/rcuclassic_trace.c, right?
> 
> Hmmm, true.  Unfortunate, particularly if only for the benefit of
> tracing code which doesn't even get compiled under normal circumstances.

Indeed.  Putting rcuclassic_trace.c into rcuclassic.c gets pretty ugly.
I suppose that another possibility would be to #include rcuclassic_trace.c
into rcuclassic.c, which might actually be the best approach.

> > > >  	select DEBUG_FS
> > > >  	default y
> > > >  	help
> > > > @@ -77,3 +76,33 @@ config RCU_TRACE
> > > > 
> > > >  	  Say Y here if you want to enable RCU tracing
> > > >  	  Say N if you are unsure.
> > > > +
> > > > +config RCU_FANOUT
> > > > +	int "Hierarchical RCU fanout value"
> > > > +	range 2 64 if 64BIT
> > > > +	range 2 32 if !64BIT
> > > > +	depends on CLASSIC_RCU
> > > > +	default 64 if 64BIT
> > > > +	default 32 if !64BIT
> > > > +	help
> > > > +	  This option controls the fanout of hierarchical implementations
> > > > +	  of RCU, allowing RCU to work efficiently on machines with
> > > > +	  large numbers of CPUs.  This value must be at least the cube
> > > > +	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
> > > > +	  systems and up to 262,144 for 64-bit systems.
> > > > +
> > > > +	  Select a specific number if testing RCU itself.
> > > 
> > > ...or if attempting to tune for a specific NUMA system.
> > 
> > Indeed.  But I need to see an actual example before I document it.
> > It would be easy to make things slower by following the NUMA hardware
> > layout.
> 
> Fair enough.
> 
> > > > +	  Take the default if unsure.
> > > > +
> > > > +config RCU_FANOUT_EXACT
> > > > +	bool "Disable hierarchical RCU auto-balancing"
> > > > +	depends on CLASSIC_RCU
> > > > +	default n
> > > > +	help
> > > > +	  This option forces use of the exact RCU_FANOUT value specified,
> > > > +	  regardless of imbalances in the hierarchy.  This can be useful
> > > > +	  on systems with strong NUMA behavior.
> > > > +
> > > > +	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
> > > 
> > > You might want to give a specific example of a NUMA machine, the
> > > appropriate value to use on that machine, and the result with and
> > > without RCU_FANOUT_EXACT.
> > 
> > Or change "can" to "might".  ;-)
> 
> :)
> 
> Right, my comment only applies if such an example actually exists. :)

Hopefully we have correctly tuned the uncertainty.

> > > > -static int blimit = 10;
> > > > -static int qhimark = 10000;
> > > > -static int qlowmark = 100;
> > > > +static int blimit = 10;		/* Maximum callbacks per softirq. */
> > > > +static int qhimark = 10000;	/* If this many pending, ignore blimit. */
> > > > +static int qlowmark = 100;	/* Once only this many pending, use blimit. */
> > > 
> > > Indentation mismatch on the comments?
> > 
> > Looks fine in the source -- context diff-ism.
> 
> Sigh.  Yay for tabs.
> 
> > > >  #ifdef CONFIG_SMP
> > > > -static void force_quiescent_state(struct rcu_data *rdp,
> > > > -			struct rcu_ctrlblk *rcp)
> > > > +static void force_quiescent_state(struct rcu_state *rsp)
> > > >  {
> > > >  	int cpu;
> > > > -	cpumask_t cpumask;
> > > >  	unsigned long flags;
> > > > 
> > > >  	set_need_resched();
> > > > -	spin_lock_irqsave(&rcp->lock, flags);
> > > > -	if (unlikely(!rcp->signaled)) {
> > > > -		rcp->signaled = 1;
> > > > +	if (!spin_trylock_irqsave(&rsp->onofflock, flags))
> > > > +		return;
> > > 
> > > This seems to make force_quiescent_state rather less forceful.
> > 
> > It will try again on the next scheduling-clock interrupt.  The reason
> > I did this is because ->onofflock is a global lock acquired when
> > beginning a quiescent state or when onlining/offlining.  Can't let
> > force_quiescent_state() monopolize things, and would like to exclude
> > online/offline while doing force_quiescent_state().  Hence make
> > force_quiescent_state() back off if the lock is held.
> > 
> > There is probably a better way to do this...
> 
> Primarily concerned about the possibility of perpetual failure.  Then
> again, eventually a grace period will occur "naturally".  Just wondering
> whether the inability to force might cause a problem.

Ah!  So the spin_trylock_irqsave() of ->onofflock can fail for the
following reasons:

1.	Some other CPU is in force_quiescent_state().  Here there is
	clearly no problem.

2.	Some other CPU is initializing the rcu_node hierarchy to set
	up a new grace period.  Here, we shouldn't have been
	executing force_quiescent_state() in the first place, so
	again no problem.

3.	Some other CPU is adjusting the rcu_node hierarchy to account
	for a CPU online or offline operation.  There is enough overhead
	in onlining and offlining CPUs that it seems unlikely that this
	could result in a denial of service.  However, if someone can
	make this happen, I will make the online/offline operation check
	to see if it should do a force_quiescent_state() -- which will
	require an __force_quiescent_state() where the onofflock is
	acquired by the caller.

So we are covered on #1 and #2, and very likely covered on #3, with an
easy fix if I am wrong.

> > > > -#else
> > > > +#else /* #ifdef CONFIG_HOTPLUG_CPU */
> > > > 
> > > > -static void rcu_offline_cpu(int cpu)
> > > > +static inline void
> > > > +rcu_offline_cpu(int cpu)
> > > >  {
> > > >  }
> > > 
> > > No need to explicitly say "inline"; GCC should do the right thing here.
> > > Same comment applies a couple of other places in your patch.
> > 
> > OK, I will get rid of these.  You can do the other 26,000 of them.  ;-)
> 
> :)
> 
> > > > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> > > >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> > > >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > > > 
> > > > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > > > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > > > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > > > +	       rcu_pending(cpu);
> > > 
> > > !! seems unnecessary here.
> > 
> > Someone once told me why this was necessary, but I forget.  It was in the
> > original, and I didn't put it there.  Some weirdness about conversion
> > to 32-bit integer when the lower 32 bits of the pointer was zero or
> > some such.  So if your pointer value was 0x100000000, for example,
> > so that conversion to int gives zero.
> 
> Good point!  That doesn't apply if you use ||, though.  If you just did
> "return somepointer" that could potentially cause the problem you
> describe.  In any case, it can't *hurt* to have it; GCC should do the
> sane thing.

OK.  I will review this towards the end, leaving it there to remind me
in the meantime.

So, would I need the !! on the left-hand operand of the first || due
to short-circuiting?

> > > > +void call_rcu_bh(struct rcu_head *head,
> > > > +				void (*func)(struct rcu_head *rcu))
> > > > +{
> > > > +	unsigned long flags;
> > > > +
> > > > +	head->func = func;
> > > > +	head->next = NULL;
> > > > +	local_irq_save(flags);
> > > > +	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
> > > > +	local_irq_restore(flags);
> > > > +}
> > > > +EXPORT_SYMBOL_GPL(call_rcu_bh);
> > > 
> > > This comment applies to the original code, but:
> > > You only call __call_rcu twice, in call_rcu and call_rcu_bh.  Both
> > > times, you set head first, then wrap the call with local_irq_save.  How
> > > about moving both into __call_rcu, making call_rcu and call_rcu_bh
> > > one-liners?
> > 
> > I can't pass "rcu_data" to a function (or at least I don't know how to
> > do so, short of passing __per_cpu_rcu_data and doing the per-CPU stuff
> > by hand).  I could make __call_rcu() be a macro, but that seemed more
> > ugly than it seemed worthwhile.
> > 
> > Is there some other approach that would work?
> 
> Hmmm.  No, not that I know of.  Sigh.

The only other thing I can think of is dynamically allocated per-CPU
variables, which seemed more ugly than helpful in this case.

> > > > +static char *rcuclassic_trace_buf;
> > > > +#define RCUPREEMPT_TRACE_BUF_SIZE 4096
> > > 
> > > Did you perhaps want PAGE_SIZE?
> > 
> > I really want some way of gracefully handling arbitrarily long output
> > to debugfs.  I am sure that some such exists, but haven't found it.
> > What I do instead is to arbitrarily truncate output to 4096 bytes,
> > which will be stunningly less than useful on a 4,096-CPU machine.  :-/
> > 
> > Suggestions welcome!
> 
> I can see two possibilities, depending on how much complexity you want.
> 
> The complicated way: do one pass calling snprintf everywhere and adding
> up the total length used, and if you run out of memory during that pass,
> reallocate the buffer to at least the total length you accumulated.  Or
> something like that.
> 
> The simple hack:
> #define RCUPREEMPT_TRACE_BUF_SIZE (NR_CPUS * something)
> 
> :)

Given that this doesn't show up in production kernels, I will take
door #2.  Though I was hoping for some sort of interface that "just
made it work" regardless of the size of user reads and the length
and pattern of in-kernel prints, but that might be a bit much...

						Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-26 16:05       ` Paul E. McKenney
@ 2008-08-27  0:38         ` Josh Triplett
  2008-08-27 18:34           ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Josh Triplett @ 2008-08-27  0:38 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Tue, 2008-08-26 at 09:05 -0700, Paul E. McKenney wrote:
> On Mon, Aug 25, 2008 at 03:02:30PM -0700, Josh Triplett wrote:
> > On Fri, 2008-08-22 at 18:53 -0700, Paul E. McKenney wrote:
> > > On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> > > > On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > > > > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> > > > >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> > > > >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > > > > 
> > > > > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > > > > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > > > > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > > > > +	       rcu_pending(cpu);
> > > > 
> > > > !! seems unnecessary here.
> > > 
> > > Someone once told me why this was necessary, but I forget.  It was in the
> > > original, and I didn't put it there.  Some weirdness about conversion
> > > to 32-bit integer when the lower 32 bits of the pointer was zero or
> > > some such.  So if your pointer value was 0x100000000, for example,
> > > so that conversion to int gives zero.
> > 
> > Good point!  That doesn't apply if you use ||, though.  If you just did
> > "return somepointer" that could potentially cause the problem you
> > describe.  In any case, it can't *hurt* to have it; GCC should do the
> > sane thing.
> 
> OK.  I will review this towards the end, leaving it there to remind me
> in the meantime.
> 
> So, would I need the !! on the left-hand operand of the first || due
> to short-circuiting?

No.  || will always return 1 or 0.  You only need the !! if you want to
directly return the boolean value of a potentially 64-bit pointer.

- Josh Triplett



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-25 15:26       ` Peter Zijlstra
@ 2008-08-27 18:28         ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-27 18:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Josh Triplett, linux-kernel, cl, mingo, akpm, manfred, dipankar,
	schamp, niv, dvhltc, ego, laijs, rostedt

On Mon, Aug 25, 2008 at 05:26:43PM +0200, Peter Zijlstra wrote:
> On Mon, 2008-08-25 at 08:16 -0700, Paul E. McKenney wrote:
> > On Mon, Aug 25, 2008 at 12:34:56PM +0200, Peter Zijlstra wrote:
> > > On Fri, 2008-08-22 at 16:29 -0700, Josh Triplett wrote:
> > > 
> > > > > @@ -26,8 +27,10 @@
> > > > >   * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
> > > > >   *
> > > > >   * For detailed explanation of Read-Copy Update mechanism see -
> > > > > - * 		Documentation/RCU
> > > > > - *
> > > > > + * 	Documentation/RCU
> > > > > + * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> > > > > + * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> > > > > + * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> > > > >   */
> > > > 
> > > > Why put these references here rather than in Documentation/RCU?  It
> > > > seems easier to keep documentation up to date in one place.  If you
> > > > think these represent a good "getting started" set of documents, how
> > > > about a Documentation/RCU/ReadTheseFirst with links to them, or how
> > > > about linking to them from whatisRCU.txt?
> > > 
> > > I actually like in code comments and 'documentation' more than
> > > Documentation/ stuff. Mostly because Documentation/ is:
> > >  - far away from the code
> > >  - therefore, more easily bitrotted
> > >  - and easily forgotten
> > 
> > I know!!!
> > 
> > #ifdef JOSH_TRIPLETT
> >  * 	Documentation/RCU
> >  * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
> >  * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
> >  * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
> > #elif PETER_ZIJLSTRA
> >  * 	Documentation/RCU
> > #endif
> > 
> > (Sorry, couldn't resist!!!)
> 
> But but but, you got the cases the wrong way around.. ;-)

Good point...

#ifdef READER_LIKES_DOCUMENTATION_URLS_IN_COMMENTS
 * 	Documentation/RCU
 * 	http://lwn.net/Articles/262464/ (What is RCU, Fundamentally?)
 * 	http://lwn.net/Articles/263130/ (What is RCU's Usage?)
 * 	http://lwn.net/Articles/264090/ (What is RCU's API? + references)
#else
 * 	Documentation/RCU
#endif

Of course, the C preprocessor would just remove the whole comment
anyway, but hopefully it is the thought that counts.  ;-)

> > Seriously, I know where all the documentation is, as I wrote most of it.
> > These comments are for you guys.  So, any thoughts on how I should
> > resolve this?  My default is, as always, a coin flip.  ;-)
> 
> I guess we could do the 'this is how the concept works and can be used
> like so and so' documentation in Documentation/

Documentation/RCU/whatisRCU.txt does in fact contain the three URLs
listed above.  And there is always Documentation/RCU/RTFP.txt for
people wanting the full effect.

> And the stuff that says 'this code does like so and so, because blah'
> should stay near the code.
> 
> And in any case of doubt - stay near the code :-)
> 
> I always view Documentation/ as end user stuff (be that a kernel
> programmer that needs to learn a new API, or userland folks or people
> wanting to know what a certain feature is about).

I confess to erring on the side of spamming all channels.  Then again,
I am a serial junk-mailer, so perhaps this is just me.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-27  0:38         ` Josh Triplett
@ 2008-08-27 18:34           ` Paul E. McKenney
  2008-08-27 20:23             ` Josh Triplett
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-27 18:34 UTC (permalink / raw)
  To: Josh Triplett
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Tue, Aug 26, 2008 at 05:38:36PM -0700, Josh Triplett wrote:
> On Tue, 2008-08-26 at 09:05 -0700, Paul E. McKenney wrote:
> > On Mon, Aug 25, 2008 at 03:02:30PM -0700, Josh Triplett wrote:
> > > On Fri, 2008-08-22 at 18:53 -0700, Paul E. McKenney wrote:
> > > > On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> > > > > On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > > > > > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> > > > > >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> > > > > >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > > > > > 
> > > > > > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > > > > > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > > > > > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > > > > > +	       rcu_pending(cpu);
> > > > > 
> > > > > !! seems unnecessary here.
> > > > 
> > > > Someone once told me why this was necessary, but I forget.  It was in the
> > > > original, and I didn't put it there.  Some weirdness about conversion
> > > > to 32-bit integer when the lower 32 bits of the pointer was zero or
> > > > some such.  So if your pointer value was 0x100000000, for example,
> > > > so that conversion to int gives zero.
> > > 
> > > Good point!  That doesn't apply if you use ||, though.  If you just did
> > > "return somepointer" that could potentially cause the problem you
> > > describe.  In any case, it can't *hurt* to have it; GCC should do the
> > > sane thing.
> > 
> > OK.  I will review this towards the end, leaving it there to remind me
> > in the meantime.
> > 
> > So, would I need the !! on the left-hand operand of the first || due
> > to short-circuiting?
> 
> No.  || will always return 1 or 0.  You only need the !! if you want to
> directly return the boolean value of a potentially 64-bit pointer.

Even if one argument of || is long and the other int or some fool thing
like that?  (What, me paranoid???)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-27 18:34           ` Paul E. McKenney
@ 2008-08-27 20:23             ` Josh Triplett
  2008-08-27 20:41               ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Josh Triplett @ 2008-08-27 20:23 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Wed, 2008-08-27 at 11:34 -0700, Paul E. McKenney wrote:
> On Tue, Aug 26, 2008 at 05:38:36PM -0700, Josh Triplett wrote:
> > On Tue, 2008-08-26 at 09:05 -0700, Paul E. McKenney wrote:
> > > On Mon, Aug 25, 2008 at 03:02:30PM -0700, Josh Triplett wrote:
> > > > On Fri, 2008-08-22 at 18:53 -0700, Paul E. McKenney wrote:
> > > > > On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> > > > > > On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > > > > > > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> > > > > > >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> > > > > > >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > > > > > > 
> > > > > > > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > > > > > > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > > > > > > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > > > > > > +	       rcu_pending(cpu);
> > > > > > 
> > > > > > !! seems unnecessary here.
> > > > > 
> > > > > Someone once told me why this was necessary, but I forget.  It was in the
> > > > > original, and I didn't put it there.  Some weirdness about conversion
> > > > > to 32-bit integer when the lower 32 bits of the pointer was zero or
> > > > > some such.  So if your pointer value was 0x100000000, for example,
> > > > > so that conversion to int gives zero.
> > > > 
> > > > Good point!  That doesn't apply if you use ||, though.  If you just did
> > > > "return somepointer" that could potentially cause the problem you
> > > > describe.  In any case, it can't *hurt* to have it; GCC should do the
> > > > sane thing.
> > > 
> > > OK.  I will review this towards the end, leaving it there to remind me
> > > in the meantime.
> > > 
> > > So, would I need the !! on the left-hand operand of the first || due
> > > to short-circuiting?
> > 
> > No.  || will always return 1 or 0.  You only need the !! if you want to
> > directly return the boolean value of a potentially 64-bit pointer.
> 
> Even if one argument of || is long and the other int or some fool thing
> like that?  (What, me paranoid???)

What, you don't know exactly how C behaves in every strange corner
case? ;)

|| always produces a result of type int, and it compares each of its two
arguments to 0 independently; to the best of my knowledge the size of
those arguments never matters.

- Josh Triplett



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] scalable classic RCU implementation
  2008-08-27 20:23             ` Josh Triplett
@ 2008-08-27 20:41               ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-27 20:41 UTC (permalink / raw)
  To: Josh Triplett
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, schamp, niv,
	dvhltc, ego, laijs, rostedt

On Wed, Aug 27, 2008 at 01:23:28PM -0700, Josh Triplett wrote:
> On Wed, 2008-08-27 at 11:34 -0700, Paul E. McKenney wrote:
> > On Tue, Aug 26, 2008 at 05:38:36PM -0700, Josh Triplett wrote:
> > > On Tue, 2008-08-26 at 09:05 -0700, Paul E. McKenney wrote:
> > > > On Mon, Aug 25, 2008 at 03:02:30PM -0700, Josh Triplett wrote:
> > > > > On Fri, 2008-08-22 at 18:53 -0700, Paul E. McKenney wrote:
> > > > > > On Fri, Aug 22, 2008 at 04:29:32PM -0700, Josh Triplett wrote:
> > > > > > > On Thu, 2008-08-21 at 16:43 -0700, Paul E. McKenney wrote:
> > > > > > > > @@ -658,14 +806,19 @@ int rcu_needs_cpu(int cpu)
> > > > > > > >  	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
> > > > > > > >  	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
> > > > > > > > 
> > > > > > > > -	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
> > > > > > > > +	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
> > > > > > > > +	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
> > > > > > > > +	       rcu_pending(cpu);
> > > > > > > 
> > > > > > > !! seems unnecessary here.
> > > > > > 
> > > > > > Someone once told me why this was necessary, but I forget.  It was in the
> > > > > > original, and I didn't put it there.  Some weirdness about conversion
> > > > > > to 32-bit integer when the lower 32 bits of the pointer was zero or
> > > > > > some such.  So if your pointer value was 0x100000000, for example,
> > > > > > so that conversion to int gives zero.
> > > > > 
> > > > > Good point!  That doesn't apply if you use ||, though.  If you just did
> > > > > "return somepointer" that could potentially cause the problem you
> > > > > describe.  In any case, it can't *hurt* to have it; GCC should do the
> > > > > sane thing.
> > > > 
> > > > OK.  I will review this towards the end, leaving it there to remind me
> > > > in the meantime.
> > > > 
> > > > So, would I need the !! on the left-hand operand of the first || due
> > > > to short-circuiting?
> > > 
> > > No.  || will always return 1 or 0.  You only need the !! if you want to
> > > directly return the boolean value of a potentially 64-bit pointer.
> > 
> > Even if one argument of || is long and the other int or some fool thing
> > like that?  (What, me paranoid???)
> 
> What, you don't know exactly how C behaves in every strange corner
> case? ;)

I used to, back when identifiers were only guaranteed to be
differentiated by their first 8 characters (6 or 7 if extern).  ;-)

> || always produces a result of type int, and it compares each of its two
> arguments to 0 independently; to the best of my knowledge the size of
> those arguments never matters.

I suppose I should read the spec.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-25  0:07 ` [PATCH, RFC, tip/core/rcu] v2 " Paul E. McKenney
@ 2008-08-30  0:49   ` Paul E. McKenney
  2008-08-30  9:33     ` Peter Zijlstra
                       ` (3 more replies)
  0 siblings, 4 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-30  0:49 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt, peterz

Hello!

Still experimental, not for inclusion.  But getting better!

Updates from v2:

o	Fixed a number of bugs uncovered by running rcutorture in
	parallel with onlining and offlining CPUs.  Many of these
	were due to the fact that there can be a multiple-grace-period
	window during which RCU and the process scheduler disagree
	about whether a given CPU is offline.  The solution was
	to make force_quiescent_state() check for RCU waiting on
	offlined CPUs, and then to clean up all the locking gotchas
	that resulted from that change.

o	Upgraded tracing capability with additional statistics, for
	example, per-CPU counts of how often force_quiescent_state()
	responded on their behalf (because they were offline, in
	dyntick-idle state, or needed a resched IPI).  Also abbreviated
	more severely to allow the system to run longer within the
	confines of an 80-character xterm.

o	Added sparse annotations so that it sparses cleanly.

o	Added an argument to force_quiescent_state() so that for normal
	callers, it checks for enough time having passed since the
	last try.  Emergency callers (__call_rcu() with more than
	10,000 RCU callbacks piled up on the local CPU) get their
	quiescent state forced unconditionally.

o	Added mapping from CPU to rcu_data structure to allow RCU to
	easily switch its attention from (say) the CPU being offlined
	to the currently running CPU, should the offlining kick off
	a new RCU grace period.

o	Made the trace buffer's size a function of the number of
	CPUs so that the rcudata debugfs file works correctly on
	128-CPU machines.

Attached is an updated patch to Classic RCU that applies a hierarchy,
greatly reducing the contention on the top-level lock for large
machines.  This passes mild rcutorture testing on x86 and ppc64,
including some 12-hour runs on 8-CPU machines and an hour thus far
on a 128-CPU machine, but is most definitely not ready for inclusion.
It is OK for experimental work assuming sufficiently brave experimenters.
See also Manfred Spraul's recent patches (or his earlier work from 2004
at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).  We will
converge onto a common patch in the fullness of time, but are currently
exploring different regions of the design space.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 32,768 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	Entering and leaving dynticks idle mode is a quiescent state,
	but the current patch doesn't take advantage of this (noted
	by Manfred).  It appears that it should be possible to make
	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
	out whether it is safe to tell RCU about the quiescent state --
	and also greatly simplify the code.

o	Both rcu_pending() and rcu_needs_cpu() need to be a bit
	smarter.

o	The cpu_quiet() and cpu_quiet_msk() functions should use
	pre-fab masks rather than doing shifting each time.

o	The check-CPU-stalls code is busted.  Will be fixed.

o	There are a few places where grace periods are unnecessarily
	delayed.

o	There are probably hangs, rcutorture failures, &c.

o	There is not yet a human-readable design document.  Will be fixed.

If you want to use this against a Linus kernel, do the following:

Start with 2.6.27-rc3.

Apply http://www.rdrop.com/users/paulmck/patches/paulmck-rcu.2008.08.20a.patch
which catches you up to a recent linux-2.6-tip tip/core/rcu commit.

Apply http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-hierRCU-30.patch
which gets you the current hierarchical RCU implementation.

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 
 include/linux/hardirq.h    |    4 
 include/linux/rcuclassic.h |  251 +++++--
 kernel/Kconfig.preempt     |   32 
 kernel/Makefile            |    5 
 kernel/rcuclassic.c        | 1586 ++++++++++++++++++++++++++++++++-------------
 kernel/rcuclassic_trace.c  |  227 ++++++
 6 files changed, 1617 insertions(+), 488 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..a776bf0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,13 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+#endif /* #if defined(CONFIG_NO_HZ) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 1658995..f242605 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -15,19 +15,16 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright IBM Corporation, 2001
+ * Copyright IBM Corporation, 2008
  *
  * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
  *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
- *
+ * 	Documentation/RCU
  */
 
 #ifndef __LINUX_RCUCLASSIC_H
@@ -40,69 +37,184 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
 
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	long	cur;		/* Current batch number.                      */
-	long	completed;	/* Number of the last completed batch         */
-	long	pending;	/* Number of the last pending batch           */
-#ifdef CONFIG_DEBUG_RCU_STALL
-	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
-#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+#if (NR_CPUS) <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
 
-	int	signaled;
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
 
-	spinlock_t	lock	____cacheline_internodealigned_in_smp;
-	cpumask_t	cpumask; /* CPUs that need to switch in order    */
-				 /* for current batch to proceed.        */
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long	qsmask;	/* CPUs or groups that need to switch in      */
+				/*  order for current grace period to proceed.*/
+	unsigned long	qsmaskinit;
+				/* Per-GP initialization for qsmask.	      */
+	int	grplo;		/* lowest-numbered CPU or group here.	      */
+	int	grphi;		/* highest-numbered CPU or group here.	      */
+	u8	grpnum;		/* CPU/group number for next level up.	      */
+	u8	level;		/* root is at level 0.			      */
+	struct rcu_node *parent;
 } ____cacheline_internodealigned_in_smp;
 
-/* Is batch a before batch b ? */
-static inline int rcu_batch_before(long a, long b)
-{
-	return (a - b) < 0;
-}
-
-/* Is batch a after batch b ? */
-static inline int rcu_batch_after(long a, long b)
-{
-	return (a - b) > 0;
-}
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
 
-/* Per-CPU data for Read-Copy UPdate. */
+/* Per-CPU data for read-copy update. */
 struct rcu_data {
-	/* 1) quiescent state handling : */
-	long		quiescbatch;     /* Batch # for grace period */
-	int		passed_quiesc;	 /* User-mode/idle loop etc. */
-	int		qs_pending;	 /* core waits for quiesc state */
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
 
 	/* 2) batch handling */
 	/*
-	 * if nxtlist is not NULL, then:
-	 * batch:
-	 *	The batch # for the last entry of nxtlist
-	 * [*nxttail[1], NULL = *nxttail[2]):
-	 *	Entries that batch # <= batch
-	 * [*nxttail[0], *nxttail[1]):
-	 *	Entries that batch # <= batch - 1
-	 * [nxtlist, *nxttail[0]):
-	 *	Entries that batch # <= batch - 2
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
 	 *	The grace period for these entries has completed, and
 	 *	the other grace-period-completed entries may be moved
 	 *	here temporarily in rcu_process_callbacks().
 	 */
-	long  	       	batch;
 	struct rcu_head *nxtlist;
-	struct rcu_head **nxttail[3];
-	long            qlen; 	 	 /* # of queued callbacks */
-	struct rcu_head *donelist;
-	struct rcu_head **donetail;
-	long		blimit;		 /* Upper limit on a processed batch */
-	int cpu;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+	/* 3) rcu-barrier functions */
 	struct rcu_head barrier;
+
+#ifdef CONFIG_NO_HZ
+	/* 4) dynticks interface (see http://lwn.net/Articles/279077/) */
+	int dynticks_nesting;		/* Track nesting level, sort of. */
+	int dynticks;			/* Even for dynticks-idle mode. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 5) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_SAVE_DYNTICK	0	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		1	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#define RCU_SECONDS_TILL_STALL_CHECK	 3	/* for rsp->seconds_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	30	/* for rsp->seconds_stall */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* sent GP-kick IPIs? */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_DEBUG_RCU_STALL
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long seconds_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
 };
 
+extern struct rcu_state rcu_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 /*
@@ -115,11 +227,13 @@ static inline void rcu_qsctr_inc(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
 }
 static inline void rcu_bh_qsctr_inc(int cpu)
 {
 	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
 	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
 }
 
 extern int rcu_pending(int cpu);
@@ -172,7 +286,44 @@ extern void rcu_restart_cpu(int cpu);
 extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
 
+#ifdef CONFIG_NO_HZ
+
+/*
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ *
+ * @@@ note quiescent state???
+ */
+static inline void rcu_enter_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	__get_cpu_var(rcu_data).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_data).dynticks & 0x1, &rs);
+	__get_cpu_var(rcu_bh_data).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_bh_data).dynticks & 0x1, &rs);
+}
+
+/*
+ * Exit nohz mode.
+ */
+static inline void rcu_exit_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	__get_cpu_var(rcu_data).dynticks++;
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_data).dynticks & 0x1), &rs);
+	__get_cpu_var(rcu_bh_data).dynticks++;
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_bh_data).dynticks & 0x1), &rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+#else /* CONFIG_NO_HZ */
 #define rcu_enter_nohz()	do { } while (0)
 #define rcu_exit_nohz()		do { } while (0)
+#endif /* CONFIG_NO_HZ */
 
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03..38a64ae 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -68,7 +68,6 @@ config PREEMPT_RCU
 
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
 	select DEBUG_FS
 	default y
 	help
@@ -77,3 +76,34 @@ config RCU_TRACE
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on CLASSIC_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable hierarchical RCU auto-balancing"
+	depends on CLASSIC_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df..b018f62 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -75,8 +75,9 @@ obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
+ifeq ($(CONFIG_RCU_TRACE),y)
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic_trace.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt_trace.o
 endif
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 01e761a..e0a865d 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -15,20 +15,17 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
- * Copyright IBM Corporation, 2001
+ * Copyright IBM Corporation, 2008
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
  *
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		Documentation/RCU
- *
+ * 	Documentation/RCU
  */
 #include <linux/types.h>
 #include <linux/kernel.h>
@@ -56,273 +53,393 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
 
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.pending = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
 {
-	int cpu;
-	cpumask_t cpumask;
-	unsigned long flags;
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
-	set_need_resched();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (unlikely(!rcp->signaled)) {
-		rcp->signaled = 1;
-		/*
-		 * Don't send IPI to itself. With irqs disabled,
-		 * rdp->cpu is the current cpu.
-		 *
-		 * cpu_online_map is updated by the _cpu_down()
-		 * using __stop_machine(). Since we're in irqs disabled
-		 * section, __stop_machine() is not exectuting, hence
-		 * the cpu_online_map is stable.
-		 *
-		 * However,  a cpu might have been offlined _just_ before
-		 * we disabled irqs while entering here.
-		 * And rcu subsystem might not yet have handled the CPU_DEAD
-		 * notification, leading to the offlined cpu's bit
-		 * being set in the rcp->cpumask.
-		 *
-		 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
-		 * sending smp_reschedule() to an offlined CPU.
-		 */
-		cpus_and(cpumask, rcp->cpumask, cpu_online_map);
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask_nr(cpu, cpumask)
-			smp_send_reschedule(cpu);
-	}
-	spin_unlock_irqrestore(&rcp->lock, flags);
+/*
+ * Return the number of RCU-bh batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/* Raises the softirq for processing rcu_callbacks. */
+static void raise_rcu_softirq(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
 }
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+
+/*
+ * Does the CPU have any callbacks in any state?
+ */
+static int
+cpu_has_callbacks(struct rcu_data *rdp)
 {
-	set_need_resched();
+	return rdp->nxtlist != NULL;
 }
-#endif
 
-static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
-		struct rcu_data *rdp)
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
 {
-	long batch;
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
 
-	head->next = NULL;
-	smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+/*
+ * When a given CPU first becomes aware of a grace period, it knows
+ * that all of its pre-existing callbacks will be covered by the next
+ * grace period.  Therefore, this function may be called only on
+ * behalf of the calling CPU or on behalf of an offline CPU.
+ *
+ * Similarly, if a given CPU has not yet let RCU know that it passed
+ * through a quiescent state for the current grace period, then that
+ * CPU knows that all of its callbacks may safely be invoked at the
+ * end of the next grace period.
+ */
+static void
+rcu_next_callbacks_are_ready(struct rcu_data *rdp)
+{
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+}
 
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
 	/*
-	 * Determine the batch number of this callback.
-	 *
-	 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
-	 * local variable "batch" and emits codes like this:
-	 *	1) rdp->batch = rcp->cur + 1 # gets old value
-	 *	......
-	 *	2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
-	 * then [*nxttail[0], *nxttail[1]) may contain callbacks
-	 * that batch# = rdp->batch, see the comment of struct rcu_data.
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
 	 */
-	batch = ACCESS_ONCE(rcp->cur) + 1;
-
-	if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
-		/* process callbacks */
-		rdp->nxttail[0] = rdp->nxttail[1];
-		rdp->nxttail[1] = rdp->nxttail[2];
-		if (rcu_batch_after(batch - 1, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[2];
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
 	}
 
-	rdp->batch = batch;
-	*rdp->nxttail[2] = head;
-	rdp->nxttail[2] = &head->next;
+	/*
+	 * We need this CPU to either enter dynticks idle mode or pass
+	 * through a quiescent state.  Send it a reschedule IPI.
+	 */
 
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#ifdef CONFIG_NO_HZ
+
+/*
+ * Helper function for rcu_irq_enter().
+ */
+void __rcu_irq_enter(struct rcu_data *rdp)
+{
+	if (rdp->dynticks_nesting)
+		rdp->dynticks_nesting++;
+
+	/*
+	 * Only update if we are coming from a stopped ticks mode
+	 * (rdp->dynticks is even).
+	 */
+	if (!in_interrupt() &&
+	    (rdp->dynticks & 0x1) == 0) {
+		/*
+		 * The following might seem like we could have a race
+		 * with NMI/SMIs. But this really isn't a problem.
+		 * Here we do a read/modify/write, and the race happens
+		 * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI which is the most important
+		 * part.
+		 *
+		 * The only thing is that we would bring back the counter
+		 * to a position that it was in during the NMI/SMI.
+		 * But the zero bit would be set, so the rest of the
+		 * counter would again be ignored.
+		 *
+		 * On return from the IRQ, the counter may have the zero
+		 * bit be 0 and the counter the same as the return from
+		 * the NMI/SMI. If the state machine was so unlucky to
+		 * see that, it still doesn't matter, since all
+		 * RCU read-side critical sections on this CPU would
+		 * have already completed.
+		 */
+		rdp->dynticks++;
+		/*
+		 * The following memory barrier ensures that any RCU
+		 * read-side critical sections in the irq handler are
+		 * seen by other CPUs to follow the above increment to
+		 * rdp->dynticks. This is required in order for other CPUs
+		 * to correctly determine when it is safe to advance the
+		 * RCU grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		/*
+		 * Since we can't determine the dynamic tick mode from
+		 * the rdp->dynticks after this routine, we use a second
+		 * flag to acknowledge that we came from an idle state
+		 * with ticks stopped.
+		 */
+		rdp->dynticks_nesting++;
+		/*
+		 * If we take an NMI/SMI now, they will also increment
+		 * the dynticks_nesting counter, and will not update the
+		 * rdp->dynticks on exit. That is for this IRQ to do.
+		 */
 	}
 }
 
 /**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
  *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdp->dynticks to let the RCU handling know that the CPU is active.
  */
-void call_rcu(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
+void rcu_irq_enter(void)
 {
-	unsigned long flags;
+	__rcu_irq_enter(&__get_cpu_var(rcu_data));
+	__rcu_irq_enter(&__get_cpu_var(rcu_bh_data));
+}
 
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	local_irq_restore(flags);
+/*
+ * Helper function for rcu_irq_exit().
+ */
+static void __rcu_irq_exit(struct rcu_data *rdp)
+{
+	/*
+	 * rdp->dynticks_nesting is set if we interrupted the CPU
+	 * when it was idle with ticks stopped.
+	 * Once this occurs, we keep track of interrupt nesting
+	 * because a NMI/SMI could also come in, and we still
+	 * only want the IRQ that started the increment of the
+	 * rdp->dynticks to be the one that modifies it on exit.
+	 */
+	if (rdp->dynticks_nesting) {
+		if (--rdp->dynticks_nesting)
+			return;
+
+		/* This must match the interrupt nesting */
+		WARN_ON(in_interrupt());
+
+		/*
+		 * If an NMI/SMI happens now we are still
+		 * protected by the rdp->dynticks being odd.
+		 */
+
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_unlock() primitives in the irq handler
+		 * are seen by other CPUs to precede the following
+		 * increment to rdp->dynticks. This is required in
+		 * order for other CPUs to determine when it is safe
+		 * to advance the RCU grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		rdp->dynticks++;
+		WARN_ON(rdp->dynticks & 0x1);
+	}
 }
-EXPORT_SYMBOL_GPL(call_rcu);
 
 /**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * rcu_irq_exit - Called from exiting Hard irq context.
  *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
+ * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
+ * to let the RCU handling be aware that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
 {
-	unsigned long flags;
-
-	head->func = func;
-	local_irq_save(flags);
-	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-	local_irq_restore(flags);
+	__rcu_irq_exit(&__get_cpu_var(rcu_data));
+	__rcu_irq_exit(&__get_cpu_var(rcu_bh_data));
 }
-EXPORT_SYMBOL_GPL(call_rcu_bh);
 
 /*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
  */
-long rcu_batches_completed(void)
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
-	return rcu_ctrlblk.completed;
+	int ret;
+	int snap;
+
+	snap = rdp->dynticks;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	ret = (snap & 0x1) == 0;
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
 
 /*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
+ * Snapshot the global completed counter so that later on it will be
+ * possible to tell which grace period any detected dyntick-idle
+ * quiescent states belong to.  The caller must hold the root rcu_node
+ * lock.
  */
-long rcu_batches_completed_bh(void)
+static void dyntick_save_completed(struct rcu_state *rsp, long completed)
 {
-	return rcu_bh_ctrlblk.completed;
+	rsp->dynticks_completed = completed;
 }
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 
-/* Raises the softirq for processing rcu_callbacks. */
-static inline void raise_rcu_softirq(void)
+/*
+ * Get the value previously saved by dyntick_save_completed().
+ */
+static long dyntick_get_completed(struct rcu_state *rsp)
 {
-	raise_softirq(RCU_SOFTIRQ);
+	return rsp->dynticks_completed;
 }
 
 /*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through a dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
  */
-static void rcu_do_batch(struct rcu_data *rdp)
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 {
-	struct rcu_head *next, *list;
-	int count = 0;
+	long curr;
+	long snap;
 
-	list = rdp->donelist;
-	while (list) {
-		next = list->next;
-		prefetch(next);
-		list->func(list);
-		list = next;
-		if (++count >= rdp->blimit)
-			break;
+	curr = rdp->dynticks;
+	snap = rdp->dynticks_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr - snap) >= 2 || (curr & 0x1) == 0) {
+		rdp->dynticks_fqs++;
+		return 1;
 	}
-	rdp->donelist = list;
 
-	local_irq_disable();
-	rdp->qlen -= count;
-	local_irq_enable();
-	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
-		rdp->blimit = blimit;
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
 
-	if (!rdp->donelist)
-		rdp->donetail = &rdp->donelist;
-	else
-		raise_rcu_softirq();
+#else /* #ifdef CONFIG_NO_HZ */
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; }
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
 }
 
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- *   This is done by rcu_start_batch. The start is not broadcasted to
- *   all cpus, they must pick this up by comparing rcp->cur with
- *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- *   Since the start of the grace period is not broadcasted, at least two
- *   calls to rcu_check_quiescent_state are required:
- *   The first call just notices that a new grace period is running. The
- *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- *   the bitmap is empty, then the grace period is completed.
- *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- *   period (if necessary).
- */
+# define dyntick_save_completed(rsp, completed) do { } while (0)
+# define dyntick_get_completed(rsp)		((rsp)->completed)
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
 
 #ifdef CONFIG_DEBUG_RCU_STALL
 
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+static void record_gp_stall_check_time(void)
 {
-	rcp->gp_check = get_seconds() + 3;
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_CHECK;
 }
 
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+static void print_other_cpu_stall(struct rcu_ctrlblk *rsp)
 {
 	int cpu;
 	long delta;
 	unsigned long flags;
+	struct rcu_node *rnp;
 
 	/* Only let one CPU complain about others per time interval. */
 
-	spin_lock_irqsave(&rcp->lock, flags);
-	delta = get_seconds() - rcp->gp_check;
-	if (delta < 2L || cpus_empty(rcp->cpumask)) {
-		spin_unlock(&rcp->lock);
+	rnp = rcu_get_root(rsp);
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = get_seconds() - rsp->seconds_stall;
+	if (delta < 2L || rsp->gpnum != rsp->completed) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
-	rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/* OK, time to rat on our buddy... */
 
 	printk(KERN_ERR "RCU detected CPU stalls:");
-	for_each_cpu_mask(cpu, rcp->cpumask)
+	for_each_cpu_mask(cpu, rcp->cpumask) @@@ use process func...
 		printk(" %d", cpu);
-	printk(" (detected by %d, t=%lu/%lu)\n",
-	       smp_processor_id(), get_seconds(), rcp->gp_check);
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
 }
 
 static void print_cpu_stall(struct rcu_ctrlblk *rcp)
@@ -338,99 +455,316 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+static void check_cpu_stall(struct rcu_ctrlblk *rsp, struct rcu_data *rdp)
 {
 	long delta;
+	struct rcu_node *rnp;
 
-	delta = get_seconds() - rcp->gp_check;
-	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
+	delta = get_seconds() - rsp->seconds_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & (1L << (smp_processor_id() - rnp->grplo))) &&
+	    delta >= 0L) {
 
 		/* We haven't checked in, so go dump stack. */
-
 		print_cpu_stall(rcp);
 
-	} else {
-		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
-			/* They had two seconds to dump stack, so complain. */
-			print_other_cpu_stall(rcp);
-		}
+	} else if (rsp->gpnum != rsp->completed && delta >= 2L) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rcp);
 	}
 }
 
 #else /* #ifdef CONFIG_DEBUG_RCU_STALL */
 
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+static void record_gp_stall_check_time(void)
 {
 }
 
-static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+static void
+check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 }
 
 #endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
 
 /*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
  */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	if (rcp->cur != rcp->pending &&
-			rcp->completed == rcp->cur) {
-		rcp->cur++;
-		record_gp_check_time(rcp);
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
+	__releases(rsp->rda[smp_processor_id()]->lock)
+{
+	unsigned long flags = iflg;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
 
 		/*
-		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
-		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rcp->cpumask, which will extend graceperiods
-		 * unnecessarily.
+		 * Either there is no need to detect any more grace periods
+		 * at the moment, or we are already in the process of
+		 * detecting one.  Either way, we should not start a new
+		 * RCU grace period, so drop the lock and return.
 		 */
-		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+
+	rsp->gpnum++;
+	rsp->signaled = RCU_SIGNAL_INIT;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time();
+	dyntick_save_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+
+	rcu_next_callbacks_are_ready(rdp);
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 
-		rcp->signaled = 0;
+	/* Special-case the common single-level case. */
+
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
 	}
+
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
 /*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
  */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
-		/* batch completed ! */
-		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp);
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
 	}
+	local_irq_restore(flags);
 }
 
 /*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
  */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = 1L << rnp->grpnum;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  When invoking this on one's own
+ * behalf, lastcomp is used to make sure we are still in the grace period
+ * of interest.  We don't want to end the current grace period based on
+ * quiescent states detected in an earlier grace period!  On the other hand,
+ * if the CPU being quieted is offline, we can safely pass in lastcomp==NULL,
+ * since an offline CPU is in a quiescent state with respect to any grace
+ * period, unlike pesky online CPUs, which can go non-quiescent with
+ * absolutely no warning.
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
 {
 	unsigned long flags;
+	long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != NULL &&
+	    *lastcomp != ACCESS_ONCE(rsp->completed)) {
 
-	if (rdp->quiescbatch != rcp->cur) {
-		/* start new grace period: */
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
-		rdp->quiescbatch = rcp->cur;
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
+	mask = 1L << (cpu - rnp->grplo);
+	if ((rnp->qsmask & mask) == 0L) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+
+		if (cpu == rdp->cpu)
+			rcu_next_callbacks_are_ready(rdp);
+		rdp->qs_pending = 0;
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
 
-	/* Grace period already completed for this cpu?
-	 * qs_pending is checked instead of the actual bitmap to avoid
-	 * cacheline trashing.
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
 	 */
 	if (!rdp->qs_pending)
 		return;
@@ -441,195 +775,470 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 	 */
 	if (!rdp->passed_quiesc)
 		return;
-	rdp->qs_pending = 0;
 
-	spin_lock_irqsave(&rcp->lock, flags);
-	/*
-	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
-	 * during cpu startup. Ignore the quiescent state.
-	 */
-	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp);
-
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+	cpu_quiet(rdp->cpu, rsp, rdp, &rdp->passed_quiesc_completed);
 }
 
-
 #ifdef CONFIG_HOTPLUG_CPU
 
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
  */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail, long batch)
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
-	if (list) {
-		local_irq_disable();
-		this_rdp->batch = batch;
-		*this_rdp->nxttail[2] = list;
-		this_rdp->nxttail[2] = tail;
-		local_irq_enable();
+	int i;
+	unsigned long flags;
+	long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	mask = 1L << (cpu - rnp->grplo);	/* rnp->grplo is constant. */
+	while (rnp != NULL) {  /* @@@ do-while */
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = 1L << rnp->grpnum;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+						/* @@@ move up to simplify. */
+		rnp = rnp->parent;
 	}
-}
 
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	unsigned long flags;
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
 
 	/*
-	 * if the cpu going offline owns the grace period
-	 * we can block indefinitely waiting for it, so flush
-	 * it here
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
 	 */
-	spin_lock_irqsave(&rcp->lock, flags);
-	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-	spin_unlock(&rcp->lock);
-
-	this_rdp->qlen += rdp->qlen;
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
 	local_irq_restore(flags);
 }
 
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
 static void rcu_offline_cpu(int cpu)
 {
-	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
-	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
-
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
-					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
-					&per_cpu(rcu_bh_data, cpu));
-	put_cpu_var(rcu_data);
-	put_cpu_var(rcu_bh_data);
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
 }
 
-#else
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
 
-static void rcu_offline_cpu(int cpu)
+static void
+rcu_offline_cpu(int cpu)
 {
 }
 
-#endif
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
 
 /*
- * This does the RCU processing work from softirq context.
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.
  */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_data *rdp)
 {
-	long completed_snap;
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return. */
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling interrupts to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	/* Update count, and requeue any remaining callbacks. */
+	local_irq_save(flags);
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+	local_irq_restore(flags);
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_rcu_softirq();
+}
 
-	if (rdp->nxtlist) {
-		local_irq_disable();
-		completed_snap = ACCESS_ONCE(rcp->completed);
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
-		 * move the other grace-period-completed entries to
-		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * Also do a memory barrier.  This is needed to handle
+		 * the case where writes from a preempt-disable section
+		 * of code get reordered into schedule() by this CPU's
+		 * write buffer.  The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+		 * by other CPUs to happen after any such write.
 		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
-		else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
-			rdp->nxttail[0] = rdp->nxttail[1];
+
+		smp_mb();  /* See above block comment. */
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
 
 		/*
-		 * the grace period for entries in
-		 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
-		 * move these entries to donelist
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an _bh
+		 * quiescent state, so count it.  The memory barrier
+		 * is needed for the same reason as is the above one.
 		 */
-		if (rdp->nxttail[0] != &rdp->nxtlist) {
-			*rdp->donetail = rdp->nxtlist;
-			rdp->donetail = rdp->nxttail[0];
-			rdp->nxtlist = *rdp->nxttail[0];
-			*rdp->donetail = NULL;
-
-			if (rdp->nxttail[1] == rdp->nxttail[0])
-				rdp->nxttail[1] = &rdp->nxtlist;
-			if (rdp->nxttail[2] == rdp->nxttail[0])
-				rdp->nxttail[2] = &rdp->nxtlist;
-			rdp->nxttail[0] = &rdp->nxtlist;
+
+		smp_mb();  /* See above block comment. */
+		rcu_bh_qsctr_inc(cpu);
+	}
+	raise_rcu_softirq();
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		mask = 0;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0L && f(rsp->rda[cpu]))
+				mask |= bit;
 		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
 
-		local_irq_enable();
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags))
+		return;	/* Someone else is already on the job. */
+	if (relaxed && (long)(rsp->jiffies_force_qs - jiffies) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	if (rsp->completed == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_SAVE_DYNTICK:
 
-		if (rcu_batch_after(rdp->batch, rcp->pending)) {
-			unsigned long flags;
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break;
 
-			/* and start it/schedule start if it's a new batch */
-			spin_lock_irqsave(&rcp->lock, flags);
-			if (rcu_batch_after(rdp->batch, rcp->pending)) {
-				rcp->pending = rdp->batch;
-				rcu_start_batch(rcp);
-			}
-			spin_unlock_irqrestore(&rcp->lock, flags);
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_save_completed(rsp, lastcomp);
 		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggards. */
+		if (rcu_process_dyntick(rsp, dyntick_get_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
 	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
 
-	rcu_check_quiescent_state(rcp, rdp);
-	if (rdp->donelist)
-		rcu_do_batch(rdp);
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	set_need_resched();
 }
 
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  This may be called
+ * only from the CPU to whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * If an RCU GP has gone long enough, go check for dyntick
+	 * idle CPUs and, if needed, send resched IPIs.
+	 */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+	    	force_quiescent_state(rsp, 1);
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, flags);  /* releases above lock */
+	}
+
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU.
+ */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
 	/*
 	 * Memory references from any prior RCU read-side critical sections
-	 * executed by the interrupted code must be see before any RCU
+	 * executed by the interrupted code must be seen before any RCU
 	 * grace-period manupulations below.
 	 */
 
 	smp_mb(); /* See above block comment. */
 
-	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
 
 	/*
 	 * Memory references from any later RCU read-side critical sections
-	 * executed by the interrupted code must be see after any RCU
+	 * executed by the interrupted code must be seen after any RCU
 	 * grace-period manupulations above.
 	 */
 
 	smp_mb(); /* See above block comment. */
 }
 
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+static void
+__call_rcu(struct rcu_head *head, struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+		force_quiescent_state(rsp, 1);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	__call_rcu(head, &rcu_state, &__get_cpu_var(rcu_data));
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	__call_rcu(head, &rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp, rdp);
+	check_cpu_stall(rsp, rdp);
 
-	if (rdp->nxtlist) {
-		long completed_snap = ACCESS_ONCE(rcp->completed);
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
 
-		/*
-		 * This cpu has pending rcu entries and the grace period
-		 * for them has completed.
-		 */
-		if (!rcu_batch_before(completed_snap, rdp->batch))
-			return 1;
-		if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
-				rdp->nxttail[0] != rdp->nxttail[1])
-			return 1;
-		if (rdp->nxttail[0] != &rdp->nxtlist)
-			return 1;
+	/* Does this CPU have callbacks? */
+	if (cpu_has_callbacks(rdp)) /* @@@ need to be more selective. */
+		return 1;
 
-		/*
-		 * This cpu has pending rcu entries and the new batch
-		 * for then hasn't been started nor scheduled start
-		 */
-		if (rcu_batch_after(rdp->batch, rcp->pending))
-			return 1;
-	}
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+		return 1;
 
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
 		return 1;
 
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+	/* Has an RCU GP gone long enough to send resched IPIs &c? */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
 		return 1;
 
 	/* nothing to do */
@@ -643,8 +1252,8 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
  */
 int rcu_pending(int cpu)
 {
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
 }
 
 /*
@@ -658,81 +1267,97 @@ int rcu_needs_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
 
-	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
+	return !!*rdp->nxttail[RCU_DONE_TAIL] ||
+	       !!*rdp_bh->nxttail[RCU_DONE_TAIL] ||
+	       rcu_pending(cpu);
 }
 
 /*
- * Top-level function driving RCU grace-period detection, normally
- * invoked from the scheduler-clock interrupt.  This function simply
- * increments counters that are read only from softirq by this same
- * CPU, so there are no memory barriers required.
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
  */
-void rcu_check_callbacks(int cpu, int user)
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
-	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-
-		/*
-		 * Get here if this CPU took its interrupt from user
-		 * mode or from the idle loop, and if this is not a
-		 * nested interrupt.  In this case, the CPU is in
-		 * a quiescent state, so count it.
-		 *
-		 * Also do a memory barrier.  This is needed to handle
-		 * the case where writes from a preempt-disable section
-		 * of code get reordered into schedule() by this CPU's
-		 * write buffer.  The memory barrier makes sure that
-		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
-		 * by other CPUs to happen after any such write.
-		 */
+	unsigned long flags;
+	int i;
+	long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	rdp->completed = rsp->completed;
+	rdp->gpnum = rsp->completed;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->passed_quiesc_completed = rsp->completed - 1;
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks |= 1; /* want consecutive numbers even for hotplug. */
+	rdp->dynticks_nesting = 0;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
-		smp_mb();  /* See above block comment. */
-		rcu_qsctr_inc(cpu);
-		rcu_bh_qsctr_inc(cpu);
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
 
-	} else if (!in_softirq()) {
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
 
-		/*
-		 * Get here if this CPU did not take its interrupt from
-		 * softirq, in other words, if it is not interrupting
-		 * a rcu_bh read-side critical section.  This is an _bh
-		 * critical section, so count it.  The memory barrier
-		 * is needed for the same reason as is the above one.
-		 */
+	/* Add CPU to rcu_node bitmasks. */
+	rnp = rdp->mynode;
+	mask = 1L << (cpu - rnp->grplo); /* rnp->grplo is constant. */
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = 1L << rnp->grpnum;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
 
-		smp_mb();  /* See above block comment. */
-		rcu_bh_qsctr_inc(cpu);
-	}
-	raise_rcu_softirq();
-}
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
 
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
-{
-	long flags;
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
 
-	spin_lock_irqsave(&rcp->lock, flags);
-	memset(rdp, 0, sizeof(*rdp));
-	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
-	rdp->donetail = &rdp->donelist;
-	rdp->quiescbatch = rcp->completed;
-	rdp->qs_pending = 0;
-	rdp->cpu = cpu;
-	rdp->blimit = blimit;
-	spin_unlock_irqrestore(&rcp->lock, flags);
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+	local_irq_restore(flags);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
 {
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
-
-	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
-	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
+/*
+ * Handle CPU online/offline notification events.
+ */
 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 				unsigned long action, void *hcpu)
 {
@@ -753,22 +1378,117 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	/* Exact fanout: every level spreads by CONFIG_RCU_FANOUT. */
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+	}
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+	
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++) {
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	}
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->grplo = j * rsp->levelspread[i];
+			rnp->grphi = (j + 1) * rsp->levelspread[i] - 1;
+			if (rnp->grphi >= rsp->levelcnt[i + 1])
+				rnp->grphi = rsp->levelcnt[i + 1] - 1;
+			rnp->qsmaskinit = 0;
+			if (i != NUM_RCU_LVLS - 1)
+				rnp->grplo = rnp->grphi = 0;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->parent = rsp->level[i - 1] + 
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		while (i > rnp[j].grphi) /* skip CPU-less leaves */ \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
 static struct notifier_block __cpuinitdata rcu_nb = {
 	.notifier_call	= rcu_cpu_notify,
 };
 
-/*
- * Initializes rcu mechanism.  Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
 void __init __rcu_init(void)
 {
-	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
 	/* Register notifier for non-boot CPUs */
 	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
 }
 
 module_param(blimit, int, 0);
diff --git a/kernel/rcuclassic_trace.c b/kernel/rcuclassic_trace.c
new file mode 100644
index 0000000..d516049
--- /dev/null
+++ b/kernel/rcuclassic_trace.c
@@ -0,0 +1,227 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+
+static DEFINE_MUTEX(rcuclassic_trace_mutex);
+static char *rcuclassic_trace_buf;
+#define RCUPREEMPT_TRACE_BUF_SIZE (512*NR_CPUS)
+
+/* Format one CPU's rcu_data into [buf, ebuf); returns bytes actually stored. */
+static int print_one_rcu_data(struct rcu_data *rdp, char *buf, char *ebuf)
+{
+	int cnt;	/* scnprintf() never counts past ebuf, unlike snprintf(). */
+
+	cnt = scnprintf(buf, ebuf - buf,
+		"%3d c=%ld g=%ld pq=%d pqc=%ld qp=%d",
+		rdp->cpu, rdp->completed, rdp->gpnum,
+		rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		rdp->qs_pending);
+#ifdef CONFIG_NO_HZ
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" dt=%d df=%lu", rdp->dynticks, rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	/* NOTE(review): "ri=" repeats offline_fqs; confirm intended counter. */
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" of=%lu ri=%lu ql=%ld b=%ld\n",
+		rdp->offline_fqs, rdp->offline_fqs, rdp->qlen, rdp->blimit);
+	return cnt;
+}
+
+#define PRINT_RCU_DATA(name, buf, ebuf) \
+	do { \
+		int _p_r_d_i; \
+		\
+		for_each_online_cpu(_p_r_d_i) \
+			(buf) += print_one_rcu_data(&per_cpu(name, _p_r_d_i), \
+						    buf, ebuf); \
+	} while (0)
+
+static ssize_t rcudata_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);	/* guards the shared buffer */
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");	/* stays below ebuf */
+	PRINT_RCU_DATA(rcu_data, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/* Format rsp's counters and nodes into [buf, ebuf); returns bytes stored. */
+static int print_one_rcu_state(struct rcu_state *rsp, char *buf, char *ebuf)
+{
+	int cnt, level = 0;	/* level: tree level of the previous node. */
+	struct rcu_node *rnp;
+
+	cnt = scnprintf(buf, ebuf - buf,	/* unlike snprintf, stops at ebuf */
+			"c=%ld g=%ld s=%d jfq=%ld nfqs=%lu nfqsng=%lu\n",
+			rsp->completed, rsp->gpnum, rsp->signaled,
+			(long)(rsp->jiffies_force_qs - jiffies),
+			rsp->n_force_qs, rsp->n_force_qs_ngp);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {	/* new level: start a new line */
+			cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+			level = rnp->level;
+		}
+		cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+				"%lx/%lx %d:%d ^%d    ",
+				rnp->qsmask, rnp->qsmaskinit,
+				rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+	return cnt;
+}
+
+static ssize_t rcuhier_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);	/* guards the shared buffer */
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");	/* stays below ebuf */
+	buf += print_one_rcu_state(&rcu_state, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	buf += print_one_rcu_state(&rcu_bh_state, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);	/* guards the shared buffer */
+	buf += scnprintf(buf, ebuf - buf, "rcu: completed=%ld  gpnum=%ld\n",
+			 rcu_state.completed, rcu_state.gpnum);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh: completed=%ld  gpnum=%ld\n",
+			 rcu_bh_state.completed, rcu_bh_state.gpnum);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.read = rcudata_read,
+};
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuhier_read,
+};
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,
+};
+
+static struct dentry *rcudir, *datadir, *hierdir, *gpdir;
+/* Create debugfs "rcu" dir and its trace files; returns 0 or -ENOMEM. */
+static int rcuclassic_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	/* debugfs_remove() ignores NULL, so files never created are skipped. */
+	debugfs_remove(datadir);
+	debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return -ENOMEM;	/* init convention: 0 or a negative errno */
+}
+
+static int __init rcuclassic_trace_init(void)
+{
+	int ret;
+
+	rcuclassic_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcuclassic_trace_buf)
+		return -ENOMEM;	/* init functions return 0 or -errno */
+	ret = rcuclassic_debugfs_init();
+	if (ret)
+		kfree(rcuclassic_trace_buf);	/* no debugfs files, no buffer */
+	return ret > 0 ? -ENOMEM : ret;	/* normalize a positive failure code */
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+	kfree(rcuclassic_trace_buf);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30  0:49   ` [PATCH, RFC, tip/core/rcu] v3 " Paul E. McKenney
@ 2008-08-30  9:33     ` Peter Zijlstra
  2008-08-30 14:10       ` Paul E. McKenney
  2008-08-30  9:58     ` Lai Jiangshan
                       ` (2 subsequent siblings)
  3 siblings, 1 reply; 94+ messages in thread
From: Peter Zijlstra @ 2008-08-30  9:33 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, Mathieu Desnoyers

On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:

> Some shortcomings:
> 
> o	Entering and leaving dynticks idle mode is a quiescent state,
> 	but the current patch doesn't take advantage of this (noted
> 	by Manfred).  It appears that it should be possible to make
> 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> 	out whether it is safe to tell RCU about the quiescent state --
> 	and also greatly simplify the code.

Already done and available in the -tip tree, courtesy of Mathieu.



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30  0:49   ` [PATCH, RFC, tip/core/rcu] v3 " Paul E. McKenney
  2008-08-30  9:33     ` Peter Zijlstra
@ 2008-08-30  9:58     ` Lai Jiangshan
  2008-08-30 13:32       ` Manfred Spraul
  2008-08-30 14:29       ` Paul E. McKenney
  2008-09-01  9:38     ` Andi Kleen
  2008-09-05 15:29     ` [PATCH, RFC] v4 " Paul E. McKenney
  3 siblings, 2 replies; 94+ messages in thread
From: Lai Jiangshan @ 2008-08-30  9:58 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, rostedt, peterz

I just did a quick review, so my comments are nothing but cleanup.

          Thanks, Lai.

Paul E. McKenney wrote:
> Hello!

> +rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
> +	__releases(rsp->rda[smp_processor_id()]->lock)
> +{
> +	unsigned long flags = iflg;
> +	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
> +	struct rcu_node *rnp = rcu_get_root(rsp);
> +	struct rcu_node *rnp_cur;
> +	struct rcu_node *rnp_end;
> +
> +	if (!cpu_needs_another_gp(rsp, rdp)) {
>  
>  		/*
> -		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
> -		 * Barrier  Otherwise it can cause tickless idle CPUs to be
> -		 * included in rcp->cpumask, which will extend graceperiods
> -		 * unnecessarily.
> +		 * Either there is no need to detect any more grace periods
> +		 * at the moment, or we are already in the process of
> +		 * detecting one.  Either way, we should not start a new
> +		 * RCU grace period, so drop the lock and return.
>  		 */
> -		smp_mb();
> -		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
> +		spin_unlock_irqrestore(&rnp->lock, flags);
> +		return;
> +	}
> +
> +	/* Advance to a new grace period and initialize state. */
> +
> +	rsp->gpnum++;
> +	rsp->signaled = RCU_SIGNAL_INIT;
> +	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
> +	record_gp_stall_check_time();
> +	dyntick_save_completed(rsp, rsp->completed - 1);
> +	note_new_gpnum(rsp, rdp);
> +
> +	/*
> +	 * Because we are first, we know that all our callbacks will
> +	 * be covered by this upcoming grace period, even the ones
> +	 * that were registered arbitrarily recently.
> +	 */
> +
> +	rcu_next_callbacks_are_ready(rdp);
> +	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
>  
> -		rcp->signaled = 0;
> +	/* Special-case the common single-level case. */
> +
> +	if (NUM_RCU_NODES == 1) {
> +		rnp->qsmask = rnp->qsmaskinit;

I tried a mask like qsmaskinit before.  The system deadlocked
when I onlined/offlined CPUs.
I did not track down the exact cause, because I thought of these two problems:

problem 1:
----race condition 1:
<cpu_down>
synchronize_rcu <called from offline handler in other subsystem>
rcu_offline_cpu


-----race condition 2:
rcu_online_cpu
synchronize_rcu <called from online handler in other subsystem>
<cpu_up>

In these two conditions, synchronize_rcu is blocked forever, because
synchronize_rcu has to wait for a CPU in rnp->qsmask, but this
CPU is not running.



problem 2:
We need to call rcu_offline_cpu() in these two cases in rcu_cpu_notify(),
since qsmaskinit has been changed by rcu_online_cpu():

	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:


> +static void
> +cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
>  {
>  	unsigned long flags;
> +	long mask;

long mask -> unsigned long mask


> +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
>  {
> -	if (list) {
> -		local_irq_disable();
> -		this_rdp->batch = batch;
> -		*this_rdp->nxttail[2] = list;
> -		this_rdp->nxttail[2] = tail;
> -		local_irq_enable();
> +	int i;
> +	unsigned long flags;
> +	long mask;

long mask -> unsigned long mask


> + * Queue an RCU callback for invocation after a grace period.
> + */
> +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
> +{
> +	unsigned long flags;
> +
> +	head->func = func;
> +	head->next = NULL;
> +	local_irq_save(flags);
> +	__call_rcu(head, &rcu_state, &__get_cpu_var(rcu_data));
> +	local_irq_restore(flags);
> +}

struct rcu_state has a field: struct rcu_data *rda[NR_CPUS]
so we can move these lines around __call_rcu into __call_rcu.

__call_rcu(struct rcu_head *head, struct rcu_state *rsp)
{
	local_irq_save(flags);
	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
.....
	local_irq_restore(flags);
}


> +static void
> +rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
>  {
> -	if (user ||
> -	    (idle_cpu(cpu) && !in_softirq() &&
> -				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
> -
> -		/*
> -		 * Get here if this CPU took its interrupt from user
> -		 * mode or from the idle loop, and if this is not a
> -		 * nested interrupt.  In this case, the CPU is in
> -		 * a quiescent state, so count it.
> -		 *
> -		 * Also do a memory barrier.  This is needed to handle
> -		 * the case where writes from a preempt-disable section
> -		 * of code get reordered into schedule() by this CPU's
> -		 * write buffer.  The memory barrier makes sure that
> -		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
> -		 * by other CPUs to happen after any such write.
> -		 */
> +	unsigned long flags;
> +	int i;
> +	long mask;

long mask -> unsigned long mask


> +
> +/*
> + * Helper function for rcu_init() that initializes one rcu_state structure.
> + */
> +static void __init rcu_init_one(struct rcu_state *rsp)
> +{
> +	int i;
> +	int j;
> +	struct rcu_node *rnp;
> +
> +	/* Initialize the level-tracking arrays. */
> +
> +	for (i = 1; i < NUM_RCU_LVLS; i++) {
> +		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
> +	}
> +	rcu_init_levelspread(rsp);
> +
> +	/* Initialize the elements themselves, starting from the leaves. */
> +
> +	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
> +		rnp = rsp->level[i];
> +		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
> +			spin_lock_init(&rnp->lock);
> +			rnp->qsmask = 0;
> +			rnp->grplo = j * rsp->levelspread[i];
> +			rnp->grphi = (j + 1) * rsp->levelspread[i] - 1;
> +			if (rnp->grphi >= rsp->levelcnt[i + 1])
> +				rnp->grphi = rsp->levelcnt[i + 1] - 1;
> +			rnp->qsmaskinit = 0;

Unless there is a reason not to, I would initialize the fields in the order in which they are declared.

> +			if (i != NUM_RCU_LVLS - 1)
> +				rnp->grplo = rnp->grphi = 0;
> +			if (i == 0) {
> +				rnp->grpnum = 0;
> +				rnp->parent = NULL;
> +			} else {
> +				rnp->grpnum = j % rsp->levelspread[i - 1];
> +				rnp->parent = rsp->level[i - 1] + 
> +					      j / rsp->levelspread[i - 1];
> +			}
> +			rnp->level = i;
> +		}
> +	}
> +}
> +
> +/*
> + * Helper macro for rcu_init().  To be used nowhere else!

rcu_init -> __rcu_init

> + * Assigns leaf node pointers into each CPU's rcu_data structure.
> + */
> +#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
> +do { \
> +	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
> +	j = 0; \
> +	for_each_possible_cpu(i) { \
> +		if (i > rnp[j].grphi) \
> +			j++; \
> +		per_cpu(rcu_data, i).mynode = &rnp[j]; \
> +		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
> +	} \
> +} while (0)
> +
>  static struct notifier_block __cpuinitdata rcu_nb = {
>  	.notifier_call	= rcu_cpu_notify,
>  };
>  


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30  9:58     ` Lai Jiangshan
@ 2008-08-30 13:32       ` Manfred Spraul
  2008-08-30 14:34         ` Paul E. McKenney
  2008-08-30 14:29       ` Paul E. McKenney
  1 sibling, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-08-30 13:32 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: paulmck, linux-kernel, cl, mingo, akpm, dipankar, josht, schamp,
	niv, dvhltc, ego, rostedt, peterz

Lai Jiangshan wrote:
> I just had a fast review. so my comments is nothing but cleanup.
>
>           Thanks, Lai.
>
> Paul E. McKenney wrote:
>   
>> Hello!
>>     
>
>   
>> +rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
>> +	__releases(rsp->rda[smp_processor_id()]->lock)
>> +{
>> +	unsigned long flags = iflg;
>> +	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
>> +	struct rcu_node *rnp = rcu_get_root(rsp);
>> +	struct rcu_node *rnp_cur;
>> +	struct rcu_node *rnp_end;
>> +
>> +	if (!cpu_needs_another_gp(rsp, rdp)) {
>>  
>>  		/*
>> -		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
>> -		 * Barrier  Otherwise it can cause tickless idle CPUs to be
>> -		 * included in rcp->cpumask, which will extend graceperiods
>> -		 * unnecessarily.
>> +		 * Either there is no need to detect any more grace periods
>> +		 * at the moment, or we are already in the process of
>> +		 * detecting one.  Either way, we should not start a new
>> +		 * RCU grace period, so drop the lock and return.
>>  		 */
>> -		smp_mb();
>> -		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
>> +		spin_unlock_irqrestore(&rnp->lock, flags);
>> +		return;
>> +	}
>> +
>> +	/* Advance to a new grace period and initialize state. */
>> +
>> +	rsp->gpnum++;
>> +	rsp->signaled = RCU_SIGNAL_INIT;
>> +	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
>> +	record_gp_stall_check_time();
>> +	dyntick_save_completed(rsp, rsp->completed - 1);
>> +	note_new_gpnum(rsp, rdp);
>> +
>> +	/*
>> +	 * Because we are first, we know that all our callbacks will
>> +	 * be covered by this upcoming grace period, even the ones
>> +	 * that were registered arbitrarily recently.
>> +	 */
>> +
>> +	rcu_next_callbacks_are_ready(rdp);
>> +	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
>>  
>> -		rcp->signaled = 0;
>> +	/* Special-case the common single-level case. */
>> +
>> +	if (NUM_RCU_NODES == 1) {
>> +		rnp->qsmask = rnp->qsmaskinit;
>>     
>
> I tried a mask like qsmaskinit before. The system came to deadlock
> when I did on/offline cpus.
> I didn't find out the whys for I bethought of these two problem:
>
> problem 1:
> ----race condition 1:
> <cpu_down>
> synchronize_rcu <called from offline handler in other subsystem>
> rcu_offline_cpu
>
>
> -----race condition 2:
> rcu_online_cpu
> synchronize_rcu <called from online handler in other subsystem>
> <cpu_up>
>
> in these two condition, synchronize_rcu isblocked for ever for
> synchronize_rcu have to wait a cpu in rnp->qsmask, but this
> cpu don't run.
>
>   
Can we disallow synchronize_rcu() from the cpu notifiers? Are there any 
users that do a synchronize_rcu() from within the notifiers?
I don't see any other solution.
Something like qsmaskinit is needed - always enumerating all cpus just 
doesn't scale.

Perhaps it's possible to rely on CPU_DYING, but I haven't figured out 
yet how to handle read-side critical sections in CPU_DYING handlers.
Interrupts after CPU_DYING could be handled by rcu_irq_enter(), 
rcu_irq_exit() [yes, they exist on x86: the arch code enables the local 
interrupts in order to process the currently queued interrupts]

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30  9:33     ` Peter Zijlstra
@ 2008-08-30 14:10       ` Paul E. McKenney
  2008-08-30 15:40         ` Peter Zijlstra
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-30 14:10 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, Mathieu Desnoyers

On Sat, Aug 30, 2008 at 11:33:00AM +0200, Peter Zijlstra wrote:
> On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:
> 
> > Some shortcomings:
> > 
> > o	Entering and leaving dynticks idle mode is a quiescent state,
> > 	but the current patch doesn't take advantage of this (noted
> > 	by Manfred).  It appears that it should be possible to make
> > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > 	out whether it is safe to tell RCU about the quiescent state --
> > 	and also greatly simplify the code.
> 
> Already done and available in the -tip tree, curtesy of Mathieu.

Very cool!!!  I see one of his patches at http://lkml.org/lkml/2008/4/17/342,
but how do I find out which branch of -tip this is on?  (I am learning
git, but it is a slow process...)

This would also simplify preemptable RCU's dyntick interface, removing
the need for proofs.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30  9:58     ` Lai Jiangshan
  2008-08-30 13:32       ` Manfred Spraul
@ 2008-08-30 14:29       ` Paul E. McKenney
  1 sibling, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-30 14:29 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, rostedt, peterz

On Sat, Aug 30, 2008 at 05:58:26PM +0800, Lai Jiangshan wrote:
> I just had a fast review. so my comments is nothing but cleanup.

Thank you for looking it over!!!

>           Thanks, Lai.
> 
> Paul E. McKenney wrote:
> > Hello!
> 
> > +rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
> > +	__releases(rsp->rda[smp_processor_id()]->lock)
> > +{
> > +	unsigned long flags = iflg;
> > +	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
> > +	struct rcu_node *rnp = rcu_get_root(rsp);
> > +	struct rcu_node *rnp_cur;
> > +	struct rcu_node *rnp_end;
> > +
> > +	if (!cpu_needs_another_gp(rsp, rdp)) {
> >  
> >  		/*
> > -		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
> > -		 * Barrier  Otherwise it can cause tickless idle CPUs to be
> > -		 * included in rcp->cpumask, which will extend graceperiods
> > -		 * unnecessarily.
> > +		 * Either there is no need to detect any more grace periods
> > +		 * at the moment, or we are already in the process of
> > +		 * detecting one.  Either way, we should not start a new
> > +		 * RCU grace period, so drop the lock and return.
> >  		 */
> > -		smp_mb();
> > -		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
> > +		spin_unlock_irqrestore(&rnp->lock, flags);
> > +		return;
> > +	}
> > +
> > +	/* Advance to a new grace period and initialize state. */
> > +
> > +	rsp->gpnum++;
> > +	rsp->signaled = RCU_SIGNAL_INIT;
> > +	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
> > +	record_gp_stall_check_time();
> > +	dyntick_save_completed(rsp, rsp->completed - 1);
> > +	note_new_gpnum(rsp, rdp);
> > +
> > +	/*
> > +	 * Because we are first, we know that all our callbacks will
> > +	 * be covered by this upcoming grace period, even the ones
> > +	 * that were registered arbitrarily recently.
> > +	 */
> > +
> > +	rcu_next_callbacks_are_ready(rdp);
> > +	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
> >  
> > -		rcp->signaled = 0;
> > +	/* Special-case the common single-level case. */
> > +
> > +	if (NUM_RCU_NODES == 1) {
> > +		rnp->qsmask = rnp->qsmaskinit;
> 
> I tried a mask like qsmaskinit before. The system came to deadlock
> when I did on/offline cpus.

And I did need to address this.

> I didn't find out the whys for I bethought of these two problem:
> 
> problem 1:
> ----race condition 1:
> <cpu_down>
> synchronize_rcu <called from offline handler in other subsystem>
> rcu_offline_cpu
> 
> 
> -----race condition 2:
> rcu_online_cpu
> synchronize_rcu <called from online handler in other subsystem>
> <cpu_up>
> 
> in these two condition, synchronize_rcu isblocked for ever for
> synchronize_rcu have to wait a cpu in rnp->qsmask, but this
> cpu don't run.

First, only one of these race conditions can happen at a time, since
only one online/offline action can be happening at a time.

What I did to solve it was to make force_quiescent_state() check
to see if the CPU currently blocking the grace period is offline.
(The actual checking for offline is in rcu_implicit_offline_qs(),
which is called indirectly from force_quiescent_state().)

So when this race occurs, it is taken care of within three jiffies.
This happened -many- times during my most recent test ("of=" in the
rcudata trace).

> problem 2:
> we need call rcu_offline_cpu() in these two cases in rcu_cpu_notify()
> since qsmaskinit had changed by rcu_online_cpu()
> 
> 	case CPU_UP_CANCELED:
> 	case CPU_UP_CANCELED_FROZEN:

Good catch!!!  Fixed.  The current code would work in this case, but grace
periods would be unnecessarily extended until force_quiescent_state()
got a chance to clean things up.  So very good to fix this one.

> > +static void
> > +cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
> >  {
> >  	unsigned long flags;
> > +	long mask;
> 
> long mask -> unsigned long mask

Good eyes!  Fixed.

> > +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
> >  {
> > -	if (list) {
> > -		local_irq_disable();
> > -		this_rdp->batch = batch;
> > -		*this_rdp->nxttail[2] = list;
> > -		this_rdp->nxttail[2] = tail;
> > -		local_irq_enable();
> > +	int i;
> > +	unsigned long flags;
> > +	long mask;
> 
> long mask -> unsigned long mask

Here too!

> > + * Queue an RCU callback for invocation after a grace period.
> > + */
> > +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
> > +{
> > +	unsigned long flags;
> > +
> > +	head->func = func;
> > +	head->next = NULL;
> > +	local_irq_save(flags);
> > +	__call_rcu(head, &rcu_state, &__get_cpu_var(rcu_data));
> > +	local_irq_restore(flags);
> > +}
> 
> struct rcu_state has a field: struct rcu_data *rda[NR_CPUS]
> so we can move these lines around __call_rcu into __call_rcu.
> 
> __call_rcu(struct rcu_head *head, struct rcu_state *rsp)
> {
> 	local_irq_save(flags);
> 	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
> .....
> 	local_irq_save(flags);
> }

Very good point!!!  And then call_rcu() and call_rcu_bh() become
one-liners.  ;-)

> > +static void
> > +rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
> >  {
> > -	if (user ||
> > -	    (idle_cpu(cpu) && !in_softirq() &&
> > -				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
> > -
> > -		/*
> > -		 * Get here if this CPU took its interrupt from user
> > -		 * mode or from the idle loop, and if this is not a
> > -		 * nested interrupt.  In this case, the CPU is in
> > -		 * a quiescent state, so count it.
> > -		 *
> > -		 * Also do a memory barrier.  This is needed to handle
> > -		 * the case where writes from a preempt-disable section
> > -		 * of code get reordered into schedule() by this CPU's
> > -		 * write buffer.  The memory barrier makes sure that
> > -		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
> > -		 * by other CPUs to happen after any such write.
> > -		 */
> > +	unsigned long flags;
> > +	int i;
> > +	long mask;
> 
> long mask -> unsigned long mask

And again!  ;-)  Very good eyes!!!

> > +
> > +/*
> > + * Helper function for rcu_init() that initializes one rcu_state structure.
> > + */
> > +static void __init rcu_init_one(struct rcu_state *rsp)
> > +{
> > +	int i;
> > +	int j;
> > +	struct rcu_node *rnp;
> > +
> > +	/* Initialize the level-tracking arrays. */
> > +
> > +	for (i = 1; i < NUM_RCU_LVLS; i++) {
> > +		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
> > +	}
> > +	rcu_init_levelspread(rsp);
> > +
> > +	/* Initialize the elements themselves, starting from the leaves. */
> > +
> > +	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
> > +		rnp = rsp->level[i];
> > +		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
> > +			spin_lock_init(&rnp->lock);
> > +			rnp->qsmask = 0;
> > +			rnp->grplo = j * rsp->levelspread[i];
> > +			rnp->grphi = (j + 1) * rsp->levelspread[i] - 1;
> > +			if (rnp->grphi >= rsp->levelcnt[i + 1])
> > +				rnp->grphi = rsp->levelcnt[i + 1] - 1;
> > +			rnp->qsmaskinit = 0;
> 
> if no other reason, I will init fields with the order as they are declared.

Good point, moved.

> > +			if (i != NUM_RCU_LVLS - 1)
> > +				rnp->grplo = rnp->grphi = 0;
> > +			if (i == 0) {
> > +				rnp->grpnum = 0;
> > +				rnp->parent = NULL;
> > +			} else {
> > +				rnp->grpnum = j % rsp->levelspread[i - 1];
> > +				rnp->parent = rsp->level[i - 1] + 
> > +					      j / rsp->levelspread[i - 1];
> > +			}
> > +			rnp->level = i;
> > +		}
> > +	}
> > +}
> > +
> > +/*
> > + * Helper macro for rcu_init().  To be used nowhere else!
> 
> rcu_init -> __rcu_init

Good catch, fixed.

> > + * Assigns leaf node pointers into each CPU's rcu_data structure.
> > + */
> > +#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
> > +do { \
> > +	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
> > +	j = 0; \
> > +	for_each_possible_cpu(i) { \
> > +		if (i > rnp[j].grphi) \
> > +			j++; \
> > +		per_cpu(rcu_data, i).mynode = &rnp[j]; \
> > +		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
> > +	} \
> > +} while (0)
> > +
> >  static struct notifier_block __cpuinitdata rcu_nb = {
> >  	.notifier_call	= rcu_cpu_notify,
> >  };
> >  
> 

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30 13:32       ` Manfred Spraul
@ 2008-08-30 14:34         ` Paul E. McKenney
  2008-08-31 10:58           ` Manfred Spraul
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-30 14:34 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz

On Sat, Aug 30, 2008 at 03:32:36PM +0200, Manfred Spraul wrote:
> Lai Jiangshan wrote:
>> I just had a fast review. so my comments is nothing but cleanup.
>>
>>           Thanks, Lai.
>>
>> Paul E. McKenney wrote:
>>   
>>> Hello!
>>>     
>>
>>   
>>> +rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
>>> +	__releases(rsp->rda[smp_processor_id()]->lock)
>>> +{
>>> +	unsigned long flags = iflg;
>>> +	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
>>> +	struct rcu_node *rnp = rcu_get_root(rsp);
>>> +	struct rcu_node *rnp_cur;
>>> +	struct rcu_node *rnp_end;
>>> +
>>> +	if (!cpu_needs_another_gp(rsp, rdp)) {
>>>   		/*
>>> -		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
>>> -		 * Barrier  Otherwise it can cause tickless idle CPUs to be
>>> -		 * included in rcp->cpumask, which will extend graceperiods
>>> -		 * unnecessarily.
>>> +		 * Either there is no need to detect any more grace periods
>>> +		 * at the moment, or we are already in the process of
>>> +		 * detecting one.  Either way, we should not start a new
>>> +		 * RCU grace period, so drop the lock and return.
>>>  		 */
>>> -		smp_mb();
>>> -		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
>>> +		spin_unlock_irqrestore(&rnp->lock, flags);
>>> +		return;
>>> +	}
>>> +
>>> +	/* Advance to a new grace period and initialize state. */
>>> +
>>> +	rsp->gpnum++;
>>> +	rsp->signaled = RCU_SIGNAL_INIT;
>>> +	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
>>> +	record_gp_stall_check_time();
>>> +	dyntick_save_completed(rsp, rsp->completed - 1);
>>> +	note_new_gpnum(rsp, rdp);
>>> +
>>> +	/*
>>> +	 * Because we are first, we know that all our callbacks will
>>> +	 * be covered by this upcoming grace period, even the ones
>>> +	 * that were registered arbitrarily recently.
>>> +	 */
>>> +
>>> +	rcu_next_callbacks_are_ready(rdp);
>>> +	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
>>>  -		rcp->signaled = 0;
>>> +	/* Special-case the common single-level case. */
>>> +
>>> +	if (NUM_RCU_NODES == 1) {
>>> +		rnp->qsmask = rnp->qsmaskinit;
>>>     
>>
>> I tried a mask like qsmaskinit before. The system came to deadlock
>> when I did on/offline cpus.
>> I didn't find out the whys for I bethought of these two problem:
>>
>> problem 1:
>> ----race condition 1:
>> <cpu_down>
>> synchronize_rcu <called from offline handler in other subsystem>
>> rcu_offline_cpu
>>
>>
>> -----race condition 2:
>> rcu_online_cpu
>> synchronize_rcu <called from online handler in other subsystem>
>> <cpu_up>
>>
>> in these two condition, synchronize_rcu isblocked for ever for
>> synchronize_rcu have to wait a cpu in rnp->qsmask, but this
>> cpu don't run.
>>
>>   
> Can we disallow synchronize_rcu() from the cpu notifiers? Are there any 
> users that do a synchronize_rcu() from within the notifiers?
> I don't see any other solution.

I made force_quiescent_state() check for offline CPUs.  (Well, actually
it is rcu_implicit_offline_qs(), which is indirectly called from
force_quiescent_state().)

> Something like qsmaskinit is needed - always enumerating all cpus just 
> doesn't scale.

Agreed!!!

> Perhaps it's possible to rely on CPU_DYING, but I haven't figured out yet 
> how to handle read-side critical sections in CPU_DYING handlers.
> Interrupts after CPU_DYING could be handled by rcu_irq_enter(), 
> rcu_irq_exit() [yes, they exist on x86: the arch code enables the local 
> interrupts in order to process the currently queued interrupts]

My feeling is that CPU online/offline will be quite rare, so it should
be OK to clean up after the races in force_quiescent_state(), which in
this version is called every three ticks in a given grace period.

Yes, I did worry about the possibility of all CPUs being in dyntick-idle
mode, and the solution for that is (1) don't let a CPU that has RCU
callbacks pending go into dyntick-idle mode via rcu_needs_cpu() and
(2) don't let a grace period start unless there is at least one callback
that is not yet in the done state.  But no, I am not certain that I have
gotten this completely correct yet.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30 14:10       ` Paul E. McKenney
@ 2008-08-30 15:40         ` Peter Zijlstra
  2008-08-30 19:38           ` Paul E. McKenney
  2008-09-02 13:26           ` Mathieu Desnoyers
  0 siblings, 2 replies; 94+ messages in thread
From: Peter Zijlstra @ 2008-08-30 15:40 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, Mathieu Desnoyers

On Sat, 2008-08-30 at 07:10 -0700, Paul E. McKenney wrote:
> On Sat, Aug 30, 2008 at 11:33:00AM +0200, Peter Zijlstra wrote:
> > On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:
> > 
> > > Some shortcomings:
> > > 
> > > o	Entering and leaving dynticks idle mode is a quiescent state,
> > > 	but the current patch doesn't take advantage of this (noted
> > > 	by Manfred).  It appears that it should be possible to make
> > > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > > 	out whether it is safe to tell RCU about the quiescent state --
> > > 	and also greatly simplify the code.
> > 
> > Already done and available in the -tip tree, curtesy of Mathieu.
> 
> Very cool!!!  I see one of his patches at http://lkml.org/lkml/2008/4/17/342,
> but how do I find out which branch of -tip this is on?  (I am learning
> git, but it is a slow process...)
> 
> This would also simplify preemptable RCU's dyntick interface, removing
> the need for proofs.

Not sure - my git-foo isn't good enough either :-(

All I can offer is that it's available in tip/master (the collective
merge of all of tip's branches) as commit:
0d84b78a606f1562532cd576ee8733caf5a4aed3, which I found using
git-annotate include/linux/hardirq.h

As for which particular topic branch it came from, I too am
clueless.

---
commit 0d84b78a606f1562532cd576ee8733caf5a4aed3
Author: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date:   Mon May 12 21:21:07 2008 +0200

    x86 NMI-safe INT3 and Page Fault
    
    Implements an alternative iret with popf and return so trap and exception
    handlers can return to the NMI handler without issuing iret. iret would cause
    NMIs to be reenabled prematurely. x86_32 uses popf and far return. x86_64 has to
    copy the return instruction pointer to the top of the previous stack, issue a
    popf, loads the previous esp and issue a near return (ret).
    
    It allows placing immediate values (and therefore optimized trace_marks) in NMI
    code since returning from a breakpoint would be valid. Accessing vmalloc'd
    memory, which allows executing module code or accessing vmapped or vmalloc'd
    areas from NMI context, would also be valid. This is very useful to tracers like
    LTTng.
    
    This patch makes all faults, traps and exception safe to be called from NMI
    context *except* single-stepping, which requires iret to restore the TF (trap
    flag) and jump to the return address in a single instruction. Sorry, no kprobes
    support in NMI handlers because of this limitation.  We cannot single-step an
    NMI handler, because iret must set the TF flag and return back to the
    instruction to single-step in a single instruction. This cannot be emulated with
    popf/lret, because lret would be single-stepped. It does not apply to immediate
    values because they do not use single-stepping. This code detects if the TF
    flag is set and uses the iret path for single-stepping, even if it reactivates
    NMIs prematurely.
    
    Test to detect if nested under a NMI handler is only done upon the return from
    trap/exception to kernel, which is not frequent. Other return paths (return from
    trap/exception to userspace, return from interrupt) keep the exact same behavior
    (no slowdown).
    
    Depends on :
    change-alpha-active-count-bit.patch
    change-avr32-active-count-bit.patch
    
    TODO : test with lguest, xen, kvm.
    
    ** This patch depends on the "Stringify support commas" patchset **
    ** Also depends on fix-x86_64-page-fault-scheduler-race patch **
    
    tested on x86_32 (tests implemented in a separate patch) :
    - instrumented the return path to export the EIP, CS and EFLAGS values when
      taken so we know the return path code has been executed.
    - trace_mark, using immediate values, with 10ms delay with the breakpoint
      activated. Runs well through the return path.
    - tested vmalloc faults in NMI handler by placing a non-optimized marker in the
      NMI handler (so no breakpoint is executed) and connecting a probe which
      touches every page of a 20MB vmalloc'd buffer. It executes through the return
      path without problem.
    - Tested with and without preemption
    
    tested on x86_64
    - instrumented the return path to export the EIP, CS and EFLAGS values when
      taken so we know the return path code has been executed.
    - trace_mark, using immediate values, with 10ms delay with the breakpoint
      activated. Runs well through the return path.
    
    To test on x86_64 :
    - Test without preemption
    - Test vmalloc faults
    - Test on Intel 64 bits CPUs. (AMD64 was fine)
    
    Changelog since v1 :
    - x86_64 fixes.
    Changelog since v2 :
    - fix paravirt build
    Changelog since v3 :
    - Include modifications suggested by Jeremy
    Changelog since v4 :
    - including hardirq.h in entry_32/64.S is a bad idea (non ifndef'd C code),
      define HARDNMI_MASK in the .S files directly.
    Changelog since v5 :
    - Add HARDNMI_MASK to irq_count() and make die() more verbose for NMIs.
    Changelog since v7 :
    - Implement paravirtualized nmi_return.
    Changelog since v8 :
    - refreshed the patch for asm-offsets. Those were left out of v8.
    - now depends on "Stringify support commas" patch.
    Changelog since v9 :
    - Only test the nmi nested preempt count flag upon return from exceptions, not
      on return from interrupts. Only the kernel return path has this test.
    - Add Xen, VMI, lguest support. Use their iret paravirt ops in lieu of
      nmi_return.
    
    -- Ported to sched-devel.git
    
    Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
    CC: akpm@osdl.org
    CC: mingo@elte.hu
    CC: "H. Peter Anvin" <hpa@zytor.com>
    CC: Jeremy Fitzhardinge <jeremy@goop.org>
    CC: Steven Rostedt <rostedt@goodmis.org>
    CC: "Frank Ch. Eigler" <fche@redhat.com>
    Signed-off-by: Ingo Molnar <mingo@elte.hu>
    Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 9258808..73474e0 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -111,6 +111,7 @@ void foo(void)
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+	OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
 	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
 	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
 #endif
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index f126c05..a5bbec3 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -62,6 +62,7 @@ int main(void)
 	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
 	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
 	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
+	OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
 	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
 	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
 	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index e6517ce..2d88211 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -68,6 +68,8 @@
 
 #define nr_syscalls ((syscall_table_size)/4)
 
+#define HARDNMI_MASK 0x40000000
+
 #ifdef CONFIG_PREEMPT
 #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
 #else
@@ -232,8 +234,32 @@ END(ret_from_fork)
 	# userspace resumption stub bypassing syscall exit tracing
 	ALIGN
 	RING0_PTREGS_FRAME
+
 ret_from_exception:
 	preempt_stop(CLBR_ANY)
+	GET_THREAD_INFO(%ebp)
+	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
+	movb PT_CS(%esp), %al
+	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+	cmpl $USER_RPL, %eax
+	jae resume_userspace	# returning to v8086 or userspace
+	testl $HARDNMI_MASK,TI_preempt_count(%ebp)
+	jz resume_kernel		/* Not nested over NMI ? */
+	testw $X86_EFLAGS_TF, PT_EFLAGS(%esp)
+	jnz resume_kernel		/*
+					 * If single-stepping an NMI handler,
+					 * use the normal iret path instead of
+					 * the popf/lret because lret would be
+					 * single-stepped. It should not
+					 * happen : it will reactivate NMIs
+					 * prematurely.
+					 */
+	TRACE_IRQS_IRET
+	RESTORE_REGS
+	addl $4, %esp			# skip orig_eax/error_code
+	CFI_ADJUST_CFA_OFFSET -4
+	INTERRUPT_RETURN_NMI_SAFE
+
 ret_from_intr:
 	GET_THREAD_INFO(%ebp)
 check_userspace:
@@ -873,6 +899,10 @@ ENTRY(native_iret)
 .previous
 END(native_iret)
 
+ENTRY(native_nmi_return)
+	NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ?
+END(native_nmi_return)
+
 ENTRY(native_irq_enable_syscall_ret)
 	sti
 	sysexit
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index fe25e5f..5f8edc7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -156,6 +156,8 @@ END(mcount)
 #endif /* CONFIG_DYNAMIC_FTRACE */
 #endif /* CONFIG_FTRACE */
 
+#define HARDNMI_MASK 0x40000000
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif	
@@ -698,6 +700,9 @@ ENTRY(native_iret)
 	.section __ex_table,"a"
 	.quad native_iret, bad_iret
 	.previous
+
+ENTRY(native_nmi_return)
+	NATIVE_INTERRUPT_RETURN_NMI_SAFE
 #endif
 
 	.section .fixup,"ax"
@@ -753,6 +758,23 @@ retint_signal:
 	GET_THREAD_INFO(%rcx)
 	jmp retint_check
 
+	/* Returning to kernel space from exception. */
+	/* rcx:	 threadinfo. interrupts off. */
+ENTRY(retexc_kernel)
+	testl $HARDNMI_MASK,threadinfo_preempt_count(%rcx)
+	jz retint_kernel		/* Not nested over NMI ? */
+	testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp)	/* trap flag? */
+	jnz retint_kernel		/*
+					 * If single-stepping an NMI handler,
+					 * use the normal iret path instead of
+					 * the popf/lret because lret would be
+					 * single-stepped. It should not
+					 * happen : it will reactivate NMIs
+					 * prematurely.
+					 */
+	RESTORE_ARGS 0,8,0
+	INTERRUPT_RETURN_NMI_SAFE
+
 #ifdef CONFIG_PREEMPT
 	/* Returning to kernel space. Check if we need preemption */
 	/* rcx:	 threadinfo. interrupts off. */
@@ -911,9 +933,17 @@ paranoid_swapgs\trace:
 	TRACE_IRQS_IRETQ 0
 	.endif
 	SWAPGS_UNSAFE_STACK
-paranoid_restore\trace:
+paranoid_restore_no_nmi\trace:
 	RESTORE_ALL 8
 	jmp irq_return
+paranoid_restore\trace:
+	GET_THREAD_INFO(%rcx)
+	testl $HARDNMI_MASK,threadinfo_preempt_count(%rcx)
+	jz paranoid_restore_no_nmi\trace	/* Nested over NMI ? */
+	testw $X86_EFLAGS_TF,EFLAGS-0(%rsp)	/* trap flag? */
+	jnz paranoid_restore_no_nmi\trace
+	RESTORE_ALL 8
+	INTERRUPT_RETURN_NMI_SAFE
 paranoid_userspace\trace:
 	GET_THREAD_INFO(%rcx)
 	movl threadinfo_flags(%rcx),%ebx
@@ -1012,7 +1042,7 @@ error_exit:
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)	
 	testl %eax,%eax
-	jne  retint_kernel
+	jne  retexc_kernel
 	LOCKDEP_SYS_EXIT_IRQ
 	movl  threadinfo_flags(%rcx),%edx
 	movl  $_TIF_WORK_MASK,%edi
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 74f0c5e..bb174a8 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -139,6 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
 		/* If the operation is a nop, then nop the callsite */
 		ret = paravirt_patch_nop();
 	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
+		 type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) ||
 		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
 		/* If operation requires a jmp, then jmp */
 		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
@@ -190,6 +191,7 @@ static void native_flush_tlb_single(unsigned long addr)
 
 /* These are in entry.S */
 extern void native_iret(void);
+extern void native_nmi_return(void);
 extern void native_irq_enable_syscall_ret(void);
 
 static int __init print_banner(void)
@@ -328,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
 
 	.irq_enable_syscall_ret = native_irq_enable_syscall_ret,
 	.iret = native_iret,
+	.nmi_return = native_nmi_return,
 	.swapgs = native_swapgs,
 
 	.set_iopl_mask = native_set_iopl_mask,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 82fc5fc..8ed31c7 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -1,10 +1,13 @@
-#include <asm/paravirt.h>
+#include <linux/stringify.h>
+#include <linux/irqflags.h>
 
 DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
 DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
 DEF_NATIVE(pv_cpu_ops, iret, "iret");
+DEF_NATIVE(pv_cpu_ops, nmi_return,
+	__stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
 DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
@@ -29,6 +32,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, restore_fl);
 		PATCH_SITE(pv_irq_ops, save_fl);
 		PATCH_SITE(pv_cpu_ops, iret);
+		PATCH_SITE(pv_cpu_ops, nmi_return);
 		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
 		PATCH_SITE(pv_mmu_ops, read_cr2);
 		PATCH_SITE(pv_mmu_ops, read_cr3);
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 7d904e1..56eccea 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -1,12 +1,15 @@
+#include <linux/irqflags.h>
+#include <linux/stringify.h>
 #include <asm/paravirt.h>
 #include <asm/asm-offsets.h>
-#include <linux/stringify.h>
 
 DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
 DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
 DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_cpu_ops, iret, "iretq");
+DEF_NATIVE(pv_cpu_ops, nmi_return,
+	__stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
@@ -35,6 +38,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_irq_ops, irq_enable);
 		PATCH_SITE(pv_irq_ops, irq_disable);
 		PATCH_SITE(pv_cpu_ops, iret);
+		PATCH_SITE(pv_cpu_ops, nmi_return);
 		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
 		PATCH_SITE(pv_cpu_ops, swapgs);
 		PATCH_SITE(pv_mmu_ops, read_cr2);
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
index bde6f63..f3a59cd 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -475,6 +475,9 @@ void die(const char *str, struct pt_regs *regs, long err)
 	if (kexec_should_crash(current))
 		crash_kexec(regs);
 
+	if (in_nmi())
+		panic("Fatal exception in non-maskable interrupt");
+
 	if (in_interrupt())
 		panic("Fatal exception in interrupt");
 
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
index adff76e..3dacb75 100644
--- a/arch/x86/kernel/traps_64.c
+++ b/arch/x86/kernel/traps_64.c
@@ -555,6 +555,10 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
 		oops_exit();
 		return;
 	}
+	if (in_nmi())
+		panic("Fatal exception in non-maskable interrupt");
+	if (in_interrupt())
+		panic("Fatal exception in interrupt");
 	if (panic_on_oops)
 		panic("Fatal exception");
 	oops_exit();
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 956f389..01d687d 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -151,6 +151,8 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
 					      insns, ip);
 		case PARAVIRT_PATCH(pv_cpu_ops.iret):
 			return patch_internal(VMI_CALL_IRET, len, insns, ip);
+		case PARAVIRT_PATCH(pv_cpu_ops.nmi_return):
+			return patch_internal(VMI_CALL_IRET, len, insns, ip);
 		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
 			return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
 		default:
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index af65b2d..f5cbb74 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -958,6 +958,7 @@ __init void lguest_init(void)
 	pv_cpu_ops.cpuid = lguest_cpuid;
 	pv_cpu_ops.load_idt = lguest_load_idt;
 	pv_cpu_ops.iret = lguest_iret;
+	pv_cpu_ops.nmi_return = lguest_iret;
 	pv_cpu_ops.load_sp0 = lguest_load_sp0;
 	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
 	pv_cpu_ops.set_ldt = lguest_set_ldt;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c8a56e4..33272ce 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1008,6 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.read_pmc = native_read_pmc,
 
 	.iret = xen_iret,
+	.nmi_return = xen_iret,
 	.irq_enable_syscall_ret = xen_sysexit,
 
 	.load_tr_desc = paravirt_nop,
diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
index 24d71b1..c3009fd 100644
--- a/include/asm-x86/irqflags.h
+++ b/include/asm-x86/irqflags.h
@@ -51,6 +51,61 @@ static inline void native_halt(void)
 
 #endif
 
+#ifdef CONFIG_X86_64
+/*
+ * Only returns from a trap or exception to a NMI context (intra-privilege
+ * level near return) to the same SS and CS segments. Should be used
+ * upon trap or exception return when nested over a NMI context so no iret is
+ * issued. It takes care of modifying the eflags, rsp and returning to the
+ * previous function.
+ *
+ * The stack, at that point, looks like :
+ *
+ * 0(rsp)  RIP
+ * 8(rsp)  CS
+ * 16(rsp) EFLAGS
+ * 24(rsp) RSP
+ * 32(rsp) SS
+ *
+ * Upon execution :
+ * Copy EIP to the top of the return stack
+ * Update top of return stack address
+ * Pop eflags into the eflags register
+ * Make the return stack current
+ * Near return (popping the return address from the return stack)
+ */
+#define NATIVE_INTERRUPT_RETURN_NMI_SAFE	pushq %rax;		\
+						movq %rsp, %rax;	\
+						movq 24+8(%rax), %rsp;	\
+						pushq 0+8(%rax);	\
+						pushq 16+8(%rax);	\
+						movq (%rax), %rax;	\
+						popfq;			\
+						ret
+#else
+/*
+ * Protected mode only, no V8086. Implies that protected mode must
+ * be entered before NMIs or MCEs are enabled. Only returns from a trap or
+ * exception to a NMI context (intra-privilege level far return). Should be used
+ * upon trap or exception return when nested over a NMI context so no iret is
+ * issued.
+ *
+ * The stack, at that point, looks like :
+ *
+ * 0(esp) EIP
+ * 4(esp) CS
+ * 8(esp) EFLAGS
+ *
+ * Upon execution :
+ * Copy the stack eflags to top of stack
+ * Pop eflags into the eflags register
+ * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
+ */
+#define NATIVE_INTERRUPT_RETURN_NMI_SAFE	pushl 8(%esp);	\
+						popfl;		\
+						lret $4
+#endif
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -109,6 +164,7 @@ static inline unsigned long __raw_local_irq_save(void)
 
 #define ENABLE_INTERRUPTS(x)	sti
 #define DISABLE_INTERRUPTS(x)	cli
+#define INTERRUPT_RETURN_NMI_SAFE	NATIVE_INTERRUPT_RETURN_NMI_SAFE
 
 #ifdef CONFIG_X86_64
 #define INTERRUPT_RETURN	iretq
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
index 0f13b94..d5087e0 100644
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -141,9 +141,10 @@ struct pv_cpu_ops {
 	u64 (*read_pmc)(int counter);
 	unsigned long long (*read_tscp)(unsigned int *aux);
 
-	/* These two are jmp to, not actually called. */
+	/* These three are jmp to, not actually called. */
 	void (*irq_enable_syscall_ret)(void);
 	void (*iret)(void);
+	void (*nmi_return)(void);
 
 	void (*swapgs)(void);
 
@@ -1385,6 +1386,10 @@ static inline unsigned long __raw_local_irq_save(void)
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
 		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)
 
+#define INTERRUPT_RETURN_NMI_SAFE					\
+	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE,	\
+		  jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return)
+
 #define DISABLE_INTERRUPTS(clobbers)					\
 	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
 		  PV_SAVE_REGS;			\
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..b39f49d 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -22,10 +22,13 @@
  * PREEMPT_MASK: 0x000000ff
  * SOFTIRQ_MASK: 0x0000ff00
  * HARDIRQ_MASK: 0x0fff0000
+ * HARDNMI_MASK: 0x40000000
  */
 #define PREEMPT_BITS	8
 #define SOFTIRQ_BITS	8
 
+#define HARDNMI_BITS	1
+
 #ifndef HARDIRQ_BITS
 #define HARDIRQ_BITS	12
 
@@ -45,16 +48,19 @@
 #define PREEMPT_SHIFT	0
 #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
 #define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define HARDNMI_SHIFT	(30)
 
 #define __IRQ_MASK(x)	((1UL << (x))-1)
 
 #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
 #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
 #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+#define HARDNMI_MASK	(__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
 
 #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
 #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
 #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
+#define HARDNMI_OFFSET	(1UL << HARDNMI_SHIFT)
 
 #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
 #error PREEMPT_ACTIVE is too low!
@@ -62,7 +68,9 @@
 
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
-#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+#define irq_count() \
+	(preempt_count() & (HARDNMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
+#define hardnmi_count()	(preempt_count() & HARDNMI_MASK)
 
 /*
  * Are we doing bottom half or hardware interrupt processing?
@@ -71,6 +79,7 @@
 #define in_irq()		(hardirq_count())
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
+#define in_nmi()		(hardnmi_count())
 
 #if defined(CONFIG_PREEMPT)
 # define PREEMPT_INATOMIC_BASE kernel_locked()
@@ -161,7 +170,19 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()					\
+	do {						\
+		lockdep_off();				\
+		BUG_ON(hardnmi_count());		\
+		add_preempt_count(HARDNMI_OFFSET);	\
+		__irq_enter();				\
+	} while (0)
+
+#define nmi_exit()					\
+	do {						\
+		__irq_exit();				\
+		sub_preempt_count(HARDNMI_OFFSET);	\
+		lockdep_on();				\
+	} while (0)
 
 #endif /* LINUX_HARDIRQ_H */



^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30 15:40         ` Peter Zijlstra
@ 2008-08-30 19:38           ` Paul E. McKenney
  2008-09-02 13:26           ` Mathieu Desnoyers
  1 sibling, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-30 19:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, Mathieu Desnoyers

On Sat, Aug 30, 2008 at 05:40:58PM +0200, Peter Zijlstra wrote:
> On Sat, 2008-08-30 at 07:10 -0700, Paul E. McKenney wrote:
> > On Sat, Aug 30, 2008 at 11:33:00AM +0200, Peter Zijlstra wrote:
> > > On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:
> > > 
> > > > Some shortcomings:
> > > > 
> > > > o	Entering and leaving dynticks idle mode is a quiescent state,
> > > > 	but the current patch doesn't take advantage of this (noted
> > > > 	by Manfred).  It appears that it should be possible to make
> > > > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > > > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > > > 	out whether it is safe to tell RCU about the quiescent state --
> > > > 	and also greatly simplify the code.
> > > 
> > > Already done and available in the -tip tree, courtesy of Mathieu.
> > 
> > Very cool!!!  I see one of his patches at http://lkml.org/lkml/2008/4/17/342,
> > but how do I find out which branch of -tip this is on?  (I am learning
> > git, but it is a slow process...)
> > 
> > This would also simplify preemptable RCU's dyntick interface, removing
> > the need for proofs.
> 
> Not sure - my git-foo isn't good enough either :-(
> 
> All I can offer is that it's available in tip/master (the collective
> merge of all of tip's branches) as commit:
> 0d84b78a606f1562532cd576ee8733caf5a4aed3, which I found using
> git-annotate include/linux/hardirq.h

That works -- thank you!!!

						Thanx, Paul

> How to find which particular topic branch it came from, I too am
> clueless.
> 
> ---
> commit 0d84b78a606f1562532cd576ee8733caf5a4aed3
> Author: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Date:   Mon May 12 21:21:07 2008 +0200
> 
>     x86 NMI-safe INT3 and Page Fault
>     
>     Implements an alternative iret with popf and return so trap and exception
>     handlers can return to the NMI handler without issuing iret. iret would cause
>     NMIs to be reenabled prematurely. x86_32 uses popf and far return. x86_64 has to
>     copy the return instruction pointer to the top of the previous stack, issue a
>     popf, load the previous esp and issue a near return (ret).
>     
>     It allows placing immediate values (and therefore optimized trace_marks) in NMI
>     code since returning from a breakpoint would be valid. Accessing vmalloc'd
>     memory, which allows executing module code or accessing vmapped or vmalloc'd
>     areas from NMI context, would also be valid. This is very useful to tracers like
>     LTTng.
>     
>     This patch makes all faults, traps and exceptions safe to be called from NMI
>     context *except* single-stepping, which requires iret to restore the TF (trap
>     flag) and jump to the return address in a single instruction. Sorry, no kprobes
>     support in NMI handlers because of this limitation.  We cannot single-step an
>     NMI handler, because iret must set the TF flag and return back to the
>     instruction to single-step in a single instruction. This cannot be emulated with
>     popf/lret, because lret would be single-stepped. It does not apply to immediate
>     values because they do not use single-stepping. This code detects if the TF
>     flag is set and uses the iret path for single-stepping, even if it reactivates
>     NMIs prematurely.
>     
>     Test to detect if nested under a NMI handler is only done upon the return from
>     trap/exception to kernel, which is not frequent. Other return paths (return from
>     trap/exception to userspace, return from interrupt) keep the exact same behavior
>     (no slowdown).
>     
>     Depends on :
>     change-alpha-active-count-bit.patch
>     change-avr32-active-count-bit.patch
>     
>     TODO : test with lguest, xen, kvm.
>     
>     ** This patch depends on the "Stringify support commas" patchset **
>     ** Also depends on fix-x86_64-page-fault-scheduler-race patch **
>     
>     tested on x86_32 (tests implemented in a separate patch) :
>     - instrumented the return path to export the EIP, CS and EFLAGS values when
>       taken so we know the return path code has been executed.
>     - trace_mark, using immediate values, with 10ms delay with the breakpoint
>       activated. Runs well through the return path.
>     - tested vmalloc faults in NMI handler by placing a non-optimized marker in the
>       NMI handler (so no breakpoint is executed) and connecting a probe which
>       touches every page of a 20MB vmalloc'd buffer. It executes through the return
>       path without problem.
>     - Tested with and without preemption
>     
>     tested on x86_64
>     - instrumented the return path to export the EIP, CS and EFLAGS values when
>       taken so we know the return path code has been executed.
>     - trace_mark, using immediate values, with 10ms delay with the breakpoint
>       activated. Runs well through the return path.
>     
>     To test on x86_64 :
>     - Test without preemption
>     - Test vmalloc faults
>     - Test on Intel 64 bits CPUs. (AMD64 was fine)
>     
>     Changelog since v1 :
>     - x86_64 fixes.
>     Changelog since v2 :
>     - fix paravirt build
>     Changelog since v3 :
>     - Include modifications suggested by Jeremy
>     Changelog since v4 :
>     - including hardirq.h in entry_32/64.S is a bad idea (non ifndef'd C code),
>       define HARDNMI_MASK in the .S files directly.
>     Changelog since v5 :
>     - Add HARDNMI_MASK to irq_count() and make die() more verbose for NMIs.
>     Changelog since v7 :
>     - Implement paravirtualized nmi_return.
>     Changelog since v8 :
>     - refreshed the patch for asm-offsets. Those were left out of v8.
>     - now depends on "Stringify support commas" patch.
>     Changelog since v9 :
>     - Only test the nmi nested preempt count flag upon return from exceptions, not
>       on return from interrupts. Only the kernel return path has this test.
>     - Add Xen, VMI, lguest support. Use their iret paravirt ops in lieu of
>       nmi_return.
>     
>     -- Ported to sched-devel.git
>     
>     Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
>     CC: akpm@osdl.org
>     CC: mingo@elte.hu
>     CC: "H. Peter Anvin" <hpa@zytor.com>
>     CC: Jeremy Fitzhardinge <jeremy@goop.org>
>     CC: Steven Rostedt <rostedt@goodmis.org>
>     CC: "Frank Ch. Eigler" <fche@redhat.com>
>     Signed-off-by: Ingo Molnar <mingo@elte.hu>
>     Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
> index 9258808..73474e0 100644
> --- a/arch/x86/kernel/asm-offsets_32.c
> +++ b/arch/x86/kernel/asm-offsets_32.c
> @@ -111,6 +111,7 @@ void foo(void)
>  	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
>  	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
>  	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
> +	OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
>  	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
>  	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
>  #endif
> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
> index f126c05..a5bbec3 100644
> --- a/arch/x86/kernel/asm-offsets_64.c
> +++ b/arch/x86/kernel/asm-offsets_64.c
> @@ -62,6 +62,7 @@ int main(void)
>  	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
>  	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
>  	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
> +	OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
>  	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
>  	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
>  	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index e6517ce..2d88211 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -68,6 +68,8 @@
> 
>  #define nr_syscalls ((syscall_table_size)/4)
> 
> +#define HARDNMI_MASK 0x40000000
> +
>  #ifdef CONFIG_PREEMPT
>  #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
>  #else
> @@ -232,8 +234,32 @@ END(ret_from_fork)
>  	# userspace resumption stub bypassing syscall exit tracing
>  	ALIGN
>  	RING0_PTREGS_FRAME
> +
>  ret_from_exception:
>  	preempt_stop(CLBR_ANY)
> +	GET_THREAD_INFO(%ebp)
> +	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
> +	movb PT_CS(%esp), %al
> +	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
> +	cmpl $USER_RPL, %eax
> +	jae resume_userspace	# returning to v8086 or userspace
> +	testl $HARDNMI_MASK,TI_preempt_count(%ebp)
> +	jz resume_kernel		/* Not nested over NMI ? */
> +	testw $X86_EFLAGS_TF, PT_EFLAGS(%esp)
> +	jnz resume_kernel		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	TRACE_IRQS_IRET
> +	RESTORE_REGS
> +	addl $4, %esp			# skip orig_eax/error_code
> +	CFI_ADJUST_CFA_OFFSET -4
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  ret_from_intr:
>  	GET_THREAD_INFO(%ebp)
>  check_userspace:
> @@ -873,6 +899,10 @@ ENTRY(native_iret)
>  .previous
>  END(native_iret)
> 
> +ENTRY(native_nmi_return)
> +	NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ?
> +END(native_nmi_return)
> +
>  ENTRY(native_irq_enable_syscall_ret)
>  	sti
>  	sysexit
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index fe25e5f..5f8edc7 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -156,6 +156,8 @@ END(mcount)
>  #endif /* CONFIG_DYNAMIC_FTRACE */
>  #endif /* CONFIG_FTRACE */
> 
> +#define HARDNMI_MASK 0x40000000
> +
>  #ifndef CONFIG_PREEMPT
>  #define retint_kernel retint_restore_args
>  #endif	
> @@ -698,6 +700,9 @@ ENTRY(native_iret)
>  	.section __ex_table,"a"
>  	.quad native_iret, bad_iret
>  	.previous
> +
> +ENTRY(native_nmi_return)
> +	NATIVE_INTERRUPT_RETURN_NMI_SAFE
>  #endif
> 
>  	.section .fixup,"ax"
> @@ -753,6 +758,23 @@ retint_signal:
>  	GET_THREAD_INFO(%rcx)
>  	jmp retint_check
> 
> +	/* Returning to kernel space from exception. */
> +	/* rcx:	 threadinfo. interrupts off. */
> +ENTRY(retexc_kernel)
> +	testl $HARDNMI_MASK,threadinfo_preempt_count(%rcx)
> +	jz retint_kernel		/* Not nested over NMI ? */
> +	testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp)	/* trap flag? */
> +	jnz retint_kernel		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	RESTORE_ARGS 0,8,0
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  #ifdef CONFIG_PREEMPT
>  	/* Returning to kernel space. Check if we need preemption */
>  	/* rcx:	 threadinfo. interrupts off. */
> @@ -911,9 +933,17 @@ paranoid_swapgs\trace:
>  	TRACE_IRQS_IRETQ 0
>  	.endif
>  	SWAPGS_UNSAFE_STACK
> -paranoid_restore\trace:
> +paranoid_restore_no_nmi\trace:
>  	RESTORE_ALL 8
>  	jmp irq_return
> +paranoid_restore\trace:
> +	GET_THREAD_INFO(%rcx)
> +	testl $HARDNMI_MASK,threadinfo_preempt_count(%rcx)
> +	jz paranoid_restore_no_nmi\trace	/* Nested over NMI ? */
> +	testw $X86_EFLAGS_TF,EFLAGS-0(%rsp)	/* trap flag? */
> +	jnz paranoid_restore_no_nmi\trace
> +	RESTORE_ALL 8
> +	INTERRUPT_RETURN_NMI_SAFE
>  paranoid_userspace\trace:
>  	GET_THREAD_INFO(%rcx)
>  	movl threadinfo_flags(%rcx),%ebx
> @@ -1012,7 +1042,7 @@ error_exit:
>  	TRACE_IRQS_OFF
>  	GET_THREAD_INFO(%rcx)	
>  	testl %eax,%eax
> -	jne  retint_kernel
> +	jne  retexc_kernel
>  	LOCKDEP_SYS_EXIT_IRQ
>  	movl  threadinfo_flags(%rcx),%edx
>  	movl  $_TIF_WORK_MASK,%edi
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 74f0c5e..bb174a8 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -139,6 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
>  		/* If the operation is a nop, then nop the callsite */
>  		ret = paravirt_patch_nop();
>  	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
> +		 type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) ||
>  		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
>  		/* If operation requires a jmp, then jmp */
>  		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
> @@ -190,6 +191,7 @@ static void native_flush_tlb_single(unsigned long addr)
> 
>  /* These are in entry.S */
>  extern void native_iret(void);
> +extern void native_nmi_return(void);
>  extern void native_irq_enable_syscall_ret(void);
> 
>  static int __init print_banner(void)
> @@ -328,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
> 
>  	.irq_enable_syscall_ret = native_irq_enable_syscall_ret,
>  	.iret = native_iret,
> +	.nmi_return = native_nmi_return,
>  	.swapgs = native_swapgs,
> 
>  	.set_iopl_mask = native_set_iopl_mask,
> diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
> index 82fc5fc..8ed31c7 100644
> --- a/arch/x86/kernel/paravirt_patch_32.c
> +++ b/arch/x86/kernel/paravirt_patch_32.c
> @@ -1,10 +1,13 @@
> -#include <asm/paravirt.h>
> +#include <linux/stringify.h>
> +#include <linux/irqflags.h>
> 
>  DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
>  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
>  DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
>  DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
>  DEF_NATIVE(pv_cpu_ops, iret, "iret");
> +DEF_NATIVE(pv_cpu_ops, nmi_return,
> +	__stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
>  DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
>  DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
>  DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
> @@ -29,6 +32,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
>  		PATCH_SITE(pv_irq_ops, restore_fl);
>  		PATCH_SITE(pv_irq_ops, save_fl);
>  		PATCH_SITE(pv_cpu_ops, iret);
> +		PATCH_SITE(pv_cpu_ops, nmi_return);
>  		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
>  		PATCH_SITE(pv_mmu_ops, read_cr2);
>  		PATCH_SITE(pv_mmu_ops, read_cr3);
> diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
> index 7d904e1..56eccea 100644
> --- a/arch/x86/kernel/paravirt_patch_64.c
> +++ b/arch/x86/kernel/paravirt_patch_64.c
> @@ -1,12 +1,15 @@
> +#include <linux/irqflags.h>
> +#include <linux/stringify.h>
>  #include <asm/paravirt.h>
>  #include <asm/asm-offsets.h>
> -#include <linux/stringify.h>
> 
>  DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
>  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
>  DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
>  DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
>  DEF_NATIVE(pv_cpu_ops, iret, "iretq");
> +DEF_NATIVE(pv_cpu_ops, nmi_return,
> +	__stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
>  DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
>  DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
>  DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
> @@ -35,6 +38,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
>  		PATCH_SITE(pv_irq_ops, irq_enable);
>  		PATCH_SITE(pv_irq_ops, irq_disable);
>  		PATCH_SITE(pv_cpu_ops, iret);
> +		PATCH_SITE(pv_cpu_ops, nmi_return);
>  		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
>  		PATCH_SITE(pv_cpu_ops, swapgs);
>  		PATCH_SITE(pv_mmu_ops, read_cr2);
> diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
> index bde6f63..f3a59cd 100644
> --- a/arch/x86/kernel/traps_32.c
> +++ b/arch/x86/kernel/traps_32.c
> @@ -475,6 +475,9 @@ void die(const char *str, struct pt_regs *regs, long err)
>  	if (kexec_should_crash(current))
>  		crash_kexec(regs);
> 
> +	if (in_nmi())
> +		panic("Fatal exception in non-maskable interrupt");
> +
>  	if (in_interrupt())
>  		panic("Fatal exception in interrupt");
> 
> diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
> index adff76e..3dacb75 100644
> --- a/arch/x86/kernel/traps_64.c
> +++ b/arch/x86/kernel/traps_64.c
> @@ -555,6 +555,10 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
>  		oops_exit();
>  		return;
>  	}
> +	if (in_nmi())
> +		panic("Fatal exception in non-maskable interrupt");
> +	if (in_interrupt())
> +		panic("Fatal exception in interrupt");
>  	if (panic_on_oops)
>  		panic("Fatal exception");
>  	oops_exit();
> diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
> index 956f389..01d687d 100644
> --- a/arch/x86/kernel/vmi_32.c
> +++ b/arch/x86/kernel/vmi_32.c
> @@ -151,6 +151,8 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
>  					      insns, ip);
>  		case PARAVIRT_PATCH(pv_cpu_ops.iret):
>  			return patch_internal(VMI_CALL_IRET, len, insns, ip);
> +		case PARAVIRT_PATCH(pv_cpu_ops.nmi_return):
> +			return patch_internal(VMI_CALL_IRET, len, insns, ip);
>  		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
>  			return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
>  		default:
> diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
> index af65b2d..f5cbb74 100644
> --- a/arch/x86/lguest/boot.c
> +++ b/arch/x86/lguest/boot.c
> @@ -958,6 +958,7 @@ __init void lguest_init(void)
>  	pv_cpu_ops.cpuid = lguest_cpuid;
>  	pv_cpu_ops.load_idt = lguest_load_idt;
>  	pv_cpu_ops.iret = lguest_iret;
> +	pv_cpu_ops.nmi_return = lguest_iret;
>  	pv_cpu_ops.load_sp0 = lguest_load_sp0;
>  	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
>  	pv_cpu_ops.set_ldt = lguest_set_ldt;
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index c8a56e4..33272ce 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -1008,6 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
>  	.read_pmc = native_read_pmc,
> 
>  	.iret = xen_iret,
> +	.nmi_return = xen_iret,
>  	.irq_enable_syscall_ret = xen_sysexit,
> 
>  	.load_tr_desc = paravirt_nop,
> diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
> index 24d71b1..c3009fd 100644
> --- a/include/asm-x86/irqflags.h
> +++ b/include/asm-x86/irqflags.h
> @@ -51,6 +51,61 @@ static inline void native_halt(void)
> 
>  #endif
> 
> +#ifdef CONFIG_X86_64
> +/*
> + * Only returns from a trap or exception to a NMI context (intra-privilege
> + * level near return) to the same SS and CS segments. Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued. It takes care of modifying the eflags, rsp and returning to the
> + * previous function.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(rsp)  RIP
> + * 8(rsp)  CS
> + * 16(rsp) EFLAGS
> + * 24(rsp) RSP
> + * 32(rsp) SS
> + *
> + * Upon execution :
> + * Copy EIP to the top of the return stack
> + * Update top of return stack address
> + * Pop eflags into the eflags register
> + * Make the return stack current
> + * Near return (popping the return address from the return stack)
> + */
> +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE	pushq %rax;		\
> +						movq %rsp, %rax;	\
> +						movq 24+8(%rax), %rsp;	\
> +						pushq 0+8(%rax);	\
> +						pushq 16+8(%rax);	\
> +						movq (%rax), %rax;	\
> +						popfq;			\
> +						ret
> +#else
> +/*
> + * Protected mode only, no V8086. Implies that protected mode must
> + * be entered before NMIs or MCEs are enabled. Only returns from a trap or
> + * exception to a NMI context (intra-privilege level far return). Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(esp) EIP
> + * 4(esp) CS
> + * 8(esp) EFLAGS
> + *
> + * Upon execution :
> + * Copy the stack eflags to top of stack
> + * Pop eflags into the eflags register
> + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
> + */
> +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE	pushl 8(%esp);	\
> +						popfl;		\
> +						lret $4
> +#endif
> +
>  #ifdef CONFIG_PARAVIRT
>  #include <asm/paravirt.h>
>  #else
> @@ -109,6 +164,7 @@ static inline unsigned long __raw_local_irq_save(void)
> 
>  #define ENABLE_INTERRUPTS(x)	sti
>  #define DISABLE_INTERRUPTS(x)	cli
> +#define INTERRUPT_RETURN_NMI_SAFE	NATIVE_INTERRUPT_RETURN_NMI_SAFE
> 
>  #ifdef CONFIG_X86_64
>  #define INTERRUPT_RETURN	iretq
> diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
> index 0f13b94..d5087e0 100644
> --- a/include/asm-x86/paravirt.h
> +++ b/include/asm-x86/paravirt.h
> @@ -141,9 +141,10 @@ struct pv_cpu_ops {
>  	u64 (*read_pmc)(int counter);
>  	unsigned long long (*read_tscp)(unsigned int *aux);
> 
> -	/* These two are jmp to, not actually called. */
> +	/* These three are jmp to, not actually called. */
>  	void (*irq_enable_syscall_ret)(void);
>  	void (*iret)(void);
> +	void (*nmi_return)(void);
> 
>  	void (*swapgs)(void);
> 
> @@ -1385,6 +1386,10 @@ static inline unsigned long __raw_local_irq_save(void)
>  	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
>  		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)
> 
> +#define INTERRUPT_RETURN_NMI_SAFE					\
> +	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE,	\
> +		  jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return)
> +
>  #define DISABLE_INTERRUPTS(clobbers)					\
>  	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
>  		  PV_SAVE_REGS;			\
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index 181006c..b39f49d 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -22,10 +22,13 @@
>   * PREEMPT_MASK: 0x000000ff
>   * SOFTIRQ_MASK: 0x0000ff00
>   * HARDIRQ_MASK: 0x0fff0000
> + * HARDNMI_MASK: 0x40000000
>   */
>  #define PREEMPT_BITS	8
>  #define SOFTIRQ_BITS	8
> 
> +#define HARDNMI_BITS	1
> +
>  #ifndef HARDIRQ_BITS
>  #define HARDIRQ_BITS	12
> 
> @@ -45,16 +48,19 @@
>  #define PREEMPT_SHIFT	0
>  #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
>  #define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDNMI_SHIFT	(30)
> 
>  #define __IRQ_MASK(x)	((1UL << (x))-1)
> 
>  #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
>  #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
>  #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
> +#define HARDNMI_MASK	(__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
> 
>  #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
>  #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
>  #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
> +#define HARDNMI_OFFSET	(1UL << HARDNMI_SHIFT)
> 
>  #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
>  #error PREEMPT_ACTIVE is too low!
> @@ -62,7 +68,9 @@
> 
>  #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
>  #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
> -#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define irq_count() \
> +	(preempt_count() & (HARDNMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define hardnmi_count()	(preempt_count() & HARDNMI_MASK)
> 
>  /*
>   * Are we doing bottom half or hardware interrupt processing?
> @@ -71,6 +79,7 @@
>  #define in_irq()		(hardirq_count())
>  #define in_softirq()		(softirq_count())
>  #define in_interrupt()		(irq_count())
> +#define in_nmi()		(hardnmi_count())
> 
>  #if defined(CONFIG_PREEMPT)
>  # define PREEMPT_INATOMIC_BASE kernel_locked()
> @@ -161,7 +170,19 @@ extern void irq_enter(void);
>   */
>  extern void irq_exit(void);
> 
> -#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
> -#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
> +#define nmi_enter()					\
> +	do {						\
> +		lockdep_off();				\
> +		BUG_ON(hardnmi_count());		\
> +		add_preempt_count(HARDNMI_OFFSET);	\
> +		__irq_enter();				\
> +	} while (0)
> +
> +#define nmi_exit()					\
> +	do {						\
> +		__irq_exit();				\
> +		sub_preempt_count(HARDNMI_OFFSET);	\
> +		lockdep_on();				\
> +	} while (0)
> 
>  #endif /* LINUX_HARDIRQ_H */
> 
> 

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30 14:34         ` Paul E. McKenney
@ 2008-08-31 10:58           ` Manfred Spraul
  2008-08-31 17:20             ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-08-31 10:58 UTC (permalink / raw)
  To: paulmck
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz

Paul E. McKenney wrote:
>
>> Perhaps it's possible to rely on CPU_DYING, but I haven't figured out yet 
>> how to handle read-side critical sections in CPU_DYING handlers.
>> Interrupts after CPU_DYING could be handled by rcu_irq_enter(), 
>> rcu_irq_exit() [yes, they exist on x86: the arch code enables the local 
>> interrupts in order to process the currently queued interrupts]
>>     
>
> My feeling is that CPU online/offline will be quite rare, so it should
> be OK to clean up after the races in force_quiescent_state(), which in
> this version is called every three ticks in a given grace period.
>   
If you add failing cpu offline calls, then the problem appears to be 
unsolvable:
If I get it right, the offlining process looks like this:
* one cpu in the system makes the CPU_DOWN_PREPARE notifier call. These 
calls can sleep (e.g. slab sleeps on semaphores). The cpu that goes 
offline is still alive, still doing arbitrary work. cpu_quiet calls on 
behalf of the cpu would be wrong.
* stop_machine: all cpus schedule to a special kernel thread [1], only 
the dying cpu runs.
* The cpu that goes offline calls the CPU_DYING notifiers.
* __cpu_disable(): The cpu that goes offline checks whether it's possible to 
offline the cpu. At least on i386, this can fail.
On success:
* at least on i386: the cpu that goes offline handles outstanding 
interrupts. I'm not sure, perhaps even softirqs are handled.
* the cpu stops handling interrupts.
* stop machine leaves, the remaining cpus continue their work.
* The CPU_DEAD notifiers are called. They can sleep.
On failure:
* all cpus continue their work. call_rcu, synchronize_rcu(), ...
* some time later: the CPU_DOWN_FAILED callbacks are called.

Is that description correct?
Then:
- treating a cpu as always quiet after the rcu notifier was called with 
CPU_OFFLINE_PREPARE is wrong: the target cpu still runs normal code: 
user space, kernel space, interrupts, whatever. The target cpu still 
accepts interrupts, thus treating it as "normal" should work.
__cpu_disable() success:
- after CPU_DYING, a cpu is either in an interrupt or outside read-side 
critical sections. Parallel synchronize_rcu() calls are impossible until 
the cpu is dead. call_rcu() is probably possible.
- The CPU_DEAD notifiers are called. a synchronize_rcu() call before the 
rcu notifier is called is possible.
__cpu_disable() failure:
- CPU_DYING is called, but the cpu remains fully alive. The system comes 
fully alive again.
- some time later, CPU_DEAD is called.

With the current CPU_DYING callback, it's impossible to be both 
deadlock-free and race-free with the given conditions. If 
__cpu_disable() succeeds, then the cpu must be treated as gone and 
always idle. If __cpu_disable() fails, then the cpu must be treated as 
fully there. Doing both things at the same time is impossible. Waiting 
until CPU_DOWN_FAILED or CPU_DEAD is called is impossible, too: Either 
synchronize_rcu() in a CPU_DEAD notifier [called before the rcu 
notifier] would deadlock or read-side critical sections on the 
not-killed cpu would race.

What about moving the CPU_DYING notifier calls behind the 
__cpu_disable() call?
Any other solutions?

Btw, as far as I can see, rcupreempt would deadlock if a CPU_DEAD 
notifier uses synchronize_rcu().
Probably no one will ever succeed in triggering the deadlock:
- cpu goes offline.
- the other cpus in the system are restarted.
- one cpu does the CPU_DEAD notifier calls.
- before the rcu notifier is called with CPU_DEAD:
- one CPU_DEAD notifier sleeps.
- while CPU_DEAD is sleeping: on the same cpu: kmem_cache_destroy is 
called. get_online_cpus immediately succeeds.
- kmem_cache_destroy acquires the cache_chain_mutex.
- kmem_cache_destroy does synchronize_rcu(), it sleeps.
- CPU_DEAD processing continues, the slab CPU_DEAD tries to acquire the 
cache_chain_mutex. it sleeps, too.
--> deadlock, because the already dead cpu will never signal itself as 
quiet. Thus synchronize_rcu() will never succeed, thus the slab CPU_DEAD 
notifier will never return, thus rcu_offline_cpu() is never called.

--
    Manfred
[1] open question: with rcu_preempt, is it possible that these cpus 
could be inside read side critical sections?

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-31 10:58           ` Manfred Spraul
@ 2008-08-31 17:20             ` Paul E. McKenney
  2008-08-31 17:45               ` Manfred Spraul
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-31 17:20 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz, benh, davem,
	tony.luck

On Sun, Aug 31, 2008 at 12:58:12PM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:
>>
>>> Perhaps it's possible to rely on CPU_DYING, but I haven't figured out yet 
>>> how to handle read-side critical sections in CPU_DYING handlers.
>>> Interrupts after CPU_DYING could be handled by rcu_irq_enter(), 
>>> rcu_irq_exit() [yes, they exist on x86: the arch code enables the local 
>>> interrupts in order to process the currently queued interrupts]
>>>     
>>
>> My feeling is that CPU online/offline will be quite rare, so it should
>> be OK to clean up after the races in force_quiescent_state(), which in
>> this version is called every three ticks in a given grace period.
>>   
> If you add failing cpu offline calls, then the problem appears to be 
> unsolvable:
> If I get it right, the offlining process looks like this:
> * one cpu in the system makes the CPU_DOWN_PREPARE notifier call. These 
> calls can sleep (e.g. slab sleeps on semaphores). The cpu that goes offline 
> is still alive, still doing arbitrary work. cpu_quiet calls on behalf of 
> the cpu would be wrong.
> * stop_machine: all cpus schedule to a special kernel thread [1], only the 
> dying cpu runs.
> * The cpu that goes offline calls the CPU_DYING notifiers.
> * __cpu_disable(): The cpu that goes offline check if it's possible to 
> offline the cpu. At least on i386, this can fail.
> On success:
> * at least on i386: the cpu that goes offline handles outstanding 
> interrupts. I'm not sure, perhaps even softirqs are handled.
> * the cpus stopps handling interrupts.
> * stop machine leaves, the remaining cpus continue their work.

As I understand it, this is the point where the dying CPU disables
interrupts and removes itself from the online masks.  Though I would
feel better if there was an smp_mb() after the last local_irq_disable()
and before the remove_cpu_from_maps()!

> * The CPU_DEAD notifiers are called. They can sleep.
> On failure:
> * all cpus continue their work. call_rcu, synchronize_rcu(), ...
> * some time later: the CPU_DOWN_FAILED callbacks are called.
>
> Is that description correct?

Gautham?

> Then:
> - treating a cpu as always quiet after the rcu notifer was called with 
> CPU_OFFLINE_PREPARE is wrong: the target cpu still runs normal code: user 
> space, kernel space, interrupts, whatever. The target cpu still accepts 
> interrupst, thus treating it as "normal" should work.

Indeed!  My current code doesn't declare them offline until the CPU_DEAD
notifiers are called.  And force_quiescent_state() does not consider
them to be offline until after they have cleared their bit in
cpu_online_map, which does not happen until the outgoing CPU has
disabled interrupts, at least in x86.  So my current code should be
OK on x86.

It -looks- like stop_cpu() expects to be called with irqs disabled,
but I don't see what would be disabling irqs.  (Don't kthreads normally
start with irqs enabled?)  Ah, I see it -- the stop_cpu() threads
sequence through a state machine, and one of the states disables
irqs for everyone.

So the only problem would occur in architectures that re-enable irqs
in the middle of __cpu_disable(), as x86 does (but x86 correctly orders
the clearing of the cpu_online_mask bit, so is OK).  This of course
has the added benefit that irq handlers aren't running on a CPU that
is marked offline.

Checking other architectures:

o	ARM arch/arm/kernel/smp.c __cpu_disable() does not re-enable
	irqs, so is OK.

!	arch/ia64/kernel/smpboot.c __cpu_disable() clears itself
	from the cpu_online mask before flushing pending irqs, which
	might include RCU read-side critical sections.	I believe that
	the "cpu_clear(cpu, cpu_online_map)" must move to after the
	"fixup_irqs()".

o	arch/powerpc/kernel/smp.c __cpu_disable() does not disable
	irqs directly, but calls subarch-specific functions noted
	below.

o	arch/powerpc/platforms/powermac/smp.c smp_core99_cpu_disable()
	does not appear to re-enable irqs, so should be OK.

!	arch/powerpc/kernel/smp.c generic_cpu_disable() clears itself
	from the cpu_online_mask before invoking fixup_irqs(), which
	momentarily enables irqs.  I believe that the  "cpu_clear(cpu,
	cpu_online_map)" must move to after the "fixup_irqs()".

?	arch/powerpc/platforms/pseries/hotplug-cpu.c pseries_cpu_disable()
	clears itself from the cpu_online_mask before calling
	xics_migrate_irqs_away().  This function rejects already-pending
	irqs, then redirects future irqs.  Not clear to me what happens
	if an irq arrives between the reject and the immediately
	following removal from the global interrupt queue.

o	arch/s390/kernel/smp.c __cpu_disable() does not reenable irqs
	so is OK.

!	arch/sparc64/kernel/smp.c __cpu_disable() clears its bit before
	re-enabling interrupts.  I believe that the "cpu_clear(cpu,
	cpu_online_map)" needs to happen after the local_irq_disable().

?	include/asm-parisc/smp.h __cpu_disable() just returns without
	doing anything.  This means pa-risc does not support hotplug
	CPU?  If so, no problem.

I am sending (untested) patches separately for the amusement of the
arch maintainers.

> __cpu_disable() success:
> - after CPU_DYING, a cpu is either in an interrupt or outside read-side 
> critical sections. Parallel synchronize_rcu() calls are impossible until 
> the cpu is dead. call_rcu() is probably possible.
> - The CPU_DEAD notifiers are called. a synchronize_rcu() call before the 
> rcu notifier is called is possible.
> __cpu_disable() failure:
> - CPU_DYING is called, but the cpu remains fully alive. The system comes 
> fully alive again.
> - some time later, CPU_DEAD is called.
>
> With the current CPU_DYING callback, it's impossible to be both 
> deadlock-free and race-free with the given conditions. If __cpu_disable() 
> succeeds, then the cpu must be treated as gone and always idle. If 
> __cpu_disable() fails, then the cpu must be treated as fully there. Doing 
> both things at the same time is impossible. Waiting until CPU_DOWN_FAILED 
> or CPU_DEAD is called is impossible, too: Either synchronize_rcu() in a 
> CPU_DEAD notifier [called before the rcu notifier] would deadlock or 
> read-side critical sections on the not-killed cpu would race.

Assuming that the ordering of processing pending irqs and marking the
CPU offline in cpu_online_mask can be resolved as noted above, it should
work fine -- if a CPU's bit is clear, we can safely ignore it.  The race
can be resolved by checking the CPU's bit in force_quiescent_state().

Or am I missing something?

> What about moving the CPU_DYING notifier calls behind the __cpu_disable() 
> call?
> Any other solutions?

RCU should ignore the CPU_DYING notifier calls -- only the CPU_DEAD.*
calls should be processed for CPUs being offlined.  Right?

> Btw, as far as I can see, rcupreempt would deadlock if a CPU_DEAD notifier 
> uses synchronize_rcu().
> Probably noone will ever succeed in triggering the deadlock:
> - cpu goes offline.
> - the other cpus in the system are restarted.
> - one cpu does the CPU_DEAD notifier calls.
> - before the rcu notifier is called with CPU_DEAD:
> - one CPU_DEAD notifier sleeps.
> - while CPU_DEAD is sleeping: on the same cpu: kmem_cache_destroy is 
> called. get_online_cpus immediately succeeds.
> - kmem_cache_destroy acquires the cache_chain_mutex.
> - kmem_cache_destroy does synchronize_rcu(), it sleeps.
> - CPU_DEAD processing continues, the slab CPU_DEAD tries to acquire the 
> cache_chain_mutex. it sleeps, too.
> --> deadlock, because the already dead cpu will never signal itself as 
> quiet. Thus synchronize_rcu() will never succeed, thus the slab CPU_DEAD 
> notifier will never return, thus rcu_offline_cpu() is never called.

It is entirely possible that rcu_try_flip_waitack() and
rcu_try_flip_waitmb() need to check the AND of rcu_cpu_online_map and
cpu_online_map.  If this really is a problem (and it might well be),
then the easiest fix is to check for cpu_is_offline(cpu) in both
rcu_try_flip_waitmb_needed() and rcu_try_flip_waitack_needed(), and
that in both versions of both functions.  Thoughts?

> --
>    Manfred
> [1] open question: with rcu_preempt, is it possible that these cpus could 
> be inside read side critical sections?

Yes, they could.  Well, tasks that were previously running on them
might be preempted or blocked waiting on locks while still in RCU
read-side critical sections.  However, when a given CPU goes offline,
rcu_preempt moves that CPU's counters to some surviving CPU.  So RCU
knows that these tasks are still in RCU read-side critical sections,
and will therefore wait for them.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-31 17:20             ` Paul E. McKenney
@ 2008-08-31 17:45               ` Manfred Spraul
  2008-08-31 17:55                 ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-08-31 17:45 UTC (permalink / raw)
  To: paulmck
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz, benh, davem,
	tony.luck

Paul E. McKenney wrote:
> Assuming that the ordering of processing pending irqs and marking the
> CPU offline in cpu_online_mask can be resolved as noted above, it should
> work fine -- if a CPU's bit is clear, we can safely ignore it.  The race
> can be resolved by checking the CPU's bit in force_quiescent_state().
>
> Or am I missing something?
>   
Yes, that would work:
Rule 1: after CPU_DEAD, a cpu is gone. The cpu is quiet, rcu callbacks 
must be moved to other cpus, ...
Rule 2: if a cpu is not listed in cpu_online_mask, then it can be 
considered as outside a read-side critical section.

The problem with rule 2 is that it means someone 
[force_quiescent_state()] must poll the cpu_online_mask and look for 
changes.
I'd really prefer a notifier. CPU_DYING is nearly the correct thing, it 
only has to be moved down 3 lines ;-)
(I want to kill the bitmaps, not add a hierarchical bitmap polling system!)
> It is entirely possible that rcu_try_flip_waitack() and
> rcu_try_flip_waitmb() need to check the AND of rcu_cpu_online_map and
> cpu_online_map.  If this really is a problem (and it might well be),
> then the easiest fix is to check for cpu_is_offline(cpu) in both
> rcu_try_flip_waitmb_needed() and rcu_try_flip_waitack_needed(), and
> that in both versions of both functions.  Thoughts?
>   
I made a mistake, get_online_cpus() stores current, not a cpu number. 
Thus the described race is not possible. Perhaps there are other users 
that could deadlock.
I don't know enough about the preempt algorithm, thus I can't confirm if 
your proposal would work or not.

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-31 17:45               ` Manfred Spraul
@ 2008-08-31 17:55                 ` Paul E. McKenney
  2008-08-31 18:18                   ` Manfred Spraul
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-31 17:55 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz, benh, davem,
	tony.luck

On Sun, Aug 31, 2008 at 07:45:02PM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:
>> Assuming that the ordering of processing pending irqs and marking the
>> CPU offline in cpu_online_mask can be resolved as noted above, it should
>> work fine -- if a CPU's bit is clear, we can safely ignore it.  The race
>> can be resolved by checking the CPU's bit in force_quiescent_state().
>>
>> Or am I missing something?
>>   
> Yes, that would work:
> Rule 1: after CPU_DEAD, a cpu is gone. The cpu is quiet, rcu callbacks must 
> be moved to other cpus, ...
> Rule 2: if a cpu is not listed in cpu_online_mask, then it can be 
> considered as outside a read-side critical section.
>
> The problem with rule 2 is that it means someone [force_quiescent_state()] 
> must poll the cpu_online_mask and look for changes.
> I'd really prefer a notifier. CPU_DYING is nearly the correct thing, it 
> only has to be moved down 3 lines ;-)
> (I want to kill the bitmaps, not add a hierarchical bitmap polling system!)

But some later CPU_DYING notifier might decide that the CPU cannot be
removed after all, which would mean bringing the CPU back.  And then
whatever the CPU was needed for might have actually happened in the
meantime, which does not sound good to me...

>> It is entirely possible that rcu_try_flip_waitack() and
>> rcu_try_flip_waitmb() need to check the AND of rcu_cpu_online_map and
>> cpu_online_map.  If this really is a problem (and it might well be),
>> then the easiest fix is to check for cpu_is_offline(cpu) in both
>> rcu_try_flip_waitmb_needed() and rcu_try_flip_waitack_needed(), and
>> that in both versions of both functions.  Thoughts?
>>   
> I made a mistake, get_online_cpus() stores current, not a cpu number. Thus 
> the described race it not possible. Perhaps there are other users that 
> could deadlock.
> I don't know enough about the preempt algorithm, thus I can't confirm if 
> your proposal would work or not.

Well, that is on my list of things to look into...

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-31 17:55                 ` Paul E. McKenney
@ 2008-08-31 18:18                   ` Manfred Spraul
  2008-08-31 19:23                     ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-08-31 18:18 UTC (permalink / raw)
  To: paulmck
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz, benh, davem,
	tony.luck

Paul E. McKenney wrote:
> On Sun, Aug 31, 2008 at 07:45:02PM +0200, Manfred Spraul wrote:
>   
>> Paul E. McKenney wrote:
>>     
>>> Assuming that the ordering of processing pending irqs and marking the
>>> CPU offline in cpu_online_mask can be resolved as noted above, it should
>>> work fine -- if a CPU's bit is clear, we can safely ignore it.  The race
>>> can be resolved by checking the CPU's bit in force_quiescent_state().
>>>
>>> Or am I missing something?
>>>   
>>>       
>> Yes, that would work:
>> Rule 1: after CPU_DEAD, a cpu is gone. The cpu is quiet, rcu callbacks must 
>> be moved to other cpus, ...
>> Rule 2: if a cpu is not listed in cpu_online_mask, then it can be 
>> considered as outside a read-side critical section.
>>
>> The problem with rule 2 is that it means someone [force_quiescent_state()] 
>> must poll the cpu_online_mask and look for changes.
>> I'd really prefer a notifier. CPU_DYING is nearly the correct thing, it 
>> only has to be moved down 3 lines ;-)
>> (I want to kill the bitmaps, not add a hierarchical bitmap polling system!)
>>     
>
> But some later CPU_DYING notifier might decide that the CPU cannot be
> removed after all, which would mean bringing the CPU back.  And then
> whatever the CPU was needed for might have actually happened in the
> meantime, which does not sound good to me...
>   
CPU_DYING must not fail, the current code doesn't support that.

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-31 18:18                   ` Manfred Spraul
@ 2008-08-31 19:23                     ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-08-31 19:23 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: Lai Jiangshan, linux-kernel, cl, mingo, akpm, dipankar, josht,
	schamp, niv, dvhltc, ego, rostedt, peterz, benh, davem,
	tony.luck

On Sun, Aug 31, 2008 at 08:18:42PM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:
>> On Sun, Aug 31, 2008 at 07:45:02PM +0200, Manfred Spraul wrote:
>>   
>>> Paul E. McKenney wrote:
>>>     
>>>> Assuming that the ordering of processing pending irqs and marking the
>>>> CPU offline in cpu_online_mask can be resolved as noted above, it should
>>>> work fine -- if a CPU's bit is clear, we can safely ignore it.  The race
>>>> can be resolved by checking the CPU's bit in force_quiescent_state().
>>>>
>>>> Or am I missing something?
>>>>         
>>> Yes, that would work:
>>> Rule 1: after CPU_DEAD, a cpu is gone. The cpu is quiet, rcu callbacks 
>>> must be moved to other cpus, ...
>>> Rule 2: if a cpu is not listed in cpu_online_mask, then it can be 
>>> considered as outside a read-side critical section.
>>>
>>> The problem with rule 2 is that it means someone 
>>> [force_quiescent_state()] must poll the cpu_online_mask and look for 
>>> changes.
>>> I'd really prefer a notifier. CPU_DYING is nearly the correct thing, it 
>>> only has to be moved down 3 lines ;-)
>>> (I want to kill the bitmaps, not add a hierarchical bitmap polling 
>>> system!)
>>>     
>>
>> But some later CPU_DYING notifier might decide that the CPU cannot be
>> removed after all, which would mean bringing the CPU back.  And then
>> whatever the CPU was needed for might have actually happened in the
>> meantime, which does not sound good to me...
>>   
> CPU_DYING must not fail, the current code doesn't support that.

Good point!

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30  0:49   ` [PATCH, RFC, tip/core/rcu] v3 " Paul E. McKenney
  2008-08-30  9:33     ` Peter Zijlstra
  2008-08-30  9:58     ` Lai Jiangshan
@ 2008-09-01  9:38     ` Andi Kleen
  2008-09-02  1:05       ` Paul E. McKenney
  2008-09-05 15:29     ` [PATCH, RFC] v4 " Paul E. McKenney
  3 siblings, 1 reply; 94+ messages in thread
From: Andi Kleen @ 2008-09-01  9:38 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, peterz

"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> writes:
>  
> -#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
> +#if defined(CONFIG_NO_HZ)
>  extern void rcu_irq_enter(void);
>  extern void rcu_irq_exit(void);
>  #else
>  # define rcu_irq_enter() do { } while (0)
>  # define rcu_irq_exit() do { } while (0)
> -#endif /* CONFIG_PREEMPT_RCU */
> +#endif /* #if defined(CONFIG_NO_HZ) */

It would be better if you hung rcu_irq_enter in the irq_enter() if 
statement that checks if the task was idle or not. This way it would
be zero overhead for interruptions of non busy CPUs, keeping 
it out of many fast paths.

Haven't read everything, sorry.

-Andi

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-09-01  9:38     ` Andi Kleen
@ 2008-09-02  1:05       ` Paul E. McKenney
  2008-09-02  6:18         ` Andi Kleen
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-02  1:05 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, cl, mingo, akpm, manfred, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, peterz

On Mon, Sep 01, 2008 at 11:38:29AM +0200, Andi Kleen wrote:
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> writes:
> >  
> > -#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
> > +#if defined(CONFIG_NO_HZ)
> >  extern void rcu_irq_enter(void);
> >  extern void rcu_irq_exit(void);
> >  #else
> >  # define rcu_irq_enter() do { } while (0)
> >  # define rcu_irq_exit() do { } while (0)
> > -#endif /* CONFIG_PREEMPT_RCU */
> > +#endif /* #if defined(CONFIG_NO_HZ) */
> 
> It would be better if you hung rcu_irq_enter in the irq_enter() if 
> statement that checks if the task was idle or not. This way it would
> be zero overhead for interruptions of non busy CPUs, keeping 
> it out of many fast paths.
> 
> Haven't read everything, sorry.

So that I lose the #else above, and so that irq_enter() and irq_exit()
look something like the following (with additional adjustments to suit)?
Makes a lot of sense to me...

And it has the very nice side effect of allowing me to have a separate
rcu_irq_enter() and rcu_nmi_enter(), trivializing the RCU-dynticks
interface!!!  Very cool, thank you very much!!!

							Thanx, Paul

/*
 * irq_enter - sketch of interrupt entry with the RCU hook placed under the
 * idle-CPU check.
 *
 * With CONFIG_NO_HZ, rcu_irq_enter() is only called when idle_cpu() reports
 * that this CPU is idle, so an interrupt taken on a non-idle CPU does not
 * call into RCU from here.  Without CONFIG_NO_HZ the function reduces to a
 * bare __irq_enter().
 */
void irq_enter(void)
{
#ifdef CONFIG_NO_HZ
	int cpu = smp_processor_id();
	/* Idle CPU that is not already in interrupt context. */
	if (idle_cpu(cpu) && !in_interrupt())
		tick_nohz_stop_idle(cpu);
#endif
	__irq_enter();
#ifdef CONFIG_NO_HZ
	/*
	 * No in_interrupt() test here, unlike the check above: this branch is
	 * taken for every interrupt that arrives while the CPU is idle.
	 */
	if (idle_cpu(cpu)) {
		rcu_irq_enter();
		tick_nohz_update_jiffies();
	}
#endif
}

/*
 * irq_exit - interrupt-exit counterpart to the irq_enter() sketch.
 *
 * Subtracts IRQ_EXIT_OFFSET from the preempt count, invokes pending softirqs
 * when no longer in interrupt context, and, with CONFIG_NO_HZ on an idle
 * CPU, calls rcu_irq_exit() under the same idle_cpu() condition that guards
 * rcu_irq_enter() in irq_enter().
 */
void irq_exit(void)
{
	account_system_vtime(current);
	trace_hardirq_exit();
	sub_preempt_count(IRQ_EXIT_OFFSET);
	/* Softirqs are only run once we have left interrupt context. */
	if (!in_interrupt() && local_softirq_pending())
		invoke_softirq();

#ifdef CONFIG_NO_HZ
	/* Make sure that timer wheel updates are propagated */
	if (idle_cpu(smp_processor_id())) {
		if (!in_interrupt() && !need_resched())
			tick_nohz_stop_sched_tick(0);
		/*
		 * Outside the in_interrupt() test above, so this runs on every
		 * interrupt exit while the CPU is idle, mirroring the
		 * unconditional-on-nesting rcu_irq_enter() call in irq_enter().
		 */
		rcu_irq_exit();
	}
#endif
	preempt_enable_no_resched();
}

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-09-02  1:05       ` Paul E. McKenney
@ 2008-09-02  6:18         ` Andi Kleen
  0 siblings, 0 replies; 94+ messages in thread
From: Andi Kleen @ 2008-09-02  6:18 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Andi Kleen, linux-kernel, cl, mingo, akpm, manfred, dipankar,
	josht, schamp, niv, dvhltc, ego, laijs, rostedt, peterz

> > It would be better if you hung rcu_irq_enter in the irq_enter() if 
> > statement that checks if the task was idle or not. This way it would
> > be zero overhead for interruptions of non busy CPUs, keeping 

Sorry that should have been "non idle CPUs" of course.

> > it out of many fast paths.
> > 
> > Haven't read everything, sorry.
> 
> So that I lose the #else above, and so that irq_enter() and irq_exit()
> look something like the following (with additional adjustments to suit)?

Yes looks good.

BTW I wonder if the compiler CSEs the idle_cpu() check properly.

> void irq_enter(void)
> {
> #ifdef CONFIG_NO_HZ
> 	int cpu = smp_processor_id();
> 	if (idle_cpu(cpu) && !in_interrupt())
> 		tick_nohz_stop_idle(cpu);
> #endif
> 	__irq_enter();
> #ifdef CONFIG_NO_HZ
> 	if (idle_cpu(cpu)) {
> 		rcu_irq_enter();
> 		tick_nohz_update_jiffies();
> 	}
> #endif


-Andi

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-08-30 15:40         ` Peter Zijlstra
  2008-08-30 19:38           ` Paul E. McKenney
@ 2008-09-02 13:26           ` Mathieu Desnoyers
  2008-09-02 13:41             ` Peter Zijlstra
  1 sibling, 1 reply; 94+ messages in thread
From: Mathieu Desnoyers @ 2008-09-02 13:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, linux-kernel, cl, mingo, akpm, manfred, dipankar, josht,
	schamp, niv, dvhltc, ego, laijs, rostedt

* Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> On Sat, 2008-08-30 at 07:10 -0700, Paul E. McKenney wrote:
> > On Sat, Aug 30, 2008 at 11:33:00AM +0200, Peter Zijlstra wrote:
> > > On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:
> > > 
> > > > Some shortcomings:
> > > > 
> > > > o	Entering and leaving dynticks idle mode is a quiescent state,
> > > > 	but the current patch doesn't take advantage of this (noted
> > > > 	by Manfred).  It appears that it should be possible to make
> > > > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > > > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > > > 	out whether it is safe to tell RCU about the quiescent state --
> > > > 	and also greatly simplify the code.
> > > 
> > > > Already done and available in the -tip tree, courtesy of Mathieu.
> > 
> > Very cool!!!  I see one of his patches at http://lkml.org/lkml/2008/4/17/342,
> > but how do I find out which branch of -tip this is on?  (I am learning
> > git, but it is a slow process...)
> > 
> > This would also simplify preemptable RCU's dyntick interface, removing
> > the need for proofs.
> 
> Not sure - my git-foo isn't good enough either :-(
> 
> All I can offer is that its available in tip/master (the collective
> merge of all of tip's branches) as commit:
> 0d84b78a606f1562532cd576ee8733caf5a4aed3, which I found using
> git-annotate include/linux/hardirq.h
> 
> How to find from which particular topic branch it came from, I too am
> clueless.
> 

If you're interested in knowing the topic it came from : it's required
so a following patch can use a "popf; ret" instead of iret to return
from trap handlers executed in NMI context. There is an architectural
problem on x86 causing NMIs to be reactivated after the first iret
encountered, which leads to NMI handler races if nmi handlers trap. This
works around the problem by returning from the trap handlers without
using the iret instruction.

It's useful to Immediate Values which put a temporary breakpoint in the
instruction stream when proceeding to code modification and also useful
to LTTng (available in the -lttng tree) which writes tracing data to
vmap'd memory buffers (which can cause a minor page fault).

I'm glad to see NMI context detection is useful to others too !

Mathieu

> ---
> commit 0d84b78a606f1562532cd576ee8733caf5a4aed3
> Author: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
> Date:   Mon May 12 21:21:07 2008 +0200
> 
>     x86 NMI-safe INT3 and Page Fault
>     
>     Implements an alternative iret with popf and return so trap and exception
>     handlers can return to the NMI handler without issuing iret. iret would cause
>     NMIs to be reenabled prematurely. x86_32 uses popf and far return. x86_64 has to
>     copy the return instruction pointer to the top of the previous stack, issue a
>     popf, loads the previous esp and issue a near return (ret).
>     
>     It allows placing immediate values (and therefore optimized trace_marks) in NMI
>     code since returning from a breakpoint would be valid. Accessing vmalloc'd
>     memory, which allows executing module code or accessing vmapped or vmalloc'd
>     areas from NMI context, would also be valid. This is very useful to tracers like
>     LTTng.
>     
>     This patch makes all faults, traps and exception safe to be called from NMI
>     context *except* single-stepping, which requires iret to restore the TF (trap
>     flag) and jump to the return address in a single instruction. Sorry, no kprobes
>     support in NMI handlers because of this limitation.  We cannot single-step an
>     NMI handler, because iret must set the TF flag and return back to the
>     instruction to single-step in a single instruction. This cannot be emulated with
>     popf/lret, because lret would be single-stepped. It does not apply to immediate
>     values because they do not use single-stepping. This code detects if the TF
>     flag is set and uses the iret path for single-stepping, even if it reactivates
>     NMIs prematurely.
>     
>     Test to detect if nested under a NMI handler is only done upon the return from
>     trap/exception to kernel, which is not frequent. Other return paths (return from
>     trap/exception to userspace, return from interrupt) keep the exact same behavior
>     (no slowdown).
>     
>     Depends on :
>     change-alpha-active-count-bit.patch
>     change-avr32-active-count-bit.patch
>     
>     TODO : test with lguest, xen, kvm.
>     
>     ** This patch depends on the "Stringify support commas" patchset **
>     ** Also depends on fix-x86_64-page-fault-scheduler-race patch **
>     
>     tested on x86_32 (tests implemented in a separate patch) :
>     - instrumented the return path to export the EIP, CS and EFLAGS values when
>       taken so we know the return path code has been executed.
>     - trace_mark, using immediate values, with 10ms delay with the breakpoint
>       activated. Runs well through the return path.
>     - tested vmalloc faults in NMI handler by placing a non-optimized marker in the
>       NMI handler (so no breakpoint is executed) and connecting a probe which
>       touches every page of a 20MB vmalloc'd buffer. It executes through the return
>       path without problem.
>     - Tested with and without preemption
>     
>     tested on x86_64
>     - instrumented the return path to export the EIP, CS and EFLAGS values when
>       taken so we know the return path code has been executed.
>     - trace_mark, using immediate values, with 10ms delay with the breakpoint
>       activated. Runs well through the return path.
>     
>     To test on x86_64 :
>     - Test without preemption
>     - Test vmalloc faults
>     - Test on Intel 64 bits CPUs. (AMD64 was fine)
>     
>     Changelog since v1 :
>     - x86_64 fixes.
>     Changelog since v2 :
>     - fix paravirt build
>     Changelog since v3 :
>     - Include modifications suggested by Jeremy
>     Changelog since v4 :
>     - including hardirq.h in entry_32/64.S is a bad idea (non ifndef'd C code),
>       define HARDNMI_MASK in the .S files directly.
>     Changelog since v5 :
>     - Add HARDNMI_MASK to irq_count() and make die() more verbose for NMIs.
>     Changelog since v7 :
>     - Implement paravirtualized nmi_return.
>     Changelog since v8 :
>     - refreshed the patch for asm-offsets. Those were left out of v8.
>     - now depends on "Stringify support commas" patch.
>     Changelog since v9 :
>     - Only test the nmi nested preempt count flag upon return from exceptions, not
>       on return from interrupts. Only the kernel return path has this test.
>     - Add Xen, VMI, lguest support. Use their iret paravirt ops in lieu of
>       nmi_return.
>     
>     -- Ported to sched-devel.git
>     
>     Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
>     CC: akpm@osdl.org
>     CC: mingo@elte.hu
>     CC: "H. Peter Anvin" <hpa@zytor.com>
>     CC: Jeremy Fitzhardinge <jeremy@goop.org>
>     CC: Steven Rostedt <rostedt@goodmis.org>
>     CC: "Frank Ch. Eigler" <fche@redhat.com>
>     Signed-off-by: Ingo Molnar <mingo@elte.hu>
>     Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
> index 9258808..73474e0 100644
> --- a/arch/x86/kernel/asm-offsets_32.c
> +++ b/arch/x86/kernel/asm-offsets_32.c
> @@ -111,6 +111,7 @@ void foo(void)
>  	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
>  	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
>  	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
> +	OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
>  	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
>  	OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
>  #endif
> diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
> index f126c05..a5bbec3 100644
> --- a/arch/x86/kernel/asm-offsets_64.c
> +++ b/arch/x86/kernel/asm-offsets_64.c
> @@ -62,6 +62,7 @@ int main(void)
>  	OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
>  	OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
>  	OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
> +	OFFSET(PV_CPU_nmi_return, pv_cpu_ops, nmi_return);
>  	OFFSET(PV_CPU_irq_enable_syscall_ret, pv_cpu_ops, irq_enable_syscall_ret);
>  	OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
>  	OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index e6517ce..2d88211 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -68,6 +68,8 @@
>  
>  #define nr_syscalls ((syscall_table_size)/4)
>  
> +#define HARDNMI_MASK 0x40000000
> +
>  #ifdef CONFIG_PREEMPT
>  #define preempt_stop(clobbers)	DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
>  #else
> @@ -232,8 +234,32 @@ END(ret_from_fork)
>  	# userspace resumption stub bypassing syscall exit tracing
>  	ALIGN
>  	RING0_PTREGS_FRAME
> +
>  ret_from_exception:
>  	preempt_stop(CLBR_ANY)
> +	GET_THREAD_INFO(%ebp)
> +	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
> +	movb PT_CS(%esp), %al
> +	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
> +	cmpl $USER_RPL, %eax
> +	jae resume_userspace	# returning to v8086 or userspace
> +	testl $HARDNMI_MASK,TI_preempt_count(%ebp)
> +	jz resume_kernel		/* Not nested over NMI ? */
> +	testw $X86_EFLAGS_TF, PT_EFLAGS(%esp)
> +	jnz resume_kernel		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	TRACE_IRQS_IRET
> +	RESTORE_REGS
> +	addl $4, %esp			# skip orig_eax/error_code
> +	CFI_ADJUST_CFA_OFFSET -4
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  ret_from_intr:
>  	GET_THREAD_INFO(%ebp)
>  check_userspace:
> @@ -873,6 +899,10 @@ ENTRY(native_iret)
>  .previous
>  END(native_iret)
>  
> +ENTRY(native_nmi_return)
> +	NATIVE_INTERRUPT_RETURN_NMI_SAFE # Should we deal with popf exception ?
> +END(native_nmi_return)
> +
>  ENTRY(native_irq_enable_syscall_ret)
>  	sti
>  	sysexit
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index fe25e5f..5f8edc7 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -156,6 +156,8 @@ END(mcount)
>  #endif /* CONFIG_DYNAMIC_FTRACE */
>  #endif /* CONFIG_FTRACE */
>  
> +#define HARDNMI_MASK 0x40000000
> +
>  #ifndef CONFIG_PREEMPT
>  #define retint_kernel retint_restore_args
>  #endif	
> @@ -698,6 +700,9 @@ ENTRY(native_iret)
>  	.section __ex_table,"a"
>  	.quad native_iret, bad_iret
>  	.previous
> +
> +ENTRY(native_nmi_return)
> +	NATIVE_INTERRUPT_RETURN_NMI_SAFE
>  #endif
>  
>  	.section .fixup,"ax"
> @@ -753,6 +758,23 @@ retint_signal:
>  	GET_THREAD_INFO(%rcx)
>  	jmp retint_check
>  
> +	/* Returning to kernel space from exception. */
> +	/* rcx:	 threadinfo. interrupts off. */
> +ENTRY(retexc_kernel)
> +	testl $HARDNMI_MASK,threadinfo_preempt_count(%rcx)
> +	jz retint_kernel		/* Not nested over NMI ? */
> +	testw $X86_EFLAGS_TF,EFLAGS-ARGOFFSET(%rsp)	/* trap flag? */
> +	jnz retint_kernel		/*
> +					 * If single-stepping an NMI handler,
> +					 * use the normal iret path instead of
> +					 * the popf/lret because lret would be
> +					 * single-stepped. It should not
> +					 * happen : it will reactivate NMIs
> +					 * prematurely.
> +					 */
> +	RESTORE_ARGS 0,8,0
> +	INTERRUPT_RETURN_NMI_SAFE
> +
>  #ifdef CONFIG_PREEMPT
>  	/* Returning to kernel space. Check if we need preemption */
>  	/* rcx:	 threadinfo. interrupts off. */
> @@ -911,9 +933,17 @@ paranoid_swapgs\trace:
>  	TRACE_IRQS_IRETQ 0
>  	.endif
>  	SWAPGS_UNSAFE_STACK
> -paranoid_restore\trace:
> +paranoid_restore_no_nmi\trace:
>  	RESTORE_ALL 8
>  	jmp irq_return
> +paranoid_restore\trace:
> +	GET_THREAD_INFO(%rcx)
> +	testl $HARDNMI_MASK,threadinfo_preempt_count(%rcx)
> +	jz paranoid_restore_no_nmi\trace	/* Not nested over NMI ? */
> +	testw $X86_EFLAGS_TF,EFLAGS-0(%rsp)	/* trap flag? */
> +	jnz paranoid_restore_no_nmi\trace
> +	RESTORE_ALL 8
> +	INTERRUPT_RETURN_NMI_SAFE
>  paranoid_userspace\trace:
>  	GET_THREAD_INFO(%rcx)
>  	movl threadinfo_flags(%rcx),%ebx
> @@ -1012,7 +1042,7 @@ error_exit:
>  	TRACE_IRQS_OFF
>  	GET_THREAD_INFO(%rcx)	
>  	testl %eax,%eax
> -	jne  retint_kernel
> +	jne  retexc_kernel
>  	LOCKDEP_SYS_EXIT_IRQ
>  	movl  threadinfo_flags(%rcx),%edx
>  	movl  $_TIF_WORK_MASK,%edi
> diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
> index 74f0c5e..bb174a8 100644
> --- a/arch/x86/kernel/paravirt.c
> +++ b/arch/x86/kernel/paravirt.c
> @@ -139,6 +139,7 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
>  		/* If the operation is a nop, then nop the callsite */
>  		ret = paravirt_patch_nop();
>  	else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
> +		 type == PARAVIRT_PATCH(pv_cpu_ops.nmi_return) ||
>  		 type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret))
>  		/* If operation requires a jmp, then jmp */
>  		ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len);
> @@ -190,6 +191,7 @@ static void native_flush_tlb_single(unsigned long addr)
>  
>  /* These are in entry.S */
>  extern void native_iret(void);
> +extern void native_nmi_return(void);
>  extern void native_irq_enable_syscall_ret(void);
>  
>  static int __init print_banner(void)
> @@ -328,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = {
>  
>  	.irq_enable_syscall_ret = native_irq_enable_syscall_ret,
>  	.iret = native_iret,
> +	.nmi_return = native_nmi_return,
>  	.swapgs = native_swapgs,
>  
>  	.set_iopl_mask = native_set_iopl_mask,
> diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
> index 82fc5fc..8ed31c7 100644
> --- a/arch/x86/kernel/paravirt_patch_32.c
> +++ b/arch/x86/kernel/paravirt_patch_32.c
> @@ -1,10 +1,13 @@
> -#include <asm/paravirt.h>
> +#include <linux/stringify.h>
> +#include <linux/irqflags.h>
>  
>  DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
>  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
>  DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf");
>  DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax");
>  DEF_NATIVE(pv_cpu_ops, iret, "iret");
> +DEF_NATIVE(pv_cpu_ops, nmi_return,
> +	__stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
>  DEF_NATIVE(pv_cpu_ops, irq_enable_syscall_ret, "sti; sysexit");
>  DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
>  DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
> @@ -29,6 +32,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
>  		PATCH_SITE(pv_irq_ops, restore_fl);
>  		PATCH_SITE(pv_irq_ops, save_fl);
>  		PATCH_SITE(pv_cpu_ops, iret);
> +		PATCH_SITE(pv_cpu_ops, nmi_return);
>  		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
>  		PATCH_SITE(pv_mmu_ops, read_cr2);
>  		PATCH_SITE(pv_mmu_ops, read_cr3);
> diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
> index 7d904e1..56eccea 100644
> --- a/arch/x86/kernel/paravirt_patch_64.c
> +++ b/arch/x86/kernel/paravirt_patch_64.c
> @@ -1,12 +1,15 @@
> +#include <linux/irqflags.h>
> +#include <linux/stringify.h>
>  #include <asm/paravirt.h>
>  #include <asm/asm-offsets.h>
> -#include <linux/stringify.h>
>  
>  DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
>  DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
>  DEF_NATIVE(pv_irq_ops, restore_fl, "pushq %rdi; popfq");
>  DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
>  DEF_NATIVE(pv_cpu_ops, iret, "iretq");
> +DEF_NATIVE(pv_cpu_ops, nmi_return,
> +	__stringify(NATIVE_INTERRUPT_RETURN_NMI_SAFE));
>  DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
>  DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
>  DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
> @@ -35,6 +38,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
>  		PATCH_SITE(pv_irq_ops, irq_enable);
>  		PATCH_SITE(pv_irq_ops, irq_disable);
>  		PATCH_SITE(pv_cpu_ops, iret);
> +		PATCH_SITE(pv_cpu_ops, nmi_return);
>  		PATCH_SITE(pv_cpu_ops, irq_enable_syscall_ret);
>  		PATCH_SITE(pv_cpu_ops, swapgs);
>  		PATCH_SITE(pv_mmu_ops, read_cr2);
> diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c
> index bde6f63..f3a59cd 100644
> --- a/arch/x86/kernel/traps_32.c
> +++ b/arch/x86/kernel/traps_32.c
> @@ -475,6 +475,9 @@ void die(const char *str, struct pt_regs *regs, long err)
>  	if (kexec_should_crash(current))
>  		crash_kexec(regs);
>  
> +	if (in_nmi())
> +		panic("Fatal exception in non-maskable interrupt");
> +
>  	if (in_interrupt())
>  		panic("Fatal exception in interrupt");
>  
> diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
> index adff76e..3dacb75 100644
> --- a/arch/x86/kernel/traps_64.c
> +++ b/arch/x86/kernel/traps_64.c
> @@ -555,6 +555,10 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
>  		oops_exit();
>  		return;
>  	}
> +	if (in_nmi())
> +		panic("Fatal exception in non-maskable interrupt");
> +	if (in_interrupt())
> +		panic("Fatal exception in interrupt");
>  	if (panic_on_oops)
>  		panic("Fatal exception");
>  	oops_exit();
> diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
> index 956f389..01d687d 100644
> --- a/arch/x86/kernel/vmi_32.c
> +++ b/arch/x86/kernel/vmi_32.c
> @@ -151,6 +151,8 @@ static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
>  					      insns, ip);
>  		case PARAVIRT_PATCH(pv_cpu_ops.iret):
>  			return patch_internal(VMI_CALL_IRET, len, insns, ip);
> +		case PARAVIRT_PATCH(pv_cpu_ops.nmi_return):
> +			return patch_internal(VMI_CALL_IRET, len, insns, ip);
>  		case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_syscall_ret):
>  			return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
>  		default:
> diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
> index af65b2d..f5cbb74 100644
> --- a/arch/x86/lguest/boot.c
> +++ b/arch/x86/lguest/boot.c
> @@ -958,6 +958,7 @@ __init void lguest_init(void)
>  	pv_cpu_ops.cpuid = lguest_cpuid;
>  	pv_cpu_ops.load_idt = lguest_load_idt;
>  	pv_cpu_ops.iret = lguest_iret;
> +	pv_cpu_ops.nmi_return = lguest_iret;
>  	pv_cpu_ops.load_sp0 = lguest_load_sp0;
>  	pv_cpu_ops.load_tr_desc = lguest_load_tr_desc;
>  	pv_cpu_ops.set_ldt = lguest_set_ldt;
> diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
> index c8a56e4..33272ce 100644
> --- a/arch/x86/xen/enlighten.c
> +++ b/arch/x86/xen/enlighten.c
> @@ -1008,6 +1008,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
>  	.read_pmc = native_read_pmc,
>  
>  	.iret = xen_iret,
> +	.nmi_return = xen_iret,
>  	.irq_enable_syscall_ret = xen_sysexit,
>  
>  	.load_tr_desc = paravirt_nop,
> diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h
> index 24d71b1..c3009fd 100644
> --- a/include/asm-x86/irqflags.h
> +++ b/include/asm-x86/irqflags.h
> @@ -51,6 +51,61 @@ static inline void native_halt(void)
>  
>  #endif
>  
> +#ifdef CONFIG_X86_64
> +/*
> + * Only returns from a trap or exception to a NMI context (intra-privilege
> + * level near return) to the same SS and CS segments. Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued. It takes care of modifying the eflags, rsp and returning to the
> + * previous function.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(rsp)  RIP
> + * 8(rsp)  CS
> + * 16(rsp) EFLAGS
> + * 24(rsp) RSP
> + * 32(rsp) SS
> + *
> + * Upon execution :
> + * Copy EIP to the top of the return stack
> + * Update top of return stack address
> + * Pop eflags into the eflags register
> + * Make the return stack current
> + * Near return (popping the return address from the return stack)
> + */
> +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE	pushq %rax;		\
> +						movq %rsp, %rax;	\
> +						movq 24+8(%rax), %rsp;	\
> +						pushq 0+8(%rax);	\
> +						pushq 16+8(%rax);	\
> +						movq (%rax), %rax;	\
> +						popfq;			\
> +						ret
> +#else
> +/*
> + * Protected mode only, no V8086. Implies that protected mode must
> + * be entered before NMIs or MCEs are enabled. Only returns from a trap or
> + * exception to a NMI context (intra-privilege level far return). Should be used
> + * upon trap or exception return when nested over a NMI context so no iret is
> + * issued.
> + *
> + * The stack, at that point, looks like :
> + *
> + * 0(esp) EIP
> + * 4(esp) CS
> + * 8(esp) EFLAGS
> + *
> + * Upon execution :
> + * Copy the stack eflags to top of stack
> + * Pop eflags into the eflags register
> + * Far return: pop EIP and CS into their register, and additionally pop EFLAGS.
> + */
> +#define NATIVE_INTERRUPT_RETURN_NMI_SAFE	pushl 8(%esp);	\
> +						popfl;		\
> +						lret $4
> +#endif
> +
>  #ifdef CONFIG_PARAVIRT
>  #include <asm/paravirt.h>
>  #else
> @@ -109,6 +164,7 @@ static inline unsigned long __raw_local_irq_save(void)
>  
>  #define ENABLE_INTERRUPTS(x)	sti
>  #define DISABLE_INTERRUPTS(x)	cli
> +#define INTERRUPT_RETURN_NMI_SAFE	NATIVE_INTERRUPT_RETURN_NMI_SAFE
>  
>  #ifdef CONFIG_X86_64
>  #define INTERRUPT_RETURN	iretq
> diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
> index 0f13b94..d5087e0 100644
> --- a/include/asm-x86/paravirt.h
> +++ b/include/asm-x86/paravirt.h
> @@ -141,9 +141,10 @@ struct pv_cpu_ops {
>  	u64 (*read_pmc)(int counter);
>  	unsigned long long (*read_tscp)(unsigned int *aux);
>  
> -	/* These two are jmp to, not actually called. */
> +	/* These three are jmp to, not actually called. */
>  	void (*irq_enable_syscall_ret)(void);
>  	void (*iret)(void);
> +	void (*nmi_return)(void);
>  
>  	void (*swapgs)(void);
>  
> @@ -1385,6 +1386,10 @@ static inline unsigned long __raw_local_irq_save(void)
>  	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE,	\
>  		  jmp *%cs:pv_cpu_ops+PV_CPU_iret)
>  
> +#define INTERRUPT_RETURN_NMI_SAFE					\
> +	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_nmi_return), CLBR_NONE,	\
> +		  jmp *%cs:pv_cpu_ops+PV_CPU_nmi_return)
> +
>  #define DISABLE_INTERRUPTS(clobbers)					\
>  	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
>  		  PV_SAVE_REGS;			\
> diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
> index 181006c..b39f49d 100644
> --- a/include/linux/hardirq.h
> +++ b/include/linux/hardirq.h
> @@ -22,10 +22,13 @@
>   * PREEMPT_MASK: 0x000000ff
>   * SOFTIRQ_MASK: 0x0000ff00
>   * HARDIRQ_MASK: 0x0fff0000
> + * HARDNMI_MASK: 0x40000000
>   */
>  #define PREEMPT_BITS	8
>  #define SOFTIRQ_BITS	8
>  
> +#define HARDNMI_BITS	1
> +
>  #ifndef HARDIRQ_BITS
>  #define HARDIRQ_BITS	12
>  
> @@ -45,16 +48,19 @@
>  #define PREEMPT_SHIFT	0
>  #define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
>  #define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
> +#define HARDNMI_SHIFT	(30)
>  
>  #define __IRQ_MASK(x)	((1UL << (x))-1)
>  
>  #define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
>  #define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
>  #define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
> +#define HARDNMI_MASK	(__IRQ_MASK(HARDNMI_BITS) << HARDNMI_SHIFT)
>  
>  #define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
>  #define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
>  #define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
> +#define HARDNMI_OFFSET	(1UL << HARDNMI_SHIFT)
>  
>  #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
>  #error PREEMPT_ACTIVE is too low!
> @@ -62,7 +68,9 @@
>  
>  #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
>  #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
> -#define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define irq_count() \
> +	(preempt_count() & (HARDNMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK))
> +#define hardnmi_count()	(preempt_count() & HARDNMI_MASK)
>  
>  /*
>   * Are we doing bottom half or hardware interrupt processing?
> @@ -71,6 +79,7 @@
>  #define in_irq()		(hardirq_count())
>  #define in_softirq()		(softirq_count())
>  #define in_interrupt()		(irq_count())
> +#define in_nmi()		(hardnmi_count())
>  
>  #if defined(CONFIG_PREEMPT)
>  # define PREEMPT_INATOMIC_BASE kernel_locked()
> @@ -161,7 +170,19 @@ extern void irq_enter(void);
>   */
>  extern void irq_exit(void);
>  
> -#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
> -#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
> +#define nmi_enter()					\
> +	do {						\
> +		lockdep_off();				\
> +		BUG_ON(hardnmi_count());		\
> +		add_preempt_count(HARDNMI_OFFSET);	\
> +		__irq_enter();				\
> +	} while (0)
> +
> +#define nmi_exit()					\
> +	do {						\
> +		__irq_exit();				\
> +		sub_preempt_count(HARDNMI_OFFSET);	\
> +		lockdep_on();				\
> +	} while (0)
>  
>  #endif /* LINUX_HARDIRQ_H */
> 
> 

-- 
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-09-02 13:26           ` Mathieu Desnoyers
@ 2008-09-02 13:41             ` Peter Zijlstra
  2008-09-02 14:55               ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Peter Zijlstra @ 2008-09-02 13:41 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: paulmck, linux-kernel, cl, mingo, akpm, manfred, dipankar, josht,
	schamp, niv, dvhltc, ego, laijs, rostedt

On Tue, 2008-09-02 at 09:26 -0400, Mathieu Desnoyers wrote:
> * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > On Sat, 2008-08-30 at 07:10 -0700, Paul E. McKenney wrote:
> > > On Sat, Aug 30, 2008 at 11:33:00AM +0200, Peter Zijlstra wrote:
> > > > On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:
> > > > 
> > > > > Some shortcomings:
> > > > > 
> > > > > o	Entering and leaving dynticks idle mode is a quiescent state,
> > > > > 	but the current patch doesn't take advantage of this (noted
> > > > > 	by Manfred).  It appears that it should be possible to make
> > > > > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > > > > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > > > > 	out whether it is safe to tell RCU about the quiescent state --
> > > > > 	and also greatly simplify the code.
> > > > 
> > > > Already done and available in the -tip tree, curtesy of Mathieu.
> > > 
> > > Very cool!!!  I see one of his patches at http://lkml.org/lkml/2008/4/17/342,
> > > but how do I find out which branch of -tip this is on?  (I am learning
> > > git, but it is a slow process...)
> > > 
> > > This would also simplify preemptable RCU's dyntick interface, removing
> > > the need for proofs.
> > 
> > Not sure - my git-foo isn't good enough either :-(
> > 
> > All I can offer is that its available in tip/master (the collective
> > merge of all of tip's branches) as commit:
> > 0d84b78a606f1562532cd576ee8733caf5a4aed3, which I found using
> > git-annotate include/linux/hardirq.h
> > 
> > How to find from which particular topic branch it came from, I too am
> > clueless.
> > 
> 
> If you're interested in knowing the topic it came from : it's required
> so a following patch can use a "popf; ret" instead of iret to return
> from trap handlers executed in NMI context. There is an architectural
> problem on x86 causing NMIs to be reactivated after the first iret
> encountered, which leads to NMI handler races if nmi handlers trap. This
> works around the problem by returning from the trap handlers without
> using the iret instruction.
> 
> It's useful to Immediate Values which put a temporary breakpoint in the
> instruction stream when proceeding to code modification and also useful
> to LTTng (available in the -lttng tree) which writes tracing data to
> vmap'd memory buffers (which can cause a minor page fault).
> 
> I'm glad to see NMI context detection is useful to others too !

While an interesting detail, it's not the answer to the question.

Given a bunch of topic branches, and a branch that has all those topics
merged, how, for any particular commit from the merge branch, do you
find which topic branch it originated from?

IOW, the answer to the above question would have been a series of git
commands that would have resulted in something like tip/tracing/nmisafe




^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC, tip/core/rcu] v3 scalable classic RCU implementation
  2008-09-02 13:41             ` Peter Zijlstra
@ 2008-09-02 14:55               ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-02 14:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, linux-kernel, cl, mingo, akpm, manfred,
	dipankar, josht, schamp, niv, dvhltc, ego, laijs, rostedt

On Tue, Sep 02, 2008 at 03:41:54PM +0200, Peter Zijlstra wrote:
> On Tue, 2008-09-02 at 09:26 -0400, Mathieu Desnoyers wrote:
> > * Peter Zijlstra (a.p.zijlstra@chello.nl) wrote:
> > > On Sat, 2008-08-30 at 07:10 -0700, Paul E. McKenney wrote:
> > > > On Sat, Aug 30, 2008 at 11:33:00AM +0200, Peter Zijlstra wrote:
> > > > > On Fri, 2008-08-29 at 17:49 -0700, Paul E. McKenney wrote:
> > > > > 
> > > > > > Some shortcomings:
> > > > > > 
> > > > > > o	Entering and leaving dynticks idle mode is a quiescent state,
> > > > > > 	but the current patch doesn't take advantage of this (noted
> > > > > > 	by Manfred).  It appears that it should be possible to make
> > > > > > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > > > > > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > > > > > 	out whether it is safe to tell RCU about the quiescent state --
> > > > > > 	and also greatly simplify the code.
> > > > > 
> > > > > Already done and available in the -tip tree, curtesy of Mathieu.
> > > > 
> > > > Very cool!!!  I see one of his patches at http://lkml.org/lkml/2008/4/17/342,
> > > > but how do I find out which branch of -tip this is on?  (I am learning
> > > > git, but it is a slow process...)
> > > > 
> > > > This would also simplify preemptable RCU's dyntick interface, removing
> > > > the need for proofs.
> > > 
> > > Not sure - my git-foo isn't good enough either :-(
> > > 
> > > All I can offer is that its available in tip/master (the collective
> > > merge of all of tip's branches) as commit:
> > > 0d84b78a606f1562532cd576ee8733caf5a4aed3, which I found using
> > > git-annotate include/linux/hardirq.h
> > > 
> > > How to find from which particular topic branch it came from, I too am
> > > clueless.
> > > 
> > 
> > If you're interested in knowing the topic it came from : it's required
> > so a following patch can use a "popf; ret" instead of iret to return
> > from trap handlers executed in NMI context. There is an architectural
> > problem on x86 causing NMIs to be reactivated after the first iret
> > encountered, which leads to NMI handler races if nmi handlers trap. This
> > works around the problem by returning from the trap handlers without
> > using the iret instruction.
> > 
> > It's useful to Immediate Values which put a temporary breakpoint in the
> > instruction stream when proceeding to code modification and also useful
> > to LTTng (available in the -lttng tree) which writes tracing data to
> > vmap'd memory buffers (which can cause a minor page fault).
> > 
> > I'm glad to see NMI context detection is useful to others too !
> 
> While an interesting detail, its not the answer to the question.
> 
> Given a bunch of topic branches, and a branch that has all those topic
> merged, how, for any particular commit from the merge branch, do you
> find from which topic branch it originiated?
> 
> IOW, the answer to the above question would have been a series of git
> commands that would have resulted in something like tip/tracing/nmisafe

I guess that it turned out that there was a series of mutt commands that
eventually got the answer.  That said, a series of git commands would
be quite nice.  ;-)

But it would appear that the series of git commands would need to come
from someone with better git-foo than either Mathieu or myself.  :-/

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [PATCH, RFC] v4 scalable classic RCU implementation
  2008-08-30  0:49   ` [PATCH, RFC, tip/core/rcu] v3 " Paul E. McKenney
                       ` (2 preceding siblings ...)
  2008-09-01  9:38     ` Andi Kleen
@ 2008-09-05 15:29     ` Paul E. McKenney
  2008-09-05 19:33       ` Andrew Morton
                         ` (3 more replies)
  3 siblings, 4 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-05 15:29 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt, peterz, penberg, andi

Hello!

Still experimental, not for inclusion.  But ready for serious experimental
use, in particular, experience on an actual >1000-CPU machine would be
most welcome.

Updates from v3:

o	The hierarchical-RCU implementation has been moved to its own
	"rcutree" set of files.  This allows configuring three different
	implementations of RCU (CLASSIC_RCU, PREEMPT_RCU, and the new
	TREE_RCU).  More importantly, it enables easy application of
	this patch to a wide variety of Linux versions.

	I hope that this implementation can completely replace Classic
	RCU, but in the meantime, this split makes for easier testing
	and review.

o	The stalled-CPU detection is now implemented and working,
	enabled by the CONFIG_RCU_CPU_STALL config parameter.  Complaints
	are printk()ed 3 seconds into the stall, and every 30 seconds
	thereafter.  It also now attempts to force quiescent states.

o	The algorithm uses pre-fabricated masks rather than shifting
	on each access.

o	Review comments have been applied (thank you all!!!).
	For but one example, call_rcu() and call_rcu_bh() are now
	one-liners.

o	The rcu_pending() and rcu_needs_cpu() primitives are now
	much more aggressive about permitting CPUs to enter dynticks
	idle mode.  Only CPUs that have RCU callbacks are kept out
	of dynticks idle mode.

Attached is an updated patch to Classic RCU that applies a
hierarchy, greatly reducing the contention on the top-level lock
for large machines.  This passes 10-hour concurrent rcutorture and
online-offline testing on 128-CPU ppc64.  It is OK for experimental
work assuming only modestly brave experimenters (and perhaps even
cowardly experimenters), but not yet ready for inclusion.  See also
Manfred Spraul's recent patches (or his earlier work from 2004 at
http://marc.info/?l=linux-kernel&m=108546384711797&w=2).  We will
converge onto a common patch in the fullness of time, but are currently
exploring different regions of the design space.  That said, I have
already gratefully stolen a number of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 32,768 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	Entering and leaving dynticks idle mode is a quiescent state,
	but the current patch doesn't take advantage of this (noted
	by Manfred).  It appears that it should be possible to make
	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
	out whether it is safe to tell RCU about the quiescent state --
	and also greatly simplify the code.  However, a first attempt
	to hack this into existence failed, so will be taking a more
	measured approach.

o	There are a few places where grace periods are unnecessarily
	delayed.

o	There are probably hangs, rcutorture failures, &c.  In particular,
	the case where an interrupt from dynticks idle invokes call_rcu()
	requires a bit more thought.  And it requires NMIs to be sorted
	as noted above.

o	There are a few architectures that will sometimes execute irq
	handlers on CPUs that are already marked offline.  This is the
	subject of separate patches.  (Yes, you do have to have a very
	unlikely code construct hitting an unlikely sequence of events
	for anything bad to happen, but still needs to be fixed.)

o	Structure field layout is likely highly suboptimal.  On the other
	hand, given that the read-side primitives do not touch any of
	this data, this issue is not as pressing as it might otherwise be.

o	There is not yet a human-readable design document.  Will be fixed.

To build, start with 2.6.27-rc3, and apply:

	http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-treeRCU-2.patch

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 
 include/linux/hardirq.h  |    4 
 include/linux/rcupdate.h |    6 
 include/linux/rcutree.h  |  334 ++++++++++
 init/Kconfig             |   15 
 kernel/Kconfig.preempt   |   70 ++
 kernel/Makefile          |    6 
 kernel/rcutree.c         | 1474 +++++++++++++++++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c   |  231 +++++++
 lib/Kconfig.debug        |   13 
 9 files changed, 2135 insertions(+), 18 deletions(-)

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..a776bf0 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,13 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+#endif /* #if defined(CONFIG_NO_HZ) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e8b4039..cffec3e 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -54,8 +54,12 @@ struct rcu_head {
 
 #ifdef CONFIG_CLASSIC_RCU
 #include <linux/rcuclassic.h>
-#else /* #ifdef CONFIG_CLASSIC_RCU */
+#elif CONFIG_TREE_RCU
+#include <linux/rcutree.h>
+#elif CONFIG_PREEMPT_RCU
 #include <linux/rcupreempt.h>
+#else
+#error "Unknown RCU implementation specified to kernel configuration"
 #endif /* #else #ifdef CONFIG_CLASSIC_RCU */
 
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 0000000..4213009
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,334 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+
+#ifndef __LINUX_RCUTREE_H
+#define __LINUX_RCUTREE_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if (NR_CPUS) <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+	/* 3) rcu-barrier functions */
+	struct rcu_head barrier;
+
+#ifdef CONFIG_NO_HZ
+	/* 4) dynticks interface (see http://lwn.net/Articles/279077/) */
+	int dynticks_nesting;		/* Track nesting level, sort of. */
+	int dynticks;			/* Even for dynticks-idle mode. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 5) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_SAVE_DYNTICK	0	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		1	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL
+#define RCU_SECONDS_TILL_STALL_CHECK	 3	/* for rsp->seconds_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	30	/* for rsp->seconds_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* sent GP-kick IPIs? */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long seconds_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+#define __rcu_read_lock() \
+	do { \
+		preempt_disable(); \
+		__acquire(RCU); \
+		rcu_read_acquire(); \
+	} while (0)
+#define __rcu_read_unlock() \
+	do { \
+		rcu_read_release(); \
+		__release(RCU); \
+		preempt_enable(); \
+	} while (0)
+#define __rcu_read_lock_bh() \
+	do { \
+		local_bh_disable(); \
+		__acquire(RCU_BH); \
+		rcu_read_acquire(); \
+	} while (0)
+#define __rcu_read_unlock_bh() \
+	do { \
+		rcu_read_release(); \
+		__release(RCU_BH); \
+		local_bh_enable(); \
+	} while (0)
+
+#define __synchronize_sched() synchronize_rcu()
+
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
+extern void __rcu_init(void);
+#define rcu_init_sched()	do { } while (0)
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+#ifdef CONFIG_NO_HZ
+
+/*
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ *
+ * @@@ note quiescent state???
+ */
+static inline void rcu_enter_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	__get_cpu_var(rcu_data).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_data).dynticks & 0x1, &rs);
+	__get_cpu_var(rcu_bh_data).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_bh_data).dynticks & 0x1, &rs);
+}
+
+/*
+ * Exit nohz mode.
+ */
+static inline void rcu_exit_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	__get_cpu_var(rcu_data).dynticks++;
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_data).dynticks & 0x1), &rs);
+	__get_cpu_var(rcu_bh_data).dynticks++;
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_bh_data).dynticks & 0x1), &rs);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+#else /* CONFIG_NO_HZ */
+#define rcu_enter_nohz()	do { } while (0)
+#define rcu_exit_nohz()		do { } while (0)
+#endif /* CONFIG_NO_HZ */
+
+#endif /* __LINUX_RCUTREE_H */
diff --git a/init/Kconfig b/init/Kconfig
index b678803..82ee8f7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -914,10 +914,11 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
-config CLASSIC_RCU
-	def_bool !PREEMPT_RCU
-	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.  Classic RCU is the default.  Note that the
-	  PREEMPT_RCU symbol is used to select/deselect this option.
+config RCU_TRACE
+	def_bool TREE_RCU_TRACE || PREEMPT_RCU_TRACE
+	select DEBUG_FS
+	help
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.  Note that
+	  either TREE_RCU_TRACE or PREEMPT_RCU_TRACE is used to
+	  select/deselect this option.
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03..86f7fb6 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,29 @@ config PREEMPT
 
 endchoice
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+	  
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based Hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP systems with hundreds or
+	  thousands of CPUs.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
-	default n
 	help
 	  This option reduces the latency of the kernel by making certain
 	  RCU sections preemptible. Normally RCU code is non-preemptible, if
@@ -64,16 +83,57 @@ config PREEMPT_RCU
 	  now-naive assumptions about each RCU read-side critical section
 	  remaining on a given CPU through its execution.
 
+endchoice
+
+config TREE_RCU_TRACE
+	bool "Enable tracing for tree-based hierarchical RCU"
+	depends on TREE_RCU
+	help
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.
+
+	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
 
-config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
+config PREEMPT_RCU_TRACE
+	bool "Enable tracing for preemptable RCU"
 	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based Hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
+
+	
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df..101e880 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 0000000..eb10394
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1474 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
+
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
+	/*
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
+	 */
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
+	}
+
+	/* The CPU is online, so send it a reschedule IPI. */
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#ifdef CONFIG_NO_HZ
+
+/*
+ * Helper function for rcu_irq_enter().
+ */
+void __rcu_irq_enter(struct rcu_data *rdp)
+{
+	if (rdp->dynticks_nesting)
+		rdp->dynticks_nesting++;
+
+	/*
+	 * Only update if we are coming from a stopped ticks mode
+	 * (rdp->dynticks is even).
+	 */
+	if (!in_interrupt() &&
+	    (rdp->dynticks & 0x1) == 0) {
+		/*
+		 * The following might seem like we could have a race
+		 * with NMI/SMIs. But this really isn't a problem.
+		 * Here we do a read/modify/write, and the race happens
+		 * when an NMI/SMI comes in after the read and before
+		 * the write. But NMI/SMIs will increment this counter
+		 * twice before returning, so the zero bit will not
+		 * be corrupted by the NMI/SMI which is the most important
+		 * part.
+		 *
+		 * The only thing is that we would bring back the counter
+		 * to a position that it was in during the NMI/SMI.
+		 * But the zero bit would be set, so the rest of the
+		 * counter would again be ignored.
+		 *
+		 * On return from the IRQ, the counter may have the zero
+		 * bit be 0 and the counter the same as the return from
+		 * the NMI/SMI. If the state machine was so unlucky to
+		 * see that, it still doesn't matter, since all
+		 * RCU read-side critical sections on this CPU would
+		 * have already completed.
+		 */
+		rdp->dynticks++;
+		/*
+		 * The following memory barrier ensures that any RCU
+		 * read-side critical sections in the irq handler are
+		 * seen by other CPUs to follow the above increment to
+		 * rdp->dynticks. This is required in order for other CPUs
+		 * to correctly determine when it is safe to advance the
+		 * RCU grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		/*
+		 * Since we can't determine the dynamic tick mode from
+		 * the rdp->dynticks after this routine, we use a second
+		 * flag to acknowledge that we came from an idle state
+		 * with ticks stopped.
+		 */
+		rdp->dynticks_nesting++;
+		/*
+		 * If we take an NMI/SMI now, they will also increment
+		 * the dynticks_nesting counter, and will not update the
+		 * rdp->dynticks on exit. That is for this IRQ to do.
+		 */
+	}
+}
+
+/**
+ * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	__rcu_irq_enter(&__get_cpu_var(rcu_data));
+	__rcu_irq_enter(&__get_cpu_var(rcu_bh_data));
+}
+
+/*
+ * Helper function for rcu_irq_exit().
+ */
+static void __rcu_irq_exit(struct rcu_data *rdp)
+{
+	/*
+	 * rdp->dynticks_nesting is set if we interrupted the CPU
+	 * when it was idle with ticks stopped.
+	 * Once this occurs, we keep track of interrupt nesting
+	 * because a NMI/SMI could also come in, and we still
+	 * only want the IRQ that started the increment of the
+	 * rdp->dynticks to be the one that modifies it on exit.
+	 */
+	if (rdp->dynticks_nesting) {
+		if (--rdp->dynticks_nesting)
+			return;
+
+		/* This must match the interrupt nesting */
+		WARN_ON(in_interrupt());
+
+		/*
+		 * If an NMI/SMI happens now we are still
+		 * protected by the rdp->dynticks being odd.
+		 */
+
+		/*
+		 * The following memory barrier ensures that any
+		 * rcu_read_unlock() primitives in the irq handler
+		 * are seen by other CPUs to precede the following
+		 * increment to rdp->dynticks. This is required in
+		 * order for other CPUs to determine when it is safe
+		 * to advance the RCU grace-period state machine.
+		 */
+		smp_mb(); /* see above block comment. */
+		rdp->dynticks++;
+		WARN_ON(rdp->dynticks & 0x1);
+	}
+}
+
+/**
+ * rcu_irq_exit - Called from exiting Hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update the rdp->dynticks
+ * to let the RCU handling be aware that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	__rcu_irq_exit(&__get_cpu_var(rcu_data));
+	__rcu_irq_exit(&__get_cpu_var(rcu_bh_data));
+}
+
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations.  Specify "rsp->completed - 1" to
+ * unconditionally invalidate any future dynticks manipulations (which is
+ * useful at the beginning of a grace period).  Takes a long, as callers do.
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	int ret;
+	int snap;
+
+	snap = rdp->dynticks;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	ret = (snap & 0x1) == 0;
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through a dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	long curr;
+	long snap;
+
+	curr = rdp->dynticks;
+	snap = rdp->dynticks_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr - snap) >= 2 || (curr & 0x1) == 0) {
+		rdp->dynticks_fqs++;
+		return 1;
+	}
+
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+static void dyntick_record_completed(struct rcu_state *rsp, long comp) { }
+
+/*
+ * If there are no dynticks, then the only way that a CPU can passively
+ * be in a quiescent state is to be offline.  Unlike dynticks idle, which
+ * is a point in time during the prior (already finished) grace period,
+ * an offline CPU is always in a quiescent state, and thus can be
+ * unconditionally applied.  So just return the current value of completed.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->completed;
+}
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; }
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#ifdef CONFIG_RCU_CPU_STALL
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+	rsp->gp_start = jiffies;
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	/*
+	 * Complain once per interval, and only if a GP is still in progress.
+	 */
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = get_seconds() - rsp->seconds_stall;
+	if (delta < 2L || rsp->gpnum == rsp->completed) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		if (rnp_cur->qsmask == 0)
+			continue;
+		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
+			if (rnp_cur->qsmask & (1UL << cpu))
+				printk(" %d", rnp_cur->grplo + cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	force_quiescent_state(rsp, 0);  /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
+			smp_processor_id(), get_seconds(),
+			jiffies - rsp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rnp->lock, flags);
+	if ((long)(get_seconds() - rsp->seconds_stall) >= 0L)
+		rsp->seconds_stall =
+			get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long delta;
+	struct rcu_node *rnp;
+
+	delta = get_seconds() - rsp->seconds_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & rdp->grpmask) && delta >= 0L) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rsp);
+
+	} else if (rsp->gpnum != rsp->completed && delta >= 2L) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rsp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL */
+static void record_gp_stall_check_time(struct rcu_state *rsp) { }
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { }
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL */
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock with hard irqs disabled; that lock is released
+ * before return, restoring the irq state that the caller passes in iflg.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
+	__releases(rcu_get_root(rsp)->lock)
+{
+	unsigned long flags = iflg;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+	rsp->gpnum++;
+	rsp->signaled = RCU_SIGNAL_INIT;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time(rsp);
+	dyntick_record_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
+ */
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	/* Walk up the rcu_node hierarchy. */
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = rnp->grpmask;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  When invoking this on one's own
+ * behalf, lastcomp is used to make sure we are still in the grace period
+ * of interest.  We don't want to end the current grace period based on
+ * quiescent states detected in an earlier grace period!  On the other hand,
+ * if the CPU being quieted is offline, we can safely pass in lastcomp==NULL,
+ * since an offline CPU is in a quiescent state with respect to any grace
+ * period, unlike pesky online CPUs, which can go non-quiescent with
+ * absolutely no warning.
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != NULL &&
+	    *lastcomp != ACCESS_ONCE(rsp->completed)) {
+
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	mask = rdp->grpmask;
+	if ((rnp->qsmask & mask) == 0L) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+		rdp->qs_pending = 0;
+
+		/*
+		 * This GP can't end until cpu checks in, so all of our
+		 * callbacks can be processed during the next GP.
+		 */
+		rdp = rsp->rda[smp_processor_id()];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+
+	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+	cpu_quiet(rdp->cpu, rsp, rdp, &rdp->passed_quiesc_completed);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+{
+	int i;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	do {
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+						/* @@@ move up to simplify. */
+		rnp = rnp->parent;
+	} while (rnp != NULL);
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+
+	/*
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 */
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
+static void rcu_offline_cpu(int cpu)
+{
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void
+rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.  Throttle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	/* Update count, and requeue any remaining callbacks. */
+	local_irq_save(flags);
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+	local_irq_restore(flags);
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * Also do a memory barrier.  This is needed to handle
+		 * the case where writes from a preempt-disable section
+		 * of code get reordered into schedule() by this CPU's
+		 * write buffer.  The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+		 * by other CPUs to happen after any such write.
+		 */
+
+		smp_mb();  /* See above block comment. */
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an rcu_bh
+		 * quiescent state, so count it.  The memory barrier
+		 * is needed for the same reason as is the above one.
+		 */
+
+		smp_mb();  /* See above block comment. */
+		rcu_bh_qsctr_inc(cpu);
+	}
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		mask = 0;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0L && f(rsp->rda[cpu]))
+				mask |= bit;
+		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which CPUs
+ * are in dyntick-idle mode.  If "relaxed", do nothing unless the
+ * ->jiffies_force_qs deadline has passed.  Only one caller runs at a time.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags))
+		return;	/* Someone else is already on the job. */
+	if (relaxed && (long)(rsp->jiffies_force_qs - jiffies) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	if (rsp->completed == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_SAVE_DYNTICK:
+
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break; /* So gcc recognizes the dead code. */
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_record_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggards. */
+		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	set_need_resched();
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  This may be called
+ * only from the CPU to whom the rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * If an RCU GP has gone long enough, go check for dyntick
+	 * idle CPUs and, if needed, send resched IPIs.
+	 */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+	    	force_quiescent_state(rsp, 1);
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, flags);  /* releases above lock */
+	}
+
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+	 */
+	smp_mb(); /* See above block comment. */
+
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+	 */
+	smp_mb(); /* See above block comment. */
+}
+
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+	   struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	local_irq_save(flags);
+	rdp = rsp->rda[smp_processor_id()];
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+		force_quiescent_state(rsp, 1);
+	local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rsp, rdp);
+
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
+
+	/* Does this CPU have callbacks ready to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
+
+	/* Has RCU gone idle with this CPU needing another grace period? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
+
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+		return 1;
+
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+		return 1;
+
+	/* Has an RCU GP gone long enough to send resched IPIs &c? */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	/* RCU callbacks either ready or pending? */
+	return per_cpu(rcu_data, cpu).nxtlist ||
+	       per_cpu(rcu_bh_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	rdp->completed = rsp->completed;
+	rdp->gpnum = rsp->completed;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->passed_quiesc_completed = rsp->completed - 1;
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks |= 1; /* want consecutive numbers even for hotplug. */
+	rdp->dynticks_nesting = 0;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
+
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
+
+	/* Add CPU to rcu_node bitmasks. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
+
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+	local_irq_restore(flags);
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Handle CPU online/offline notification events.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.  The
+ * result is stored in rsp->levelspread[], one entry per level.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	/* No balancing: every level uses the configured fanout. */
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;
+	int cprv;
+	int i;
+
+	/* Walk up from the leaves: fanout = ceil(children / nodes). */
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int cpustride = 1;
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++) {
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	}
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		cpustride *= rsp->levelspread[i];
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->qsmaskinit = 0;
+			rnp->grplo = j * cpustride;
+			rnp->grphi = (j + 1) * cpustride - 1;
+			if (rnp->grphi >= NR_CPUS)
+				rnp->grphi = NR_CPUS - 1;
+			if (i == 0) {
+				rnp->grpnum = 0;
+				rnp->grpmask = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->grpmask = 1UL << rnp->grpnum;
+				rnp->parent = rsp->level[i - 1] + 
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for __rcu_init().  To be used nowhere else!  Points each
+ * possible CPU's rcu_data at its leaf rcu_node, skipping CPU-number gaps.
+ * Uses the caller's local variables i, j, and rnp as scratch.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		while (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+void __init __rcu_init(void)
+{
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+#ifdef CONFIG_DEBUG_RCU_STALL
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+}
+
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 0000000..05de009
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,231 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+
+static DEFINE_MUTEX(rcuclassic_trace_mutex);
+static char *rcuclassic_trace_buf;
+#define RCUPREEMPT_TRACE_BUF_SIZE (512*NR_CPUS)
+
+static int print_one_rcu_data(struct rcu_data *rdp, char *buf, char *ebuf)
+{
+	int cnt = 0;	/* bytes stored in buf so far, excluding the NUL */
+
+	if (!rdp->beenonline)
+		return 0;	/* never-online CPUs produce no output */
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		"%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
+		rdp->cpu,
+		cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		rdp->completed, rdp->gpnum,
+		rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		rdp->qs_pending);
+#ifdef CONFIG_NO_HZ
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" dt=%d df=%lu", rdp->dynticks, rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+	return cnt;	/* scnprintf() keeps this below ebuf - buf */
+}
+
+#define PRINT_RCU_DATA(name, buf, ebuf) \
+	do { \
+		int _p_r_d_i; \
+		\
+		for_each_possible_cpu(_p_r_d_i) \
+			(buf) += print_one_rcu_data(&per_cpu(name, _p_r_d_i), \
+						    buf, ebuf); \
+	} while (0)
+
+static ssize_t rcudata_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);	/* one shared trace buffer */
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static int print_one_rcu_state(struct rcu_state *rsp, char *buf, char *ebuf)
+{
+	int cnt = 0;	/* bytes stored in buf so far, excluding the NUL */
+	int level = 0;	/* level of the most recently printed node */
+	struct rcu_node *rnp;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+			"c=%ld g=%ld s=%d jfq=%ld nfqs=%lu/nfqsng=%lu(%lu)\n",
+			rsp->completed, rsp->gpnum, rsp->signaled,
+			(long)(rsp->jiffies_force_qs - jiffies),
+			rsp->n_force_qs, rsp->n_force_qs_ngp,
+			rsp->n_force_qs - rsp->n_force_qs_ngp);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {	/* new level, new line */
+			cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+			level = rnp->level;
+		}
+		cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+				"%lx/%lx %d:%d ^%d    ",
+				rnp->qsmask, rnp->qsmaskinit,
+				rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+	return cnt;	/* scnprintf() keeps this below ebuf - buf */
+}
+
+static ssize_t rcuhier_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);	/* one shared trace buffer */
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	buf += print_one_rcu_state(&rcu_state, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	buf += print_one_rcu_state(&rcu_bh_state, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);	/* one shared trace buffer */
+	buf += scnprintf(buf, ebuf - buf, "rcu: completed=%ld  gpnum=%ld\n",
+			rcu_state.completed, rcu_state.gpnum);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh: completed=%ld  gpnum=%ld\n",
+			rcu_bh_state.completed, rcu_bh_state.gpnum);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.read = rcudata_read,
+};
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuhier_read,
+};
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,
+};
+
+static struct dentry *rcudir, *datadir, *hierdir, *gpdir;
+static int rcuclassic_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	if (datadir)
+		debugfs_remove(datadir);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return 1;
+}
+
+static int __init rcuclassic_trace_init(void)
+{
+	int ret;
+
+	rcuclassic_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcuclassic_trace_buf)
+		return -ENOMEM;
+	ret = rcuclassic_debugfs_init();
+	if (ret)
+		kfree(rcuclassic_trace_buf);
+	return ret ? -ENOMEM : 0;  /* init must return 0 or -errno */
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+	kfree(rcuclassic_trace_buf);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 800ac84..03df272 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_CPU_STALL
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU || TREE_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-05 15:29     ` [PATCH, RFC] v4 " Paul E. McKenney
@ 2008-09-05 19:33       ` Andrew Morton
  2008-09-05 23:04         ` Paul E. McKenney
  2008-09-06 16:37       ` Manfred Spraul
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 94+ messages in thread
From: Andrew Morton @ 2008-09-05 19:33 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Fri, 5 Sep 2008 08:29:30 -0700
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> Hello!
> 
> Still experimental, not for inclusion.  But ready for serious experimental
> use, in particular, experience on an actual >1000-CPU machine would be
> most welcome.
> 
> Updates from v3:
> 
> o	The hierarchical-RCU implementation has been moved to its own
> 	"rcutree" set of files.  This allows configuring three different
> 	implementations of RCU (CLASSIC_RCU, PREEMPT_RCU, and the new
> 	TREE_RCU).  More importantly, it enables easy application of
> 	this patch to a wide variety of Linux versions.
> 
> 	I hope that this implementation can completely replace Classic
> 	RCU, but in the meantime, this split makes for easier testing
> 	and review.
> 
> o	The stalled-CPU detection is now implemented and working,
> 	enabled by the CONFIG_RCU_CPU_STALL config parameter.  Complaints
> 	are printk()ed 3 seconds into the stall, and every 30 seconds
> 	thereafter.  It also now attempts to force quiescent states.

The CONFIG_RCU_CPU_STALL identifier seems poorly-chosen to me - it
sounds like it will stall my CPU.  Should it be
CONFIG_RCU_CPU_STALL_DETECTOR?  If it's a debugging option then it
should have _DEBUG in there too.

> o	The algorithm uses pre-fabricated masks rather than shifting
> 	on each access.
> 
> o	Review comments have been applied (thank you all!!!).
> 	For but one example, call_rcu() and call_rcu_bh() are now
> 	one-liners.
> 
> o	The rcu_pending() and rcu_needs_cpu() primitives are now
> 	much more aggressive about permitting CPUs to enter dynticks
> 	idle mode.  Only CPUs that have RCU callbacks are kept out
> 	of dynticks idle mode.
> 
> Attached is an updated patch to Classic RCU that applies a
> hierarchy, greatly reducing the contention on the top-level lock
> for large machines.  This passes 10-hour concurrent rcutorture and
> online-offline testing on 128-CPU ppc64.  It is OK for experimental
> work assuming only modestly brave experimenters (and perhaps even
> cowardly experimenters), but not yet ready for inclusion.  See also
> Manfred Spraul's recent patches (or his earlier work from 2004 at
> http://marc.info/?l=linux-kernel&m=108546384711797&w=2).  We will
> converge onto a common patch in the fullness of time, but are currently
> exploring different regions of the design space.  That said, I have
> already gratefully stolen a number of Manfred's ideas.
> 
> This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
> of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
> 64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
> there is no hierarchy.  By default, the RCU initialization code will
> adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
> architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
> this balancing, allowing the hierarchy to be exactly aligned to the
> underlying hardware.  Up to two levels of hierarchy are permitted
> (in addition to the root node), allowing up to 16,384 CPUs on 32-bit
> systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
> am going to regret saying this, but this seems more than sufficient
> for the foreseeable future.  (Some architectures might wish to set
> CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
> If this becomes a real problem, additional levels can be added, but I
> doubt that it will make a significant difference on real hardware.)
> 
> In the common case, a given CPU will manipulate its private rcu_data
> structure and the rcu_node structure that it shares with its immediate
> neighbors.  This can reduce both lock and memory contention by multiple
> orders of magnitude, which should eliminate the need for the strange
> manipulations that are reported to be required when running Linux on
> very large systems.
> 
> Some shortcomings:
> 
> o	Entering and leaving dynticks idle mode is a quiescent state,
> 	but the current patch doesn't take advantage of this (noted
> 	by Manfred).  It appears that it should be possible to make
> 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> 	out whether it is safe to tell RCU about the quiescent state --
> 	and also greatly simplify the code.  However, a first attempt
> 	to hack this into existence failed, so will be taking a more
> 	measured approach.
> 
> o	There are a few places where grace periods are unnecessarily
> 	delayed.
> 
> o	There are probably hangs, rcutorture failures, &c.  In particular,
> 	the case where an interrupt from dynticks idle invokes call_rcu()
> 	requires a bit more thought.  And it requires NMIs to be sorted
> 	as noted above.
> 
> o	There are a few architectures that will sometimes execute irq
> 	handlers on CPUs that are already marked offline.  This is the
> 	subject of separate patches.  (Yes, you do have to have a very
> 	unlikely code construct hitting an unlikely sequence of events
> 	for anything bad to happen, but still needs to be fixed.)
> 
> o	Structure field layout is likely highly suboptimal.  On the other
> 	hand, given that the read-side primitives do not touch any of
> 	this data, this issue is not as pressing as it might otherwise be.
> 
> o	There is not yet a human-readable design document.  Will be fixed.

You forgot

  o	Adds yet another RCU flavour

Having alternative implementations of the same thing is a real cost in
terms of maintainability, supportability, etc, etc.

>
> ...
>
> +#if (NR_CPUS) <= RCU_FANOUT
> +#  define NUM_RCU_LVLS	      1
> +#  define NUM_RCU_LVL_0	      1
> +#  define NUM_RCU_LVL_1	      (NR_CPUS)
> +#  define NUM_RCU_LVL_2	      0
> +#  define NUM_RCU_LVL_3	      0
> +#elif (NR_CPUS) <= RCU_FANOUT_SQ
> +#  define NUM_RCU_LVLS	      2
> +#  define NUM_RCU_LVL_0	      1
> +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> +#  define NUM_RCU_LVL_2	      (NR_CPUS)
> +#  define NUM_RCU_LVL_3	      0
> +#elif (NR_CPUS) <= RCU_FANOUT_CUBE
> +#  define NUM_RCU_LVLS	      3
> +#  define NUM_RCU_LVL_0	      1
> +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> +#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> +#  define NUM_RCU_LVL_3	      NR_CPUS
> +#else
> +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> +#endif /* #if (NR_CPUS) <= RCU_FANOUT */

Using NR_CPUS for anything at all is grossly, grossly inaccurate. 
Vendors can and will ship kernels with NR_CPUS=1024 and their customers
can and will run those kernels on 4-cpu machines.  Lots of customers.

That's a two-and-a-half-order-of-magnitude inaccuracy.  It makes all
your above work meaningless.

To be useful, these decisions should be made at runtime.

> +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
> +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
> +
> +/*
> + * Definition for node within the RCU grace-period-detection hierarchy.
> + */
> +struct rcu_node {
> +	spinlock_t lock;
> +	unsigned long qsmask;	/* CPUs or groups that need to switch in */
> +				/*  order for current grace period to proceed.*/
> +	unsigned long qsmaskinit;
> +				/* Per-GP initialization for qsmask. */
> +	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
> +	int	grplo;		/* lowest-numbered CPU or group here. */
> +	int	grphi;		/* highest-numbered CPU or group here. */
> +	u8	grpnum;		/* CPU/group number for next level up. */
> +	u8	level;		/* root is at level 0. */
> +	struct rcu_node *parent;
> +} ____cacheline_internodealigned_in_smp;

So this is a 4096-byte structure on some setups.

How many of them do we expect to be concurrently instantiated?

>
> ...
>
> +#define __rcu_read_lock() \
> +	do { \
> +		preempt_disable(); \
> +		__acquire(RCU); \
> +		rcu_read_acquire(); \
> +	} while (0)
> +#define __rcu_read_unlock() \
> +	do { \
> +		rcu_read_release(); \
> +		__release(RCU); \
> +		preempt_enable(); \
> +	} while (0)
> +#define __rcu_read_lock_bh() \
> +	do { \
> +		local_bh_disable(); \
> +		__acquire(RCU_BH); \
> +		rcu_read_acquire(); \
> +	} while (0)
> +#define __rcu_read_unlock_bh() \
> +	do { \
> +		rcu_read_release(); \
> +		__release(RCU_BH); \
> +		local_bh_enable(); \
> +	} while (0)

did they have to be implemented in macros?  (it's generally best to use
C where poss)

> +#define __synchronize_sched() synchronize_rcu()
> +
> +#define call_rcu_sched(head, func) call_rcu(head, func)
> +
> +extern void __rcu_init(void);
> +#define rcu_init_sched()	do { } while (0)

static inline void rcu_init_sched(void)
{
}

, IMO

>
> ...
>
> +/*
> + * Enter nohz mode, in other words, -leave- the mode in which RCU
> + * read-side critical sections can occur.  (Though RCU read-side
> + * critical sections can occur in irq handlers in nohz mode, a possibility
> + * handled by rcu_irq_enter() and rcu_irq_exit()).
> + *
> + * @@@ note quiescent state???
> + */
> +static inline void rcu_enter_nohz(void)
> +{
> +	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
> +
> +	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
> +	__get_cpu_var(rcu_data).dynticks++;
> +	WARN_ON_RATELIMIT(__get_cpu_var(rcu_data).dynticks & 0x1, &rs);
> +	__get_cpu_var(rcu_bh_data).dynticks++;
> +	WARN_ON_RATELIMIT(__get_cpu_var(rcu_bh_data).dynticks & 0x1, &rs);
> +}
> +
> +/*
> + * Exit nohz mode.
> + */
> +static inline void rcu_exit_nohz(void)
> +{
> +	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
> +
> +	__get_cpu_var(rcu_data).dynticks++;
> +	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_data).dynticks & 0x1), &rs);
> +	__get_cpu_var(rcu_bh_data).dynticks++;
> +	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_bh_data).dynticks & 0x1), &rs);
> +	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
> +}

These are massive.  But it seems they'll only ever be used once, in
tick-sched.c so whatever.

>
> ...
>
> +
> +config RCU_FANOUT
> +	int "Tree-based Hierarchical RCU fanout value"

s/H/h/ ?

>
> ...
>
> +#include <linux/types.h>
> +#include <linux/kernel.h>
> +#include <linux/init.h>
> +#include <linux/spinlock.h>
> +#include <linux/smp.h>
> +#include <linux/rcupdate.h>
> +#include <linux/interrupt.h>
> +#include <linux/sched.h>
> +#include <asm/atomic.h>

It still surprises me that we don't have include/linux/atomic.h.

>
> ...
>
> +
> +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
> +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
> +
> +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
> +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };

I believe we don't need the explicit initialisation any more.

>
> ...
>
> +/**
> + * rcu_irq_exit - Called from exiting Hard irq context.

"called when"?

>
> ...
>
> +static void dyntick_record_completed(struct rcu_state *rsp, int comp) { }

I don't personally think such fugliness gains us enough.

static void dyntick_record_completed(struct rcu_state *rsp, int comp)
{
}

looks nicer, is consistent and doesn't break vi's ]] command.

>
> ...
>
> +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
> +{
> +	long delta;
> +	struct rcu_node *rnp;
> +
> +	delta = get_seconds() - rsp->seconds_stall;
> +	rnp = rdp->mynode;
> +	if ((rnp->qsmask & rdp->grpmask) && delta >= 0L) {
> +
> +		/* We haven't checked in, so go dump stack. */
> +		print_cpu_stall(rsp);
> +
> +	} else if (rsp->gpnum != rsp->completed && delta >= 2L) {

I dislike the L's here.  They don't do anything and they have an
encapsulation-violation feeling about them.  Do we really want code
sprinkled all over the place which "knows" specifically which type was
used to implement these fields?  Or do we want to stick a plain old "2"
in there and have it Just Work?

> +
> +		/* They had two seconds to dump stack, so complain. */
> +		print_other_cpu_stall(rsp);
> +	}
> +}
> +
> +#else /* #ifdef CONFIG_RCU_CPU_STALL */
> +static void record_gp_stall_check_time(struct rcu_state *rsp) { }
> +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { }

]]
]]

>
> ...
>
> +/*
> + * Start a new RCU grace period if warranted, re-initializing the hierarchy
> + * in preparation for detecting the next grace period.  The caller must hold
> + * the root node's ->lock, which is released before return.  Hard irqs must
> + * be disabled.
> + */
> +static void
> +rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
> +	__releases(rsp->rda[smp_processor_id()]->lock)

hm, does that work?

akpm:/usr/src/25> grep -r __releases Documentation 
akpm:/usr/src/25> 

lolwesuck.

>
> ...
>
> +/*
> + * Remove the specified CPU from the RCU hierarchy and move any pending
> + * callbacks that it might have to the current CPU.  This code assumes
> + * that at least one CPU in the system will remain running at all times.
> + * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
> + */
> +static void rcu_offline_cpu(int cpu)
> +{
> +	__rcu_offline_cpu(cpu, &rcu_state);
> +	__rcu_offline_cpu(cpu, &rcu_bh_state);
> +}
> +
> +#else /* #ifdef CONFIG_HOTPLUG_CPU */
> +
> +static void
> +rcu_offline_cpu(int cpu)

unneeded \n there

> +{
> +}
> +
> +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
> +
>
> ...
>

Looks great!  I don't understand a line of it!

It's a pretty big pill to swallow.  Nice performance testing results
will help it to slide down.

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-05 19:33       ` Andrew Morton
@ 2008-09-05 23:04         ` Paul E. McKenney
  2008-09-05 23:52           ` Andrew Morton
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-05 23:04 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, cl, mingo, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Fri, Sep 05, 2008 at 12:33:40PM -0700, Andrew Morton wrote:
> On Fri, 5 Sep 2008 08:29:30 -0700
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

Thank you very much for looking this over!

> > Hello!
> > 
> > Still experimental, not for inclusion.  But ready for serious experimental
> > use, in particular, experience on an actual >1000-CPU machine would be
> > most welcome.
> > 
> > Updates from v3:
> > 
> > o	The hierarchical-RCU implementation has been moved to its own
> > 	"rcutree" set of files.  This allows configuring three different
> > 	implementations of RCU (CLASSIC_RCU, PREEMPT_RCU, and the new
> > 	TREE_RCU).  More importantly, it enables easy application of
> > 	this patch to a wide variety of Linux versions.
> > 
> > 	I hope that this implementation can completely replace Classic
> > 	RCU, but in the meantime, this split makes for easier testing
> > 	and review.
> > 
> > o	The stalled-CPU detection is now implemented and working,
> > 	enabled by the CONFIG_RCU_CPU_STALL config parameter.  Complaints
> > 	are kprint()ed 3 seconds into the stall, and every 30 seconds
> > 	thereafter.  It also now attempts to force quiescent states.
> 
> The CONFIG_RCU_CPU_STALL identifier seems poorly-chosen to me - it
> sounds like it will stall my CPU.  Should it be
> CONFIG_RCU_CPU_STALL_DETECTOR?  If it's a debugging option then it
> should have _DEBUG in there too.

CONFIG_RCU_CPU_STALL_DETECTOR it is!  It is sufficiently lightweight
to be included in production systems, and similar mechanisms have proven
their value in my experience.  So no _DEBUG.

> > o	The algorithm uses pre-fabricated masks rather than shifting
> > 	on each access.
> > 
> > o	Review comments have been applied (thank you all!!!).
> > 	For but one example, call_rcu() and call_rcu_bh() are now
> > 	one-liners.
> > 
> > o	The rcu_pending() and rcu_needs_cpu() primitives are now
> > 	much more aggressive about permitting CPUs to enter dynticks
> > 	idle mode.  Only CPUs that have RCU callbacks are kept out
> > 	of dynticks idle mode.
> > 
> > Attached is an updated patch to Classic RCU that applies a
> > hierarchy, greatly reducing the contention on the top-level lock
> > for large machines.  This passes 10-hour concurrent rcutorture and
> > online-offline testing on 128-CPU ppc64.  It is OK for experimental
> > work assuming only modestly brave experimenters (and perhaps even
> > cowardly experiementers), but not yet ready for inclusion.  See also
> > Manfred Spraul's recent patches (or his earlier work from 2004 at
> > http://marc.info/?l=linux-kernel&m=108546384711797&w=2).  We will
> > converge onto a common patch in the fullness of time, but are currently
> > exploring different regions of the design space.  That said, I have
> > already gratefully stolen a number of Manfred's ideas.
> > 
> > This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
> > of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
> > 64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
> > there is no hierarchy.  By default, the RCU initialization code will
> > adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
> > architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
> > this balancing, allowing the hierarchy to be exactly aligned to the
> > underlying hardware.  Up to two levels of hierarchy are permitted
> > (in addition to the root node), allowing up to 16,384 CPUs on 32-bit
> > systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
> > am going to regret saying this, but this seems more than sufficient
> > for the foreseeable future.  (Some architectures might wish to set
> > CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
> > If this becomes a real problem, additional levels can be added, but I
> > doubt that it will make a significant difference on real hardware.)
> > 
> > In the common case, a given CPU will manipulate its private rcu_data
> > structure and the rcu_node structure that it shares with its immediate
> > neighbors.  This can reduce both lock and memory contention by multiple
> > orders of magnitude, which should eliminate the need for the strange
> > manipulations that are reported to be required when running Linux on
> > very large systems.
> > 
> > Some shortcomings:
> > 
> > o	Entering and leaving dynticks idle mode is a quiescent state,
> > 	but the current patch doesn't take advantage of this (noted
> > 	by Manfred).  It appears that it should be possible to make
> > 	nmi_enter() and nmi_exit() provide an in_nmi(), which would make
> > 	it possible for rcu_irq_enter() and rcu_irq_exit() to figure
> > 	out whether it is safe to tell RCU about the quiescent state --
> > 	and also greatly simplify the code.  However, a first attempt
> > 	to hack this into existence failed, so will be taking a more
> > 	measured approach.
> > 
> > o	There are a few places where grace periods are unnecessarily
> > 	delayed.
> > 
> > o	There are probably hangs, rcutorture failures, &c.  In particular,
> > 	the case where an interrupt from dynticks idle invokes call_rcu()
> > 	requires a bit more thought.  And it requires NMIs to be sorted
> > 	as noted above.
> > 
> > o	There are a few architectures that will sometimes execute irq
> > 	handlers on CPUs that are already marked offline.  This is the
> > 	subject of separate patches.  (Yes, you do have to have a very
> > 	unlikely code construct hitting an unlikely sequence of events
> > 	for anything bad to happen, but still needs to be fixed.)
> > 
> > o	Structure field layout is likely highly suboptimal.  On the other
> > 	hand, given that the read-side primitives do not touch any of
> > 	this data, this issue is not as pressing as it might otherwise be.
> > 
> > o	There is not yet a human-readable design document.  Will be fixed.
> 
> You forgot
> 
>   o	Adds yet another RCU flavour
> 
> Having alternative implementations of the same thing is a real cost in
> terms of maintainability, supportability, etc, etc.

Agreed, but note well the quote from above:

	The hierarchical-RCU implementation has been moved to its own
 	"rcutree" set of files.  This allows configuring three different
 	implementations of RCU (CLASSIC_RCU, PREEMPT_RCU, and the new
 	TREE_RCU).  More importantly, it enables easy application of
 	this patch to a wide variety of Linux versions.
 
 	I hope that this implementation can completely replace Classic
 	RCU, but in the meantime, this split makes for easier testing
 	and review.

The idea is to avoid adding another RCU flavour (or flavor, for that
matter), instead replacing Classic RCU.

> > ...
> >
> > +#if (NR_CPUS) <= RCU_FANOUT
> > +#  define NUM_RCU_LVLS	      1
> > +#  define NUM_RCU_LVL_0	      1
> > +#  define NUM_RCU_LVL_1	      (NR_CPUS)
> > +#  define NUM_RCU_LVL_2	      0
> > +#  define NUM_RCU_LVL_3	      0
> > +#elif (NR_CPUS) <= RCU_FANOUT_SQ
> > +#  define NUM_RCU_LVLS	      2
> > +#  define NUM_RCU_LVL_0	      1
> > +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> > +#  define NUM_RCU_LVL_2	      (NR_CPUS)
> > +#  define NUM_RCU_LVL_3	      0
> > +#elif (NR_CPUS) <= RCU_FANOUT_CUBE
> > +#  define NUM_RCU_LVLS	      3
> > +#  define NUM_RCU_LVL_0	      1
> > +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> > +#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> > +#  define NUM_RCU_LVL_3	      NR_CPUS
> > +#else
> > +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > +#endif /* #if (NR_CPUS) <= RCU_FANOUT */
> 
> Using NR_CPUS for anything at all is grossly, grossly inaccurate. 
> Vendors can and will ship kernels with NR_CPUS=1024 and their customers
> can and will run those kernels on 4-cpu machines.  Lots of customers.
> 
> That's a two-and-a-half-order-of-magnitude inaccuracy.  It makes all
> your above work meaningless.
> 
> To be useful, these decisions should be made at runtime.

I agree in principle, but this case is an exception.

Suppose that we have NR_CPUS=1024 on a 4-CPU 64-bit machine.  Since 64^2
is greater than 1024, we end up with a two-level hierarchy, with one
rcu_node structure at the root and 16 rcu_node leaf structures, each
of which takes up a single 128-byte cache line.  There will be two such
structures in the system, one for rcu and one for rcu_bh.

So I do not believe that this will be a problem.

On laptops, this is even less of an issue -- NR_CPUS=8 on my laptop,
which would reduce to a pair of rcu_node structures, one for rcu, the other
for rcu_bh.

Making the decision at runtime would bloat the code by much more than the
extra data consumed.  And introduce yet more races between online/offline
and everything else.  Besides, this way I am being bloat-compatible
with DEFINE_PER_CPU().  ;-)

> > +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
> > +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
> > +
> > +/*
> > + * Definition for node within the RCU grace-period-detection hierarchy.
> > + */
> > +struct rcu_node {
> > +	spinlock_t lock;
> > +	unsigned long qsmask;	/* CPUs or groups that need to switch in */
> > +				/*  order for current grace period to proceed.*/
> > +	unsigned long qsmaskinit;
> > +				/* Per-GP initialization for qsmask. */
> > +	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
> > +	int	grplo;		/* lowest-numbered CPU or group here. */
> > +	int	grphi;		/* highest-numbered CPU or group here. */
> > +	u8	grpnum;		/* CPU/group number for next level up. */
> > +	u8	level;		/* root is at level 0. */
> > +	struct rcu_node *parent;
> > +} ____cacheline_internodealigned_in_smp;
> 
> So this is a 4096-byte structure on some setups.

You lost me on this one.  On a 64-bit system, I have 8 bytes each for
lock, qsmask, qsmaskinit, grpmask, and parent, four bytes each for grplo
and grphi, and another byte each for grpnum and level, for a total of
50 bytes for each struct rcu_node, which comes to a single cache line
for most large systems.  Given the default CONFIG_RCU_FANOUT=64 and
NR_CPUS=4096, we have a two-level hierarchy with one root rcu_node
structure and 64 leaf rcu_node structures.  This gives a total of
65 cache lines.

> How many of them do we expect to be concurrently instantiated?

Two sets, one for rcu and one for rcu_bh, for a grand total of 130
cache lines.

Now the rcu_data structure is another story, given that it is a
DEFINE_PER_CPU() variable, though it has several hundred other per-CPU
friends in the Linux kernel.

> > ...
> >
> > +#define __rcu_read_lock() \
> > +	do { \
> > +		preempt_disable(); \
> > +		__acquire(RCU); \
> > +		rcu_read_acquire(); \
> > +	} while (0)
> > +#define __rcu_read_unlock() \
> > +	do { \
> > +		rcu_read_release(); \
> > +		__release(RCU); \
> > +		preempt_enable(); \
> > +	} while (0)
> > +#define __rcu_read_lock_bh() \
> > +	do { \
> > +		local_bh_disable(); \
> > +		__acquire(RCU_BH); \
> > +		rcu_read_acquire(); \
> > +	} while (0)
> > +#define __rcu_read_unlock_bh() \
> > +	do { \
> > +		rcu_read_release(); \
> > +		__release(RCU_BH); \
> > +		local_bh_enable(); \
> > +	} while (0)
> 
> did they have to be implemented in macros?  (it's generally best to use
> C where poss)

Nope, just copied this from Classic RCU.  Fixed.

> > +#define __synchronize_sched() synchronize_rcu()
> > +
> > +#define call_rcu_sched(head, func) call_rcu(head, func)
> > +
> > +extern void __rcu_init(void);
> > +#define rcu_init_sched()	do { } while (0)
> 
> static inline void rcu_init_sched(void)
> {
> }
> 
> , IMO

Good point, fixed.

> > ...
> >
> > +/*
> > + * Enter nohz mode, in other words, -leave- the mode in which RCU
> > + * read-side critical sections can occur.  (Though RCU read-side
> > + * critical sections can occur in irq handlers in nohz mode, a possibility
> > + * handled by rcu_irq_enter() and rcu_irq_exit()).
> > + *
> > + * @@@ note quiescent state???
> > + */
> > +static inline void rcu_enter_nohz(void)
> > +{
> > +	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
> > +
> > +	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
> > +	__get_cpu_var(rcu_data).dynticks++;
> > +	WARN_ON_RATELIMIT(__get_cpu_var(rcu_data).dynticks & 0x1, &rs);
> > +	__get_cpu_var(rcu_bh_data).dynticks++;
> > +	WARN_ON_RATELIMIT(__get_cpu_var(rcu_bh_data).dynticks & 0x1, &rs);
> > +}
> > +
> > +/*
> > + * Exit nohz mode.
> > + */
> > +static inline void rcu_exit_nohz(void)
> > +{
> > +	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
> > +
> > +	__get_cpu_var(rcu_data).dynticks++;
> > +	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_data).dynticks & 0x1), &rs);
> > +	__get_cpu_var(rcu_bh_data).dynticks++;
> > +	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_bh_data).dynticks & 0x1), &rs);
> > +	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
> > +}
> 
> These are massive.  But it seems they'll only ever be used once, in
> tick-sched.c so whatever.

Indeed.  They belong in rcutree.c rather than in rcutree.h.  Moved.

> > ...
> >
> > +
> > +config RCU_FANOUT
> > +	int "Tree-based Hierarchical RCU fanout value"
> 
> s/H/h/ ?

Done, on both this and TREE_RCU.

> > ...
> >
> > +#include <linux/types.h>
> > +#include <linux/kernel.h>
> > +#include <linux/init.h>
> > +#include <linux/spinlock.h>
> > +#include <linux/smp.h>
> > +#include <linux/rcupdate.h>
> > +#include <linux/interrupt.h>
> > +#include <linux/sched.h>
> > +#include <asm/atomic.h>
> 
> It still surprises me that we don't have include/linux/atomic.h.

;-)

> > ...
> >
> > +
> > +struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
> > +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
> > +
> > +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
> > +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
> 
> I believe we don't need the explicit initialsiation any more.

Hmmm...  Compiles without it OK.  Will let you know how the testing
goes.  ;-)

> > ...
> >
> > +/**
> > + * rcu_irq_exit - Called from exiting Hard irq context.
> 
> "called when"?

Fair enough, fixed.

> > ...
> >
> > +static void dyntick_record_completed(struct rcu_state *rsp, int comp) { }
> 
> I don't personally think such fugliness gains us enough.

Multi-line-ified.

> static void dyntick_record_completed(struct rcu_state *rsp, int comp)
> {
> }
> 
> looks nicer, is consistent and doesn't break vi's ]] command.
> 
> >
> > ...
> >
> > +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
> > +{
> > +	long delta;
> > +	struct rcu_node *rnp;
> > +
> > +	delta = get_seconds() - rsp->seconds_stall;
> > +	rnp = rdp->mynode;
> > +	if ((rnp->qsmask & rdp->grpmask) && delta >= 0L) {
> > +
> > +		/* We haven't checked in, so go dump stack. */
> > +		print_cpu_stall(rsp);
> > +
> > +	} else if (rsp->gpnum != rsp->completed && delta >= 2L) {
> 
> I dislike the L's here.  They don't do anything and they have an
> encapsulation-violation feeling about them.  Do we really want code
> sprinkled all over the palce whcih "knows" spefically which type was
> used to implement these fields?  Or do we want to stick a plain old "2"
> in there and have it Just Work.

Removed the "L"s.

> > +
> > +		/* They had two seconds to dump stack, so complain. */
> > +		print_other_cpu_stall(rsp);
> > +	}
> > +}
> > +
> > +#else /* #ifdef CONFIG_RCU_CPU_STALL */
> > +static void record_gp_stall_check_time(struct rcu_state *rsp) { }
> > +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) { }
> 
> ]]
> ]]

Multi-line-ified.

> >
> > ...
> >
> > +/*
> > + * Start a new RCU grace period if warranted, re-initializing the hierarchy
> > + * in preparation for detecting the next grace period.  The caller must hold
> > + * the root node's ->lock, which is released before return.  Hard irqs must
> > + * be disabled.
> > + */
> > +static void
> > +rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
> > +	__releases(rsp->rda[smp_processor_id()]->lock)
> 
> hm, does that work?

Yes, by dint of the argument being pretty much ignored at the moment.
And it should work OK if they key off of the types.

> akpm:/usr/src/25> grep -r __releases Documentation 
> akpm:/usr/src/25> 
> 
> lolwesuck.

;-)

> > ...
> >
> > +/*
> > + * Remove the specified CPU from the RCU hierarchy and move any pending
> > + * callbacks that it might have to the current CPU.  This code assumes
> > + * that at least one CPU in the system will remain running at all times.
> > + * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
> > + */
> > +static void rcu_offline_cpu(int cpu)
> > +{
> > +	__rcu_offline_cpu(cpu, &rcu_state);
> > +	__rcu_offline_cpu(cpu, &rcu_bh_state);
> > +}
> > +
> > +#else /* #ifdef CONFIG_HOTPLUG_CPU */
> > +
> > +static void
> > +rcu_offline_cpu(int cpu)
> 
> unneeded \n there

Fixed.

> > +{
> > +}
> > +
> > +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
> > +
> >
> > ...
> >
> 
> Looks great!  I don't understand a line of it!

;-)

Guess I need to get going on that design document...

> It's a pretty big pill to swallow.  Nice performance testing results
> will help it to slide down.

Well, the read side is easy -- exactly the same code sequence as for
Classic RCU.  On the update side, this is more of a bug fix for large
numbers of CPUs, where unadorned Classic RCU is said to suffer terminal
lock contention.  I will see what I can come up with, but at the end of
the day, this will need some testing on machines larger than the 128-CPU
systems that I have access to.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-05 23:04         ` Paul E. McKenney
@ 2008-09-05 23:52           ` Andrew Morton
  2008-09-06  4:16             ` Paul E. McKenney
  0 siblings, 1 reply; 94+ messages in thread
From: Andrew Morton @ 2008-09-05 23:52 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Fri, 5 Sep 2008 16:04:11 -0700 "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
>
> ...
>
> > > +#if (NR_CPUS) <= RCU_FANOUT
> > > +#  define NUM_RCU_LVLS	      1
> > > +#  define NUM_RCU_LVL_0	      1
> > > +#  define NUM_RCU_LVL_1	      (NR_CPUS)
> > > +#  define NUM_RCU_LVL_2	      0
> > > +#  define NUM_RCU_LVL_3	      0
> > > +#elif (NR_CPUS) <= RCU_FANOUT_SQ
> > > +#  define NUM_RCU_LVLS	      2
> > > +#  define NUM_RCU_LVL_0	      1
> > > +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> > > +#  define NUM_RCU_LVL_2	      (NR_CPUS)
> > > +#  define NUM_RCU_LVL_3	      0
> > > +#elif (NR_CPUS) <= RCU_FANOUT_CUBE
> > > +#  define NUM_RCU_LVLS	      3
> > > +#  define NUM_RCU_LVL_0	      1
> > > +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> > > +#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> > > +#  define NUM_RCU_LVL_3	      NR_CPUS
> > > +#else
> > > +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > > +#endif /* #if (NR_CPUS) <= RCU_FANOUT */
> > 
> > Using NR_CPUS for anything at all is grossly, grossly inaccurate. 
> > Vendors can and will ship kernels with NR_CPUS=1024 and their customers
> > can and will run those kernels on 4-cpu machines.  Lots of customers.
> > 
> > That's a two-and-a-half-order-of-magnitude inaccuracy.  It makes all
> > your above work meaningless.
> > 
> > To be useful, these decisions should be made at runtime.
> 
> I agree in principle, but this case is an exception.
> 
> Suppose that we have NR_CPUS=1024 on a 4-CPU 64-bit machine.  Since 64^2
> is greater than 1024, we end up with a two-level hierarchy, with one
> rcu_node structure at the root and 16 rcu_node leaf structures, each
> of which takes up a single 128-byte cache line.  There will be two such
> structures in the system, one for rcu and one for rcu_bh.
> 
> So I do not believe that this will be a problem.
> 
> One laptops, this is even less of an issue -- NR_CPUS=8 on my laptop,
> which would reduce to a pair rcu_node structures, one for rcu, the other
> for rcu_bh.

Is it likely that anyone will ship kernels with NR_CPUS=8?  What are
distros presently using, and what will they be using 1-2 years hence?

> Making the decision at runtime would bloat the code by much more than the
> extra data consumed.  And introduce yet more races between online/offline
> and everything else.  Besides, this way I am being bloat-compatible
> with DEFINE_PER_CPU().  ;-)
> 
> > > +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
> > > +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
> > > +
> > > +/*
> > > + * Definition for node within the RCU grace-period-detection hierarchy.
> > > + */
> > > +struct rcu_node {
> > > +	spinlock_t lock;
> > > +	unsigned long qsmask;	/* CPUs or groups that need to switch in */
> > > +				/*  order for current grace period to proceed.*/
> > > +	unsigned long qsmaskinit;
> > > +				/* Per-GP initialization for qsmask. */
> > > +	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
> > > +	int	grplo;		/* lowest-numbered CPU or group here. */
> > > +	int	grphi;		/* highest-numbered CPU or group here. */
> > > +	u8	grpnum;		/* CPU/group number for next level up. */
> > > +	u8	level;		/* root is at level 0. */
> > > +	struct rcu_node *parent;
> > > +} ____cacheline_internodealigned_in_smp;
> > 
> > So this is a 4096-byte structure on some setups.
> 
> You lost me on this one.  On a 64-bit system, I have 8 bytes each for
> lock, qsmask, qsmaskinit, grpmask, and parent, four bytes each for grplo
> and grphi, and another byte each for grpnum and level, for a total of
> 50 bytes for each struct rcu_node, which comes to a single cache line
> for most large system.  Given the default CONFIG_RCU_FANOUT=64 and
> NR_CPUS=4096, we have a two-level hierarchy with one root rcu_node
> structure and 64 leaf rcu_node structures.  This gives a total of
> 65 cache lines.

____cacheline_internodealigned_in_smp will expand this structure to
4096 bytes on CONFIG_X86_VSMP=y.

> > It's a pretty big pill to swallow.  Nice performance testing results
> > will help it to slide down.
> 
> Well, the read side is easy -- exactly the same code sequence as for
> Classic RCU.  On the update side, this is more of a bug fix for large
> numbers of CPUs, where unadorned Classic RCU is said to suffer terminal
> lock contention.  I will see what I can come up with, but at the end of
> the day, this will need some testing on machines larger than the 128-CPU
> systems that I have access to.
> 

OK, thanks.  As it's effectively a bugfix, a full description of the
bug would grease some wheels.


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-05 23:52           ` Andrew Morton
@ 2008-09-06  4:16             ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-06  4:16 UTC (permalink / raw)
  To: Andrew Morton
  Cc: linux-kernel, cl, mingo, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Fri, Sep 05, 2008 at 04:52:35PM -0700, Andrew Morton wrote:
> On Fri, 5 Sep 2008 16:04:11 -0700 "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> >
> > ...
> >
> > > > +#if (NR_CPUS) <= RCU_FANOUT
> > > > +#  define NUM_RCU_LVLS	      1
> > > > +#  define NUM_RCU_LVL_0	      1
> > > > +#  define NUM_RCU_LVL_1	      (NR_CPUS)
> > > > +#  define NUM_RCU_LVL_2	      0
> > > > +#  define NUM_RCU_LVL_3	      0
> > > > +#elif (NR_CPUS) <= RCU_FANOUT_SQ
> > > > +#  define NUM_RCU_LVLS	      2
> > > > +#  define NUM_RCU_LVL_0	      1
> > > > +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
> > > > +#  define NUM_RCU_LVL_2	      (NR_CPUS)
> > > > +#  define NUM_RCU_LVL_3	      0
> > > > +#elif (NR_CPUS) <= RCU_FANOUT_CUBE
> > > > +#  define NUM_RCU_LVLS	      3
> > > > +#  define NUM_RCU_LVL_0	      1
> > > > +#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
> > > > +#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
> > > > +#  define NUM_RCU_LVL_3	      NR_CPUS
> > > > +#else
> > > > +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
> > > > +#endif /* #if (NR_CPUS) <= RCU_FANOUT */
> > > 
> > > Using NR_CPUS for anything at all is grossly, grossly inaccurate. 
> > > Vendors can and will ship kernels with NR_CPUS=1024 and their customers
> > > can and will run those kernels on 4-cpu machines.  Lots of customers.
> > > 
> > > That's a two-and-a-half-order-of-magnitude inaccuracy.  It makes all
> > > your above work meaningless.
> > > 
> > > To be useful, these decisions should be made at runtime.
> > 
> > I agree in principle, but this case is an exception.
> > 
> > Suppose that we have NR_CPUS=1024 on a 4-CPU 64-bit machine.  Since 64^2
> > is greater than 1024, we end up with a two-level hierarchy, with one
> > rcu_node structure at the root and 16 rcu_node leaf structures, each
> > of which takes up a single 128-byte cache line.  There will be two such
> > structures in the system, one for rcu and one for rcu_bh.
> > 
> > So I do not believe that this will be a problem.
> > 
> > One laptops, this is even less of an issue -- NR_CPUS=8 on my laptop,
> > which would reduce to a pair rcu_node structures, one for rcu, the other
> > for rcu_bh.
> 
> Is it likely that anyone will ship kernels with NR_CPUS=8?  What are
> distros presently using, and what will they be using 1-2 years hence?

Ubuntu Feisty ships with NR_CPUS=8, but yes, I imagine that this number
will increase.  Perhaps a table for 64-bit CPUs:

			 Cachelines per	      Total
       NR_CPUs		 Implementation	    Cachelines

	1-64			 1		 2	[laptop distros]
       65-128			 3		 6	[x86 distros, Power]
      129-192			 4		 8
      193-256			 5		10
      257-320			 6		12
      321-384			 7		14
      385-448			 8		16
      449-512			 9		18	[SGI ca. 2004]
      513-576			10		20
      577-640			11		22
      641-704			12		24
      705-768			13		26
      769-832			14		28
      833-896			15		30
      897-960			16		32
      961-1024			17		34	[SGI ca. 2006?]

      . . .

     3967-4032			64	       128
     4033-4096			65	       130	[SGI ca. 2008/9?]
     4097-4160			67	       134
     4161-4224			68	       136

And so on, limiting out at 262,144 CPUs, which should be at least
3-5 years in the future (famous last words).  The "Cachelines per
Implementation" column is for each of rcu and rcu_bh, while the "Total
Cachelines" column is for the combination of both.  As you can see, the
number of cachelines consumed by the rcu_nodes hierarchy is quite modest
as a function of the number of CPUs, even for pretty big numbers of CPUs.

So I really do not believe that this is a real issue.

> > Making the decision at runtime would bloat the code by much more than the
> > extra data consumed.  And introduce yet more races between online/offline
> > and everything else.  Besides, this way I am being bloat-compatible
> > with DEFINE_PER_CPU().  ;-)
> > 
> > > > +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
> > > > +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
> > > > +
> > > > +/*
> > > > + * Definition for node within the RCU grace-period-detection hierarchy.
> > > > + */
> > > > +struct rcu_node {
> > > > +	spinlock_t lock;
> > > > +	unsigned long qsmask;	/* CPUs or groups that need to switch in */
> > > > +				/*  order for current grace period to proceed.*/
> > > > +	unsigned long qsmaskinit;
> > > > +				/* Per-GP initialization for qsmask. */
> > > > +	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
> > > > +	int	grplo;		/* lowest-numbered CPU or group here. */
> > > > +	int	grphi;		/* highest-numbered CPU or group here. */
> > > > +	u8	grpnum;		/* CPU/group number for next level up. */
> > > > +	u8	level;		/* root is at level 0. */
> > > > +	struct rcu_node *parent;
> > > > +} ____cacheline_internodealigned_in_smp;
> > > 
> > > So this is a 4096-byte structure on some setups.
> > 
> > You lost me on this one.  On a 64-bit system, I have 8 bytes each for
> > lock, qsmask, qsmaskinit, grpmask, and parent, four bytes each for grplo
> > and grphi, and another byte each for grpnum and level, for a total of
> > 50 bytes for each struct rcu_node, which comes to a single cache line
> > for most large system.  Given the default CONFIG_RCU_FANOUT=64 and
> > NR_CPUS=4096, we have a two-level hierarchy with one root rcu_node
> > structure and 64 leaf rcu_node structures.  This gives a total of
> > 65 cache lines.
> 
> ____cacheline_internodealigned_in_smp will expand this structure to
> 4096 bytes on CONFIG_X86_VSMP=y.

Ah!  I guess that I have not been paying sufficient attention.

OK, http://www.scalemp.com/ says that they go to 128 cores, so perhaps
256 hardware threads, and 1TB of memory.  This is about 4GB per hardware
thread.  My approach costs them 4096/64=64 bytes per hardware thread,
or about 2 millionths of a percent of memory.

I think that this is OK.  If not, one option is to pick a smaller
alignment for that architecture.

> > > It's a pretty big pill to swallow.  Nice performance testing results
> > > will help it to slide down.
> > 
> > Well, the read side is easy -- exactly the same code sequence as for
> > Classic RCU.  On the update side, this is more of a bug fix for large
> > numbers of CPUs, where unadorned Classic RCU is said to suffer terminal
> > lock contention.  I will see what I can come up with, but at the end of
> > the day, this will need some testing on machines larger than the 128-CPU
> > systems that I have access to.
> 
> OK, thanks.  As it's effectively a bugfix, a full description of the
> bug would grease some wheels.

Here is the gist of what I was told:

	Above several hundred CPUs, the current RCU implementation
	suffers from severe lock contention, rendering the machine
	useless.  Crude workarounds exist, but are expected to cause
	their own problems at higher CPU counts.

I never have used a machine with that many CPUs, so am just passing on
what I heard.  But given that Classic RCU was designed with 32-64 CPUs
in mind, I actually would have expected it to hit the wall much sooner.

Is the above sufficient?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-05 15:29     ` [PATCH, RFC] v4 " Paul E. McKenney
  2008-09-05 19:33       ` Andrew Morton
@ 2008-09-06 16:37       ` Manfred Spraul
  2008-09-07 17:25         ` Paul E. McKenney
  2008-09-07 10:18       ` [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation) Manfred Spraul
  2008-09-15 16:02       ` [PATCH, RFC] v4 scalable classic RCU implementation Paul E. McKenney
  3 siblings, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-09-06 16:37 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

[-- Attachment #1: Type: text/plain, Size: 691 bytes --]

Hi Paul,

Paul E. McKenney wrote:
> o	The rcu_pending() and rcu_needs_cpu() primitives are now
> 	much more aggressive about permitting CPUs to enter dynticks
> 	idle mode.  Only CPUs that have RCU callbacks are kept out
> 	of dynticks idle mode.
>   

I've noticed that right now rcu_enter_nohz() can be nested within 
rcu_irq_enter():
irq_exit() first calls tick_nohz_stop_sched_tick(), then rcu_irq_exit().
And tick_nohz_stop_sched_tick() can switch into nohz mode.

Is that intentional? Does rcupreempt support that? It broke my rcustate 
code on x86-64.

I would prefer if something like the attached patch is applied. What do 
you think?
Do you need the patch as well?

--
    Manfred

[-- Attachment #2: patch-move-rcu_irq_exit --]
[-- Type: text/plain, Size: 456 bytes --]

diff --git a/kernel/softirq.c b/kernel/softirq.c
index ba20a90..cca5a83 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -284,10 +284,10 @@ void irq_exit(void)
 		invoke_softirq();
 
 #ifdef CONFIG_NO_HZ
+	rcu_irq_exit(0);
 	/* Make sure that timer wheel updates are propagated */
 	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
 		tick_nohz_stop_sched_tick(0);
-	rcu_irq_exit(0);
 #endif
 	preempt_enable_no_resched();
 }

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation)
  2008-09-05 15:29     ` [PATCH, RFC] v4 " Paul E. McKenney
  2008-09-05 19:33       ` Andrew Morton
  2008-09-06 16:37       ` Manfred Spraul
@ 2008-09-07 10:18       ` Manfred Spraul
  2008-09-07 11:07         ` Andi Kleen
  2008-09-07 19:46         ` Paul E. McKenney
  2008-09-15 16:02       ` [PATCH, RFC] v4 scalable classic RCU implementation Paul E. McKenney
  3 siblings, 2 replies; 94+ messages in thread
From: Manfred Spraul @ 2008-09-07 10:18 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

[-- Attachment #1: Type: text/plain, Size: 1896 bytes --]

Paul E. McKenney wrote:
> +/*
> + * If the specified CPU is offline, tell the caller that it is in
> + * a quiescent state.  Otherwise, whack it with a reschedule IPI.
> + * Grace periods can end up waiting on an offline CPU when that
> + * CPU is in the process of coming online -- it will be added to the
> + * rcu_node bitmasks before it actually makes it online.  Because this
> + * race is quite rare, we check for it after detecting that the grace
> + * period has been delayed rather than checking each and every CPU
> + * each and every time we start a new grace period.
> + */
> +static int rcu_implicit_offline_qs(struct rcu_data *rdp)
> +{
> +	/*
> +	 * If the CPU is offline, it is in a quiescent state.  We can
> +	 * trust its state not to change because interrupts are disabled.
> +	 */
> +	if (cpu_is_offline(rdp->cpu)) {
> +		rdp->offline_fqs++;
> +		return 1;
> +	}
>   
I fear that this won't work.
E.g. look at x86, smp_callin() [arch/x86/kernel/smpboot.c]:
The boot code must enable local interrupts around calibrate_delay(), 
otherwise the NMI watchdog would complain.
That means the first interrupts, the first read side critical sections 
can run way before the cpu bit is set within cpu_online_map.
cpus are just started, we are not within stop_machine. Thus we cannot 
make any assumption about what the remaining cpus are doing.

What about introducing a CPU_STARTING notifier call, similar to CPU_DYING:
- called with disabled interrupts
- called before interrupts are enabled
- must not sleep
- called on the new cpu.

This might also be useful for something like kvm. I'm not sure if it's 
guaranteed that hardware_enable() runs early enough.

Attached is a patch proposal. Note that it doesn't work correctly: On 
x86-64, I have seen that CPU_STARTING is called after CPU_ONLINE. Thus 
frozen_cpus could already be cleared, then _FROZEN would be wrong.

--
Manfred

[-- Attachment #2: patch-CPU_STARTING-wip --]
[-- Type: text/plain, Size: 2224 bytes --]

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d7faf88..c2747ac 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 #endif
 
 int cpu_up(unsigned int cpu);
+void notify_cpu_starting(unsigned int cpu);
 extern void cpu_hotplug_init(void);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index da2698b..8e47661 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret)
 #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
 #define CPU_DYING		0x0008 /* CPU (unsigned)v not running any task,
-				        * not handling interrupts, soon dead */
+				        * not handling interrupts, soon dead.
+				        * Called on the dying cpu, interrupts
+				        * are already disabled. Must not
+				        * sleep, must not fail */
 #define CPU_POST_DEAD		0x0009 /* CPU (unsigned)v dead, cpu_hotplug
 					* lock is dropped */
+#define CPU_STARTING		0x000A /* CPU (unsigned)v soon running.
+					* Called on the new cpu, just before
+					* enabling interrupts. Must not sleep,
+					* must not fail */
 
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
  * operation in progress
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5b7c88f..2300fc0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -455,6 +455,25 @@ out:
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, immediately
+ * before enabling interrupts.
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+	unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+	if (cpu_isset(cpu, frozen_cpus))
+		val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+	raw_notifier_call_chain(&cpu_chain, val, (void*)(long)cpu);
+}
+
 #endif /* CONFIG_SMP */
 
 /*

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation)
  2008-09-07 10:18       ` [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation) Manfred Spraul
@ 2008-09-07 11:07         ` Andi Kleen
  2008-09-07 19:46         ` Paul E. McKenney
  1 sibling, 0 replies; 94+ messages in thread
From: Andi Kleen @ 2008-09-07 11:07 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: paulmck, linux-kernel, cl, mingo, akpm, dipankar, josht, schamp,
	niv, dvhltc, ego, laijs, rostedt, peterz, penberg, andi

> What about introducing a CPU_STARTING notifier call, similar to CPU_DYING:
> - called with disabled interrupts
> - called before interrupts are enabled
> - must not sleep
> - called on the new cpu.
> 
> This might also be useful for something like kvm. I'm not sure if it's 
> guaranteed that hardware_enable() runs early enough.

I would find that useful too. I had several cases where I had 
to add smp_call_function_single() with a second callback to the CPU_UP 
notifier, which always seemed ugly.

-Andi
-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-06 16:37       ` Manfred Spraul
@ 2008-09-07 17:25         ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-07 17:25 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Sat, Sep 06, 2008 at 06:37:38PM +0200, Manfred Spraul wrote:
> Hi Paul,
>
> Paul E. McKenney wrote:
>> o	The rcu_pending() and rcu_needs_cpu() primitives are now
>> 	much more aggressive about permitting CPUs to enter dynticks
>> 	idle mode.  Only CPUs that have RCU callbacks are kept out
>> 	of dynticks idle mode.
>>   
>
> I've noticed that right now rcu_enter_nohz() can be nested within 
> rcu_irq_enter():
> irq_exit() first calls tick_nohz_stop_sched_tick(), then rcu_irq_exit().
> And tick_nohz_stop_sched_tick() can switch into nohz mode.
>
> Is that intentional? Does rcupreempt support that? It broke my rcustate 
> code on x86-64.
>
> I would prefer if something like the attached patch is applied. What do you 
> think?
> Do you need the patch as well?

Good question -- when I tried splitting irqs from NMIs, things broke
badly, and this might well explain it.  Thank you very much for the
hint!!!

							Thanx, Paul

> --
>    Manfred

> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index ba20a90..cca5a83 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -284,10 +284,10 @@ void irq_exit(void)
>  		invoke_softirq();
> 
>  #ifdef CONFIG_NO_HZ
> +	rcu_irq_exit(0);
>  	/* Make sure that timer wheel updates are propagated */
>  	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
>  		tick_nohz_stop_sched_tick(0);
> -	rcu_irq_exit(0);
>  #endif
>  	preempt_enable_no_resched();
>  }


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation)
  2008-09-07 10:18       ` [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation) Manfred Spraul
  2008-09-07 11:07         ` Andi Kleen
@ 2008-09-07 19:46         ` Paul E. McKenney
  1 sibling, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-07 19:46 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Sun, Sep 07, 2008 at 12:18:18PM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:
>> +/*
>> + * If the specified CPU is offline, tell the caller that it is in
>> + * a quiescent state.  Otherwise, whack it with a reschedule IPI.
>> + * Grace periods can end up waiting on an offline CPU when that
>> + * CPU is in the process of coming online -- it will be added to the
>> + * rcu_node bitmasks before it actually makes it online.  Because this
>> + * race is quite rare, we check for it after detecting that the grace
>> + * period has been delayed rather than checking each and every CPU
>> + * each and every time we start a new grace period.
>> + */
>> +static int rcu_implicit_offline_qs(struct rcu_data *rdp)
>> +{
>> +	/*
>> +	 * If the CPU is offline, it is in a quiescent state.  We can
>> +	 * trust its state not to change because interrupts are disabled.
>> +	 */
>> +	if (cpu_is_offline(rdp->cpu)) {
>> +		rdp->offline_fqs++;
>> +		return 1;
>> +	}
>>   
> I fear that this won't work.
> E.g. look at x86, smp_callin() [arch/x86/kernel/smpboot.c]:
> The boot code must enable local interrupts around calibrate_delay(), 
> otherwise the NMI watchdog would complain.
> That means the first interrupts, the first read side critical sections can 
> run way before the cpu bit is set within cpu_online_map.
> cpus are just started, we are not within stop_machine. Thus we cannot make 
> any assumption about what the remaining cpus are doing.

Ouch!  Very good catch!!!  ;-)

> What about introducing a CPU_STARTING notifier call, similar to CPU_DYING:
> - called with disabled interrupts
> - called before interrupts are enabled
> - must not sleep
> - called on the new cpu.

I suppose another approach would be to make calibrate_delay() kick the
watchdog timer every so often, but the effort of explaining to people why
their machine's bogomips had decreased would probably far exceed any
possible benefit...

So given a CPU_STARTING notifier, RCU could set a bit in a coming-online
bitmap, which would be reset by rcu_online_cpu().  Then RCU would consider
a CPU offline only if it is marked offline in cpu_online_map -and- it is
not marked as coming-online.

> This might also be useful for something like kvm. I'm not sure if it's 
> guaranteed that hardware_enable() runs early enough.
>
> Attached is a patch proposal. Note that it doesn't work correctly: On 
> x86-64, I have seen that CPU_STARTING is called after CPU_ONLINE. Thus 
> frozen_cpus could already be cleared, then _FROZEN would be wrong.

Then notify_cpu_starting() is invoked right before smp_callin() in
start_secondary()?

							Thanx, Paul

> --
> Manfred

> diff --git a/include/linux/cpu.h b/include/linux/cpu.h
> index d7faf88..c2747ac 100644
> --- a/include/linux/cpu.h
> +++ b/include/linux/cpu.h
> @@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
>  #endif
> 
>  int cpu_up(unsigned int cpu);
> +void notify_cpu_starting(unsigned int cpu);
>  extern void cpu_hotplug_init(void);
>  extern void cpu_maps_update_begin(void);
>  extern void cpu_maps_update_done(void);
> diff --git a/include/linux/notifier.h b/include/linux/notifier.h
> index da2698b..8e47661 100644
> --- a/include/linux/notifier.h
> +++ b/include/linux/notifier.h
> @@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret)
>  #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
>  #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
>  #define CPU_DYING		0x0008 /* CPU (unsigned)v not running any task,
> -				        * not handling interrupts, soon dead */
> +				        * not handling interrupts, soon dead.
> +				        * Called on the dying cpu, interrupts
> +				        * are already disabled. Must not
> +				        * sleep, must not fail */
>  #define CPU_POST_DEAD		0x0009 /* CPU (unsigned)v dead, cpu_hotplug
>  					* lock is dropped */
> +#define CPU_STARTING		0x000A /* CPU (unsigned)v soon running.
> +					* Called on the new cpu, just before
> +					* enabling interrupts. Must not sleep,
> +					* must not fail */
> 
>  /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
>   * operation in progress
> diff --git a/kernel/cpu.c b/kernel/cpu.c
> index 5b7c88f..2300fc0 100644
> --- a/kernel/cpu.c
> +++ b/kernel/cpu.c
> @@ -455,6 +455,25 @@ out:
>  }
>  #endif /* CONFIG_PM_SLEEP_SMP */
> 
> +/**
> + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
> + * @cpu: cpu that just started
> + *
> + * This function calls the cpu_chain notifiers with CPU_STARTING.
> + * It must be called by the arch code on the new cpu, immediately
> + * before enabling interrupts.
> + */
> +void notify_cpu_starting(unsigned int cpu)
> +{
> +	unsigned long val = CPU_STARTING;
> +
> +#ifdef CONFIG_PM_SLEEP_SMP
> +	if (cpu_isset(cpu, frozen_cpus))
> +		val = CPU_STARTING_FROZEN;
> +#endif /* CONFIG_PM_SLEEP_SMP */
> +	raw_notifier_call_chain(&cpu_chain, val, (void*)(long)cpu);
> +}
> +
>  #endif /* CONFIG_SMP */
> 
>  /*


^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-05 15:29     ` [PATCH, RFC] v4 " Paul E. McKenney
                         ` (2 preceding siblings ...)
  2008-09-07 10:18       ` [RFC, PATCH] Add a CPU_STARTING notifier (was: Re: [PATCH, RFC] v4 scalable classic RCU implementation) Manfred Spraul
@ 2008-09-15 16:02       ` Paul E. McKenney
  2008-09-16 16:52         ` Manfred Spraul
  2008-09-23 23:53         ` [PATCH, RFC] v6 " Paul E. McKenney
  3 siblings, 2 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-15 16:02 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt, peterz, penberg, andi

Hello!

This patch fixes a long-standing performance bug in classic RCU that
results in massive lock contention on the internal RCU lock on systems
with more than a few hundred CPUs.  Although this patch creates a
separate flavor of RCU for ease of review and patch maintenance, it
is intended to replace classic RCU.

Still experimental, not for inclusion, but given that I am now finding
more bugs in the rest of Linux than in this code, I suspect that it is
getting close.  Definitely ready for serious experimental use, especially
in !CONFIG_NO_HZ configurations.  In particular, experience on an actual
>1000-CPU machine would be most welcome.

Updates from v4:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a
hierarchy, greatly reducing the contention on the top-level lock
for large machines.  This passes 10-hour concurrent rcutorture and
online-offline testing on 128-CPU ppc64 without dynticks enabled,
and exposes some timekeeping bugs in presence of dynticks (exciting
working on a system where "sleep 1" hangs until interrupted...).
It is OK for experimental work, but not yet ready for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	Entering and leaving dynticks idle mode is a quiescent state,
	but the current patch doesn't directly take advantage of this
	(noted by Manfred).  I need to get the timekeeping bugs fixed
	before I can reliably test this added feature -- and may defer
	this in any case, as force_quiescent_state() already covers
	this case.

o	There is a bit of debug code in place.  This will be removed.

o	There are a few places where grace periods are unnecessarily
	delayed.

o	There are probably hangs, rcutorture failures, &c.  In particular,
	the case where an interrupt from dynticks idle invokes call_rcu()
	requires a bit more thought.  And it requires NMIs to be sorted
	as noted above.

o	There are a few architectures that will sometimes execute irq
	handlers on CPUs that are already marked offline.  This is the
	subject of separate patches, most of which are on their way in.
	(Yes, you do have to have a very unlikely code construct hitting
	an unlikely sequence of events for anything bad to happen,
	but still needs to be fixed.)

o	Structure field layout is likely highly suboptimal.  On the other
	hand, given that the read-side primitives do not touch any of
	this data, this issue is not as pressing as it might otherwise be.

o	There is not yet a human-readable design document.  Will be fixed.

To build, start with 2.6.27-rc3, and apply:

	http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-treeRCU-20.patch

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 
 arch/powerpc/platforms/pseries/rtasd.c |    4 
 include/linux/hardirq.h                |   14 
 include/linux/rcupdate.h               |   10 
 include/linux/rcutree.h                |  320 +++++++
 init/Kconfig                           |   15 
 kernel/Kconfig.preempt                 |   70 +
 kernel/Makefile                        |    6 
 kernel/rcupreempt.c                    |   10 
 kernel/rcutree.c                       | 1488 +++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c                 |  235 +++++
 kernel/softirq.c                       |    3 
 lib/Kconfig.debug                      |   13 
 12 files changed, 2163 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index c9ffd8c..d8e784a 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 	/* Check to see if we need to or have stopped logging */
 	if (fatal || !logging_enabled) {
 		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		else
 			rtas_log_start += 1;
 
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		wake_up_interruptible(&rtas_log_wait);
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..9b70b92 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
+extern void rcu_nmi_enter(void);
+extern void rcu_nmi_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+# define rcu_nmi_enter() do { } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
@@ -134,7 +138,6 @@ extern void rcu_irq_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -153,7 +156,6 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
-		rcu_irq_exit();				\
 	} while (0)
 
 /*
@@ -161,7 +163,7 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()		do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
+#define nmi_exit()		do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e8b4039..f8544ae 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,15 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
-#ifdef CONFIG_CLASSIC_RCU
+#if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
-#else /* #ifdef CONFIG_CLASSIC_RCU */
+#elif defined(CONFIG_TREE_RCU)
+#include <linux/rcutree.h>
+#elif defined(CONFIG_PREEMPT_RCU)
 #include <linux/rcupreempt.h>
-#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+#else
+#error "Unknown RCU implementation specified to kernel configuration"
+#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
 
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 0000000..43aa355
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,320 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+
+#ifndef __LINUX_RCUTREE_H
+#define __LINUX_RCUTREE_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if (NR_CPUS) <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	int dynticks_nesting;	/* Track nesting level, sort of. */
+	int dynticks;		/* Even value for dynticks-idle, else odd. */
+	int dynticks_nmi;	/* Even value for either dynticks-idle or */
+				/*  not in nmi handler, else odd.  So this */
+				/*  remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+	/* 3) rcu-barrier functions */
+	struct rcu_head barrier;
+
+#ifdef CONFIG_NO_HZ
+	/* 4) dynticks interface (see http://lwn.net/Articles/279077/) */
+	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 5) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_SAVE_DYNTICK	0	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		1	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK	 3	/* for rsp->seconds_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	30	/* for rsp->seconds_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* sent GP-kick IPIs? */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long seconds_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+static inline void __rcu_read_lock(void)
+{
+	preempt_disable();
+	__acquire(RCU);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock(void)
+{
+	rcu_read_release();
+	__release(RCU);
+	preempt_enable();
+}
+static inline void __rcu_read_lock_bh(void)
+{
+	local_bh_disable();
+	__acquire(RCU_BH);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock_bh(void)
+{
+	rcu_read_release();
+	__release(RCU_BH);
+	local_bh_enable();
+}
+
+#define __synchronize_sched() synchronize_rcu()
+
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
+static inline void rcu_init_sched(void)
+{
+}
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+#ifdef CONFIG_NO_HZ
+void rcu_enter_nohz(void);
+void rcu_exit_nohz(void);
+#else /* CONFIG_NO_HZ */
+static inline void rcu_enter_nohz(void)
+{
+}
+static inline void rcu_exit_nohz(void)
+{
+}
+#endif /* CONFIG_NO_HZ */
+
+#endif /* __LINUX_RCUTREE_H */
diff --git a/init/Kconfig b/init/Kconfig
index b678803..82ee8f7 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -914,10 +914,11 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
-config CLASSIC_RCU
-	def_bool !PREEMPT_RCU
-	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.  Classic RCU is the default.  Note that the
-	  PREEMPT_RCU symbol is used to select/deselect this option.
+config RCU_TRACE
+	def_bool TREE_RCU_TRACE || PREEMPT_RCU_TRACE
+	select DEBUG_FS
+	help
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.  Note that
+	  either TREE_RCU_TRACE or PREEMPT_RCU_TRACE is used to
+	  select/deselect this option.
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03..14257ef 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,29 @@ config PREEMPT
 
 endchoice
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+	  
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP systems with hundreds or
+	  thousands of CPUs.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
-	default n
 	help
 	  This option reduces the latency of the kernel by making certain
 	  RCU sections preemptible. Normally RCU code is non-preemptible, if
@@ -64,16 +83,57 @@ config PREEMPT_RCU
 	  now-naive assumptions about each RCU read-side critical section
 	  remaining on a given CPU through its execution.
 
+endchoice
+
+config TREE_RCU_TRACE
+	bool "Enable tracing for tree-based hierarchical RCU"
+	depends on TREE_RCU
+	help
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.
+
+	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
 
-config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
+config PREEMPT_RCU_TRACE
+	bool "Enable tracing for preemptable RCU"
 	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
+
+	
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df..101e880 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 2782793..6bc8489 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -559,6 +559,16 @@ void rcu_irq_exit(void)
 	}
 }
 
+void rcu_nmi_enter(void)
+{
+	rcu_irq_enter();
+}
+
+void rcu_nmi_exit(void)
+{
+	rcu_irq_exit();
+}
+
 static void dyntick_save_progress_counter(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 0000000..366fe17
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1488 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
+
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+#endif /* #ifdef CONFIG_NO_HZ */
+
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
+	/*
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
+	 */
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
+	}
+
+	/* The CPU is online, so send it a reschedule IPI. */
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#ifdef CONFIG_NO_HZ
+static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
+
+/*
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ */
+void rcu_enter_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_dynticks).dynticks & 0x1, &rcu_rs);
+	local_irq_restore(flags);
+}
+
+/*
+ * Exit nohz mode.
+ */
+void rcu_exit_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting--;
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dynticks).dynticks & 0x1),
+			  &rcu_rs);
+	local_irq_restore(flags);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_enter - Called from NMI
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is active.
+ */
+void rcu_nmi_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+}
+
+/**
+ * rcu_nmi_exit - Called from NMI
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is no longer active.
+ */
+void rcu_nmi_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+}
+
+/**
+ * rcu_irq_enter - Called from hard irq handlers
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks_nesting++) {
+		return;
+	}
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+}
+
+/**
+ * rcu_irq_exit - Called when exiting hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update rdtp->dynticks
+ * to let the RCU handling know that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (--rdtp->dynticks_nesting) {
+		return;
+	}
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (__get_cpu_var(rcu_data).nxtlist ||
+	    __get_cpu_var(rcu_bh_data).nxtlist)
+		set_need_resched();
+}
+
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations.  Specify "rsp->completed - 1" to
+ * unconditionally invalidate any future dynticks manipulations (which is
+ * useful at the beginning of a grace period).
+ *
+ * The value is passed as a long, matching the type of rsp->completed and
+ * rsp->dynticks_completed, so that it is not truncated on 64-bit systems.
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	int ret;
+	int snap;
+	int snap_nmi;
+
+	snap = rdp->dynticks->dynticks;
+	snap_nmi = rdp->dynticks->dynticks_nmi;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	rdp->dynticks_nmi_snap = snap_nmi;
+	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through a dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	long curr;
+	long curr_nmi;
+	long snap;
+	long snap_nmi;
+
+	curr = rdp->dynticks->dynticks;
+	snap = rdp->dynticks_snap;
+	curr_nmi = rdp->dynticks->dynticks_nmi;
+	snap_nmi = rdp->dynticks_nmi_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq/NMI handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr != snap || (curr & 0x1) == 0) &&
+	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+		rdp->dynticks_fqs++;
+		return 1;
+	}
+
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#else /* #ifdef CONFIG_NO_HZ */
+
+/*
+ * Without dynticks there is no recorded "completed" value to keep, so
+ * this is a no-op.  The parameter is a long to match rsp->completed.
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+}
+
+/*
+ * If there are no dynticks, then the only way that a CPU can passively
+ * be in a quiescent state is to be offline.  Unlike dynticks idle, which
+ * is a point in time during the prior (already finished) grace period,
+ * an offline CPU is always in a quiescent state, and thus can be
+ * unconditionally applied.  So just return the current value of completed.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->completed;
+}
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; }
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+	rsp->gp_start = jiffies;
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+/*
+ * Complain about other CPUs that are stalling the current grace period:
+ * print each CPU whose bit is still set in the ->qsmask of a last-level
+ * rcu_node, then force quiescent states.  Returns without printing if
+ * fewer than two seconds have passed since rsp->seconds_stall or if no
+ * grace period is in progress.
+ */
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = get_seconds() - rsp->seconds_stall;
+
+	/*
+	 * rsp->gpnum == rsp->completed means that no grace period is in
+	 * progress, in which case there is nothing to be stalled on.
+	 * Bail out in that case, or if it is too soon to complain.
+	 */
+	if (delta < 2 || rsp->gpnum == rsp->completed) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		if (rnp_cur->qsmask == 0)
+			continue;
+		/* Bit position within ->qsmask is the offset from ->grplo. */
+		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
+			if (rnp_cur->qsmask & (1UL << cpu))
+				printk(" %d", rnp_cur->grplo + cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	force_quiescent_state(rsp, 0);  /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
+			smp_processor_id(), get_seconds(),
+			jiffies - rsp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rnp->lock, flags);
+	if ((long)(get_seconds() - rsp->seconds_stall) >= 0)
+		rsp->seconds_stall =
+			get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long delta;
+	struct rcu_node *rnp;
+
+	delta = get_seconds() - rsp->seconds_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rsp);
+
+	} else if (rsp->gpnum != rsp->completed && delta >= 2) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rsp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.
+ *
+ * rsp:  the rcu_state for which a grace period may be started.
+ * iflg: the irq flags the caller saved when acquiring the root node's lock.
+ *
+ * The caller must hold the root rcu_node's ->lock with hard irqs disabled.
+ * That lock is released on every return path, which is why the sparse
+ * annotation below names the root node's lock (and not a per-CPU rcu_data
+ * field, which is not what this function unlocks).
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
+	__releases(rcu_get_root(rsp)->lock)
+{
+	unsigned long flags = iflg;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	/* Nothing to do unless this CPU actually needs a grace period. */
+	if (!cpu_needs_another_gp(rsp, rdp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+	rsp->gpnum++;
+	rsp->signaled = RCU_SIGNAL_INIT;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time(rsp);
+	dyntick_record_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Multi-level hierarchy: drop the root lock before the full re-init. */
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
+ * If a grace period has ended since this CPU last looked, advance the
+ * CPU's callback segments accordingly.  May be called only from the CPU
+ * to which rdp belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long irqflags;
+	long snap;
+
+	local_irq_save(irqflags);
+
+	/* Sample the global counter exactly once; no lock is held here. */
+	snap = ACCESS_ONCE(rsp->completed);
+	if (rdp->completed == snap) {
+
+		/* No grace period has ended since the last check. */
+		local_irq_restore(irqflags);
+		return;
+	}
+
+	/*
+	 * Shift each callback segment one stage closer to invocation.
+	 * Each assignment reads the pointer that the following one
+	 * overwrites, so the order must stay as is.  Harmless if the
+	 * list is empty.
+	 */
+	rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Record the completion so that it is not processed twice. */
+	rdp->completed = snap;
+	local_irq_restore(irqflags);
+}
+
+/*
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
+ *
+ * mask:  bits to clear in rnp->qsmask, one per CPU being quieted.
+ * rsp:   the rcu_state whose grace period is being advanced.
+ * rnp:   the leaf rcu_node covering those CPUs, with ->lock held.
+ * flags: irq flags saved when rnp->lock was acquired.
+ */
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	/* Walk up the rcu_node hierarchy. */
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+
+		/* This node is now fully quiet: report it using its own bit. */
+		mask = rnp->grpmask;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+
+		/* Hand over hand: drop this node's lock, then take the parent's. */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  When invoking this on one's own
+ * behalf, lastcomp is used to make sure we are still in the grace period
+ * of interest.  We don't want to end the current grace period based on
+ * quiescent states detected in an earlier grace period!  On the other hand,
+ * if the CPU being quieted is offline, we can safely pass in lastcomp==NULL,
+ * since an offline CPU is in a quiescent state with respect to any grace
+ * period, unlike pesky online CPUs, which can go non-quiescent with
+ * absolutely no warning.
+ *
+ * cpu:      the CPU being quieted (not otherwise used in the body).
+ * rsp:      the rcu_state being updated.
+ * rdp:      the rcu_data of the CPU being quieted.
+ * lastcomp: pointer to the ->completed value sampled when the quiescent
+ *           state was observed, or NULL to skip the staleness check.
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != NULL &&
+	    *lastcomp != ACCESS_ONCE(rsp->completed)) {
+
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	mask = rdp->grpmask;
+	if ((rnp->qsmask & mask) == 0) {
+
+		/* The leaf is not waiting on this CPU, so nothing to report. */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+		rdp->qs_pending = 0;
+
+		/*
+		 * This GP can't end until cpu checks in, so all of our
+		 * callbacks can be processed during the next GP.
+		 *
+		 * Note that rdp is switched to the running CPU's rcu_data
+		 * here, which differs from the quieted CPU's rcu_data when
+		 * this is invoked on behalf of another (offline) CPU.
+		 */
+		rdp = rsp->rda[smp_processor_id()];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * First see whether a grace period has started that this CPU does not
+ * yet know about; if so, just set up the local rcu_data state for it.
+ * Otherwise, if this CPU still owes a quiescent state for the current
+ * grace period and has passed through one, report it.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* A newly noticed grace period is recorded and nothing more. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Report only when the RCU core is still waiting on this CPU
+	 * and the CPU has seen a quiescent state since the grace period
+	 * began.  cpu_quiet() makes the final call on whether the
+	 * report still applies to the current grace period.
+	 */
+	if (rdp->qs_pending && rdp->passed_quiesc)
+		cpu_quiet(rdp->cpu, rsp, rdp, &rdp->passed_quiesc_completed);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ *
+ * cpu: the CPU going offline.
+ * rsp: the rcu_state from which to remove it.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+{
+	int i;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Remove the outgoing CPU from the masks in the rcu_node hierarchy.
+	 * Keep climbing only while the node just updated has no CPUs left,
+	 * in which case that node's own bit must be cleared in its parent.
+	 */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	do {
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+						/* @@@ move up to simplify. */
+		rnp = rnp->parent;
+	} while (rnp != NULL);
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+
+	/*
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 */
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+
+		/* Splice the whole outgoing list onto the end of ours. */
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Leave the outgoing CPU with an empty, self-consistent list. */
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
+
+	/* Pairs with the spin_lock_irqsave() on ->onofflock above. */
+	local_irq_restore(flags);
+}
+
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
+static void rcu_offline_cpu(int cpu)
+{
+	/* Handle both flavors: first rcu, then rcu_bh. */
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_offline_cpu(int cpu)
+{
+	/* CPU hotplug is configured out, so there is nothing to undo. */
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.  Throttle as specified by rdp->blimit.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling to prevent
+	 * races with call_rcu() from interrupt handlers.
+	 *
+	 * After this, "list" heads the detached ready callbacks, "tail"
+	 * points at the NULL ->next slot that terminates them, and
+	 * rdp->nxtlist heads whatever was queued behind them.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
+	tail = rdp->nxttail[RCU_DONE_TAIL];
+
+	/*
+	 * Any segment pointer that referenced the detached tail must now
+	 * reference the list head instead.  NOTE(review): counting down
+	 * presumably ensures that the RCU_DONE_TAIL entry, which is the
+	 * comparison reference, is rewritten last -- confirm that it is
+	 * index 0.
+	 */
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	/* Update count, and requeue any remaining callbacks. */
+	local_irq_save(flags);
+	rdp->qlen -= count;
+	if (list != NULL) {
+
+		/* Put the uninvoked callbacks back at the front of the list. */
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+
+		/* Leading segments that were empty now end at the requeued tail. */
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+	local_irq_restore(flags);
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ *
+ * cpu:  the CPU on whose behalf the check is made.
+ * user: nonzero if the interrupt was taken from user mode.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * Also do a memory barrier.  This is needed to handle
+		 * the case where writes from a preempt-disable section
+		 * of code get reordered into schedule() by this CPU's
+		 * write buffer.  The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+		 * by other CPUs to happen after any such write.
+		 */
+
+		smp_mb();  /* See above block comment. */
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an rcu_bh
+		 * quiescent state, so count it.  The memory barrier
+		 * is needed for the same reason as is the above one.
+		 */
+
+		smp_mb();  /* See above block comment. */
+		rcu_bh_qsctr_inc(cpu);
+	}
+
+	/* The softirq handler is scheduled regardless of the outcome above. */
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ *
+ * rsp:      the rcu_state whose leaves are scanned.
+ * lastcomp: the ->completed value identifying the grace period of interest.
+ * f:        per-CPU predicate; a nonzero return causes that CPU's bit to be
+ *           reported as quiescent.
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+
+		/* The grace period of interest is over, so stop scanning. */
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+
+		/* This leaf is not waiting on any CPU, so skip it. */
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+
+		/* Apply f() to each CPU this leaf is still waiting on. */
+		cpu = rnp_cur->grplo;
+		bit = 1;
+		mask = 0;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+				mask |= bit;
+		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ *
+ * rsp:     the rcu_state whose grace period is to be pushed along.
+ * relaxed: if nonzero, do nothing unless rsp->jiffies_force_qs has passed.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags))
+		return;	/* Someone else is already on the job. */
+	if (relaxed && (long)(rsp->jiffies_force_qs - jiffies) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+
+	/* Snapshot grace-period state under the root lock. */
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	if (rsp->completed == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_SAVE_DYNTICK:
+
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break; /* So gcc recognizes the dead code. */
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_record_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggards. */
+		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	/* !CONFIG_SMP variant: just ask for a reschedule; both arguments unused. */
+	set_need_resched();
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * Softirq-context RCU processing for one rcu_state/rcu_data pair.
+ * May be called only from the CPU to which rdp belongs.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long irqflags;
+	struct rcu_node *root;
+	long until_force;
+
+	/*
+	 * Once a grace period has run past its deadline, go look for
+	 * dyntick-idle CPUs and, if needed, send resched IPIs.
+	 */
+	until_force = (long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies);
+	if (until_force < 0)
+		force_quiescent_state(rsp, 1);
+
+	/* Pick up the end of any grace period that another CPU ended. */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Fold in any quiescent state this CPU has recently passed through. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Start a grace period if this CPU needs one that is not yet running. */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		root = rcu_get_root(rsp);
+		spin_lock_irqsave(&root->lock, irqflags);
+		rcu_start_gp(rsp, irqflags);  /* releases root->lock */
+	}
+
+	/* Finally, invoke whatever callbacks are ready. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU: run the per-flavor
+ * processing for rcu and then for rcu_bh, bracketed by memory barriers.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+	 */
+	smp_mb(); /* See above block comment. */
+
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+	 */
+	smp_mb(); /* See above block comment. */
+}
+
+/*
+ * Common code for call_rcu() and call_rcu_bh(): queue the callback on the
+ * running CPU's list for the given rcu_state, and force quiescent states
+ * if the queue has grown too long or the grace period has run too long.
+ */
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+	   struct rcu_state *rsp)
+{
+	unsigned long irqflags;
+	struct rcu_data *my_rdp;
+
+	/* Fill in the callback before it becomes reachable from the list. */
+	head->next = NULL;
+	head->func = func;
+
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	local_irq_save(irqflags);
+	my_rdp = rsp->rda[smp_processor_id()];
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * A beginning may be seen right after an end, but never the
+	 * other way around, since this CPU has to pass through a
+	 * quiescent state in between.
+	 */
+	rcu_process_gp_end(rsp, my_rdp);
+	check_for_new_grace_period(rsp, my_rdp);
+
+	/* Append the new callback at the very end of the list. */
+	*my_rdp->nxttail[RCU_NEXT_TAIL] = head;
+	my_rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	my_rdp->qlen++;
+	if (unlikely(my_rdp->qlen > qhimark)) {
+
+		/* Too many callbacks queued: lift the batch limit and push. */
+		my_rdp->blimit = INT_MAX;
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) {
+		force_quiescent_state(rsp, 1);
+	}
+	local_irq_restore(irqflags);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ *
+ * head: the rcu_head to queue.
+ * func: the function to invoke on head once the grace period has elapsed.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ * Same as call_rcu(), except that the callback is queued against
+ * rcu_bh_state.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Report whether the current CPU has any immediate RCU-related work for
+ * the specified type of RCU, returning 1 if so and 0 otherwise.  Apart
+ * from the stall check, which must run first or it might never get a
+ * chance, the tests are evaluated in order of increasing expense, with
+ * CPU-local state consulted before shared state.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rsp, rdp);
+
+	/*
+	 * In order, stopping at the first that holds:
+	 *  - the RCU core is waiting for a quiescent state from this CPU;
+	 *  - this CPU has callbacks ready to invoke;
+	 *  - RCU has gone idle with this CPU needing another grace period;
+	 *  - another grace period has completed (read outside of lock);
+	 *  - a new grace period has started (read outside of lock);
+	 *  - a grace period has gone long enough to send resched IPIs &c.
+	 */
+	return rdp->qs_pending ||
+	       cpu_has_callbacks_ready_to_invoke(rdp) ||
+	       cpu_needs_another_gp(rsp, rdp) ||
+	       ACCESS_ONCE(rsp->completed) != rdp->completed ||
+	       ACCESS_ONCE(rsp->gpnum) != rdp->gpnum ||
+	       (long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0;
+}
+
+/*
+ * Report whether the given CPU has any immediate RCU-related work for
+ * either flavor, returning 1 if so.  Part of the RCU implementation;
+ * -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	/* The rcu_bh flavor is consulted only if rcu has nothing to do. */
+	if (__rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)))
+		return 1;
+	if (__rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)))
+		return 1;
+	return 0;
+}
+
+/*
+ * Report whether the given CPU will need to do RCU-related work in the
+ * future, even if none is needed right now, returning 1 if so.  Part of
+ * the RCU implementation; -not- an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	/* Any queued callback, ready or still pending, counts for either flavor. */
+	if (per_cpu(rcu_data, cpu).nxtlist != NULL)
+		return 1;
+	if (per_cpu(rcu_bh_data, cpu).nxtlist != NULL)
+		return 1;
+	return 0;
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ *
+ * cpu: the CPU being brought online.
+ * rsp: the rcu_state in which to register it.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	rdp->completed = rsp->completed;
+	rdp->gpnum = rsp->completed;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->passed_quiesc_completed = rsp->completed - 1;
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+
+	/* Start with an empty callback list: every segment is empty. */
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
+
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
+
+	/*
+	 * Add CPU to rcu_node bitmasks, climbing until reaching an
+	 * ancestor that already has the child's bit set (or the root).
+	 */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
+
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+
+	/* Pairs with the spin_lock_irqsave() on the root lock above. */
+	local_irq_restore(flags);
+}
+
+/*
+ * Prepare RCU state for a CPU that is coming online: reset its dynticks
+ * bookkeeping (CONFIG_NO_HZ only), initialize its per-CPU data for both
+ * the rcu and rcu_bh flavors, and register the RCU softirq handler.
+ */
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+#ifdef CONFIG_NO_HZ
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	rdtp->dynticks_nesting = 1;
+	rdtp->dynticks |= 1; 	/* need consecutive #s even for hotplug. */
+	rdtp->dynticks_nmi = (rdtp->dynticks + 1) & ~0x1;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
+
+	/* NOTE(review): this runs on every online event, not just once. */
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Handle CPU online/offline notification events.
+ *
+ * self:   the notifier block (unused here).
+ * action: the hotplug event being reported.
+ * hcpu:   the CPU number, passed as a pointer-sized integer.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		/* A cancelled bring-up is undone the same way as a dead CPU. */
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+/*
+ * CONFIG_RCU_FANOUT_EXACT variant: give every level of the hierarchy
+ * exactly CONFIG_RCU_FANOUT children per node, with no balancing.
+ * The result is stored in rsp->levelspread[], which is the array that
+ * rcu_init_one() reads back.
+ */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--)
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+/*
+ * Balanced variant: walking up from the leaves, set each level's fanout
+ * to the number of entities one level down (CPUs, for the leaf level)
+ * divided by the number of nodes at this level, rounded up.
+ */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int lvl = NUM_RCU_LVLS - 1;
+	int below = NR_CPUS;	/* count of entities one level down */
+	int here;		/* count of nodes at the current level */
+
+	while (lvl >= 0) {
+		here = rsp->levelcnt[lvl];
+		rsp->levelspread[lvl] = (below + here - 1) / here;
+		below = here;
+		lvl--;
+	}
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure:
+ * fills in the per-level pointers and fanouts, then initializes every
+ * rcu_node with its CPU range, its position within its parent, and an
+ * empty set of quiescent-state bits.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int cpustride = 1;
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/*
+	 * Initialize the level-tracking arrays.  Each level starts right
+	 * after the previous level's nodes; level[0] is not assigned here.
+	 */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++) {
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	}
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		/* cpustride becomes the number of CPUs spanned per node at level i. */
+		cpustride *= rsp->levelspread[i];
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->qsmaskinit = 0;
+
+			/* Inclusive CPU range, clipped to the last possible CPU. */
+			rnp->grplo = j * cpustride;
+			rnp->grphi = (j + 1) * cpustride - 1;
+			if (rnp->grphi >= NR_CPUS)
+				rnp->grphi = NR_CPUS - 1;
+			if (i == 0) {
+				/* The root has no parent and hence no bit in one. */
+				rnp->grpnum = 0;
+				rnp->grpmask = 0;
+				rnp->parent = NULL;
+			} else {
+				/* Index within the parent, the matching bit, and the parent. */
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->grpmask = 1UL << rnp->grpnum;
+				rnp->parent = rsp->level[i - 1] + 
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for __rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure, and
+ * records each CPU's rcu_data pointer in (rsp)->rda[].
+ *
+ * The macro is not hygienic: it uses the variables i, j, and rnp, which
+ * must be declared by the invoking function.  It steps to the next leaf
+ * whenever the CPU number passes the current leaf's ->grphi, advancing
+ * at most one leaf per CPU.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		if (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
+/* CPU-hotplug notifier block that routes events to rcu_cpu_notify(). */
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,
+};
+
+/*
+ * Boot-time RCU initialization: build the rcu_node hierarchy for both the
+ * rcu and rcu_bh flavors, attach each possible CPU's rcu_data to its leaf
+ * node, run the online processing for the CPUs that are already up, and
+ * register the hotplug notifier for those that come up later.
+ */
+void __init __rcu_init(void)
+{
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+
+	/*
+	 * Report stall detection under the same config symbol that
+	 * actually compiles the detector into this file, so that the
+	 * message matches reality.
+	 */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	/* CPUs already online never get a notifier call, so do it by hand. */
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+}
+
+/*
+ * Tunables consulted by rcu_do_batch() and __call_rcu(): the callback
+ * batch limit and the queue-length high and low watermarks.
+ */
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 0000000..18d1613
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,235 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+extern int tick_do_timer_cpu __read_mostly;  /* @@@ DEBUG @@@ */
+
+static DEFINE_MUTEX(rcuclassic_trace_mutex);	/* Guards the buffer below. */
+static char *rcuclassic_trace_buf;	/* Shared by all read handlers. */
+#define RCUPREEMPT_TRACE_BUF_SIZE (512*NR_CPUS)	/* Buffer bytes. */
+
+/*
+ * Format one CPU's state into buf (limit ebuf); returns bytes stored, or 0
+ * if the CPU was never online.  scnprintf() keeps cnt from passing ebuf.
+ */
+static int print_one_rcu_data(struct rcu_data *rdp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+
+	if (!rdp->beenonline)
+		return 0;
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		"%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
+		rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		rdp->completed, rdp->gpnum, rdp->passed_quiesc,
+		rdp->passed_quiesc_completed, rdp->qs_pending);
+#ifdef CONFIG_NO_HZ
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], " dt=%d dn=%d df=%lu",
+		rdp->dynticks->dynticks, rdp->dynticks->dynticks_nmi,
+		rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" of=%lu ri=%lu ql=%ld b=%ld\n", rdp->offline_fqs,
+		rdp->resched_ipi, rdp->qlen, rdp->blimit);
+	return cnt;
+}
+
+#define PRINT_RCU_DATA(name, buf, ebuf) \
+	do { \
+		int _p_r_d_i; \
+		/* One call per possible CPU; (buf) advances past each. */ \
+		for_each_possible_cpu(_p_r_d_i) \
+			(buf) += print_one_rcu_data(&per_cpu(name, _p_r_d_i), \
+						    buf, ebuf); \
+	} while (0)
+
+static ssize_t rcudata_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+	/* rcu/rcudata read; scnprintf() cannot advance buf past ebuf. */
+	mutex_lock(&rcuclassic_trace_mutex);	/* Buffer is shared. */
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/* Format rsp's counters and each rcu_node into buf; returns bytes stored. */
+static int print_one_rcu_state(struct rcu_state *rsp, char *buf, char *ebuf)
+{
+	int cnt = 0;		/* Bytes stored; scnprintf() bounds it. */
+	int level = 0;
+	struct rcu_node *rnp;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		"c=%ld g=%ld s=%d jfq=%ld nfqs=%lu/nfqsng=%lu(%lu)\n",
+		rsp->completed, rsp->gpnum, rsp->signaled,
+		(long)(rsp->jiffies_force_qs - jiffies), rsp->n_force_qs,
+		rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {	/* New output row per level. */
+			cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+			level = rnp->level;
+		}
+		cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+				"%lx/%lx %d:%d ^%d    ",
+				rnp->qsmask, rnp->qsmaskinit,
+				rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+	return cnt;
+}
+
+static ssize_t rcuhier_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+	/* rcu/rcuhier read; scnprintf() cannot advance buf past ebuf. */
+	mutex_lock(&rcuclassic_trace_mutex);	/* Buffer is shared. */
+	buf += scnprintf(buf, ebuf - buf, "rcu: tick_do_timer_cpu=%d\n",
+			tick_do_timer_cpu);  /* @@@ DEBUG @@@ */
+	buf += print_one_rcu_state(&rcu_state, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	buf += print_one_rcu_state(&rcu_bh_state, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+	/* rcu/rcugp read; scnprintf() cannot advance buf past ebuf. */
+	mutex_lock(&rcuclassic_trace_mutex);	/* Buffer is shared. */
+	buf += scnprintf(buf, ebuf - buf, "rcu: completed=%ld  gpnum=%ld\n",
+			rcu_state.completed, rcu_state.gpnum);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh: completed=%ld  gpnum=%ld\n",
+			rcu_bh_state.completed, rcu_bh_state.gpnum);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.read = rcudata_read,		/* Backs debugfs file rcu/rcudata. */
+};
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuhier_read,		/* Backs debugfs file rcu/rcuhier. */
+};
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,		/* Backs debugfs file rcu/rcugp. */
+};
+
+static struct dentry *rcudir, *datadir, *hierdir, *gpdir;
+
+/* Create debugfs rcu/{rcudata,rcugp,rcuhier}; returns 0 or -ENOMEM. */
+static int rcuclassic_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:
+	debugfs_remove(datadir);	/* NULL-safe; files before directory. */
+	debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return -ENOMEM;
+}
+
+/* Allocate the trace buffer and create the debugfs files; 0 or -ENOMEM. */
+static int __init rcuclassic_trace_init(void)
+{
+	rcuclassic_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcuclassic_trace_buf)
+		return -ENOMEM;
+	if (rcuclassic_debugfs_init()) {
+		kfree(rcuclassic_trace_buf);
+		return -ENOMEM;	/* Init must fail with a negative errno. */
+	}
+	return 0;
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{
+	debugfs_remove(datadir);	/* The three files go first ... */
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);		/* ... then their parent directory. */
+	kfree(rcuclassic_trace_buf);	/* From rcuclassic_trace_init(). */
+}
+
+
+module_init(rcuclassic_trace_init);	/* Buffer + debugfs setup. */
+module_exit(rcuclassic_trace_cleanup);	/* Tears both down again. */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f26..e5a08bc 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -259,6 +259,7 @@ void irq_enter(void)
 	if (idle_cpu(cpu) && !in_interrupt())
 		tick_nohz_stop_idle(cpu);
 #endif
+	rcu_irq_enter();
 	__irq_enter();
 #ifdef CONFIG_NO_HZ
 	if (idle_cpu(cpu))
@@ -285,9 +286,9 @@ void irq_exit(void)
 
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
+	rcu_irq_exit();
 	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
 		tick_nohz_stop_sched_tick(0);
-	rcu_irq_exit();
 #endif
 	preempt_enable_no_resched();
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 800ac84..804e08c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_CPU_STALL_DETECTOR
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU || TREE_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-15 16:02       ` [PATCH, RFC] v4 scalable classic RCU implementation Paul E. McKenney
@ 2008-09-16 16:52         ` Manfred Spraul
  2008-09-16 17:30           ` Paul E. McKenney
  2008-09-23 23:53         ` [PATCH, RFC] v6 " Paul E. McKenney
  1 sibling, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-09-16 16:52 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

Hi Paul,

Paul E. McKenney wrote:
> +/*
> + * Scan the leaf rcu_node structures, processing dyntick state for any that
> + * have not yet encountered a quiescent state, using the function specified.
> + * Returns 1 if the current grace period ends while scanning (possibly
> + * because we made it end).
> + */
> +static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
> +			       int (*f)(struct rcu_data *))
> +{
> +	unsigned long bit;
> +	int cpu;
> +	unsigned long flags;
> +	unsigned long mask;
> +	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
> +	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
> +
> +	for (; rnp_cur < rnp_end; rnp_cur++) {
> +		mask = 0;
> +		spin_lock_irqsave(&rnp_cur->lock, flags);
> +		if (rsp->completed != lastcomp) {
> +			spin_unlock_irqrestore(&rnp_cur->lock, flags);
> +			return 1;
> +		}
> +		if (rnp_cur->qsmask == 0) {
> +			spin_unlock_irqrestore(&rnp_cur->lock, flags);
> +			continue;
> +		}
> +		cpu = rnp_cur->grplo;
> +		bit = 1;
> +		mask = 0;
> +		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
> +			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
> +				mask |= bit;
> +		}
>   
I'm still comparing my implementation with your code:
- f is called once for each cpu in the system, correct?
- if at least one cpu is in nohz mode, this loop will be needed for 
every grace period.

That means an O(NR_CPUS) loop with disabled local interrupts :-(
Is that correct?

Unfortunately, my solution is even worse:
My rcu_irq_exit() acquires a global spinlock when called on a nohz cpu.
A few cpus in cpu_idle, nohz, executing 50k network interrupts/sec would 
cacheline-thrash that spinlock.
I'm considering counting interrupts: if a nohz cpu executes more than a 
few interrupts/tick, then add a timer that checks rcu_pending().

Perhaps even that wouldn't be enough: I remember that the initial unhandled 
irq detection code broke miserably on large SGI systems:
An atomic_inc(&global_var) in the local timer interrupt (i.e.: 
NR_CPUS*HZ calls/sec) caused such severe thrashing that the system wouldn't 
boot. IIRC that was with 512 cpus.


--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-16 16:52         ` Manfred Spraul
@ 2008-09-16 17:30           ` Paul E. McKenney
  2008-09-16 17:48             ` Manfred Spraul
  0 siblings, 1 reply; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-16 17:30 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Tue, Sep 16, 2008 at 06:52:54PM +0200, Manfred Spraul wrote:
> Hi Paul,

Hello, Manfred!

Thank you for looking this over!

> Paul E. McKenney wrote:
>> +/*
>> + * Scan the leaf rcu_node structures, processing dyntick state for any 
>> that
>> + * have not yet encountered a quiescent state, using the function 
>> specified.
>> + * Returns 1 if the current grace period ends while scanning (possibly
>> + * because we made it end).
>> + */
>> +static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
>> +			       int (*f)(struct rcu_data *))
>> +{
>> +	unsigned long bit;
>> +	int cpu;
>> +	unsigned long flags;
>> +	unsigned long mask;
>> +	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
>> +	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
>> +
>> +	for (; rnp_cur < rnp_end; rnp_cur++) {
>> +		mask = 0;
>> +		spin_lock_irqsave(&rnp_cur->lock, flags);
>> +		if (rsp->completed != lastcomp) {
>> +			spin_unlock_irqrestore(&rnp_cur->lock, flags);
>> +			return 1;
>> +		}
>> +		if (rnp_cur->qsmask == 0) {
>> +			spin_unlock_irqrestore(&rnp_cur->lock, flags);
>> +			continue;
>> +		}
>> +		cpu = rnp_cur->grplo;
>> +		bit = 1;
>> +		mask = 0;
>> +		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
>> +			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
>> +				mask |= bit;
>> +		}
>>   
> I'm still comparing my implementation with your code:
> - f is called once for each cpu in the system, correct?

Not necessarily.  If all CPUs corresponding to this rcu_state struct
have checked in already, we don't even get to this loop -- see the
"continue" above.

> - if at least one cpu is in nohz mode, this loop will be needed for every 
> grace period.

The outer loop, yes.  The inner loop only for those rcu_state structs
that have at least one CPU in nohz mode.

> That means an O(NR_CPUS) loop with disabled local interrupts :-(
> Is that correct?

With the definition of "O()" being the worst-case execution time, yes.
But this worst case could only happen when the system was mostly idle,
in which case the added overhead should not be too horribly bad.  If the
system was busy enough that each CPU ran at least one process during each
grace period, then this function would not be invoked in the first place.

If this does prove to be a problem in practice, I will rework
force_quiescent_state() to run incrementally.  But I would rather
avoid both the added complexity and the resulting longer grace periods,
so someone needs to bring me a real-world problem before I take that
approach.

> Unfortunately, my solution is even worse:
> My rcu_irq_exit() acquires a global spinlock when called on a nohz cpus.
> A few cpus in cpu_idle, nohz, executing 50k network interrupts/sec would 
> cacheline-trash that spinlock.
> I'm considering counting interrupts: if a nohz cpu executes more than a few 
> interrupts/tick, then add a timer that check rcu_pending().

I tried putting a cpu_quiet() in my rcu_irq_exit() as well, and quickly
decided that this was counter-productive.  ;-)

> Perhaps even wouldn't be enough: I remember that the initial unhandled irq 
> detection code broke miserably on large SGI systems:
> An atomic_inc(&global_var) in the local timer interrupt (i.e.: NR_CPUS*HZ 
> calls/sec) caused so severe trashing that the system wouldn't boot. IIRC 
> that was with 512 cpus.

/me runs off and checks to make sure that all of my dyntick entry/exit
code restricts itself to per-CPU variables...

Yep!  (Whew!!!)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-16 17:30           ` Paul E. McKenney
@ 2008-09-16 17:48             ` Manfred Spraul
  2008-09-16 18:22               ` Paul E. McKenney
  2008-09-21 11:09               ` Manfred Spraul
  0 siblings, 2 replies; 94+ messages in thread
From: Manfred Spraul @ 2008-09-16 17:48 UTC (permalink / raw)
  To: paulmck
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

Paul E. McKenney wrote:
>   
>> That means an O(NR_CPUS) loop with disabled local interrupts :-(
>> Is that correct?
>>     
>
> With the definition of "O()" being the worst-case execution time, yes.
> But this worst case could only happen when the system was mostly idle,
> in which case the added overhead should not be too horribly bad.
No: "was mostly running cpu_idle()". A cpu_idle() cpu could execute lots 
of irqs and softirqs.
So the worst case would be a system with 1 cpu/node reserved for irq 
handling.
The "idle" cpu would be always in no_hz mode, even though it might be 
100% busy handling irqs.
The remaining cpus might be 100% busy handling user space.

And every quiescent state will end up in that O(NR_CPUS) loop.

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-16 17:48             ` Manfred Spraul
@ 2008-09-16 18:22               ` Paul E. McKenney
  2008-09-21 11:09               ` Manfred Spraul
  1 sibling, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-16 18:22 UTC (permalink / raw)
  To: Manfred Spraul
  Cc: linux-kernel, cl, mingo, akpm, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi

On Tue, Sep 16, 2008 at 07:48:00PM +0200, Manfred Spraul wrote:
> Paul E. McKenney wrote:
>>   
>>> That means an O(NR_CPUS) loop with disabled local interrupts :-(
>>> Is that correct?
>>>     
>>
>> With the definition of "O()" being the worst-case execution time, yes.
>> But this worst case could only happen when the system was mostly idle,
>> in which case the added overhead should not be too horribly bad.
>
> No: "was mostly running cpu_idle()". A cpu_idle() cpu could execute lots of 
> irqs and softirqs.
> So the worst case would be a system with 1 cpu/node for reserved for irq 
> handling.
> The "idle" cpu would be always in no_hz mode, even though it might be 100% 
> busy handling irqs.
> The remaning cpus might be 100% busy handling user space.
>
> And every quiescent state will end up in that O(NR_CPUS) loop.

Good point!

Indeed, if you had a 1024-CPU box acting as (say) a router/hub using
the Linux-kernel protocol stacks with no user-mode processing, then
you could indeed have the system mostly busy with no user-space code
running, and thus no quiescent states.

However, last I checked, almost all 1024-CPU boxes run HPC workloads
mostly in user mode, so this scenario would not occur.  However, again,
if it does come up, I would add an additional level of state machine
to the force_quiescent_state() family of functions, so that the scan
would be done incrementally.  Perhaps arranging for CPU groups to be
scanned by CPUs within that group.

But again, I don't want to take that step until I see someone actually
needing it.  Maybe the Vyatta guys will be there sooner than I think,
but...

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-16 17:48             ` Manfred Spraul
  2008-09-16 18:22               ` Paul E. McKenney
@ 2008-09-21 11:09               ` Manfred Spraul
  2008-09-21 21:14                 ` Paul E. McKenney
  1 sibling, 1 reply; 94+ messages in thread
From: Manfred Spraul @ 2008-09-21 11:09 UTC (permalink / raw)
  To: paulmck; +Cc: linux-kernel

Hi Paul,

Some further thoughts about design differences between your and my 
implementation:

- rcutree's qsmaskinit  is the worst-case list of cpus that could be in 
rcu read side critical sections.
- rcustate's cpu_total is the accurate list of cpus that could be in rcu 
read side critical sections.

Both variables are read rarely: for rcu_state, twice per grace period.

rcutree fixes up cpus that are "incorrectly" listed in qsmaskinit with 
force_quiescent_state(). It forces rcutree to use a cpu bitmask for 
qsmask and it forces rcutree to store the "done" information in a global 
structure. Additionally, in the worst case force_quiescent_state() must 
loop over all cpus.
rcustate can use per-cpu structures and a global atomic_t. There is no 
loop over all cpus. That's a big advantage, thus I think it's worth the 
effort to maintain an accurate list.
Unfortunately, I don't have an efficient implementation for the accurate 
list.

Some random ideas:
- cpu_total is only read rarely. Thus it would be ok if the read 
operation is expensive [e.g. collect data from multiple cachelines, 
acquire spinlocks...]
- updates to cpu_total happen with every interrupt on an idle system 
with no_hz.
    Thus it must be very scalable, preferably per-cpu data.
    And: Updates are far more frequent than grace periods.
- updates to cpu_total happen nearly never without no_hz.
   Especially: far less frequent than grace periods.

What about adding an "invalid" flag to cpu_total? The "real" data is 
stored in per-cpu structures.
- when a cpu enters/leaves nohz, then it invalidates the global 
cpu_total and updates a per-cpu structure
- when the state machine needs the number of rcu-tracked cpus, then it 
checks if the global cpu_total is valid.
If it's valid, then cpu_total is used directly. Otherwise the per-cpu 
structures are enumerated and the new value is stored as cpu_total.

What do you think?

--
    Manfred

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v4 scalable classic RCU implementation
  2008-09-21 11:09               ` Manfred Spraul
@ 2008-09-21 21:14                 ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-21 21:14 UTC (permalink / raw)
  To: Manfred Spraul; +Cc: linux-kernel

On Sun, Sep 21, 2008 at 01:09:51PM +0200, Manfred Spraul wrote:
> Hi Paul,
>
> Some further thoughts about design differences between your and my 
> implementation:
>
> - rcutree's qsmaskinit  is the worst-case list of cpus that could be in rcu 
> read side critical sections.
> - rcustate's cpu_total is the accurate list of cpus that could be in rcu 
> read side critical sections.
>
> Both variables are read rarely: for rcu_state, twice per grace period.
>
> rcutree fixes up cpus that are "incorrectly" listed in qsmaskinit with 
> force_quiescent_state(). It forces rcutree to use a cpu bitmask for qsmask 
> and it forces rcutree to store the "done" information in a global 
> structure. Additionately, in the worst case force_quiescent_state() must 
> loop over all cpus.
> rcustate can use per-cpu structures and a global atomic_t. There is no loop 
> over all cpus. That's a big advantage, thus I think it's worth the effort 
> to maintain an accurate list.
> Unfortunately, I don't have an efficient implementation for the accurate 
> list.

That has been one of my biggest questions about your approach, that and
the need to hit holdouts with resched IPI.  (Though perhaps you have
worked out another way around this.)

> Some random ideas:
> - cpu_total is only read rarely. Thus it would be ok if the read operation 
> is expensive [e.g. collect data from multiple cachelines, acquire 
> spinlocks...]

Agreed.  I could use this approach as well, having each CPU set and clear
its qsmaskinit bit on every exit from and entry to dynticks idle state,
but see below...

In your case, you would need to carefully keep state so that a CPU
entering dynticks idle mode would know whether or not it needed to
respond to the current grace period -- but you need this in any case,
so no added complexity as far as I can see.

> - updates to cpu_total happen with every interrupt on an idle system with 
> no_hz.
>    Thus it must be very scalable, preferably per-cpu data.
>    And: Updates are far more frequent than grace periods.

Yep!  Hence my reluctance to add overhead to the dynticks side of the
algorithm.

> - updates to cpu_total happen nearly never without no_hz.
>   Especially: far less frequent than grace periods.

Indeed, this is the easy case for both of our approaches.

> What about adding an "invalid" flag to cpu_total? The "real" data is stored 
> in per-cpu structures.
> - when a cpu enters/leaves nohz, then it invalidates the global cpu_total 
> and updates a per-cpu structure
> - when the state machine needs the number of rcu-tracked cpus, then it 
> checks if the global cpu_total is valid.
> If it's valid, then cpu_total is used directly. Otherwise the per-cpu 
> structures are enumerated and the new value is stored as cpu_total.
>
> What do you think?

I use an analogous algorithm, as the qsmaskinit values might change while
setting up for the next quiescent state.  The tough part of this was
correctly handling races between setting up for the quiescent state and
onlining/offlining CPUs (and, in your case, CPUs entering/leaving dynticks
idle mode).  I chose to use a global lock that excludes online/offline
and starting a quiescent state (except in the case where there are so
few CPUs that there is but one rcu_node structure, in which case that
structure's lock suffices, so that onofflock need not be acquired).

But although acquiring a global lock is reasonable for CPU online/offline
(as there can be but one such operation at a time), it would be quite
painful for the dynticks case.

Of course, I might be able to avoid the need for this global lock if I
were willing to acquire the interior-node rcu_node locks when setting
up for the next grace period.  But this would require me to put in a
cleanup step after grace-period setup, as some set of dyntick operations
might otherwise end the grace period before it is fully initialized.
This could potentially result in two different CPUs setting up grace
periods concurrently (yuck!).  Worse yet, the grace period could end
while the initialization was only halfway through the leaves, so that
the CPU doing the initialization would need to recognize this and stop
further initialization -- and, again, clean up (yuck^2!).

Your two-phase approach might (or might not) avoid this issue.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [PATCH, RFC] v6 scalable classic RCU implementation
  2008-09-15 16:02       ` [PATCH, RFC] v4 scalable classic RCU implementation Paul E. McKenney
  2008-09-16 16:52         ` Manfred Spraul
@ 2008-09-23 23:53         ` Paul E. McKenney
  2008-09-25  7:26           ` Ingo Molnar
                             ` (2 more replies)
  1 sibling, 3 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-23 23:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt, peterz, penberg, andi, tglx

[-- Attachment #1: Type: text/plain, Size: 100868 bytes --]

Hello!

This patch fixes a long-standing performance bug in classic RCU that
results in massive lock contention on the internal RCU lock on systems
with more than a few hundred CPUs.  Although this patch creates a
separate flavor of RCU for ease of review and patch maintenance, it
is intended to replace classic RCU.

Still experimental, not for inclusion, but given that I am now finding
more bugs in the rest of Linux than in this code, I suspect that it
is getting close.  Definitely ready for serious experimental use,
especially in !CONFIG_NO_HZ configurations.  In particular, experience
on an actual 1000+ CPU machine would be most welcome, and now appears
to be forthcoming!

Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line):

o	Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a
	changeset some time ago, and finally got around to retesting
	this option).

o	Fix some tracing bugs in rcupreempt that caused incorrect
	totals to be printed.

o	I now test with a more brutal random-selection online/offline
	script (attached).  Probably more brutal than it needs to be
	on the people reading it as well, but so it goes.

o	A number of optimizations and usability improvements:

	o	Make rcu_pending() ignore the grace-period timeout when
		there is no grace period in progress.

	o	Make force_quiescent_state() avoid going for a global
		lock in the case where there is no grace period in
		progress.

	o	Rearrange struct fields to improve struct layout.

	o	Make call_rcu() initiate a grace period if RCU was
		idle, rather than waiting for the next scheduling
		clock interrupt.

	o	Invoke rcu_irq_enter() and rcu_irq_exit() only when
		idle, as suggested by Andi Kleen.  I still don't
		completely trust this change, and might back it out.

	o	Make CONFIG_RCU_TRACE be the single config variable
		manipulated for all forms of RCU, instead of the prior
		confusion.

	o	Document tracing files and formats for both rcupreempt
		and rcutree.

Updates from v4 for those missing v5 given its bad subject line:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a
hierarchy, greatly reducing the contention on the top-level lock
for large machines.  This passes 10-hour concurrent rcutorture and
online-offline testing on 128-CPU ppc64 without dynticks enabled,
and exposes some timekeeping bugs in presence of dynticks (exciting
working on a system where "sleep 1" hangs until interrupted...).
It is OK for experimental work, but not yet ready for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	There is a bit of debug code in place.  This will be removed.

o	There is still a little strangeness in timekeeping when running
	a heavy dyntick and CPU hotplug workload concurrently with
	rcutorture, but there no longer seem to be cases of the jiffies
	counter deciding to stop counting (thank you, Thomas!!!).
	But I can make -that- happen -without- this patch.

o	There are probably hangs, rcutorture failures, &c.  Seems
	reasonably stable on a 128-CPU machine, but that is kind of
	small compared to 4096.

o	There is not yet a human-readable design document.  One is now
	in the works (finally!).

Credits:

o	Manfred Spraul for ideas, review comments, and bugs spotted,
	as well as some good friendly competition.  ;-)

o	Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Lai Jiangshan, Andi Kleen, and Andrew Morton for reviews and
	comments.

o	Thomas Gleixner for much-needed help with some timer issues
	(see patches below).

To build, start with 2.6.27-rc7, and apply:

	http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-treeRCU-20.patch
	http://tglx.de/~tglx/gack.patch
	http://tglx.de/~tglx/clockevents-keep-tick-next-period-up-to-date.patch

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 Documentation/RCU/00-INDEX             |    2 
 Documentation/RCU/trace.txt            |  389 ++++++++
 arch/powerpc/platforms/pseries/rtasd.c |    4 
 include/linux/hardirq.h                |   14 
 include/linux/rcupdate.h               |   10 
 include/linux/rcutree.h                |  320 +++++++
 init/Kconfig                           |   18 
 kernel/Kconfig.preempt                 |   62 +
 kernel/Makefile                        |    6 
 kernel/rcupreempt.c                    |   10 
 kernel/rcupreempt_trace.c              |   10 
 kernel/rcutree.c                       | 1502 +++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c                 |  235 +++++
 kernel/softirq.c                       |   15 
 lib/Kconfig.debug                      |   13 
 15 files changed, 2576 insertions(+), 34 deletions(-)

diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 461481d..7dc0695 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -16,6 +16,8 @@ RTFP.txt
 	- List of RCU papers (bibliography) going back to 1980.
 torture.txt
 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+trace.txt
+	- CONFIG_RCU_TRACE debugfs files and formats
 UP.txt
 	- RCU on Uniprocessor Systems
 whatisRCU.txt
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
new file mode 100644
index 0000000..1357b97
--- /dev/null
+++ b/Documentation/RCU/trace.txt
@@ -0,0 +1,389 @@
+CONFIG_RCU_TRACE debugfs Files and Formats
+
+
+The rcupreempt and rcutree implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+Note that the rcuclassic implementation of RCU does not provide debugfs
+trace output.
+
+The following sections describe the debugfs files and formats for
+preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
+
+
+Preemptable RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
+counters used by preemptable RCU) rcu/rcugp (which displays grace-period
+counters), and rcu/rcustats (which displays internal counters for debugging RCU).
+
+The output of "cat rcu/rcuctrs" looks as follows:
+
+CPU last cur F M
+  0    5  -5 0 0
+  1   -1   0 0 0
+  2    0   1 0 0
+  3    0   1 0 0
+  4    0   1 0 0
+  5    0   1 0 0
+  6    0   2 0 0
+  7    0  -1 0 0
+  8    0   1 0 0
+ggp = 26226, state = waitzero
+
+The per-CPU fields are as follows:
+
+o	"CPU" gives the CPU number.  Offline CPUs are not displayed.
+
+o	"last" gives the value of the counter that is being decremented
+	for the current grace period phase.  In the example above,
+	the counters sum to 4, indicating that there are still four
+	RCU read-side critical sections still running that started
+	before the last counter flip.
+
+o	"cur" gives the value of the counter that is currently being
+	both incremented (by rcu_read_lock()) and decremented (by
+	rcu_read_unlock()).  In the example above, the counters sum to
+	1, indicating that there is only one RCU read-side critical section
+	still running that started after the last counter flip.
+
+o	"F" indicates whether RCU is waiting for this CPU to acknowledge
+	a counter flip.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitack".
+
+o	"M" indicates whether RCU is waiting for this CPU to execute a
+	memory barrier.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitmb".
+
+o	"ggp" is the global grace-period counter.
+
+o	"state" is the RCU state, which can be one of the following:
+
+	o	"idle": there is no grace period in progress.
+
+	o	"waitack": RCU just incremented the global grace-period
+		counter, which has the effect of reversing the roles of
+		the "last" and "cur" counters above, and is waiting for
+		all the CPUs to acknowledge the flip.  Once the flip has
+		been acknowledged, CPUs will no longer be incrementing
+		what are now the "last" counters, so that their sum will
+		decrease monotonically down to zero.
+
+	o	"waitzero": RCU is waiting for the sum of the "last" counters
+		to decrease to zero.
+
+	o	"waitmb": RCU is waiting for each CPU to execute a memory
+		barrier, which ensures that instructions from a given CPU's
+		last RCU read-side critical section cannot be reordered
+		with instructions following the memory-barrier instruction.
+
+The output of "cat rcu/rcugp" looks as follows:
+
+oldggp=48870  newggp=48873
+
+Note that reading from this file provokes a synchronize_rcu().  The
+"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
+executing the synchronize_rcu(), and the "newggp" value is also the
+"ggp" value, but taken after the synchronize_rcu() command returns.
+
+
+The output of "cat rcu/rcustats" looks as follows:
+
+na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
+1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
+z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
+
+These are counters tracking internal preemptable-RCU events, however,
+some of them may be useful for debugging algorithms using RCU.  In
+particular, the "nl", "wl", and "dl" values track the number of RCU
+callbacks in various states.  The fields are as follows:
+
+o	"na" is the total number of RCU callbacks that have been enqueued
+	since boot.
+
+o	"nl" is the number of RCU callbacks waiting for the previous
+	grace period to end so that they can start waiting on the next
+	grace period.
+
+o	"wa" is the total number of RCU callbacks that have started waiting
+	for a grace period since boot.  "na" should be roughly equal to
+	"nl" plus "wa".
+
+o	"wl" is the number of RCU callbacks currently waiting for their
+	grace period to end.
+
+o	"da" is the total number of RCU callbacks whose grace periods
+	have completed since boot.  "wa" should be roughly equal to
+	"wl" plus "da".
+
+o	"di" is the total number of RCU callbacks that have been invoked
+	since boot.  "di" should be roughly equal to "da", though some
+	early versions of preemptable RCU had a bug so that only the
+	last CPU's count of invocations was displayed, rather than the
+	sum of all CPU's counts.
+
+o	"1" is the number of calls to rcu_try_flip().
+
+o	"e1" is the number of times that rcu_try_flip() was unable to
+	acquire the fliplock.
+
+o	"i1" is the number of calls to rcu_try_flip_idle().
+
+o	"ie1" is the number of times rcu_try_flip_idle() exited early
+	due to the calling CPU having no work for RCU.
+
+o	"g1" is the number of times that rcu_try_flip_idle() decided
+	to start a new grace period.  "i1" should be roughly equal to
+	"ie1" plus "g1".
+
+o	"a1" is the number of calls to rcu_try_flip_waitack().
+
+o	"ae1" is the number of times that rcu_try_flip_waitack() found
+	that at least one CPU had not yet acknowledged the new grace period
+	(AKA "counter flip").
+
+o	"a2" is the number of times rcu_try_flip_waitack() found that
+	all CPUs had acknowledged.  "a1" should be roughly equal to
+	"ae1" plus "a2".  (This particular output was collected on
+	a 128-CPU machine, hence the smaller-than-usual fraction of
+	calls to rcu_try_flip_waitack() finding all CPUs having already
+	acknowledged.)
+
+o	"z1" is the number of calls to rcu_try_flip_waitzero().
+
+o	"ze1" is the number of times that rcu_try_flip_waitzero() found
+	that not all of the old RCU read-side critical sections had
+	completed.
+
+o	"z2" is the number of times that rcu_try_flip_waitzero() finds
+	the sum of the counters equal to zero, in other words, that
+	all of the old RCU read-side critical sections had completed.
+	The value of "z1" should be roughly equal to "ze1" plus
+	"z2".
+
+o	"m1" is the number of calls to rcu_try_flip_waitmb().
+
+o	"me1" is the number of times that rcu_try_flip_waitmb() finds
+	that at least one CPU has not yet executed a memory barrier.
+
+o	"m2" is the number of times that rcu_try_flip_waitmb() finds that
+	all CPUs have executed a memory barrier.
+
+
+Hierarchical RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcudata (which displays fields in struct
+rcu_data), rcu/rcugp (which displays grace-period counters), and
+rcu/rcuhier (which displays the struct rcu_node hierarchy).
+
+The output of "cat rcu/rcudata" looks as follows:
+
+rcu:
+  0 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=26097 dn=2 df=9102 of=0 ri=11 ql=2 b=10
+  1 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=30421 dn=2 df=6608 of=0 ri=2 ql=39 b=10
+  2 c=1982 g=1982 pq=1 pqc=1982 qp=0 dt=10934 dn=2 df=9612 of=0 ri=0 ql=0 b=10
+  3 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=30139 dn=2 df=6043 of=0 ri=0 ql=58 b=10
+  4 c=1960 g=1960 pq=1 pqc=1960 qp=1 dt=1202 dn=2 df=30470 of=0 ri=3 ql=0 b=10
+  5 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=15341 dn=2 df=5350 of=0 ri=0 ql=25 b=10
+  6 c=1983 g=1984 pq=1 pqc=1983 qp=1 dt=516 dn=2 df=31950 of=0 ri=0 ql=0 b=10
+  7 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=8205 dn=2 df=7465 of=0 ri=0 ql=28 b=10
+rcu_bh:
+  0 c=375 g=375 pq=1 pqc=375 qp=0 dt=26097 dn=2 df=0 of=0 ri=0 ql=0 b=10
+  1 c=375 g=375 pq=1 pqc=375 qp=0 dt=30421 dn=2 df=162 of=0 ri=0 ql=0 b=10
+  2 c=375 g=375 pq=1 pqc=375 qp=1 dt=10934 dn=2 df=162 of=0 ri=0 ql=0 b=10
+  3 c=375 g=375 pq=1 pqc=375 qp=0 dt=30139 dn=2 df=107 of=0 ri=0 ql=0 b=10
+  4 c=375 g=375 pq=1 pqc=375 qp=1 dt=1202 dn=2 df=174 of=0 ri=0 ql=0 b=10
+  5 c=375 g=375 pq=1 pqc=375 qp=0 dt=15341 dn=2 df=122 of=0 ri=0 ql=0 b=10
+  6 c=375 g=375 pq=1 pqc=375 qp=1 dt=516 dn=2 df=117 of=0 ri=0 ql=0 b=10
+  7 c=375 g=375 pq=1 pqc=375 qp=0 dt=8205 dn=2 df=127 of=0 ri=0 ql=0 b=10
+
+The first section lists the rcu_data structures for rcu, the second for
+rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
+The fields are as follows:
+
+o	The number at the beginning of each line is the CPU number.
+	CPU numbers followed by an exclamation mark are offline,
+	but have been online at least once since boot.	There will be
+	no output for CPUs that have never been online, which can be
+	a good thing in the surprisingly common case where NR_CPUS is
+	substantially larger than the number of actual CPUs.
+
+o	"c" is the count of grace periods that this CPU believes have
+	completed.  CPUs in dynticks idle mode may lag quite a ways
+	behind, for example, CPU 4 under "rcu" above, which has slept
+	through the past 25 RCU grace periods.	It is not unusual to
+	see CPUs lagging by thousands of grace periods.
+
+o	"g" is the count of grace periods that this CPU believes have
+	started.  Again, CPUs in dynticks idle mode may lag behind.
+	If the "c" and "g" values are equal, this CPU has already
+	reported a quiescent state for the last RCU grace period that
+	it is aware of, otherwise, the CPU believes that it owes RCU a
+	quiescent state.
+
+o	"pq" indicates that this CPU has passed through a quiescent state
+	for the current grace period.  It is possible for "pq" to be
+	"1" and "c" different than "g", which indicates that although
+	the CPU has passed through a quiescent state, either (1) this
+	CPU has not yet reported that fact, (2) some other CPU has not
+	yet reported for this grace period, or (3) both.
+
+o	"pqc" indicates which grace period the last-observed quiescent
+	state for this CPU corresponds to.  This is important for handling
+	the race between CPU 0 reporting an extended dynticks-idle
+	quiescent state for CPU 1 and CPU 1 suddenly waking up and
+	reporting its own quiescent state.  If CPU 1 was the last CPU
+	for the current grace period, then the CPU that loses this race
+	will attempt to incorrectly mark CPU 1 as having checked in for
+	the next grace period!
+
+o	"qp" indicates that RCU still expects a quiescent state from
+	this CPU.
+
+o	"dt" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state, either by the
+	scheduler or by irq.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"dn" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state via NMI.  If both
+	the "dt" and "dn" values are even, then this CPU is in dynticks
+	idle mode and may be ignored by RCU.  If either of these two
+	counters is odd, then RCU must be alert to the possibility of
+	an RCU read-side critical section running on this CPU.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"df" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being in
+	dynticks-idle state.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"of" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being
+	offline.  In a perfect world, this might never happen, but it
+	turns out that offlining and onlining a CPU can take several grace
+	periods, and so there is likely to be an extended period of time
+	when RCU believes that the CPU is online when it really is not.
+	Please note that erring in the other direction (RCU believing a
+	CPU is offline when it is really alive and kicking) is a fatal
+	error, so it makes sense to err conservatively.
+
+o	"ri" is the number of times that RCU has seen fit to send a
+	reschedule IPI to this CPU in order to get it to report a
+	quiescent state.
+
+o	"ql" is the number of RCU callbacks currently residing on
+	this CPU.  This is the total number of callbacks, regardless
+	of what state they are in (new, waiting for grace period to
+	start, waiting for grace period to end, ready to invoke).
+
+o	"b" is the batch limit for this CPU.  If more than this number
+	of RCU callbacks is ready to invoke, then the remainder will
+	be deferred.
+
+
+The output of "cat rcu/rcugp" looks as follows:
+
+rcu: completed=33062  gpnum=33063
+rcu_bh: completed=464  gpnum=464
+
+Again, this output is for both "rcu" and "rcu_bh".  The fields are
+taken from the rcu_state structure, and are as follows:
+
+o	"completed" is the number of grace periods that have completed.
+	It is comparable to the "c" field from rcu/rcudata in that a
+	CPU whose "c" field matches the value of "completed" is aware
+	that the corresponding RCU grace period has completed.
+
+o	"gpnum" is the number of grace periods that have started.  It is
+	comparable to the "g" field from rcu/rcudata in that a CPU
+	whose "g" field matches the value of "gpnum" is aware that the
+	corresponding RCU grace period has started.
+
+	If these two fields are equal (as they are for "rcu_bh" above),
+	then there is no grace period in progress, in other words, RCU
+	is idle.  On the other hand, if the two fields differ (as they
+	do for "rcu" above), then an RCU grace period is in progress.
+
+
+The output of "cat rcu/rcuhier" looks as follows, with very long lines:
+
+rcu:
+c=33184 g=33185 s=0 jfq=1 nfqs=61601/nfqsng=28011(33590)
+1/1 0:127 ^0    
+1/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+14/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+rcu_bh:
+c=470 g=470 s=0 jfq=2 nfqs=62302/nfqsng=62027(275)
+0/1 0:127 ^0    
+0/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+0/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+
+This is once again split into "rcu" and "rcu_bh" portions.  The fields are
+as follows:
+
+o	"c" is exactly the same as "completed" under rcu/rcugp.
+
+o	"g" is exactly the same as "gpnum" under rcu/rcugp.
+
+o	"s" is the "signaled" state that drives force_quiescent_state()'s
+	state machine.
+
+o	"jfq" is the number of jiffies remaining for this grace period
+	before force_quiescent_state() is invoked to help push things
+	along.  Note that CPUs in dyntick-idle mode throughout the grace
+	period will not report on their own, but rather must be checked by
+	some other CPU via force_quiescent_state().
+
+o	"nfqs" is the number of calls to force_quiescent_state() since
+	boot.
+
+o	"nfqsng" is the number of useless calls to force_quiescent_state(),
+	where there wasn't actually a grace period active.  This can
+	happen due to races.  The number in parentheses is the difference
+	between "nfqs" and "nfqsng", or the number of times that
+	force_quiescent_state() actually did some real work.
+
+o	Each element of the form "1/1 0:127 ^0" represents one struct
+	rcu_node.  Each line represents one level of the hierarchy, from
+	root to leaves.  It is best to think of the rcu_data structures
+	as forming yet another level after the leaves.  Note that there
+	might be either one, two, or three levels of rcu_node structures,
+	depending on the relationship between CONFIG_RCU_FANOUT and
+	CONFIG_NR_CPUS.
+	
+	o	The numbers separated by the "/" are the qsmask followed
+		by the qsmaskinit.  The qsmask will have one bit
+		set for each entity in the next lower level that
+		has not yet checked in for the current grace period.
+		The qsmaskinit will have one bit for each entity that is
+		currently expected to check in during each grace period.
+		The value of qsmaskinit is assigned to that of qsmask
+		at the beginning of each grace period.
+
+		For example, for "rcu", the qsmask of the first entry
+		of the lowest level is 0x14, meaning that we are still
+		waiting for CPUs 2 and 4 to check in for the current
+		grace period.
+
+	o	The numbers separated by the ":" are the range of CPUs
+		served by this struct rcu_node.  This can be helpful
+		in working out how the hierarchy is wired together.
+
+		For example, the first entry at the lowest level shows
+		"0:5", indicating that it covers CPUs 0 through 5.
+
+	o	The number after the "^" indicates the bit in the
+		next higher level rcu_node structure that this
+		rcu_node structure corresponds to.
+
+		For example, the first entry at the lowest level shows
+		"^0", indicating that it corresponds to bit zero in
+		the first entry at the middle level.
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index c9ffd8c..d8e784a 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 	/* Check to see if we need to or have stopped logging */
 	if (fatal || !logging_enabled) {
 		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		else
 			rtas_log_start += 1;
 
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		wake_up_interruptible(&rtas_log_wait);
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..9b70b92 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
+extern void rcu_nmi_enter(void);
+extern void rcu_nmi_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+# define rcu_nmi_enter() do { } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
@@ -134,7 +138,6 @@ extern void rcu_irq_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -153,7 +156,6 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
-		rcu_irq_exit();				\
 	} while (0)
 
 /*
@@ -161,7 +163,7 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()		do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
+#define nmi_exit()		do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e8b4039..f8544ae 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,15 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
-#ifdef CONFIG_CLASSIC_RCU
+#if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
-#else /* #ifdef CONFIG_CLASSIC_RCU */
+#elif defined(CONFIG_TREE_RCU)
+#include <linux/rcutree.h>
+#elif defined(CONFIG_PREEMPT_RCU)
 #include <linux/rcupreempt.h>
-#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+#else
+#error "Unknown RCU implementation specified to kernel configuration"
+#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
 
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 0000000..7f8e70b
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,320 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+
+#ifndef __LINUX_RCUTREE_H
+#define __LINUX_RCUTREE_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if (NR_CPUS) <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	int dynticks_nesting;	/* Track nesting level, sort of. */
+	int dynticks;		/* Even value for dynticks-idle, else odd. */
+	int dynticks_nmi;	/* Even value for either dynticks-idle or */
+				/*  not in nmi handler, else odd.  So this */
+				/*  remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries that batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+	/* 3) rcu-barrier functions */
+	struct rcu_head barrier;
+
+#ifdef CONFIG_NO_HZ
+	/* 4) dynticks interface (see http://lwn.net/Articles/279077/) */
+	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 5) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_SAVE_DYNTICK	0	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		1	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK	 3	/* for rsp->seconds_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	30	/* for rsp->seconds_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* Force QS state. */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long seconds_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+static inline void __rcu_read_lock(void)
+{
+	preempt_disable();
+	__acquire(RCU);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock(void)
+{
+	rcu_read_release();
+	__release(RCU);
+	preempt_enable();
+}
+static inline void __rcu_read_lock_bh(void)
+{
+	local_bh_disable();
+	__acquire(RCU_BH);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock_bh(void)
+{
+	rcu_read_release();
+	__release(RCU_BH);
+	local_bh_enable();
+}
+
+#define __synchronize_sched() synchronize_rcu()
+
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
+static inline void rcu_init_sched(void)
+{
+}
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+#ifdef CONFIG_NO_HZ
+void rcu_enter_nohz(void);
+void rcu_exit_nohz(void);
+#else /* CONFIG_NO_HZ */
+static inline void rcu_enter_nohz(void)
+{
+}
+static inline void rcu_exit_nohz(void)
+{
+}
+#endif /* CONFIG_NO_HZ */
+
+#endif /* __LINUX_RCUTREE_H */
diff --git a/init/Kconfig b/init/Kconfig
index b678803..6fdca78 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -914,10 +914,16 @@ source "block/Kconfig"
 config PREEMPT_NOTIFIERS
 	bool
 
-config CLASSIC_RCU
-	def_bool !PREEMPT_RCU
+config TREE_RCU_TRACE
+	def_bool RCU_TRACE && TREE_RCU
+	select DEBUG_FS
 	help
-	  This option selects the classic RCU implementation that is
-	  designed for best read-side performance on non-realtime
-	  systems.  Classic RCU is the default.  Note that the
-	  PREEMPT_RCU symbol is used to select/deselect this option.
+	  This option provides tracing for the TREE_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcutree_trace.c.
+
+config PREEMPT_RCU_TRACE
+	def_bool RCU_TRACE && PREEMPT_RCU
+	select DEBUG_FS
+	help
+	  This option provides tracing for the PREEMPT_RCU implementation,
+	  permitting Makefile to trivially select kernel/rcupreempt_trace.c.
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 9fdba03..463f297 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,10 +52,29 @@ config PREEMPT
 
 endchoice
 
+choice
+	prompt "RCU Implementation"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+	  
+	  Select this option if you are unsure.
+
+config TREE_RCU
+	bool "Tree-based hierarchical RCU"
+	help
+	  This option selects the RCU implementation that is
+	  designed for very large SMP systems with hundreds or
+	  thousands of CPUs.
+
 config PREEMPT_RCU
 	bool "Preemptible RCU"
 	depends on PREEMPT
-	default n
 	help
 	  This option reduces the latency of the kernel by making certain
 	  RCU sections preemptible. Normally RCU code is non-preemptible, if
@@ -64,16 +83,47 @@ config PREEMPT_RCU
 	  now-naive assumptions about each RCU read-side critical section
 	  remaining on a given CPU through its execution.
 
-	  Say N if you are unsure.
+endchoice
 
 config RCU_TRACE
-	bool "Enable tracing for RCU - currently stats in debugfs"
-	depends on PREEMPT_RCU
-	select DEBUG_FS
-	default y
+	bool "Enable tracing for RCU"
+	depends on TREE_RCU || PREEMPT_RCU
 	help
 	  This option provides tracing in RCU which presents stats
 	  in debugfs for debugging RCU implementation.
 
 	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
+
+config RCU_FANOUT
+	int "Tree-based hierarchical RCU fanout value"
+	range 2 64 if 64BIT
+	range 2 32 if !64BIT
+	depends on TREE_RCU
+	default 64 if 64BIT
+	default 32 if !64BIT
+	help
+	  This option controls the fanout of hierarchical implementations
+	  of RCU, allowing RCU to work efficiently on machines with
+	  large numbers of CPUs.  This value must be at least the cube
+	  root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+	  systems and up to 262,144 for 64-bit systems.
+
+	  Select a specific number if testing RCU itself.
+	  Take the default if unsure.
+
+config RCU_FANOUT_EXACT
+	bool "Disable tree-based hierarchical RCU auto-balancing"
+	depends on TREE_RCU
+	default n
+	help
+	  This option forces use of the exact RCU_FANOUT value specified,
+	  regardless of imbalances in the hierarchy.  This is useful for
+	  testing RCU itself, and might one day be useful on systems with
+	  strong NUMA behavior.
+
+	  Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+
+	  Say n if unsure.
+
+	
diff --git a/kernel/Makefile b/kernel/Makefile
index 4e1d7df..101e880 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,10 +74,10 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_TREE_RCU) += rcutree.o
 obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
-ifeq ($(CONFIG_PREEMPT_RCU),y)
-obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
-endif
+obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 2782793..6bc8489 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -559,6 +559,16 @@ void rcu_irq_exit(void)
 	}
 }
 
+void rcu_nmi_enter(void)
+{
+	rcu_irq_enter();
+}
+
+void rcu_nmi_exit(void)
+{
+	rcu_irq_exit();
+}
+
 static void dyntick_save_progress_counter(int cpu)
 {
 	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c..def42e8 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -149,12 +149,12 @@ static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
 		sp->done_length += cp->done_length;
 		sp->done_add += cp->done_add;
 		sp->done_remove += cp->done_remove;
-		atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
+		atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
 		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
-		atomic_set(&sp->rcu_try_flip_1,
-			   atomic_read(&cp->rcu_try_flip_1));
-		atomic_set(&sp->rcu_try_flip_e1,
-			   atomic_read(&cp->rcu_try_flip_e1));
+		atomic_add(atomic_read(&cp->rcu_try_flip_1),
+			   &sp->rcu_try_flip_1);
+		atomic_add(atomic_read(&cp->rcu_try_flip_e1),
+			   &sp->rcu_try_flip_e1);
 		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
 		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
 		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
new file mode 100644
index 0000000..ed4e1f0
--- /dev/null
+++ b/kernel/rcutree.c
@@ -0,0 +1,1502 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/time.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
+/* Data structures. */
+
+#define RCU_STATE_INITIALIZER(name) { \
+	.level = { &name.node[0] }, \
+	.levelcnt = { \
+		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
+		NUM_RCU_LVL_1, \
+		NUM_RCU_LVL_2, \
+		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
+	}, \
+	.signaled = RCU_SIGNAL_INIT, \
+	.gpnum = -300, \
+	.completed = -300, \
+	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
+	.n_force_qs = 0, \
+	.n_force_qs_ngp = 0, \
+}
+
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+#ifdef CONFIG_NO_HZ
+DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks);
+#endif /* #ifdef CONFIG_NO_HZ */
+
+static int blimit = 10;		/* Maximum callbacks per softirq. */
+static int qhimark = 10000;	/* If this many pending, ignore blimit. */
+static int qlowmark = 100;	/* Once only this many pending, use blimit. */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
+
+/*
+ * Return the number of RCU batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of RCU BH batches processed thus far for debug & stats.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_state.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Does the CPU have callbacks ready to be invoked?
+ */
+static int
+cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
+{
+	return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL];
+}
+
+/*
+ * Does the current CPU require an as-yet-unscheduled grace period?
+ */
+static int
+cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* ACCESS_ONCE() because we are accessing outside of lock. */
+	return *rdp->nxttail[RCU_DONE_TAIL] &&
+	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+}
+
+/*
+ * Return the root node of the specified rcu_state structure.
+ */
+static struct rcu_node *rcu_get_root(struct rcu_state *rsp)
+{
+	return &rsp->node[0];
+}
+
+/*
+ * If the specified CPU is offline, tell the caller that it is in
+ * a quiescent state.  Otherwise, whack it with a reschedule IPI.
+ * Grace periods can end up waiting on an offline CPU when that
+ * CPU is in the process of coming online -- it will be added to the
+ * rcu_node bitmasks before it actually makes it online.  Because this
+ * race is quite rare, we check for it after detecting that the grace
+ * period has been delayed rather than checking each and every CPU
+ * each and every time we start a new grace period.
+ */
+static int rcu_implicit_offline_qs(struct rcu_data *rdp)
+{
+	/*
+	 * If the CPU is offline, it is in a quiescent state.  We can
+	 * trust its state not to change because interrupts are disabled.
+	 */
+	if (cpu_is_offline(rdp->cpu)) {
+		rdp->offline_fqs++;
+		return 1;
+	}
+
+	/* The CPU is online, so send it a reschedule IPI. */
+	if (rdp->cpu != smp_processor_id())
+		smp_send_reschedule(rdp->cpu);
+	else
+		set_need_resched();
+	rdp->resched_ipi++;
+	return 0;
+}
+
+#ifdef CONFIG_NO_HZ
+static DEFINE_RATELIMIT_STATE(rcu_rs, 10 * HZ, 5);
+
+/*
+ * Enter nohz mode, in other words, -leave- the mode in which RCU
+ * read-side critical sections can occur.  (Though RCU read-side
+ * critical sections can occur in irq handlers in nohz mode, a possibility
+ * handled by rcu_irq_enter() and rcu_irq_exit()).
+ */
+void rcu_enter_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_dynticks).dynticks & 0x1, &rcu_rs);
+	local_irq_restore(flags);
+}
+
+/*
+ * Exit nohz mode.
+ */
+void rcu_exit_nohz(void)
+{
+	unsigned long flags;
+	struct rcu_dynticks *rdtp;
+
+	local_irq_save(flags);
+	rdtp = &__get_cpu_var(rcu_dynticks);
+	rdtp->dynticks++;
+	rdtp->dynticks_nesting--;
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dynticks).dynticks & 0x1),
+			  &rcu_rs);
+	local_irq_restore(flags);
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+}
+
+/**
+ * rcu_nmi_enter - Called from NMI
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is active.
+ */
+void rcu_nmi_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks_nmi & 0x1), &rcu_rs);
+}
+
+/**
+ * rcu_nmi_exit - Called from NMI
+ *
+ * If the CPU was idle with dynamic ticks active, and there is no
+ * irq handler running, this updates rdtp->dynticks_nmi to let the
+ * RCU grace-period handling know that the CPU is no longer active.
+ */
+void rcu_nmi_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks & 0x1)
+		return;
+	rdtp->dynticks_nmi++;
+	WARN_ON_RATELIMIT(rdtp->dynticks_nmi & 0x1, &rcu_rs);
+}
+
+/**
+ * rcu_irq_enter - Called from hard irq handlers
+ *
+ * If the CPU was idle with dynamic ticks active, this updates the
+ * rdtp->dynticks to let the RCU handling know that the CPU is active.
+ */
+void rcu_irq_enter(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (rdtp->dynticks_nesting++) {
+		return;
+	}
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(!(rdtp->dynticks & 0x1), &rcu_rs);
+}
+
+/**
+ * rcu_irq_exit - Called when exiting hard irq context.
+ *
+ * If the CPU was idle with dynamic ticks active, update rdtp->dynticks
+ * to let the RCU handling know that the CPU is going back to idle
+ * with no ticks.
+ */
+void rcu_irq_exit(void)
+{
+	struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
+
+	if (--rdtp->dynticks_nesting) {
+		return;
+	}
+	rdtp->dynticks++;
+	WARN_ON_RATELIMIT(rdtp->dynticks & 0x1, &rcu_rs);
+
+	/* If the interrupt queued a callback, get out of dyntick mode. */
+	if (__get_cpu_var(rcu_data).nxtlist ||
+	    __get_cpu_var(rcu_bh_data).nxtlist)
+		set_need_resched();
+}
+
+/*
+ * Record the specified "completed" value, which is later used to validate
+ * dynticks counter manipulations.  It is a long, as is the value returned
+ * by dyntick_recall_completed().  Specify "rsp->completed - 1" to invalidate
+ * any future dynticks manipulations (useful at the start of a grace period).
+ */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+	rsp->dynticks_completed = comp;
+}
+
+/*
+ * Recall the previously recorded value of the completion for dynticks.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->dynticks_completed;
+}
+
+/*
+ * Snapshot the specified CPU's dynticks counter so that we can later
+ * credit them with an implicit quiescent state.  Return 1 if this CPU
+ * is already in a quiescent state courtesy of dynticks idle mode.
+ */
+static int dyntick_save_progress_counter(struct rcu_data *rdp)
+{
+	int ret;
+	int snap;
+	int snap_nmi;
+
+	snap = rdp->dynticks->dynticks;
+	snap_nmi = rdp->dynticks->dynticks_nmi;
+	smp_mb();	/* Order sampling of snap with end of grace period. */
+	rdp->dynticks_snap = snap;
+	rdp->dynticks_nmi_snap = snap_nmi;
+	ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
+	if (ret)
+		rdp->dynticks_fqs++;
+	return ret;
+}
+
+/*
+ * Return true if the specified CPU has passed through a quiescent
+ * state by virtue of being in or having passed through a dynticks
+ * idle state since the last call to dyntick_save_progress_counter()
+ * for this same CPU.
+ */
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	long curr;
+	long curr_nmi;
+	long snap;
+	long snap_nmi;
+
+	curr = rdp->dynticks->dynticks;
+	snap = rdp->dynticks_snap;
+	curr_nmi = rdp->dynticks->dynticks_nmi;
+	snap_nmi = rdp->dynticks_nmi_snap;
+	smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+
+	/*
+	 * If the CPU passed through or entered a dynticks idle phase with
+	 * no active irq/NMI handlers, then we can safely pretend that the CPU
+	 * already acknowledged the request to pass through a quiescent
+	 * state.  Either way, that CPU cannot possibly be in an RCU
+	 * read-side critical section that started before the beginning
+	 * of the current RCU grace period.
+	 */
+	if ((curr != snap || (curr & 0x1) == 0) &&
+	    (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+		rdp->dynticks_fqs++;
+		return 1;
+	}
+
+	/* Go check for the CPU being offline. */
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#else /* #ifdef CONFIG_NO_HZ */
+/* Without dynticks there is nothing to validate, so recording is a no-op. */
+static void dyntick_record_completed(struct rcu_state *rsp, long comp)
+{
+}
+
+/*
+ * If there are no dynticks, then the only way that a CPU can passively
+ * be in a quiescent state is to be offline.  Unlike dynticks idle, which
+ * is a point in time during the prior (already finished) grace period,
+ * an offline CPU is always in a quiescent state, and thus can be
+ * unconditionally applied.  So just return the current value of completed.
+ */
+static long dyntick_recall_completed(struct rcu_state *rsp)
+{
+	return rsp->completed;
+}
+
+static int dyntick_save_progress_counter(struct rcu_data *rdp) { return 0; }
+
+static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
+{
+	return rcu_implicit_offline_qs(rdp);
+}
+
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+	rsp->gp_start = jiffies;
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+/* Report CPUs still blocking the current grace period, then kick them. */
+static void print_other_cpu_stall(struct rcu_state *rsp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	/* Only let one CPU complain about others per time interval. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	delta = get_seconds() - rsp->seconds_stall;
+	if (delta < 2 || rsp->gpnum == rsp->completed) {
+		/* Too soon, or the grace period already ended: no stall. */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	rsp->seconds_stall = get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		if (rnp_cur->qsmask == 0)
+			continue;
+		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
+			if (rnp_cur->qsmask & (1UL << cpu))
+				printk(" %d", rnp_cur->grplo + cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
+	force_quiescent_state(rsp, 0);  /* Kick them all. */
+}
+
+static void print_cpu_stall(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
+			smp_processor_id(), get_seconds(),
+			jiffies - rsp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rnp->lock, flags);
+	if ((long)(get_seconds() - rsp->seconds_stall) >= 0)
+		rsp->seconds_stall =
+			get_seconds() + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rnp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long delta;
+	struct rcu_node *rnp;
+
+	delta = get_seconds() - rsp->seconds_stall;
+	rnp = rdp->mynode;
+	if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rsp);
+
+	} else if (rsp->gpnum != rsp->completed && delta >= 2) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rsp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_state *rsp)
+{
+}
+
+static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * Update CPU-local rcu_data state to record the newly noticed grace period.
+ * This is used both when we started the grace period and when we notice
+ * that someone else started the grace period.
+ */
+static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	rdp->qs_pending = 1;
+	rdp->passed_quiesc = 0;
+	rdp->gpnum = rsp->gpnum;
+}
+
+/*
+ * Did someone else start a new RCU grace period since we last
+ * checked?  Update local state appropriately if so.  Must be called
+ * on the CPU corresponding to rdp.
+ */
+static int
+check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	local_irq_save(flags);
+	if (rdp->gpnum != rsp->gpnum) {
+		note_new_gpnum(rsp, rdp);
+		ret = 1;
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Start a new RCU grace period if warranted, re-initializing the hierarchy
+ * in preparation for detecting the next grace period.  The caller must hold
+ * the root node's ->lock, which is released before return.  Hard irqs must
+ * be disabled, and iflg is the irq state to restore when unlocking.
+ */
+static void
+rcu_start_gp(struct rcu_state *rsp, unsigned long iflg)
+	__releases(rcu_get_root(rsp)->lock)
+{
+	unsigned long flags = iflg;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	struct rcu_node *rnp_cur;
+	struct rcu_node *rnp_end;
+
+	if (!cpu_needs_another_gp(rsp, rdp)) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	/* Advance to a new grace period and initialize state. */
+	rsp->gpnum++;
+	rsp->signaled = RCU_SIGNAL_INIT;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	record_gp_stall_check_time(rsp);
+	dyntick_record_completed(rsp, rsp->completed - 1);
+	note_new_gpnum(rsp, rdp);
+
+	/*
+	 * Because we are first, we know that all our callbacks will
+	 * be covered by this upcoming grace period, even the ones
+	 * that were registered arbitrarily recently.
+	 */
+	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+	/* Special-case the common single-level case. */
+	if (NUM_RCU_NODES == 1) {
+		rnp->qsmask = rnp->qsmaskinit;
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+
+	spin_unlock_irqrestore(&rnp->lock, flags);
+
+
+	/* Exclude any concurrent CPU-hotplug operations. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/*
+	 * Set the quiescent-state-needed bits in all the non-leaf RCU
+	 * nodes for all currently online CPUs.  This operation relies
+	 * on the layout of the hierarchy within the rsp->node[] array.
+	 * Note that other CPUs will access only the leaves of the
+	 * hierarchy, which still indicate that no grace period is in
+	 * progress.  In addition, we have excluded CPU-hotplug operations.
+	 *
+	 * We therefore do not need to hold any locks.  Any required
+	 * memory barriers will be supplied by the locks guarding the
+	 * leaf rcu_nodes in the hierarchy.
+	 */
+
+	rnp_end = rsp->level[NUM_RCU_LVLS - 1];
+	for (rnp_cur = &rsp->node[0]; rnp_cur < rnp_end; rnp_cur++)
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+
+	/*
+	 * Now set up the leaf nodes.  Here we must be careful.  First,
+	 * we need to hold the lock in order to exclude other CPUs, which
+	 * might be contending for the leaf nodes' locks.  Second, as
+	 * soon as we initialize a given leaf node, its CPUs might run
+	 * up the rest of the hierarchy.  We must therefore acquire locks
+	 * for each node that we touch during this stage.  (But we still
+	 * are excluding CPU-hotplug operations.)
+	 *
+	 * Note that the grace period cannot complete until we finish
+	 * the initialization process, as there will be at least one
+	 * qsmask bit set in the root node until that time, namely the
+	 * one corresponding to this CPU.
+	 */
+	rnp_end = &rsp->node[NUM_RCU_NODES];
+	rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		spin_lock(&rnp_cur->lock);	/* irqs already disabled. */
+		rnp_cur->qsmask = rnp_cur->qsmaskinit;
+		spin_unlock(&rnp_cur->lock);	/* irqs already disabled. */
+	}
+
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
+/*
+ * Advance this CPU's callbacks, but only if the current grace period
+ * has ended.  This may be called only from the CPU to whom the rdp
+ * belongs.
+ */
+static void
+rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	long completed_snap;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */
+
+	/* Did another grace period end? */
+	if (rdp->completed != completed_snap) {
+
+		/* Advance callbacks.  No harm if list empty. */
+		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
+		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		/* Remember that we saw this grace-period completion. */
+		rdp->completed = completed_snap;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Similar to cpu_quiet(), for which it is a helper function.  Allows
+ * a group of CPUs to be quieted at one go, though all the CPUs in the
+ * group must be represented by the same leaf rcu_node structure.
+ * That structure's lock must be held upon entry, and it is released
+ * before return.
+ */
+static void
+cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,
+	      unsigned long flags)
+	__releases(rnp->lock)
+{
+	/* Walk up the rcu_node hierarchy. */
+	for (;;) {
+		if (!(rnp->qsmask & mask)) {
+
+			/* Our bit has already been cleared, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		rnp->qsmask &= ~mask;
+		if (rnp->qsmask != 0) {
+
+			/* Other bits still set at this level, so done. */
+			spin_unlock_irqrestore(&rnp->lock, flags);
+			return;
+		}
+		mask = rnp->grpmask;
+		if (rnp->parent == NULL) {
+
+			/* No more levels.  Exit loop holding root lock. */
+
+			break;
+		}
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		rnp = rnp->parent;
+		spin_lock_irqsave(&rnp->lock, flags);
+	}
+
+	/*
+	 * Get here if we are the last CPU to pass through a quiescent
+	 * state for this grace period.  Clean up and let rcu_start_gp()
+	 * start up the next grace period if one is needed.  Note that
+	 * we still hold rnp->lock, as required by rcu_start_gp(), which
+	 * will release it.
+	 */
+	rsp->completed = rsp->gpnum;
+	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
+	rcu_start_gp(rsp, flags);  /* releases rnp->lock. */
+}
+
+/*
+ * Record a quiescent state for the specified CPU, which must either be
+ * the current CPU or an offline CPU.  When invoking this on one's own
+ * behalf, lastcomp is used to make sure we are still in the grace period
+ * of interest.  We don't want to end the current grace period based on
+ * quiescent states detected in an earlier grace period!  On the other hand,
+ * if the CPU being quieted is offline, we can safely pass in lastcomp==NULL,
+ * since an offline CPU is in a quiescent state with respect to any grace
+ * period, unlike pesky online CPUs, which can go non-quiescent with
+ * absolutely no warning.
+ */
+static void
+cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long *lastcomp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp;
+
+	rnp = rdp->mynode;
+	spin_lock_irqsave(&rnp->lock, flags);
+	if (lastcomp != NULL &&
+	    *lastcomp != ACCESS_ONCE(rsp->completed)) {
+
+		/*
+		 * Someone beat us to it for this grace period, so leave.
+		 * The race with GP start is resolved by the fact that we
+		 * hold the leaf rcu_node lock, so that the per-CPU bits
+		 * cannot yet be initialized -- so we would simply find our
+		 * CPU's bit already cleared in cpu_quiet_msk() if this race
+		 * occurred.
+		 */
+		rdp->passed_quiesc = 0;	/* try again later! */
+		spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	mask = rdp->grpmask;
+	if ((rnp->qsmask & mask) == 0) {
+		spin_unlock_irqrestore(&rnp->lock, flags);
+	} else {
+		rdp->qs_pending = 0;
+
+		/*
+		 * This GP can't end until cpu checks in, so all of our
+		 * callbacks can be processed during the next GP.
+		 */
+		rdp = rsp->rda[smp_processor_id()];
+		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+
+		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */
+	}
+}
+
+/*
+ * Check to see if there is a new grace period of which this CPU
+ * is not yet aware, and if so, set up local rcu_data state for it.
+ * Otherwise, see if this CPU has just passed through its first
+ * quiescent state for this grace period, and record that fact if so.
+ */
+static void
+rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* If there is now a new grace period, record and return. */
+	if (check_for_new_grace_period(rsp, rdp))
+		return;
+
+	/*
+	 * Does this CPU still need to do its part for current grace period?
+	 * If no, return and let the other CPUs do their part as well.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+
+	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */
+	cpu_quiet(rdp->cpu, rsp, rdp, &rdp->passed_quiesc_completed);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
+ * and move all callbacks from the outgoing CPU to the current one.
+ */
+static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
+{
+	int i;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_data *rdp_me;
+	struct rcu_node *rnp;
+
+	/* Exclude any attempts to start a new grace period. */
+	spin_lock_irqsave(&rsp->onofflock, flags);
+
+	/* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;	/* rnp->grplo is constant. */
+	do {
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
+		rnp->qsmaskinit &= ~mask;
+		if (rnp->qsmaskinit != 0) {
+			spin_unlock(&rnp->lock); /* irqs already disabled. */
+			break;
+		}
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+						/* @@@ move up to simplify. */
+		rnp = rnp->parent;
+	} while (rnp != NULL);
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/* Being offline is a quiescent state, so go record it. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+
+	/*
+	 * Move callbacks from the outgoing CPU to the running CPU.
+	 * Note that the outgoing CPU is now quiescent, so it is now
+	 * (uncharacteristically) safe to access its rcu_data structure.
+	 * Note also that we must carefully retain the order of the
+	 * outgoing CPU's callbacks in order for rcu_barrier() to work
+	 * correctly.  Finally, note that we start all the callbacks
+	 * afresh, even those that have passed through a grace period
+	 * and are therefore ready to invoke.  The theory is that hotplug
+	 * events are rare, and that if they are frequent enough to
+	 * indefinitely delay callbacks, you have far worse things to
+	 * be worrying about.
+	 */
+	rdp_me = rsp->rda[smp_processor_id()];
+	if (rdp->nxtlist != NULL) {
+		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
+		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+		rdp->nxtlist = NULL;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			rdp->nxttail[i] = &rdp->nxtlist;
+		rdp_me->qlen += rdp->qlen;
+		rdp->qlen = 0;
+	}
+	local_irq_restore(flags);
+}
+
+/*
+ * Remove the specified CPU from the RCU hierarchy and move any pending
+ * callbacks that it might have to the current CPU.  This code assumes
+ * that at least one CPU in the system will remain running at all times.
+ * Any attempt to offline -all- CPUs is likely to strand RCU callbacks.
+ */
+static void rcu_offline_cpu(int cpu)
+{
+	__rcu_offline_cpu(cpu, &rcu_state);
+	__rcu_offline_cpu(cpu, &rcu_bh_state);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * Invoke any RCU callbacks that have made it to the end of their grace
+ * period.  Throttle as specified by rdp->blimit.
+ *
+ * The ready callbacks (everything up to nxttail[RCU_DONE_TAIL]) are
+ * detached from rdp->nxtlist with irqs disabled and then invoked outside
+ * that irq-disabled region.  Callbacks left over once rdp->blimit has
+ * been reached are spliced back onto the front of rdp->nxtlist.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list, **tail;
+	int count;
+
+	/* If no callbacks are ready, just return.*/
+	if (!cpu_has_callbacks_ready_to_invoke(rdp))
+		return;
+
+	/*
+	 * Extract the list of ready callbacks, disabling irqs to prevent
+	 * races with call_rcu() from interrupt handlers.  Every nxttail[]
+	 * entry that pointed at the end of the ready segment is reset to
+	 * point at the (now shorter) list head.
+	 */
+	local_irq_save(flags);
+	list = rdp->nxtlist;
+	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+	*rdp->nxttail[RCU_DONE_TAIL] = NULL;	/* terminate detached list */
+	tail = rdp->nxttail[RCU_DONE_TAIL];	/* saved for requeueing below */
+	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
+		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[count] = &rdp->nxtlist;
+	local_irq_restore(flags);
+
+	/* Invoke callbacks. */
+	count = 0;
+	while (list) {
+		next = list->next;	/* fetched before func() is called */
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+
+	/*
+	 * Update count, and requeue any remaining callbacks.  The leading
+	 * nxttail[] entries that still point at the list head are moved to
+	 * the end of the requeued callbacks.
+	 */
+	local_irq_save(flags);
+	rdp->qlen -= count;
+	if (list != NULL) {
+		*tail = rdp->nxtlist;
+		rdp->nxtlist = list;
+		for (count = 0; count < RCU_NEXT_SIZE; count++)
+			if (&rdp->nxtlist == rdp->nxttail[count])
+				rdp->nxttail[count] = tail;
+			else
+				break;
+	}
+	local_irq_restore(flags);
+
+	/* Reinstate batch limit if we have worked down the excess. */
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;
+
+	/* Re-raise the RCU softirq if there are callbacks remaining. */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Check to see if this CPU is in a non-context-switch quiescent state
+ * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
+ * Also schedule the RCU softirq handler.
+ *
+ * This function must be called with hardirqs disabled.  It is normally
+ * invoked from the scheduling-clock interrupt.  If rcu_pending returns
+ * false, there is no point in invoking rcu_check_callbacks().
+ *
+ * @cpu is the CPU to check and @user is nonzero if the interrupt was
+ * taken from user mode.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+
+		/*
+		 * Get here if this CPU took its interrupt from user
+		 * mode or from the idle loop, and if this is not a
+		 * nested interrupt.  In this case, the CPU is in
+		 * a quiescent state, so count it.
+		 *
+		 * Also do a memory barrier.  This is needed to handle
+		 * the case where writes from a preempt-disable section
+		 * of code get reordered into schedule() by this CPU's
+		 * write buffer.  The memory barrier makes sure that
+		 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
+		 * by other CPUs to happen after any such write.
+		 */
+
+		smp_mb();  /* See above block comment. */
+		rcu_qsctr_inc(cpu);
+		rcu_bh_qsctr_inc(cpu);
+
+	} else if (!in_softirq()) {
+
+		/*
+		 * Get here if this CPU did not take its interrupt from
+		 * softirq, in other words, if it is not interrupting
+		 * a rcu_bh read-side critical section.  This is an _bh
+		 * quiescent state, so count it.  The memory barrier
+		 * is needed for the same reason as is the above one.
+		 */
+
+		smp_mb();  /* See above block comment. */
+		rcu_bh_qsctr_inc(cpu);
+	}
+	raise_softirq(RCU_SOFTIRQ);
+}
+
+#ifdef CONFIG_SMP
+
+/*
+ * Scan the leaf rcu_node structures, processing dyntick state for any that
+ * have not yet encountered a quiescent state, using the function specified.
+ * Returns 1 if the current grace period ends while scanning (possibly
+ * because we made it end).
+ *
+ * @lastcomp is the value of rsp->completed that identifies the grace
+ * period being scanned for.  @f is invoked, with the leaf's lock held,
+ * on the rcu_data of each CPU whose bit is still set in ->qsmask; the
+ * CPUs for which it returns nonzero are passed to cpu_quiet_msk().
+ * Returns 0 if all leaves were scanned.
+ */
+static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
+			       int (*f)(struct rcu_data *))
+{
+	unsigned long bit;
+	int cpu;
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
+	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+
+	for (; rnp_cur < rnp_end; rnp_cur++) {
+		mask = 0;
+		spin_lock_irqsave(&rnp_cur->lock, flags);
+		if (rsp->completed != lastcomp) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			return 1;
+		}
+		if (rnp_cur->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			continue;
+		}
+		cpu = rnp_cur->grplo;
+		bit = 1;	/* bit 0 corresponds to CPU ->grplo */
+		mask = 0;
+		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
+			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+				mask |= bit;
+		}
+		if (mask != 0 && rsp->completed == lastcomp) {
+
+			/* cpu_quiet_msk() releases rnp_cur->lock. */
+			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			continue;
+		}
+		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+	}
+	return 0;
+}
+
+/*
+ * Force quiescent states on reluctant CPUs, and also detect which
+ * CPUs are in dyntick-idle mode.
+ *
+ * Returns without doing anything if no grace period is in progress or
+ * if rsp->fqslock cannot be acquired immediately.  If @relaxed is
+ * nonzero, also returns early when rsp->jiffies_force_qs has not yet
+ * been passed.  Otherwise acts on the value read from rsp->signaled:
+ * RCU_SAVE_DYNTICK records dyntick-idle state and then advances
+ * rsp->signaled to RCU_FORCE_QS, which rechecks that state on each call.
+ */
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{
+	unsigned long flags;
+	long lastcomp;
+	struct rcu_node *rnp = rcu_get_root(rsp);
+	u8 signaled;
+
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
+		return;  /* No grace period in progress, nothing to force. */
+	if (!spin_trylock_irqsave(&rsp->fqslock, flags))
+		return;	/* Someone else is already on the job. */
+	if (relaxed && (long)(rsp->jiffies_force_qs - jiffies) >= 0)
+		goto unlock_ret; /* no emergency and done recently. */
+	rsp->n_force_qs++;
+	spin_lock(&rnp->lock);
+	lastcomp = rsp->completed;
+	signaled = rsp->signaled;
+	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
+	if (rsp->completed == rsp->gpnum) {
+		rsp->n_force_qs_ngp++;
+		spin_unlock(&rnp->lock);
+		goto unlock_ret;  /* no GP in progress, time updated. */
+	}
+	spin_unlock(&rnp->lock);
+	switch (signaled) {
+	case RCU_SAVE_DYNTICK:
+
+		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
+			break; /* So gcc recognizes the dead code. */
+
+		/* Record dyntick-idle state. */
+		if (rcu_process_dyntick(rsp, lastcomp,
+					dyntick_save_progress_counter))
+			goto unlock_ret;
+
+		/* Update state, record completion counter. */
+		spin_lock(&rnp->lock);
+		if (lastcomp == rsp->completed) {
+			rsp->signaled = RCU_FORCE_QS;
+			dyntick_record_completed(rsp, lastcomp);
+		}
+		spin_unlock(&rnp->lock);
+		break;
+
+	case RCU_FORCE_QS:
+
+		/* Check dyntick-idle state, send IPI to laggards. */
+		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp),
+					rcu_implicit_dynticks_qs))
+			goto unlock_ret;
+
+		/* Leave state in case more forcing is required. */
+
+		break;
+	}
+unlock_ret:
+	spin_unlock_irqrestore(&rsp->fqslock, flags);
+}
+
+#else /* #ifdef CONFIG_SMP */
+
+static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
+{	/* !CONFIG_SMP variant: ignores both arguments. */
+	set_need_resched();
+}
+
+#endif /* #else #ifdef CONFIG_SMP */
+
+/*
+ * This does the RCU processing work from softirq context for the
+ * specified rcu_state and rcu_data structures.  This may be called
+ * only from the CPU to whom the rdp belongs.
+ *
+ * In order: force quiescent states if rsp->jiffies_force_qs has passed,
+ * note any grace-period end, report this CPU's quiescent states, start
+ * a new grace period if this CPU needs one, and invoke ready callbacks.
+ */
+static void
+__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	unsigned long flags;
+
+	/*
+	 * If an RCU GP has gone long enough, go check for dyntick
+	 * idle CPUs and, if needed, send resched IPIs.
+	 */
+	if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+	    	force_quiescent_state(rsp, 1);
+
+	/*
+	 * Advance callbacks in response to end of earlier grace
+	 * period that some other CPU ended.
+	 */
+	rcu_process_gp_end(rsp, rdp);
+
+	/* Update RCU state based on any recent quiescent states. */
+	rcu_check_quiescent_state(rsp, rdp);
+
+	/* Does this CPU require a not-yet-started grace period? */
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+		rcu_start_gp(rsp, flags);  /* releases above lock */
+	}
+
+	/* If there are callbacks ready, invoke them. */
+	rcu_do_batch(rdp);
+}
+
+/*
+ * Do softirq processing for the current CPU, first for the rcu flavor
+ * and then for the rcu_bh flavor.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be seen before any RCU
+	 * grace-period manipulations below.
+	 */
+	smp_mb(); /* See above block comment. */
+
+	__rcu_process_callbacks(&rcu_state, &__get_cpu_var(rcu_data));
+	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be seen after any RCU
+	 * grace-period manipulations above.
+	 */
+	smp_mb(); /* See above block comment. */
+}
+
+static void
+__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
+	   struct rcu_state *rsp)
+{	/* Queue head, to be invoked via func, on this CPU's list for rsp. */
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+
+	smp_mb(); /* Ensure RCU update seen before callback registry. */
+
+	/*
+	 * Opportunistically note grace-period endings and beginnings.
+	 * Note that we might see a beginning right after we see an
+	 * end, but never vice versa, since this CPU has to pass through
+	 * a quiescent state betweentimes.
+	 */
+	local_irq_save(flags);
+	rdp = rsp->rda[smp_processor_id()];
+	rcu_process_gp_end(rsp, rdp);
+	check_for_new_grace_period(rsp, rdp);
+
+	/* Add the callback to the tail of our list. */
+	*rdp->nxttail[RCU_NEXT_TAIL] = head;
+	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+
+	/* Start a new grace period if one not already started. */
+	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
+		unsigned long nestflag;	/* irqs are already disabled here */
+		struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+		spin_lock_irqsave(&rnp_root->lock, nestflag);
+		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
+	}
+
+	/* Force the grace period if too many callbacks or too long waiting. */
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;	/* lift rcu_do_batch()'s throttle */
+		force_quiescent_state(rsp, 0);
+	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+		force_quiescent_state(rsp, 1);
+	local_irq_restore(flags);
+}
+
+/*
+ * Queue an RCU callback for invocation after a grace period.
+ * @head is queued for the rcu_state flavor with @func recorded as the
+ * function to call on it.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Queue an RCU callback for invocation after a quicker grace period.
+ * @head is queued for the rcu_bh_state flavor with @func recorded as
+ * the function to call on it.
+ */
+void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	__call_rcu(head, func, &rcu_bh_state);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, for the specified type of RCU, returning 1 if so.
+ * The checks are in order of increasing expense: checks that can be
+ * carried out against CPU-local state are performed first.  However,
+ * we must check for CPU stalls first, else we might not get a chance.
+ * Returns 0 if none of the checks finds work.
+ */
+static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
+{
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rsp, rdp);
+
+	/* Is the RCU core waiting for a quiescent state from this CPU? */
+	if (rdp->qs_pending)
+		return 1;
+
+	/* Does this CPU have callbacks ready to invoke? */
+	if (cpu_has_callbacks_ready_to_invoke(rdp))
+		return 1;
+
+	/* Has RCU gone idle with this CPU needing another grace period? */
+	if (cpu_needs_another_gp(rsp, rdp))
+		return 1;
+
+	/* Has another RCU grace period completed?  */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+		return 1;
+
+	/* Has a new RCU grace period started? */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+		return 1;
+
+	/* Has an RCU GP gone long enough to send resched IPIs &c? */
+	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
+	    (long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the current CPU, returning 1 if so.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ *
+ * The rcu flavor is checked first; because of the short-circuit ||, the
+ * rcu_bh flavor is examined only if the rcu check found nothing.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_state, &per_cpu(rcu_data, cpu)) ||
+	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the specified CPU, even if none need be done immediately, returning
+ * 1 if so.  This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ *
+ * The test is simply whether either flavor's callback list for @cpu
+ * is non-empty.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	/* RCU callbacks either ready or pending? */
+	return per_cpu(rcu_data, cpu).nxtlist ||
+	       per_cpu(rcu_bh_data, cpu).nxtlist;
+}
+
+/*
+ * Initialize a CPU's per-CPU RCU data.  We take this "scorched earth"
+ * approach so that we don't have to worry about how long the CPU has
+ * been gone, or whether it ever was online previously.  We do trust the
+ * ->mynode field, as it is constant for a given struct rcu_data and
+ * initialized during early boot.
+ *
+ * Note that only one online or offline event can be happening at a given
+ * time.  Note also that we can accept some slop in the rsp->completed
+ * access due to the fact that this CPU cannot possibly have any RCU
+ * callbacks in flight yet.
+ *
+ * @cpu is the CPU whose rcu_data is initialized and @rsp selects the
+ * RCU flavor.  The function proceeds in three steps: reset the rcu_data
+ * under the root node's lock, set the CPU's bit in ->qsmaskinit from
+ * its leaf upwards under rsp->onofflock, then report a quiescent state.
+ */
+static void
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	int i;
+	unsigned long mask;
+	struct rcu_data *rdp = rsp->rda[cpu];
+	struct rcu_node *rnp = rcu_get_root(rsp);
+
+	/* Set up local state, ensuring consistent view of global state. */
+	spin_lock_irqsave(&rnp->lock, flags);
+	rdp->completed = rsp->completed;
+	rdp->gpnum = rsp->completed;
+	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */
+	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */
+	rdp->beenonline = 1;	 /* We have now been online. */
+	rdp->passed_quiesc_completed = rsp->completed - 1;
+	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rdp->qlen = 0;
+	rdp->blimit = blimit;
+#ifdef CONFIG_NO_HZ
+	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
+#endif /* #ifdef CONFIG_NO_HZ */
+	rdp->cpu = cpu;
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we won't be part
+	 * of it, but that is OK, as we are currently in a quiescent state.
+	 */
+
+	/* Exclude any attempts to start a new GP on large systems. */
+	spin_lock(&rsp->onofflock);		/* irqs already disabled. */
+
+	/*
+	 * Add CPU to rcu_node bitmasks, walking up from the leaf and
+	 * stopping once a parent already has this subtree's bit set.
+	 */
+	rnp = rdp->mynode;
+	mask = rdp->grpmask;
+	do {
+		/* Exclude any attempts to start a new GP on small systems. */
+		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		rnp->qsmaskinit |= mask;
+		mask = rnp->grpmask;
+		spin_unlock(&rnp->lock); /* irqs already disabled. */
+		rnp = rnp->parent;
+	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
+
+	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+
+	/*
+	 * A new grace period might start here.  If so, we will be part of
+	 * it, and its gpnum will be greater than ours, so we will
+	 * participate.  It is also possible for the gpnum to have been
+	 * incremented before this function was called, and the bitmasks
+	 * to not be filled out until now, in which case we will also
+	 * participate due to our gpnum being behind.
+	 */
+
+	/* Since it is coming online, the CPU is in a quiescent state. */
+	cpu_quiet(cpu, rsp, rdp, NULL);
+	local_irq_restore(flags);
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{	/* Set up both RCU flavors' per-CPU data and the RCU softirq. */
+#ifdef CONFIG_NO_HZ
+	struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+	rdtp->dynticks_nesting = 1;
+	rdtp->dynticks |= 1; 	/* need consecutive #s even for hotplug. */
+	rdtp->dynticks_nmi = (rdtp->dynticks + 1) & ~0x1;
+#endif /* #ifdef CONFIG_NO_HZ */
+	rcu_init_percpu_data(cpu, &rcu_state);
+	rcu_init_percpu_data(cpu, &rcu_bh_state);
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+/*
+ * Handle CPU online/offline notification events.
+ *
+ * CPU_UP_PREPARE (and its _FROZEN variant) calls rcu_online_cpu();
+ * CPU_DEAD and CPU_UP_CANCELED (and their _FROZEN variants) call
+ * rcu_offline_cpu().  All other actions are ignored.  Always returns
+ * NOTIFY_OK.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;	/* CPU number is passed in the pointer */
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+/*
+ * Compute the per-level fanout, either using the exact fanout specified
+ * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
+ *
+ * Exact: every level's ->levelspread[] is set to CONFIG_RCU_FANOUT.
+ * Balanced: working from the leaf level upwards and starting from
+ * NR_CPUS, each level's spread is the number of entities in the level
+ * below divided by this level's node count, rounded up.
+ */
+#ifdef CONFIG_RCU_FANOUT_EXACT
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int i;
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+	}
+	
+}
+#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
+static void __init rcu_init_levelspread(struct rcu_state *rsp)
+{
+	int ccur;	/* number of nodes at the current level */
+	int cprv;	/* number of entities one level further down */
+	int i;
+
+	cprv = NR_CPUS;
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		ccur = rsp->levelcnt[i];
+		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+		cprv = ccur;
+	}
+	
+}
+#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
+
+/*
+ * Helper function for rcu_init() that initializes one rcu_state structure.
+ *
+ * Points each rsp->level[i] (i >= 1) just past the nodes of level i - 1,
+ * computes the per-level fanout, and then initializes every rcu_node's
+ * lock, masks, CPU range (->grplo to ->grphi, clamped to NR_CPUS - 1),
+ * position within its parent, and parent pointer.  rsp->level[0] and
+ * rsp->levelcnt[] are only read here, so presumably they are set up
+ * before this is called.
+ */
+static void __init rcu_init_one(struct rcu_state *rsp)
+{
+	int cpustride = 1;	/* CPUs covered by one node at level i */
+	int i;
+	int j;
+	struct rcu_node *rnp;
+
+	/* Initialize the level-tracking arrays. */
+
+	for (i = 1; i < NUM_RCU_LVLS; i++) {
+		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
+	}
+	rcu_init_levelspread(rsp);
+
+	/* Initialize the elements themselves, starting from the leaves. */
+
+	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+		cpustride *= rsp->levelspread[i];
+		rnp = rsp->level[i];
+		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
+			spin_lock_init(&rnp->lock);
+			rnp->qsmask = 0;
+			rnp->qsmaskinit = 0;
+			rnp->grplo = j * cpustride;
+			rnp->grphi = (j + 1) * cpustride - 1;
+			if (rnp->grphi >= NR_CPUS)
+				rnp->grphi = NR_CPUS - 1;
+			if (i == 0) {	/* root node: no parent */
+				rnp->grpnum = 0;
+				rnp->grpmask = 0;
+				rnp->parent = NULL;
+			} else {
+				rnp->grpnum = j % rsp->levelspread[i - 1];
+				rnp->grpmask = 1UL << rnp->grpnum;
+				rnp->parent = rsp->level[i - 1] + 
+					      j / rsp->levelspread[i - 1];
+			}
+			rnp->level = i;
+		}
+	}
+}
+
+/*
+ * Helper macro for __rcu_init().  To be used nowhere else!
+ * Assigns leaf node pointers into each CPU's rcu_data structure.
+ * Also records each CPU's rcu_data pointer in (rsp)->rda[].  Uses the
+ * caller's local variables i, j, and rnp, and moves on to the next leaf
+ * whenever the CPU number exceeds the current leaf's ->grphi.
+ */
+#define RCU_DATA_PTR_INIT(rsp, rcu_data) \
+do { \
+	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	j = 0; \
+	for_each_possible_cpu(i) { \
+		if (i > rnp[j].grphi) \
+			j++; \
+		per_cpu(rcu_data, i).mynode = &rnp[j]; \
+		(rsp)->rda[i] = &per_cpu(rcu_data, i); \
+	} \
+} while (0)
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,	/* CPU online/offline events */
+};
+
+void __init __rcu_init(void)
+{
+	int i;			/* All used by RCU_DATA_PTR_INIT(). */
+	int j;
+	struct rcu_node *rnp;
+
+	printk(KERN_WARNING "Experimental hierarchical RCU implementation.\n");
+	/* Must name the Kconfig symbol RCU_CPU_STALL_DETECTOR to ever fire. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+	/* Build the rcu_node tree and per-CPU links for each flavor. */
+	rcu_init_one(&rcu_state);
+	RCU_DATA_PTR_INIT(&rcu_state, rcu_data);
+	rcu_init_one(&rcu_bh_state);
+	RCU_DATA_PTR_INIT(&rcu_bh_state, rcu_bh_data);
+
+	/* CPUs already online get the CPU_UP_PREPARE handling by hand. */
+	for_each_online_cpu(i)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)i);
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+	printk(KERN_WARNING "Experimental hierarchical RCU init done.\n");
+}
+
+module_param(blimit, int, 0);	/* batch limit reinstated by rcu_do_batch() */
+module_param(qhimark, int, 0);	/* qlen above which __call_rcu() forces QS */
+module_param(qlowmark, int, 0);	/* qlen at or below which blimit returns */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
new file mode 100644
index 0000000..18d1613
--- /dev/null
+++ b/kernel/rcutree_trace.c
@@ -0,0 +1,235 @@
+/*
+ * Read-Copy Update tracing for classic implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+extern int tick_do_timer_cpu __read_mostly;  /* @@@ DEBUG @@@ */
+
+static DEFINE_MUTEX(rcuclassic_trace_mutex);	/* held while formatting */
+static char *rcuclassic_trace_buf;	/* output buffer shared by all files */
+#define RCUPREEMPT_TRACE_BUF_SIZE (512*NR_CPUS)	/* bytes in that buffer */
+
+/*
+ * Format one line of per-CPU state for rdp into buf, never writing at or
+ * beyond ebuf.  Returns the number of characters stored (excluding the
+ * trailing NUL), or 0 if the CPU has never been online.
+ *
+ * scnprintf() is used rather than snprintf() because the return value
+ * advances the write position: snprintf() returns the length the output
+ * would have had, which on truncation would move the position past ebuf
+ * and make the next size argument negative.
+ */
+static int print_one_rcu_data(struct rcu_data *rdp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+
+	if (!rdp->beenonline)
+		return 0;
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		"%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d",
+		rdp->cpu,
+		cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		rdp->completed, rdp->gpnum,
+		rdp->passed_quiesc, rdp->passed_quiesc_completed,
+		rdp->qs_pending);
+#ifdef CONFIG_NO_HZ
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" dt=%d dn=%d df=%lu",
+		rdp->dynticks->dynticks, rdp->dynticks->dynticks_nmi,
+		rdp->dynticks_fqs);
+#endif /* #ifdef CONFIG_NO_HZ */
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+		" ql=%ld b=%ld\n", rdp->qlen, rdp->blimit);
+	return cnt;
+}
+
+#define PRINT_RCU_DATA(name, buf, ebuf) \
+	do { \
+		int _p_r_d_i; \
+		/* One print_one_rcu_data() call per possible CPU. */ \
+		for_each_possible_cpu(_p_r_d_i) \
+			(buf) += print_one_rcu_data(&per_cpu(name, _p_r_d_i), \
+						    buf, ebuf); \
+	} while (0)
+
+/*
+ * Read handler for the "rcudata" debugfs file.  Regenerates the per-CPU
+ * report for both RCU flavors into rcuclassic_trace_buf under
+ * rcuclassic_trace_mutex, then copies the requested window to user space.
+ *
+ * scnprintf() keeps buf from advancing past ebuf if output is truncated.
+ */
+static ssize_t rcudata_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu:\n");
+	PRINT_RCU_DATA(rcu_data, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	PRINT_RCU_DATA(rcu_bh_data, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/*
+ * Format the global state of rsp, followed by one entry per rcu_node
+ * (with a line break at each change of level), into buf without writing
+ * at or beyond ebuf.  Returns the number of characters stored.
+ *
+ * scnprintf() is used because its return value is what was actually
+ * stored, so cnt cannot run past ebuf when the output is truncated.
+ */
+static int print_one_rcu_state(struct rcu_state *rsp, char *buf, char *ebuf)
+{
+	int cnt = 0;
+	int level = 0;
+	struct rcu_node *rnp;
+
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+			"c=%ld g=%ld s=%d jfq=%ld nfqs=%lu/nfqsng=%lu(%lu)\n",
+			rsp->completed, rsp->gpnum, rsp->signaled,
+			(long)(rsp->jiffies_force_qs - jiffies),
+			rsp->n_force_qs, rsp->n_force_qs_ngp,
+			rsp->n_force_qs - rsp->n_force_qs_ngp);
+	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
+		if (rnp->level != level) {
+			cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+			level = rnp->level;
+		}
+		cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt],
+				"%lx/%lx %d:%d ^%d    ",
+				rnp->qsmask, rnp->qsmaskinit,
+				rnp->grplo, rnp->grphi, rnp->grpnum);
+	}
+	cnt += scnprintf(&buf[cnt], ebuf - &buf[cnt], "\n");
+	return cnt;
+}
+
+/*
+ * Read handler for the "rcuhier" debugfs file.  Regenerates the
+ * rcu_node hierarchy report for both RCU flavors into
+ * rcuclassic_trace_buf under rcuclassic_trace_mutex, then copies the
+ * requested window to user space.
+ *
+ * scnprintf() keeps buf from advancing past ebuf if output is truncated.
+ */
+static ssize_t rcuhier_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu: tick_do_timer_cpu=%d\n",
+			tick_do_timer_cpu);  /* @@@ DEBUG @@@ */
+	buf += print_one_rcu_state(&rcu_state, buf, ebuf);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh:\n");
+	buf += print_one_rcu_state(&rcu_bh_state, buf, ebuf);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+/*
+ * Read handler for the "rcugp" debugfs file.  Formats the completed and
+ * gpnum counters of both RCU flavors into rcuclassic_trace_buf under
+ * rcuclassic_trace_mutex, then copies the requested window to user space.
+ *
+ * scnprintf() keeps buf from advancing past ebuf if output is truncated.
+ */
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t bcount;
+	char *buf = rcuclassic_trace_buf;
+	char *ebuf = &rcuclassic_trace_buf[RCUPREEMPT_TRACE_BUF_SIZE];
+
+	mutex_lock(&rcuclassic_trace_mutex);
+	buf += scnprintf(buf, ebuf - buf, "rcu: completed=%ld  gpnum=%ld\n",
+			rcu_state.completed, rcu_state.gpnum);
+	buf += scnprintf(buf, ebuf - buf, "rcu_bh: completed=%ld  gpnum=%ld\n",
+			rcu_bh_state.completed, rcu_bh_state.gpnum);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcuclassic_trace_buf, strlen(rcuclassic_trace_buf));
+	mutex_unlock(&rcuclassic_trace_mutex);
+	return bcount;
+}
+
+static struct file_operations rcudata_fops = {
+	.owner = THIS_MODULE,
+	.read = rcudata_read,	/* backs the "rcudata" debugfs file */
+};
+
+static struct file_operations rcuhier_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuhier_read,	/* backs the "rcuhier" debugfs file */
+};
+
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,	/* backs the "rcugp" debugfs file */
+};
+
+static struct dentry *rcudir, *datadir, *hierdir, *gpdir;
+static int rcuclassic_debugfs_init(void)
+{	/* Create rcu/{rcudata,rcugp,rcuhier}; returns 0 if all exist, else 1. */
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		goto out;
+	datadir = debugfs_create_file("rcudata", 0444, rcudir,
+						NULL, &rcudata_fops);
+	if (!datadir)
+		goto free_out;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
+	if (!gpdir)
+		goto free_out;
+
+	hierdir = debugfs_create_file("rcuhier", 0444, rcudir,
+						NULL, &rcuhier_fops);
+	if (!hierdir)
+		goto free_out;
+	return 0;
+free_out:	/* remove whichever files were created, then the directory */
+	if (datadir)
+		debugfs_remove(datadir);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+out:
+	return 1;
+}
+
+/*
+ * Allocate the shared trace buffer and create the debugfs files.
+ * Returns 0 on success or -ENOMEM on failure, following the 0/-errno
+ * convention for init functions; the buffer is freed (and the pointer
+ * cleared) if the debugfs files cannot be created.
+ */
+static int __init rcuclassic_trace_init(void)
+{
+	rcuclassic_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcuclassic_trace_buf)
+		return -ENOMEM;
+	if (rcuclassic_debugfs_init()) {
+		kfree(rcuclassic_trace_buf);
+		rcuclassic_trace_buf = NULL;	/* no dangling pointer */
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void __exit rcuclassic_trace_cleanup(void)
+{	/* Remove the debugfs files, then their directory; free the buffer. */
+	debugfs_remove(datadir);
+	debugfs_remove(gpdir);
+	debugfs_remove(hierdir);
+	debugfs_remove(rcudir);
+	kfree(rcuclassic_trace_buf);
+}
+
+
+module_init(rcuclassic_trace_init);
+module_exit(rcuclassic_trace_cleanup);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index c506f26..ad31780 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -256,8 +256,11 @@ void irq_enter(void)
 {
 #ifdef CONFIG_NO_HZ
 	int cpu = smp_processor_id();
-	if (idle_cpu(cpu) && !in_interrupt())
-		tick_nohz_stop_idle(cpu);
+	if (idle_cpu(cpu)) {
+		if (!in_interrupt())
+			tick_nohz_stop_idle(cpu);
+		rcu_irq_enter();
+	}
 #endif
 	__irq_enter();
 #ifdef CONFIG_NO_HZ
@@ -285,9 +288,11 @@ void irq_exit(void)
 
 #ifdef CONFIG_NO_HZ
 	/* Make sure that timer wheel updates are propagated */
-	if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
-		tick_nohz_stop_sched_tick(0);
-	rcu_irq_exit();
+	if (idle_cpu(smp_processor_id())) {
+		rcu_irq_exit();
+		if (!in_interrupt() && !need_resched())
+			tick_nohz_stop_sched_tick(0);
+	}
 #endif
 	preempt_enable_no_resched();
 }
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 800ac84..804e08c 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_CPU_STALL_DETECTOR
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU || TREE_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL

[-- Attachment #2: onoffrandtorture.sh --]
[-- Type: application/x-sh, Size: 931 bytes --]

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v6 scalable classic RCU implementation
  2008-09-23 23:53         ` [PATCH, RFC] v6 " Paul E. McKenney
@ 2008-09-25  7:26           ` Ingo Molnar
  2008-09-25 14:05             ` Paul E. McKenney
  2008-09-25  7:29           ` Ingo Molnar
  2008-10-10 16:09           ` [PATCH, RFC] v7 " Paul E. McKenney
  2 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-09-25  7:26 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi, tglx


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> +config RCU_CPU_STALL_DETECTOR
> +	bool "Check for stalled CPUs delaying RCU grace periods"
> +	depends on CLASSIC_RCU || TREE_RCU
> +	default n
> +	help
> +	  This option causes RCU to printk information on which
> +	  CPUs are delaying the current grace period, but only when
> +	  the grace period extends for excessive time periods.
> +
> +	  Say Y if you want RCU to perform such checks.
> +
> +	  Say N if you are unsure.

could you please send this bit separately, even if rcutree isnt finished 
yet? Seems like a quite useful debug feature.

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v6 scalable classic RCU implementation
  2008-09-23 23:53         ` [PATCH, RFC] v6 " Paul E. McKenney
  2008-09-25  7:26           ` Ingo Molnar
@ 2008-09-25  7:29           ` Ingo Molnar
  2008-09-25 14:18             ` Paul E. McKenney
  2008-10-10 16:09           ` [PATCH, RFC] v7 " Paul E. McKenney
  2 siblings, 1 reply; 94+ messages in thread
From: Ingo Molnar @ 2008-09-25  7:29 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi, tglx,
	Thomas Gleixner


* Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:

> Attached is an updated patch to Classic RCU that applies a hierarchy, 
> greatly reducing the contention on the top-level lock for large 
> machines.  This passes 10-hour concurrent rcutorture and 
> online-offline testing on 128-CPU ppc64 without dynticks enabled, and 
> exposes some timekeeping bugs in presence of dynticks (exciting 
> working on a system where "sleep 1" hangs until interrupted...).

i'm wondering about those timekeeping bugs. Do you have an idea what's 
it about and does it affect mainline?

	Ingo

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v6 scalable classic RCU implementation
  2008-09-25  7:26           ` Ingo Molnar
@ 2008-09-25 14:05             ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-25 14:05 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi, tglx

On Thu, Sep 25, 2008 at 09:26:07AM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > +config RCU_CPU_STALL_DETECTOR
> > +	bool "Check for stalled CPUs delaying RCU grace periods"
> > +	depends on CLASSIC_RCU || TREE_RCU
> > +	default n
> > +	help
> > +	  This option causes RCU to printk information on which
> > +	  CPUs are delaying the current grace period, but only when
> > +	  the grace period extends for excessive time periods.
> > +
> > +	  Say Y if you want RCU to perform such checks.
> > +
> > +	  Say N if you are unsure.
> 
> could you please send this bit separately, even if rcutree isnt finished 
> yet? Seems like a quite useful debug feature.

Good point -- I guess I do need to implement the CLASSIC_RCU piece of
this in any case.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* Re: [PATCH, RFC] v6 scalable classic RCU implementation
  2008-09-25  7:29           ` Ingo Molnar
@ 2008-09-25 14:18             ` Paul E. McKenney
  0 siblings, 0 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-09-25 14:18 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, cl, akpm, manfred, dipankar, josht, schamp, niv,
	dvhltc, ego, laijs, rostedt, peterz, penberg, andi, tglx,
	Thomas Gleixner

On Thu, Sep 25, 2008 at 09:29:15AM +0200, Ingo Molnar wrote:
> 
> * Paul E. McKenney <paulmck@linux.vnet.ibm.com> wrote:
> 
> > Attached is an updated patch to Classic RCU that applies a hierarchy, 
> > greatly reducing the contention on the top-level lock for large 
> > machines.  This passes 10-hour concurrent rcutorture and 
> > online-offline testing on 128-CPU ppc64 without dynticks enabled, and 
> > exposes some timekeeping bugs in presence of dynticks (exciting 
> > working on a system where "sleep 1" hangs until interrupted...).
> 
> i'm wondering about those timekeeping bugs. Do you have an idea what's 
> it about and does it affect mainline?

Sad to say, they do affect mainline -- I can reproduce these problems
in 2.6.27-rc7.  Thomas has given me a couple of fixes that got rid of
earlier problems in which jiffies would stop counting, and I believe
has located the cause of at least one other problem.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 94+ messages in thread

* [PATCH, RFC] v7 scalable classic RCU implementation
  2008-09-23 23:53         ` [PATCH, RFC] v6 " Paul E. McKenney
  2008-09-25  7:26           ` Ingo Molnar
  2008-09-25  7:29           ` Ingo Molnar
@ 2008-10-10 16:09           ` Paul E. McKenney
  2008-10-12 15:52             ` Manfred Spraul
                               ` (3 more replies)
  2 siblings, 4 replies; 94+ messages in thread
From: Paul E. McKenney @ 2008-10-10 16:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: cl, mingo, akpm, manfred, dipankar, josht, schamp, niv, dvhltc,
	ego, laijs, rostedt, peterz, penberg, andi, tglx

Hello!

This patch fixes a long-standing performance bug in classic RCU that
results in massive lock contention on the internal RCU lock on systems
with more than a few hundred CPUs.  Although this patch creates a
separate flavor of RCU for ease of review and patch maintenance, it
is intended to replace classic RCU.

Still experimental, not for inclusion, but getting quite close.  I expect
to have it in shape for 2.6.29.  Definitely ready for -serious- testing
and abuse.  In particular, experience on an actual 1000+ CPU machine
would be most welcome, and still appears to be forthcoming...

Updates from v6 (http://lkml.org/lkml/2008/9/23/448):

o	Fix a number of checkpatch.pl complaints.

o	Apply review comments from Ingo Molnar and Lai Jiangshan
	on the stall-detection code.

o	Fix several bugs in !CONFIG_SMP builds.

o	Fix a misspelled config-parameter name so that RCU now announces
	at boot time if stall detection is configured.

o	Run tests on numerous combinations of configuration parameters,
	which after the fixes above, now build and run correctly.

Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line):

o	Fix a compiler error in the !CONFIG_RCU_FANOUT_EXACT case (blew a
	changeset some time ago, and finally got around to retesting
	this option).

o	Fix some tracing bugs in rcupreempt that caused incorrect
	totals to be printed.

o	I now test with a more brutal random-selection online/offline
	script (attached).  Probably more brutal than it needs to be
	on the people reading it as well, but so it goes.

o	A number of optimizations and usability improvements:

	o	Make rcu_pending() ignore the grace-period timeout when
		there is no grace period in progress.

	o	Make force_quiescent_state() avoid going for a global
		lock in the case where there is no grace period in
		progress.

	o	Rearrange struct fields to improve struct layout.

	o	Make call_rcu() initiate a grace period if RCU was
		idle, rather than waiting for the next scheduling
		clock interrupt.

	o	Invoke rcu_irq_enter() and rcu_irq_exit() only when
		idle, as suggested by Andi Kleen.  I still don't
		completely trust this change, and might back it out.

	o	Make CONFIG_RCU_TRACE be the single config variable
		manipulated for all forms of RCU, instead of the prior
		confusion.

	o	Document tracing files and formats for both rcupreempt
		and rcutree.

Updates from v4 for those missing v5 given its bad subject line:

o	Separated dynticks interface so that NMIs and irqs call separate
	functions, greatly simplifying it.  In particular, this code
	no longer requires a proof of correctness.  ;-)

o	Separated dynticks state out into its own per-CPU structure,
	avoiding the duplicated accounting.

o	The case where a dynticks-idle CPU runs an irq handler that
	invokes call_rcu() is now correctly handled, forcing that CPU
	out of dynticks-idle mode.

o	Review comments have been applied (thank you all!!!).
	For but one example, fixed the dynticks-ordering issue that
	Manfred pointed out, saving me much debugging.  ;-)

o	Adjusted rcuclassic and rcupreempt to handle dynticks changes.

Attached is an updated patch to Classic RCU that applies a
hierarchy, greatly reducing the contention on the top-level lock
for large machines.  This passes 10-hour concurrent rcutorture and
online-offline testing on 128-CPU ppc64 without dynticks enabled,
and exposes some timekeeping bugs in presence of dynticks (exciting
working on a system where "sleep 1" hangs until interrupted...).
It is OK for experimental work, but not yet ready for inclusion.
See also Manfred Spraul's recent patches (or his earlier work from
2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2).
We will converge onto a common patch in the fullness of time, but are
currently exploring different regions of the design space.  That said,
I have already gratefully stolen quite a few of Manfred's ideas.

This patch provides CONFIG_RCU_FANOUT, which controls the bushiness
of the RCU hierarchy.  Defaults to 32 on 32-bit machines and 64 on
64-bit machines.  If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT,
there is no hierarchy.  By default, the RCU initialization code will
adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA
architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable
this balancing, allowing the hierarchy to be exactly aligned to the
underlying hardware.  Up to two levels of hierarchy are permitted
(in addition to the root node), allowing up to 16,384 CPUs on 32-bit
systems and up to 262,144 CPUs on 64-bit systems.  I just know that I
am going to regret saying this, but this seems more than sufficient
for the foreseeable future.  (Some architectures might wish to set
CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs.
If this becomes a real problem, additional levels can be added, but I
doubt that it will make a significant difference on real hardware.)

In the common case, a given CPU will manipulate its private rcu_data
structure and the rcu_node structure that it shares with its immediate
neighbors.  This can reduce both lock and memory contention by multiple
orders of magnitude, which should eliminate the need for the strange
manipulations that are reported to be required when running Linux on
very large systems.

Some shortcomings:

o	Some of the NR_CPUS need to be eliminated.  That said, some
	will remain.

o	There is a bit of debug code in place.  This will be removed.

o	There are probably hangs, rcutorture failures, &c.  Seems
	quite stable on a 128-CPU machine, but that is kind of small
	compared to 4096 CPUs.

o	There is not yet a human-readable design document.  One is now
	close to completion.

Credits:

o	Manfred Spraul for ideas, review comments, and bugs spotted,
	as well as some good friendly competition.  ;-)

o	Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers,
	Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton
	for reviews and comments.

o	Thomas Gleixner for much-needed help with some timer issues
	(see patches below).

o	Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos,
	Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton
	Blanchard, and Nathan Lynch for keeping machines alive despite
	my heavy abuse^Wtesting.

To build, start with 2.6.27-rc7, and apply:

	http://www.rdrop.com/users/paulmck/patches/2.6.27-rc3-treeRCU-20.patch
	http://tglx.de/~tglx/gack.patch
	http://tglx.de/~tglx/clockevents-keep-tick-next-period-up-to-date.patch

Thoughts?

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---

 Documentation/RCU/00-INDEX             |    2 
 Documentation/RCU/trace.txt            |  398 ++++++++
 arch/powerpc/platforms/pseries/rtasd.c |    4 
 include/linux/hardirq.h                |   14 
 include/linux/rcupdate.h               |   10 
 include/linux/rcutree.h                |  325 +++++++
 init/Kconfig                           |   18 
 kernel/Kconfig.preempt                 |   62 +
 kernel/Makefile                        |    6 
 kernel/rcupreempt.c                    |   10 
 kernel/rcupreempt_trace.c              |   10 
 kernel/rcutree.c                       | 1510 +++++++++++++++++++++++++++++++++
 kernel/rcutree_trace.c                 |  232 +++++
 kernel/softirq.c                       |   15 
 lib/Kconfig.debug                      |   13 
 15 files changed, 2595 insertions(+), 34 deletions(-)

diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX
index 461481d..7dc0695 100644
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -16,6 +16,8 @@ RTFP.txt
 	- List of RCU papers (bibliography) going back to 1980.
 torture.txt
 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
+trace.txt
+	- CONFIG_RCU_TRACE debugfs files and formats
 UP.txt
 	- RCU on Uniprocessor Systems
 whatisRCU.txt
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
new file mode 100644
index 0000000..d25110c
--- /dev/null
+++ b/Documentation/RCU/trace.txt
@@ -0,0 +1,398 @@
+CONFIG_RCU_TRACE debugfs Files and Formats
+
+
+The rcupreempt and rcutree implementations of RCU provide debugfs trace
+output that summarizes counters and state.  This information is useful for
+debugging RCU itself, and can sometimes also help to debug abuses of RCU.
+Note that the rcuclassic implementation of RCU does not provide debugfs
+trace output.
+
+The following sections describe the debugfs files and formats for
+preemptable RCU (rcupreempt) and hierarchical RCU (rcutree).
+
+
+Preemptable RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcuctrs (which displays the per-CPU
+counters used by preemptable RCU) rcu/rcugp (which displays grace-period
+counters), and rcu/rcustats (which displays internal counters for debugging RCU).
+
+The output of "cat rcu/rcuctrs" looks as follows:
+
+CPU last cur F M
+  0    5  -5 0 0
+  1   -1   0 0 0
+  2    0   1 0 0
+  3    0   1 0 0
+  4    0   1 0 0
+  5    0   1 0 0
+  6    0   2 0 0
+  7    0  -1 0 0
+  8    0   1 0 0
+ggp = 26226, state = waitzero
+
+The per-CPU fields are as follows:
+
+o	"CPU" gives the CPU number.  Offline CPUs are not displayed.
+
+o	"last" gives the value of the counter that is being decremented
+	for the current grace period phase.  In the example above,
+	the counters sum to 4, indicating that there are still four
+	RCU read-side critical sections still running that started
+	before the last counter flip.
+
+o	"cur" gives the value of the counter that is currently being
+	both incremented (by rcu_read_lock()) and decremented (by
+	rcu_read_unlock()).  In the example above, the counters sum to
+	1, indicating that there is only one RCU read-side critical section
+	still running that started after the last counter flip.
+
+o	"F" indicates whether RCU is waiting for this CPU to acknowledge
+	a counter flip.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitack".
+
+o	"M" indicates whether RCU is waiting for this CPU to execute a
+	memory barrier.  In the above example, RCU is not waiting on any,
+	which is consistent with the state being "waitzero" rather than
+	"waitmb".
+
+o	"ggp" is the global grace-period counter.
+
+o	"state" is the RCU state, which can be one of the following:
+
+	o	"idle": there is no grace period in progress.
+
+	o	"waitack": RCU just incremented the global grace-period
+		counter, which has the effect of reversing the roles of
+		the "last" and "cur" counters above, and is waiting for
+		all the CPUs to acknowledge the flip.  Once the flip has
+		been acknowledged, CPUs will no longer be incrementing
+		what are now the "last" counters, so that their sum will
+		decrease monotonically down to zero.
+
+	o	"waitzero": RCU is waiting for the sum of the "last" counters
+		to decrease to zero.
+
+	o	"waitmb": RCU is waiting for each CPU to execute a memory
+		barrier, which ensures that instructions from a given CPU's
+		last RCU read-side critical section cannot be reordered
+		with instructions following the memory-barrier instruction.
+
+The output of "cat rcu/rcugp" looks as follows:
+
+oldggp=48870  newggp=48873
+
+Note that reading from this file provokes a synchronize_rcu().  The
+"oldggp" value is that of "ggp" from rcu/rcuctrs above, taken before
+executing the synchronize_rcu(), and the "newggp" value is also the
+"ggp" value, but taken after the synchronize_rcu() command returns.
+
+
+The output of "cat rcu/rcustats" looks as follows:
+
+na=1337955 nl=40 wa=1337915 wl=44 da=1337871 dl=0 dr=1337871 di=1337871
+1=50989 e1=6138 i1=49722 ie1=82 g1=49640 a1=315203 ae1=265563 a2=49640
+z1=1401244 ze1=1351605 z2=49639 m1=5661253 me1=5611614 m2=49639
+
+These are counters tracking internal preemptable-RCU events, however,
+some of them may be useful for debugging algorithms using RCU.  In
+particular, the "nl", "wl", and "dl" values track the number of RCU
+callbacks in various states.  The fields are as follows:
+
+o	"na" is the total number of RCU callbacks that have been enqueued
+	since boot.
+
+o	"nl" is the number of RCU callbacks waiting for the previous
+	grace period to end so that they can start waiting on the next
+	grace period.
+
+o	"wa" is the total number of RCU callbacks that have started waiting
+	for a grace period since boot.  "na" should be roughly equal to
+	"nl" plus "wa".
+
+o	"wl" is the number of RCU callbacks currently waiting for their
+	grace period to end.
+
+o	"da" is the total number of RCU callbacks whose grace periods
+	have completed since boot.  "wa" should be roughly equal to
+	"wl" plus "da".
+
+o	"dr" is the total number of RCU callbacks that have been removed
+	from the list of callbacks ready to invoke.  "dr" should be roughly
+	equal to "da".
+
+o	"di" is the total number of RCU callbacks that have been invoked
+	since boot.  "di" should be roughly equal to "da", though some
+	early versions of preemptable RCU had a bug so that only the
+	last CPU's count of invocations was displayed, rather than the
+	sum of all CPU's counts.
+
+o	"1" is the number of calls to rcu_try_flip().  This should be
+	roughly equal to the sum of "e1", "i1", "a1", "z1", and "m1"
+	described below.  In other words, the number of times that
+	the state machine is visited should be equal to the sum of the
+	number of times that each state is visited plus the number of
+	times that the state-machine lock acquisition failed.
+
+o	"e1" is the number of times that rcu_try_flip() was unable to
+	acquire the fliplock.
+
+o	"i1" is the number of calls to rcu_try_flip_idle().
+
+o	"ie1" is the number of times rcu_try_flip_idle() exited early
+	due to the calling CPU having no work for RCU.
+
+o	"g1" is the number of times that rcu_try_flip_idle() decided
+	to start a new grace period.  "i1" should be roughly equal to
+	"ie1" plus "g1".
+
+o	"a1" is the number of calls to rcu_try_flip_waitack().
+
+o	"ae1" is the number of times that rcu_try_flip_waitack() found
+	that at least one CPU had not yet acknowledged the new grace period
+	(AKA "counter flip").
+
+o	"a2" is the number of times rcu_try_flip_waitack() found that
+	all CPUs had acknowledged.  "a1" should be roughly equal to
+	"ae1" plus "a2".  (This particular output was collected on
+	a 128-CPU machine, hence the smaller-than-usual fraction of
+	calls to rcu_try_flip_waitack() finding all CPUs having already
+	acknowledged.)
+
+o	"z1" is the number of calls to rcu_try_flip_waitzero().
+
+o	"ze1" is the number of times that rcu_try_flip_waitzero() found
+	that not all of the old RCU read-side critical sections had
+	completed.
+
+o	"z2" is the number of times that rcu_try_flip_waitzero() finds
+	the sum of the counters equal to zero, in other words, that
+	all of the old RCU read-side critical sections had completed.
+	The value of "z1" should be roughly equal to "ze1" plus
+	"z2".
+
+o	"m1" is the number of calls to rcu_try_flip_waitmb().
+
+o	"me1" is the number of times that rcu_try_flip_waitmb() finds
+	that at least one CPU has not yet executed a memory barrier.
+
+o	"m2" is the number of times that rcu_try_flip_waitmb() finds that
+	all CPUs have executed a memory barrier.
+
+
+Hierarchical RCU debugfs Files and Formats
+
+This implementation of RCU provides three debugfs files under the
+top-level directory RCU: rcu/rcudata (which displays fields in struct
+rcu_data), rcu/rcugp (which displays grace-period counters), and
+rcu/rcuhier (which displays the struct rcu_node hierarchy).
+
+The output of "cat rcu/rcudata" looks as follows:
+
+rcu:
+  0 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=26097 dn=2 df=9102 of=0 ri=11 ql=2 b=10
+  1 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=30421 dn=2 df=6608 of=0 ri=2 ql=39 b=10
+  2 c=1982 g=1982 pq=1 pqc=1982 qp=0 dt=10934 dn=2 df=9612 of=0 ri=0 ql=0 b=10
+  3 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=30139 dn=2 df=6043 of=0 ri=0 ql=58 b=10
+  4 c=1960 g=1960 pq=1 pqc=1960 qp=1 dt=1202 dn=2 df=30470 of=0 ri=3 ql=0 b=10
+  5 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=15341 dn=2 df=5350 of=0 ri=0 ql=25 b=10
+  6 c=1983 g=1984 pq=1 pqc=1983 qp=1 dt=516 dn=2 df=31950 of=0 ri=0 ql=0 b=10
+  7 c=1985 g=1986 pq=1 pqc=1985 qp=0 dt=8205 dn=2 df=7465 of=0 ri=0 ql=28 b=10
+rcu_bh:
+  0 c=375 g=375 pq=1 pqc=375 qp=0 dt=26097 dn=2 df=0 of=0 ri=0 ql=0 b=10
+  1 c=375 g=375 pq=1 pqc=375 qp=0 dt=30421 dn=2 df=162 of=0 ri=0 ql=0 b=10
+  2 c=375 g=375 pq=1 pqc=375 qp=1 dt=10934 dn=2 df=162 of=0 ri=0 ql=0 b=10
+  3 c=375 g=375 pq=1 pqc=375 qp=0 dt=30139 dn=2 df=107 of=0 ri=0 ql=0 b=10
+  4 c=375 g=375 pq=1 pqc=375 qp=1 dt=1202 dn=2 df=174 of=0 ri=0 ql=0 b=10
+  5 c=375 g=375 pq=1 pqc=375 qp=0 dt=15341 dn=2 df=122 of=0 ri=0 ql=0 b=10
+  6 c=375 g=375 pq=1 pqc=375 qp=1 dt=516 dn=2 df=117 of=0 ri=0 ql=0 b=10
+  7 c=375 g=375 pq=1 pqc=375 qp=0 dt=8205 dn=2 df=127 of=0 ri=0 ql=0 b=10
+
+The first section lists the rcu_data structures for rcu, the second for
+rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
+The fields are as follows:
+
+o	The number at the beginning of each line is the CPU number.
+	CPU numbers followed by an exclamation mark are offline,
+	but have been online at least once since boot.	There will be
+	no output for CPUs that have never been online, which can be
+	a good thing in the surprisingly common case where NR_CPUS is
+	substantially larger than the number of actual CPUs.
+
+o	"c" is the count of grace periods that this CPU believes have
+	completed.  CPUs in dynticks idle mode may lag quite a ways
+	behind, for example, CPU 4 under "rcu" above, which has slept
+	through the past 25 RCU grace periods.	It is not unusual to
+	see CPUs lagging by thousands of grace periods.
+
+o	"g" is the count of grace periods that this CPU believes have
+	started.  Again, CPUs in dynticks idle mode may lag behind.
+	If the "c" and "g" values are equal, this CPU has already
+	reported a quiescent state for the last RCU grace period that
+	it is aware of, otherwise, the CPU believes that it owes RCU a
+	quiescent state.
+
+o	"pq" indicates that this CPU has passed through a quiescent state
+	for the current grace period.  It is possible for "pq" to be
+	"1" and "c" different than "g", which indicates that although
+	the CPU has passed through a quiescent state, either (1) this
+	CPU has not yet reported that fact, (2) some other CPU has not
+	yet reported for this grace period, or (3) both.
+
+o	"pqc" indicates which grace period the last-observed quiescent
+	state for this CPU corresponds to.  This is important for handling
+	the race between CPU 0 reporting an extended dynticks-idle
+	quiescent state for CPU 1 and CPU 1 suddenly waking up and
+	reporting its own quiescent state.  If CPU 1 was the last CPU
+	for the current grace period, then the CPU that loses this race
+	will attempt to incorrectly mark CPU 1 as having checked in for
+	the next grace period!
+
+o	"qp" indicates that RCU still expects a quiescent state from
+	this CPU.
+
+o	"dt" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state, either by the
+	scheduler or by irq.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"dn" is the current value of the dyntick counter that is incremented
+	when entering or leaving dynticks idle state via NMI.  If both
+	the "dt" and "dn" values are even, then this CPU is in dynticks
+	idle mode and may be ignored by RCU.  If either of these two
+	counters is odd, then RCU must be alert to the possibility of
+	an RCU read-side critical section running on this CPU.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"df" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being in
+	dynticks-idle state.
+
+	This field is displayed only for CONFIG_NO_HZ kernels.
+
+o	"of" is the number of times that some other CPU has forced a
+	quiescent state on behalf of this CPU due to this CPU being
+	offline.  In a perfect world, this might never happen, but it
+	turns out that offlining and onlining a CPU can take several grace
+	periods, and so there is likely to be an extended period of time
+	when RCU believes that the CPU is online when it really is not.
+	Please note that erring in the other direction (RCU believing a
+	CPU is offline when it is really alive and kicking) is a fatal
+	error, so it makes sense to err conservatively.
+
+o	"ri" is the number of times that RCU has seen fit to send a
+	reschedule IPI to this CPU in order to get it to report a
+	quiescent state.
+
+o	"ql" is the number of RCU callbacks currently residing on
+	this CPU.  This is the total number of callbacks, regardless
+	of what state they are in (new, waiting for grace period to
+	start, waiting for grace period to end, ready to invoke).
+
+o	"b" is the batch limit for this CPU.  If more than this number
+	of RCU callbacks is ready to invoke, then the remainder will
+	be deferred.
+
+
+The output of "cat rcu/rcugp" looks as follows:
+
+rcu: completed=33062  gpnum=33063
+rcu_bh: completed=464  gpnum=464
+
+Again, this output is for both "rcu" and "rcu_bh".  The fields are
+taken from the rcu_state structure, and are as follows:
+
+o	"completed" is the number of grace periods that have completed.
+	It is comparable to the "c" field from rcu/rcudata in that a
+	CPU whose "c" field matches the value of "completed" is aware
+	that the corresponding RCU grace period has completed.
+
+o	"gpnum" is the number of grace periods that have started.  It is
+	comparable to the "g" field from rcu/rcudata in that a CPU
+	whose "g" field matches the value of "gpnum" is aware that the
+	corresponding RCU grace period has started.
+
+	If these two fields are equal (as they are for "rcu_bh" above),
+	then there is no grace period in progress, in other words, RCU
+	is idle.  On the other hand, if the two fields differ (as they
+	do for "rcu" above), then an RCU grace period is in progress.
+
+
+The output of "cat rcu/rcuhier" looks as follows, with very long lines:
+
+rcu:
+c=33184 g=33185 s=0 jfq=1 nfqs=61601/nfqsng=28011(33590)
+1/1 0:127 ^0    
+1/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+14/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+rcu_bh:
+c=470 g=470 s=0 jfq=2 nfqs=62302/nfqsng=62027(275)
+0/1 0:127 ^0    
+0/3 0:35 ^0    0/0 36:71 ^1    0/0 72:107 ^2    0/0 108:127 ^3    
+0/3f 0:5 ^0    0/3 6:11 ^1    0/0 12:17 ^2    0/0 18:23 ^3    0/0 24:29 ^4    0/0 30:35 ^5    0/0 36:41 ^0    0/0 42:47 ^1    0/0 48:53 ^2    0/0 54:59 ^3    0/0 60:65 ^4    0/0 66:71 ^5    0/0 72:77 ^0    0/0 78:83 ^1    0/0 84:89 ^2    0/0 90:95 ^3    0/0 96:101 ^4    0/0 102:107 ^5    0/0 108:113 ^0    0/0 114:119 ^1    0/0 120:125 ^2    0/0 126:127 ^3    
+
+This is once again split into "rcu" and "rcu_bh" portions.  The fields are
+as follows:
+
+o	"c" is exactly the same as "completed" under rcu/rcugp.
+
+o	"g" is exactly the same as "gpnum" under rcu/rcugp.
+
+o	"s" is the "signaled" state that drives force_quiescent_state()'s
+	state machine.
+
+o	"jfq" is the number of jiffies remaining for this grace period
+	before force_quiescent_state() is invoked to help push things
+	along.  Note that CPUs in dyntick-idle mode throughout the grace
+	period will not report on their own, but rather must be checked by
+	some other CPU via force_quiescent_state().
+
+o	"nfqs" is the number of calls to force_quiescent_state() since
+	boot.
+
+o	"nfqsng" is the number of useless calls to force_quiescent_state(),
+	where there wasn't actually a grace period active.  This can
+	happen due to races.  The number in parentheses is the difference
+	between "nfqs" and "nfqsng", or the number of times that
+	force_quiescent_state() actually did some real work.
+
+o	Each element of the form "1/1 0:127 ^0" represents one struct
+	rcu_node.  Each line represents one level of the hierarchy, from
+	root to leaves.  It is best to think of the rcu_data structures
+	as forming yet another level after the leaves.  Note that there
+	might be either one, two, or three levels of rcu_node structures,
+	depending on the relationship between CONFIG_RCU_FANOUT and
+	CONFIG_NR_CPUS.
+	
+	o	The numbers separated by the "/" are the qsmask followed
+		by the qsmaskinit.  The qsmask will have one bit
+		set for each entity in the next lower level that
+		has not yet checked in for the current grace period.
+		The qsmaskinit will have one bit for each entity that is
+		currently expected to check in during each grace period.
+		The value of qsmaskinit is assigned to that of qsmask
+		at the beginning of each grace period.
+
+		For example, for "rcu", the qsmask of the first entry
+		of the lowest level is 0x14, meaning that we are still
+		waiting for CPUs 2 and 4 to check in for the current
+		grace period.
+
+	o	The numbers separated by the ":" are the range of CPUs
+		served by this struct rcu_node.  This can be helpful
+		in working out how the hierarchy is wired together.
+
+		For example, the first entry at the lowest level shows
+		"0:5", indicating that it covers CPUs 0 through 5.
+
+	o	The number after the "^" indicates the bit in the
+		next higher level rcu_node structure that this
+		rcu_node structure corresponds to.
+
+		For example, the first entry at the lowest level shows
+		"^0", indicating that it corresponds to bit zero in
+		the first entry at the middle level.
diff --git a/arch/powerpc/platforms/pseries/rtasd.c b/arch/powerpc/platforms/pseries/rtasd.c
index c9ffd8c..d8e784a 100644
--- a/arch/powerpc/platforms/pseries/rtasd.c
+++ b/arch/powerpc/platforms/pseries/rtasd.c
@@ -208,6 +208,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -227,6 +228,7 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 	/* Check to see if we need to or have stopped logging */
 	if (fatal || !logging_enabled) {
 		logging_enabled = 0;
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
@@ -249,11 +251,13 @@ void pSeries_log_error(char *buf, unsigned int err_type, int fatal)
 		else
 			rtas_log_start += 1;
 
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		wake_up_interruptible(&rtas_log_wait);
 		break;
 	case ERR_TYPE_KERNEL_PANIC:
 	default:
+		WARN_ON_ONCE(!irqs_disabled()); /* @@@ DEBUG @@@ */
 		spin_unlock_irqrestore(&rtasd_log_lock, s);
 		return;
 	}
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 181006c..9b70b92 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk)
 }
 #endif
 
-#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ)
+#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU)
 extern void rcu_irq_enter(void);
 extern void rcu_irq_exit(void);
+extern void rcu_nmi_enter(void);
+extern void rcu_nmi_exit(void);
 #else
 # define rcu_irq_enter() do { } while (0)
 # define rcu_irq_exit() do { } while (0)
-#endif /* CONFIG_PREEMPT_RCU */
+# define rcu_nmi_enter() do { } while (0)
+# define rcu_nmi_exit() do { } while (0)
+#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */
 
 /*
  * It is safe to do non-atomic ops on ->hardirq_context,
@@ -134,7 +138,6 @@ extern void rcu_irq_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		rcu_irq_enter();			\
 		account_system_vtime(current);		\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
@@ -153,7 +156,6 @@ extern void irq_enter(void);
 		trace_hardirq_exit();			\
 		account_system_vtime(current);		\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
-		rcu_irq_exit();				\
 	} while (0)
 
 /*
@@ -161,7 +163,7 @@ extern void irq_enter(void);
  */
 extern void irq_exit(void);
 
-#define nmi_enter()		do { lockdep_off(); __irq_enter(); } while (0)
-#define nmi_exit()		do { __irq_exit(); lockdep_on(); } while (0)
+#define nmi_enter()		do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0)
+#define nmi_exit()		do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0)
 
 #endif /* LINUX_HARDIRQ_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index e8b4039..f8544ae 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,11 +52,15 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
-#ifdef CONFIG_CLASSIC_RCU
+#if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
-#else /* #ifdef CONFIG_CLASSIC_RCU */
+#elif defined(CONFIG_TREE_RCU)
+#include <linux/rcutree.h>
+#elif defined(CONFIG_PREEMPT_RCU)
 #include <linux/rcupreempt.h>
-#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+#else
+#error "Unknown RCU implementation specified to kernel configuration"
+#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */
 
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
new file mode 100644
index 0000000..00f8be2
--- /dev/null
+++ b/include/linux/rcutree.h
@@ -0,0 +1,325 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2008
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *	   Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical algorithm
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 	Documentation/RCU
+ */
+
+#ifndef __LINUX_RCUTREE_H
+#define __LINUX_RCUTREE_H
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT.
+ * In theory, it should be possible to add more levels straightforwardly.
+ * In practice, this has not been tested, so there is probably some
+ * bug somewhere.
+ */
+#define MAX_RCU_LVLS 3
+#define RCU_FANOUT	      (CONFIG_RCU_FANOUT)
+#define RCU_FANOUT_SQ	      (RCU_FANOUT * RCU_FANOUT)
+#define RCU_FANOUT_CUBE	      (RCU_FANOUT_SQ * RCU_FANOUT)
+
+#if (NR_CPUS) <= RCU_FANOUT
+#  define NUM_RCU_LVLS	      1
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (NR_CPUS)
+#  define NUM_RCU_LVL_2	      0
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_SQ
+#  define NUM_RCU_LVLS	      2
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_2	      (NR_CPUS)
+#  define NUM_RCU_LVL_3	      0
+#elif (NR_CPUS) <= RCU_FANOUT_CUBE
+#  define NUM_RCU_LVLS	      3
+#  define NUM_RCU_LVL_0	      1
+#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_3	      NR_CPUS
+#else
+# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
+#endif /* #if (NR_CPUS) <= RCU_FANOUT */
+
+#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
+#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
+
+/*
+ * Dynticks per-CPU state.
+ */
+struct rcu_dynticks {
+	int dynticks_nesting;	/* Track nesting level, sort of. */
+	int dynticks;		/* Even value for dynticks-idle, else odd. */
+	int dynticks_nmi;	/* Even value for either dynticks-idle or */
+				/*  not in nmi handler, else odd.  So this */
+				/*  remains even for nmi from irq handler. */
+};
+
+/*
+ * Definition for node within the RCU grace-period-detection hierarchy.
+ */
+struct rcu_node {
+	spinlock_t lock;
+	unsigned long qsmask;	/* CPUs or groups that need to switch in */
+				/*  order for current grace period to proceed.*/
+	unsigned long qsmaskinit;
+				/* Per-GP initialization for qsmask. */
+	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+	int	grplo;		/* lowest-numbered CPU or group here. */
+	int	grphi;		/* highest-numbered CPU or group here. */
+	u8	grpnum;		/* CPU/group number for next level up. */
+	u8	level;		/* root is at level 0. */
+	struct rcu_node *parent;
+} ____cacheline_internodealigned_in_smp;
+
+/* Index values for nxttail array in struct rcu_data. */
+#define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
+#define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
+#define RCU_NEXT_READY_TAIL	2	/* Also RCU_NEXT head. */
+#define RCU_NEXT_TAIL		3
+#define RCU_NEXT_SIZE		4
+
+/* Per-CPU data for read-copy update. */
+struct rcu_data {
+	/* 1) quiescent-state and grace-period handling : */
+	long		completed;	/* Track rsp->completed gp number */
+					/*  in order to detect GP end. */
+	long		gpnum;		/* Highest gp number that this CPU */
+					/*  is aware of having started. */
+	long		passed_quiesc_completed;
+					/* Value of completed at time of qs. */
+	bool		passed_quiesc;	/* User-mode/idle loop etc. */
+	bool		qs_pending;	/* Core waits for quiesc state. */
+	bool		beenonline;	/* CPU online at least once. */
+	struct rcu_node *mynode;	/* This CPU's leaf of hierarchy */
+	unsigned long grpmask;		/* Mask to apply to leaf qsmask. */
+
+	/* 2) batch handling */
+	/*
+	 * If nxtlist is not NULL, it is partitioned as follows.
+	 * Any of the partitions might be empty, in which case the
+	 * pointer to that partition will be equal to the pointer for
+	 * the following partition.  When the list is empty, all of
+	 * the nxttail elements point to nxtlist, which is NULL.
+	 *
+	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries whose batch # <= ->completed - 1: waiting for current GP
+	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
+	 *	Entries whose batch # <= ->completed
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail[RCU_NEXT_SIZE];
+	long		qlen; 	 	/* # of queued callbacks */
+	long		blimit;		/* Upper limit on a processed batch */
+
+	/* 3) rcu-barrier functions */
+	struct rcu_head barrier;
+
+#ifdef CONFIG_NO_HZ
+	/* 4) dynticks interface (see http://lwn.net/Articles/279077/) */
+	struct rcu_dynticks *dynticks;	/* Shared per-CPU dynticks state. */
+	int dynticks_snap;		/* Per-GP tracking for dynticks. */
+	int dynticks_nmi_snap;		/* Per-GP tracking for dynticks_nmi. */
+#endif /* #ifdef CONFIG_NO_HZ */
+
+	/* 5) reasons this CPU needed to be kicked by force_quiescent_state */
+#ifdef CONFIG_NO_HZ
+	unsigned long dynticks_fqs;	/* Kicked due to dynticks idle. */
+#endif /* #ifdef CONFIG_NO_HZ */
+	unsigned long offline_fqs;	/* Kicked due to being offline. */
+	unsigned long resched_ipi;	/* Sent a resched IPI. */
+
+	int cpu;
+};
+
+/* Values for signaled field in struct rcu_state. */
+#define RCU_SAVE_DYNTICK	0	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		1	/* Need to force quiescent state. */
+#ifdef CONFIG_NO_HZ
+#define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
+#else /* #ifdef CONFIG_NO_HZ */
+#define RCU_SIGNAL_INIT		RCU_FORCE_QS
+#endif /* #else #ifdef CONFIG_NO_HZ */
+
+#define RCU_JIFFIES_TILL_FORCE_QS	 3	/* for rsp->jiffies_force_qs */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK	(3 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ)  /* for rsp->jiffies_stall */
+#define RCU_STALL_RAT_DELAY		2	  /* Allow other CPUs time */
+						  /*  to take at least one */
+						  /*  scheduling clock irq */
+						  /*  before ratting on them. */
+
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+/*
+ * RCU global state, including node hierarchy.  This hierarchy is
+ * represented in "heap" form in a dense array.  The root (first level)
+ * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
+ * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]),
+ * and the third level in ->node[m+1] and following (->node[m+1] referenced
+ * by ->level[2]).  The number of levels is determined by the number of
+ * CPUs and by CONFIG_RCU_FANOUT.  Small systems will have a "hierarchy"
+ * consisting of a single rcu_node.
+ */
+struct rcu_state {
+	struct rcu_node node[NUM_RCU_NODES];	/* Hierarchy. */
+	struct rcu_node *level[NUM_RCU_LVLS];	/* Hierarchy levels. */
+	u32 levelcnt[MAX_RCU_LVLS + 1];		/* # nodes in each level. */
+	u8 levelspread[NUM_RCU_LVLS];		/* kids/node in each level. */
+	struct rcu_data *rda[NR_CPUS];		/* array of rdp pointers. */
+
+	/* The following fields are guarded by the root rcu_node's lock. */
+
+	u8	signaled ____cacheline_internodealigned_in_smp;
+						/* Force QS state. */
+	long	gpnum;				/* Current gp number. */
+	long	completed;			/* # of last completed gp. */
+	spinlock_t onofflock;			/* exclude on/offline and */
+						/*  starting new GP. */
+	spinlock_t fqslock;			/* Only one task forcing */
+						/*  quiescent states. */
+	unsigned long jiffies_force_qs;		/* Time at which to invoke */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs;		/* Number of calls to */
+						/*  force_quiescent_state(). */
+	unsigned long n_force_qs_ngp;		/* Number of calls leaving */
+						/*  due to no GP active. */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;			/* Time at which GP started, */
+						/*  but in jiffies. */
+	unsigned long jiffies_stall;		/* Time at which to check */
+						/*  for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#ifdef CONFIG_NO_HZ
+	long dynticks_completed;		/* Value of completed @ snap. */
+#endif /* #ifdef CONFIG_NO_HZ */
+};
+
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerate: we do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+static inline void __rcu_read_lock(void)
+{
+	preempt_disable();
+	__acquire(RCU);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock(void)
+{
+	rcu_read_release();
+	__release(RCU);
+	preempt_enable();
+}
+static inline void __rcu_read_lock_bh(void)
+{
+	local_bh_disable();
+	__acquire(RCU_BH);
+	rcu_read_acquire();
+}
+static inline void __rcu_read_unlock_bh(void)
+{
+	rcu_read_release();
+	__release(RCU_BH);
+	local_bh_enable();
+}
+
+#define __synchronize_sched() synchronize_rcu()
+
+#define call_rcu_sched(head, func) call_rcu(head, func)
+
+static inline void rcu_init_sched(void)
+{
+}
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batch