LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/3] x86: Move msr accesses out of line
@ 2015-02-21  1:38 Andi Kleen
  2015-02-21  1:38 ` [PATCH 2/3] x86: Add trace point for MSR accesses Andi Kleen
                   ` (2 more replies)
  0 siblings, 3 replies; 35+ messages in thread
From: Andi Kleen @ 2015-02-21  1:38 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel, peterz, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

To add trace points to msr accesses we need to include
linux/tracepoint.h. Unfortunately this causes hellish include loops
when combined with the msr inlines in asm/msr.h, which is included all over.

I tried to fix several of them, but eventually gave up.

This patch moves the MSR functions out of line. A MSR access is typically
40-100 cycles or even slower, a call is a few cycles at best, so the
additional function call is not really significant.

Kernel text size is neutral:
11852945	1671656	1822720	15347321	 ea2e79	vmlinux-no-msr
11852969	1671656	1822720	15347345	 ea2e91	vmlinux-msr

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/msr.h | 51 ++++----------------------------------------
 arch/x86/lib/msr.c         | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index de36f22..99d6864 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -57,53 +57,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
-static inline unsigned long long native_read_msr(unsigned int msr)
-{
-	DECLARE_ARGS(val, low, high);
-
-	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
-	return EAX_EDX_VAL(val, low, high);
-}
-
-static inline unsigned long long native_read_msr_safe(unsigned int msr,
-						      int *err)
-{
-	DECLARE_ARGS(val, low, high);
-
-	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
-		     : "c" (msr), [fault] "i" (-EIO));
-	return EAX_EDX_VAL(val, low, high);
-}
-
-static inline void native_write_msr(unsigned int msr,
-				    unsigned low, unsigned high)
-{
-	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
-}
-
-/* Can be uninlined because referenced by paravirt */
-notrace static inline int native_write_msr_safe(unsigned int msr,
-					unsigned low, unsigned high)
-{
-	int err;
-	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=a" (err)
-		     : "c" (msr), "0" (low), "d" (high),
-		       [fault] "i" (-EIO)
-		     : "memory");
-	return err;
-}
+extern unsigned long long native_read_msr(unsigned int msr);
+extern unsigned long long native_read_msr_safe(unsigned int msr, int *err);
+extern int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high);
+extern void native_write_msr(unsigned int msr, unsigned low, unsigned high);
 
 extern unsigned long long native_read_tsc(void);
 
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4362373..7eed044 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -108,3 +108,56 @@ int msr_clear_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, false);
 }
+
+inline unsigned long long native_read_msr(unsigned int msr)
+{
+	DECLARE_ARGS(val, low, high);
+
+	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+	return EAX_EDX_VAL(val, low, high);
+}
+EXPORT_SYMBOL(native_read_msr);
+
+inline unsigned long long native_read_msr_safe(unsigned int msr,
+						      int *err)
+{
+	DECLARE_ARGS(val, low, high);
+
+	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
+		     ".previous\n\t"
+		     _ASM_EXTABLE(2b, 3b)
+		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
+		     : "c" (msr), [fault] "i" (-EIO));
+	return EAX_EDX_VAL(val, low, high);
+}
+EXPORT_SYMBOL(native_read_msr_safe);
+
+inline void native_write_msr(unsigned int msr,
+				    unsigned low, unsigned high)
+{
+	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+EXPORT_SYMBOL(native_write_msr);
+
+/* Can be uninlined because referenced by paravirt */
+notrace inline int native_write_msr_safe(unsigned int msr,
+					unsigned low, unsigned high)
+{
+	int err;
+
+	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
+		     ".previous\n\t"
+		     _ASM_EXTABLE(2b, 3b)
+		     : [err] "=a" (err)
+		     : "c" (msr), "0" (low), "d" (high),
+		       [fault] "i" (-EIO)
+		     : "memory");
+	return err;
+}
+EXPORT_SYMBOL(native_write_msr_safe);
-- 
1.9.3


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 2/3] x86: Add trace point for MSR accesses
  2015-02-21  1:38 [PATCH 1/3] x86: Move msr accesses out of line Andi Kleen
@ 2015-02-21  1:38 ` Andi Kleen
  2015-02-21  1:38 ` [PATCH 3/3] perf, x86: Remove old MSR perf tracing code Andi Kleen
  2015-02-23 17:04 ` [PATCH 1/3] x86: Move msr accesses out of line Peter Zijlstra
  2 siblings, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2015-02-21  1:38 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel, peterz, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

For debugging low level code interacting with the CPU
it is often useful to trace the MSR read/writes. This gives
a concise summary of PMU and other operations.

perf has an ad-hoc way to do this using trace_printk,
but it's somewhat limited (and also now spews ugly
messages when enabled).

Instead define real trace points for all MSR accesses.

This adds two new trace points: read_msr and write_msr.
They also report if the access faulted (if *_safe is used).

This allows filtering and triggering on specific
MSR values, which allows various more advanced
debugging techniques.

All the values are well defined in the CPU documentation.

I only added it to native MSR accesses in C, not paravirtualized
or in entry*.S (which is not too interesting).

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/lib/msr.c         | 14 ++++++++++++--
 include/trace/events/msr.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 2 deletions(-)
 create mode 100644 include/trace/events/msr.h

diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 7eed044..29d1952 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,6 +1,8 @@
 #include <linux/module.h>
 #include <linux/preempt.h>
 #include <asm/msr.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/msr.h>
 
 struct msr *msrs_alloc(void)
 {
@@ -111,16 +113,20 @@ int msr_clear_bit(u32 msr, u8 bit)
 
 inline unsigned long long native_read_msr(unsigned int msr)
 {
+	unsigned long long lval;
 	DECLARE_ARGS(val, low, high);
 
 	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
-	return EAX_EDX_VAL(val, low, high);
+	lval = EAX_EDX_VAL(val, low, high);
+	trace_read_msr(msr, lval, 0);
+	return lval;
 }
 EXPORT_SYMBOL(native_read_msr);
 
 inline unsigned long long native_read_msr_safe(unsigned int msr,
 						      int *err)
 {
+	unsigned long long lval;
 	DECLARE_ARGS(val, low, high);
 
 	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
@@ -131,7 +137,9 @@ inline unsigned long long native_read_msr_safe(unsigned int msr,
 		     _ASM_EXTABLE(2b, 3b)
 		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
 		     : "c" (msr), [fault] "i" (-EIO));
-	return EAX_EDX_VAL(val, low, high);
+	lval = EAX_EDX_VAL(val, low, high);
+	trace_read_msr(msr, lval, *err);
+	return lval;
 }
 EXPORT_SYMBOL(native_read_msr_safe);
 
@@ -139,6 +147,7 @@ inline void native_write_msr(unsigned int msr,
 				    unsigned low, unsigned high)
 {
 	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+	trace_write_msr(msr, ((u64)high << 32 | low), 0);
 }
 EXPORT_SYMBOL(native_write_msr);
 
@@ -158,6 +167,7 @@ notrace inline int native_write_msr_safe(unsigned int msr,
 		     : "c" (msr), "0" (low), "d" (high),
 		       [fault] "i" (-EIO)
 		     : "memory");
+	trace_write_msr(msr, ((u64)high << 32 | low), err);
 	return err;
 }
 EXPORT_SYMBOL(native_write_msr_safe);
diff --git a/include/trace/events/msr.h b/include/trace/events/msr.h
new file mode 100644
index 0000000..e1677e8
--- /dev/null
+++ b/include/trace/events/msr.h
@@ -0,0 +1,46 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM msr
+
+#if !defined(_TRACE_MSR_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_MSR_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Tracing for x86 model specific registers. Directly maps to the
+ * RDMSR/WRMSR instructions.
+ */
+
+DECLARE_EVENT_CLASS(msr_trace_class,
+	    TP_PROTO(unsigned msr, u64 val, int failed),
+	    TP_ARGS(msr, val, failed),
+	    TP_STRUCT__entry(
+		    __field(	unsigned,	msr )
+		    __field(    u64,		val )
+		    __field(    int,		failed )
+	    ),
+	    TP_fast_assign(
+		    __entry->msr = msr;
+		    __entry->val = val;
+		    __entry->failed = failed;
+	    ),
+	    TP_printk("%x, value %llx%s",
+		      __entry->msr,
+		      __entry->val,
+		      __entry->failed ? " #GP" : "")
+);
+
+DEFINE_EVENT(msr_trace_class, read_msr,
+	     TP_PROTO(unsigned msr, u64 val, int failed),
+	     TP_ARGS(msr, val, failed)
+);
+
+DEFINE_EVENT(msr_trace_class, write_msr,
+	     TP_PROTO(unsigned msr, u64 val, int failed),
+	     TP_ARGS(msr, val, failed)
+);
+
+#endif /* _TRACE_MSR_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
-- 
1.9.3


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 3/3] perf, x86: Remove old MSR perf tracing code
  2015-02-21  1:38 [PATCH 1/3] x86: Move msr accesses out of line Andi Kleen
  2015-02-21  1:38 ` [PATCH 2/3] x86: Add trace point for MSR accesses Andi Kleen
@ 2015-02-21  1:38 ` Andi Kleen
  2015-02-23 17:04 ` [PATCH 1/3] x86: Move msr accesses out of line Peter Zijlstra
  2 siblings, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2015-02-21  1:38 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel, peterz, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

Now that we have generic MSR trace points we can remove the old
hackish perf MSR read tracing code.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/kernel/cpu/perf_event.h | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a371d27..aa316c0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -14,17 +14,7 @@
 
 #include <linux/perf_event.h>
 
-#if 0
-#undef wrmsrl
-#define wrmsrl(msr, val) 						\
-do {									\
-	unsigned int _msr = (msr);					\
-	u64 _val = (val);						\
-	trace_printk("wrmsrl(%x, %Lx)\n", (unsigned int)(_msr),		\
-			(unsigned long long)(_val));			\
-	native_write_msr((_msr), (u32)(_val), (u32)(_val >> 32));	\
-} while (0)
-#endif
+/* To enable MSR tracing please use the generic trace points. */
 
 /*
  *          |   NHM/WSM    |      SNB     |
-- 
1.9.3


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] x86: Move msr accesses out of line
  2015-02-21  1:38 [PATCH 1/3] x86: Move msr accesses out of line Andi Kleen
  2015-02-21  1:38 ` [PATCH 2/3] x86: Add trace point for MSR accesses Andi Kleen
  2015-02-21  1:38 ` [PATCH 3/3] perf, x86: Remove old MSR perf tracing code Andi Kleen
@ 2015-02-23 17:04 ` Peter Zijlstra
  2015-02-23 17:43   ` Andi Kleen
  2 siblings, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-23 17:04 UTC (permalink / raw)
  To: Andi Kleen; +Cc: x86, linux-kernel, Andi Kleen

On Fri, Feb 20, 2015 at 05:38:55PM -0800, Andi Kleen wrote:

> This patch moves the MSR functions out of line. A MSR access is typically
> 40-100 cycles or even slower, a call is a few cycles at best, so the
> additional function call is not really significant.

If I look at the below PDF a CALL+PUSH EBP+MOV RSP,RBP+ ... +POP+RET
ends up being 5+1.5+0.5+ .. + 1.5+8 = 16.5 + .. cycles.

~16 is fairly significant on 40. And I figure people are working hard to
make some MSR accesses cheaper, which means it'll be even worse in the
future.

Now I appreciate the intent for debuggability, but I don't think we can
do this unconditionally.


http://www.intel.com/content/www/us/en/architecture-and-technology/64-ia-32-architectures-optimization-manual.html

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] x86: Move msr accesses out of line
  2015-02-23 17:04 ` [PATCH 1/3] x86: Move msr accesses out of line Peter Zijlstra
@ 2015-02-23 17:43   ` Andi Kleen
  2015-02-25 12:27     ` Peter Zijlstra
  2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
  0 siblings, 2 replies; 35+ messages in thread
From: Andi Kleen @ 2015-02-23 17:43 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, x86, linux-kernel

On Mon, Feb 23, 2015 at 06:04:36PM +0100, Peter Zijlstra wrote:
> On Fri, Feb 20, 2015 at 05:38:55PM -0800, Andi Kleen wrote:
> 
> > This patch moves the MSR functions out of line. A MSR access is typically
> > 40-100 cycles or even slower, a call is a few cycles at best, so the
> > additional function call is not really significant.
> 
> If I look at the below PDF a CALL+PUSH EBP+MOV RSP,RBP+ ... +POP+RET
> ends up being 5+1.5+0.5+ .. + 1.5+8 = 16.5 + .. cycles.

You cannot just add up the latency cycles. The CPU runs all of this 
in parallel. 

Latency cycles would only be interesting if these instructions were
on the critical path for computing the result, which they are not. 

It should be a few cycles overhead.

BTW if you really worry about perf overhead you could 
gain far more (in some cases ms) by applying 
http://comments.gmane.org/gmane.linux.kernel/1805207

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] x86: Move msr accesses out of line
  2015-02-23 17:43   ` Andi Kleen
@ 2015-02-25 12:27     ` Peter Zijlstra
  2015-02-25 18:20       ` Andi Kleen
  2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
  1 sibling, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-25 12:27 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Andi Kleen, x86, linux-kernel

On Mon, Feb 23, 2015 at 09:43:40AM -0800, Andi Kleen wrote:
> On Mon, Feb 23, 2015 at 06:04:36PM +0100, Peter Zijlstra wrote:
> > On Fri, Feb 20, 2015 at 05:38:55PM -0800, Andi Kleen wrote:
> > 
> > > This patch moves the MSR functions out of line. A MSR access is typically
> > > 40-100 cycles or even slower, a call is a few cycles at best, so the
> > > additional function call is not really significant.
> > 
> > If I look at the below PDF a CALL+PUSH EBP+MOV RSP,RBP+ ... +POP+RET
> > ends up being 5+1.5+0.5+ .. + 1.5+8 = 16.5 + .. cycles.
> 
> You cannot just add up the latency cycles. The CPU runs all of this 
> in parallel. 
> 
> Latency cycles would only be interesting if these instructions were
> on the critical path for computing the result, which they are not. 
> 
> It should be a few cycles overhead.

I thought that since CALL touches RSP, PUSH touches RSP, MOV RSP,
(obviously) touches RSP, POP touches RSP and well, RET does too. There
were strong dependencies on the instructions and there would be little
room to parallelize things.

I'm glad you so patiently educated me on the wonders of modern
architectures and how it can indeed do all this in parallel.

Still, I wondered, so I ran me a little test. Note that I used a
serializing instruction (LOCK XCHG) because WRMSR is too.

I see a ~14 cycle difference between the inline and noinline version.

If I substitute the LOCK XCHG with XADD, I get to 1.5 cycles in
difference, so clearly there is some magic happening, but serializing
instructions wreck it.

Anybody can explain how such RSP deps get magiced away?

---

root@ivb-ep:~# cat call.c

#define __always_inline         inline __attribute__((always_inline))
#define  noinline                       __attribute__((noinline))

static int
#ifdef FOO
noinline
#else
__always_inline
#endif
xchg(int *ptr, int val)
{
        asm volatile ("LOCK xchgl %0, %1\n"
                        : "+r" (val), "+m" (*(ptr))
                        : : "memory", "cc");
        return val;
}

void main(void)
{
        int val = 0, old;

        for (int i = 0; i < 1000000000; i++)
                old = xchg(&val, i);
}

root@ivb-ep:~# gcc -std=gnu99 -O3 -fno-omit-frame-pointer -DFOO -o call call.c
root@ivb-ep:~# objdump -D call | awk '/<[^>]*>:/ {p=0} /<main>:/ {p=1} /<xchg>:/ {p=1} { if (p) print $0 }'
00000000004003e0 <main>:
  4003e0:       55                      push   %rbp
  4003e1:       48 89 e5                mov    %rsp,%rbp
  4003e4:       53                      push   %rbx
  4003e5:       31 db                   xor    %ebx,%ebx
  4003e7:       48 83 ec 18             sub    $0x18,%rsp
  4003eb:       c7 45 e0 00 00 00 00    movl   $0x0,-0x20(%rbp)
  4003f2:       66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)
  4003f8:       48 8d 7d e0             lea    -0x20(%rbp),%rdi
  4003fc:       89 de                   mov    %ebx,%esi
  4003fe:       83 c3 01                add    $0x1,%ebx
  400401:       e8 fa 00 00 00          callq  400500 <xchg>
  400406:       81 fb 00 ca 9a 3b       cmp    $0x3b9aca00,%ebx
  40040c:       75 ea                   jne    4003f8 <main+0x18>
  40040e:       48 83 c4 18             add    $0x18,%rsp
  400412:       5b                      pop    %rbx
  400413:       5d                      pop    %rbp
  400414:       c3                      retq   

0000000000400500 <xchg>:
  400500:       55                      push   %rbp
  400501:       89 f0                   mov    %esi,%eax
  400503:       48 89 e5                mov    %rsp,%rbp
  400506:       f0 87 07                lock xchg %eax,(%rdi)
  400509:       5d                      pop    %rbp
  40050a:       c3                      retq   
  40050b:       90                      nop
  40050c:       90                      nop
  40050d:       90                      nop
  40050e:       90                      nop
  40050f:       90                      nop

root@ivb-ep:~# gcc -std=gnu99 -O3 -fno-omit-frame-pointer -o call-inline call.c
root@ivb-ep:~# objdump -D call-inline | awk '/<[^>]*>:/ {p=0} /<main>:/ {p=1} /<xchg>:/ {p=1} { if (p) print $0 }'
00000000004003e0 <main>:
  4003e0:       55                      push   %rbp
  4003e1:       31 c0                   xor    %eax,%eax
  4003e3:       48 89 e5                mov    %rsp,%rbp
  4003e6:       c7 45 f0 00 00 00 00    movl   $0x0,-0x10(%rbp)
  4003ed:       0f 1f 00                nopl   (%rax)
  4003f0:       89 c2                   mov    %eax,%edx
  4003f2:       f0 87 55 f0             lock xchg %edx,-0x10(%rbp)
  4003f6:       83 c0 01                add    $0x1,%eax
  4003f9:       3d 00 ca 9a 3b          cmp    $0x3b9aca00,%eax
  4003fe:       75 f0                   jne    4003f0 <main+0x10>
  400400:       5d                      pop    %rbp
  400401:       c3                      retq   

root@ivb-ep:~# perf stat -e "cycles:u" ./call

 Performance counter stats for './call':

    36,309,274,162      cycles:u                 

      10.561819310 seconds time elapsed

root@ivb-ep:~# perf stat -e "cycles:u" ./call-inline 

 Performance counter stats for './call-inline':

    22,004,045,745      cycles:u                 

       6.498271508 seconds time elapsed




^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] x86: Move msr accesses out of line
  2015-02-25 12:27     ` Peter Zijlstra
@ 2015-02-25 18:20       ` Andi Kleen
  2015-02-25 18:34         ` Borislav Petkov
  0 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2015-02-25 18:20 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Andi Kleen, Andi Kleen, x86, linux-kernel

> Still, I wondered, so I ran me a little test. Note that I used a
> serializing instruction (LOCK XCHG) because WRMSR is too.

WRMSR has a lot of uops internally unlike LOCK XCHG, so I expect it
will mostly overlap with what it does. I'll run some benchmarks on
this today.

Also we do quite a few RDMSRs, which are not necessarily
serializing.

> I see a ~14 cycle difference between the inline and noinline version.
> 
> If I substitute the LOCK XCHG with XADD, I get to 1,5 cycles in
> difference, so clearly there is some magic happening, but serializing
> instructions wreck it.
> 
> Anybody can explain how such RSP deps get magiced away?

On Intel Core (since Yonah), the CPU frontend has a special
stack tracker that avoids these dependencies.

See 2.3.2.5 in the optimization manual

Also BTW just from tracing MSRs there is a lot of optimization
potential. Will send some patches later.

-Andi

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/3] x86: Move msr accesses out of line
  2015-02-25 18:20       ` Andi Kleen
@ 2015-02-25 18:34         ` Borislav Petkov
  0 siblings, 0 replies; 35+ messages in thread
From: Borislav Petkov @ 2015-02-25 18:34 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Peter Zijlstra, Andi Kleen, x86, linux-kernel

On Wed, Feb 25, 2015 at 07:20:32PM +0100, Andi Kleen wrote:
> Also we do quite a few RDMSRs, which are not necessarily
> serializing.

RDMSR is not serializing. Not even WRMSR when to certain ranges, see
section 8.3 in the SDM 3A.

-- 
Regards/Gruss,
    Boris.

ECO tip #101: Trim your mails when you reply.
--

^ permalink raw reply	[flat|nested] 35+ messages in thread

* [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-23 17:43   ` Andi Kleen
  2015-02-25 12:27     ` Peter Zijlstra
@ 2015-02-26 11:43     ` Peter Zijlstra
  2015-02-26 12:00       ` Ingo Molnar
                         ` (3 more replies)
  1 sibling, 4 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 11:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andi Kleen, x86, linux-kernel, mathieu.desnoyers, oleg, paulmck,
	rusty, mingo

On Mon, Feb 23, 2015 at 09:43:40AM -0800, Andi Kleen wrote:

> BTW if you really worry about perf overhead you could 
> gain far more (in some cases ms) by applying 
> http://comments.gmane.org/gmane.linux.kernel/1805207


Yeah, how about something like so instead; it would work for everybody.

It might maybe do with a few more comments.. also it appears to work,
but I'm barely awake so who knows.

---
Subject: module: Optimize __module_address() using a latched RB-tree

Optimize __module_address() using a latched RB-tree.

Currently __module_address() is using a linear search through all
modules in order to find the module corresponding to the provided
address. With a lot of modules this can take a lot of time.

One of the users of this is __kernel_text_address() which is employed in
many stack unwinders; which in turn are used by perf-callchain (possibly
from NMI context).

So by optimizing __module_address() we optimize many stack unwinders
which are used by both perf and tracing in performance sensitive code.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/module.h |    8 ++
 kernel/module.c        |  157 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 160 insertions(+), 5 deletions(-)

Index: linux-2.6/include/linux/module.h
===================================================================
--- linux-2.6.orig/include/linux/module.h	2015-02-26 10:51:31.750869653 +0100
+++ linux-2.6/include/linux/module.h	2015-02-26 10:52:15.895872761 +0100
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/rbtree.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -210,12 +211,19 @@
 	MODULE_STATE_UNFORMED,	/* Still setting it up. */
 };
 
+struct module_node {
+	struct module *mod;
+	struct rb_node node;
+};
+
 struct module {
 	enum module_state state;
 
 	/* Member of list of modules */
 	struct list_head list;
 
+	struct module_node tree_node[4];
+
 	/* Unique handle for this module */
 	char name[MODULE_NAME_LEN];
 
Index: linux-2.6/kernel/module.c
===================================================================
--- linux-2.6.orig/kernel/module.c	2015-02-26 10:51:31.750869653 +0100
+++ linux-2.6/kernel/module.c	2015-02-26 12:19:42.766743460 +0100
@@ -102,6 +102,180 @@
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
+
+
+/*
+ * Use a latched RB-tree for __module_address()
+ *
+ * The latch concept is a multi-value concurrency data structure which allows
+ * unserialized access and guarantees at least one version is stable.
+ *
+ * It is employed here to optimize __module_address(), which needs to find the
+ * module (if any) associated with an address. Such questions are best answered
+ * using a binary search tree.
+ *
+ * Since modules use non-overlapping memory ranges we can use a regular RB-tree
+ * (as opposed to the interval tree). However, BSTs cannot be iterated while
+ * modified.
+ *
+ * To overcome this we use the latched RB-tree, it basically consists of two
+ * RB-trees which are modified in order, ensuring one is always consistent.
+ *
+ * Things are somewhat more complicated because there are two ranges per
+ * module, but other than that it's pretty straightforward.
+ *
+ * idx
+ *  0	- latch 0, init
+ *  1	- latch 1, init
+ *  2	- latch 0, core
+ *  3	- latch 1, core
+ */
+
+struct latch_tree_root {
+	seqcount_t	seq;
+	struct rb_root	tree[2];
+};
+
+static unsigned long mod_value(struct module *mod, int idx)
+{
+	if (idx & 2) {		/* core */
+		return (unsigned long)mod->module_core;
+	} else {		/* init */
+		return (unsigned long)mod->module_init;
+	}
+}
+
+static struct module *mod_entry(struct rb_node *n)
+{
+	struct module_node *mn = container_of(n, struct module_node, node);
+	return mn->mod;
+}
+
+static int mod_node_idx(struct module *m, struct rb_node *n)
+{
+	struct module_node *mn = container_of(n, struct module_node, node);
+	int idx = mn - m->tree_node;
+
+	BUG_ON(mn->mod != m);
+	BUG_ON((unsigned)idx > 3);
+
+	return idx;
+}
+
+static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+	struct rb_root *root = &mod_tree->tree[idx & 1];
+	struct module_node *mn = &mod->tree_node[idx];
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	unsigned long mod_val, m_val;
+	struct module *m;
+	int i;
+
+	mn->mod = mod;
+	mod_val = mod_value(mod, idx);
+
+	while (*link) {
+		parent = *link;
+		m = mod_entry(parent);
+		i = mod_node_idx(m, parent);
+		m_val = mod_value(m, i);
+
+		if (mod_val < m_val)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&mn->node, parent, link);
+	rb_insert_color(&mn->node, root);
+}
+
+static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+	struct rb_root *root = &mod_tree->tree[idx & 1];
+	struct module_node *mn = &mod->tree_node[idx];
+
+	rb_erase(&mn->node, root);
+}
+
+static struct module *__tree_find(struct rb_root *r, unsigned long addr)
+{
+	struct rb_node *n = r->rb_node;
+
+	while (n) {
+		struct module *m = mod_entry(n);
+		int idx = mod_node_idx(m, n);
+
+		if (idx & 2) {				/* core */
+			if (addr < (unsigned long)m->module_core)
+				n = n->rb_left;
+			else if (addr >= (unsigned long)m->module_core + m->core_size)
+				n = n->rb_right;
+			else
+				return m;
+		} else {				/* init */
+			if (addr < (unsigned long)m->module_init)
+				n = n->rb_left;
+			else if (addr >= (unsigned long)m->module_init + m->init_size)
+				n = n->rb_right;
+			else
+				return m;
+		}
+	}
+
+	return NULL;
+}
+
+static struct latch_tree_root mod_tree;
+
+static void mod_tree_insert(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_insert(&mod_tree, mod, 0);	/* init */
+	__tree_insert(&mod_tree, mod, 2);		/* core */
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_insert(&mod_tree, mod, 1);	/* init */
+	__tree_insert(&mod_tree, mod, 3);		/* core */
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, 0);	/* init */
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, 1);	/* init */
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, 0);	/* init */
+	__tree_remove(&mod_tree, mod, 2);		/* core */
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, 1);	/* init */
+	__tree_remove(&mod_tree, mod, 3);		/* core */
+}
+
+static struct module *mod_tree_find(unsigned long addr)
+{
+	struct module *m;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount(&mod_tree.seq);
+		m = __tree_find(&mod_tree.tree[seq & 1], addr);
+	} while (read_seqcount_retry(&mod_tree.seq, seq));
+
+	return m;
+}
+
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
@@ -1854,6 +2028,7 @@
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
 	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
@@ -3098,6 +3273,7 @@
 	mod->symtab = mod->core_symtab;
 	mod->strtab = mod->core_strtab;
 #endif
+	mod_tree_remove_init(mod);
 	unset_module_init_ro_nx(mod);
 	module_arch_freeing_init(mod);
 	mod->module_init = NULL;
@@ -3168,6 +3344,7 @@
 		goto out;
 	}
 	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
 	err = 0;
 
 out:
@@ -3370,6 +3547,7 @@
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	wake_up_all(&module_wq);
 	/* Wait for RCU synchronizing before releasing mod->list. */
 	synchronize_rcu();
@@ -3810,13 +3988,13 @@
 	if (addr < module_addr_min || addr > module_addr_max)
 		return NULL;
 
-	list_for_each_entry_rcu(mod, &modules, list) {
+	mod = mod_tree_find(addr);
+	if (mod) {
+		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod))
-			return mod;
+			mod = NULL;
 	}
-	return NULL;
+	return mod;
 }
 EXPORT_SYMBOL_GPL(__module_address);
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
@ 2015-02-26 12:00       ` Ingo Molnar
  2015-02-26 14:12       ` Peter Zijlstra
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 35+ messages in thread
From: Ingo Molnar @ 2015-02-26 12:00 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, mathieu.desnoyers,
	oleg, paulmck, rusty


* Peter Zijlstra <peterz@infradead.org> wrote:

> +static struct module *mod_tree_find(unsigned long addr)
> +{
> +	struct module *m;
> +	unsigned int seq;
> +
> +	do {
> +		seq = raw_read_seqcount(&mod_tree.seq);
> +		m = __tree_find(&mod_tree.tree[seq & 1], addr);
> +	} while (read_seqcount_retry(&mod_tree.seq, seq));
> +
> +	return m;
> +}

Btw., if your approach works out fine I bet we could add 
one more optimization as well: a PER_CPU(last_module_found) 
front cache would help as well, as usually there's quite a 
bit of repetition between addresses being looked up and the 
resulting modules.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
  2015-02-26 12:00       ` Ingo Molnar
@ 2015-02-26 14:12       ` Peter Zijlstra
  2015-02-27 11:51         ` Rusty Russell
  2015-02-26 16:02       ` Mathieu Desnoyers
  2015-02-27 12:02       ` Rusty Russell
  3 siblings, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 14:12 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andi Kleen, x86, linux-kernel, mathieu.desnoyers, oleg, paulmck,
	rusty, mingo

On Thu, Feb 26, 2015 at 12:43:09PM +0100, Peter Zijlstra wrote:

> +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> +{
> +	struct rb_node *n = r->rb_node;
> +
> +	while (n) {
> +		struct module *m = mod_entry(n);
> +		int idx = mod_node_idx(m, n);
> +
> +		if (idx & 2) {				/* core */
> +			if (addr < (unsigned long)m->module_core)
> +				n = n->rb_left;
> +			else if (addr >= (unsigned long)m->module_core + m->core_size)
> +				n = n->rb_right;
> +			else
> +				return m;
> +		} else {				/* init */
> +			if (addr < (unsigned long)m->module_init)
> +				n = n->rb_left;
> +			else if (addr >= (unsigned long)m->module_init + m->init_size)
> +				n = n->rb_right;
> +			else
> +				return m;
> +		}
> +	}
> +
> +	return NULL;
> +}

Assuming struct module is cacheline aligned, Rusty? From what I can find
it's squirreled away in some data structure:

  mod = (void *)info->sechdrs[info->index.mod].sh_addr

And I can't seem to quickly find its alignment. If it's not cacheline
aligned, can we make it so?

The below patch shuffles things around a bit and relies on the above
assumption.

#pahole -EC module ivb-ep-build/kernel/module.o

struct module {
	...

        /* --- cacheline 6 boundary (384 bytes) --- */
        void *                     module_init;                                          /*   384     8 */
        void *                     module_core;                                          /*   392     8 */
        unsigned int               init_size;                                            /*   400     4 */
        unsigned int               core_size;                                            /*   404     4 */
        unsigned int               init_text_size;                                       /*   408     4 */
        unsigned int               core_text_size;                                       /*   412     4 */
        struct module_node {
                struct module *    mod;                                                  /*   416     8 */
                struct rb_node {
                        long unsigned int __rb_parent_color;                             /*   424     8 */
                        struct rb_node * rb_right;                                       /*   432     8 */
                        struct rb_node * rb_left;                                        /*   440     8 */
                } node; /*   424    24 */
        } tree_node[4]; /*   416   128 */

	...
};

Which gets module_{init,core} and {init,core}_size on the same cacheline
as tree_node[0].

And given that module loading/unloading is rare, tree modifications will
be rare, and we'll normally always iterate tree latch0. Furthermore,
execution of init code is brief and by switching core to elements 0,2
we've ensured we typically only hit tree_node[0].

Therefore the loads in __tree_find() will (typically) hit the same
cacheline and life is good.

---

Index: linux-2.6/include/linux/module.h
===================================================================
--- linux-2.6.orig/include/linux/module.h	2015-02-26 14:59:06.549177534 +0100
+++ linux-2.6/include/linux/module.h	2015-02-26 14:53:27.708162155 +0100
@@ -222,8 +222,6 @@
 	/* Member of list of modules */
 	struct list_head list;
 
-	struct module_node tree_node[4];
-
 	/* Unique handle for this module */
 	char name[MODULE_NAME_LEN];
 
@@ -278,7 +276,7 @@
 	int (*init)(void);
 
 	/* If this is non-NULL, vfree after init() returns */
-	void *module_init;
+	void *module_init	____cacheline_aligned;
 
 	/* Here is the actual code + data, vfree'd on unload. */
 	void *module_core;
@@ -289,6 +287,8 @@
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+	struct module_node tree_node[4];
+
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
 
Index: linux-2.6/kernel/module.c
===================================================================
--- linux-2.6.orig/kernel/module.c	2015-02-26 14:59:06.551177534 +0100
+++ linux-2.6/kernel/module.c	2015-02-26 14:52:04.094158360 +0100
@@ -125,10 +125,10 @@
  * module, but other than that its pretty straight forward.
  *
  * idx
- *  0	- latch 0, init
- *  1	- latch 1, init
- *  2	- latch 0, core
- *  3	- latch 1, core
+ *  0	- latch 0, core
+ *  1	- latch 1, core
+ *  2	- latch 0, init
+ *  3	- latch 1, init
  */
 
 struct latch_tree_root {
@@ -138,10 +138,10 @@
 
 static unsigned long mod_value(struct module *mod, int idx)
 {
-	if (idx & 2) {		/* core */
-		return (unsigned long)mod->module_core;
-	} else {		/* init */
+	if (idx & 2) {		/* init */
 		return (unsigned long)mod->module_init;
+	} else {		/* core */
+		return (unsigned long)mod->module_core;
 	}
 }
 
@@ -207,17 +207,17 @@
 		struct module *m = mod_entry(n);
 		int idx = mod_node_idx(m, n);
 
-		if (idx & 2) {				/* core */
-			if (addr < (unsigned long)m->module_core)
+		if (idx & 2) {				/* init */
+			if (addr < (unsigned long)m->module_init)
 				n = n->rb_left;
-			else if (addr >= (unsigned long)m->module_core + m->core_size)
+			else if (addr >= (unsigned long)m->module_init + m->init_size)
 				n = n->rb_right;
 			else
 				return m;
-		} else {				/* init */
-			if (addr < (unsigned long)m->module_init)
+		} else {				/* core */
+			if (addr < (unsigned long)m->module_core)
 				n = n->rb_left;
-			else if (addr >= (unsigned long)m->module_init + m->init_size)
+			else if (addr >= (unsigned long)m->module_core + m->core_size)
 				n = n->rb_right;
 			else
 				return m;
@@ -232,35 +232,35 @@
 static void mod_tree_insert(struct module *mod)
 {
 	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_insert(&mod_tree, mod, 0);		/* core */
 	if (mod->init_size)
-		__tree_insert(&mod_tree, mod, 0);	/* init */
-	__tree_insert(&mod_tree, mod, 2);		/* core */
+		__tree_insert(&mod_tree, mod, 2);	/* init */
 	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_insert(&mod_tree, mod, 1);		/* core */
 	if (mod->init_size)
-		__tree_insert(&mod_tree, mod, 1);	/* init */
-	__tree_insert(&mod_tree, mod, 3);		/* core */
+		__tree_insert(&mod_tree, mod, 3);	/* init */
 }
 
 static void mod_tree_remove_init(struct module *mod)
 {
 	raw_write_seqcount_latch(&mod_tree.seq);
 	if (mod->init_size)
-		__tree_remove(&mod_tree, mod, 0);	/* init */
+		__tree_remove(&mod_tree, mod, 2);	/* init */
 	raw_write_seqcount_latch(&mod_tree.seq);
 	if (mod->init_size)
-		__tree_remove(&mod_tree, mod, 1);	/* init */
+		__tree_remove(&mod_tree, mod, 3);	/* init */
 }
 
 static void mod_tree_remove(struct module *mod)
 {
 	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_remove(&mod_tree, mod, 0);		/* core */
 	if (mod->init_size)
-		__tree_remove(&mod_tree, mod, 0);	/* init */
-	__tree_remove(&mod_tree, mod, 2);		/* core */
+		__tree_remove(&mod_tree, mod, 2);	/* init */
 	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_remove(&mod_tree, mod, 1);		/* core */
 	if (mod->init_size)
-		__tree_remove(&mod_tree, mod, 1);	/* init */
-	__tree_remove(&mod_tree, mod, 3);		/* core */
+		__tree_remove(&mod_tree, mod, 3);	/* init */
 }
 
 static struct module *mod_tree_find(unsigned long addr)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
  2015-02-26 12:00       ` Ingo Molnar
  2015-02-26 14:12       ` Peter Zijlstra
@ 2015-02-26 16:02       ` Mathieu Desnoyers
  2015-02-26 16:43         ` Peter Zijlstra
  2015-02-27 12:02       ` Rusty Russell
  3 siblings, 1 reply; 35+ messages in thread
From: Mathieu Desnoyers @ 2015-02-26 16:02 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, oleg, paulmck, rusty, mingo

----- Original Message -----
> From: "Peter Zijlstra" <peterz@infradead.org>
> To: "Andi Kleen" <ak@linux.intel.com>
> Cc: "Andi Kleen" <andi@firstfloor.org>, x86@kernel.org, linux-kernel@vger.kernel.org, "mathieu desnoyers"
> <mathieu.desnoyers@efficios.com>, oleg@redhat.com, paulmck@linux.vnet.ibm.com, rusty@rustcorp.com.au,
> mingo@kernel.org
> Sent: Thursday, February 26, 2015 6:43:09 AM
> Subject: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
> 
> On Mon, Feb 23, 2015 at 09:43:40AM -0800, Andi Kleen wrote:
> 
> > BTW if you really worry about perf overhead you could
> > gain far more (in some cases ms) by applying
> > http://comments.gmane.org/gmane.linux.kernel/1805207
> 
> 
> Yeah, how about something like so instead; it would work for everybody.
> 
> It might maybe do with a few more comments.. also it appears to work,
> but I'm barely awake so who knows.
> 
> ---
> Subject: module: Optimize __module_address() using a latched RB-tree
> 
> Optimize __module_address() using a latched RB-tree.
> 
> Currently __module_address() is using a linear search through all
> modules in order to find the module corresponding to the provided
> address. With a lot of modules this can take a lot of time.
> 
> One of the users of this is __kernel_text_address() which is employed in
> many stack unwinders; which in turn are used by perf-callchain (possibly
> from NMI context).
> 
> So by optimizing __module_address() we optimize many stack unwinders
> which are used by both perf and tracing in performance sensitive code.

It looks like a good re-use of the raw_write_seqcount_latch
API we introduced for nmi-safe clock monotonic recently. This
latch can indeed be applied to arbitrary data structures, as long
as doubling the memory usage is not an issue.

Comment below,

> 
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/module.h |    8 ++
>  kernel/module.c        |  157
>  +++++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 160 insertions(+), 5 deletions(-)
> 
> Index: linux-2.6/include/linux/module.h
> ===================================================================
> --- linux-2.6.orig/include/linux/module.h	2015-02-26 10:51:31.750869653 +0100
> +++ linux-2.6/include/linux/module.h	2015-02-26 10:52:15.895872761 +0100
> @@ -17,6 +17,7 @@
>  #include <linux/moduleparam.h>
>  #include <linux/jump_label.h>
>  #include <linux/export.h>
> +#include <linux/rbtree.h>
>  
>  #include <linux/percpu.h>
>  #include <asm/module.h>
> @@ -210,12 +211,19 @@
>  	MODULE_STATE_UNFORMED,	/* Still setting it up. */
>  };
>  
> +struct module_node {
> +	struct module *mod;
> +	struct rb_node node;
> +};
> +
>  struct module {
>  	enum module_state state;
>  
>  	/* Member of list of modules */
>  	struct list_head list;
>  
> +	struct module_node tree_node[4];
> +
>  	/* Unique handle for this module */
>  	char name[MODULE_NAME_LEN];
>  
> Index: linux-2.6/kernel/module.c
> ===================================================================
> --- linux-2.6.orig/kernel/module.c	2015-02-26 10:51:31.750869653 +0100
> +++ linux-2.6/kernel/module.c	2015-02-26 12:19:42.766743460 +0100
> @@ -102,6 +102,180 @@
>  DEFINE_MUTEX(module_mutex);
>  EXPORT_SYMBOL_GPL(module_mutex);
>  static LIST_HEAD(modules);
> +
> +
> +/*
> + * Use a latched RB-tree for __module_address()
> + *
> + * The latch concept is a multi-value concurrency data structure which
> allows
> + * unserialized access and guarantees at least one version is stable.
> + *
> + * It is employed here to optimize __module_address(), which needs to find
> the
> + * module (if any) associated with an address. Such questions are best
> answered
> + * using a binary search tree.
> + *
> + * Since modules use non-overlapping memory ranges we can use a regular
> RB-tree
> + * (as opposed to the interval tree). However, BSTs cannot be iterated while
> + * modified.
> + *
> + * To overcome this we use the latched RB-tree, it basically consists of two
> + * RB-trees which are modified in order, ensuring one is always consistent.
> + *
> + * Things are somewhat more complicated because there are two ranges per
> + * module, but other than that its pretty straight forward.
> + *
> + * idx
> + *  0	- latch 0, init
> + *  1	- latch 1, init
> + *  2	- latch 0, core
> + *  3	- latch 1, core
> + */
> +
> +struct latch_tree_root {
> +	seqcount_t	seq;
> +	struct rb_root	tree[2];
> +};
> +
> +static unsigned long mod_value(struct module *mod, int idx)
> +{
> +	if (idx & 2) {		/* core */
> +		return (unsigned long)mod->module_core;
> +	} else {		/* init */
> +		return (unsigned long)mod->module_init;
> +	}
> +}
> +
> +static struct module *mod_entry(struct rb_node *n)
> +{
> +	struct module_node *mn = container_of(n, struct module_node, node);
> +	return mn->mod;
> +}
> +
> +static int mod_node_idx(struct module *m, struct rb_node *n)
> +{
> +	struct module_node *mn = container_of(n, struct module_node, node);
> +	int idx = mn - m->tree_node;
> +
> +	BUG_ON(mn->mod != m);
> +	BUG_ON((unsigned)idx > 3);
> +
> +	return idx;
> +}
> +
> +static void __tree_insert(struct latch_tree_root *mod_tree, struct module
> *mod, int idx)
> +{
> +	struct rb_root *root = &mod_tree->tree[idx & 1];
> +	struct module_node *mn = &mod->tree_node[idx];
> +	struct rb_node **link = &root->rb_node;
> +	struct rb_node *parent = NULL;
> +	unsigned long mod_val, m_val;
> +	struct module *m;
> +	int i;
> +
> +	mn->mod = mod;
> +	mod_val = mod_value(mod, idx);
> +
> +	while (*link) {
> +		parent = *link;
> +		m = mod_entry(parent);
> +		i = mod_node_idx(m, parent);
> +		m_val = mod_value(m, i);
> +
> +		if (mod_val < m_val)
> +			link = &parent->rb_left;
> +		else
> +			link = &parent->rb_right;
> +	}
> +
> +	rb_link_node(&mn->node, parent, link);
> +	rb_insert_color(&mn->node, root);
> +}
> +
> +static void __tree_remove(struct latch_tree_root *mod_tree, struct module
> *mod, int idx)
> +{
> +	struct rb_root *root = &mod_tree->tree[idx & 1];
> +	struct module_node *mn = &mod->tree_node[idx];
> +
> +	rb_erase(&mn->node, root);
> +}
> +

Perhaps you could use mod_value() below, and introduce a
"mod_size()" too. This would keep the init vs core selection
out of the traversal code.

> +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> +{
> +	struct rb_node *n = r->rb_node;
> +
> +	while (n) {
> +		struct module *m = mod_entry(n);
> +		int idx = mod_node_idx(m, n);

And change for this ?

		unsigned long m_addr = mod_value(m, n);
		unsigned long m_size = mod_size(m, n);

		if (addr < m_addr) {
			n = n->rb_left;
		else if (addr >= m_addr + m_size)
			n = n->rb_right;
		else
			return m;

> +		if (idx & 2) {				/* core */
> +			if (addr < (unsigned long)m->module_core)
> +				n = n->rb_left;
> +			else if (addr >= (unsigned long)m->module_core + m->core_size)
> +				n = n->rb_right;
> +			else
> +				return m;
> +		} else {				/* init */
> +			if (addr < (unsigned long)m->module_init)
> +				n = n->rb_left;
> +			else if (addr >= (unsigned long)m->module_init + m->init_size)
> +				n = n->rb_right;
> +			else
> +				return m;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct latch_tree_root mod_tree;

Is it customary to define static variables in the
middle of a C file ?

The rest looks good, especially for use of the latch.
I'd be tempted to turn "0, 1, 2, 3" into an enum though,
so we can follow in the code what each of those array
entry really means.

Thanks!

Mathieu

> +
> +static void mod_tree_insert(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_insert(&mod_tree, mod, 0);	/* init */
> +	__tree_insert(&mod_tree, mod, 2);		/* core */
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_insert(&mod_tree, mod, 1);	/* init */
> +	__tree_insert(&mod_tree, mod, 3);		/* core */
> +}
> +
> +static void mod_tree_remove_init(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, 0);	/* init */
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, 1);	/* init */
> +}
> +
> +static void mod_tree_remove(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, 0);	/* init */
> +	__tree_remove(&mod_tree, mod, 2);		/* core */
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, 1);	/* init */
> +	__tree_remove(&mod_tree, mod, 3);		/* core */
> +}
> +
> +static struct module *mod_tree_find(unsigned long addr)
> +{
> +	struct module *m;
> +	unsigned int seq;
> +
> +	do {
> +		seq = raw_read_seqcount(&mod_tree.seq);
> +		m = __tree_find(&mod_tree.tree[seq & 1], addr);
> +	} while (read_seqcount_retry(&mod_tree.seq, seq));
> +
> +	return m;
> +}
> +
>  #ifdef CONFIG_KGDB_KDB
>  struct list_head *kdb_modules = &modules; /* kdb needs the list of modules
>  */
>  #endif /* CONFIG_KGDB_KDB */
> @@ -1854,6 +2028,7 @@
>  	mutex_lock(&module_mutex);
>  	/* Unlink carefully: kallsyms could be walking list. */
>  	list_del_rcu(&mod->list);
> +	mod_tree_remove(mod);
>  	/* Remove this module from bug list, this uses list_del_rcu */
>  	module_bug_cleanup(mod);
>  	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
> @@ -3098,6 +3273,7 @@
>  	mod->symtab = mod->core_symtab;
>  	mod->strtab = mod->core_strtab;
>  #endif
> +	mod_tree_remove_init(mod);
>  	unset_module_init_ro_nx(mod);
>  	module_arch_freeing_init(mod);
>  	mod->module_init = NULL;
> @@ -3168,6 +3344,7 @@
>  		goto out;
>  	}
>  	list_add_rcu(&mod->list, &modules);
> +	mod_tree_insert(mod);
>  	err = 0;
>  
>  out:
> @@ -3370,6 +3547,7 @@
>  	mutex_lock(&module_mutex);
>  	/* Unlink carefully: kallsyms could be walking list. */
>  	list_del_rcu(&mod->list);
> +	mod_tree_remove(mod);
>  	wake_up_all(&module_wq);
>  	/* Wait for RCU synchronizing before releasing mod->list. */
>  	synchronize_rcu();
> @@ -3810,13 +3988,13 @@
>  	if (addr < module_addr_min || addr > module_addr_max)
>  		return NULL;
>  
> -	list_for_each_entry_rcu(mod, &modules, list) {
> +	mod = mod_tree_find(addr);
> +	if (mod) {
> +		BUG_ON(!within_module(addr, mod));
>  		if (mod->state == MODULE_STATE_UNFORMED)
> -			continue;
> -		if (within_module(addr, mod))
> -			return mod;
> +			mod = NULL;
>  	}
> -	return NULL;
> +	return mod;
>  }
>  EXPORT_SYMBOL_GPL(__module_address);
>  
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 16:02       ` Mathieu Desnoyers
@ 2015-02-26 16:43         ` Peter Zijlstra
  2015-02-26 16:55           ` Mathieu Desnoyers
  2015-02-26 18:28           ` Paul E. McKenney
  0 siblings, 2 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 16:43 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, oleg, paulmck, rusty, mingo

On Thu, Feb 26, 2015 at 04:02:43PM +0000, Mathieu Desnoyers wrote:
> Perhaps you could use mod_value() below, and introduce a
> "mod_size()" too. This would keep the init vs core selection
> out of the traversal code.

Indeed!

> Is it customary to define static variables in the
> middle of a C file ?

Dunno, it seemed like a good a place as any.

> The rest looks good, especially for use of the latch.
> I'd be tempted to turn "0, 1, 2, 3" into an enum though,
> so we can follow in the code what each of those array
> entry really means.

Agreed.

---
Subject: module: Optimize __module_address() using a latched RB-tree
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu Feb 26 10:57:34 CET 2015

Currently __module_address() is using a linear search through all
modules in order to find the module corresponding to the provided
address. With a lot of modules this can take a lot of time.

One of the users of this is __kernel_text_address() which is employed
in many stack unwinders; which in turn are used by perf-callchain and
ftrace (possibly from NMI context).

So by optimizing __module_address() we optimize many stack unwinders
which are used by both perf and tracing in performance sensitive code.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/module.h |   19 +++-
 kernel/module.c        |  212 +++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 224 insertions(+), 7 deletions(-)

--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -17,6 +17,7 @@
 #include <linux/moduleparam.h>
 #include <linux/jump_label.h>
 #include <linux/export.h>
+#include <linux/rbtree.h>
 
 #include <linux/percpu.h>
 #include <asm/module.h>
@@ -210,6 +211,11 @@ enum module_state {
 	MODULE_STATE_UNFORMED,	/* Still setting it up. */
 };
 
+struct module_node {
+	struct module *mod;
+	struct rb_node node;
+};
+
 struct module {
 	enum module_state state;
 
@@ -269,8 +275,15 @@ struct module {
 	/* Startup function. */
 	int (*init)(void);
 
-	/* If this is non-NULL, vfree after init() returns */
-	void *module_init;
+	/*
+	 * If this is non-NULL, vfree after init() returns
+	 *
+	 * cacheline align here, such that:
+	 *   module_init, module_core, init_size, core_size and
+	 *   tree_node[0]
+	 * are on the same cacheline.
+	 */
+	void *module_init	____cacheline_aligned;
 
 	/* Here is the actual code + data, vfree'd on unload. */
 	void *module_core;
@@ -281,6 +294,8 @@ struct module {
 	/* The size of the executable code in each section.  */
 	unsigned int init_text_size, core_text_size;
 
+	struct module_node tree_node[4];
+
 	/* Size of RO sections of the module (text+rodata) */
 	unsigned int init_ro_size, core_ro_size;
 
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -102,6 +102,204 @@
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
+
+
+/*
+ * Use a latched RB-tree for __module_address()
+ *
+ * The latch concept is a multi-value concurrency data structure which allows
+ * unserialized access and guarantees at least one version is stable.
+ *
+ * It is employed here to optimize __module_address(), which needs to find the
+ * module (if any) associated with an address. Such questions are best answered
+ * using a binary search tree.
+ *
+ * Since modules use non-overlapping memory ranges we can use a regular RB-tree
+ * (as opposed to the interval tree). However, BSTs cannot be iterated while
+ * modified.
+ *
+ * To overcome this we use the latched RB-tree, it basically consists of two
+ * RB-trees which are modified in order, ensuring one is always consistent.
+ *
+ * Things are somewhat more complicated because there are two ranges per
+ * module, but other than that its pretty straight forward.
+ *
+ */
+
+enum {
+	latch0_core = 0,
+	latch1_core = 1,
+	latch0_init = 2,
+	latch1_init = 3,
+};
+
+enum {
+	latch_bit = 0x01,
+	init_bit  = 0x02,
+};
+
+struct latch_tree_root {
+	seqcount_t	seq;
+	struct rb_root	tree[2];
+};
+
+static unsigned long mod_value(struct module *mod, int idx)
+{
+	if (idx & init_bit)
+		return (unsigned long)mod->module_init;
+	else
+		return (unsigned long)mod->module_core;
+}
+
+static unsigned long mod_size(struct module *mod, int idx)
+{
+	if (idx & init_bit)
+		return mod->init_size;
+	else
+		return mod->core_size;
+}
+
+static struct module *mod_entry(struct rb_node *n)
+{
+	struct module_node *mn = container_of(n, struct module_node, node);
+	return mn->mod;
+}
+
+static int mod_node_idx(struct module *m, struct rb_node *n)
+{
+	struct module_node *mn = container_of(n, struct module_node, node);
+	int idx = mn - m->tree_node;
+
+	BUG_ON(mn->mod != m);
+	BUG_ON((unsigned)idx > 3);
+
+	return idx;
+}
+
+static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
+	struct module_node *mn = &mod->tree_node[idx];
+	struct rb_node **link = &root->rb_node;
+	struct rb_node *parent = NULL;
+	unsigned long mod_val, m_val;
+	struct module *m;
+	int i;
+
+	mn->mod = mod;
+	mod_val = mod_value(mod, idx);
+
+	while (*link) {
+		parent = *link;
+		m = mod_entry(parent);
+		i = mod_node_idx(m, parent);
+		m_val = mod_value(m, i);
+
+		if (mod_val < m_val)
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&mn->node, parent, link);
+	rb_insert_color(&mn->node, root);
+}
+
+static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
+{
+	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
+	struct module_node *mn = &mod->tree_node[idx];
+
+	rb_erase(&mn->node, root);
+}
+
+/*
+ * struct module is arranged such that:
+ *
+ *   module_init, module_core, init_size, core_size,
+ *   init_text_size, core_text_size and tree_node[0]
+ *
+ * are on the same cacheline, therefore if the below iteration is
+ * on latch0 and all module init has completed, we'll only hit
+ * tree_node[0] and every intermediate level will hit only a single
+ * cacheline.
+ *
+ * Furthermore, by ensuring init_text_size and core_text_size are
+ * also in this same cacheline we've made sure is_module_text_address()
+ * will also not require additional lines.
+ */
+static struct module *__tree_find(struct rb_root *r, unsigned long addr)
+{
+	struct rb_node *n = r->rb_node;
+	unsigned long m_val, m_size;
+
+	while (n) {
+		struct module *m = mod_entry(n);
+		int idx = mod_node_idx(m, n);
+
+		m_val = mod_value(m, idx);
+		m_size = mod_size(m, idx);
+
+		if (addr < m_val)
+			n = n->rb_left;
+		else if (addr >= m_val + m_size)
+			n = n->rb_right;
+		else
+			return m;
+	}
+
+	return NULL;
+}
+
+static struct latch_tree_root mod_tree;
+
+static void mod_tree_insert(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_insert(&mod_tree, mod, latch0_core);
+	if (mod->init_size)
+		__tree_insert(&mod_tree, mod, latch0_init);
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_insert(&mod_tree, mod, latch1_core);
+	if (mod->init_size)
+		__tree_insert(&mod_tree, mod, latch1_init);
+}
+
+static void mod_tree_remove_init(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch0_init);
+	raw_write_seqcount_latch(&mod_tree.seq);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch1_init);
+}
+
+static void mod_tree_remove(struct module *mod)
+{
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_remove(&mod_tree, mod, latch0_core);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch0_init);
+	raw_write_seqcount_latch(&mod_tree.seq);
+	__tree_remove(&mod_tree, mod, latch1_core);
+	if (mod->init_size)
+		__tree_remove(&mod_tree, mod, latch1_init);
+}
+
+static struct module *mod_tree_find(unsigned long addr)
+{
+	struct module *m;
+	unsigned int seq;
+
+	do {
+		seq = raw_read_seqcount(&mod_tree.seq);
+		m = __tree_find(&mod_tree.tree[seq & latch_bit], addr);
+	} while (read_seqcount_retry(&mod_tree.seq, seq));
+
+	return m;
+}
+
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
@@ -1854,6 +2052,7 @@ static void free_module(struct module *m
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	/* Remove this module from bug list, this uses list_del_rcu */
 	module_bug_cleanup(mod);
 	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
@@ -3098,6 +3297,7 @@ static noinline int do_init_module(struc
 	mod->symtab = mod->core_symtab;
 	mod->strtab = mod->core_strtab;
 #endif
+	mod_tree_remove_init(mod);
 	unset_module_init_ro_nx(mod);
 	module_arch_freeing_init(mod);
 	mod->module_init = NULL;
@@ -3168,6 +3368,7 @@ static int add_unformed_module(struct mo
 		goto out;
 	}
 	list_add_rcu(&mod->list, &modules);
+	mod_tree_insert(mod);
 	err = 0;
 
 out:
@@ -3367,6 +3568,7 @@ static int load_module(struct load_info
 	mutex_lock(&module_mutex);
 	/* Unlink carefully: kallsyms could be walking list. */
 	list_del_rcu(&mod->list);
+	mod_tree_remove(mod);
 	wake_up_all(&module_wq);
 	/* Wait for RCU synchronizing before releasing mod->list. */
 	synchronize_rcu();
@@ -3810,13 +4012,13 @@ struct module *__module_address(unsigned
 	if (addr < module_addr_min || addr > module_addr_max)
 		return NULL;
 
-	list_for_each_entry_rcu(mod, &modules, list) {
+	mod = mod_tree_find(addr);
+	if (mod) {
+		BUG_ON(!within_module(addr, mod));
 		if (mod->state == MODULE_STATE_UNFORMED)
-			continue;
-		if (within_module(addr, mod))
-			return mod;
+			mod = NULL;
 	}
-	return NULL;
+	return mod;
 }
 EXPORT_SYMBOL_GPL(__module_address);
 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 16:43         ` Peter Zijlstra
@ 2015-02-26 16:55           ` Mathieu Desnoyers
  2015-02-26 17:16             ` Peter Zijlstra
  2015-02-26 17:22             ` Peter Zijlstra
  2015-02-26 18:28           ` Paul E. McKenney
  1 sibling, 2 replies; 35+ messages in thread
From: Mathieu Desnoyers @ 2015-02-26 16:55 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, oleg, paulmck, rusty, mingo

----- Original Message -----
> From: "Peter Zijlstra" <peterz@infradead.org>
> To: "Mathieu Desnoyers" <mathieu.desnoyers@efficios.com>
> Cc: "Andi Kleen" <ak@linux.intel.com>, "Andi Kleen" <andi@firstfloor.org>, x86@kernel.org,
> linux-kernel@vger.kernel.org, oleg@redhat.com, paulmck@linux.vnet.ibm.com, rusty@rustcorp.com.au, mingo@kernel.org
> Sent: Thursday, February 26, 2015 11:43:56 AM
> Subject: Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
> 
> On Thu, Feb 26, 2015 at 04:02:43PM +0000, Mathieu Desnoyers wrote:
> > Perhaps you could use mod_value() below, and introduce a
> > "mod_size()" too. This would keep the init vs core selection
> > out of the traversal code.
> 
> Indeed!
> 
> > Is it customary to define static variables in the
> > middle of a C file ?
> 
> Dunno, it seemed like a good a place as any.

My personal coding-style is to put all definitions
at the top of C files, but I don't know if it's within
the kernel coding style guide lines or just something
I'm personally used to. I have no strong opinion here.

More nits inline below,

> 
> > The rest looks good, especially for use of the latch.
> > I'd be tempted to turn "0, 1, 2, 3" into an enum though,
> > so we can follow in the code what each of those array
> > entry really means.
> 
> Agreed.
> 
> ---
> Subject: module: Optimize __module_address() using a latched RB-tree
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Feb 26 10:57:34 CET 2015
> 
> Currently __module_address() is using a linear search through all
> modules in order to find the module corresponding to the provided
> address. With a lot of modules this can take a lot of time.
> 
> One of the users of this is __kernel_text_address() which is employed
> in many stack unwinders; which in turn are used by perf-callchain and
> ftrace (possibly from NMI context).
> 
> So by optimizing __module_address() we optimize many stack unwinders
> which are used by both perf and tracing in performance sensitive code.
> 
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  include/linux/module.h |   19 +++-
>  kernel/module.c        |  212 +++++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 224 insertions(+), 7 deletions(-)
> 
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -17,6 +17,7 @@
>  #include <linux/moduleparam.h>
>  #include <linux/jump_label.h>
>  #include <linux/export.h>
> +#include <linux/rbtree.h>
>  
>  #include <linux/percpu.h>
>  #include <asm/module.h>
> @@ -210,6 +211,11 @@ enum module_state {
>  	MODULE_STATE_UNFORMED,	/* Still setting it up. */
>  };
>  
> +struct module_node {
> +	struct module *mod;
> +	struct rb_node node;
> +};
> +
>  struct module {
>  	enum module_state state;
>  
> @@ -269,8 +275,15 @@ struct module {
>  	/* Startup function. */
>  	int (*init)(void);
>  
> -	/* If this is non-NULL, vfree after init() returns */
> -	void *module_init;
> +	/*
> +	 * If this is non-NULL, vfree after init() returns
> +	 *
> +	 * cacheline align here, such that:
> +	 *   module_init, module_core, init_size, core_size and
> +	 *   tree_node[0]
> +	 * are on the same cacheline.

Fat-fingered newline ? ;)

> +	 */
> +	void *module_init	____cacheline_aligned;
>  
>  	/* Here is the actual code + data, vfree'd on unload. */
>  	void *module_core;
> @@ -281,6 +294,8 @@ struct module {
>  	/* The size of the executable code in each section.  */
>  	unsigned int init_text_size, core_text_size;
>  
> +	struct module_node tree_node[4];

4 -> nr_module_addr_latch

> +
>  	/* Size of RO sections of the module (text+rodata) */
>  	unsigned int init_ro_size, core_ro_size;
>  
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -102,6 +102,204 @@
>  DEFINE_MUTEX(module_mutex);
>  EXPORT_SYMBOL_GPL(module_mutex);
>  static LIST_HEAD(modules);
> +
> +
> +/*
> + * Use a latched RB-tree for __module_address()
> + *
> + * The latch concept is a multi-value concurrency data structure which allows
> + * unserialized access and guarantees at least one version is stable.
> + *
> + * It is employed here to optimize __module_address(), which needs to find the
> + * module (if any) associated with an address. Such questions are best answered
> + * using a binary search tree.
> + *
> + * Since modules use non-overlapping memory ranges we can use a regular RB-tree
> + * (as opposed to the interval tree). However, BSTs cannot be iterated while
> + * modified.
> + *
> + * To overcome this we use the latched RB-tree, it basically consists of two
> + * RB-trees which are modified in order, ensuring one is always consistent.
> + *
> + * Things are somewhat more complicated because there are two ranges per
> + * module, but other than that its pretty straight forward.
> + *
> + */
> +
> +enum {
> +	latch0_core = 0,
> +	latch1_core = 1,
> +	latch0_init = 2,
> +	latch1_init = 3,

     nr_module_addr_latch,

Perhaps namespace this enum and move it to module.h so we
can expose nr_module_addr_latch ?

> +};
> +
> +enum {
> +	latch_bit = 0x01,
> +	init_bit  = 0x02,
> +};
> +
> +struct latch_tree_root {
> +	seqcount_t	seq;
> +	struct rb_root	tree[2];
> +};
> +
> +static unsigned long mod_value(struct module *mod, int idx)
> +{
> +	if (idx & init_bit)

Hrm, my first reflex is to look for a "init_bit" variable here.
Should we use caps for enum entries instead ? e.g. INIT_BIT ?

Thanks,

Mathieu

> +		return (unsigned long)mod->module_init;
> +	else
> +		return (unsigned long)mod->module_core;
> +}
> +
> +static unsigned long mod_size(struct module *mod, int idx)
> +{
> +	if (idx & init_bit)
> +		return mod->init_size;
> +	else
> +		return mod->core_size;
> +}
> +
> +static struct module *mod_entry(struct rb_node *n)
> +{
> +	struct module_node *mn = container_of(n, struct module_node, node);
> +	return mn->mod;
> +}
> +
> +static int mod_node_idx(struct module *m, struct rb_node *n)
> +{
> +	struct module_node *mn = container_of(n, struct module_node, node);
> +	int idx = mn - m->tree_node;
> +
> +	BUG_ON(mn->mod != m);
> +	BUG_ON((unsigned)idx > 3);
> +
> +	return idx;
> +}
> +
> +static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> +{
> +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> +	struct module_node *mn = &mod->tree_node[idx];
> +	struct rb_node **link = &root->rb_node;
> +	struct rb_node *parent = NULL;
> +	unsigned long mod_val, m_val;
> +	struct module *m;
> +	int i;
> +
> +	mn->mod = mod;
> +	mod_val = mod_value(mod, idx);
> +
> +	while (*link) {
> +		parent = *link;
> +		m = mod_entry(parent);
> +		i = mod_node_idx(m, parent);
> +		m_val = mod_value(m, i);
> +
> +		if (mod_val < m_val)
> +			link = &parent->rb_left;
> +		else
> +			link = &parent->rb_right;
> +	}
> +
> +	rb_link_node(&mn->node, parent, link);
> +	rb_insert_color(&mn->node, root);
> +}
> +
> +static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> +{
> +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> +	struct module_node *mn = &mod->tree_node[idx];
> +
> +	rb_erase(&mn->node, root);
> +}
> +
> +/*
> + * struct module is arranged such that:
> + *
> + *   module_init, module_core, init_size, core_size,
> + *   init_text_size, core_text_size and tree_node[0]
> + *
> + * are on the same cacheline, therefore if the below iteration is
> + * on latch0 and all module init has completed, we'll only hit
> + * tree_node[0] and every intermediate level will hit only a single
> + * cacheline.
> + *
> + * Furthermore, by ensuring init_text_size and core_text_size are
> + * also in this same cacheline we've made sure is_module_text_address()
> + * will also not require additional lines.
> + */
> +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> +{
> +	struct rb_node *n = r->rb_node;
> +	unsigned long m_val, m_size;
> +
> +	while (n) {
> +		struct module *m = mod_entry(n);
> +		int idx = mod_node_idx(m, n);
> +
> +		m_val = mod_value(m, idx);
> +		m_size = mod_size(m, idx);
> +
> +		if (addr < m_val)
> +			n = n->rb_left;
> +		else if (addr >= m_val + m_size)
> +			n = n->rb_right;
> +		else
> +			return m;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct latch_tree_root mod_tree;
> +
> +static void mod_tree_insert(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_insert(&mod_tree, mod, latch0_core);
> +	if (mod->init_size)
> +		__tree_insert(&mod_tree, mod, latch0_init);
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_insert(&mod_tree, mod, latch1_core);
> +	if (mod->init_size)
> +		__tree_insert(&mod_tree, mod, latch1_init);
> +}
> +
> +static void mod_tree_remove_init(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch0_init);
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch1_init);
> +}
> +
> +static void mod_tree_remove(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_remove(&mod_tree, mod, latch0_core);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch0_init);
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_remove(&mod_tree, mod, latch1_core);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch1_init);
> +}
> +
> +static struct module *mod_tree_find(unsigned long addr)
> +{
> +	struct module *m;
> +	unsigned int seq;
> +
> +	do {
> +		seq = raw_read_seqcount(&mod_tree.seq);
> +		m = __tree_find(&mod_tree.tree[seq & latch_bit], addr);
> +	} while (read_seqcount_retry(&mod_tree.seq, seq));
> +
> +	return m;
> +}
> +
>  #ifdef CONFIG_KGDB_KDB
>  struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
>  #endif /* CONFIG_KGDB_KDB */
> @@ -1854,6 +2052,7 @@ static void free_module(struct module *m
>  	mutex_lock(&module_mutex);
>  	/* Unlink carefully: kallsyms could be walking list. */
>  	list_del_rcu(&mod->list);
> +	mod_tree_remove(mod);
>  	/* Remove this module from bug list, this uses list_del_rcu */
>  	module_bug_cleanup(mod);
>  	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
> @@ -3098,6 +3297,7 @@ static noinline int do_init_module(struc
>  	mod->symtab = mod->core_symtab;
>  	mod->strtab = mod->core_strtab;
>  #endif
> +	mod_tree_remove_init(mod);
>  	unset_module_init_ro_nx(mod);
>  	module_arch_freeing_init(mod);
>  	mod->module_init = NULL;
> @@ -3168,6 +3368,7 @@ static int add_unformed_module(struct mo
>  		goto out;
>  	}
>  	list_add_rcu(&mod->list, &modules);
> +	mod_tree_insert(mod);
>  	err = 0;
>  
>  out:
> @@ -3367,6 +3568,7 @@ static int load_module(struct load_info
>  	mutex_lock(&module_mutex);
>  	/* Unlink carefully: kallsyms could be walking list. */
>  	list_del_rcu(&mod->list);
> +	mod_tree_remove(mod);
>  	wake_up_all(&module_wq);
>  	/* Wait for RCU synchronizing before releasing mod->list. */
>  	synchronize_rcu();
> @@ -3810,13 +4012,13 @@ struct module *__module_address(unsigned
>  	if (addr < module_addr_min || addr > module_addr_max)
>  		return NULL;
>  
> -	list_for_each_entry_rcu(mod, &modules, list) {
> +	mod = mod_tree_find(addr);
> +	if (mod) {
> +		BUG_ON(!within_module(addr, mod));
>  		if (mod->state == MODULE_STATE_UNFORMED)
> -			continue;
> -		if (within_module(addr, mod))
> -			return mod;
> +			mod = NULL;
>  	}
> -	return NULL;
> +	return mod;
>  }
>  EXPORT_SYMBOL_GPL(__module_address);
>  
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 16:55           ` Mathieu Desnoyers
@ 2015-02-26 17:16             ` Peter Zijlstra
  2015-02-26 17:22             ` Peter Zijlstra
  1 sibling, 0 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 17:16 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, oleg, paulmck, rusty, mingo

On Thu, Feb 26, 2015 at 04:55:35PM +0000, Mathieu Desnoyers wrote:
> > Dunno, it seemed like a good a place as any.
> 
> My personal coding-style is to put all definitions
> at the top of C files, but I don't know if it's within
> the kernel coding style guide lines or just something
> I'm personally used to. I have no strong opinion here.

So I had all the data structure definition in the C file as well to keep
them private, and then you get into the situation where you can't put it
at the top anyhow.

I further wanted to keep the entire mod_tree stuff together so it can be
easily read. I suppose I can move it to the top of that section, but the
reason I had it there is that it didn't need to exist before that.

> > @@ -269,8 +275,15 @@ struct module {
> >  	/* Startup function. */
> >  	int (*init)(void);
> >  
> > +	/*
> > +	 * If this is non-NULL, vfree after init() returns
> > +	 *
> > +	 * cacheline align here, such that:
> > +	 *   module_init, module_core, init_size, core_size and
> > +	 *   tree_node[0]
> > +	 * are on the same cacheline.
> 
> Fat-fingered newline ? ;)

I made it:

        /*
         * If this is non-NULL, vfree after init() returns.
         *
         * Cacheline align here, such that:
         *   module_init, module_core, init_size, core_size,
         *   init_text_size, core_text_size and tree_node[0]
         * are on the same cacheline.
         */

To be consistent with the other comment.

> > +	 */
> > +	void *module_init	____cacheline_aligned;
> >  
> >  	/* Here is the actual code + data, vfree'd on unload. */
> >  	void *module_core;
> > @@ -281,6 +294,8 @@ struct module {
> >  	/* The size of the executable code in each section.  */
> >  	unsigned int init_text_size, core_text_size;
> >  
> > +	struct module_node tree_node[4];
> 
> 4 -> nr_module_addr_latch

Yeah, I dunno, that means moving that enum to the header file, I kinda
liked having it all together in the C file.

What Rusty want though.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 16:55           ` Mathieu Desnoyers
  2015-02-26 17:16             ` Peter Zijlstra
@ 2015-02-26 17:22             ` Peter Zijlstra
  1 sibling, 0 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 17:22 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, oleg, paulmck, rusty, mingo

On Thu, Feb 26, 2015 at 04:55:35PM +0000, Mathieu Desnoyers wrote:
> > +static unsigned long mod_value(struct module *mod, int idx)
> > +{
> > +	if (idx & init_bit)
> 
> Hrm, my first reflex is to look for a "init_bit" variable here.
> Should we use caps for enum entries instead ? e.g. INIT_BIT ?

I could change those bit things to CPP and then use caps :-)


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 16:43         ` Peter Zijlstra
  2015-02-26 16:55           ` Mathieu Desnoyers
@ 2015-02-26 18:28           ` Paul E. McKenney
  2015-02-26 19:06             ` Mathieu Desnoyers
  2015-02-26 19:13             ` Peter Zijlstra
  1 sibling, 2 replies; 35+ messages in thread
From: Paul E. McKenney @ 2015-02-26 18:28 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 05:43:56PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 26, 2015 at 04:02:43PM +0000, Mathieu Desnoyers wrote:
> > Perhaps you could use mod_value() below, and introduce a
> > "mod_size()" too. This would keep the init vs core selection
> > out of the traversal code.
> 
> Indeed!
> 
> > Is it customary to define static variables in the
> > middle of a C file ?
> 
> Dunno, it seemed like a good a place as any.
> 
> > The rest looks good, especially for use of the latch.
> > I'd be tempted to turn "0, 1, 2, 3" into an enum though,
> > so we can follow in the code what each of those array
> > entry really means.
> 
> Agreed.
> 
> ---
> Subject: module: Optimize __module_address() using a latched RB-tree
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Thu Feb 26 10:57:34 CET 2015
> 
> Currently __module_address() is using a linear search through all
> modules in order to find the module corresponding to the provided
> address. With a lot of modules this can take a lot of time.
> 
> One of the users of this is __kernel_text_address() which is employed
> in many stack unwinders; which in turn are used by perf-callchain and
> ftrace (possibly from NMI context).
> 
> So by optimizing __module_address() we optimize many stack unwinders
> which are used by both perf and tracing in performance sensitive code.
> 
> Cc: Oleg Nesterov <oleg@redhat.com>
> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> Cc: Rusty Russell <rusty@rustcorp.com.au>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>

Color me confused, both by the existing code and the modifications.

It appears that you are using seqlock to force readers to retry when
a concurrent update occurs, but I don't see what is ensuring that the
readers see good data when racing with an insertion or a deletion.  Yes,
the reader will be retried, courtesy of the seqlock, but that won't help
if the reader segfaults before it gets to the read_seqcount_retry().

Questions below.

> ---
>  include/linux/module.h |   19 +++-
>  kernel/module.c        |  212 +++++++++++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 224 insertions(+), 7 deletions(-)
> 
> --- a/include/linux/module.h
> +++ b/include/linux/module.h
> @@ -17,6 +17,7 @@
>  #include <linux/moduleparam.h>
>  #include <linux/jump_label.h>
>  #include <linux/export.h>
> +#include <linux/rbtree.h>
> 
>  #include <linux/percpu.h>
>  #include <asm/module.h>
> @@ -210,6 +211,11 @@ enum module_state {
>  	MODULE_STATE_UNFORMED,	/* Still setting it up. */
>  };
> 
> +struct module_node {
> +	struct module *mod;
> +	struct rb_node node;
> +};
> +
>  struct module {
>  	enum module_state state;
> 
> @@ -269,8 +275,15 @@ struct module {
>  	/* Startup function. */
>  	int (*init)(void);
> 
> -	/* If this is non-NULL, vfree after init() returns */
> -	void *module_init;
> +	/*
> +	 * If this is non-NULL, vfree after init() returns
> +	 *
> +	 * cacheline align here, such that:
> +	 *   module_init, module_core, init_size, core_size and
> +	 *   tree_node[0]
> +	 * are on the same cacheline.
> +	 */
> +	void *module_init	____cacheline_aligned;
> 
>  	/* Here is the actual code + data, vfree'd on unload. */
>  	void *module_core;
> @@ -281,6 +294,8 @@ struct module {
>  	/* The size of the executable code in each section.  */
>  	unsigned int init_text_size, core_text_size;
> 
> +	struct module_node tree_node[4];
> +
>  	/* Size of RO sections of the module (text+rodata) */
>  	unsigned int init_ro_size, core_ro_size;
> 
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -102,6 +102,204 @@
>  DEFINE_MUTEX(module_mutex);
>  EXPORT_SYMBOL_GPL(module_mutex);
>  static LIST_HEAD(modules);
> +
> +
> +/*
> + * Use a latched RB-tree for __module_address()
> + *
> + * The latch concept is a multi-value concurrency data structure which allows
> + * unserialized access and guarantees at least one version is stable.
> + *
> + * It is employed here to optimize __module_address(), which needs to find the
> + * module (if any) associated with an address. Such questions are best answered
> + * using a binary search tree.
> + *
> + * Since modules use non-overlapping memory ranges we can use a regular RB-tree
> + * (as opposed to the interval tree). However, BSTs cannot be iterated while
> + * modified.
> + *
> + * To overcome this we use the latched RB-tree, it basically consists of two
> + * RB-trees which are modified in order, ensuring one is always consistent.
> + *
> + * Things are somewhat more complicated because there are two ranges per
> + * module, but other than that its pretty straight forward.
> + *
> + */
> +
> +enum {
> +	latch0_core = 0,
> +	latch1_core = 1,
> +	latch0_init = 2,
> +	latch1_init = 3,
> +};
> +
> +enum {
> +	latch_bit = 0x01,
> +	init_bit  = 0x02,
> +};
> +
> +struct latch_tree_root {
> +	seqcount_t	seq;
> +	struct rb_root	tree[2];
> +};
> +
> +static unsigned long mod_value(struct module *mod, int idx)
> +{
> +	if (idx & init_bit)
> +		return (unsigned long)mod->module_init;
> +	else
> +		return (unsigned long)mod->module_core;
> +}
> +
> +static unsigned long mod_size(struct module *mod, int idx)
> +{
> +	if (idx & init_bit)
> +		return mod->init_size;
> +	else
> +		return mod->core_size;
> +}
> +
> +static struct module *mod_entry(struct rb_node *n)
> +{
> +	struct module_node *mn = container_of(n, struct module_node, node);
> +	return mn->mod;
> +}
> +
> +static int mod_node_idx(struct module *m, struct rb_node *n)
> +{
> +	struct module_node *mn = container_of(n, struct module_node, node);
> +	int idx = mn - m->tree_node;
> +
> +	BUG_ON(mn->mod != m);
> +	BUG_ON((unsigned)idx > 3);
> +
> +	return idx;
> +}
> +
> +static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> +{
> +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> +	struct module_node *mn = &mod->tree_node[idx];
> +	struct rb_node **link = &root->rb_node;
> +	struct rb_node *parent = NULL;
> +	unsigned long mod_val, m_val;
> +	struct module *m;
> +	int i;
> +
> +	mn->mod = mod;
> +	mod_val = mod_value(mod, idx);
> +
> +	while (*link) {
> +		parent = *link;
> +		m = mod_entry(parent);
> +		i = mod_node_idx(m, parent);
> +		m_val = mod_value(m, i);
> +
> +		if (mod_val < m_val)
> +			link = &parent->rb_left;
> +		else
> +			link = &parent->rb_right;
> +	}
> +
> +	rb_link_node(&mn->node, parent, link);

This makes the new module visible to readers, if I understand correctly.
There needs to be a memory barrier between initialization and this call
to rb_link_node(), otherwise, both the CPU (well, for Alpha) and the
compiler (for everyone) can reorder, which could result in some hapless
reader seeing pre-initialized data.

Or did I miss the memory barrier?

> +	rb_insert_color(&mn->node, root);

This -might- be OK -- the rotations do not make new nodes visible,
instead simply rearranging nodes that are already visible.

For both rb_link_node() and rb_insert_color(), given that we are updating
pointers while readers are traversing them, READ_ONCE() and WRITE_ONCE()
would be good.  Yes, the compiler -probably- doesn't mess you up, but it
would be completely within its rights to do so.  :-/

Alpha would like either rcu_dereference() or lockless_dereference()
instead of READ_ONCE(), of course!

> +}
> +
> +static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> +{
> +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> +	struct module_node *mn = &mod->tree_node[idx];
> +
> +	rb_erase(&mn->node, root);

This is just removing and not freeing, so OK from a read-side viewpoint.
WRITE_ONCE() would be good.

> +}
> +
> +/*
> + * struct module is arranged such that:
> + *
> + *   module_init, module_core, init_size, core_size,
> + *   init_text_size, core_text_size and tree_node[0]
> + *
> + * are on the same cacheline, therefore if the below iteration is
> + * on latch0 and all module init has completed, we'll only hit
> + * tree_node[0] and every intermediate level will hit only a single
> + * cacheline.
> + *
> + * Furthermore, by ensuring init_text_size and core_text_size are
> + * also in this same cacheline we've made sure is_module_text_address()
> + * will also not require additional lines.
> + */
> +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> +{
> +	struct rb_node *n = r->rb_node;
> +	unsigned long m_val, m_size;
> +
> +	while (n) {
> +		struct module *m = mod_entry(n);
> +		int idx = mod_node_idx(m, n);
> +
> +		m_val = mod_value(m, idx);
> +		m_size = mod_size(m, idx);
> +
> +		if (addr < m_val)
> +			n = n->rb_left;

We need a rcu_dereference() or lockless_dereference() here, I think.

> +		else if (addr >= m_val + m_size)
> +			n = n->rb_right;

And here.

> +		else
> +			return m;
> +	}
> +
> +	return NULL;

I did finally find the RCU read-side critical section, supplied by
is_module_text_address()'s preempt_disable() and preempt_enable().
The matching synchronize_sched() is supplied by do_init_module() and
load_module().  I expected a synchronize_sched() in free_module() as well,
but I only see a synchronize_rcu().  Is the required synchronize_sched()
on this code path hiding somewhere?

Or did I miss an rcu_read_lock()/rcu_read_unlock() pair somewhere?

							Thanx, Paul

> +}
> +
> +static struct latch_tree_root mod_tree;
> +
> +static void mod_tree_insert(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_insert(&mod_tree, mod, latch0_core);
> +	if (mod->init_size)
> +		__tree_insert(&mod_tree, mod, latch0_init);
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_insert(&mod_tree, mod, latch1_core);
> +	if (mod->init_size)
> +		__tree_insert(&mod_tree, mod, latch1_init);
> +}
> +
> +static void mod_tree_remove_init(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch0_init);
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch1_init);
> +}
> +
> +static void mod_tree_remove(struct module *mod)
> +{
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_remove(&mod_tree, mod, latch0_core);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch0_init);
> +	raw_write_seqcount_latch(&mod_tree.seq);
> +	__tree_remove(&mod_tree, mod, latch1_core);
> +	if (mod->init_size)
> +		__tree_remove(&mod_tree, mod, latch1_init);
> +}
> +
> +static struct module *mod_tree_find(unsigned long addr)
> +{
> +	struct module *m;
> +	unsigned int seq;
> +
> +	do {
> +		seq = raw_read_seqcount(&mod_tree.seq);
> +		m = __tree_find(&mod_tree.tree[seq & latch_bit], addr);
> +	} while (read_seqcount_retry(&mod_tree.seq, seq));
> +
> +	return m;
> +}
> +
>  #ifdef CONFIG_KGDB_KDB
>  struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
>  #endif /* CONFIG_KGDB_KDB */
> @@ -1854,6 +2052,7 @@ static void free_module(struct module *m
>  	mutex_lock(&module_mutex);
>  	/* Unlink carefully: kallsyms could be walking list. */
>  	list_del_rcu(&mod->list);
> +	mod_tree_remove(mod);
>  	/* Remove this module from bug list, this uses list_del_rcu */
>  	module_bug_cleanup(mod);
>  	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
> @@ -3098,6 +3297,7 @@ static noinline int do_init_module(struc
>  	mod->symtab = mod->core_symtab;
>  	mod->strtab = mod->core_strtab;
>  #endif
> +	mod_tree_remove_init(mod);
>  	unset_module_init_ro_nx(mod);
>  	module_arch_freeing_init(mod);
>  	mod->module_init = NULL;
> @@ -3168,6 +3368,7 @@ static int add_unformed_module(struct mo
>  		goto out;
>  	}
>  	list_add_rcu(&mod->list, &modules);
> +	mod_tree_insert(mod);
>  	err = 0;
> 
>  out:
> @@ -3367,6 +3568,7 @@ static int load_module(struct load_info
>  	mutex_lock(&module_mutex);
>  	/* Unlink carefully: kallsyms could be walking list. */
>  	list_del_rcu(&mod->list);
> +	mod_tree_remove(mod);
>  	wake_up_all(&module_wq);
>  	/* Wait for RCU synchronizing before releasing mod->list. */
>  	synchronize_rcu();
> @@ -3810,13 +4012,13 @@ struct module *__module_address(unsigned
>  	if (addr < module_addr_min || addr > module_addr_max)
>  		return NULL;
> 
> -	list_for_each_entry_rcu(mod, &modules, list) {
> +	mod = mod_tree_find(addr);
> +	if (mod) {
> +		BUG_ON(!within_module(addr, mod));
>  		if (mod->state == MODULE_STATE_UNFORMED)
> -			continue;
> -		if (within_module(addr, mod))
> -			return mod;
> +			mod = NULL;
>  	}
> -	return NULL;
> +	return mod;
>  }
>  EXPORT_SYMBOL_GPL(__module_address);
> 
> 


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 18:28           ` Paul E. McKenney
@ 2015-02-26 19:06             ` Mathieu Desnoyers
  2015-02-26 19:13             ` Peter Zijlstra
  1 sibling, 0 replies; 35+ messages in thread
From: Mathieu Desnoyers @ 2015-02-26 19:06 UTC (permalink / raw)
  To: paulmck
  Cc: Peter Zijlstra, Andi Kleen, Andi Kleen, x86, linux-kernel, oleg,
	rusty, mingo

----- Original Message -----
> From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> To: "Peter Zijlstra" <peterz@infradead.org>
> Cc: "Mathieu Desnoyers" <mathieu.desnoyers@efficios.com>, "Andi Kleen" <ak@linux.intel.com>, "Andi Kleen"
> <andi@firstfloor.org>, x86@kernel.org, linux-kernel@vger.kernel.org, oleg@redhat.com, rusty@rustcorp.com.au,
> mingo@kernel.org
> Sent: Thursday, February 26, 2015 1:28:17 PM
> Subject: Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
> 
> On Thu, Feb 26, 2015 at 05:43:56PM +0100, Peter Zijlstra wrote:
> > On Thu, Feb 26, 2015 at 04:02:43PM +0000, Mathieu Desnoyers wrote:
> > > Perhaps you could use mod_value() below, and introduce a
> > > "mod_size()" too. This would keep the init vs core selection
> > > out of the traversal code.
> > 
> > Indeed!
> > 
> > > Is it customary to define static variables in the
> > > middle of a C file ?
> > 
> > Dunno, it seemed like a good a place as any.
> > 
> > > The rest looks good, especially for use of the latch.
> > > I'd be tempted to turn "0, 1, 2, 3" into an enum though,
> > > so we can follow in the code what each of those array
> > > entry really means.
> > 
> > Agreed.
> > 
> > ---
> > Subject: module: Optimize __module_address() using a latched RB-tree
> > From: Peter Zijlstra <peterz@infradead.org>
> > Date: Thu Feb 26 10:57:34 CET 2015
> > 
> > Currently __module_address() is using a linear search through all
> > modules in order to find the module corresponding to the provided
> > address. With a lot of modules this can take a lot of time.
> > 
> > One of the users of this is __kernel_text_address() which is employed
> > in many stack unwinders; which in turn are used by perf-callchain and
> > ftrace (possibly from NMI context).
> > 
> > So by optimizing __module_address() we optimize many stack unwinders
> > which are used by both perf and tracing in performance sensitive code.
> > 
> > Cc: Oleg Nesterov <oleg@redhat.com>
> > Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> > Cc: Rusty Russell <rusty@rustcorp.com.au>
> > Cc: Steven Rostedt <rostedt@goodmis.org>
> > Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> > Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> 
> Color me confused, both by the existing code and the modifications.
> 
> It appears that you are using seqlock to force readers to retry when
> a concurrent update occurs, but I don't see what is ensuring that the
> readers see good data when racing with an insertion or a deletion.  Yes,
> the reader will be retried, courtesy of the seqlock, but that won't help
> if the reader segfaults before it gets to the read_seqcount_retry().

Good point! This scheme works when readers have no "side effect" due
to reading a data structure while it is changing. Dereferencing a
pointer is indeed a side-effect which could lead to a segfault.

> 
> Questions below.
> 
> > ---
> >  include/linux/module.h |   19 +++-
> >  kernel/module.c        |  212
> >  +++++++++++++++++++++++++++++++++++++++++++++++--
> >  2 files changed, 224 insertions(+), 7 deletions(-)
> > 
> > --- a/include/linux/module.h
> > +++ b/include/linux/module.h
> > @@ -17,6 +17,7 @@
> >  #include <linux/moduleparam.h>
> >  #include <linux/jump_label.h>
> >  #include <linux/export.h>
> > +#include <linux/rbtree.h>
> > 
> >  #include <linux/percpu.h>
> >  #include <asm/module.h>
> > @@ -210,6 +211,11 @@ enum module_state {
> >  	MODULE_STATE_UNFORMED,	/* Still setting it up. */
> >  };
> > 
> > +struct module_node {
> > +	struct module *mod;
> > +	struct rb_node node;
> > +};
> > +
> >  struct module {
> >  	enum module_state state;
> > 
> > @@ -269,8 +275,15 @@ struct module {
> >  	/* Startup function. */
> >  	int (*init)(void);
> > 
> > -	/* If this is non-NULL, vfree after init() returns */
> > -	void *module_init;
> > +	/*
> > +	 * If this is non-NULL, vfree after init() returns
> > +	 *
> > +	 * cacheline align here, such that:
> > +	 *   module_init, module_core, init_size, core_size and
> > +	 *   tree_node[0]
> > +	 * are on the same cacheline.
> > +	 */
> > +	void *module_init	____cacheline_aligned;
> > 
> >  	/* Here is the actual code + data, vfree'd on unload. */
> >  	void *module_core;
> > @@ -281,6 +294,8 @@ struct module {
> >  	/* The size of the executable code in each section.  */
> >  	unsigned int init_text_size, core_text_size;
> > 
> > +	struct module_node tree_node[4];
> > +
> >  	/* Size of RO sections of the module (text+rodata) */
> >  	unsigned int init_ro_size, core_ro_size;
> > 
> > --- a/kernel/module.c
> > +++ b/kernel/module.c
> > @@ -102,6 +102,204 @@
> >  DEFINE_MUTEX(module_mutex);
> >  EXPORT_SYMBOL_GPL(module_mutex);
> >  static LIST_HEAD(modules);
> > +
> > +
> > +/*
> > + * Use a latched RB-tree for __module_address()
> > + *
> > + * The latch concept is a multi-value concurrency data structure which
> > allows
> > + * unserialized access and guarantees at least one version is stable.
> > + *
> > + * It is employed here to optimize __module_address(), which needs to find
> > the
> > + * module (if any) associated with an address. Such questions are best
> > answered
> > + * using a binary search tree.
> > + *
> > + * Since modules use non-overlapping memory ranges we can use a regular
> > RB-tree
> > + * (as opposed to the interval tree). However, BSTs cannot be iterated
> > while
> > + * modified.
> > + *
> > + * To overcome this we use the latched RB-tree, it basically consists of
> > two
> > + * RB-trees which are modified in order, ensuring one is always
> > consistent.
> > + *
> > + * Things are somewhat more complicated because there are two ranges per
> > + * module, but other than that its pretty straight forward.
> > + *
> > + */
> > +
> > +enum {
> > +	latch0_core = 0,
> > +	latch1_core = 1,
> > +	latch0_init = 2,
> > +	latch1_init = 3,
> > +};
> > +
> > +enum {
> > +	latch_bit = 0x01,
> > +	init_bit  = 0x02,
> > +};
> > +
> > +struct latch_tree_root {
> > +	seqcount_t	seq;
> > +	struct rb_root	tree[2];
> > +};
> > +
> > +static unsigned long mod_value(struct module *mod, int idx)
> > +{
> > +	if (idx & init_bit)
> > +		return (unsigned long)mod->module_init;
> > +	else
> > +		return (unsigned long)mod->module_core;
> > +}
> > +
> > +static unsigned long mod_size(struct module *mod, int idx)
> > +{
> > +	if (idx & init_bit)
> > +		return mod->init_size;
> > +	else
> > +		return mod->core_size;
> > +}
> > +
> > +static struct module *mod_entry(struct rb_node *n)
> > +{
> > +	struct module_node *mn = container_of(n, struct module_node, node);
> > +	return mn->mod;
> > +}
> > +
> > +static int mod_node_idx(struct module *m, struct rb_node *n)
> > +{
> > +	struct module_node *mn = container_of(n, struct module_node, node);
> > +	int idx = mn - m->tree_node;
> > +
> > +	BUG_ON(mn->mod != m);
> > +	BUG_ON((unsigned)idx > 3);
> > +
> > +	return idx;
> > +}
> > +
> > +static void __tree_insert(struct latch_tree_root *mod_tree, struct module
> > *mod, int idx)
> > +{
> > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > +	struct module_node *mn = &mod->tree_node[idx];
> > +	struct rb_node **link = &root->rb_node;
> > +	struct rb_node *parent = NULL;
> > +	unsigned long mod_val, m_val;
> > +	struct module *m;
> > +	int i;
> > +
> > +	mn->mod = mod;
> > +	mod_val = mod_value(mod, idx);
> > +
> > +	while (*link) {
> > +		parent = *link;
> > +		m = mod_entry(parent);
> > +		i = mod_node_idx(m, parent);
> > +		m_val = mod_value(m, i);
> > +
> > +		if (mod_val < m_val)
> > +			link = &parent->rb_left;
> > +		else
> > +			link = &parent->rb_right;
> > +	}
> > +
> > +	rb_link_node(&mn->node, parent, link);
> 
> This makes the new module visible to readers, if I understand correctly.
> There needs to be a memory barrier between initialization and this call
> to rb_link_node(), otherwise, both the CPU (well, for Alpha) and the
> compiler (for everyone) can reorder, which could result in some hapless
> reader seeing pre-initialized data.
> 
> Or did I miss the memory barrier?

Given that readers (which will retry eventually) can indeed
try to concurrently access the object, those would be needed.

> 
> > +	rb_insert_color(&mn->node, root);
> 
> This -might- be OK -- the rotations do not make new nodes visible,
> instead simply rearranging nodes that are already visible.
> 
> For both rb_link_node() and rb_insert_color(), given that we are updating
> pointers while readers are traversing them, READ_ONCE() and WRITE_ONCE()
> would be good.  Yes, the compiler -probably- doesn't mess you up, but it
> would be completely within its rights to do so.  :-/
> 
> Alpha would like either rcu_dereference() or lockless_dereference()
> instead of READ_ONCE(), of course!

Yes, probably better to err on the safe side here. We have to
take into account that a reader can observe the data structure
while being modified, but if this occurs, it will retry (but should
not segfault nor loop endlessly).

> 
> > +}
> > +
> > +static void __tree_remove(struct latch_tree_root *mod_tree, struct module
> > *mod, int idx)
> > +{
> > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > +	struct module_node *mn = &mod->tree_node[idx];
> > +
> > +	rb_erase(&mn->node, root);
> 
> This is just removing and not freeing, so OK from a read-side viewpoint.
> WRITE_ONCE() would be good.

I'd be worried about the eventual free after this function.
There is a need to use RCU here to delay reclaim.

> 
> > +}
> > +
> > +/*
> > + * struct module is arranged such that:
> > + *
> > + *   module_init, module_core, init_size, core_size,
> > + *   init_text_size, core_text_size and tree_node[0]
> > + *
> > + * are on the same cacheline, therefore if the below iteration is
> > + * on latch0 and all module init has completed, we'll only hit
> > + * tree_node[0] and every intermediate level will hit only a single
> > + * cacheline.
> > + *
> > + * Furthermore, by ensuring init_text_size and core_text_size are
> > + * also in this same cacheline we've made sure is_module_text_address()
> > + * will also not require additional lines.
> > + */
> > +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> > +{
> > +	struct rb_node *n = r->rb_node;
> > +	unsigned long m_val, m_size;
> > +
> > +	while (n) {
> > +		struct module *m = mod_entry(n);
> > +		int idx = mod_node_idx(m, n);
> > +
> > +		m_val = mod_value(m, idx);
> > +		m_size = mod_size(m, idx);
> > +
> > +		if (addr < m_val)
> > +			n = n->rb_left;
> 
> We need a rcu_dereference() or lockless_dereference() here, I think.
> 
> > +		else if (addr >= m_val + m_size)
> > +			n = n->rb_right;
> 
> And here.
> 
> > +		else
> > +			return m;
> > +	}
> > +
> > +	return NULL;
> 
> I did finally find the RCU read-side critical section, supplied by
> is_module_text_address()'s preempt_disable() and preempt_enable().
> The matching synchronize_sched() is supplied by do_init_module() and
> load_module().  I expected a synchronize_sched() in free_module() as well,
> but I only see a synchronize_rcu().  Is the required synchronize_sched()
> on this code path hiding somewhere?
> 
> Or did I miss an rcu_read_lock()/rcu_read_unlock() pair somewhere?

I did not see any, nor any documentation that points to
those. We might want to add documentation to
raw_write_seqcount_latch() telling about the requirements
about making sure concurrent readers don't trigger side-effects.
It's not as simple as the timekeeper use-case, due to having
pointers here.

Good catch !

Thanks,

Mathieu

> 
> 							Thanx, Paul
> 
> > +}
> > +
> > +static struct latch_tree_root mod_tree;
> > +
> > +static void mod_tree_insert(struct module *mod)
> > +{
> > +	raw_write_seqcount_latch(&mod_tree.seq);
> > +	__tree_insert(&mod_tree, mod, latch0_core);
> > +	if (mod->init_size)
> > +		__tree_insert(&mod_tree, mod, latch0_init);
> > +	raw_write_seqcount_latch(&mod_tree.seq);
> > +	__tree_insert(&mod_tree, mod, latch1_core);
> > +	if (mod->init_size)
> > +		__tree_insert(&mod_tree, mod, latch1_init);
> > +}
> > +
> > +static void mod_tree_remove_init(struct module *mod)
> > +{
> > +	raw_write_seqcount_latch(&mod_tree.seq);
> > +	if (mod->init_size)
> > +		__tree_remove(&mod_tree, mod, latch0_init);
> > +	raw_write_seqcount_latch(&mod_tree.seq);
> > +	if (mod->init_size)
> > +		__tree_remove(&mod_tree, mod, latch1_init);
> > +}
> > +
> > +static void mod_tree_remove(struct module *mod)
> > +{
> > +	raw_write_seqcount_latch(&mod_tree.seq);
> > +	__tree_remove(&mod_tree, mod, latch0_core);
> > +	if (mod->init_size)
> > +		__tree_remove(&mod_tree, mod, latch0_init);
> > +	raw_write_seqcount_latch(&mod_tree.seq);
> > +	__tree_remove(&mod_tree, mod, latch1_core);
> > +	if (mod->init_size)
> > +		__tree_remove(&mod_tree, mod, latch1_init);
> > +}
> > +
> > +static struct module *mod_tree_find(unsigned long addr)
> > +{
> > +	struct module *m;
> > +	unsigned int seq;
> > +
> > +	do {
> > +		seq = raw_read_seqcount(&mod_tree.seq);
> > +		m = __tree_find(&mod_tree.tree[seq & latch_bit], addr);
> > +	} while (read_seqcount_retry(&mod_tree.seq, seq));
> > +
> > +	return m;
> > +}
> > +
> >  #ifdef CONFIG_KGDB_KDB
> >  struct list_head *kdb_modules = &modules; /* kdb needs the list of modules
> >  */
> >  #endif /* CONFIG_KGDB_KDB */
> > @@ -1854,6 +2052,7 @@ static void free_module(struct module *m
> >  	mutex_lock(&module_mutex);
> >  	/* Unlink carefully: kallsyms could be walking list. */
> >  	list_del_rcu(&mod->list);
> > +	mod_tree_remove(mod);
> >  	/* Remove this module from bug list, this uses list_del_rcu */
> >  	module_bug_cleanup(mod);
> >  	/* Wait for RCU synchronizing before releasing mod->list and buglist. */
> > @@ -3098,6 +3297,7 @@ static noinline int do_init_module(struc
> >  	mod->symtab = mod->core_symtab;
> >  	mod->strtab = mod->core_strtab;
> >  #endif
> > +	mod_tree_remove_init(mod);
> >  	unset_module_init_ro_nx(mod);
> >  	module_arch_freeing_init(mod);
> >  	mod->module_init = NULL;
> > @@ -3168,6 +3368,7 @@ static int add_unformed_module(struct mo
> >  		goto out;
> >  	}
> >  	list_add_rcu(&mod->list, &modules);
> > +	mod_tree_insert(mod);
> >  	err = 0;
> > 
> >  out:
> > @@ -3367,6 +3568,7 @@ static int load_module(struct load_info
> >  	mutex_lock(&module_mutex);
> >  	/* Unlink carefully: kallsyms could be walking list. */
> >  	list_del_rcu(&mod->list);
> > +	mod_tree_remove(mod);
> >  	wake_up_all(&module_wq);
> >  	/* Wait for RCU synchronizing before releasing mod->list. */
> >  	synchronize_rcu();
> > @@ -3810,13 +4012,13 @@ struct module *__module_address(unsigned
> >  	if (addr < module_addr_min || addr > module_addr_max)
> >  		return NULL;
> > 
> > -	list_for_each_entry_rcu(mod, &modules, list) {
> > +	mod = mod_tree_find(addr);
> > +	if (mod) {
> > +		BUG_ON(!within_module(addr, mod));
> >  		if (mod->state == MODULE_STATE_UNFORMED)
> > -			continue;
> > -		if (within_module(addr, mod))
> > -			return mod;
> > +			mod = NULL;
> >  	}
> > -	return NULL;
> > +	return mod;
> >  }
> >  EXPORT_SYMBOL_GPL(__module_address);
> > 
> > 
> 
> 

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 18:28           ` Paul E. McKenney
  2015-02-26 19:06             ` Mathieu Desnoyers
@ 2015-02-26 19:13             ` Peter Zijlstra
  2015-02-26 19:41               ` Paul E. McKenney
  2015-02-28 16:41               ` Peter Zijlstra
  1 sibling, 2 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 19:13 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 10:28:17AM -0800, Paul E. McKenney wrote:


> Color me confused, both by the existing code and the modifications.
> 
> It appears that you are using seqlock to force readers to retry when
> a concurrent update occurs, but I don't see what is ensuring that the
> readers see good data when racing with an insertion or a deletion. 

Yes, I suppose that needs a wee bit more work (and comments).

> Yes,
> the reader will be retried, courtesy of the seqlock, but that won't help
> if the reader segfaults before it gets to the read_seqcount_retry().

Right, I don't think that can happen, but see below.

> > +static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> > +{
> > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > +	struct module_node *mn = &mod->tree_node[idx];
> > +	struct rb_node **link = &root->rb_node;
> > +	struct rb_node *parent = NULL;
> > +	unsigned long mod_val, m_val;
> > +	struct module *m;
> > +	int i;
> > +
> > +	mn->mod = mod;
> > +	mod_val = mod_value(mod, idx);
> > +
> > +	while (*link) {
> > +		parent = *link;
> > +		m = mod_entry(parent);
> > +		i = mod_node_idx(m, parent);
> > +		m_val = mod_value(m, i);
> > +
> > +		if (mod_val < m_val)
> > +			link = &parent->rb_left;
> > +		else
> > +			link = &parent->rb_right;
> > +	}
> > +
> > +	rb_link_node(&mn->node, parent, link);
> 
> This makes the new module visible to readers, if I understand correctly.

You do.

> There needs to be a memory barrier between initialization and this call
> to rb_link_node(), otherwise, both the CPU (well, for Alpha) and the
> compiler (for everyone) can reorder, which could result in some hapless
> reader seeing pre-initialized data.
> 
> Or did I miss the memory barrier?

I was going with the one in list_add_rcu() and the two in
raw_write_seqcount_latch(), however, now that you made me look at it, I
need one in here to ensure mn->mod is observed.

So yes. The best solution might be a mod_tree_init() which initializes
those back pointers before this insertion.

> > +	rb_insert_color(&mn->node, root);
> 
> This -might- be OK -- the rotations do not make new nodes visible,
> instead simply rearranging nodes that are already visible.
> 
> For both rb_link_node() and rb_insert_color(), given that we are updating
> pointers while readers are traversing them, READ_ONCE() and WRITE_ONCE()
> would be good.  Yes, the compiler -probably- doesn't mess you up, but it
> would be completely within its rights to do so.  :-/
> 
> Alpha would like either rcu_dereference() or lockless_dereference()
> instead of READ_ONCE(), of course!

I _think_ we should be ok here, since, as you say, we only reorder. If we
observe nodes in the wrong order, our traversal gets confused and might
return the wrong node or no node at all.

That is fine.

Of course, as you say, the compiler is within its rights to split stores
and do other creative nonsense without the volatile thing. But I don't
think it will actually do so for native word-sized thingies.

If we're really paranoid I suppose I can go make copies of the
functions that have READ/WRITE_ONCE() in.

As to Alpha, I think we're good there, the split cache would make us see
stale data, which is perfectly fine, we'll traverse a crap tree, no
problem. As long as native sized and native aligned loads/stores aren't
split -- which I'm pretty sure doesn't happen (because of the split
cache).

> > +}
> > +
> > +static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> > +{
> > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > +	struct module_node *mn = &mod->tree_node[idx];
> > +
> > +	rb_erase(&mn->node, root);
> 
> This is just removing and not freeing, so OK from a read-side viewpoint.
> WRITE_ONCE() would be good.

Yah, would need a copy of lib/rbtree.c again :/

> > +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> > +{
> > +	struct rb_node *n = r->rb_node;
> > +	unsigned long m_val, m_size;
> > +
> > +	while (n) {
> > +		struct module *m = mod_entry(n);
> > +		int idx = mod_node_idx(m, n);
> > +
> > +		m_val = mod_value(m, idx);
> > +		m_size = mod_size(m, idx);
> > +
> > +		if (addr < m_val)
> > +			n = n->rb_left;
> 
> We need a rcu_dereference() or lockless_dereference() here, I think.
> 
> > +		else if (addr >= m_val + m_size)
> > +			n = n->rb_right;
> 
> And here.

As per the above argument; without doing the whole READ/WRITE_ONCE for
the rb tree primitives, I think we're fine. We don't actually need the
read_barrier_depends() I think.

I think this because raw_read_seqcount() will issue an rmb, which should
be enough for the 'stable' tree. For the unstable one we don't care as
long as we don't see split loads/stores.

> > +		else
> > +			return m;
> > +	}
> > +
> > +	return NULL;
> 
> I did finally find the RCU read-side critical section, supplied by
> is_module_text_address()'s preempt_disable() and preempt_enable().

Indeed.

> The matching synchronize_sched() is supplied by do_init_module() and
> load_module().  I expected a synchronize_sched() in free_module() as well,
> but I only see a synchronize_rcu().  Is the required synchronize_sched()
> on this code path hiding somewhere?
> 
> Or did I miss an rcu_read_lock()/rcu_read_unlock() pair somewhere?

No, I think someone is 'hoping' sync_rcu() implies sync_sched(). But
yes, I should look harder at this. I was assuming the existing code was
OK here.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 19:13             ` Peter Zijlstra
@ 2015-02-26 19:41               ` Paul E. McKenney
  2015-02-26 19:45                 ` Peter Zijlstra
                                   ` (2 more replies)
  2015-02-28 16:41               ` Peter Zijlstra
  1 sibling, 3 replies; 35+ messages in thread
From: Paul E. McKenney @ 2015-02-26 19:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 08:13:35PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 26, 2015 at 10:28:17AM -0800, Paul E. McKenney wrote:
> 
> 
> > Color me confused, both by the existing code and the modifications.
> > 
> > It appears that you are using seqlock to force readers to retry when
> > a concurrent update occurs, but I don't see what is ensuring that the
> > readers see good data when racing with an insertion or a deletion. 
> 
> Yes, I suppose that needs a wee bit more work (and comments).
> 
> > Yes,
> > the reader will be retried, courtesy of the seqlock, but that won't help
> > if the reader segfaults before it gets to the read_seqcount_retry().
> 
> Right, I don't think that can happen, but see below.
> 
> > > +static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> > > +{
> > > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > > +	struct module_node *mn = &mod->tree_node[idx];
> > > +	struct rb_node **link = &root->rb_node;
> > > +	struct rb_node *parent = NULL;
> > > +	unsigned long mod_val, m_val;
> > > +	struct module *m;
> > > +	int i;
> > > +
> > > +	mn->mod = mod;
> > > +	mod_val = mod_value(mod, idx);
> > > +
> > > +	while (*link) {
> > > +		parent = *link;
> > > +		m = mod_entry(parent);
> > > +		i = mod_node_idx(m, parent);
> > > +		m_val = mod_value(m, i);
> > > +
> > > +		if (mod_val < m_val)
> > > +			link = &parent->rb_left;
> > > +		else
> > > +			link = &parent->rb_right;
> > > +	}
> > > +
> > > +	rb_link_node(&mn->node, parent, link);
> > 
> > This makes the new module visible to readers, if I understand correctly.
> 
> You do.

Whew!  ;-)

> > There needs to be a memory barrier between initialization and this call
> > to rb_link_node(), otherwise, both the CPU (well, for Alpha) and the
> > compiler (for everyone) can reorder, which could result in some hapless
> > reader seeing pre-initialized data.
> > 
> > Or did I miss the memory barrier?
> 
> I was going with the one in list_add_rcu() and the two in
> raw_write_seqcount_latch(), however, now that you made me look at it, I
> need one in here to ensure mn->mod is observed.
> 
> So yes. The best solution might be a mod_tree_init() which initializes
> those back pointers before this insertion.

OK, sounds good.

> > > +	rb_insert_color(&mn->node, root);
> > 
> > This -might- be OK -- the rotations do not make new nodes visible,
> > instead simply rearranging nodes that are already visible.
> > 
> > For both rb_link_node() and rb_insert_color(), given that we are updating
> > pointers while readers are traversing them, READ_ONCE() and WRITE_ONCE()
> > would be good.  Yes, the compiler -probably- doesn't mess you up, but it
> > would be completely within its rights to do so.  :-/
> > 
> > Alpha would like either rcu_dereference() or lockless_dereference()
> > instead of READ_ONCE(), of course!
> 
> I _think_ we should be ok here. Since as you say, we only reorder. If we
> observe nodes in the wrong order, our traversal gets confused and might
> return the wrong or no node.
> 
> That is fine.

And seeing an old pointer for some time might cause the CPU to loop,
but the update should eventually make its way through the cache
hierarchy, which should allow the CPU to find its way out.  So agreed.

> Of course, as you say, the compiler is within its right to split stores
> and do other creative nonsense without the volatile thing. But I don't
> think it will actually do so for native word sized thingies.
> 
> If we're really paranoid I suppose I can go make copies of the
> functions that have READ/WRITE_ONCE() in.

Well, I am really paranoid, as you know.  ;-)

> As to Alpha, I think we're good there, the split cache would make us see
> stale data, which is perfectly fine, we'll traverse a crap tree, no
> problem. As long as native sized and native aligned loads/stores aren't
> split -- which I'm pretty sure doesn't happen (because of the split
> cache).

I didn't see anything that would cause a pointer to span two cache lines.
(Of course, that would break everywhere, not just on Alpha.)

> > > +}
> > > +
> > > +static void __tree_remove(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> > > +{
> > > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > > +	struct module_node *mn = &mod->tree_node[idx];
> > > +
> > > +	rb_erase(&mn->node, root);
> > 
> > This is just removing and not freeing, so OK from a read-side viewpoint.
> > WRITE_ONCE() would be good.
> 
> Yah, would need a copy of lib/rbtree.c again :/

Or just have the *_ONCE() calls in the single version.  I bet that there
is no visible performance loss.

> > > +static struct module *__tree_find(struct rb_root *r, unsigned long addr)
> > > +{
> > > +	struct rb_node *n = r->rb_node;
> > > +	unsigned long m_val, m_size;
> > > +
> > > +	while (n) {
> > > +		struct module *m = mod_entry(n);
> > > +		int idx = mod_node_idx(m, n);
> > > +
> > > +		m_val = mod_value(m, idx);
> > > +		m_size = mod_size(m, idx);
> > > +
> > > +		if (addr < m_val)
> > > +			n = n->rb_left;
> > 
> > We need a rcu_dereference() or lockless_dereference() here, I think.
> > 
> > > +		else if (addr >= m_val + m_size)
> > > +			n = n->rb_right;
> > 
> > And here.
> 
> As per the above argument; without doing the whole READ/WRITE_ONCE for
> the rb tree primitives, I think we're fine. We don't actually need the
> read_barrier_depends() I think.
> 
> I think this because raw_read_seqcount() will issue an rmb, which should
> be enough for the 'stable' tree. For the unstable one we don't care as
> long as we don't see split loads/stores.

I agree that raw_read_seqcount() will issue an rmb(), but that won't help.
For Alpha, you need the smp_rmb() to be between the load of any given
pointer and the first dereference of that same pointer.

> > > +		else
> > > +			return m;
> > > +	}
> > > +
> > > +	return NULL;
> > 
> > I did finally find the RCU read-side critical section, supplied by
> > is_module_text_address()'s preempt_disable() and preempt_enable().
> 
> Indeed.
> 
> > The matching synchronize_sched() is supplied by do_init_module() and
> > load_module().  I expected a synchronize_sched() in free_module() as well,
> > but I only see a synchronize_rcu().  Is the required synchronize_sched()
> > on this code path hiding somewhere?
> > 
> > Or did I miss an rcu_read_lock()/rcu_read_unlock() pair somewhere?
> 
> No, I think someone is 'hoping' sync_rcu() implies sync_sched(). But
> yes, I should look harder at this. I was assuming the existing code was
> OK here.

I am not blaming you for the code itself, but rather just blaming you
for causing me to notice that the code was broken.  ;-)

How about if I make something that allows overlapping grace periods,
so that we could be safe without latency penalty?  One approach would
be a synchronize_rcu_mult() with a bitmask indicating which to wait for.
Another would be to make variants of get_state_synchronize_rcu() and
cond_synchronize_rcu() for RCU-sched as well as for RCU, but also to
make get_state_synchronize_rcu() force a grace period to start if
one was not already in progress.

Thoughts?

							Thanx, Paul


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 19:41               ` Paul E. McKenney
@ 2015-02-26 19:45                 ` Peter Zijlstra
  2015-02-26 22:32                   ` Peter Zijlstra
  2015-02-26 20:52                 ` Andi Kleen
  2015-02-27 10:01                 ` Peter Zijlstra
  2 siblings, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 19:45 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 11:41:44AM -0800, Paul E. McKenney wrote:
> Or just have the *_ONCE() calls in the single version.  I bet that there
> is no visible performance loss.

I'll go play with that tomorrow, see what GCC makes of it.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 19:41               ` Paul E. McKenney
  2015-02-26 19:45                 ` Peter Zijlstra
@ 2015-02-26 20:52                 ` Andi Kleen
  2015-02-26 22:36                   ` Peter Zijlstra
  2015-02-27 10:01                 ` Peter Zijlstra
  2 siblings, 1 reply; 35+ messages in thread
From: Andi Kleen @ 2015-02-26 20:52 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, Mathieu Desnoyers, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

> 
> Thoughts?

Just use my original patch using page tables which did not need any of these
research projects.

http://www.serverphorums.com/read.php?12,1043794,1043796#msg-1043796


-Andi

-- 
ak@linux.intel.com -- Speaking for myself only

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 19:45                 ` Peter Zijlstra
@ 2015-02-26 22:32                   ` Peter Zijlstra
  0 siblings, 0 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 22:32 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 08:45:16PM +0100, Peter Zijlstra wrote:
> On Thu, Feb 26, 2015 at 11:41:44AM -0800, Paul E. McKenney wrote:
> > Or just have the *_ONCE() calls in the single version.  I bet that there
> > is no visible performance loss.
> 
> I'll go play with that tomorrow, see what GCC makes of it.

Who needs sleep anyhow.. ;-)

21 extra bytes, I've not quite yet figured out what for.

The below changes the insert/erase functions to use WRITE_ONCE() for the
rb_{left,right} assignments and removes one program order loop.

I did make squiggles for all scenarios to see if any of the intermediate
states had loops in; I only found the one case (erase, case 3).

I also added an rb_link_node_rcu() which does the store_release thing --
I didn't want to include rcupdate.h for fear of header insanity.

We need the store_release because of the node setup in there. So I can
forget about my mod_tree_init().

---
 include/linux/rbtree.h           | 10 ++++++
 include/linux/rbtree_augmented.h | 21 ++++++++----
 lib/rbtree.c                     | 71 +++++++++++++++++++++++++++-------------
 3 files changed, 73 insertions(+), 29 deletions(-)

diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h
index fb31765e935a..ae9df86bf9ba 100644
--- a/include/linux/rbtree.h
+++ b/include/linux/rbtree.h
@@ -31,6 +31,7 @@
 
 #include <linux/kernel.h>
 #include <linux/stddef.h>
+#include <asm/barrier.h>
 
 struct rb_node {
 	unsigned long  __rb_parent_color;
@@ -85,6 +86,15 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
 	*rb_link = node;
 }
 
+static inline void rb_link_node_rcu(struct rb_node * node, struct rb_node * parent,
+				struct rb_node ** rb_link)
+{
+	node->__rb_parent_color = (unsigned long)parent;
+	node->rb_left = node->rb_right = NULL;
+
+	smp_store_release(rb_link, node);
+}
+
 #define rb_entry_safe(ptr, type, member) \
 	({ typeof(ptr) ____ptr = (ptr); \
 	   ____ptr ? rb_entry(____ptr, type, member) : NULL; \
diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h
index 378c5ee75f78..14d7b831b63a 100644
--- a/include/linux/rbtree_augmented.h
+++ b/include/linux/rbtree_augmented.h
@@ -123,11 +123,11 @@ __rb_change_child(struct rb_node *old, struct rb_node *new,
 {
 	if (parent) {
 		if (parent->rb_left == old)
-			parent->rb_left = new;
+			WRITE_ONCE(parent->rb_left, new);
 		else
-			parent->rb_right = new;
+			WRITE_ONCE(parent->rb_right, new);
 	} else
-		root->rb_node = new;
+		WRITE_ONCE(root->rb_node, new);
 }
 
 extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root,
@@ -137,7 +137,8 @@ static __always_inline struct rb_node *
 __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		     const struct rb_augment_callbacks *augment)
 {
-	struct rb_node *child = node->rb_right, *tmp = node->rb_left;
+	struct rb_node *child = node->rb_right;
+	struct rb_node *tmp = node->rb_left;
 	struct rb_node *parent, *rebalance;
 	unsigned long pc;
 
@@ -167,6 +168,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 		tmp = parent;
 	} else {
 		struct rb_node *successor = child, *child2;
+
 		tmp = child->rb_left;
 		if (!tmp) {
 			/*
@@ -180,6 +182,7 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 			 */
 			parent = successor;
 			child2 = successor->rb_right;
+
 			augment->copy(node, successor);
 		} else {
 			/*
@@ -201,19 +204,23 @@ __rb_erase_augmented(struct rb_node *node, struct rb_root *root,
 				successor = tmp;
 				tmp = tmp->rb_left;
 			} while (tmp);
-			parent->rb_left = child2 = successor->rb_right;
-			successor->rb_right = child;
+			child2 = successor->rb_right;
+			WRITE_ONCE(parent->rb_left, child2);
+			WRITE_ONCE(successor->rb_right, child);
 			rb_set_parent(child, successor);
+
 			augment->copy(node, successor);
 			augment->propagate(parent, successor);
 		}
 
-		successor->rb_left = tmp = node->rb_left;
+		tmp = node->rb_left;
+		WRITE_ONCE(successor->rb_left, tmp);
 		rb_set_parent(tmp, successor);
 
 		pc = node->__rb_parent_color;
 		tmp = __rb_parent(pc);
 		__rb_change_child(node, successor, tmp, root);
+
 		if (child2) {
 			successor->__rb_parent_color = pc;
 			rb_set_parent_color(child2, parent, RB_BLACK);
diff --git a/lib/rbtree.c b/lib/rbtree.c
index c16c81a3d430..0b3ec6a75b76 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -44,6 +44,25 @@
  *  parentheses and have some accompanying text comment.
  */
 
+/*
+ * All stores to the tree structure (rb_left and rb_right) must be done using
+ * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the
+ * tree structure as seen in program order.
+ *
+ * These two requirements will allow lockless iteration of the tree -- not
+ * correct iteration mind you, tree rotations are not atomic so a lookup might
+ * miss entire subtrees.
+ *
+ * But they do guarantee that any such traversal will only see valid elements
+ * and that it will indeed complete -- does not get stuck in a loop.
+ *
+ * NOTE:
+ *
+ * Stores to __rb_parent_color are not important for simple lookups so those
+ * are left undone as of now. Nor did I check for loops involving parent
+ * pointers.
+ */
+
 static inline void rb_set_black(struct rb_node *rb)
 {
 	rb->__rb_parent_color |= RB_BLACK;
@@ -129,8 +148,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 				 * This still leaves us in violation of 4), the
 				 * continuation into Case 3 will fix that.
 				 */
-				parent->rb_right = tmp = node->rb_left;
-				node->rb_left = parent;
+				tmp = node->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp);
+				WRITE_ONCE(node->rb_left, parent);
 				if (tmp)
 					rb_set_parent_color(tmp, parent,
 							    RB_BLACK);
@@ -149,8 +169,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			 *     /                 \
 			 *    n                   U
 			 */
-			gparent->rb_left = tmp;  /* == parent->rb_right */
-			parent->rb_right = gparent;
+			WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */
+			WRITE_ONCE(parent->rb_right, gparent);
 			if (tmp)
 				rb_set_parent_color(tmp, gparent, RB_BLACK);
 			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
@@ -171,8 +191,9 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			tmp = parent->rb_left;
 			if (node == tmp) {
 				/* Case 2 - right rotate at parent */
-				parent->rb_left = tmp = node->rb_right;
-				node->rb_right = parent;
+				tmp = node->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp);
+				WRITE_ONCE(node->rb_right, parent);
 				if (tmp)
 					rb_set_parent_color(tmp, parent,
 							    RB_BLACK);
@@ -183,8 +204,8 @@ __rb_insert(struct rb_node *node, struct rb_root *root,
 			}
 
 			/* Case 3 - left rotate at gparent */
-			gparent->rb_right = tmp;  /* == parent->rb_left */
-			parent->rb_left = gparent;
+			WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */
+			WRITE_ONCE(parent->rb_left, gparent);
 			if (tmp)
 				rb_set_parent_color(tmp, gparent, RB_BLACK);
 			__rb_rotate_set_parents(gparent, parent, root, RB_RED);
@@ -224,8 +245,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *      / \         / \
 				 *     Sl  Sr      N   Sl
 				 */
-				parent->rb_right = tmp1 = sibling->rb_left;
-				sibling->rb_left = parent;
+				tmp1 = sibling->rb_left;
+				WRITE_ONCE(parent->rb_right, tmp1);
+				WRITE_ONCE(sibling->rb_left, parent);
 				rb_set_parent_color(tmp1, parent, RB_BLACK);
 				__rb_rotate_set_parents(parent, sibling, root,
 							RB_RED);
@@ -275,9 +297,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				 *                       \
 				 *                        Sr
 				 */
-				sibling->rb_left = tmp1 = tmp2->rb_right;
-				tmp2->rb_right = sibling;
-				parent->rb_right = tmp2;
+				tmp1 = tmp2->rb_right;
+				WRITE_ONCE(sibling->rb_left, tmp1);
+				WRITE_ONCE(parent->rb_right, tmp2);
+				WRITE_ONCE(tmp2->rb_right, sibling);
 				if (tmp1)
 					rb_set_parent_color(tmp1, sibling,
 							    RB_BLACK);
@@ -297,8 +320,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 			 *        / \         / \
 			 *      (sl) sr      N  (sl)
 			 */
-			parent->rb_right = tmp2 = sibling->rb_left;
-			sibling->rb_left = parent;
+			tmp2 = sibling->rb_left;
+			WRITE_ONCE(parent->rb_right, tmp2);
+			WRITE_ONCE(sibling->rb_left, parent);
 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
 			if (tmp2)
 				rb_set_parent(tmp2, parent);
@@ -310,8 +334,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 			sibling = parent->rb_left;
 			if (rb_is_red(sibling)) {
 				/* Case 1 - right rotate at parent */
-				parent->rb_left = tmp1 = sibling->rb_right;
-				sibling->rb_right = parent;
+				tmp1 = sibling->rb_right;
+				WRITE_ONCE(parent->rb_left, tmp1);
+				WRITE_ONCE(sibling->rb_right, parent);
 				rb_set_parent_color(tmp1, parent, RB_BLACK);
 				__rb_rotate_set_parents(parent, sibling, root,
 							RB_RED);
@@ -336,9 +361,10 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 					break;
 				}
 				/* Case 3 - right rotate at sibling */
-				sibling->rb_right = tmp1 = tmp2->rb_left;
-				tmp2->rb_left = sibling;
-				parent->rb_left = tmp2;
+				tmp1 = tmp2->rb_left;
+				WRITE_ONCE(sibling->rb_right, tmp1);
+				WRITE_ONCE(parent->rb_left, tmp2);
+				WRITE_ONCE(tmp2->rb_left, sibling);
 				if (tmp1)
 					rb_set_parent_color(tmp1, sibling,
 							    RB_BLACK);
@@ -347,8 +373,9 @@ ____rb_erase_color(struct rb_node *parent, struct rb_root *root,
 				sibling = tmp2;
 			}
 			/* Case 4 - left rotate at parent + color flips */
-			parent->rb_left = tmp2 = sibling->rb_right;
-			sibling->rb_right = parent;
+			tmp2 = sibling->rb_right;
+			WRITE_ONCE(parent->rb_left, tmp2);
+			WRITE_ONCE(sibling->rb_right, parent);
 			rb_set_parent_color(tmp1, sibling, RB_BLACK);
 			if (tmp2)
 				rb_set_parent(tmp2, parent);

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 20:52                 ` Andi Kleen
@ 2015-02-26 22:36                   ` Peter Zijlstra
  0 siblings, 0 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-26 22:36 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Paul E. McKenney, Mathieu Desnoyers, Andi Kleen, x86,
	linux-kernel, oleg, rusty, mingo

On Thu, Feb 26, 2015 at 12:52:16PM -0800, Andi Kleen wrote:
> > 
> > Thoughts?
> 
> Just use my original patch using page tables which did not need any of these
> research projects.

Hey, some people actually like to think! And the benefit is that all
archs get a win.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 19:41               ` Paul E. McKenney
  2015-02-26 19:45                 ` Peter Zijlstra
  2015-02-26 20:52                 ` Andi Kleen
@ 2015-02-27 10:01                 ` Peter Zijlstra
  2015-02-28 23:30                   ` Paul E. McKenney
  2 siblings, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-27 10:01 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 11:41:44AM -0800, Paul E. McKenney wrote:
> > As per the above argument; without doing the whole READ/WRITE_ONCE for
> > the rb tree primitives, I think we're fine. We don't actually need the
> > read_barrier_depends() I think.
> > 
> > I think this because raw_read_seqcount() will issue an rmb, which should
> > be enough for the 'stable' tree. For the unstable one we don't care as
> > long as we don't see split loads/stores.
> 
> I agree that raw_read_seqcount() will issue an rmb(), but that won't help.
> For Alpha, you need the smp_rmb() to be between the load of any given
> pointer and the first dereference of that same pointer.

OK, it seems I'm confused on Alpha, perhaps you can clarify.

My confusion stems from the fact that when things are properly
serialized with locks we do not need these additional barriers.

Then why would we need them to correctly iterate the stable copy of the
tree?

I appreciate we would need them to correctly iterate a true lockless
data structure, but our in-flight copy of the tree cannot be correctly
iterated anyway, all we want is for that iteration to:

  1) only observe valid nodes -- ie. no pointers out into the wild due
  to split load/stores.

  2) terminate -- ie. not get stuck in loops.

And I don't see that read_barrier_depends() helping with either for the
unstable tree; 1) is guaranteed by my patch making the modifiers use
WRITE_ONCE(), and 2) we agreed is guaranteed by the modifiers not having
loops in program order, any cache effects will disappear over time.

> > No, I think someone is 'hoping' sync_rcu() implies sync_sched(). But
> > yes, I should look harder at this. I was assuming the existing code was
> > OK here.
> 
> I am not blaming you for the code itself, but rather just blaming you
> for causing me to notice that the code was broken.  ;-)
> 
> How about if I make something that allows overlapping grace periods,
> so that we could be safe without latency penalty?  One approach would
> be a synchronize_rcu_mult() with a bitmask indicating which to wait for.
> Another would be to make variants of get_state_synchronize_rcu() and
> cond_synchronize_rcu() for RCU-sched as well as for RCU, but also to
> make get_state_synchronize_rcu() force a grace period to start if
> one was not already in progress.

This is module unload, not a fast path by anyone's measure I think.
However if you do go about making such a primitive I know of at least
one other place this could be used; we have the following in
_cpu_down():

#ifdef CONFIG_PREEMPT
	synchronize_sched();
#endif
	synchronize_rcu();

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 14:12       ` Peter Zijlstra
@ 2015-02-27 11:51         ` Rusty Russell
  0 siblings, 0 replies; 35+ messages in thread
From: Rusty Russell @ 2015-02-27 11:51 UTC (permalink / raw)
  To: Peter Zijlstra, Andi Kleen
  Cc: Andi Kleen, x86, linux-kernel, mathieu.desnoyers, oleg, paulmck, mingo

Peter Zijlstra <peterz@infradead.org> writes:
> On Thu, Feb 26, 2015 at 12:43:09PM +0100, Peter Zijlstra wrote:
>
> Assuming struct module is cacheline aligned, Rusty? from what I can find
> its squirreled away in some data structure:
>
>   mod = (void *)info->sechdrs[info->index.mod].sh_addr
>
> And I can't seem to quickly find its alignment. If its not cacheline
> aligned, can we make it so?

If the ELF says to align it, it will be aligned.  Thus your patch
below should make it Just Work:

> @@ -278,7 +276,7 @@
>  	int (*init)(void);
>  
>  	/* If this is non-NULL, vfree after init() returns */
> -	void *module_init;
> +	void *module_init	____cacheline_aligned;
>  
>  	/* Here is the actual code + data, vfree'd on unload. */
>  	void *module_core;

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
                         ` (2 preceding siblings ...)
  2015-02-26 16:02       ` Mathieu Desnoyers
@ 2015-02-27 12:02       ` Rusty Russell
  2015-02-27 14:30         ` Peter Zijlstra
  3 siblings, 1 reply; 35+ messages in thread
From: Rusty Russell @ 2015-02-27 12:02 UTC (permalink / raw)
  To: Peter Zijlstra, Andi Kleen
  Cc: Andi Kleen, x86, linux-kernel, mathieu.desnoyers, oleg, paulmck, mingo

Peter Zijlstra <peterz@infradead.org> writes:
> One of the users of this is __kernel_text_address() which is employed in
> many stack unwinders; which in turn are used by perf-callchain (possibly
> from NMI context).

Um, so the stack unwinders use "does this look like a kernel address"
because we omit the frame pointer?

To keep that optimization, we add 220 non-trivial lines to module.c?

Don't get me wrong, it's cute code, but I do wonder if at some point a
grown up is going to come along and tell us to stop :)

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-27 12:02       ` Rusty Russell
@ 2015-02-27 14:30         ` Peter Zijlstra
  0 siblings, 0 replies; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-27 14:30 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andi Kleen, Andi Kleen, x86, linux-kernel, mathieu.desnoyers,
	oleg, paulmck, mingo

On Fri, Feb 27, 2015 at 10:32:39PM +1030, Rusty Russell wrote:
> Peter Zijlstra <peterz@infradead.org> writes:
> > One of the users of this is __kernel_text_address() which is employed in
> > many stack unwinders; which in turn are used by perf-callchain (possibly
> > from NMI context).
> 
> Um, so the stack unwinders use "does this look like a kernel address"
> because we omit the frame pointer?

Not only because of that I think; also as a general robustness check.
I'm not sure people want their stack unwinder to go off into the woods.

> To keep that optimization, we add 220 non-trivial lines to module.c?

Can I make you feel better by writing more comments? I'm not sure we can
convince the arch people to take this test out. Its all over the place.

> Don't get me wrong, it's cute code, but I do wonder if at some point a
> grown up is going to come along and tell us to stop :)

Now where's the fun in that ;-)

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-26 19:13             ` Peter Zijlstra
  2015-02-26 19:41               ` Paul E. McKenney
@ 2015-02-28 16:41               ` Peter Zijlstra
  2015-02-28 16:56                 ` Peter Zijlstra
  1 sibling, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-28 16:41 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Thu, Feb 26, 2015 at 08:13:35PM +0100, Peter Zijlstra wrote:
> > > +static void __tree_insert(struct latch_tree_root *mod_tree, struct module *mod, int idx)
> > > +{
> > > +	struct rb_root *root = &mod_tree->tree[idx & latch_bit];
> > > +	struct module_node *mn = &mod->tree_node[idx];
> > > +	struct rb_node **link = &root->rb_node;
> > > +	struct rb_node *parent = NULL;
> > > +	unsigned long mod_val, m_val;
> > > +	struct module *m;
> > > +	int i;
> > > +
> > > +	mn->mod = mod;
> > > +	mod_val = mod_value(mod, idx);
> > > +
> > > +	while (*link) {
> > > +		parent = *link;
> > > +		m = mod_entry(parent);
> > > +		i = mod_node_idx(m, parent);
> > > +		m_val = mod_value(m, i);
> > > +
> > > +		if (mod_val < m_val)
> > > +			link = &parent->rb_left;
> > > +		else
> > > +			link = &parent->rb_right;
> > > +	}
> > > +
> > > +	rb_link_node(&mn->node, parent, link);
> > 
> > This makes the new module visible to readers, if I understand correctly.
> 
> You do.

I think I have reconsidered; this does not in fact make it visible.

> > There needs to be a memory barrier between initialization and this call
> > to rb_link_node(), otherwise, both the CPU (well, for Alpha) and the
> > compiler (for everyone) can reorder, which could result in some hapless
> > reader seeing pre-initialized data.
> > 
> > Or did I miss the memory barrier?
> 
> I was going with the one in list_add_rcu() and the two in
> raw_write_seqcount_latch(), however, now that you made me look at it, I
> need one in here to ensure mn->mod is observed.

The 'second' raw_write_seqcount_latch() makes it visible, not the first.

Let me clarify with some documentation I just wrote:

 * The basic form is a data structure like:
 *
 * struct latch_struct {
 *     seqcount_t              seq;
 *     struct data_struct      data[2];
 * };
 *
 * Where a modification, which is assumed to be externally serialized, does the
 * following:
 *
 * void latch_modify(struct latch_struct *latch, ...)
 * {
 *     smp_wmb();      <- Ensure that the last data[1] update is visible
 *     latch->seq++;
 *     smp_wmb();      <- Ensure that the seqcount update is visible
 *
 *     modify(latch->data[0], ...);
 *
 *     smp_wmb();      <- Ensure that the data[0] update is visible
 *     latch->seq++;
 *     smp_wmb();      <- Ensure that the seqcount update is visible
 *
 *     modify(latch->data[1], ...);
 * }
 *
 * The query will have a form like:
 *
 * struct entry *latch_query(struct latch_struct *latch, ...)
 * {
 *     struct entry *entry;
 *     unsigned seq;
 *     int idx;
 *
 *     do {
 *             seq = latch->seq;
 *             smp_rmb();
 *
 *             idx = seq & 0x01;
 *             entry = data_query(latch->data[idx], ...);
 *
 *             smp_rmb();
 *     } while (seq != latch->seq);
 *
 *     return entry;
 * }
 *
 * So during the modification, queries are first redirected to data[1]. Then we
 * modify data[0]. When that is complete, we redirect queries back to data[0]
 * and we can modify data[1].

So any addition to data[0] does not become visible until we redirect
the readers back to it, which is the second latch increment.

This therefore obviates the need for the atomic publish APIs RCU
traditionally employs.

Let me go clarify this point in said documentation.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-28 16:41               ` Peter Zijlstra
@ 2015-02-28 16:56                 ` Peter Zijlstra
  2015-02-28 23:32                   ` Paul E. McKenney
  0 siblings, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-02-28 16:56 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Sat, Feb 28, 2015 at 05:41:12PM +0100, Peter Zijlstra wrote:
> > > > +	rb_link_node(&mn->node, parent, link);
> > > 
> > > This makes the new module visible to readers, if I understand correctly.
> > 
> > You do.
> 
> I think I have reconsidered; this does not in fact make it visible.

Ah; never mind, it is. An iterator could have started in data[0], gone
on holiday while the modification took place, and resumed once it is
done and magically see the element appear.

Therefore we do indeed need the atomic publishing and I think this also
settles my question about Alpha.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-27 10:01                 ` Peter Zijlstra
@ 2015-02-28 23:30                   ` Paul E. McKenney
  0 siblings, 0 replies; 35+ messages in thread
From: Paul E. McKenney @ 2015-02-28 23:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Fri, Feb 27, 2015 at 11:01:14AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 26, 2015 at 11:41:44AM -0800, Paul E. McKenney wrote:
> > > As per the above argument; without doing the whole READ/WRITE_ONCE for
> > > the rb tree primitives, I think we're fine. We don't actually need the
> > > read_barrier_depends() I think.
> > > 
> > > I think this because raw_read_seqcount() will issue an rmb, which should
> > > be enough for the 'stable' tree. For the unstable one we don't care as
> > > long as we don't see split loads/stores.
> > 
> > I agree that raw_read_seqcount() will issue an rmb(), but that won't help.
> > For Alpha, you need the smp_rmb() to be between the load of any given
> > pointer and the first dereference of that same pointer.
> 
> OK, it seems I'm confused on Alpha, perhaps you can clarify.
> 
> My confusion stems from the fact that when things are properly
> serialized with locks we do not need these additional barriers.

The readers hold locks?  Keep in mind that raw_read_seqcount() isn't
really a lock -- updates can run concurrently with a read.  A doomed
read, to be sure, but nevertheless a read that is traversing pointers
that are changing out from under it.

> Then why would we need them to correctly iterate the stable copy of the
> tree?
> 
> I appreciate we would need them to correctly iterate an true lockless
> data structure, but our in-flight copy of the tree cannot be correctly
> iterated anyway, all we want is for that iteration to:
> 
>   1) only observe valid nodes -- ie. no pointers out into the wild due
>   to split load/stores.
> 
>   2) terminate -- ie. not get stuck in loops.

True, but don't new nodes get inserted into a tree that might possibly
be concurrently read?

> And I don't see that read_barrier_depends() helping with either for the
> unstable tree; 1) is guaranteed by my patch making the modifiers user
> WRITE_ONCE(), and 2) we agreed is guaranteed by the modifiers not having
> loops in program order, any cache effects will disappear over time.

Well, if the readers and writer are really truly excluding each other,
then no problem.  But if the readers really can run concurrently with
writers, even if the readers will subsequently retry, you do need those
barriers on Alpha.

The canonical URL on this topic:

	http://www.openvms.compaq.com/wizard/wiz_2637.html

> > > No, I think someone is 'hoping' sync_rcu() implies sync_sched(). But
> > > yes, I should look harder at this. I was assuming the existing code was
> > > OK here.
> > 
> > I am not blaming you for the code itself, but rather just blaming you
> > for causing me to notice that the code was broken.  ;-)
> > 
> > How about if I make something that allows overlapping grace periods,
> > so that we could be safe without latency penalty?  One approach would
> > be a synchronize_rcu_mult() with a bitmask indicating which to wait for.
> > Another would be to make variants of get_state_synchronize_rcu() and
> > cond_synchronize_rcu() for RCU-sched as well as for RCU, but also to
> > make get_state_synchronize_rcu() force a grace period to start if
> > one was not already in progress.
> 
> This is module unload, not a fast path by anyones measure I think.
> However if you do go about making such a primitive I know of at least
> one other place this could be used; we have the following in
> _cpu_down():
> 
> #ifdef CONFIG_PREEMPT
> 	synchronize_sched();
> #endif
> 	synchronize_rcu();

OK, definitely seems worth thinking about, then.

							Thanx, Paul


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-28 16:56                 ` Peter Zijlstra
@ 2015-02-28 23:32                   ` Paul E. McKenney
  2015-03-02  9:24                     ` Peter Zijlstra
  0 siblings, 1 reply; 35+ messages in thread
From: Paul E. McKenney @ 2015-02-28 23:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Sat, Feb 28, 2015 at 05:56:54PM +0100, Peter Zijlstra wrote:
> On Sat, Feb 28, 2015 at 05:41:12PM +0100, Peter Zijlstra wrote:
> > > > > +	rb_link_node(&mn->node, parent, link);
> > > > 
> > > > This makes the new module visible to readers, if I understand correctly.
> > > 
> > > You do.
> > 
> > I think I have reconsidered; this does not in fact make it visible.
> 
> Ah; never mind, it is. An iterator could have started in data[0], gone
> on holiday while the modification took place, and resumed once it is
> done and magically see the element appear.
> 
> Therefore we do indeed need the atomic publishing and I think this also
> settles my question about Alpha.

Whew!

Though otherwise whatever you were doing would have been pretty cool
and fun to learn about.  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-02-28 23:32                   ` Paul E. McKenney
@ 2015-03-02  9:24                     ` Peter Zijlstra
  2015-03-02 16:58                       ` Paul E. McKenney
  0 siblings, 1 reply; 35+ messages in thread
From: Peter Zijlstra @ 2015-03-02  9:24 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Sat, Feb 28, 2015 at 03:32:03PM -0800, Paul E. McKenney wrote:
> Whew!
> 
> Though otherwise whatever you were doing would have been pretty cool
> and fun to learn about.  ;-)

So I think I can do that; where readers and writers are fully separated,
but it requires:

 - triple latch
 - copy operator
 - nested RCU

And the result would be horribly expensive (mostly due to the copy
operator on dynamic data structures) on the update side, which severely
limits the applicability of the scheme.



^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree
  2015-03-02  9:24                     ` Peter Zijlstra
@ 2015-03-02 16:58                       ` Paul E. McKenney
  0 siblings, 0 replies; 35+ messages in thread
From: Paul E. McKenney @ 2015-03-02 16:58 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Andi Kleen, Andi Kleen, x86, linux-kernel,
	oleg, rusty, mingo

On Mon, Mar 02, 2015 at 10:24:40AM +0100, Peter Zijlstra wrote:
> On Sat, Feb 28, 2015 at 03:32:03PM -0800, Paul E. McKenney wrote:
> > Whew!
> > 
> > Though otherwise whatever you were doing would have been pretty cool
> > and fun to learn about.  ;-)
> 
> So I think I can do that; where readers and writers are fully separated,
> but it requires:
> 
>  - tripple latch
>  - copy operator
>  - nested RCU
> 
> And the result would be horribly expensive (mostly due to the copy
> operator on dynamic data structures) on the update side, which severely
> limits the applicability of the scheme.

True enough, if you have a single pointer to an RCU-protected data
structure, you can update anything in any way by doing a deep copy of
the original, updating, and swapping pointers.  And what is a little
copy overhead among friends?  ;-)

							Thanx, Paul


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/3] x86: Move msr accesses out of line
  2015-03-20  0:29 Updated MSR tracing patchkit v2 Andi Kleen
@ 2015-03-20  0:29 ` Andi Kleen
  0 siblings, 0 replies; 35+ messages in thread
From: Andi Kleen @ 2015-03-20  0:29 UTC (permalink / raw)
  To: x86; +Cc: linux-kernel, peterz, Andi Kleen

From: Andi Kleen <ak@linux.intel.com>

To add trace points to msr accesses we need to include
linux/tracepoint.h. Unfortunately this causes hellish include loops
when combined with the msr inlines in asm/msr.h, which are included all over.
I tried to fix several of them, but eventually gave up.

This patch moves the MSR functions out of line. A MSR access is typically
40-100 cycles or even slower, a call is a few cycles at best, so the
additional function call is not really significant.

Kernel text size is neutral:

11852945	1671656	1822720	15347321	 ea2e79	vmlinux-no-msr
11852969	1671656	1822720	15347345	 ea2e91	vmlinux-msr

As requested, some benchmarking on the difference to inline MSR (including
the trace points from the next patch):

The absolute differences are fairly low, 6-8 cycles for out of line +
trace point. 6-7% on Haswell. On Avoton the percentages are higher
because the base costs are lower, but the absolute cycle deltas are
very low too and in the same range.

I think it's reasonable to spend 6-8 cycles/call for much better
debuggability. In fact looking at the traces already exposed a number
of optimization possibilities for optimizing away unnecessary
accesses, that should give much higher gains.

haswell:
136 cycles ool wrmsr
128 cycles inline wrmsr             6%
90 cycles ool rdmsr
84 cycles inline rdmsr              7%

avoton:

68 cycles ool wrmsr
54 cycles inline wrmsr             20%
60 cycles ool rdmsr
44 cycles inline rdmsr             26%

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 arch/x86/include/asm/msr.h | 51 ++++----------------------------------------
 arch/x86/lib/msr.c         | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index de36f22..99d6864 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -57,53 +57,10 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
-static inline unsigned long long native_read_msr(unsigned int msr)
-{
-	DECLARE_ARGS(val, low, high);
-
-	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
-	return EAX_EDX_VAL(val, low, high);
-}
-
-static inline unsigned long long native_read_msr_safe(unsigned int msr,
-						      int *err)
-{
-	DECLARE_ARGS(val, low, high);
-
-	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
-		     : "c" (msr), [fault] "i" (-EIO));
-	return EAX_EDX_VAL(val, low, high);
-}
-
-static inline void native_write_msr(unsigned int msr,
-				    unsigned low, unsigned high)
-{
-	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
-}
-
-/* Can be uninlined because referenced by paravirt */
-notrace static inline int native_write_msr_safe(unsigned int msr,
-					unsigned low, unsigned high)
-{
-	int err;
-	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
-		     "1:\n\t"
-		     ".section .fixup,\"ax\"\n\t"
-		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
-		     ".previous\n\t"
-		     _ASM_EXTABLE(2b, 3b)
-		     : [err] "=a" (err)
-		     : "c" (msr), "0" (low), "d" (high),
-		       [fault] "i" (-EIO)
-		     : "memory");
-	return err;
-}
+extern unsigned long long native_read_msr(unsigned int msr);
+extern unsigned long long native_read_msr_safe(unsigned int msr, int *err);
+extern int native_write_msr_safe(unsigned int msr, unsigned low, unsigned high);
+extern void native_write_msr(unsigned int msr, unsigned low, unsigned high);
 
 extern unsigned long long native_read_tsc(void);
 
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 4362373..7eed044 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -108,3 +108,56 @@ int msr_clear_bit(u32 msr, u8 bit)
 {
 	return __flip_bit(msr, bit, false);
 }
+
+inline unsigned long long native_read_msr(unsigned int msr)
+{
+	DECLARE_ARGS(val, low, high);
+
+	asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
+	return EAX_EDX_VAL(val, low, high);
+}
+EXPORT_SYMBOL(native_read_msr);
+
+inline unsigned long long native_read_msr_safe(unsigned int msr,
+						      int *err)
+{
+	DECLARE_ARGS(val, low, high);
+
+	asm volatile("2: rdmsr ; xor %[err],%[err]\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
+		     ".previous\n\t"
+		     _ASM_EXTABLE(2b, 3b)
+		     : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
+		     : "c" (msr), [fault] "i" (-EIO));
+	return EAX_EDX_VAL(val, low, high);
+}
+EXPORT_SYMBOL(native_read_msr_safe);
+
+inline void native_write_msr(unsigned int msr,
+				    unsigned low, unsigned high)
+{
+	asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
+}
+EXPORT_SYMBOL(native_write_msr);
+
+/* Can be uninlined because referenced by paravirt */
+notrace inline int native_write_msr_safe(unsigned int msr,
+					unsigned low, unsigned high)
+{
+	int err;
+
+	asm volatile("2: wrmsr ; xor %[err],%[err]\n"
+		     "1:\n\t"
+		     ".section .fixup,\"ax\"\n\t"
+		     "3:  mov %[fault],%[err] ; jmp 1b\n\t"
+		     ".previous\n\t"
+		     _ASM_EXTABLE(2b, 3b)
+		     : [err] "=a" (err)
+		     : "c" (msr), "0" (low), "d" (high),
+		       [fault] "i" (-EIO)
+		     : "memory");
+	return err;
+}
+EXPORT_SYMBOL(native_write_msr_safe);
-- 
1.9.3


^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2015-03-20  0:30 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-21  1:38 [PATCH 1/3] x86: Move msr accesses out of line Andi Kleen
2015-02-21  1:38 ` [PATCH 2/3] x86: Add trace point for MSR accesses Andi Kleen
2015-02-21  1:38 ` [PATCH 3/3] perf, x86: Remove old MSR perf tracing code Andi Kleen
2015-02-23 17:04 ` [PATCH 1/3] x86: Move msr accesses out of line Peter Zijlstra
2015-02-23 17:43   ` Andi Kleen
2015-02-25 12:27     ` Peter Zijlstra
2015-02-25 18:20       ` Andi Kleen
2015-02-25 18:34         ` Borislav Petkov
2015-02-26 11:43     ` [RFC][PATCH] module: Optimize __module_address() using a latched RB-tree Peter Zijlstra
2015-02-26 12:00       ` Ingo Molnar
2015-02-26 14:12       ` Peter Zijlstra
2015-02-27 11:51         ` Rusty Russell
2015-02-26 16:02       ` Mathieu Desnoyers
2015-02-26 16:43         ` Peter Zijlstra
2015-02-26 16:55           ` Mathieu Desnoyers
2015-02-26 17:16             ` Peter Zijlstra
2015-02-26 17:22             ` Peter Zijlstra
2015-02-26 18:28           ` Paul E. McKenney
2015-02-26 19:06             ` Mathieu Desnoyers
2015-02-26 19:13             ` Peter Zijlstra
2015-02-26 19:41               ` Paul E. McKenney
2015-02-26 19:45                 ` Peter Zijlstra
2015-02-26 22:32                   ` Peter Zijlstra
2015-02-26 20:52                 ` Andi Kleen
2015-02-26 22:36                   ` Peter Zijlstra
2015-02-27 10:01                 ` Peter Zijlstra
2015-02-28 23:30                   ` Paul E. McKenney
2015-02-28 16:41               ` Peter Zijlstra
2015-02-28 16:56                 ` Peter Zijlstra
2015-02-28 23:32                   ` Paul E. McKenney
2015-03-02  9:24                     ` Peter Zijlstra
2015-03-02 16:58                       ` Paul E. McKenney
2015-02-27 12:02       ` Rusty Russell
2015-02-27 14:30         ` Peter Zijlstra
2015-03-20  0:29 Updated MSR tracing patchkit v2 Andi Kleen
2015-03-20  0:29 ` [PATCH 1/3] x86: Move msr accesses out of line Andi Kleen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).