LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack
@ 2011-02-14 15:12 Jiri Olsa
  2011-02-15  9:41 ` Masami Hiramatsu
  0 siblings, 1 reply; 26+ messages in thread
From: Jiri Olsa @ 2011-02-14 15:12 UTC (permalink / raw)
  To: masami.hiramatsu.pt; +Cc: linux-kernel, Jiri Olsa

hi,

you can crash the kernel using kprobe tracer via:

echo "p system_call_after_swapgs" > ./kprobe_events
echo 1 > ./events/kprobes/enable

The reason is that at the system_call_after_swapgs label,
the kernel stack is not set up. If optimized kprobes are
enabled, the user space stack is being used in this case
(see optimized kprobe template) and this might result in a crash.

Looks like there are several places like this over the entry_$(BIT)
code. First I thought it'd be ok to localize those places, but
I haven't found any reasonable/maintainable way to disable only those
places.

So I switched off the whole entry code from optimizing, but this
also switch many safe places (attached patch - tested on x86_64).

Also not sure this crash falls in to the area of that once such
probe is used, user should know consequences..

any ideas?

wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/kprobes.c         |    8 ++++++++
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 7 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0ed7896..50f1630 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..051b4e2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffb..0a0ed79 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477..d03bc1e 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
 	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
 		return 0;
 
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long ) __entry_text_start) &&
+	    (paddr <  (unsigned long ) __entry_text_end))
+		return 0;
+
 	/* Check there is enough space for a relative jump. */
 	if (size - offset < RELATIVEJUMP_SIZE)
 		return 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e9f7a3c..0381e1f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
-- 
1.7.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack
  2011-02-14 15:12 [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack Jiri Olsa
@ 2011-02-15  9:41 ` Masami Hiramatsu
  2011-02-15 12:30   ` Jiri Olsa
  0 siblings, 1 reply; 26+ messages in thread
From: Masami Hiramatsu @ 2011-02-15  9:41 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: linux-kernel

(2011/02/15 0:12), Jiri Olsa wrote:
> hi,
> 
> you can crash the kernel using kprobe tracer via:
> 
> echo "p system_call_after_swapgs" > ./kprobe_events
> echo 1 > ./events/kprobes/enable

Ah, thank you very much!

> The reason is that at the system_call_after_swapgs label,
> the kernel stack is not set up. If optimized kprobes are
> enabled, the user space stack is being used in this case
> (see optimized kprobe template) and this might result in a crash.

Verified here, and also it didn't occur when turning optimization
off by sysctl. So this is a bug of kprobe jump optimization, not
kprobes itself.

> Looks like there are several places like this over the entry_$(BIT)
> code. First I thought it'd be ok to localize those places, but
> I haven't found any reasonable/maintainable way to disable only those
> places.

Hmm, agreed.

> So I switched off the whole entry code from optimizing, but this
> also switch many safe places (attached patch - tested on x86_64).

I'm OK for this solution. I think possible another solution is using
interrupt stack in optprobe template too. Anyway in short term, this
solution will be good.

> Also not sure this crash falls in to the area of that once such
> probe is used, user should know consequences..

User can see that those probe is not optimized via sysfs.

Thanks again,

> 
> any ideas?
> 
> wbr,
> jirka
> 
> 
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>
> ---
>  arch/x86/ia32/ia32entry.S         |    2 ++
>  arch/x86/kernel/entry_32.S        |    6 ++++--
>  arch/x86/kernel/entry_64.S        |    6 ++++--
>  arch/x86/kernel/kprobes.c         |    8 ++++++++
>  arch/x86/kernel/vmlinux.lds.S     |    1 +
>  include/asm-generic/sections.h    |    1 +
>  include/asm-generic/vmlinux.lds.h |    6 ++++++
>  7 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 0ed7896..50f1630 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -25,6 +25,8 @@
>  #define sysretl_audit ia32_ret_from_sys_call
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
>  
>  	.macro IA32_ARG_FIXUP noebp=0
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index c8b4efa..051b4e2 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -65,6 +65,8 @@
>  #define sysexit_audit	syscall_exit_work
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  /*
>   * We use macros for low-level operations which need to be overridden
>   * for paravirtualization.  The following will never clobber any registers:
> @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
>   */
>  .section .init.rodata,"a"
>  ENTRY(interrupt)
> -.text
> +.entry.text
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.long 1b
> -      .text
> +      .entry.text
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index aed1ffb..0a0ed79 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -61,6 +61,8 @@
>  #define __AUDIT_ARCH_LE	   0x40000000
>  
>  	.code64
> +	.section .entry.text, "ax"
> +
>  #ifdef CONFIG_FUNCTION_TRACER
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  ENTRY(mcount)
> @@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
>   */
>  	.section .init.rodata,"a"
>  ENTRY(interrupt)
> -	.text
> +	.section .entry.text
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.quad 1b
> -      .text
> +      .section .entry.text
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
> index d91c477..d03bc1e 100644
> --- a/arch/x86/kernel/kprobes.c
> +++ b/arch/x86/kernel/kprobes.c
> @@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
>  	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
>  		return 0;
>  
> +	/*
> +	 * Do not optimize in the entry code due to the unstable
> +	 * stack handling.
> +	 */
> +	if ((paddr >= (unsigned long ) __entry_text_start) &&
> +	    (paddr <  (unsigned long ) __entry_text_end))
> +		return 0;
> +
>  	/* Check there is enough space for a relative jump. */
>  	if (size - offset < RELATIVEJUMP_SIZE)
>  		return 0;
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index e9f7a3c..0381e1f 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -105,6 +105,7 @@ SECTIONS
>  		SCHED_TEXT
>  		LOCK_TEXT
>  		KPROBES_TEXT
> +		ENTRY_TEXT
>  		IRQENTRY_TEXT
>  		*(.fixup)
>  		*(.gnu.warning)
> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index b3bfabc..c1a1216 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
>  extern char _end[];
>  extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
>  extern char __kprobes_text_start[], __kprobes_text_end[];
> +extern char __entry_text_start[], __entry_text_end[];
>  extern char __initdata_begin[], __initdata_end[];
>  extern char __start_rodata[], __end_rodata[];
>  
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index fe77e33..906c3ce 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -424,6 +424,12 @@
>  		*(.kprobes.text)					\
>  		VMLINUX_SYMBOL(__kprobes_text_end) = .;
>  
> +#define ENTRY_TEXT							\
> +		ALIGN_FUNCTION();					\
> +		VMLINUX_SYMBOL(__entry_text_start) = .;			\
> +		*(.entry.text)						\
> +		VMLINUX_SYMBOL(__entry_text_end) = .;
> +
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>  #define IRQENTRY_TEXT							\
>  		ALIGN_FUNCTION();					\


-- 
Masami HIRAMATSU
2nd Dept. Linux Technology Center
Hitachi, Ltd., Systems Development Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack
  2011-02-15  9:41 ` Masami Hiramatsu
@ 2011-02-15 12:30   ` Jiri Olsa
  2011-02-15 15:55     ` Masami Hiramatsu
  0 siblings, 1 reply; 26+ messages in thread
From: Jiri Olsa @ 2011-02-15 12:30 UTC (permalink / raw)
  To: Masami Hiramatsu; +Cc: linux-kernel

On Tue, Feb 15, 2011 at 06:41:58PM +0900, Masami Hiramatsu wrote:
> (2011/02/15 0:12), Jiri Olsa wrote:
> > hi,
> > 
> > you can crash the kernel using kprobe tracer via:
> > 
> > echo "p system_call_after_swapgs" > ./kprobe_events
> > echo 1 > ./events/kprobes/enable
> 
> Ah, thank you very much!
> 
> > The reason is that at the system_call_after_swapgs label,
> > the kernel stack is not set up. If optimized kprobes are
> > enabled, the user space stack is being used in this case
> > (see optimized kprobe template) and this might result in a crash.
> 
> Verified here, and also it didn't occur when turning optimization
> off by sysctl. So this is a bug of kprobe jump optimization, not
> kprobes itself.
> 
> > Looks like there are several places like this over the entry_$(BIT)
> > code. First I thought it'd be ok to localize those places, but
> > I haven't found any reasonable/maintainable way to disable only those
> > places.
> 
> Hmm, agreed.
> 
> > So I switched off the whole entry code from optimizing, but this
> > also switch many safe places (attached patch - tested on x86_64).
> 
> I'm OK for this solution. I think possible another solution is using
> interrupt stack in optprobe template too. Anyway in short term, this
> solution will be good.

ok, I'll test on 32 bits and resend to Ingo

> 
> > Also not sure this crash falls in to the area of that once such
> > probe is used, user should know consequences..
> 
> User can see that those probe is not optimized via sysfs.

I cannot find this, where can I see this info?

thanks,
jirka

> 
> Thanks again,
> 
> > 
> > any ideas?
> > 
> > wbr,
> > jirka
> > 
> > 
> > Signed-off-by: Jiri Olsa <jolsa@redhat.com>
> > ---
> >  arch/x86/ia32/ia32entry.S         |    2 ++
> >  arch/x86/kernel/entry_32.S        |    6 ++++--
> >  arch/x86/kernel/entry_64.S        |    6 ++++--
> >  arch/x86/kernel/kprobes.c         |    8 ++++++++
> >  arch/x86/kernel/vmlinux.lds.S     |    1 +
> >  include/asm-generic/sections.h    |    1 +
> >  include/asm-generic/vmlinux.lds.h |    6 ++++++
> >  7 files changed, 26 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> > index 0ed7896..50f1630 100644
> > --- a/arch/x86/ia32/ia32entry.S
> > +++ b/arch/x86/ia32/ia32entry.S
> > @@ -25,6 +25,8 @@
> >  #define sysretl_audit ia32_ret_from_sys_call
> >  #endif
> >  
> > +	.section .entry.text, "ax"
> > +
> >  #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
> >  
> >  	.macro IA32_ARG_FIXUP noebp=0
> > diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> > index c8b4efa..051b4e2 100644
> > --- a/arch/x86/kernel/entry_32.S
> > +++ b/arch/x86/kernel/entry_32.S
> > @@ -65,6 +65,8 @@
> >  #define sysexit_audit	syscall_exit_work
> >  #endif
> >  
> > +	.section .entry.text, "ax"
> > +
> >  /*
> >   * We use macros for low-level operations which need to be overridden
> >   * for paravirtualization.  The following will never clobber any registers:
> > @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
> >   */
> >  .section .init.rodata,"a"
> >  ENTRY(interrupt)
> > -.text
> > +.entry.text
> >  	.p2align 5
> >  	.p2align CONFIG_X86_L1_CACHE_SHIFT
> >  ENTRY(irq_entries_start)
> > @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
> >        .endif
> >        .previous
> >  	.long 1b
> > -      .text
> > +      .entry.text
> >  vector=vector+1
> >      .endif
> >    .endr
> > diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> > index aed1ffb..0a0ed79 100644
> > --- a/arch/x86/kernel/entry_64.S
> > +++ b/arch/x86/kernel/entry_64.S
> > @@ -61,6 +61,8 @@
> >  #define __AUDIT_ARCH_LE	   0x40000000
> >  
> >  	.code64
> > +	.section .entry.text, "ax"
> > +
> >  #ifdef CONFIG_FUNCTION_TRACER
> >  #ifdef CONFIG_DYNAMIC_FTRACE
> >  ENTRY(mcount)
> > @@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
> >   */
> >  	.section .init.rodata,"a"
> >  ENTRY(interrupt)
> > -	.text
> > +	.section .entry.text
> >  	.p2align 5
> >  	.p2align CONFIG_X86_L1_CACHE_SHIFT
> >  ENTRY(irq_entries_start)
> > @@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
> >        .endif
> >        .previous
> >  	.quad 1b
> > -      .text
> > +      .section .entry.text
> >  vector=vector+1
> >      .endif
> >    .endr
> > diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
> > index d91c477..d03bc1e 100644
> > --- a/arch/x86/kernel/kprobes.c
> > +++ b/arch/x86/kernel/kprobes.c
> > @@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
> >  	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
> >  		return 0;
> >  
> > +	/*
> > +	 * Do not optimize in the entry code due to the unstable
> > +	 * stack handling.
> > +	 */
> > +	if ((paddr >= (unsigned long ) __entry_text_start) &&
> > +	    (paddr <  (unsigned long ) __entry_text_end))
> > +		return 0;
> > +
> >  	/* Check there is enough space for a relative jump. */
> >  	if (size - offset < RELATIVEJUMP_SIZE)
> >  		return 0;
> > diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> > index e9f7a3c..0381e1f 100644
> > --- a/arch/x86/kernel/vmlinux.lds.S
> > +++ b/arch/x86/kernel/vmlinux.lds.S
> > @@ -105,6 +105,7 @@ SECTIONS
> >  		SCHED_TEXT
> >  		LOCK_TEXT
> >  		KPROBES_TEXT
> > +		ENTRY_TEXT
> >  		IRQENTRY_TEXT
> >  		*(.fixup)
> >  		*(.gnu.warning)
> > diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> > index b3bfabc..c1a1216 100644
> > --- a/include/asm-generic/sections.h
> > +++ b/include/asm-generic/sections.h
> > @@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
> >  extern char _end[];
> >  extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
> >  extern char __kprobes_text_start[], __kprobes_text_end[];
> > +extern char __entry_text_start[], __entry_text_end[];
> >  extern char __initdata_begin[], __initdata_end[];
> >  extern char __start_rodata[], __end_rodata[];
> >  
> > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> > index fe77e33..906c3ce 100644
> > --- a/include/asm-generic/vmlinux.lds.h
> > +++ b/include/asm-generic/vmlinux.lds.h
> > @@ -424,6 +424,12 @@
> >  		*(.kprobes.text)					\
> >  		VMLINUX_SYMBOL(__kprobes_text_end) = .;
> >  
> > +#define ENTRY_TEXT							\
> > +		ALIGN_FUNCTION();					\
> > +		VMLINUX_SYMBOL(__entry_text_start) = .;			\
> > +		*(.entry.text)						\
> > +		VMLINUX_SYMBOL(__entry_text_end) = .;
> > +
> >  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
> >  #define IRQENTRY_TEXT							\
> >  		ALIGN_FUNCTION();					\
> 
> 
> -- 
> Masami HIRAMATSU
> 2nd Dept. Linux Technology Center
> Hitachi, Ltd., Systems Development Laboratory
> E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack
  2011-02-15 12:30   ` Jiri Olsa
@ 2011-02-15 15:55     ` Masami Hiramatsu
  2011-02-15 16:54       ` Jiri Olsa
  2011-02-15 17:05       ` [PATCH] kprobes - do not allow optimized kprobes in entry code Jiri Olsa
  0 siblings, 2 replies; 26+ messages in thread
From: Masami Hiramatsu @ 2011-02-15 15:55 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: linux-kernel

(2011/02/15 21:30), Jiri Olsa wrote:
> On Tue, Feb 15, 2011 at 06:41:58PM +0900, Masami Hiramatsu wrote:
>> (2011/02/15 0:12), Jiri Olsa wrote:
>>> hi,
>>>
>>> you can crash the kernel using kprobe tracer via:
>>>
>>> echo "p system_call_after_swapgs" > ./kprobe_events
>>> echo 1 > ./events/kprobes/enable
>>
>> Ah, thank you very much!
>>
>>> The reason is that at the system_call_after_swapgs label,
>>> the kernel stack is not set up. If optimized kprobes are
>>> enabled, the user space stack is being used in this case
>>> (see optimized kprobe template) and this might result in a crash.
>>
>> Verified here, and also it didn't occur when turning optimization
>> off by sysctl. So this is a bug of kprobe jump optimization, not
>> kprobes itself.
>>
>>> Looks like there are several places like this over the entry_$(BIT)
>>> code. First I thought it'd be ok to localize those places, but
>>> I haven't found any reasonable/maintainable way to disable only those
>>> places.
>>
>> Hmm, agreed.
>>
>>> So I switched off the whole entry code from optimizing, but this
>>> also switch many safe places (attached patch - tested on x86_64).
>>
>> I'm OK for this solution. I think possible another solution is using
>> interrupt stack in optprobe template too. Anyway in short term, this
>> solution will be good.
> 
> ok, I'll test on 32 bits and resend to Ingo

Thanks!
And also, with deeply thinking about this problem, it seems that
your idea could be the best way to fix, because kprobes can not
know where the kernel stack is ready without those text section.


>>> Also not sure this crash falls in to the area of that once such
>>> probe is used, user should know consequences..
>>
>> User can see that those probe is not optimized via sysfs.
> 
> I cannot find this, where can I see this info?

Ah, actually, that is under debugfs, which is usually mounted on
/sys/kernel/debug. You can read "/sys/kernel/debug/kprobes/list"
for getting a list of currently registered probes.

Thank you,
-- 
Masami HIRAMATSU
2nd Dept. Linux Technology Center
Hitachi, Ltd., Systems Development Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack
  2011-02-15 15:55     ` Masami Hiramatsu
@ 2011-02-15 16:54       ` Jiri Olsa
  2011-02-15 17:05       ` [PATCH] kprobes - do not allow optimized kprobes in entry code Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-02-15 16:54 UTC (permalink / raw)
  To: Masami Hiramatsu; +Cc: linux-kernel

On Wed, Feb 16, 2011 at 12:55:53AM +0900, Masami Hiramatsu wrote:
> (2011/02/15 21:30), Jiri Olsa wrote:
> > On Tue, Feb 15, 2011 at 06:41:58PM +0900, Masami Hiramatsu wrote:
> >> (2011/02/15 0:12), Jiri Olsa wrote:
> >>> hi,
> >>>
> >>> you can crash the kernel using kprobe tracer via:
> >>>
> >>> echo "p system_call_after_swapgs" > ./kprobe_events
> >>> echo 1 > ./events/kprobes/enable
> >>
> >> Ah, thank you very much!
> >>
> >>> The reason is that at the system_call_after_swapgs label,
> >>> the kernel stack is not set up. If optimized kprobes are
> >>> enabled, the user space stack is being used in this case
> >>> (see optimized kprobe template) and this might result in a crash.
> >>
> >> Verified here, and also it didn't occur when turning optimization
> >> off by sysctl. So this is a bug of kprobe jump optimization, not
> >> kprobes itself.
> >>
> >>> Looks like there are several places like this over the entry_$(BIT)
> >>> code. First I thought it'd be ok to localize those places, but
> >>> I haven't found any reasonable/maintainable way to disable only those
> >>> places.
> >>
> >> Hmm, agreed.
> >>
> >>> So I switched off the whole entry code from optimizing, but this
> >>> also switch many safe places (attached patch - tested on x86_64).
> >>
> >> I'm OK for this solution. I think possible another solution is using
> >> interrupt stack in optprobe template too. Anyway in short term, this
> >> solution will be good.
> > 
> > ok, I'll test on 32 bits and resend to Ingo
> 
> Thanks!
> And also, with deeply thinking about this problem, it seems that
> your idea could be the best way to fix, because kprobes can not
> know where the kernel stack is ready without those text section.
> 
> 
> >>> Also not sure this crash falls in to the area of that once such
> >>> probe is used, user should know consequences..
> >>
> >> User can see that those probe is not optimized via sysfs.
> > 
> > I cannot find this, where can I see this info?
> 
> Ah, actually, that is under debugfs, which is usually mounted on
> /sys/kernel/debug. You can read "/sys/kernel/debug/kprobes/list"
> for getting a list of currently registered probes.

I see, thanks

jirka

> 
> Thank you,
> -- 
> Masami HIRAMATSU
> 2nd Dept. Linux Technology Center
> Hitachi, Ltd., Systems Development Laboratory
> E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-15 15:55     ` Masami Hiramatsu
  2011-02-15 16:54       ` Jiri Olsa
@ 2011-02-15 17:05       ` Jiri Olsa
  2011-02-16  3:36         ` Masami Hiramatsu
  1 sibling, 1 reply; 26+ messages in thread
From: Jiri Olsa @ 2011-02-15 17:05 UTC (permalink / raw)
  To: Masami Hiramatsu, mingo, ananth, davem; +Cc: linux-kernel

You can crash the kernel using kprobe tracer by running:

echo "p system_call_after_swapgs" > ./kprobe_events
echo 1 > ./events/kprobes/enable

The reason is that at the system_call_after_swapgs label, the kernel
stack is not set up. If optimized kprobes are enabled, the user space
stack is being used in this case (see optimized kprobe template) and
this might result in a crash.

There are several places like this over the entry code (entry_$BIT).
As it seems there's no any reasonable/maintainable way to disable only
those places where the stack is not ready, I switched off the whole
entry code from kprobe optimizing.

wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/kprobes.c         |    8 ++++++++
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 7 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0ed7896..50f1630 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..f5accf8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .section .entry.text, "ax"
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 891268c..39f8d21 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477..d03bc1e 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
 	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
 		return 0;
 
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long ) __entry_text_start) &&
+	    (paddr <  (unsigned long ) __entry_text_end))
+		return 0;
+
 	/* Check there is enough space for a relative jump. */
 	if (size - offset < RELATIVEJUMP_SIZE)
 		return 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e9f7a3c..0381e1f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
-- 
1.7.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-15 17:05       ` [PATCH] kprobes - do not allow optimized kprobes in entry code Jiri Olsa
@ 2011-02-16  3:36         ` Masami Hiramatsu
  2011-02-17 15:11           ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Masami Hiramatsu @ 2011-02-16  3:36 UTC (permalink / raw)
  To: Jiri Olsa; +Cc: mingo, ananth, davem, linux-kernel

(2011/02/16 2:05), Jiri Olsa wrote:
> You can crash the kernel using kprobe tracer by running:
> 
> echo "p system_call_after_swapgs" > ./kprobe_events
> echo 1 > ./events/kprobes/enable
> 
> The reason is that at the system_call_after_swapgs label, the kernel
> stack is not set up. If optimized kprobes are enabled, the user space
> stack is being used in this case (see optimized kprobe template) and
> this might result in a crash.
> 
> There are several places like this over the entry code (entry_$BIT).
> As it seems there's no any reasonable/maintainable way to disable only
> those places where the stack is not ready, I switched off the whole
> entry code from kprobe optimizing.

Agreed, and this could be the best way, because kprobes can not
know where the kernel stack is ready without this text section.

> 
> wbr,
> jirka
> 
>
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>

Looks good for me :)

Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>

> ---
>  arch/x86/ia32/ia32entry.S         |    2 ++
>  arch/x86/kernel/entry_32.S        |    6 ++++--
>  arch/x86/kernel/entry_64.S        |    6 ++++--
>  arch/x86/kernel/kprobes.c         |    8 ++++++++
>  arch/x86/kernel/vmlinux.lds.S     |    1 +
>  include/asm-generic/sections.h    |    1 +
>  include/asm-generic/vmlinux.lds.h |    6 ++++++
>  7 files changed, 26 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 0ed7896..50f1630 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -25,6 +25,8 @@
>  #define sysretl_audit ia32_ret_from_sys_call
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
>  
>  	.macro IA32_ARG_FIXUP noebp=0
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index c8b4efa..f5accf8 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -65,6 +65,8 @@
>  #define sysexit_audit	syscall_exit_work
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  /*
>   * We use macros for low-level operations which need to be overridden
>   * for paravirtualization.  The following will never clobber any registers:
> @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
>   */
>  .section .init.rodata,"a"
>  ENTRY(interrupt)
> -.text
> +.section .entry.text, "ax"
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.long 1b
> -      .text
> +      .section .entry.text, "ax"
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 891268c..39f8d21 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -61,6 +61,8 @@
>  #define __AUDIT_ARCH_LE	   0x40000000
>  
>  	.code64
> +	.section .entry.text, "ax"
> +
>  #ifdef CONFIG_FUNCTION_TRACER
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  ENTRY(mcount)
> @@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
>   */
>  	.section .init.rodata,"a"
>  ENTRY(interrupt)
> -	.text
> +	.section .entry.text
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.quad 1b
> -      .text
> +      .section .entry.text
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
> index d91c477..d03bc1e 100644
> --- a/arch/x86/kernel/kprobes.c
> +++ b/arch/x86/kernel/kprobes.c
> @@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
>  	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
>  		return 0;
>  
> +	/*
> +	 * Do not optimize in the entry code due to the unstable
> +	 * stack handling.
> +	 */
> +	if ((paddr >= (unsigned long ) __entry_text_start) &&
> +	    (paddr <  (unsigned long ) __entry_text_end))
> +		return 0;
> +
>  	/* Check there is enough space for a relative jump. */
>  	if (size - offset < RELATIVEJUMP_SIZE)
>  		return 0;
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index e9f7a3c..0381e1f 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -105,6 +105,7 @@ SECTIONS
>  		SCHED_TEXT
>  		LOCK_TEXT
>  		KPROBES_TEXT
> +		ENTRY_TEXT
>  		IRQENTRY_TEXT
>  		*(.fixup)
>  		*(.gnu.warning)
> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index b3bfabc..c1a1216 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
>  extern char _end[];
>  extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
>  extern char __kprobes_text_start[], __kprobes_text_end[];
> +extern char __entry_text_start[], __entry_text_end[];
>  extern char __initdata_begin[], __initdata_end[];
>  extern char __start_rodata[], __end_rodata[];
>  
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index fe77e33..906c3ce 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -424,6 +424,12 @@
>  		*(.kprobes.text)					\
>  		VMLINUX_SYMBOL(__kprobes_text_end) = .;
>  
> +#define ENTRY_TEXT							\
> +		ALIGN_FUNCTION();					\
> +		VMLINUX_SYMBOL(__entry_text_start) = .;			\
> +		*(.entry.text)						\
> +		VMLINUX_SYMBOL(__entry_text_end) = .;
> +
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>  #define IRQENTRY_TEXT							\
>  		ALIGN_FUNCTION();					\


-- 
Masami HIRAMATSU
2nd Dept. Linux Technology Center
Hitachi, Ltd., Systems Development Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-16  3:36         ` Masami Hiramatsu
@ 2011-02-17 15:11           ` Ingo Molnar
  2011-02-17 15:20             ` Jiri Olsa
  2011-02-18 16:26             ` Jiri Olsa
  0 siblings, 2 replies; 26+ messages in thread
From: Ingo Molnar @ 2011-02-17 15:11 UTC (permalink / raw)
  To: Masami Hiramatsu, H. Peter Anvin
  Cc: Jiri Olsa, ananth, davem, linux-kernel, Thomas Gleixner,
	Peter Zijlstra, Eric Dumazet


* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> wrote:

> (2011/02/16 2:05), Jiri Olsa wrote:
> > You can crash the kernel using kprobe tracer by running:
> > 
> > echo "p system_call_after_swapgs" > ./kprobe_events
> > echo 1 > ./events/kprobes/enable
> > 
> > The reason is that at the system_call_after_swapgs label, the kernel
> > stack is not set up. If optimized kprobes are enabled, the user space
> > stack is being used in this case (see optimized kprobe template) and
> > this might result in a crash.
> > 
> > There are several places like this over the entry code (entry_$BIT).
> > As it seems there's no any reasonable/maintainable way to disable only
> > those places where the stack is not ready, I switched off the whole
> > entry code from kprobe optimizing.
> 
> Agreed, and this could be the best way, because kprobes can not
> know where the kernel stack is ready without this text section.

The only worry would be that if we move the syscall entry code out of the regular 
text section fragments the icache layout a tiny bit, possibly hurting performance.

It's probably not measurable, but we need to measure it:

Testing could be done of some syscall but also cache-intense workload, like 
'hackbench 10', via perf 'stat --repeat 30' and have a very close look at 
instruction cache eviction differences.

Perhaps also explicitly enable measure one of these:

  L1-icache-loads                            [Hardware cache event]
  L1-icache-load-misses                      [Hardware cache event]
  L1-icache-prefetches                       [Hardware cache event]
  L1-icache-prefetch-misses                  [Hardware cache event]

  iTLB-loads                                 [Hardware cache event]
  iTLB-load-misses                           [Hardware cache event]

to see whether there's any statistically significant difference in icache/iTLB 
evictions, with and without the patch.

If such stats are included in the changelog - even if just to show that any change 
is within measurement accuracy, it would make it easier to apply this change.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-17 15:11           ` Ingo Molnar
@ 2011-02-17 15:20             ` Jiri Olsa
  2011-02-18 16:26             ` Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-02-17 15:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Masami Hiramatsu, H. Peter Anvin, ananth, davem, linux-kernel,
	Thomas Gleixner, Peter Zijlstra, Eric Dumazet

On Thu, Feb 17, 2011 at 04:11:03PM +0100, Ingo Molnar wrote:
> 
> * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> wrote:
> 
> > (2011/02/16 2:05), Jiri Olsa wrote:
> > > You can crash the kernel using kprobe tracer by running:
> > > 
> > > echo "p system_call_after_swapgs" > ./kprobe_events
> > > echo 1 > ./events/kprobes/enable
> > > 
> > > The reason is that at the system_call_after_swapgs label, the kernel
> > > stack is not set up. If optimized kprobes are enabled, the user space
> > > stack is being used in this case (see optimized kprobe template) and
> > > this might result in a crash.
> > > 
> > > There are several places like this over the entry code (entry_$BIT).
> > > As it seems there's no any reasonable/maintainable way to disable only
> > > those places where the stack is not ready, I switched off the whole
> > > entry code from kprobe optimizing.
> > 
> > Agreed, and this could be the best way, because kprobes can not
> > know where the kernel stack is ready without this text section.
> 
> The only worry would be that if we move the syscall entry code out of the regular 
> text section fragments the icache layout a tiny bit, possibly hurting performance.
> 
> It's probably not measurable, but we need to measure it:
> 
> Testing could be done of some syscall but also cache-intense workload, like 
> 'hackbench 10', via perf 'stat --repeat 30' and have a very close look at 
> instruction cache eviction differences.
> 
> Perhaps also explicitly enable measure one of these:
> 
>   L1-icache-loads                            [Hardware cache event]
>   L1-icache-load-misses                      [Hardware cache event]
>   L1-icache-prefetches                       [Hardware cache event]
>   L1-icache-prefetch-misses                  [Hardware cache event]
> 
>   iTLB-loads                                 [Hardware cache event]
>   iTLB-load-misses                           [Hardware cache event]
> 
> to see whether there's any statistically significant difference in icache/iTLB 
> evictions, with and without the patch.
> 
> If such stats are included in the changelog - even if just to show that any change 
> is within measurement accuracy, it would make it easier to apply this change.

ok, I'll run it 

thanks,
jirka

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-17 15:11           ` Ingo Molnar
  2011-02-17 15:20             ` Jiri Olsa
@ 2011-02-18 16:26             ` Jiri Olsa
  2011-02-19 14:14               ` Masami Hiramatsu
  1 sibling, 1 reply; 26+ messages in thread
From: Jiri Olsa @ 2011-02-18 16:26 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Masami Hiramatsu, H. Peter Anvin, ananth, davem, linux-kernel,
	Thomas Gleixner, Peter Zijlstra, Eric Dumazet

On Thu, Feb 17, 2011 at 04:11:03PM +0100, Ingo Molnar wrote:
> 
> * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> wrote:
> 
> > (2011/02/16 2:05), Jiri Olsa wrote:
> > > You can crash the kernel using kprobe tracer by running:
> > > 
> > > echo "p system_call_after_swapgs" > ./kprobe_events
> > > echo 1 > ./events/kprobes/enable
> > > 
> > > The reason is that at the system_call_after_swapgs label, the kernel
> > > stack is not set up. If optimized kprobes are enabled, the user space
> > > stack is being used in this case (see optimized kprobe template) and
> > > this might result in a crash.
> > > 
> > > There are several places like this over the entry code (entry_$BIT).
> > > As it seems there's no any reasonable/maintainable way to disable only
> > > those places where the stack is not ready, I switched off the whole
> > > entry code from kprobe optimizing.
> > 
> > Agreed, and this could be the best way, because kprobes can not
> > know where the kernel stack is ready without this text section.
> 
> The only worry would be that if we move the syscall entry code out of the regular 
> text section fragments the icache layout a tiny bit, possibly hurting performance.
> 
> It's probably not measurable, but we need to measure it:
> 
> Testing could be done of some syscall but also cache-intense workload, like 
> 'hackbench 10', via perf 'stat --repeat 30' and have a very close look at 
> instruction cache eviction differences.
> 
> Perhaps also explicitly enable measure one of these:
> 
>   L1-icache-loads                            [Hardware cache event]
>   L1-icache-load-misses                      [Hardware cache event]
>   L1-icache-prefetches                       [Hardware cache event]
>   L1-icache-prefetch-misses                  [Hardware cache event]
> 
>   iTLB-loads                                 [Hardware cache event]
>   iTLB-load-misses                           [Hardware cache event]
> 
> to see whether there's any statistically significant difference in icache/iTLB 
> evictions, with and without the patch.
> 
> If such stats are included in the changelog - even if just to show that any change 
> is within measurement accuracy, it would make it easier to apply this change.
> 
> Thanks,
> 
> 	Ingo


hi,

I have some results, but need help with interpretation.. ;)

I ran following command (with repeat 100 and 500)

perf stat --repeat 100 -e L1-icache-load -e L1-icache-load-misses -e
L1-icache-prefetches -e L1-icache-prefetch-misses -e iTLB-loads -e
iTLB-load-misses ./hackbench/hackbench 10

I can tell just the obvious:
- the cache load count is higher for the patched kernel,
  but the cache misses count is lower
- patched kernel has also lower count of prefetches,
  other counts are bigger for patched kernel

there's still some variability in counter values each time I run the perf

please let me know what you think, I can run other tests if needed

thanks,
jirka


--------------------------------------------------------------------------
the results for current tip tree are:

 Performance counter stats for './hackbench/hackbench 10' (100 runs):
             
          815008015  L1-icache-loads            ( +-   0.316% )  (scaled from 81.00%)
           26267361  L1-icache-load-misses      ( +-   0.210% )  (scaled from 81.00%)
             204143  L1-icache-prefetches       ( +-   1.291% )  (scaled from 81.01%)
      <not counted>  L1-icache-prefetch-misses
          814902708  iTLB-loads                 ( +-   0.315% )  (scaled from 80.99%)
              82082  iTLB-load-misses           ( +-   0.931% )  (scaled from 80.98%)

        0.205850655  seconds time elapsed   ( +-   0.333% )


 Performance counter stats for './hackbench/hackbench 10' (500 runs):

          817646684  L1-icache-loads            ( +-   0.150% )  (scaled from 80.99%)
           26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
             211864  L1-icache-prefetches       ( +-   0.616% )  (scaled from 80.99%)
      <not counted>  L1-icache-prefetch-misses
          817646737  iTLB-loads                 ( +-   0.151% )  (scaled from 80.98%)
              82368  iTLB-load-misses           ( +-   0.451% )  (scaled from 80.98%)

        0.206651959  seconds time elapsed   ( +-   0.152% )



--------------------------------------------------------------------------
the results for tip tree with the patch applied are:


 Performance counter stats for './hackbench/hackbench 10' (100 runs):

          959206624  L1-icache-loads            ( +-   0.320% )  (scaled from 80.98%)
           24322357  L1-icache-load-misses      ( +-   0.334% )  (scaled from 80.93%)
             177970  L1-icache-prefetches       ( +-   1.240% )  (scaled from 80.97%)
      <not counted>  L1-icache-prefetch-misses
          959349089  iTLB-loads                 ( +-   0.320% )  (scaled from 80.93%)
              85535  iTLB-load-misses           ( +-   1.329% )  (scaled from 80.92%)
          
        0.209696972  seconds time elapsed   ( +-   0.352% )
             
      
 Performance counter stats for './hackbench/hackbench 10' (500 runs):

          960162049  L1-icache-loads            ( +-   0.114% )  (scaled from 80.95%)
           24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
             179800  L1-icache-prefetches       ( +-   0.530% )  (scaled from 80.95%)
      <not counted>  L1-icache-prefetch-misses
          960352725  iTLB-loads                 ( +-   0.114% )  (scaled from 80.93%)
              84410  iTLB-load-misses           ( +-   0.491% )  (scaled from 80.92%)

        0.210509948  seconds time elapsed   ( +-   0.140% )


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-18 16:26             ` Jiri Olsa
@ 2011-02-19 14:14               ` Masami Hiramatsu
  2011-02-20 12:59                 ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Masami Hiramatsu @ 2011-02-19 14:14 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Ingo Molnar, H. Peter Anvin, ananth, davem, linux-kernel,
	Thomas Gleixner, Peter Zijlstra, Eric Dumazet, 2nddept-manager

(2011/02/19 1:26), Jiri Olsa wrote:
[...]
>> The only worry would be that if we move the syscall entry code out of the regular 
>> text section fragments the icache layout a tiny bit, possibly hurting performance.
>>
>> It's probably not measurable, but we need to measure it:
>>
>> Testing could be done of some syscall but also cache-intense workload, like 
>> 'hackbench 10', via perf 'stat --repeat 30' and have a very close look at 
>> instruction cache eviction differences.
>>
>> Perhaps also explicitly enable measure one of these:
>>
>>   L1-icache-loads                            [Hardware cache event]
>>   L1-icache-load-misses                      [Hardware cache event]
>>   L1-icache-prefetches                       [Hardware cache event]
>>   L1-icache-prefetch-misses                  [Hardware cache event]
>>
>>   iTLB-loads                                 [Hardware cache event]
>>   iTLB-load-misses                           [Hardware cache event]
>>
>> to see whether there's any statistically significant difference in icache/iTLB 
>> evictions, with and without the patch.
>>
>> If such stats are included in the changelog - even if just to show that any change 
>> is within measurement accuracy, it would make it easier to apply this change.
>>
>> Thanks,
>>
>> 	Ingo
> 
> 
> hi,
> 
> I have some results, but need help with interpretation.. ;)
> 
> I ran following command (with repeat 100 and 500)
> 
> perf stat --repeat 100 -e L1-icache-load -e L1-icache-load-misses -e
> L1-icache-prefetches -e L1-icache-prefetch-misses -e iTLB-loads -e
> iTLB-load-misses ./hackbench/hackbench 10
> 
> I can tell just the obvious:
> - the cache load count is higher for the patched kernel,
>   but the cache misses count is lower
> - patched kernel has also lower count of prefetches,
>   other counts are bigger for patched kernel
> 
> there's still some variability in counter values each time I run the perf

Thanks, I've also tested. (But my machine has no L1-icache-prefetches* support)
What I can tell is both of L1-icache-load and L1-icache-load-misses is
reduced by the patch. ;-)

Thank you,
--------------------------------------------------------------------------
the results for current tip tree are:

$ ./perf stat --repeat 100 -e L1-icache-load -e L1-icache-
load-misses -e iTLB-loads -e iTLB-load-misses hackbench 10

 Performance counter stats for 'hackbench 10' (100 runs):

        16,949,055 L1-icache-load             ( +-   0.303% )
         1,237,453 L1-icache-load-misses      ( +-   0.254% )
        40,000,357 iTLB-loads                 ( +-   0.257% )
            14,545 iTLB-load-misses           ( +-   0.306% )

        0.171622060  seconds time elapsed   ( +-   0.196% )

$ ./perf stat --repeat 500 -e L1-icache-load -e L1-icache-
load-misses -e iTLB-loads -e iTLB-load-misses hackbench 10

 Performance counter stats for 'hackbench 10' (500 runs):

        16,896,081 L1-icache-load             ( +-   0.146% )
         1,234,272 L1-icache-load-misses      ( +-   0.105% )
        39,850,899 iTLB-loads                 ( +-   0.116% )
            14,455 iTLB-load-misses           ( +-   0.119% )

        0.171901412  seconds time elapsed   ( +-   0.083% )

--------------------------------------------------------------------------
the results for tip tree with the patch applied are:

$ ./perf stat --repeat 100 -e L1-icache-load -e L1-icache-
load-misses -e iTLB-loads -e iTLB-load-misses hackbench 10

 Performance counter stats for 'hackbench 10' (100 runs):

        16,819,190 L1-icache-load             ( +-   0.288% )
         1,162,386 L1-icache-load-misses      ( +-   0.269% )
        40,020,154 iTLB-loads                 ( +-   0.254% )
            14,440 iTLB-load-misses           ( +-   0.220% )

        0.169014989  seconds time elapsed   ( +-   0.361% )

$ ./perf stat --repeat 500 -e L1-icache-load -e L1-icache-
load-misses -e iTLB-loads -e iTLB-load-misses hackbench 10

 Performance counter stats for 'hackbench 10' (500 runs):

        16,783,970 L1-icache-load             ( +-   0.144% )
         1,155,816 L1-icache-load-misses      ( +-   0.113% )
        39,958,292 iTLB-loads                 ( +-   0.122% )
            14,462 iTLB-load-misses           ( +-   0.138% )

        0.168279115  seconds time elapsed   ( +-   0.089% )


--------------------------------------------------------------------------
Here is an entry of the /proc/cpuinfo.

processor       : 0
vendor_id       : GenuineIntel
cpu family      : 6
model           : 26
model name      : Intel(R) Core(TM) i7 CPU         920  @ 2.67GHz
stepping        : 4
cpu MHz         : 2673.700
cache size      : 8192 KB
physical id     : 0
siblings        : 8
core id         : 0
cpu cores       : 4
apicid          : 0
initial apicid  : 0
fpu             : yes
fpu_exception   : yes
cpuid level     : 11
wp              : yes
flags           : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov
pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp lm
constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc
aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm sse4_1
sse4_2 popcnt lahf_lm ida dts tpr_shadow vnmi flexpriority ept vpid
bogomips        : 5347.40
clflush size    : 64
cache_alignment : 64
address sizes   : 36 bits physical, 48 bits virtual
power management:

-- 
Masami HIRAMATSU
2nd Dept. Linux Technology Center
Hitachi, Ltd., Systems Development Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-19 14:14               ` Masami Hiramatsu
@ 2011-02-20 12:59                 ` Ingo Molnar
  2011-02-21 11:54                   ` Jiri Olsa
  2011-02-21 14:25                   ` [PATCH 0/2] x86: separating entry text section + kprobes fix Jiri Olsa
  0 siblings, 2 replies; 26+ messages in thread
From: Ingo Molnar @ 2011-02-20 12:59 UTC (permalink / raw)
  To: Masami Hiramatsu, Arnaldo Carvalho de Melo,
	Frédéric Weisbecker
  Cc: Jiri Olsa, H. Peter Anvin, ananth, davem, linux-kernel,
	Thomas Gleixner, Peter Zijlstra, Eric Dumazet, 2nddept-manager


* Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> wrote:

> Thanks, I've also tested. (But my machine has no L1-icache-prefetches* support) 
> What I can tell is both of L1-icache-load and L1-icache-load-misses is reduced by 
> the patch. ;-)

That's actually a pretty interesting result: it means that compressing entry code 
into a single section compresses the icache footprint in a measurable way. The 
icache miss rate went down by about 6%:

>          1,234,272 L1-icache-load-misses      ( +-   0.105% )
>          1,155,816 L1-icache-load-misses      ( +-   0.113% )

Which, assuming that there's no kernel build and bootup related skew effect that is 
larger than 2-3% means that this is an improvement.

perf feature request: would be nice if it was able to do:

	perf stat --record ...
	perf diff

and it would show a comparison of the two runs.

In hindsight it makes sense: the patch probably reduced the fragmentation of the 
icache for this workload.

But it's still surprising :-)

Mind splitting the patch up into two parts, the first one that does the performance 
optimization intentionally (with numbers, explanation, etc.), the second one that 
uses the new section for kprobes exclusion?

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH] kprobes - do not allow optimized kprobes in entry code
  2011-02-20 12:59                 ` Ingo Molnar
@ 2011-02-21 11:54                   ` Jiri Olsa
  2011-02-21 14:25                   ` [PATCH 0/2] x86: separating entry text section + kprobes fix Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-02-21 11:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Masami Hiramatsu, Arnaldo Carvalho de Melo,
	Frédéric Weisbecker, H. Peter Anvin, ananth, davem,
	linux-kernel, Thomas Gleixner, Peter Zijlstra, Eric Dumazet,
	2nddept-manager

On Sun, Feb 20, 2011 at 01:59:48PM +0100, Ingo Molnar wrote:
> 
> * Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> wrote:
> 
> > Thanks, I've also tested. (But my machine has no L1-icache-prefetches* support) 
> > What I can tell is both of L1-icache-load and L1-icache-load-misses is reduced by 
> > the patch. ;-)
> 
> That's actually a pretty interesting result: it means that compressing entry code 
> into a single section compresses the icache footprint in a measurable way. The 
> icache miss rate went down by about 6%:
> 
> >          1,234,272 L1-icache-load-misses      ( +-   0.105% )
> >          1,155,816 L1-icache-load-misses      ( +-   0.113% )
> 
> Which, assuming that there's no kernel build and bootup related skew effect that is 
> larger than 2-3% means that this is an improvement.
> 
> perf feature request: would be nice if it was able to do:
> 
> 	perf stat --record ...
> 	perf diff
> 
> and it would show a comparison of the two runs.
> 
> In hindsight it makes sense: the patch probably reduced the fragmentation of the 
> icache for this workload.
> 
> But it's still surprising :-)
> 
> Mind splitting the patch up into two parts, the first one that does the performance 
> optimization intentionally (with numbers, explanation, etc.), the second one that 
> uses the new section for kprobes exclusion?
np, I'll send v2 shortly

thanks,
jirka

> 
> Thanks,
> 
> 	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 0/2] x86: separating entry text section + kprobes fix
  2011-02-20 12:59                 ` Ingo Molnar
  2011-02-21 11:54                   ` Jiri Olsa
@ 2011-02-21 14:25                   ` Jiri Olsa
  2011-02-21 14:25                     ` [PATCH 1/2] x86: separating entry text section Jiri Olsa
  2011-02-21 14:25                     ` [PATCH 2/2] kprobes: disabling optimized kprobes for " Jiri Olsa
  1 sibling, 2 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-02-21 14:25 UTC (permalink / raw)
  To: mingo
  Cc: masami.hiramatsu.pt, acme, fweisbec, hpa, ananth, davem,
	linux-kernel, tglx, a.p.zijlstra, eric.dumazet, 2nddept-manager

hi,

while fixing the optimized kprobes issue, we found out that separating
the entry text section might have performance benefits with regards to
the instruction cache usage.

attached patches:
1/2 - x86: separating entry text section
2/2 - kprobes: disabling optimized kprobes for entry text section

wbr,
jirka
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/kprobes.c         |    8 ++++++++
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 7 files changed, 26 insertions(+), 4 deletions(-)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 1/2] x86: separating entry text section
  2011-02-21 14:25                   ` [PATCH 0/2] x86: separating entry text section + kprobes fix Jiri Olsa
@ 2011-02-21 14:25                     ` Jiri Olsa
  2011-02-22  3:22                       ` Masami Hiramatsu
  2011-02-22  8:09                       ` Ingo Molnar
  2011-02-21 14:25                     ` [PATCH 2/2] kprobes: disabling optimized kprobes for " Jiri Olsa
  1 sibling, 2 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-02-21 14:25 UTC (permalink / raw)
  To: mingo
  Cc: masami.hiramatsu.pt, acme, fweisbec, hpa, ananth, davem,
	linux-kernel, tglx, a.p.zijlstra, eric.dumazet, 2nddept-manager,
	Jiri Olsa

Putting x86 entry code to the separate section: .entry.text.

Separating the entry text section seems to have performance
benefits with regards to the instruction cache usage.

Running hackbench showed that the change compresses the icache
footprint. The icache miss rate went down by about 8%:

before patch:
     26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)

after patch:
     24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)


Whole perf output follows.

- results for current tip tree:

 Performance counter stats for './hackbench/hackbench 10' (500 runs):

    817646684  L1-icache-loads            ( +-   0.150% )  (scaled from 80.99%)
     26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
       211864  L1-icache-prefetches       ( +-   0.616% )  (scaled from 80.99%)
<not counted>  L1-icache-prefetch-misses
    817646737  iTLB-loads                 ( +-   0.151% )  (scaled from 80.98%)
        82368  iTLB-load-misses           ( +-   0.451% )  (scaled from 80.98%)

  0.206651959  seconds time elapsed   ( +-   0.152% )


- results for current tip tree with the patch applied are:

 Performance counter stats for './hackbench/hackbench 10' (500 runs):

    960162049  L1-icache-loads            ( +-   0.114% )  (scaled from 80.95%)
     24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
       179800  L1-icache-prefetches       ( +-   0.530% )  (scaled from 80.95%)
<not counted>  L1-icache-prefetch-misses
    960352725  iTLB-loads                 ( +-   0.114% )  (scaled from 80.93%)
        84410  iTLB-load-misses           ( +-   0.491% )  (scaled from 80.92%)

  0.210509948  seconds time elapsed   ( +-   0.140% )


wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 6 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0ed7896..50f1630 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..f5accf8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .section .entry.text, "ax"
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 891268c..39f8d21 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e70cc3d..459dce2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
-- 
1.7.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* [PATCH 2/2] kprobes: disabling optimized kprobes for entry text section
  2011-02-21 14:25                   ` [PATCH 0/2] x86: separating entry text section + kprobes fix Jiri Olsa
  2011-02-21 14:25                     ` [PATCH 1/2] x86: separating entry text section Jiri Olsa
@ 2011-02-21 14:25                     ` Jiri Olsa
  2011-02-22  3:22                       ` Masami Hiramatsu
  2011-03-08 20:16                       ` [tip:perf/core] kprobes: Disabling " tip-bot for Jiri Olsa
  1 sibling, 2 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-02-21 14:25 UTC (permalink / raw)
  To: mingo
  Cc: masami.hiramatsu.pt, acme, fweisbec, hpa, ananth, davem,
	linux-kernel, tglx, a.p.zijlstra, eric.dumazet, 2nddept-manager,
	Jiri Olsa

You can crash the kernel using kprobe tracer by running:

echo "p system_call_after_swapgs" > ./kprobe_events
echo 1 > ./events/kprobes/enable

The reason is that at the system_call_after_swapgs label, the kernel
stack is not set up. If optimized kprobes are enabled, the user space
stack is being used in this case (see optimized kprobe template) and
this might result in a crash.

There are several places like this over the entry code (entry_$BIT).
As it seems there's no any reasonable/maintainable way to disable only
those places where the stack is not ready, I switched off the whole
entry code from kprobe optimizing.

wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/kernel/kprobes.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477..d03bc1e 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
 	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
 		return 0;
 
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long ) __entry_text_start) &&
+	    (paddr <  (unsigned long ) __entry_text_end))
+		return 0;
+
 	/* Check there is enough space for a relative jump. */
 	if (size - offset < RELATIVEJUMP_SIZE)
 		return 0;
-- 
1.7.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/2] kprobes: disabling optimized kprobes for entry text section
  2011-02-21 14:25                     ` [PATCH 2/2] kprobes: disabling optimized kprobes for " Jiri Olsa
@ 2011-02-22  3:22                       ` Masami Hiramatsu
  2011-03-08 20:16                       ` [tip:perf/core] kprobes: Disabling " tip-bot for Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: Masami Hiramatsu @ 2011-02-22  3:22 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: mingo, acme, fweisbec, hpa, ananth, davem, linux-kernel, tglx,
	a.p.zijlstra, eric.dumazet, 2nddept-manager, 2nddept-manager

(2011/02/21 23:25), Jiri Olsa wrote:
> You can crash the kernel using kprobe tracer by running:
> 
> echo "p system_call_after_swapgs" > ./kprobe_events
> echo 1 > ./events/kprobes/enable
> 
> The reason is that at the system_call_after_swapgs label, the kernel
> stack is not set up. If optimized kprobes are enabled, the user space
> stack is being used in this case (see optimized kprobe template) and
> this might result in a crash.
> 
> There are several places like this over the entry code (entry_$BIT).
> As it seems there's no any reasonable/maintainable way to disable only
> those places where the stack is not ready, I switched off the whole
> entry code from kprobe optimizing.

Thank you very much!

> 
> wbr,
> jirka
> 
> 
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>

> ---
>  arch/x86/kernel/kprobes.c |    8 ++++++++
>  1 files changed, 8 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
> index d91c477..d03bc1e 100644
> --- a/arch/x86/kernel/kprobes.c
> +++ b/arch/x86/kernel/kprobes.c
> @@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
>  	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
>  		return 0;
>  
> +	/*
> +	 * Do not optimize in the entry code due to the unstable
> +	 * stack handling.
> +	 */
> +	if ((paddr >= (unsigned long ) __entry_text_start) &&
> +	    (paddr <  (unsigned long ) __entry_text_end))
> +		return 0;
> +
>  	/* Check there is enough space for a relative jump. */
>  	if (size - offset < RELATIVEJUMP_SIZE)
>  		return 0;


-- 
Masami HIRAMATSU
2nd Dept. Linux Technology Center
Hitachi, Ltd., Systems Development Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-02-21 14:25                     ` [PATCH 1/2] x86: separating entry text section Jiri Olsa
@ 2011-02-22  3:22                       ` Masami Hiramatsu
  2011-02-22  8:09                       ` Ingo Molnar
  1 sibling, 0 replies; 26+ messages in thread
From: Masami Hiramatsu @ 2011-02-22  3:22 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: mingo, acme, fweisbec, hpa, ananth, davem, linux-kernel, tglx,
	a.p.zijlstra, eric.dumazet, 2nddept-manager, 2nddept-manager

(2011/02/21 23:25), Jiri Olsa wrote:
> Putting x86 entry code to the separate section: .entry.text.
> 
> Separating the entry text section seems to have performance
> benefits with regards to the instruction cache usage.
> 
> Running hackbench showed that the change compresses the icache
> footprint. The icache miss rate went down by about 8%:
> 
> before patch:
>      26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
> 
> after patch:
>      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
> 
> 
> Whole perf output follows.
> 
> - results for current tip tree:
> 
>  Performance counter stats for './hackbench/hackbench 10' (500 runs):
> 
>     817646684  L1-icache-loads            ( +-   0.150% )  (scaled from 80.99%)
>      26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
>        211864  L1-icache-prefetches       ( +-   0.616% )  (scaled from 80.99%)
> <not counted>  L1-icache-prefetch-misses
>     817646737  iTLB-loads                 ( +-   0.151% )  (scaled from 80.98%)
>         82368  iTLB-load-misses           ( +-   0.451% )  (scaled from 80.98%)
> 
>   0.206651959  seconds time elapsed   ( +-   0.152% )
> 
> 
> - results for current tip tree with the patch applied are:
> 
>  Performance counter stats for './hackbench/hackbench 10' (500 runs):
> 
>     960162049  L1-icache-loads            ( +-   0.114% )  (scaled from 80.95%)
>      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
>        179800  L1-icache-prefetches       ( +-   0.530% )  (scaled from 80.95%)
> <not counted>  L1-icache-prefetch-misses
>     960352725  iTLB-loads                 ( +-   0.114% )  (scaled from 80.93%)
>         84410  iTLB-load-misses           ( +-   0.491% )  (scaled from 80.92%)
> 
>   0.210509948  seconds time elapsed   ( +-   0.140% )
> 
> 
> wbr,
> jirka
> 
> 
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>

Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>

Thanks!

> ---
>  arch/x86/ia32/ia32entry.S         |    2 ++
>  arch/x86/kernel/entry_32.S        |    6 ++++--
>  arch/x86/kernel/entry_64.S        |    6 ++++--
>  arch/x86/kernel/vmlinux.lds.S     |    1 +
>  include/asm-generic/sections.h    |    1 +
>  include/asm-generic/vmlinux.lds.h |    6 ++++++
>  6 files changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 0ed7896..50f1630 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -25,6 +25,8 @@
>  #define sysretl_audit ia32_ret_from_sys_call
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
>  
>  	.macro IA32_ARG_FIXUP noebp=0
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index c8b4efa..f5accf8 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -65,6 +65,8 @@
>  #define sysexit_audit	syscall_exit_work
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  /*
>   * We use macros for low-level operations which need to be overridden
>   * for paravirtualization.  The following will never clobber any registers:
> @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
>   */
>  .section .init.rodata,"a"
>  ENTRY(interrupt)
> -.text
> +.section .entry.text, "ax"
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.long 1b
> -      .text
> +      .section .entry.text, "ax"
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 891268c..39f8d21 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -61,6 +61,8 @@
>  #define __AUDIT_ARCH_LE	   0x40000000
>  
>  	.code64
> +	.section .entry.text, "ax"
> +
>  #ifdef CONFIG_FUNCTION_TRACER
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  ENTRY(mcount)
> @@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
>   */
>  	.section .init.rodata,"a"
>  ENTRY(interrupt)
> -	.text
> +	.section .entry.text
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.quad 1b
> -      .text
> +      .section .entry.text
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index e70cc3d..459dce2 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -105,6 +105,7 @@ SECTIONS
>  		SCHED_TEXT
>  		LOCK_TEXT
>  		KPROBES_TEXT
> +		ENTRY_TEXT
>  		IRQENTRY_TEXT
>  		*(.fixup)
>  		*(.gnu.warning)
> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index b3bfabc..c1a1216 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
>  extern char _end[];
>  extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
>  extern char __kprobes_text_start[], __kprobes_text_end[];
> +extern char __entry_text_start[], __entry_text_end[];
>  extern char __initdata_begin[], __initdata_end[];
>  extern char __start_rodata[], __end_rodata[];
>  
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index fe77e33..906c3ce 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -424,6 +424,12 @@
>  		*(.kprobes.text)					\
>  		VMLINUX_SYMBOL(__kprobes_text_end) = .;
>  
> +#define ENTRY_TEXT							\
> +		ALIGN_FUNCTION();					\
> +		VMLINUX_SYMBOL(__entry_text_start) = .;			\
> +		*(.entry.text)						\
> +		VMLINUX_SYMBOL(__entry_text_end) = .;
> +
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>  #define IRQENTRY_TEXT							\
>  		ALIGN_FUNCTION();					\


-- 
Masami HIRAMATSU
2nd Dept. Linux Technology Center
Hitachi, Ltd., Systems Development Laboratory
E-mail: masami.hiramatsu.pt@hitachi.com

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-02-21 14:25                     ` [PATCH 1/2] x86: separating entry text section Jiri Olsa
  2011-02-22  3:22                       ` Masami Hiramatsu
@ 2011-02-22  8:09                       ` Ingo Molnar
  2011-02-22 12:52                         ` Jiri Olsa
  1 sibling, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2011-02-22  8:09 UTC (permalink / raw)
  To: Jiri Olsa, Arnaldo Carvalho de Melo,
	Frédéric Weisbecker, Peter Zijlstra
  Cc: masami.hiramatsu.pt, acme, fweisbec, hpa, ananth, davem,
	linux-kernel, tglx, a.p.zijlstra, eric.dumazet, 2nddept-manager


* Jiri Olsa <jolsa@redhat.com> wrote:

> Putting x86 entry code to the separate section: .entry.text.

Trying to apply your patch i noticed one detail:

> before patch:
>      26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
>   0.206651959  seconds time elapsed   ( +-   0.152% )
> 
> after patch:
>      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
>   0.210509948  seconds time elapsed   ( +-   0.140% )

So time elapsed actually went up.

hackbench is notoriously unstable when it comes to runtime - and increasing the 
--repeat value only has limited effects on that.

Dropping all system caches:

   echo 1 > /proc/sys/vm/drop_caches

Seems to do a better job of 'resetting' system state, but if we put that into the 
measured workload then the results are all over the place (as we now depend on IO 
being done):

 # cat hb10

 echo 1 > /proc/sys/vm/drop_caches
 ./hackbench 10

 # perf stat --repeat 3 ./hb10

 Time: 0.097
 Time: 0.095
 Time: 0.101

 Performance counter stats for './hb10' (3 runs):

         21.351257 task-clock-msecs         #      0.044 CPUs    ( +-  27.165% )
                 6 context-switches         #      0.000 M/sec   ( +-  34.694% )
                 1 CPU-migrations           #      0.000 M/sec   ( +-  25.000% )
               410 page-faults              #      0.019 M/sec   ( +-   0.081% )
        25,407,650 cycles                   #   1189.984 M/sec   ( +-  49.154% )
        25,407,650 instructions             #      1.000 IPC     ( +-  49.154% )
         5,126,580 branches                 #    240.107 M/sec   ( +-  46.012% )
           192,272 branch-misses            #      3.750 %       ( +-  44.911% )
           901,701 cache-references         #     42.232 M/sec   ( +-  12.857% )
           802,767 cache-misses             #     37.598 M/sec   ( +-   9.282% )

        0.483297792  seconds time elapsed   ( +-  31.152% )

So here's a perf stat feature suggestion to solve such measurement problems: a new 
'pre-run' 'dry' command could be specified that is executed before the real 'hot' 
run is executed. Something like this:

  perf stat --pre-run-script ./hb10 --repeat 10 ./hackbench 10

Would do the cache-clearing before each run, it would run hackbench once (dry run) 
and then would run hackbench 10 for real - and would repeat the whole thing 10 
times. Only the 'hot' portion of the run would be measured and displayed in the perf 
stat output event counts.

Another observation:

>      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)

Could you please do runs that do not display 'scaled from' messages? Since we are 
measuring a relatively small effect here, and scaling adds noise, it would be nice 
to ensure that the effect persists with non-scaled events as well:

You can do that by reducing the number of events that are measured. The PMU can not 
measure all those L1 cache events you listed - so only use the most important one 
and add cycles and instructions to make sure the measurements are comparable:

  -e L1-icache-load-misses -e instructions -e cycles

Btw., there's another 'perf stat' feature suggestion: it would be nice if it was 
possible to 'record' a perf stat run, and do a 'perf diff' over it. That would 
compare the two runs all automatically, without you having to do the comparison 
manually.

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-02-22  8:09                       ` Ingo Molnar
@ 2011-02-22 12:52                         ` Jiri Olsa
  2011-03-07 10:44                           ` Jiri Olsa
  0 siblings, 1 reply; 26+ messages in thread
From: Jiri Olsa @ 2011-02-22 12:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Frédéric Weisbecker,
	Peter Zijlstra, masami.hiramatsu.pt, hpa, ananth, davem,
	linux-kernel, tglx, eric.dumazet, 2nddept-manager

On Tue, Feb 22, 2011 at 09:09:34AM +0100, Ingo Molnar wrote:
> 
> * Jiri Olsa <jolsa@redhat.com> wrote:
> 
> > Putting x86 entry code to the separate section: .entry.text.
> 
> Trying to apply your patch i noticed one detail:
> 
> > before patch:
> >      26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
> >   0.206651959  seconds time elapsed   ( +-   0.152% )
> > 
> > after patch:
> >      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
> >   0.210509948  seconds time elapsed   ( +-   0.140% )
> 
> So time elapsed actually went up.
> 
> hackbench is notoriously unstable when it comes to runtime - and increasing the 
> --repeat value only has limited effects on that.
> 
> Dropping all system caches:
> 
>    echo 1 > /proc/sys/vm/drop_caches
> 
> Seems to do a better job of 'resetting' system state, but if we put that into the 
> measured workload then the results are all over the place (as we now depend on IO 
> being done):
> 
>  # cat hb10
> 
>  echo 1 > /proc/sys/vm/drop_caches
>  ./hackbench 10
> 
>  # perf stat --repeat 3 ./hb10
> 
>  Time: 0.097
>  Time: 0.095
>  Time: 0.101
> 
>  Performance counter stats for './hb10' (3 runs):
> 
>          21.351257 task-clock-msecs         #      0.044 CPUs    ( +-  27.165% )
>                  6 context-switches         #      0.000 M/sec   ( +-  34.694% )
>                  1 CPU-migrations           #      0.000 M/sec   ( +-  25.000% )
>                410 page-faults              #      0.019 M/sec   ( +-   0.081% )
>         25,407,650 cycles                   #   1189.984 M/sec   ( +-  49.154% )
>         25,407,650 instructions             #      1.000 IPC     ( +-  49.154% )
>          5,126,580 branches                 #    240.107 M/sec   ( +-  46.012% )
>            192,272 branch-misses            #      3.750 %       ( +-  44.911% )
>            901,701 cache-references         #     42.232 M/sec   ( +-  12.857% )
>            802,767 cache-misses             #     37.598 M/sec   ( +-   9.282% )
> 
>         0.483297792  seconds time elapsed   ( +-  31.152% )
> 
> So here's a perf stat feature suggestion to solve such measurement problems: a new 
> 'pre-run' 'dry' command could be specified that is executed before the real 'hot' 
> run is executed. Something like this:
> 
>   perf stat --pre-run-script ./hb10 --repeat 10 ./hackbench 10
> 
> Would do the cache-clearing before each run, it would run hackbench once (dry run) 
> and then would run hackbench 10 for real - and would repeat the whole thing 10 
> times. Only the 'hot' portion of the run would be measured and displayed in the perf 
> stat output event counts.
> 
> Another observation:
> 
> >      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
> 
> Could you please do runs that do not display 'scaled from' messages? Since we are 
> measuring a relatively small effect here, and scaling adds noise, it would be nice 
> to ensure that the effect persists with non-scaled events as well:
> 
> You can do that by reducing the number of events that are measured. The PMU can not 
> measure all those L1 cache events you listed - so only use the most important one 
> and add cycles and instructions to make sure the measurements are comparable:
> 
>   -e L1-icache-load-misses -e instructions -e cycles
> 
> Btw., there's another 'perf stat' feature suggestion: it would be nice if it was 
> possible to 'record' a perf stat run, and do a 'perf diff' over it. That would 
> compare the two runs all automatically, without you having to do the comparison 
> manually.

hi,

I made another test with "reseting" the system state as suggested and
only for cache-misses together with instructions and cycles events.

I can see even bigger drop of icache load misses than before
from 19359739 to 16448709 (about 15%).

The instruction/cycles count is slightly bigger in the patched
kernel run though..

perf stat --repeat 100  -e L1-icache-load-misses -e instructions -e cycles ./hackbench/hackbench 10

-------------------------------------------------------------------------------
before patch:

 Performance counter stats for './hackbench/hackbench 10' (100 runs):

           19359739  L1-icache-load-misses      ( +-   0.313% )
         2667528936  instructions             #      0.498 IPC     ( +- 0.165% )
         5352849800  cycles                     ( +-   0.303% )

        0.205402048  seconds time elapsed   ( +-   0.299% )

 Performance counter stats for './hackbench/hackbench 10' (500 runs):

           19417627  L1-icache-load-misses      ( +-   0.147% )
         2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
         5389516026  cycles                     ( +-   0.144% )

        0.206267711  seconds time elapsed   ( +-   0.138% )


-------------------------------------------------------------------------------
after patch:

 Performance counter stats for './hackbench/hackbench 10' (100 runs):

           16448709  L1-icache-load-misses      ( +-   0.426% )
         2698406306  instructions             #      0.500 IPC     ( +- 0.177% )
         5393976267  cycles                     ( +-   0.321% )

        0.206072845  seconds time elapsed   ( +-   0.276% )

 Performance counter stats for './hackbench/hackbench 10' (500 runs):

           16490788  L1-icache-load-misses      ( +-   0.180% )
         2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
         5414756975  cycles                     ( +-   0.148% )

        0.206747566  seconds time elapsed   ( +-   0.137% )


Attaching patch with above numbers in comment.

thanks,
jirka


---
Putting x86 entry code to the separate section: .entry.text.

Separating the entry text section seems to have performance
benefits with regards to the instruction cache usage.

Running hackbench showed that the change compresses the icache
footprint. The icache load miss rate went down by about 15%:

before patch:
         19417627  L1-icache-load-misses      ( +-   0.147% )

after patch:
         16490788  L1-icache-load-misses      ( +-   0.180% )


Whole perf output follows.

- results for current tip tree:
  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         19417627  L1-icache-load-misses      ( +-   0.147% )
       2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
       5389516026  cycles                     ( +-   0.144% )

      0.206267711  seconds time elapsed   ( +-   0.138% )

- results for current tip tree with the patch applied are:
  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         16490788  L1-icache-load-misses      ( +-   0.180% )
       2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
       5414756975  cycles                     ( +-   0.148% )

      0.206747566  seconds time elapsed   ( +-   0.137% )


wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 6 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0ed7896..50f1630 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..f5accf8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .section .entry.text, "ax"
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 891268c..39f8d21 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e70cc3d..459dce2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
-- 
1.7.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-02-22 12:52                         ` Jiri Olsa
@ 2011-03-07 10:44                           ` Jiri Olsa
  2011-03-07 15:29                             ` Ingo Molnar
  0 siblings, 1 reply; 26+ messages in thread
From: Jiri Olsa @ 2011-03-07 10:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Frédéric Weisbecker,
	Peter Zijlstra, masami.hiramatsu.pt, hpa, ananth, davem,
	linux-kernel, tglx, eric.dumazet, 2nddept-manager

hi,
any feedback?

thanks,
jirka

On Tue, Feb 22, 2011 at 01:52:01PM +0100, Jiri Olsa wrote:
> On Tue, Feb 22, 2011 at 09:09:34AM +0100, Ingo Molnar wrote:
> > 
> > * Jiri Olsa <jolsa@redhat.com> wrote:
> > 
> > > Putting x86 entry code to the separate section: .entry.text.
> > 
> > Trying to apply your patch i noticed one detail:
> > 
> > > before patch:
> > >      26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
> > >   0.206651959  seconds time elapsed   ( +-   0.152% )
> > > 
> > > after patch:
> > >      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
> > >   0.210509948  seconds time elapsed   ( +-   0.140% )
> > 
> > So time elapsed actually went up.
> > 
> > hackbench is notoriously unstable when it comes to runtime - and increasing the 
> > --repeat value only has limited effects on that.
> > 
> > Dropping all system caches:
> > 
> >    echo 1 > /proc/sys/vm/drop_caches
> > 
> > Seems to do a better job of 'resetting' system state, but if we put that into the 
> > measured workload then the results are all over the place (as we now depend on IO 
> > being done):
> > 
> >  # cat hb10
> > 
> >  echo 1 > /proc/sys/vm/drop_caches
> >  ./hackbench 10
> > 
> >  # perf stat --repeat 3 ./hb10
> > 
> >  Time: 0.097
> >  Time: 0.095
> >  Time: 0.101
> > 
> >  Performance counter stats for './hb10' (3 runs):
> > 
> >          21.351257 task-clock-msecs         #      0.044 CPUs    ( +-  27.165% )
> >                  6 context-switches         #      0.000 M/sec   ( +-  34.694% )
> >                  1 CPU-migrations           #      0.000 M/sec   ( +-  25.000% )
> >                410 page-faults              #      0.019 M/sec   ( +-   0.081% )
> >         25,407,650 cycles                   #   1189.984 M/sec   ( +-  49.154% )
> >         25,407,650 instructions             #      1.000 IPC     ( +-  49.154% )
> >          5,126,580 branches                 #    240.107 M/sec   ( +-  46.012% )
> >            192,272 branch-misses            #      3.750 %       ( +-  44.911% )
> >            901,701 cache-references         #     42.232 M/sec   ( +-  12.857% )
> >            802,767 cache-misses             #     37.598 M/sec   ( +-   9.282% )
> > 
> >         0.483297792  seconds time elapsed   ( +-  31.152% )
> > 
> > So here's a perf stat feature suggestion to solve such measurement problems: a new 
> > 'pre-run' 'dry' command could be specified that is executed before the real 'hot' 
> > run is executed. Something like this:
> > 
> >   perf stat --pre-run-script ./hb10 --repeat 10 ./hackbench 10
> > 
> > Would do the cache-clearing before each run, it would run hackbench once (dry run) 
> > and then would run hackbench 10 for real - and would repeat the whole thing 10 
> > times. Only the 'hot' portion of the run would be measured and displayed in the perf 
> > stat output event counts.
> > 
> > Another observation:
> > 
> > >      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
> > 
> > Could you please do runs that do not display 'scaled from' messages? Since we are 
> > measuring a relatively small effect here, and scaling adds noise, it would be nice 
> > to ensure that the effect persists with non-scaled events as well:
> > 
> > You can do that by reducing the number of events that are measured. The PMU can not 
> > measure all those L1 cache events you listed - so only use the most important one 
> > and add cycles and instructions to make sure the measurements are comparable:
> > 
> >   -e L1-icache-load-misses -e instructions -e cycles
> > 
> > Btw., there's another 'perf stat' feature suggestion: it would be nice if it was 
> > possible to 'record' a perf stat run, and do a 'perf diff' over it. That would 
> > compare the two runs all automatically, without you having to do the comparison 
> > manually.
> 
> hi,
> 
> I made another test with "reseting" the system state as suggested and
> only for cache-misses together with instructions and cycles events.
> 
> I can see even bigger drop of icache load misses than before
> from 19359739 to 16448709 (about 15%).
> 
> The instruction/cycles count is slightly bigger in the patched
> kernel run though..
> 
> perf stat --repeat 100  -e L1-icache-load-misses -e instructions -e cycles ./hackbench/hackbench 10
> 
> -------------------------------------------------------------------------------
> before patch:
> 
>  Performance counter stats for './hackbench/hackbench 10' (100 runs):
> 
>            19359739  L1-icache-load-misses      ( +-   0.313% )
>          2667528936  instructions             #      0.498 IPC     ( +- 0.165% )
>          5352849800  cycles                     ( +-   0.303% )
> 
>         0.205402048  seconds time elapsed   ( +-   0.299% )
> 
>  Performance counter stats for './hackbench/hackbench 10' (500 runs):
> 
>            19417627  L1-icache-load-misses      ( +-   0.147% )
>          2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
>          5389516026  cycles                     ( +-   0.144% )
> 
>         0.206267711  seconds time elapsed   ( +-   0.138% )
> 
> 
> -------------------------------------------------------------------------------
> after patch:
> 
>  Performance counter stats for './hackbench/hackbench 10' (100 runs):
> 
>            16448709  L1-icache-load-misses      ( +-   0.426% )
>          2698406306  instructions             #      0.500 IPC     ( +- 0.177% )
>          5393976267  cycles                     ( +-   0.321% )
> 
>         0.206072845  seconds time elapsed   ( +-   0.276% )
> 
>  Performance counter stats for './hackbench/hackbench 10' (500 runs):
> 
>            16490788  L1-icache-load-misses      ( +-   0.180% )
>          2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
>          5414756975  cycles                     ( +-   0.148% )
> 
>         0.206747566  seconds time elapsed   ( +-   0.137% )
> 
> 
> Attaching patch with above numbers in comment.
> 
> thanks,
> jirka
> 
> 
> ---
> Putting x86 entry code to the separate section: .entry.text.
> 
> Separating the entry text section seems to have performance
> benefits with regards to the instruction cache usage.
> 
> Running hackbench showed that the change compresses the icache
> footprint. The icache load miss rate went down by about 15%:
> 
> before patch:
>          19417627  L1-icache-load-misses      ( +-   0.147% )
> 
> after patch:
>          16490788  L1-icache-load-misses      ( +-   0.180% )
> 
> 
> Whole perf output follows.
> 
> - results for current tip tree:
>   Performance counter stats for './hackbench/hackbench 10' (500 runs):
> 
>          19417627  L1-icache-load-misses      ( +-   0.147% )
>        2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
>        5389516026  cycles                     ( +-   0.144% )
> 
>       0.206267711  seconds time elapsed   ( +-   0.138% )
> 
> - results for current tip tree with the patch applied are:
>   Performance counter stats for './hackbench/hackbench 10' (500 runs):
> 
>          16490788  L1-icache-load-misses      ( +-   0.180% )
>        2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
>        5414756975  cycles                     ( +-   0.148% )
> 
>       0.206747566  seconds time elapsed   ( +-   0.137% )
> 
> 
> wbr,
> jirka
> 
> 
> Signed-off-by: Jiri Olsa <jolsa@redhat.com>
> ---
>  arch/x86/ia32/ia32entry.S         |    2 ++
>  arch/x86/kernel/entry_32.S        |    6 ++++--
>  arch/x86/kernel/entry_64.S        |    6 ++++--
>  arch/x86/kernel/vmlinux.lds.S     |    1 +
>  include/asm-generic/sections.h    |    1 +
>  include/asm-generic/vmlinux.lds.h |    6 ++++++
>  6 files changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
> index 0ed7896..50f1630 100644
> --- a/arch/x86/ia32/ia32entry.S
> +++ b/arch/x86/ia32/ia32entry.S
> @@ -25,6 +25,8 @@
>  #define sysretl_audit ia32_ret_from_sys_call
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
>  
>  	.macro IA32_ARG_FIXUP noebp=0
> diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
> index c8b4efa..f5accf8 100644
> --- a/arch/x86/kernel/entry_32.S
> +++ b/arch/x86/kernel/entry_32.S
> @@ -65,6 +65,8 @@
>  #define sysexit_audit	syscall_exit_work
>  #endif
>  
> +	.section .entry.text, "ax"
> +
>  /*
>   * We use macros for low-level operations which need to be overridden
>   * for paravirtualization.  The following will never clobber any registers:
> @@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
>   */
>  .section .init.rodata,"a"
>  ENTRY(interrupt)
> -.text
> +.section .entry.text, "ax"
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.long 1b
> -      .text
> +      .section .entry.text, "ax"
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
> index 891268c..39f8d21 100644
> --- a/arch/x86/kernel/entry_64.S
> +++ b/arch/x86/kernel/entry_64.S
> @@ -61,6 +61,8 @@
>  #define __AUDIT_ARCH_LE	   0x40000000
>  
>  	.code64
> +	.section .entry.text, "ax"
> +
>  #ifdef CONFIG_FUNCTION_TRACER
>  #ifdef CONFIG_DYNAMIC_FTRACE
>  ENTRY(mcount)
> @@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
>   */
>  	.section .init.rodata,"a"
>  ENTRY(interrupt)
> -	.text
> +	.section .entry.text
>  	.p2align 5
>  	.p2align CONFIG_X86_L1_CACHE_SHIFT
>  ENTRY(irq_entries_start)
> @@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
>        .endif
>        .previous
>  	.quad 1b
> -      .text
> +      .section .entry.text
>  vector=vector+1
>      .endif
>    .endr
> diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
> index e70cc3d..459dce2 100644
> --- a/arch/x86/kernel/vmlinux.lds.S
> +++ b/arch/x86/kernel/vmlinux.lds.S
> @@ -105,6 +105,7 @@ SECTIONS
>  		SCHED_TEXT
>  		LOCK_TEXT
>  		KPROBES_TEXT
> +		ENTRY_TEXT
>  		IRQENTRY_TEXT
>  		*(.fixup)
>  		*(.gnu.warning)
> diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
> index b3bfabc..c1a1216 100644
> --- a/include/asm-generic/sections.h
> +++ b/include/asm-generic/sections.h
> @@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
>  extern char _end[];
>  extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
>  extern char __kprobes_text_start[], __kprobes_text_end[];
> +extern char __entry_text_start[], __entry_text_end[];
>  extern char __initdata_begin[], __initdata_end[];
>  extern char __start_rodata[], __end_rodata[];
>  
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index fe77e33..906c3ce 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -424,6 +424,12 @@
>  		*(.kprobes.text)					\
>  		VMLINUX_SYMBOL(__kprobes_text_end) = .;
>  
> +#define ENTRY_TEXT							\
> +		ALIGN_FUNCTION();					\
> +		VMLINUX_SYMBOL(__entry_text_start) = .;			\
> +		*(.entry.text)						\
> +		VMLINUX_SYMBOL(__entry_text_end) = .;
> +
>  #ifdef CONFIG_FUNCTION_GRAPH_TRACER
>  #define IRQENTRY_TEXT							\
>  		ALIGN_FUNCTION();					\
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-03-07 10:44                           ` Jiri Olsa
@ 2011-03-07 15:29                             ` Ingo Molnar
  2011-03-07 18:10                               ` Jiri Olsa
  0 siblings, 1 reply; 26+ messages in thread
From: Ingo Molnar @ 2011-03-07 15:29 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Arnaldo Carvalho de Melo, Frédéric Weisbecker,
	Peter Zijlstra, masami.hiramatsu.pt, hpa, ananth, davem,
	linux-kernel, tglx, eric.dumazet, 2nddept-manager


* Jiri Olsa <jolsa@redhat.com> wrote:

> hi,
> any feedback?

Did you get my reply below?

Thanks,

	Ingo

----- Forwarded message from Ingo Molnar <mingo@elte.hu> -----

Date: Tue, 22 Feb 2011 09:09:34 +0100
From: Ingo Molnar <mingo@elte.hu>
To: Jiri Olsa <jolsa@redhat.com>,
	Arnaldo Carvalho de Melo <acme@redhat.com>,
	Frédéric Weisbecker <fweisbec@gmail.com>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: masami.hiramatsu.pt@hitachi.com, acme@redhat.com, fweisbec@gmail.com,
	hpa@zytor.com, ananth@in.ibm.com, davem@davemloft.net,
	linux-kernel@vger.kernel.org, tglx@linutronix.de,
	a.p.zijlstra@chello.nl, eric.dumazet@gmail.com,
	2nddept-manager@sdl.hitachi.co.jp
Subject: Re: [PATCH 1/2] x86: separating entry text section


* Jiri Olsa <jolsa@redhat.com> wrote:

> Putting x86 entry code to the separate section: .entry.text.

Trying to apply your patch i noticed one detail:

> before patch:
>      26282174  L1-icache-load-misses      ( +-   0.099% )  (scaled from 81.00%)
>   0.206651959  seconds time elapsed   ( +-   0.152% )
> 
> after patch:
>      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)
>   0.210509948  seconds time elapsed   ( +-   0.140% )

So time elapsed actually went up.

hackbench is notoriously unstable when it comes to runtime - and increasing the 
--repeat value only has limited effects on that.

Dropping all system caches:

   echo 1 > /proc/sys/vm/drop_caches

Seems to do a better job of 'resetting' system state, but if we put that into the 
measured workload then the results are all over the place (as we now depend on IO 
being done):

 # cat hb10

 echo 1 > /proc/sys/vm/drop_caches
 ./hackbench 10

 # perf stat --repeat 3 ./hb10

 Time: 0.097
 Time: 0.095
 Time: 0.101

 Performance counter stats for './hb10' (3 runs):

         21.351257 task-clock-msecs         #      0.044 CPUs    ( +-  27.165% )
                 6 context-switches         #      0.000 M/sec   ( +-  34.694% )
                 1 CPU-migrations           #      0.000 M/sec   ( +-  25.000% )
               410 page-faults              #      0.019 M/sec   ( +-   0.081% )
        25,407,650 cycles                   #   1189.984 M/sec   ( +-  49.154% )
        25,407,650 instructions             #      1.000 IPC     ( +-  49.154% )
         5,126,580 branches                 #    240.107 M/sec   ( +-  46.012% )
           192,272 branch-misses            #      3.750 %       ( +-  44.911% )
           901,701 cache-references         #     42.232 M/sec   ( +-  12.857% )
           802,767 cache-misses             #     37.598 M/sec   ( +-   9.282% )

        0.483297792  seconds time elapsed   ( +-  31.152% )

So here's a perf stat feature suggestion to solve such measurement problems: a new 
'pre-run' 'dry' command could be specified that is executed before the real 'hot' 
run is executed. Something like this:

  perf stat --pre-run-script ./hb10 --repeat 10 ./hackbench 10

Would do the cache-clearing before each run, it would run hackbench once (dry run) 
and then would run hackbench 10 for real - and would repeat the whole thing 10 
times. Only the 'hot' portion of the run would be measured and displayed in the perf 
stat output event counts.

Another observation:

>      24237651  L1-icache-load-misses      ( +-   0.117% )  (scaled from 80.96%)

Could you please do runs that do not display 'scaled from' messages? Since we are 
measuring a relatively small effect here, and scaling adds noise, it would be nice 
to ensure that the effect persists with non-scaled events as well:

You can do that by reducing the number of events that are measured. The PMU can not 
measure all those L1 cache events you listed - so only use the most important one 
and add cycles and instructions to make sure the measurements are comparable:

  -e L1-icache-load-misses -e instructions -e cycles

Btw., there's another 'perf stat' feature suggestion: it would be nice if it was 
possible to 'record' a perf stat run, and do a 'perf diff' over it. That would 
compare the two runs all automatically, without you having to do the comparison 
manually.

Thanks,

	Ingo


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-03-07 15:29                             ` Ingo Molnar
@ 2011-03-07 18:10                               ` Jiri Olsa
  2011-03-08 16:15                                 ` Ingo Molnar
  2011-03-08 20:15                                 ` [tip:perf/core] x86: Separate out " tip-bot for Jiri Olsa
  0 siblings, 2 replies; 26+ messages in thread
From: Jiri Olsa @ 2011-03-07 18:10 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arnaldo Carvalho de Melo, Frédéric Weisbecker,
	Peter Zijlstra, masami.hiramatsu.pt, hpa, ananth, davem,
	linux-kernel, tglx, eric.dumazet, 2nddept-manager

On Mon, Mar 07, 2011 at 04:29:21PM +0100, Ingo Molnar wrote:
> 
> * Jiri Olsa <jolsa@redhat.com> wrote:
> 
> > hi,
> > any feedback?
> 
> Did you get my reply below?
> 
> Thanks,
> 
> 	Ingo
> 

hi,

strange, I believe I did reply on this.. got missed
somehow or I'm missing simething :) reattached..

thanks,
jirka


---
hi,

I made another test with "reseting" the system state as suggested and
only for cache-misses together with instructions and cycles events.

I can see even bigger drop of icache load misses than before
from 19359739 to 16448709 (about 15%).

The instruction/cycles count is slightly bigger in the patched
kernel run though..

perf stat --repeat 100  -e L1-icache-load-misses -e instructions -e cycles ./hackbench/hackbench 10

-------------------------------------------------------------------------------
before patch:

 Performance counter stats for './hackbench/hackbench 10' (100 runs):

           19359739  L1-icache-load-misses      ( +-   0.313% )
         2667528936  instructions             #      0.498 IPC     ( +- 0.165% )
         5352849800  cycles                     ( +-   0.303% )

        0.205402048  seconds time elapsed   ( +-   0.299% )

 Performance counter stats for './hackbench/hackbench 10' (500 runs):

           19417627  L1-icache-load-misses      ( +-   0.147% )
         2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
         5389516026  cycles                     ( +-   0.144% )

        0.206267711  seconds time elapsed   ( +-   0.138% )


-------------------------------------------------------------------------------
after patch:

 Performance counter stats for './hackbench/hackbench 10' (100 runs):

           16448709  L1-icache-load-misses      ( +-   0.426% )
         2698406306  instructions             #      0.500 IPC     ( +- 0.177% )
         5393976267  cycles                     ( +-   0.321% )

        0.206072845  seconds time elapsed   ( +-   0.276% )

 Performance counter stats for './hackbench/hackbench 10' (500 runs):

           16490788  L1-icache-load-misses      ( +-   0.180% )
         2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
         5414756975  cycles                     ( +-   0.148% )

        0.206747566  seconds time elapsed   ( +-   0.137% )


Attaching patch with above numbers in comment.

thanks,
jirka


---
Putting x86 entry code to the separate section: .entry.text.

Separating the entry text section seems to have performance
benefits with regards to the instruction cache usage.

Running hackbench showed that the change compresses the icache
footprint. The icache load miss rate went down by about 15%:

before patch:
         19417627  L1-icache-load-misses      ( +-   0.147% )

after patch:
         16490788  L1-icache-load-misses      ( +-   0.180% )


Whole perf output follows.

- results for current tip tree:
  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         19417627  L1-icache-load-misses      ( +-   0.147% )
       2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
       5389516026  cycles                     ( +-   0.144% )

      0.206267711  seconds time elapsed   ( +-   0.138% )

- results for current tip tree with the patch applied are:
  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         16490788  L1-icache-load-misses      ( +-   0.180% )
       2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
       5414756975  cycles                     ( +-   0.148% )

      0.206747566  seconds time elapsed   ( +-   0.137% )


wbr,
jirka


Signed-off-by: Jiri Olsa <jolsa@redhat.com>
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 6 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 0ed7896..50f1630 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..f5accf8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .section .entry.text, "ax"
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 891268c..39f8d21 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index e70cc3d..459dce2 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\
-- 
1.7.1


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/2] x86: separating entry text section
  2011-03-07 18:10                               ` Jiri Olsa
@ 2011-03-08 16:15                                 ` Ingo Molnar
  2011-03-08 20:15                                 ` [tip:perf/core] x86: Separate out " tip-bot for Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: Ingo Molnar @ 2011-03-08 16:15 UTC (permalink / raw)
  To: Jiri Olsa
  Cc: Arnaldo Carvalho de Melo, Frédéric Weisbecker,
	Peter Zijlstra, masami.hiramatsu.pt, hpa, ananth, davem,
	linux-kernel, tglx, eric.dumazet, 2nddept-manager


* Jiri Olsa <jolsa@redhat.com> wrote:

> On Mon, Mar 07, 2011 at 04:29:21PM +0100, Ingo Molnar wrote:
> > 
> > * Jiri Olsa <jolsa@redhat.com> wrote:
> > 
> > > hi,
> > > any feedback?
> > 
> > Did you get my reply below?
> > 
> > Thanks,
> > 
> > 	Ingo
> > 
> 
> hi,
> 
> strange, I believe I did reply on this.. got missed
> somehow or I'm missing simething :) reattached..

hm, the direct mail to me got lost, but i found your reply in the lkml folder. 
That's how i missed your reply.

> I made another test with "reseting" the system state as suggested and only for 
> cache-misses together with instructions and cycles events.
> 
> I can see even bigger drop of icache load misses than before
> from 19359739 to 16448709 (about 15%).
> 
> The instruction/cycles count is slightly bigger in the patched
> kernel run though..

Ok, elapsed time is more stable, it went from:

>         0.205402048  seconds time elapsed   ( +-   0.299% )
>         0.206267711  seconds time elapsed   ( +-   0.138% )

to:

>         0.206072845  seconds time elapsed   ( +-   0.276% )
>         0.206747566  seconds time elapsed   ( +-   0.137% )

which is 0.1% slower, within the noise of the measurement.

Will apply the patches, thanks Jiri!

Thanks,

	Ingo

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [tip:perf/core] x86: Separate out entry text section
  2011-03-07 18:10                               ` Jiri Olsa
  2011-03-08 16:15                                 ` Ingo Molnar
@ 2011-03-08 20:15                                 ` tip-bot for Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: tip-bot for Jiri Olsa @ 2011-03-08 20:15 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, acme, hpa, mingo, eric.dumazet, torvalds,
	a.p.zijlstra, jolsa, npiggin, fweisbec, akpm, tglx, mingo

Commit-ID:  ea7145477a461e09d8d194cac4b996dc4f449107
Gitweb:     http://git.kernel.org/tip/ea7145477a461e09d8d194cac4b996dc4f449107
Author:     Jiri Olsa <jolsa@redhat.com>
AuthorDate: Mon, 7 Mar 2011 19:10:39 +0100
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Tue, 8 Mar 2011 17:22:11 +0100

x86: Separate out entry text section

Put x86 entry code into a separate link section: .entry.text.

Separating the entry text section seems to have performance
benefits - caused by more efficient instruction cache usage.

Running hackbench with perf stat --repeat showed that the change
compresses the icache footprint. The icache load miss rate went
down by about 15%:

 before patch:
         19417627  L1-icache-load-misses      ( +-   0.147% )

 after patch:
         16490788  L1-icache-load-misses      ( +-   0.180% )

The motivation of the patch was to fix a particular kprobes
bug that relates to the entry text section, the performance
advantage was discovered accidentally.

Whole perf output follows:

 - results for current tip tree:

  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         19417627  L1-icache-load-misses      ( +-   0.147% )
       2676914223  instructions             #      0.497 IPC     ( +- 0.079% )
       5389516026  cycles                     ( +-   0.144% )

      0.206267711  seconds time elapsed   ( +-   0.138% )

 - results for current tip tree with the patch applied:

  Performance counter stats for './hackbench/hackbench 10' (500 runs):

         16490788  L1-icache-load-misses      ( +-   0.180% )
       2717734941  instructions             #      0.502 IPC     ( +- 0.079% )
       5414756975  cycles                     ( +-   0.148% )

      0.206747566  seconds time elapsed   ( +-   0.137% )

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: masami.hiramatsu.pt@hitachi.com
Cc: ananth@in.ibm.com
Cc: davem@davemloft.net
Cc: 2nddept-manager@sdl.hitachi.co.jp
LKML-Reference: <20110307181039.GB15197@jolsa.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/ia32/ia32entry.S         |    2 ++
 arch/x86/kernel/entry_32.S        |    6 ++++--
 arch/x86/kernel/entry_64.S        |    6 ++++--
 arch/x86/kernel/vmlinux.lds.S     |    1 +
 include/asm-generic/sections.h    |    1 +
 include/asm-generic/vmlinux.lds.h |    6 ++++++
 6 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99..f729b2e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
 #define sysretl_audit ia32_ret_from_sys_call
 #endif
 
+	.section .entry.text, "ax"
+
 #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
 
 	.macro IA32_ARG_FIXUP noebp=0
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c8b4efa..f5accf8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
 #define sysexit_audit	syscall_exit_work
 #endif
 
+	.section .entry.text, "ax"
+
 /*
  * We use macros for low-level operations which need to be overridden
  * for paravirtualization.  The following will never clobber any registers:
@@ -788,7 +790,7 @@ ENDPROC(ptregs_clone)
  */
 .section .init.rodata,"a"
 ENTRY(interrupt)
-.text
+.section .entry.text, "ax"
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -807,7 +809,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.long 1b
-      .text
+      .section .entry.text, "ax"
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed1ffb..0a0ed79 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -61,6 +61,8 @@
 #define __AUDIT_ARCH_LE	   0x40000000
 
 	.code64
+	.section .entry.text, "ax"
+
 #ifdef CONFIG_FUNCTION_TRACER
 #ifdef CONFIG_DYNAMIC_FTRACE
 ENTRY(mcount)
@@ -744,7 +746,7 @@ END(stub_rt_sigreturn)
  */
 	.section .init.rodata,"a"
 ENTRY(interrupt)
-	.text
+	.section .entry.text
 	.p2align 5
 	.p2align CONFIG_X86_L1_CACHE_SHIFT
 ENTRY(irq_entries_start)
@@ -763,7 +765,7 @@ vector=FIRST_EXTERNAL_VECTOR
       .endif
       .previous
 	.quad 1b
-      .text
+      .section .entry.text
 vector=vector+1
     .endif
   .endr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bf47007..6d4341d 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -105,6 +105,7 @@ SECTIONS
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		ENTRY_TEXT
 		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index b3bfabc..c1a1216 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -11,6 +11,7 @@ extern char _sinittext[], _einittext[];
 extern char _end[];
 extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
+extern char __entry_text_start[], __entry_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
 
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fe77e33..906c3ce 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -424,6 +424,12 @@
 		*(.kprobes.text)					\
 		VMLINUX_SYMBOL(__kprobes_text_end) = .;
 
+#define ENTRY_TEXT							\
+		ALIGN_FUNCTION();					\
+		VMLINUX_SYMBOL(__entry_text_start) = .;			\
+		*(.entry.text)						\
+		VMLINUX_SYMBOL(__entry_text_end) = .;
+
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 #define IRQENTRY_TEXT							\
 		ALIGN_FUNCTION();					\

^ permalink raw reply	[flat|nested] 26+ messages in thread

* [tip:perf/core] kprobes: Disabling optimized kprobes for entry text section
  2011-02-21 14:25                     ` [PATCH 2/2] kprobes: disabling optimized kprobes for " Jiri Olsa
  2011-02-22  3:22                       ` Masami Hiramatsu
@ 2011-03-08 20:16                       ` tip-bot for Jiri Olsa
  1 sibling, 0 replies; 26+ messages in thread
From: tip-bot for Jiri Olsa @ 2011-03-08 20:16 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, masami.hiramatsu.pt, tglx, mingo, jolsa

Commit-ID:  2a8247a2600c3e087a568fc68a6ec4eedac27ef1
Gitweb:     http://git.kernel.org/tip/2a8247a2600c3e087a568fc68a6ec4eedac27ef1
Author:     Jiri Olsa <jolsa@redhat.com>
AuthorDate: Mon, 21 Feb 2011 15:25:13 +0100
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Tue, 8 Mar 2011 17:22:12 +0100

kprobes: Disabling optimized kprobes for entry text section

You can crash the kernel (with root/admin privileges) using kprobe tracer by running:

 echo "p system_call_after_swapgs" > ./kprobe_events
 echo 1 > ./events/kprobes/enable

The reason is that at the system_call_after_swapgs label, the
kernel stack is not set up. If optimized kprobes are enabled,
the user space stack is being used in this case (see optimized
kprobe template) and this might result in a crash.

There are several places like this over the entry code
(entry_$BIT). As it seems there's no any reasonable/maintainable
way to disable only those places where the stack is not ready, I
switched off the whole entry code from kprobe optimizing.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: acme@redhat.com
Cc: fweisbec@gmail.com
Cc: ananth@in.ibm.com
Cc: davem@davemloft.net
Cc: a.p.zijlstra@chello.nl
Cc: eric.dumazet@gmail.com
Cc: 2nddept-manager@sdl.hitachi.co.jp
LKML-Reference: <1298298313-5980-3-git-send-email-jolsa@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/kprobes.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index d91c477..c969fd9 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -1276,6 +1276,14 @@ static int __kprobes can_optimize(unsigned long paddr)
 	if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
 		return 0;
 
+	/*
+	 * Do not optimize in the entry code due to the unstable
+	 * stack handling.
+	 */
+	if ((paddr >= (unsigned long )__entry_text_start) &&
+	    (paddr <  (unsigned long )__entry_text_end))
+		return 0;
+
 	/* Check there is enough space for a relative jump. */
 	if (size - offset < RELATIVEJUMP_SIZE)
 		return 0;

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2011-03-08 20:16 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-14 15:12 [RFC,PATCH] kprobes - optimized kprobes might crash before setting kernel stack Jiri Olsa
2011-02-15  9:41 ` Masami Hiramatsu
2011-02-15 12:30   ` Jiri Olsa
2011-02-15 15:55     ` Masami Hiramatsu
2011-02-15 16:54       ` Jiri Olsa
2011-02-15 17:05       ` [PATCH] kprobes - do not allow optimized kprobes in entry code Jiri Olsa
2011-02-16  3:36         ` Masami Hiramatsu
2011-02-17 15:11           ` Ingo Molnar
2011-02-17 15:20             ` Jiri Olsa
2011-02-18 16:26             ` Jiri Olsa
2011-02-19 14:14               ` Masami Hiramatsu
2011-02-20 12:59                 ` Ingo Molnar
2011-02-21 11:54                   ` Jiri Olsa
2011-02-21 14:25                   ` [PATCH 0/2] x86: separating entry text section + kprobes fix Jiri Olsa
2011-02-21 14:25                     ` [PATCH 1/2] x86: separating entry text section Jiri Olsa
2011-02-22  3:22                       ` Masami Hiramatsu
2011-02-22  8:09                       ` Ingo Molnar
2011-02-22 12:52                         ` Jiri Olsa
2011-03-07 10:44                           ` Jiri Olsa
2011-03-07 15:29                             ` Ingo Molnar
2011-03-07 18:10                               ` Jiri Olsa
2011-03-08 16:15                                 ` Ingo Molnar
2011-03-08 20:15                                 ` [tip:perf/core] x86: Separate out " tip-bot for Jiri Olsa
2011-02-21 14:25                     ` [PATCH 2/2] kprobes: disabling optimized kprobes for " Jiri Olsa
2011-02-22  3:22                       ` Masami Hiramatsu
2011-03-08 20:16                       ` [tip:perf/core] kprobes: Disabling " tip-bot for Jiri Olsa

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).