LKML Archive on lore.kernel.org
* [PATCH][RFC] trace: profile likely and unlikely annotations
       [not found]   ` <20081028001340.GB9797@mit.edu>
@ 2008-10-28  4:12     ` Steven Rostedt
  2008-10-28  4:23       ` Arjan van de Ven
                         ` (2 more replies)
From: Steven Rostedt @ 2008-10-28  4:12 UTC
  To: LKML
  Cc: Mike Snitzer, Theodore Tso, Andrew Morton, Ingo Molnar,
	Thomas Gleixner, Peter Zijlstra, Arjan van de Ven,
	Frederic Weisbecker


Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. Each time the condition is evaluated, its hits and misses are
recorded. These numbers can later be retrieved by:

  /debugfs/tracing/profile_likely    - All likely markers
  /debugfs/tracing/profile_unlikely  - All unlikely markers.

# cat /debug/tracing/profile_unlikely | \
  awk '{ if ((($1 + 1.0)/ ($2 + 1.0)) > 0.5) { print $0; }}' |head -15
      Hit      Missed         Function
      ---      ------         --------
      36278      44749    __switch_to+0x129/0x55d
      70064      10964    __switch_to+0xcd/0x55d
      70060      10964    __switch_to+0x63/0x55d
       2628          0    syscall_trace_leave+0x0/0x154
          1          0    native_smp_prepare_cpus+0x3e9/0x485
     287349      98809    place_entity+0xac/0x11c
     141308     236748    calc_delta_mine+0x8b/0x109
      55091          0    enqueue_task+0x0/0x9b
      71724      52117    pick_next_task_fair+0x0/0xc1
         11          5    yield_task_fair+0x0/0x13b
       1269          0    pre_schedule_rt+0x0/0x98
      46392       1265    pick_next_task_rt+0x0/0x12f
      51523          0    dequeue_task+0x57/0x114

# cat /debug/tracing/profile_likely | \
  awk '{ if ((($2 + 1.0)/ ($1 + 1.0)) > 0.5) { print $0; }}'
      Hit      Missed         Function
      ---      ------         --------
      43316      37700    schedule+0x6e7/0xdb2
      23032      12997    schedule+0xa9b/0xdb2
          0          7    __cancel_work_timer+0x4b/0x292
        885       1683    update_wall_time+0x27b/0x4fd
          0     369948    audit_syscall_exit+0x131/0x403
         19     370851    audit_free+0x0/0x27e
          0     368981    audit_syscall_entry+0x1a4/0x226
         19        900    audit_free+0xf0/0x27e
          0        973    audit_alloc+0xa6/0x13c
          0          6    move_masked_irq+0x2b/0x174
      12878     223023    page_evictable+0x9a/0x1f1
          9          7    flush_cpu_slab+0x0/0x143
     634634     324186    kmem_cache_free+0x79/0x139
       3142      61979    dentry_lru_del_init+0x0/0x89
        948       1166    load_elf_binary+0xa00/0x18ad
        102         82    disk_part_iter_next+0x0/0xf1
          0         11    tcp_mss_split_point+0x0/0xbd


As you can see from the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely file had 60
entries whose hit count exceeded 50% of the miss count.
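
(For reference, that count can be reproduced by running the same awk
filter without the head and counting lines -- note the two header lines
also pass the filter, so subtract 2:)

# cat /debug/tracing/profile_unlikely | \
  awk '{ if ((($1 + 1.0)/ ($2 + 1.0)) > 0.5) { print $0; }}' | wc -l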

(*) Not every unlikely is recorded; those that are used by vsyscalls
 (a few of them) had to be marked as unlikely_notrace().

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/kernel/vsyscall_64.c     |    5 -
 include/asm-generic/vmlinux.lds.h |   14 +++
 include/linux/compiler.h          |   38 ++++++++
 include/linux/seqlock.h           |    3 
 kernel/trace/Kconfig              |   16 +++
 kernel/trace/Makefile             |    2 
 kernel/trace/trace_unlikely.c     |  163 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 237 insertions(+), 4 deletions(-)

Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile	2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile	2008-10-27 23:12:08.000000000 -0400
@@ -26,4 +26,6 @@ obj-$(CONFIG_FUNCTION_PROFILER) += trace
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 
+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+
 libftrace-y := ftrace.o
Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c	2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/arch/x86/kernel/vsyscall_64.c	2008-10-27 23:12:44.000000000 -0400
@@ -124,7 +124,8 @@ static __always_inline void do_vgettimeo
 		seq = read_seqbegin(&__vsyscall_gtod_data.lock);
 
 		vread = __vsyscall_gtod_data.clock.vread;
-		if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+		if (unlikely_notrace(!__vsyscall_gtod_data.sysctl_enabled ||
+				     !vread)) {
 			gettimeofday(tv,NULL);
 			return;
 		}
@@ -165,7 +166,7 @@ time_t __vsyscall(1) vtime(time_t *t)
 {
 	struct timeval tv;
 	time_t result;
-	if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
+	if (unlikely_notrace(!__vsyscall_gtod_data.sysctl_enabled))
 		return time_syscall(t);
 
 	vgettimeofday(&tv, NULL);
Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h	2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/include/asm-generic/vmlinux.lds.h	2008-10-27 23:12:08.000000000 -0400
@@ -45,6 +45,17 @@
 #define MCOUNT_REC()
 #endif
 
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_likely_profile) = .;   \
+				*(_ftrace_likely)			      \
+				VMLINUX_SYMBOL(__stop_likely_profile) = .;    \
+				VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+				*(_ftrace_unlikely)			      \
+				VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -62,7 +73,8 @@
 	VMLINUX_SYMBOL(__stop___markers) = .;				\
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
-	VMLINUX_SYMBOL(__stop___tracepoints) = .;
+	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
+	LIKELY_PROFILE()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
Index: linux-tip.git/include/linux/compiler.h
===================================================================
--- linux-tip.git.orig/include/linux/compiler.h	2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/include/linux/compiler.h	2008-10-28 00:00:50.000000000 -0400
@@ -59,8 +59,46 @@ extern void __chk_io_ptr(const volatile 
  * specific implementations come from the above header files
  */
 
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+	unsigned long ip;
+	unsigned long hit;
+	unsigned long missed;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val);
+
+#define likely_notrace(x)	__builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
+
+/* Can't use _THIS_IP_ because it screws up header dependencies */
+#define __THIS_IP__  ({ __label__ __here; __here: (unsigned long)&&__here; })
+
+#define likely(x) ({							\
+			int ______r;					\
+			static struct ftrace_likely_data ______f	\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_likely"))); \
+			if (unlikely_notrace(!______f.ip))		\
+				______f.ip = __THIS_IP__;		\
+			______r = likely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r);	\
+			______r;					\
+		})
+#define unlikely(x) ({							\
+			int ______r;					\
+			static struct ftrace_likely_data ______f	\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_unlikely"))); \
+			if (unlikely_notrace(!______f.ip))		\
+				______f.ip = __THIS_IP__;		\
+			______r = unlikely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r);	\
+			______r;					\
+		})
+#else
 #define likely(x)	__builtin_expect(!!(x), 1)
 #define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif
 
 /* Optimization barrier */
 #ifndef barrier
Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig	2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig	2008-10-27 23:12:08.000000000 -0400
@@ -138,6 +138,22 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
+config TRACE_UNLIKELY_PROFILE
+	bool "Trace likely/unlikely profiler"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer profiles all the likely and unlikely macros
+	  in the kernel. It will display the results in:
+
+	  /debugfs/tracing/profile_likely
+	  /debugfs/tracing/profile_unlikely
+
+	  Note: this will add a significant overhead, only turn this
+	  on if you need to profile the system's use of these macros.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-tip.git/kernel/trace/trace_unlikely.c	2008-10-27 23:13:53.000000000 -0400
@@ -0,0 +1,163 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val)
+{
+	/* FIXME: Make this atomic! */
+	if (val)
+		f->hit++;
+	else
+		f->missed++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+	void		*start;
+	void		*stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_pointer *f = m->private;
+	struct ftrace_likely_data *p = v;
+
+	(*pos)++;
+
+	if (v == (void *)1)
+		return f->start;
+
+	++p;
+
+	if ((void *)p >= (void *)f->stop)
+		return NULL;
+
+	return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = (void *)1;
+	loff_t l = 0;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int trace_lookup_sym(struct seq_file *m, unsigned long addr)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(str, addr);
+
+	return seq_printf(m, "%s\n", str);
+#else
+	return seq_printf(m, "%p\n", (void *)addr);
+#endif
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+
+	if (v == (void *)1) {
+		seq_printf(m, "      Hit      Missed         Function\n"
+			      "      ---      ------         --------\n");
+		return 0;
+	}
+
+	/* skip null records */
+	if (!p->ip)
+		return 0;
+
+	seq_printf(m, "   %8lu   %8lu    ", p->hit, p->missed);
+	trace_lookup_sym(m, p->ip);
+	return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &tracing_likely_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = (void *)inode->i_private;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+	.open		= tracing_likely_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+	.start			= __start_likely_profile,
+	.stop			= __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+	.start			= __start_unlikely_profile,
+	.stop			= __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+				    &ftrace_likely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+				    &ftrace_unlikely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_unlikely' entry\n");
+
+	return 0;
+}
+
+device_initcall(ftrace_unlikely_init);
Index: linux-tip.git/include/linux/seqlock.h
===================================================================
--- linux-tip.git.orig/include/linux/seqlock.h	2008-10-27 23:11:52.000000000 -0400
+++ linux-tip.git/include/linux/seqlock.h	2008-10-27 23:12:08.000000000 -0400
@@ -90,7 +90,8 @@ static __always_inline unsigned read_seq
 repeat:
 	ret = sl->sequence;
 	smp_rmb();
-	if (unlikely(ret & 1)) {
+	/* Used in vsyscall, need notrace version */
+	if (unlikely_notrace(ret & 1)) {
 		cpu_relax();
 		goto repeat;
 	}

* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28  4:12     ` [PATCH][RFC] trace: profile likely and unlikely annotations Steven Rostedt
@ 2008-10-28  4:23       ` Arjan van de Ven
  2008-10-28  4:39       ` Andrew Morton
  2008-10-28 14:37       ` Theodore Tso
From: Arjan van de Ven @ 2008-10-28  4:23 UTC
  To: Steven Rostedt
  Cc: LKML, Mike Snitzer, Theodore Tso, Andrew Morton, Ingo Molnar,
	Thomas Gleixner, Peter Zijlstra, Frederic Weisbecker

On Tue, 28 Oct 2008 00:12:48 -0400 (EDT)
Steven Rostedt <rostedt@goodmis.org> wrote:

> 
> Andrew Morton recently suggested having an in-kernel way to profile
> likely and unlikely macros. This patch achieves that goal.


alternatively we remove likely/unlikely ;)

or maybe only ban it from drivers or something.


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28  4:12     ` [PATCH][RFC] trace: profile likely and unlikely annotations Steven Rostedt
  2008-10-28  4:23       ` Arjan van de Ven
@ 2008-10-28  4:39       ` Andrew Morton
  2008-10-28 14:37       ` Theodore Tso
From: Andrew Morton @ 2008-10-28  4:39 UTC
  To: Steven Rostedt
  Cc: LKML, Mike Snitzer, Theodore Tso, Ingo Molnar, Thomas Gleixner,
	Peter Zijlstra, Arjan van de Ven, Frederic Weisbecker

On Tue, 28 Oct 2008 00:12:48 -0400 (EDT) Steven Rostedt <rostedt@goodmis.org> wrote:

> (*) Not every unlikely is recorded; those that are used by vsyscalls
>  (a few of them) had to be marked as unlikely_notrace().

Yes, it took continuous maintenance to weed this stuff out of
profile-likely-unlikely-macros.patch.

I'd suggest that you take a peek at this patch and make sure that
you've addressed all the sites which had to be fixed up while
maintaining this one.

(I've been maintaining this for 2-1/2 years.  Sniff)


From: Daniel Walker <dwalker@mvista.com>

-ENOCHANGELOG!

Creates /proc/likely_prof.

[randy.dunlap@oracle.com: profile_likely: export do_check_likely]
[akpm@linux-foundation.org: fixes and general maintenance]
[dwalker@mvista.com: likely_prof changed to use proc_create]
[12o3l@tiscali.nl: likeliness accounting change and cleanup]
[adobriyan@gmail.com: proc: remove proc_root from drivers]
[12o3l@tiscali.nl: update to test_and_set_bit_lock / clear_bit_unlock]
[dwalker@mvista.com: likely-profiling: disable ftrace]
Signed-off-by: Daniel Walker <dwalker@mvista.com>
Signed-off-by: Hua Zhong <hzhong@gmail.com>
Cc: Andi Kleen <andi@firstfloor.org>
Signed-off-by: Roel Kluin <12o3l@tiscali.nl>
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---

 arch/x86/kernel/vsyscall_64.c  |    2 
 arch/x86/vdso/vclock_gettime.c |    2 
 include/linux/compiler.h       |   36 +++++++
 lib/Kconfig.debug              |    8 +
 lib/Makefile                   |    5 +
 lib/likely_prof.c              |  150 +++++++++++++++++++++++++++++++
 6 files changed, 203 insertions(+)

diff -puN arch/x86/kernel/vsyscall_64.c~profile-likely-unlikely-macros arch/x86/kernel/vsyscall_64.c
--- a/arch/x86/kernel/vsyscall_64.c~profile-likely-unlikely-macros
+++ a/arch/x86/kernel/vsyscall_64.c
@@ -17,6 +17,8 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+#define SUPPRESS_LIKELY_PROFILING
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
diff -puN arch/x86/vdso/vclock_gettime.c~profile-likely-unlikely-macros arch/x86/vdso/vclock_gettime.c
--- a/arch/x86/vdso/vclock_gettime.c~profile-likely-unlikely-macros
+++ a/arch/x86/vdso/vclock_gettime.c
@@ -9,6 +9,8 @@
  * Also alternative() doesn't work.
  */
 
+#define SUPPRESS_LIKELY_PROFILING
+
 #include <linux/kernel.h>
 #include <linux/posix-timers.h>
 #include <linux/time.h>
diff -puN include/linux/compiler.h~profile-likely-unlikely-macros include/linux/compiler.h
--- a/include/linux/compiler.h~profile-likely-unlikely-macros
+++ a/include/linux/compiler.h
@@ -53,6 +53,41 @@ extern void __chk_io_ptr(const volatile 
 # include <linux/compiler-intel.h>
 #endif
 
+#if defined(CONFIG_PROFILE_LIKELY) && !defined(SUPPRESS_LIKELY_PROFILING) && \
+	!(defined(CONFIG_MODULE_UNLOAD) && defined(MODULE))
+struct likeliness {
+	char *const file;
+	unsigned long caller;
+	unsigned int count[2];
+	struct likeliness *next;
+	unsigned int label;
+};
+
+extern int do_check_likely(struct likeliness *likeliness, unsigned int exp);
+
+#define LP_IS_EXPECTED	1
+#define LP_UNSEEN	4
+#define LP_LINE_SHIFT	3
+
+#define __check_likely(exp, is_likely)					\
+	({								\
+		static struct likeliness likeliness = {			\
+			.file = __FILE__,				\
+			.label = __LINE__ << LP_LINE_SHIFT |		\
+						LP_UNSEEN | is_likely,	\
+		};							\
+		do_check_likely(&likeliness, !!(exp));			\
+	})
+
+/*
+ * We check for constant values with __builtin_constant_p() since
+ * it's not interesting to profile them, and there is a compiler
+ * bug in gcc 3.x which blows up during constant evaluation when
+ * CONFIG_PROFILE_LIKELY is turned on.
+ */
+#define likely(x)	(__builtin_constant_p(x) ? (!!(x)) : __check_likely((x), 1))
+#define unlikely(x)	(__builtin_constant_p(x) ? (!!(x)) : __check_likely((x), 0))
+#else
 /*
  * Generic compiler-dependent macros required for kernel
  * build go below this comment. Actual compiler/compiler version
@@ -61,6 +96,7 @@ extern void __chk_io_ptr(const volatile 
 
 #define likely(x)	__builtin_expect(!!(x), 1)
 #define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif
 
 /* Optimization barrier */
 #ifndef barrier
diff -puN lib/Kconfig.debug~profile-likely-unlikely-macros lib/Kconfig.debug
--- a/lib/Kconfig.debug~profile-likely-unlikely-macros
+++ a/lib/Kconfig.debug
@@ -568,6 +568,14 @@ config DEBUG_SYNCHRO_TEST
 
 	  See Documentation/synchro-test.txt.
 
+config	PROFILE_LIKELY
+	bool "Record return values from likely/unlikely macros"
+	default n
+	help
+	  Adds profiling of likely/unlikely macros. To see the
+	  results of the profiling, view the following:
+		/proc/likely_prof
+
 config BOOT_PRINTK_DELAY
 	bool "Delay each boot printk message by N milliseconds"
 	depends on DEBUG_KERNEL && PRINTK && GENERIC_CALIBRATE_DELAY
diff -puN lib/Makefile~profile-likely-unlikely-macros lib/Makefile
--- a/lib/Makefile~profile-likely-unlikely-macros
+++ a/lib/Makefile
@@ -22,6 +22,9 @@ obj-y += bcd.o div64.o sort.o parser.o h
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 string_helpers.o
 
+# likely profiling can cause recursion in ftrace, so don't trace it.
+CFLAGS_REMOVE_likely_prof.o = -pg
+
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
 CFLAGS_kobject_uevent.o += -DDEBUG
@@ -82,6 +85,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += sys
 
 obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o
 
+obj-$(CONFIG_PROFILE_LIKELY) += likely_prof.o
+
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
 
diff -puN /dev/null lib/likely_prof.c
--- /dev/null
+++ a/lib/likely_prof.c
@@ -0,0 +1,150 @@
+/*
+ * This code should enable profiling the likely and unlikely macros.
+ *
+ * Output goes in /proc/likely_prof
+ *
+ * Authors:
+ * Daniel Walker <dwalker@mvista.com>
+ * Hua Zhong <hzhong@gmail.com>
+ * Andrew Morton <akpm@osdl.org>
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+#include <linux/kallsyms.h>
+
+#include <asm/bug.h>
+#include <asm/atomic.h>
+
+static struct likeliness *likeliness_head;
+
+int do_check_likely(struct likeliness *likeliness, unsigned int ret)
+{
+	static unsigned long likely_lock;
+
+	likeliness->count[ret]++;
+
+	if (likeliness->label & LP_UNSEEN) {
+		/*
+		 * We don't simply use a spinlock because internally to the
+		 * spinlock there is a call to unlikely which causes recursion.
+		 * We opted for this method because we didn't need a preempt/irq
+ * disable and it was a bit cleaner than using internal __raw
+		 * spinlock calls.
+		 */
+		if (!test_and_set_bit_lock(0, &likely_lock)) {
+			if (likeliness->label & LP_UNSEEN) {
+				likeliness->label &= (~LP_UNSEEN);
+				likeliness->next = likeliness_head;
+				likeliness_head = likeliness;
+				likeliness->caller = (unsigned long)
+						__builtin_return_address(0);
+			}
+			clear_bit_unlock(0, &likely_lock);
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(do_check_likely);
+
+static void * lp_seq_start(struct seq_file *out, loff_t *pos)
+{
+
+	if (!*pos) {
+
+		seq_printf(out, "Likely Profiling Results\n");
+		seq_printf(out, " --------------------------------------------"
+				"------------------------\n");
+		seq_printf(out, "[+- ]Type | # True  | # False | Function@"
+				"Filename:Line\n");
+
+		out->private = likeliness_head;
+	}
+
+	return out->private;
+}
+
+static void *lp_seq_next(struct seq_file *out, void *p, loff_t *pos)
+{
+	struct likeliness *entry = p;
+
+	if (entry->next) {
+		++(*pos);
+		out->private = entry->next;
+	} else
+		out->private = NULL;
+
+	return out->private;
+}
+
+static int lp_seq_show(struct seq_file *out, void *p)
+{
+	struct likeliness *entry = p;
+	unsigned int pos = entry->count[1];
+	unsigned int neg = entry->count[0];
+	char function[KSYM_SYMBOL_LEN];
+
+	/*
+	 * Balanced if the suggestion was false in less than 5% of the tests
+	 */
+	if (!(entry->label & LP_IS_EXPECTED)) {
+		if (pos + neg < 20 * pos)
+			seq_printf(out, "+");
+		else
+			seq_printf(out, " ");
+
+		seq_printf(out, "unlikely ");
+	} else {
+		if (pos + neg < 20 * neg)
+			seq_printf(out, "-");
+		else
+			seq_printf(out, " ");
+
+		seq_printf(out, "likely   ");
+	}
+
+	sprint_symbol(function, entry->caller);
+	seq_printf(out, "|%9u|%9u|\t%s@%s:%u\n", pos, neg, function,
+			entry->file, entry->label >> LP_LINE_SHIFT);
+
+	return 0;
+}
+
+static void lp_seq_stop(struct seq_file *m, void *p)
+{
+}
+
+static struct seq_operations likely_profiling_ops = {
+	.start  = lp_seq_start,
+	.next   = lp_seq_next,
+	.stop   = lp_seq_stop,
+	.show   = lp_seq_show
+};
+
+static int lp_results_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &likely_profiling_ops);
+}
+
+static struct file_operations proc_likely_operations  = {
+	.open           = lp_results_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static int __init init_likely(void)
+{
+	struct proc_dir_entry *entry =
+		proc_create("likely_prof", 0, NULL, &proc_likely_operations);
+	if (!entry)
+		return 1;
+
+	return 0;
+}
+__initcall(init_likely);
_


* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28  4:12     ` [PATCH][RFC] trace: profile likely and unlikely annotations Steven Rostedt
  2008-10-28  4:23       ` Arjan van de Ven
  2008-10-28  4:39       ` Andrew Morton
@ 2008-10-28 14:37       ` Theodore Tso
  2008-10-28 14:48         ` Arjan van de Ven
  2008-10-28 14:49         ` [PATCH][RFC] " Steven Rostedt
From: Theodore Tso @ 2008-10-28 14:37 UTC
  To: Steven Rostedt
  Cc: LKML, Mike Snitzer, Andrew Morton, Ingo Molnar, Thomas Gleixner,
	Peter Zijlstra, Arjan van de Ven, Frederic Weisbecker

On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> 
> Andrew Morton recently suggested having an in-kernel way to profile
> likely and unlikely macros. This patch achieves that goal.

Maybe I'm confused, but when I read through the patch, it looks like
'hit' is incremented whenever the condition is true, and 'missed'
is incremented whenever the condition is false, correct?

Is that what you intended?  So for profile_unlikely, "missed" is good,
and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
is bad.  That seems horribly confusing.
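
To make the asymmetry concrete, take a hypothetical site (names made
up) and follow the counters quoted below:

	if (unlikely(err)) {		/* err is almost always 0 */
		handle_error();		/* condition true  -> f->hit++    */
	}				/* condition false -> f->missed++ */

A well-placed unlikely() therefore racks up "missed" and a badly-placed
one racks up "hit", which inverts the usual sense of the words.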

If that wasn't what you intended, the meaning of "hit" and "missed"
seems to be highly confusing, either way.  Can we perhaps use some
other terminology?  Simply using "True" and "False" would be better,
since there's no possible confusion what the labels mean.   

> +#define unlikely(x) ({							\
> +			int ______r;					\
> +			static struct ftrace_likely_data ______f	\
> +				__attribute__((__aligned__(4)))		\
> +				__attribute__((section("_ftrace_unlikely"))); \
> +			if (unlikely_notrace(!______f.ip))		\
> +				______f.ip = __THIS_IP__;		\
> +			______r = unlikely_notrace(x);			\
> +			ftrace_likely_update(&______f, ______r);	\
> +			______r;					\
> +		})

Note that unlikely(x) calls ftrace_likely_update(), which does this:

> +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
> +{
> +	/* FIXME: Make this atomic! */
> +	if (val)
> +		f->hit++;
> +	else
> +		f->missed++;
> +}
> +EXPORT_SYMBOL(ftrace_likely_update);


So that seems to mean that if unlikely(x) is false, then ______r is 0,
which means we increment f->missed.  Or am I missing something?

I would have thought that if unlikely(x) is false, that's *good*,
since it means the unlikely label was correct.  And normally, when
people think about cache hits vs cache misses, hits are good and
misses are bad.  Which is why I think the terminology is highly
confusing...

      	       		 	      	    - Ted

* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28 14:37       ` Theodore Tso
@ 2008-10-28 14:48         ` Arjan van de Ven
  2008-10-28 14:51           ` Steven Rostedt
  2008-10-28 14:49         ` [PATCH][RFC] " Steven Rostedt
From: Arjan van de Ven @ 2008-10-28 14:48 UTC
  To: Theodore Tso
  Cc: Steven Rostedt, LKML, Mike Snitzer, Andrew Morton, Ingo Molnar,
	Thomas Gleixner, Peter Zijlstra, Frederic Weisbecker

On Tue, 28 Oct 2008 10:37:20 -0400
Theodore Tso <tytso@mit.edu> wrote:

> On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> > 
> > Andrew Morton recently suggested having an in-kernel way to profile
> > likely and unlikely macros. This patch achieves that goal.
> 
> Maybe I'm confused, but when I read through the patch, it looks like
> 'hit' is incremented whenever the condition is true, and 'missed'
> is incremented whenever the condition is false, correct?
> 
> Is that what you intended?  So for profile_unlikely, "missed" is good,
> and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
> is bad.  That seems horribly confusing.
> 
> If that wasn't what you intended, the meaning of "hit" and "missed"
> seems to be highly confusing, either way.  Can we perhaps use some
> other terminology?  Simply using "True" and "False" would be better,
> since there's no possible confusion what the labels mean.

or "correct" and "incorrect"



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28 14:37       ` Theodore Tso
  2008-10-28 14:48         ` Arjan van de Ven
@ 2008-10-28 14:49         ` Steven Rostedt
  2008-10-28 18:29           ` Theodore Tso
From: Steven Rostedt @ 2008-10-28 14:49 UTC
  To: Theodore Tso
  Cc: LKML, Mike Snitzer, Andrew Morton, Ingo Molnar, Thomas Gleixner,
	Peter Zijlstra, Arjan van de Ven, Frederic Weisbecker


On Tue, 28 Oct 2008, Theodore Tso wrote:

> On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> > 
> > Andrew Morton recently suggested having an in-kernel way to profile
> > likely and unlikely macros. This patch achieves that goal.
> 
> Maybe I'm confused, but when I read through the patch, it looks like
> 'hit' is incremented whenever the condition is true, and 'missed'
> is incremented whenever the condition is false, correct?

Correct.

> 
> Is that what you intended?  So for profile_unlikely, "missed" is good,
> and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
> is bad.  That seems horribly confusing.

Correct. Yeah, I figured I'd get complaints about this (hence the RFC).
If you look at my awk example, you will also notice that I switched the
$1 and $2 around when reading the other file.

This can be confusing either way. I did this to reuse the code for both 
outputs.

> 
> If that wasn't what you intended, the meaning of "hit" and "missed"
> seems to be highly confusing, either way.  Can we perhaps use some
> other terminology?  Simply using "True" and "False" would be better,
> since there's no possible confusion what the labels mean.   

So renaming 'hit' and 'miss' to 'True' and 'False' would be good enough?
That is, it will still mean that a 'True' is bad for unlikely but good for 
a likely?

> 
> > +#define unlikely(x) ({							\
> > +			int ______r;					\
> > +			static struct ftrace_likely_data ______f	\
> > +				__attribute__((__aligned__(4)))		\
> > +				__attribute__((section("_ftrace_unlikely"))); \
> > +			if (unlikely_notrace(!______f.ip))		\
> > +				______f.ip = __THIS_IP__;		\
> > +			______r = unlikely_notrace(x);			\
> > +			ftrace_likely_update(&______f, ______r);	\
> > +			______r;					\
> > +		})
> 
> Note that unlikely(x) calls ftrace_likely_update(), which does this:
> 
> > +void ftrace_likely_update(struct ftrace_likely_data *f, int val)
> > +{
> > +	/* FIXME: Make this atomic! */
> > +	if (val)
> > +		f->hit++;
> > +	else
> > +		f->missed++;
> > +}
> > +EXPORT_SYMBOL(ftrace_likely_update);
> 
> 
> So that seems to mean that if unlikely(x) is false, then _____r is 0,
> which means we increment f->missed.   Or am I missing something?
> 
> I would have thought that if unlikely(x) is false, that's *good*,
> since it means the unlikely label was correct.  And normally, when
> people think about cache hits vs cache misses, hits are good and
> misses are bad.  Which is why I think the terminology is highly
> confusing...

OK, I'm fine with changing the terminology. v2 will do:

  s/hit/True/
  s/missed/False/

Fine with you?

-- Steve


* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28 14:48         ` Arjan van de Ven
@ 2008-10-28 14:51           ` Steven Rostedt
  2008-10-29 16:35             ` [PATCH 1/2 v2][RFC] " Steven Rostedt
From: Steven Rostedt @ 2008-10-28 14:51 UTC
  To: Arjan van de Ven
  Cc: Theodore Tso, LKML, Mike Snitzer, Andrew Morton, Ingo Molnar,
	Thomas Gleixner, Peter Zijlstra, Frederic Weisbecker


On Tue, 28 Oct 2008, Arjan van de Ven wrote:

> On Tue, 28 Oct 2008 10:37:20 -0400
> Theodore Tso <tytso@mit.edu> wrote:
> 
> > On Tue, Oct 28, 2008 at 12:12:48AM -0400, Steven Rostedt wrote:
> > > 
> > > Andrew Morton recently suggested having an in-kernel way to profile
> > > likely and unlikely macros. This patch achieves that goal.
> > 
> > Maybe I'm confused, but when I read through the patch, it looks like
> > 'hit' is incremented whenever the condition is true, and 'missed'
> > is incremented whenever the condition is false, correct?
> > 
> > Is that what you intended?  So for profile_unlikely, "missed" is good,
> > and "hit" is bad, and for profile_likely, "hit" is good, and "missed"
> > is bad.  That seems horribly confusing.
> > 
> > If that wasn't what you intended, the meaning of "hit" and "missed"
> > seems to be highly confusing, either way.  Can we perhaps use some
> > other terminology?  Simply using "True" and "False" would be better,
> > since there's no possible confusion what the labels mean.
> 
> or "correct" and "incorrect"

This means that the code will need to be different for both. Or at least a 
way to differentiate between the two.  Not that hard, but I wanted to make 
the code as trivial as possible.
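
A minimal sketch of that differentiation (and in fact the shape v2
below ends up taking) is to pass the expected value into the update
helper:

	void ftrace_likely_update(struct ftrace_likely_data *f,
				  int val, int expect)
	{
		/* correct when the condition matched the annotation */
		if (val == expect)
			f->correct++;
		else
			f->incorrect++;
	}

with likely() passing expect=1 and unlikely() passing expect=0.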

-- Steve


* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28 14:49         ` [PATCH][RFC] " Steven Rostedt
@ 2008-10-28 18:29           ` Theodore Tso
  2008-10-28 18:41             ` Steven Rostedt
From: Theodore Tso @ 2008-10-28 18:29 UTC
  To: Steven Rostedt
  Cc: LKML, Mike Snitzer, Andrew Morton, Ingo Molnar, Thomas Gleixner,
	Peter Zijlstra, Arjan van de Ven, Frederic Weisbecker

On Tue, Oct 28, 2008 at 10:49:16AM -0400, Steven Rostedt wrote:
> 
> OK, I'm fine with changing the terminology. v2 will do:
> 
>   s/hit/True/
>   s/missed/False/
> 
> Fine with you?

I'm OK with either that, or with Arjan's suggestion of "Correct" and
"Incorrect" --- although that would changing a line in the definition
of #define unlikely(x):

                    ftrace_likely_update(&______f, !______r);

Either "True" / "False" or "Correct" / "Incorrect" has the advantage
of being unambiguous.  "Correct" / "Incorrect" has the advantage that
people don't have to think about the fact that for
/proc/profile_unlikely, high numbers of "False" are a good thing, whereas
for /proc/profile_likely, high numbers of "True" are a good thing.
With "Correct" / "Incorrect" it's easier to understand that high
numbers of "Correct" are good.

So I can see why Arjan suggested Correct/Incorrect, although I can
live with either.

							- Ted

* Re: [PATCH][RFC] trace: profile likely and unlikely annotations
  2008-10-28 18:29           ` Theodore Tso
@ 2008-10-28 18:41             ` Steven Rostedt
From: Steven Rostedt @ 2008-10-28 18:41 UTC
  To: Theodore Tso
  Cc: LKML, Mike Snitzer, Andrew Morton, Ingo Molnar, Thomas Gleixner,
	Peter Zijlstra, Arjan van de Ven, Frederic Weisbecker


On Tue, 28 Oct 2008, Theodore Tso wrote:

> On Tue, Oct 28, 2008 at 10:49:16AM -0400, Steven Rostedt wrote:
> > 
> > OK, I'm fine with changing the terminology. v2 will do:
> > 
> >   s/hit/True/
> >   s/missed/False/
> > 
> > Fine with you?
> 
> I'm OK with either that, or with Arjan's suggestion of "Correct" and
> "Incorrect" --- although that would changing a line in the definition
> of #define unlikely(x):
> 
>                     ftrace_likely_update(&______f, !______r);
> 
> Either "True" / "False" or "Correct" / "Incorrect" has the advantage
> of being unambiguous.  "Correct" / "Incorrect" has the advantage that
> people don't have to think about the fact that for
> /proc/profile_unlikely, high numbers of "False" are a good thing, whereas
> for /proc/profile_likely, high numbers of "True" are a good thing.
> With "Correct" / "Incorrect" it's easier to understand that high
> numbers of "Correct" are good.
> 
> So I can see why Arjan suggested Correct/Incorrect, although I can
> live with either.

I'm almost done with v2. Using Arjan's version (Correct/Incorrect) was 
actually a trivial change.

-- Steve


* [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations
  2008-10-28 14:51           ` Steven Rostedt
@ 2008-10-29 16:35             ` Steven Rostedt
  2008-10-29 16:38               ` [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer Steven Rostedt
                                 ` (2 more replies)
From: Steven Rostedt @ 2008-10-29 16:35 UTC
  To: LKML
  Cc: Arjan van de Ven, Theodore Tso, Mike Snitzer, Andrew Morton,
	Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker


Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. Each time the condition is evaluated, its hits and misses are
recorded. These numbers can later be retrieved by:

  /debugfs/tracing/profile_likely    - All likely markers
  /debugfs/tracing/profile_unlikely  - All unlikely markers.

# cat /debug/tracing/profile_unlikely | \
   awk '{if (($1 || $2) && (($2+0.1)/($1+0.1) > 0.25)) { print $0; }}' | \
   head -20
 correct  incorrect       Function                      File              Line
 -------  ---------       --------                      ----              ----
    46894    38216    __switch_to                    process_64.c         624
    11951    73138    __switch_to                    process_64.c         594
    11951    73097    __switch_to                    process_64.c         590
   117873    35049    need_resched                   sched.h              2103
     1345      711    syscall_get_error              syscall.h            51
        0     2510    syscall_trace_leave            ptrace.c             1568
        0        1    native_smp_prepare_cpus        smpboot.c            1231
    91635   276125    calc_delta_fair                sched_fair.c         411
   219028   113449    calc_delta_mine                sched.c              1279
        0    58048    sched_info_queued              sched_stats.h        222
    55449    71591    pick_next_task_fair            sched_fair.c         1394
       10        5    yield_task_fair                sched_fair.c         983
     7117     2553    rt_policy                      sched.c              144
        0     1220    pre_schedule_rt                sched_rt.c           1260
     1216    52402    pick_next_task_rt              sched_rt.c           884
        0    48009    sched_info_dequeued            sched_stats.h        177
        0       14    sched_move_task                sched.c              8707
        0       14    sched_move_task                sched.c              8697


# cat /debug/tracing/profile_likely | \
  awk '{if (($1 || $2) && (($2+0.1)/($1+0.1) > 0.25)) { print $0; }}'
 correct  incorrect       Function                      File              Line
 -------  ---------       --------                      ----              ----
    42987    40240    pick_next_task                 sched.c              4407
    21419    18103    switch_mm                      mmu_context_64.h     18
        0        7    __cancel_work_timer            workqueue.c          560
      861     3960    clocksource_adjust             timekeeping.c        456
      390     1913    clocksource_adjust             timekeeping.c        450
        0   363683    audit_syscall_exit             auditsc.c            1570
       30   364573    audit_get_context              auditsc.c            732
        0   362744    audit_syscall_entry            auditsc.c            1541
       30      879    audit_free                     auditsc.c            1446
        0      952    audit_alloc                    auditsc.c            862
     2606      952    audit_alloc                    auditsc.c            858
        0        6    move_masked_irq                migration.c          20
     4465     2091    __grab_cache_page              filemap.c            2212
     9495     2787    iov_iter_advance               filemap.c            1884
     3121      899    __set_page_dirty               page-writeback.c     1134
    12874   221371    mapping_unevictable            pagemap.h            50
        4       10    __flush_cpu_slab               slub.c               1466
   635239   320760    slab_free                      slub.c               1731
     3105    61055    dentry_lru_del_init            dcache.c             153
      942     1156    load_elf_binary                binfmt_elf.c         904
      102       82    disk_put_part                  genhd.h              206
        0        1    tcp_mss_split_point            tcp_output.c         1126



As you can see from the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely file had 76
entries whose incorrect count exceeded 25% of the correct count.

Note:  After submitting my first version of this patch, Andrew Morton
  showed me a version written by Daniel Walker, from which I picked up
  the following ideas:

  1)  Using __builtin_constant_p to avoid profiling fixed values
      (see the sketch after this list).
  2)  Using __FILE__ instead of instruction pointers.
  3)  Using the preprocessor to stop all profiling of likely
       annotations from vsyscall_64.c.
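
A hypothetical illustration of point 1 -- a constant condition is
folded by the ternary at compile time and never reaches the profiling
code (function names made up):

	if (unlikely(0))	/* __builtin_constant_p(0) is true:
				 * compiles to plain !!(0), no counter */
		never_taken();

	if (unlikely(ret < 0))	/* not constant: routed through
				 * unlikely_check() and counted */
		return ret;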

Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
for their feedback on this patch.

(*) Not every unlikely is recorded; those that are used by vsyscalls
 (a few of them) had to have profiling disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/kernel/vsyscall_64.c     |    8 +
 include/asm-generic/vmlinux.lds.h |   14 +++
 include/linux/compiler.h          |   61 ++++++++++++++
 kernel/trace/Kconfig              |   16 +++
 kernel/trace/Makefile             |    2 
 kernel/trace/trace_unlikely.c     |  157 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 255 insertions(+), 3 deletions(-)

Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile	2008-10-29 11:30:05.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile	2008-10-29 12:32:40.000000000 -0400
@@ -25,4 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stac
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 
+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+
 libftrace-y := ftrace.o
Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c	2008-10-29 11:30:05.000000000 -0400
+++ linux-tip.git/arch/x86/kernel/vsyscall_64.c	2008-10-29 11:30:12.000000000 -0400
@@ -17,6 +17,14 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Protect userspace from profiling */
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+# undef likely
+# undef unlikely
+# define likely(x)		likely_notrace(x)
+# define unlikely(x)		unlikely_notrace(x)
+#endif
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h	2008-10-29 11:30:06.000000000 -0400
+++ linux-tip.git/include/asm-generic/vmlinux.lds.h	2008-10-29 11:30:12.000000000 -0400
@@ -45,6 +45,17 @@
 #define MCOUNT_REC()
 #endif
 
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_likely_profile) = .;   \
+				*(_ftrace_likely)			      \
+				VMLINUX_SYMBOL(__stop_likely_profile) = .;    \
+				VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+				*(_ftrace_unlikely)			      \
+				VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -62,7 +73,8 @@
 	VMLINUX_SYMBOL(__stop___markers) = .;				\
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
-	VMLINUX_SYMBOL(__stop___tracepoints) = .;
+	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
+	LIKELY_PROFILE()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
Index: linux-tip.git/include/linux/compiler.h
===================================================================
--- linux-tip.git.orig/include/linux/compiler.h	2008-10-29 11:30:06.000000000 -0400
+++ linux-tip.git/include/linux/compiler.h	2008-10-29 11:32:05.000000000 -0400
@@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile 
  * specific implementations come from the above header files
  */
 
-#define likely(x)	__builtin_expect(!!(x), 1)
-#define unlikely(x)	__builtin_expect(!!(x), 0)
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+	const char *func;
+	const char *file;
+	unsigned line;
+	unsigned long correct;
+	unsigned long incorrect;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
+
+#define likely_notrace(x)	__builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
+
+#define likely_check(x) ({						\
+			int ______r;					\
+			static struct ftrace_likely_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_likely"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______f.line = __LINE__;			\
+			______r = likely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, 1);	\
+			______r;					\
+		})
+#define unlikely_check(x) ({						\
+			int ______r;					\
+			static struct ftrace_likely_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_unlikely"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______f.line = __LINE__;			\
+			______r = unlikely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, 0);	\
+			______r;					\
+		})
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same.  This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : likely_check(x))
+# endif
+# ifndef unlikely
+#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
+# endif
+#else
+# define likely(x)	__builtin_expect(!!(x), 1)
+# define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif
 
 /* Optimization barrier */
 #ifndef barrier
Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig	2008-10-29 11:30:10.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig	2008-10-29 12:32:40.000000000 -0400
@@ -138,6 +138,22 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
+config TRACE_UNLIKELY_PROFILE
+	bool "Trace likely/unlikely profiler"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer profiles all the likely and unlikely macros
+	  in the kernel. It will display the results in:
+
+	  /debugfs/tracing/profile_likely
+	  /debugfs/tracing/profile_unlikely
+
+	  Note: this will add a significant overhead, only turn this
+	  on if you need to profile the system's use of these macros.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-tip.git/kernel/trace/trace_unlikely.c	2008-10-29 12:32:40.000000000 -0400
@@ -0,0 +1,157 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+{
+	/* FIXME: Make this atomic! */
+	if (val == expect)
+		f->correct++;
+	else
+		f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+	void		*start;
+	void		*stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_pointer *f = m->private;
+	struct ftrace_likely_data *p = v;
+
+	(*pos)++;
+
+	if (v == (void *)1)
+		return f->start;
+
+	++p;
+
+	if ((void *)p >= (void *)f->stop)
+		return NULL;
+
+	return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = (void *)1;
+	loff_t l = 0;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+	const char *f;
+
+	if (v == (void *)1) {
+		seq_printf(m, " correct  incorrect"
+			      "       Function                 "
+			      "     File              Line\n"
+			      " -------  ---------"
+			      "       --------                 "
+			      "     ----              ----\n");
+		return 0;
+	}
+
+	/* Only print the file, not the path */
+	f = p->file + strlen(p->file);
+	while (f >= p->file && *f != '/')
+		f--;
+	f++;
+
+	seq_printf(m, " %8lu %8lu    ", p->correct, p->incorrect);
+	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+	return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &tracing_likely_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = (void *)inode->i_private;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+	.open		= tracing_likely_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+	.start			= __start_likely_profile,
+	.stop			= __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+	.start			= __start_unlikely_profile,
+	.stop			= __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+				    &ftrace_likely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+				    &ftrace_unlikely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_unlikely' entry\n");
+
+	return 0;
+}
+
+device_initcall(ftrace_unlikely_init);



* [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer
  2008-10-29 16:35             ` [PATCH 1/2 v2][RFC] " Steven Rostedt
@ 2008-10-29 16:38               ` Steven Rostedt
  2008-10-29 16:40               ` [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations Arjan van de Ven
  2008-10-29 22:39               ` [PATCH 1/2 v3][RFC] " Steven Rostedt
From: Steven Rostedt @ 2008-10-29 16:38 UTC
  To: LKML
  Cc: Arjan van de Ven, Theodore Tso, Mike Snitzer, Andrew Morton,
	Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker


This patch adds a way to record the instances of the likely and unlikely
condition annotations.

When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
will be added to any of the ftrace tracers.

For example:

      irqbalance-2662  [002] 41127.102858: [correct] debug_mutex_free_waiter        51
          <idle>-0     [001] 41127.102858: [correct] add_preempt_count              4307
      irqbalance-2662  [002] 41127.102859: [INCORRECT] debug_smp_processor_id         15
          <idle>-0     [001] 41127.102860: [correct] add_preempt_count              4316
      irqbalance-2662  [002] 41127.102861: [correct] slab_alloc                     1608

This shows the normal tracer heading, as well as whether the condition
was correct ("[correct]") or mistaken ("[INCORRECT]"), followed by the
function and line number.
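
A sketch of a session (iter_ctrl is the knob named above; the other
paths are assumptions based on the standard ftrace debugfs layout):

  # mount -t debugfs nodev /debug		# if not already mounted
  # echo unlikely > /debug/tracing/iter_ctrl	# enable annotation events
  # cat /debug/tracing/trace | head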


Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/Makefile               |    5 +
 kernel/trace/Kconfig          |   22 +++++++
 kernel/trace/Makefile         |    5 +
 kernel/trace/ring_buffer.c    |   11 +++
 kernel/trace/trace.c          |   37 +++++++++++
 kernel/trace/trace.h          |   43 +++++++++++++
 kernel/trace/trace_unlikely.c |  130 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 253 insertions(+)

Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig	2008-10-29 12:32:35.000000000 -0400
@@ -154,6 +154,28 @@ config TRACE_UNLIKELY_PROFILE
 
 	  Say N if unsure.
 
+config TRACING_UNLIKELY
+	bool
+	help
+	  Selected by tracers that will trace the likely and unlikely
+	  conditions. This prevents the tracers themselves from being
+	  profiled. Profiling the tracing infrastructure can only happen
+	  when the likelys and unlikelys are not being traced.
+
+config UNLIKELY_TRACER
+	bool "Trace likely/unlikely instances"
+	depends on TRACE_UNLIKELY_PROFILE
+	select TRACING_UNLIKELY
+	help
+	  This traces the events of likely and unlikely condition
+	  calls in the kernel.  The difference between this and the
+	  "Trace likely/unlikely profiler" is that this is not a
+	  histogram of the callers, but actually places the calling
+	  events into a running buffer to see when and where the
+	  events happened, as well as their results.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/Makefile
===================================================================
--- linux-tip.git.orig/kernel/Makefile	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/Makefile	2008-10-29 12:32:35.000000000 -0400
@@ -24,6 +24,11 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+ifdef CONFIG_TRACING_UNLIKELY
+# Do not profile tracepoint if we are tracing the unlikely conditions
+CFLAGS_tracepoint.o += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile	2008-10-29 12:32:35.000000000 -0400
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 endif
 
+ifdef CONFIG_TRACING_UNLIKELY
+KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
+KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace_unlikely.c	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace_unlikely.c	2008-10-29 12:32:35.000000000 -0400
@@ -15,8 +15,138 @@
 #include <asm/local.h>
 #include "trace.h"
 
+#ifdef CONFIG_UNLIKELY_TRACER
+
+static int unlikely_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(unlikely_tracing_mutex);
+static DEFINE_PER_CPU(atomic_t, unlikely_trace_disabled);
+static struct trace_array *unlikely_tracer;
+
+void trace_unlikely_stop(void)
+{
+	/* preemption should be disabled */
+	int cpu = smp_processor_id();
+
+	atomic_inc(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+void trace_unlikely_start(void)
+{
+	/* preemption should be disabled */
+	int cpu = smp_processor_id();
+
+	atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+static void
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+	struct trace_array *tr = unlikely_tracer;
+	struct ring_buffer_event *event;
+	struct trace_unlikely *entry;
+	unsigned long flags, irq_flags;
+	int cpu, resched, pc;
+
+	if (unlikely(!tr))
+		return;
+
+	pc = preempt_count();
+	/* Can be called within schedule */
+	resched = need_resched();
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	if (atomic_inc_return(&per_cpu(unlikely_trace_disabled, cpu)) != 1)
+		goto out;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+
+	local_save_flags(flags);
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type		= TRACE_UNLIKELY;
+
+	/*
+	 * I would love to save just the ftrace_likely_data pointer, but
+	 * this code can also be used by modules. Ugly things can happen
+	 * if the module is unloaded, and then we go and read the
+	 * pointer.  This is slower, but much safer.
+	 */
+
+	strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+	strncpy(entry->file, f->file, TRACE_FILE_SIZE);
+	entry->func[TRACE_FUNC_SIZE] = 0;
+	entry->file[TRACE_FILE_SIZE] = 0;
+	entry->line = f->line;
+	entry->correct = val == expect;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+	atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+	if (!unlikely_tracing_enabled)
+		return;
+
+	probe_likely_condition(f, val, expect);
+}
+
+int enable_unlikely_tracing(struct trace_array *tr)
+{
+	int ret = 0;
+
+	mutex_lock(&unlikely_tracing_mutex);
+	unlikely_tracer = tr;
+	/*
+	 * Must be seen before enabling. The reader only checks a
+	 * condition, so we do not need a matching rmb().
+	 */
+	smp_wmb();
+	unlikely_tracing_enabled++;
+	mutex_unlock(&unlikely_tracing_mutex);
+
+	return ret;
+}
+
+void disable_unlikely_tracing(void)
+{
+	mutex_lock(&unlikely_tracing_mutex);
+
+	if (!unlikely_tracing_enabled)
+		goto out_unlock;
+
+	unlikely_tracing_enabled--;
+
+ out_unlock:
+	mutex_unlock(&unlikely_tracing_mutex);
+}
+#else
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_UNLIKELY_TRACER */
+
 void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
 {
+	/*
+	 * I would love to have a trace point here instead, but the
+	 * trace point code is so inundated with unlikely and likely
+	 * conditions that the recursive nightmare that exists is too
+	 * much to try to get working. At least for now.
+	 */
+	trace_likely_condition(f, val, expect);
+
 	/* FIXME: Make this atomic! */
 	if (val == expect)
 		f->correct++;
Index: linux-tip.git/kernel/trace/trace.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.c	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.c	2008-10-29 12:32:35.000000000 -0400
@@ -235,6 +235,9 @@ static const char *trace_options[] = {
 	"stacktrace",
 	"sched-tree",
 	"ftrace_printk",
+#ifdef CONFIG_UNLIKELY_TRACER
+	"unlikely",
+#endif
 	NULL
 };
 
@@ -1457,6 +1460,17 @@ print_lat_fmt(struct trace_iterator *ite
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		struct trace_unlikely *field;
+
+		trace_assign_type(field, entry);
+
+		trace_seq_printf(s, "[%s] %-30.30s %d\n",
+				 field->correct ? "correct" : "INCORRECT",
+				 field->func,
+				 field->line);
+		break;
+	}
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1593,6 +1607,17 @@ static enum print_line_t print_trace_fmt
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		struct trace_unlikely *field;
+
+		trace_assign_type(field, entry);
+
+		trace_seq_printf(s, "[%s] %-30.30s %d\n",
+				 field->correct ? "correct" : "INCORRECT",
+				 field->func,
+				 field->line);
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -1675,6 +1700,10 @@ static enum print_line_t print_raw_fmt(s
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		/* IGNORE */
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -2362,8 +2391,14 @@ tracing_ctrl_write(struct file *filp, co
 
 		tr->ctrl = val;
 
+		if (!val)
+			trace_unlikely_disable();
+
 		if (current_trace && current_trace->ctrl_update)
 			current_trace->ctrl_update(tr);
+
+		if (val)
+			trace_unlikely_enable(tr);
 	}
 	mutex_unlock(&trace_types_lock);
 
@@ -2425,6 +2460,7 @@ tracing_set_trace_write(struct file *fil
 	if (t == current_trace)
 		goto out;
 
+	trace_unlikely_disable();
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
 
@@ -2432,6 +2468,7 @@ tracing_set_trace_write(struct file *fil
 	if (t->init)
 		t->init(tr);
 
+	trace_unlikely_enable(tr);
  out:
 	mutex_unlock(&trace_types_lock);
 
Index: linux-tip.git/kernel/trace/trace.h
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.h	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.h	2008-10-29 12:32:35.000000000 -0400
@@ -21,6 +21,7 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
+	TRACE_UNLIKELY,
 	TRACE_BOOT,
 
 	__TRACE_LAST_TYPE
@@ -117,6 +118,16 @@ struct trace_boot {
 	struct boot_trace	initcall;
 };
 
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_unlikely {
+	struct trace_entry	ent;
+	unsigned	        line;
+	char			func[TRACE_FUNC_SIZE+1];
+	char			file[TRACE_FILE_SIZE+1];
+	char			correct;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -217,6 +228,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -413,8 +425,39 @@ enum trace_iterator_flags {
 	TRACE_ITER_STACKTRACE		= 0x100,
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
+#ifdef CONFIG_UNLIKELY_TRACER
+	TRACE_ITER_UNLIKELY		= 0x800,
+#endif
 };
 
 extern struct tracer nop_trace;
 
+#ifdef CONFIG_UNLIKELY_TRACER
+extern int enable_unlikely_tracing(struct trace_array *tr);
+extern void disable_unlikely_tracing(void);
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+	if (trace_flags & TRACE_ITER_UNLIKELY)
+		return enable_unlikely_tracing(tr);
+	return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+	/* due to races, always disable */
+	disable_unlikely_tracing();
+}
+extern void trace_unlikely_stop(void);
+extern void trace_unlikely_start(void);
+#else
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+	return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+}
+static inline void trace_unlikely_stop(void) { }
+static inline void trace_unlikely_start(void) { }
+#endif /* CONFIG_UNLIKELY_TRACER */
+
 #endif /* _LINUX_KERNEL_TRACE_H */
Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c	2008-10-29 12:32:30.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c	2008-10-29 12:32:35.000000000 -0400
@@ -16,6 +16,8 @@
 #include <linux/list.h>
 #include <linux/fs.h>
 
+#include "trace.h"
+
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
@@ -1139,7 +1141,14 @@ ring_buffer_lock_reserve(struct ring_buf
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
+	/*
+	 * Ugly, but the unlikely tracing will re-enter the ring
+	 * buffer code, namely through the timestamp code, which
+	 * would trigger the "again" loop forever.
+	 */
+	trace_unlikely_stop();
 	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	trace_unlikely_start();
 	if (!event)
 		goto out;
 
@@ -1251,8 +1260,10 @@ int ring_buffer_write(struct ring_buffer
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
+	trace_unlikely_stop();
 	event = rb_reserve_next_event(cpu_buffer,
 				      RINGBUF_TYPE_DATA, event_length);
+	trace_unlikely_start();
 	if (!event)
 		goto out;
 




* Re: [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations
  2008-10-29 16:35             ` [PATCH 1/2 v2][RFC] " Steven Rostedt
  2008-10-29 16:38               ` [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer Steven Rostedt
@ 2008-10-29 16:40               ` Arjan van de Ven
  2008-10-29 22:39               ` [PATCH 1/2 v3][RFC] " Steven Rostedt
  2 siblings, 0 replies; 17+ messages in thread
From: Arjan van de Ven @ 2008-10-29 16:40 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Theodore Tso, Mike Snitzer, Andrew Morton, Ingo Molnar,
	Thomas Gleixner, Peter Zijlstra, Frederic Weisbecker,
	Daniel Walker

On Wed, 29 Oct 2008 12:35:44 -0400 (EDT)
Steven Rostedt <rostedt@goodmis.org> wrote:

> 
> Andrew Morton recently suggested having an in-kernel way to profile
> likely and unlikely macros. This patch achieves that goal.
> 
> When configured, every(*) likely and unlikely macro gets a counter
> attached to it. When the condition is hit, the hit and misses of that
> condition are recorded. These numbers can later be retrieved by:
> 
>   /debugfs/tracing/profile_likely    - All likely markers
>   /debugfs/tracing/profile_unlikely  - All unlikely markers.
> 

I like the general approach. It'd be very convenient if the debugfs
files also had a
percentage-incorrect (or percentage-correct, either way) listed,
so that we humans can focus straight on which ones are outliers rather
than having to do math in our heads first...
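
One way such a column could be computed (a sketch only, with a
hypothetical percent_incorrect() helper; the v3 patch later in this
thread folds the arithmetic directly into its t_show(), and the exact
formula is refined in the follow-up discussion):

	/*
	 * Sketch: percentage of mispredicted branches for one
	 * annotation, from the correct/incorrect counters kept per
	 * annotation site. Guard against dividing by zero for
	 * sites that were never hit.
	 */
	static unsigned long percent_incorrect(unsigned long correct,
					       unsigned long incorrect)
	{
		unsigned long total = correct + incorrect;

		return total ? incorrect * 100 / total : 0;
	}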


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org


* [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations
  2008-10-29 16:35             ` [PATCH 1/2 v2][RFC] " Steven Rostedt
  2008-10-29 16:38               ` [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer Steven Rostedt
  2008-10-29 16:40               ` [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations Arjan van de Ven
@ 2008-10-29 22:39               ` Steven Rostedt
  2008-10-29 22:40                 ` [PATCH 2/2 v3][RFC] ftrace: unlikely annotation tracer Steven Rostedt
  2008-10-30 14:32                 ` [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations Jörn Engel
  2 siblings, 2 replies; 17+ messages in thread
From: Steven Rostedt @ 2008-10-29 22:39 UTC (permalink / raw)
  To: LKML
  Cc: Arjan van de Ven, Theodore Tso, Mike Snitzer, Andrew Morton,
	Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker, Linus Torvalds


[
  changes since v2:

  Added Arjan's recommendation about calculating a percentage of 
    failures in the output.

  Question: Do people prefer the "func file line" notation or the
     kallsyms "func+0x<offset>" notation?
]

Andrew Morton recently suggested having an in-kernel way to profile
likely and unlikely macros. This patch achieves that goal.

When configured, every(*) likely and unlikely macro gets a counter attached
to it. When the condition is hit, the hit and misses of that condition
are recorded. These numbers can later be retrieved by:

  /debugfs/tracing/profile_likely    - All likely markers
  /debugfs/tracing/profile_unlikely  - All unlikely markers.

# cat /debug/tracing/profile_unlikely | head
 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
    2494        0   0 do_arch_prctl                  process_64.c         832
       0        0   0 do_arch_prctl                  process_64.c         804
    2988        0   0 IS_ERR                         err.h                34
  124668     6133   4 __switch_to                    process_64.c         673
  130811        0   0 __switch_to                    process_64.c         639
   69768    61064  87 __switch_to                    process_64.c         624
   15557   115251 100 __switch_to                    process_64.c         594
   15555   115227 100 __switch_to                    process_64.c         590

# cat /debug/tracing/profile_unlikely | \
  awk '{ if ($3 > 25) print $0; }' |head -20
   59851    49675  82 __switch_to                    process_64.c         624
   12688    96800 100 __switch_to                    process_64.c         594
   12688    96778 100 __switch_to                    process_64.c         590
  152290    49467  32 need_resched                   sched.h              2103
    1385     1034  74 syscall_get_error              syscall.h            51
       0     2967 100 syscall_trace_leave            ptrace.c             1568
       0        1 100 native_smp_prepare_cpus        smpboot.c            1231
  304770    92199  30 update_curr                    sched_fair.c         489
  113195   297065 100 calc_delta_fair                sched_fair.c         411
  227018   123309  54 calc_delta_mine                sched.c              1279
       0    70299 100 sched_info_queued              sched_stats.h        222
   67434   100268 100 pick_next_task_fair            sched_fair.c         1394
       7        9 100 yield_task_fair                sched_fair.c         983
    7215     3321  46 rt_policy                      sched.c              144
       0     1611 100 pre_schedule_rt                sched_rt.c           1260
    1607    66013 100 pick_next_task_rt              sched_rt.c           884
       0    60198 100 sched_info_dequeued            sched_stats.h        177
       0       15 100 sched_move_task                sched.c              8707
       0       15 100 sched_move_task                sched.c              8697
   68373    50822  74 schedule                       sched.c              4467


# cat /debug/tracing/profile_likely | awk '{ if ($3 > 25) print $0; }'
   54665    53649  98 pick_next_task                 sched.c              4407
   25344    23614  93 switch_mm                      mmu_context_64.h     18
       0        7 100 __cancel_work_timer            workqueue.c          560
    1202     1354 100 clocksource_adjust             timekeeping.c        456
       0   441888 100 audit_syscall_exit             auditsc.c            1570
      22   442776 100 audit_get_context              auditsc.c            732
       0   440930 100 audit_syscall_entry            auditsc.c            1541
      22      889 100 audit_free                     auditsc.c            1446
       0      962 100 audit_alloc                    auditsc.c            862
    2614      962  36 audit_alloc                    auditsc.c            858
       0        6 100 move_masked_irq                migration.c          20
    4482     2110  47 __grab_cache_page              filemap.c            2212
    9575     2787  29 iov_iter_advance               filemap.c            1884
    3121      905  28 __set_page_dirty               page-writeback.c     1134
   12882   222991 100 mapping_unevictable            pagemap.h            50
       4       11 100 __flush_cpu_slab               slub.c               1466
  664688   320446  48 slab_free                      slub.c               1731
    2962    61920 100 dentry_lru_del_init            dcache.c             153
     950     1157 100 load_elf_binary                binfmt_elf.c         904
     102       82  80 disk_put_part                  genhd.h              206
       0        4 100 tcp_mss_split_point            tcp_output.c         1126


As you can see from the above, there's a bit of work to do in rethinking
the use of some unlikelys and likelys. Note: the unlikely case had 77
entries that were incorrect more than 25% of the time.

Note:  After submitting my first version of this patch, Andrew Morton
  showed me a version written by Daniel Walker, from which I picked up
  the following ideas:

  1)  Using __builtin_constant_p to avoid profiling fixed values
       (see the sketch after this list).
  2)  Using __FILE__ instead of instruction pointers.
  3)  Using the preprocessor to stop all profiling of likely
       annotations from vsyscall_64.c.
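
As a quick illustration of item 1 (a sketch using a hypothetical
my_likely() name; the real macros are in the compiler.h hunk below):

	/*
	 * Sketch: a compile-time constant condition has nothing worth
	 * profiling, so bypass the counting wrapper and keep only the
	 * plain !!(x) evaluation.
	 */
	#define my_likely(x) \
		(__builtin_constant_p(x) ? !!(x) : likely_check(x))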

Thanks to Andrew Morton, Arjan van de Ven, Theodore Tso and Ingo Molnar
for their feed back on this patch.

(*) Not every unlikely is recorded; those that are used by vsyscalls
 (a few of them) had to have profiling disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/kernel/vsyscall_64.c     |    8 +
 include/asm-generic/vmlinux.lds.h |   14 ++-
 include/linux/compiler.h          |   61 +++++++++++++
 kernel/trace/Kconfig              |   16 +++
 kernel/trace/Makefile             |    2 
 kernel/trace/trace_unlikely.c     |  167 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 265 insertions(+), 3 deletions(-)

Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile	2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile	2008-10-29 18:31:26.000000000 -0400
@@ -25,4 +25,6 @@ obj-$(CONFIG_STACK_TRACER) += trace_stac
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 
+obj-$(CONFIG_TRACE_UNLIKELY_PROFILE) += trace_unlikely.o
+
 libftrace-y := ftrace.o
Index: linux-tip.git/arch/x86/kernel/vsyscall_64.c
===================================================================
--- linux-tip.git.orig/arch/x86/kernel/vsyscall_64.c	2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/arch/x86/kernel/vsyscall_64.c	2008-10-29 16:13:00.000000000 -0400
@@ -17,6 +17,14 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+/* Protect userspace from profiling */
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+# undef likely
+# undef unlikely
+# define likely(x)		likely_notrace(x)
+# define unlikely(x)		unlikely_notrace(x)
+#endif
+
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
Index: linux-tip.git/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-tip.git.orig/include/asm-generic/vmlinux.lds.h	2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/include/asm-generic/vmlinux.lds.h	2008-10-29 16:13:00.000000000 -0400
@@ -45,6 +45,17 @@
 #define MCOUNT_REC()
 #endif
 
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+#define LIKELY_PROFILE()	VMLINUX_SYMBOL(__start_likely_profile) = .;   \
+				*(_ftrace_likely)			      \
+				VMLINUX_SYMBOL(__stop_likely_profile) = .;    \
+				VMLINUX_SYMBOL(__start_unlikely_profile) = .; \
+				*(_ftrace_unlikely)			      \
+				VMLINUX_SYMBOL(__stop_unlikely_profile) = .;
+#else
+#define LIKELY_PROFILE()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -62,7 +73,8 @@
 	VMLINUX_SYMBOL(__stop___markers) = .;				\
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
-	VMLINUX_SYMBOL(__stop___tracepoints) = .;
+	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
+	LIKELY_PROFILE()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
Index: linux-tip.git/include/linux/compiler.h
===================================================================
--- linux-tip.git.orig/include/linux/compiler.h	2008-10-29 12:38:54.000000000 -0400
+++ linux-tip.git/include/linux/compiler.h	2008-10-29 16:13:00.000000000 -0400
@@ -59,8 +59,65 @@ extern void __chk_io_ptr(const volatile 
  * specific implementations come from the above header files
  */
 
-#define likely(x)	__builtin_expect(!!(x), 1)
-#define unlikely(x)	__builtin_expect(!!(x), 0)
+#ifdef CONFIG_TRACE_UNLIKELY_PROFILE
+struct ftrace_likely_data {
+	const char *func;
+	const char *file;
+	unsigned line;
+	unsigned long correct;
+	unsigned long incorrect;
+};
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect);
+
+#define likely_notrace(x)	__builtin_expect(!!(x), 1)
+#define unlikely_notrace(x)	__builtin_expect(!!(x), 0)
+
+#define likely_check(x) ({						\
+			int ______r;					\
+			static struct ftrace_likely_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_likely"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______f.line = __LINE__;			\
+			______r = likely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, 1);	\
+			______r;					\
+		})
+#define unlikely_check(x) ({						\
+			int ______r;					\
+			static struct ftrace_likely_data		\
+				__attribute__((__aligned__(4)))		\
+				__attribute__((section("_ftrace_unlikely"))) \
+				______f = {				\
+				.func = __func__,			\
+				.file = __FILE__,			\
+				.line = __LINE__,			\
+			};						\
+			______f.line = __LINE__;			\
+			______r = unlikely_notrace(x);			\
+			ftrace_likely_update(&______f, ______r, 0);	\
+			______r;					\
+		})
+
+/*
+ * Using __builtin_constant_p(x) to ignore cases where the return
+ * value is always the same.  This idea is taken from a similar patch
+ * written by Daniel Walker.
+ */
+# ifndef likely
+#  define likely(x)	(__builtin_constant_p(x) ? !!(x) : likely_check(x))
+# endif
+# ifndef unlikely
+#  define unlikely(x)	(__builtin_constant_p(x) ? !!(x) : unlikely_check(x))
+# endif
+#else
+# define likely(x)	__builtin_expect(!!(x), 1)
+# define unlikely(x)	__builtin_expect(!!(x), 0)
+#endif
 
 /* Optimization barrier */
 #ifndef barrier
Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig	2008-10-29 16:12:53.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig	2008-10-29 18:31:26.000000000 -0400
@@ -138,6 +138,22 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
+config TRACE_UNLIKELY_PROFILE
+	bool "Trace likely/unlikely profiler"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer profiles all the likely and unlikely macros
+	  in the kernel. It will display the results in:
+
+	  /debugfs/tracing/profile_likely
+	  /debugfs/tracing/profile_unlikely
+
+	  Note: this will add significant overhead; only turn this
+	  on if you need to profile the system's use of these macros.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-tip.git/kernel/trace/trace_unlikely.c	2008-10-29 18:31:26.000000000 -0400
@@ -0,0 +1,167 @@
+/*
+ * unlikely profiler
+ *
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ftrace.h>
+#include <linux/hash.h>
+#include <linux/fs.h>
+#include <asm/local.h>
+#include "trace.h"
+
+void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
+{
+	/* FIXME: Make this atomic! */
+	if (val == expect)
+		f->correct++;
+	else
+		f->incorrect++;
+}
+EXPORT_SYMBOL(ftrace_likely_update);
+
+struct ftrace_pointer {
+	void		*start;
+	void		*stop;
+};
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_pointer *f = m->private;
+	struct ftrace_likely_data *p = v;
+
+	(*pos)++;
+
+	if (v == (void *)1)
+		return f->start;
+
+	++p;
+
+	if ((void *)p >= (void *)f->stop)
+		return NULL;
+
+	return p;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	void *t = (void *)1;
+	loff_t l = 0;
+
+	for (; t && l < *pos; t = t_next(m, t, &l))
+		;
+
+	return t;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_likely_data *p = v;
+	const char *f;
+	unsigned long percent;
+
+	if (v == (void *)1) {
+		seq_printf(m, " correct incorrect  %% "
+			      "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+		return 0;
+	}
+
+	/* Only print the file, not the path */
+	f = p->file + strlen(p->file);
+	while (f >= p->file && *f != '/')
+		f--;
+	f++;
+
+	if (p->correct) {
+		percent = p->incorrect * 100;
+		percent /= p->correct;
+		/* No need to see huge numbers */
+		if (percent > 100)
+			percent = 100;
+	} else
+		percent = p->incorrect ? 100 : 0;
+
+	seq_printf(m, "%8lu %8lu %3lu ", p->correct, p->incorrect, percent);
+	seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line);
+	return 0;
+}
+
+static struct seq_operations tracing_likely_seq_ops = {
+	.start		= t_start,
+	.next		= t_next,
+	.stop		= t_stop,
+	.show		= t_show,
+};
+
+static int tracing_likely_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &tracing_likely_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = (void *)inode->i_private;
+	}
+
+	return ret;
+}
+
+static struct file_operations tracing_likely_fops = {
+	.open		= tracing_likely_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+
+extern unsigned long __start_likely_profile[];
+extern unsigned long __stop_likely_profile[];
+extern unsigned long __start_unlikely_profile[];
+extern unsigned long __stop_unlikely_profile[];
+
+static struct ftrace_pointer ftrace_likely_pos = {
+	.start			= __start_likely_profile,
+	.stop			= __stop_likely_profile,
+};
+
+static struct ftrace_pointer ftrace_unlikely_pos = {
+	.start			= __start_unlikely_profile,
+	.stop			= __stop_unlikely_profile,
+};
+
+static __init int ftrace_unlikely_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+
+	entry = debugfs_create_file("profile_likely", 0444, d_tracer,
+				    &ftrace_likely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'profile_likely' entry\n");
+
+	entry = debugfs_create_file("profile_unlikely", 0444, d_tracer,
+				    &ftrace_unlikely_pos,
+				    &tracing_likely_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs"
+			   " 'profile_unlikely' entry\n");
+
+	return 0;
+}
+
+device_initcall(ftrace_unlikely_init);


* [PATCH 2/2 v3][RFC] ftrace: unlikely annotation tracer
  2008-10-29 22:39               ` [PATCH 1/2 v3][RFC] " Steven Rostedt
@ 2008-10-29 22:40                 ` Steven Rostedt
  2008-10-30 14:32                 ` [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations Jörn Engel
  1 sibling, 0 replies; 17+ messages in thread
From: Steven Rostedt @ 2008-10-29 22:40 UTC (permalink / raw)
  To: LKML
  Cc: Arjan van de Ven, Theodore Tso, Mike Snitzer, Andrew Morton,
	Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker, Linus Torvalds


[
  Changes since v2:

   Changed to a "func:file:line-num" format.
]

This patch adds a way to record the instances of the likely and unlikely
condition annotations.

When "unlikely" is set in /debugfs/tracing/iter_ctrl the unlikely conditions
will be added to any of the ftrace tracers.
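
Enabling it is just a matter of writing the option name into iter_ctrl
(assuming the same debugfs mount point as the earlier examples in this
thread):

 # echo unlikely > /debug/tracing/iter_ctrl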

For example:

 bash-3471  [003]   357.014755: [INCORRECT] sched_info_dequeued:sched_stats.h:177
 bash-3471  [003]   357.014756: [correct] update_curr:sched_fair.c:489
 bash-3471  [003]   357.014758: [correct] calc_delta_fair:sched_fair.c:411
 bash-3471  [003]   357.014759: [correct] account_group_exec_runtime:sched_stats.h:356
 bash-3471  [003]   357.014761: [correct] update_curr:sched_fair.c:489
 bash-3471  [003]   357.014763: [INCORRECT] calc_delta_fair:sched_fair.c:411
 bash-3471  [003]   357.014765: [correct] calc_delta_mine:sched.c:1279


Which shows the normal tracer heading, as well as whether the condition was
correct "[correct]" or was mistaken "[INCORRECT]", followed by the function,
file name and line number.
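
Since each hit is tagged in the output, the mispredictions can be pulled
out with an ordinary grep, e.g.:

 # grep INCORRECT /debug/tracing/trace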


Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/Makefile               |    5 +
 kernel/trace/Kconfig          |   22 ++++++
 kernel/trace/Makefile         |    5 +
 kernel/trace/ring_buffer.c    |   11 +++
 kernel/trace/trace.c          |   39 +++++++++++
 kernel/trace/trace.h          |   43 +++++++++++++
 kernel/trace/trace_unlikely.c |  137 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 262 insertions(+)

Index: linux-tip.git/kernel/trace/Kconfig
===================================================================
--- linux-tip.git.orig/kernel/trace/Kconfig	2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/Kconfig	2008-10-29 17:46:37.000000000 -0400
@@ -154,6 +154,28 @@ config TRACE_UNLIKELY_PROFILE
 
 	  Say N if unsure.
 
+config TRACING_UNLIKELY
+	bool
+	help
+	  Selected by tracers that will trace the likely and unlikely
+	  conditions. This prevents the tracers themselves from being
+	  profiled. Profiling the tracing infrastructure can only happen
+	  when the likelys and unlikelys are not being traced.
+
+config UNLIKELY_TRACER
+	bool "Trace likely/unlikely instances"
+	depends on TRACE_UNLIKELY_PROFILE
+	select TRACING_UNLIKELY
+	help
+	  This traces the events of likely and unlikely condition
+	  calls in the kernel.  The difference between this and the
+	  "Trace likely/unlikely profiler" is that this is not a
+	  histogram of the callers, but actually places the calling
+	  events into a running buffer to see when and where the
+	  events happened, as well as their results.
+
+	  Say N if unsure.
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
Index: linux-tip.git/kernel/Makefile
===================================================================
--- linux-tip.git.orig/kernel/Makefile	2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/Makefile	2008-10-29 17:46:37.000000000 -0400
@@ -24,6 +24,11 @@ CFLAGS_REMOVE_sched_clock.o = -pg
 CFLAGS_REMOVE_sched.o = -mno-spe -pg
 endif
 
+ifdef CONFIG_TRACING_UNLIKELY
+# Do not profile tracepoint if we are tracing the unlikely conditions
+CFLAGS_tracepoint.o += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
 obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
Index: linux-tip.git/kernel/trace/Makefile
===================================================================
--- linux-tip.git.orig/kernel/trace/Makefile	2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/Makefile	2008-10-29 17:46:37.000000000 -0400
@@ -10,6 +10,11 @@ CFLAGS_trace_selftest_dynamic.o = -pg
 obj-y += trace_selftest_dynamic.o
 endif
 
+ifdef CONFIG_TRACING_UNLIKELY
+KBUILD_CFLAGS += '-Dlikely(x)=likely_notrace(x)'
+KBUILD_CFLAGS += '-Dunlikely(x)=unlikely_notrace(x)'
+endif
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
Index: linux-tip.git/kernel/trace/trace_unlikely.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace_unlikely.c	2008-10-29 17:25:31.000000000 -0400
+++ linux-tip.git/kernel/trace/trace_unlikely.c	2008-10-29 18:03:30.000000000 -0400
@@ -15,8 +15,145 @@
 #include <asm/local.h>
 #include "trace.h"
 
+#ifdef CONFIG_UNLIKELY_TRACER
+
+static int unlikely_tracing_enabled __read_mostly;
+static DEFINE_MUTEX(unlikely_tracing_mutex);
+static DEFINE_PER_CPU(atomic_t, unlikely_trace_disabled);
+static struct trace_array *unlikely_tracer;
+
+void trace_unlikely_stop(void)
+{
+	/* preemption should be disabled */
+	int cpu = smp_processor_id();
+
+	atomic_inc(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+void trace_unlikely_start(void)
+{
+	/* preemption should be disabled */
+	int cpu = smp_processor_id();
+
+	atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+}
+
+static void
+probe_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+	struct trace_array *tr = unlikely_tracer;
+	struct ring_buffer_event *event;
+	struct trace_unlikely *entry;
+	unsigned long flags, irq_flags;
+	int cpu, resched, pc;
+	const char *p;
+
+	/*
+	 * I would love to save just the ftrace_likely_data pointer, but
+	 * this code can also be used by modules. Ugly things can happen
+	 * if the module is unloaded, and then we go and read the
+	 * pointer.  This is slower, but much safer.
+	 */
+
+	if (unlikely(!tr))
+		return;
+
+	pc = preempt_count();
+	/* Can be called within schedule */
+	resched = need_resched();
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	if (atomic_inc_return(&per_cpu(unlikely_trace_disabled, cpu)) != 1)
+		goto out;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+
+	local_save_flags(flags);
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, flags, pc);
+	entry->ent.type		= TRACE_UNLIKELY;
+
+	/* Strip off the path, only save the file */
+	p = f->file + strlen(f->file);
+	while (p >= f->file && *p != '/')
+		p--;
+	p++;
+
+	strncpy(entry->func, f->func, TRACE_FUNC_SIZE);
+	strncpy(entry->file, p, TRACE_FILE_SIZE);
+	entry->func[TRACE_FUNC_SIZE] = 0;
+	entry->file[TRACE_FILE_SIZE] = 0;
+	entry->line = f->line;
+	entry->correct = val == expect;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+ out:
+	atomic_dec(&per_cpu(unlikely_trace_disabled, cpu));
+	if (resched)
+		preempt_enable_no_resched_notrace();
+	else
+		preempt_enable_notrace();
+}
+
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+	if (!unlikely_tracing_enabled)
+		return;
+
+	probe_likely_condition(f, val, expect);
+}
+
+int enable_unlikely_tracing(struct trace_array *tr)
+{
+	int ret = 0;
+
+	mutex_lock(&unlikely_tracing_mutex);
+	unlikely_tracer = tr;
+	/*
+	 * Must be seen before enabling. The reader only checks a
+	 * condition, so we do not need a matching rmb().
+	 */
+	smp_wmb();
+	unlikely_tracing_enabled++;
+	mutex_unlock(&unlikely_tracing_mutex);
+
+	return ret;
+}
+
+void disable_unlikely_tracing(void)
+{
+	mutex_lock(&unlikely_tracing_mutex);
+
+	if (!unlikely_tracing_enabled)
+		goto out_unlock;
+
+	unlikely_tracing_enabled--;
+
+ out_unlock:
+	mutex_unlock(&unlikely_tracing_mutex);
+}
+#else
+static inline
+void trace_likely_condition(struct ftrace_likely_data *f, int val, int expect)
+{
+}
+#endif /* CONFIG_UNLIKELY_TRACER */
+
 void ftrace_likely_update(struct ftrace_likely_data *f, int val, int expect)
 {
+	/*
+	 * I would love to have a trace point here instead, but the
+	 * trace point code is so inundated with unlikely and likely
+	 * conditions that the recursive nightmare that exists is too
+	 * much to try to get working. At least for now.
+	 */
+	trace_likely_condition(f, val, expect);
+
 	/* FIXME: Make this atomic! */
 	if (val == expect)
 		f->correct++;
Index: linux-tip.git/kernel/trace/trace.c
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.c	2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.c	2008-10-29 18:04:04.000000000 -0400
@@ -235,6 +235,9 @@ static const char *trace_options[] = {
 	"stacktrace",
 	"sched-tree",
 	"ftrace_printk",
+#ifdef CONFIG_UNLIKELY_TRACER
+	"unlikely",
+#endif
 	NULL
 };
 
@@ -1457,6 +1460,18 @@ print_lat_fmt(struct trace_iterator *ite
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		struct trace_unlikely *field;
+
+		trace_assign_type(field, entry);
+
+		trace_seq_printf(s, "[%s] %s:%s:%d\n",
+				 field->correct ? "correct" : "INCORRECT",
+				 field->func,
+				 field->file,
+				 field->line);
+		break;
+	}
 	default:
 		trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	}
@@ -1593,6 +1608,18 @@ static enum print_line_t print_trace_fmt
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		struct trace_unlikely *field;
+
+		trace_assign_type(field, entry);
+
+		trace_seq_printf(s, "[%s] %s:%s:%d\n",
+				 field->correct ? "correct" : "INCORRECT",
+				 field->func,
+				 field->file,
+				 field->line);
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -1675,6 +1702,10 @@ static enum print_line_t print_raw_fmt(s
 			trace_seq_print_cont(s, iter);
 		break;
 	}
+	case TRACE_UNLIKELY: {
+		/* IGNORE */
+		break;
+	}
 	}
 	return TRACE_TYPE_HANDLED;
 }
@@ -2362,8 +2393,14 @@ tracing_ctrl_write(struct file *filp, co
 
 		tr->ctrl = val;
 
+		if (!val)
+			trace_unlikely_disable();
+
 		if (current_trace && current_trace->ctrl_update)
 			current_trace->ctrl_update(tr);
+
+		if (val)
+			trace_unlikely_enable(tr);
 	}
 	mutex_unlock(&trace_types_lock);
 
@@ -2425,6 +2462,7 @@ tracing_set_trace_write(struct file *fil
 	if (t == current_trace)
 		goto out;
 
+	trace_unlikely_disable();
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
 
@@ -2432,6 +2470,7 @@ tracing_set_trace_write(struct file *fil
 	if (t->init)
 		t->init(tr);
 
+	trace_unlikely_enable(tr);
  out:
 	mutex_unlock(&trace_types_lock);
 
Index: linux-tip.git/kernel/trace/trace.h
===================================================================
--- linux-tip.git.orig/kernel/trace/trace.h	2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/trace.h	2008-10-29 17:46:37.000000000 -0400
@@ -21,6 +21,7 @@ enum trace_type {
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
+	TRACE_UNLIKELY,
 	TRACE_BOOT,
 
 	__TRACE_LAST_TYPE
@@ -117,6 +118,16 @@ struct trace_boot {
 	struct boot_trace	initcall;
 };
 
+#define TRACE_FUNC_SIZE 30
+#define TRACE_FILE_SIZE 20
+struct trace_unlikely {
+	struct trace_entry	ent;
+	unsigned	        line;
+	char			func[TRACE_FUNC_SIZE+1];
+	char			file[TRACE_FILE_SIZE+1];
+	char			correct;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -217,6 +228,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct trace_unlikely, TRACE_UNLIKELY); \
 		__ftrace_bad_type();					\
 	} while (0)
 
@@ -413,8 +425,39 @@ enum trace_iterator_flags {
 	TRACE_ITER_STACKTRACE		= 0x100,
 	TRACE_ITER_SCHED_TREE		= 0x200,
 	TRACE_ITER_PRINTK		= 0x400,
+#ifdef CONFIG_UNLIKELY_TRACER
+	TRACE_ITER_UNLIKELY		= 0x800,
+#endif
 };
 
 extern struct tracer nop_trace;
 
+#ifdef CONFIG_UNLIKELY_TRACER
+extern int enable_unlikely_tracing(struct trace_array *tr);
+extern void disable_unlikely_tracing(void);
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+	if (trace_flags & TRACE_ITER_UNLIKELY)
+		return enable_unlikely_tracing(tr);
+	return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+	/* due to races, always disable */
+	disable_unlikely_tracing();
+}
+extern void trace_unlikely_stop(void);
+extern void trace_unlikely_start(void);
+#else
+static inline int trace_unlikely_enable(struct trace_array *tr)
+{
+	return 0;
+}
+static inline void trace_unlikely_disable(void)
+{
+}
+static inline void trace_unlikely_stop(void) { }
+static inline void trace_unlikely_start(void) { }
+#endif /* CONFIG_UNLIKELY_TRACER */
+
 #endif /* _LINUX_KERNEL_TRACE_H */
Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c	2008-10-29 17:01:11.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c	2008-10-29 17:46:37.000000000 -0400
@@ -16,6 +16,8 @@
 #include <linux/list.h>
 #include <linux/fs.h>
 
+#include "trace.h"
+
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
@@ -1151,7 +1153,14 @@ ring_buffer_lock_reserve(struct ring_buf
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
+	/*
+	 * Ugly, but the unlikely tracing will re-enter the ring
+	 * buffer code, namely through the timestamp code, which
+	 * would trigger the "again" loop forever.
+	 */
+	trace_unlikely_stop();
 	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	trace_unlikely_start();
 	if (!event)
 		goto out;
 
@@ -1263,8 +1272,10 @@ int ring_buffer_write(struct ring_buffer
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
+	trace_unlikely_stop();
 	event = rb_reserve_next_event(cpu_buffer,
 				      RINGBUF_TYPE_DATA, event_length);
+	trace_unlikely_start();
 	if (!event)
 		goto out;
 




* Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations
  2008-10-29 22:39               ` [PATCH 1/2 v3][RFC] " Steven Rostedt
  2008-10-29 22:40                 ` [PATCH 2/2 v3][RFC] ftrace: unlikely annotation tracer Steven Rostedt
@ 2008-10-30 14:32                 ` Jörn Engel
  2008-10-30 14:55                   ` Theodore Tso
  1 sibling, 1 reply; 17+ messages in thread
From: Jörn Engel @ 2008-10-30 14:32 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: LKML, Arjan van de Ven, Theodore Tso, Mike Snitzer,
	Andrew Morton, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker, Linus Torvalds

On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
>
>    69768    61064  87 __switch_to                    process_64.c         624
>    15557   115251 100 __switch_to                    process_64.c         594
>    15555   115227 100 __switch_to                    process_64.c         590

I may be out of school for a while, but that math sure looks odd.

> +	if (p->correct) {
> +		percent = p->incorrect * 100;
> +		percent /= p->correct;
                percent /= p->correct + p->incorrect;
> +		/* No need to see huge numbers */
> +		if (percent > 100)
> +			percent = 100;
> +	} else
> +		percent = p->incorrect ? 100 : 0;

Jörn

-- 
To my face you have the audacity to advise me to become a thief - the worst
kind of thief that is conceivable, a thief of spiritual things, a thief of
ideas! It is insufferable, intolerable!
-- M. Binet in Scarabouche


* Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations
  2008-10-30 14:32                 ` [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations Jörn Engel
@ 2008-10-30 14:55                   ` Theodore Tso
  2008-10-30 15:10                     ` Steven Rostedt
  0 siblings, 1 reply; 17+ messages in thread
From: Theodore Tso @ 2008-10-30 14:55 UTC (permalink / raw)
  To: Jörn Engel
  Cc: Steven Rostedt, LKML, Arjan van de Ven, Mike Snitzer,
	Andrew Morton, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker, Linus Torvalds

On Thu, Oct 30, 2008 at 03:32:39PM +0100, Jörn Engel wrote:
> On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
> >
> >    69768    61064  87 __switch_to                    process_64.c         624
> >    15557   115251 100 __switch_to                    process_64.c         594
> >    15555   115227 100 __switch_to                    process_64.c         590
> 
> I may be out of school for a while, but that math sure looks odd.
> 
> > +	if (p->correct) {
> > +		percent = p->incorrect * 100;
> > +		percent /= p->correct;
>                 percent /= p->correct + p->incorect;

And once you do the above fix, I don't think what is below is
necessary any more.  :-)

> > +		/* No need to see huge numbers */
> > +		if (percent > 100)
> > +			percent = 100;

I would also calculate the percent correct rather than the percent
incorrect, on the general theory that 100% good, 0% bad is easier for
my little brain to understand, but that's just a minor thing...

					- Ted


* Re: [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations
  2008-10-30 14:55                   ` Theodore Tso
@ 2008-10-30 15:10                     ` Steven Rostedt
  0 siblings, 0 replies; 17+ messages in thread
From: Steven Rostedt @ 2008-10-30 15:10 UTC (permalink / raw)
  To: Theodore Tso
  Cc: Jörn Engel, LKML, Arjan van de Ven, Mike Snitzer,
	Andrew Morton, Ingo Molnar, Thomas Gleixner, Peter Zijlstra,
	Frederic Weisbecker, Daniel Walker, Linus Torvalds


On Thu, 30 Oct 2008, Theodore Tso wrote:

> On Thu, Oct 30, 2008 at 03:32:39PM +0100, Jörn Engel wrote:
> > On Wed, 29 October 2008 18:39:55 -0400, Steven Rostedt wrote:
> > >
> > >    69768    61064  87 __switch_to                    process_64.c         624
> > >    15557   115251 100 __switch_to                    process_64.c         594
> > >    15555   115227 100 __switch_to                    process_64.c         590
> > 
> > I may be out of school for a while, but that math sure looks odd.
> > 
> > > +	if (p->correct) {
> > > +		percent = p->incorrect * 100;
> > > +		percent /= p->correct;
> >                 percent /= p->correct + p->incorrect;
> 
> And once you do the above fix, I don't think what is below is
> necessary any more.  :-)

Bah! Total brain-fart.  Thanks ;-)

> 
> > > +		/* No need to see huge numbers */
> > > +		if (percent > 100)
> > > +			percent = 100;
> 
> I would also calculate the percent correct rather than the percent
> incorrect, on the general theory that 100% good, 0% bad is easier for
> my little brain to understand, but that's just a minor thing...

I thought about whether to show percent correct or percent incorrect.
I chose incorrect just because it makes it easier to spot the
troublemakers.  You can say "let's allow 25% incorrect", which reads
better than "let's require a minimum of 75% correct".  I think the mind
can pick out the entries that go over a threshold better than it can
spot those that fall under one.

Thanks,

-- Steve




Thread overview: 17+ messages
     [not found] <170fa0d20810271529g3c64ae89me29ed8b65a9c3b5e@mail.gmail.com>
     [not found] ` <alpine.DEB.1.10.0810271918390.19731@gandalf.stny.rr.com>
     [not found]   ` <20081028001340.GB9797@mit.edu>
2008-10-28  4:12     ` [PATCH][RFC] trace: profile likely and unlikely annotations Steven Rostedt
2008-10-28  4:23       ` Arjan van de Ven
2008-10-28  4:39       ` Andrew Morton
2008-10-28 14:37       ` Theodore Tso
2008-10-28 14:48         ` Arjan van de Ven
2008-10-28 14:51           ` Steven Rostedt
2008-10-29 16:35             ` [PATCH 1/2 v2][RFC] " Steven Rostedt
2008-10-29 16:38               ` [PATCH 2/2 v2][RFC] ftrace: unlikely annotation tracer Steven Rostedt
2008-10-29 16:40               ` [PATCH 1/2 v2][RFC] trace: profile likely and unlikely annotations Arjan van de Ven
2008-10-29 22:39               ` [PATCH 1/2 v3][RFC] " Steven Rostedt
2008-10-29 22:40                 ` [PATCH 2/2 v3][RFC] ftrace: unlikely annotation tracer Steven Rostedt
2008-10-30 14:32                 ` [PATCH 1/2 v3][RFC] trace: profile likely and unlikely annotations Jörn Engel
2008-10-30 14:55                   ` Theodore Tso
2008-10-30 15:10                     ` Steven Rostedt
2008-10-28 14:49         ` [PATCH][RFC] " Steven Rostedt
2008-10-28 18:29           ` Theodore Tso
2008-10-28 18:41             ` Steven Rostedt
