LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
@ 2008-10-11 20:22 Frederic Weisbecker
  2008-10-21 12:28 ` Frédéric Weisbecker
  0 siblings, 1 reply; 12+ messages in thread
From: Frederic Weisbecker @ 2008-10-11 20:22 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Steven Rostedt

Introduce the Big Kernel Lock tracer.
This new tracer lets one observe the latencies caused
by the lock_kernel() function.
It captures the time when the task requests the spinlock,
the time when the spinlock is acquired and the time it is released.
This way we can measure the latency for a task that waits for this
spinlock and the latency caused by the time this lock is held.

I will surely add some improvements like:

_ Adding a stack trace to know which function locked or unlocked the kernel.
_ Adding a script to get some statistics about this: which functions hold it
often, which hold it for a long time and some other things....

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/bkl.h      |   29 ++++++++
 kernel/trace/Kconfig     |   11 +++
 kernel/trace/Makefile    |    1 +
 kernel/trace/trace.h     |   18 +++++
 kernel/trace/trace_bkl.c |  170 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/kernel_lock.c        |   22 ++++++-
 6 files changed, 249 insertions(+), 2 deletions(-)

diff --git a/include/trace/bkl.h b/include/trace/bkl.h
new file mode 100644
index 0000000..bed2f1b
--- /dev/null
+++ b/include/trace/bkl.h
@@ -0,0 +1,29 @@
+#ifndef _TRACE_BKL_H
+#define _TRACE_BKL_H
+
+
+/* Entry which log the time when the task tried first to acquire the bkl
+ * and the time when it acquired it.
+ * That will let us know the latency to acquire the kernel lock.
+ */
+struct bkl_trace_acquire {
+	unsigned long long acquire_req_time;
+	unsigned long long acquire_time;
+};
+
+/* This will log the time when the task released the bkl.
+ * So we will know the whole duration of its kernel lock.
+ */
+struct bkl_trace_release {
+	unsigned long long release_time;
+};
+
+#ifdef CONFIG_BKL_TRACER
+extern void trace_bkl_acquire(struct bkl_trace_acquire *trace);
+extern void trace_bkl_release(struct bkl_trace_release *trace);
+#else
+static void trace_bkl_acquire(struct bkl_trace_acquire *trace) { }
+static void trace_bkl_release(struct bkl_trace_release *trace) { }
+#endif
+
+#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 396aea1..fde7700 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -144,6 +144,17 @@ config STACK_TRACER
 	  This tracer records the max stack of the kernel, and displays
 	  it in debugfs/tracing/stack_trace
 
+config BKL_TRACER
+	bool "Trace Big Kernel Lock latencies"
+	depends on HAVE_FTRACE
+	depends on DEBUG_KERNEL
+	select FTRACE
+	select STACKTRACE
+	help
+	  This tracer records the latencies issued by the Big Kernel Lock.
+	  It traces the time when a task request the bkl, the time when it
+	  acquires it and the time when it releases it.
+
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a85dfba..2ba8904 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_BKL_TRACER) += trace_bkl.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 707620c..b1df671 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
 #include <linux/ring_buffer.h>
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
+#include <trace/bkl.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -22,6 +23,8 @@ enum trace_type {
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BOOT,
+	TRACE_BKL_ACQ,
+	TRACE_BKL_REL,
 
 	__TRACE_LAST_TYPE
 };
@@ -119,6 +122,17 @@ struct trace_boot {
 	struct boot_trace	initcall;
 };
 
+/* Trace bkl entries */
+struct trace_bkl_acq {
+	struct trace_entry		ent;
+	struct bkl_trace_acquire	bkl_acq;
+};
+
+struct trace_bkl_rel {
+	struct trace_entry		ent;
+	struct bkl_trace_release	bkl_rel;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -219,6 +233,10 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct trace_bkl_acq, 		\
+			  TRACE_BKL_ACQ);				\
+		IF_ASSIGN(var, ent, struct trace_bkl_rel,		\
+			  TRACE_BKL_REL);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_bkl.c b/kernel/trace/trace_bkl.c
new file mode 100644
index 0000000..169ee01
--- /dev/null
+++ b/kernel/trace/trace_bkl.c
@@ -0,0 +1,170 @@
+/*
+ * Ring buffer based Big Kernel Lock tracer
+ * for purpose of measuring the latencies issued
+ * by the BKL.
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <trace/bkl.h>
+#include "trace.h"
+#include <linux/ftrace.h>
+
+
+static struct trace_array *bkl_trace;
+static bool bkl_tracer_enabled;
+
+
+static void bkl_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	bkl_trace = tr;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+
+	if (tr->ctrl)
+		bkl_tracer_enabled = 1;
+}
+
+static void reset_bkl_trace(struct trace_array *tr)
+{
+	bkl_tracer_enabled = 0;
+}
+
+static void bkl_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		bkl_tracer_enabled = 1;
+	else
+		bkl_tracer_enabled = 0;
+}
+
+static enum print_line_t bkl_acq_print_line(struct trace_iterator *iter)
+{
+	int ret;
+	struct trace_entry *entry = iter->ent;
+	struct trace_bkl_acq *field = (struct trace_bkl_acq *) entry;
+	struct bkl_trace_acquire *bta = &field->bkl_acq;
+	struct trace_seq *s = &iter->seq;
+	unsigned long sec, nsec_rem;
+	unsigned long long wait_for_spin;
+
+	wait_for_spin = bta->acquire_time - bta->acquire_req_time;
+	nsec_rem = do_div(bta->acquire_req_time, 1000000000);
+	sec = (unsigned long) bta->acquire_req_time;
+	ret = trace_seq_printf(s, "Kernel lock requested at %5lu.%09lu ",
+				  sec, nsec_rem);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	nsec_rem = do_div(bta->acquire_time, 1000000000);
+	sec = (unsigned long) bta->acquire_time;
+	ret = trace_seq_printf(s, "acquired at %5lu.%09lu "
+				  "(waited %llu nsecs)\n",
+				  sec, nsec_rem,
+				  wait_for_spin);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t bkl_rel_print_line(struct trace_iterator *iter)
+{
+	int ret;
+	struct trace_entry *entry = iter->ent;
+	struct trace_bkl_rel *field = (struct trace_bkl_rel *) entry;
+	struct bkl_trace_release *btr = &field->bkl_rel;
+	struct trace_seq *s = &iter->seq;
+	unsigned long sec, nsec_rem;
+
+	nsec_rem = do_div(btr->release_time, 1000000000);
+	sec = (unsigned long) btr->release_time;
+	ret = trace_seq_printf(s, "Kernel lock released at  %5lu.%09lu\n",
+				  sec, nsec_rem);
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t bkl_trace_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+
+	if (entry->type == TRACE_BKL_ACQ)
+		return bkl_acq_print_line(iter);
+	if (entry->type == TRACE_BKL_REL)
+		return bkl_rel_print_line(iter);
+
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer bkl_tracer __read_mostly =
+{
+	.name		= "bkl",
+	.init		= bkl_trace_init,
+	.reset		= reset_bkl_trace,
+	.ctrl_update	= bkl_trace_ctrl_update,
+	.print_line	= bkl_trace_print_line,
+};
+
+__init static int init_bkl_tracer(void)
+{
+	return register_tracer(&bkl_tracer);
+}
+device_initcall(init_bkl_tracer);
+
+/*
+ * These functions don't need us to disable
+ * the preemption since it has been done by lock_kernel()
+ */
+
+void trace_bkl_acquire(struct bkl_trace_acquire *trace)
+{
+	struct ring_buffer_event *event;
+	struct trace_bkl_acq *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = bkl_trace;
+
+	if (!bkl_tracer_enabled)
+		return;
+
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_BKL_ACQ;
+	entry->bkl_acq = *trace;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+}
+
+void trace_bkl_release(struct bkl_trace_release *trace)
+{
+	struct ring_buffer_event *event;
+	struct trace_bkl_rel *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = bkl_trace;
+
+	if (!bkl_tracer_enabled)
+		return;
+
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_BKL_REL;
+	entry->bkl_rel = *trace;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+}
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 01a3c22..9987834 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -9,6 +9,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/semaphore.h>
+#include <trace/bkl.h>
 
 /*
  * The 'big kernel lock'
@@ -116,16 +117,33 @@ static inline void __unlock_kernel(void)
 void __lockfunc lock_kernel(void)
 {
 	int depth = current->lock_depth+1;
-	if (likely(!depth))
+	if (likely(!depth)) {
+#ifdef CONFIG_BKL_TRACER
+		struct bkl_trace_acquire trace;
+		preempt_disable();
+		trace.acquire_req_time = cpu_clock(raw_smp_processor_id());
+		preempt_enable();
 		__lock_kernel();
+		trace.acquire_time = cpu_clock(raw_smp_processor_id());
+		trace_bkl_acquire(&trace);
+#else
+		__lock_kernel();
+#endif
+	}
 	current->lock_depth = depth;
 }
 
 void __lockfunc unlock_kernel(void)
 {
 	BUG_ON(current->lock_depth < 0);
-	if (likely(--current->lock_depth < 0))
+	if (likely(--current->lock_depth < 0)) {
+#ifdef CONFIG_BKL_TRACER
+		struct bkl_trace_release trace;
+		trace.release_time = cpu_clock(raw_smp_processor_id());
+		trace_bkl_release(&trace);
+#endif
 		__unlock_kernel();
+	}
 }
 
 EXPORT_SYMBOL(lock_kernel);

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-11 20:22 [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer Frederic Weisbecker
@ 2008-10-21 12:28 ` Frédéric Weisbecker
  2008-10-21 12:33   ` Ingo Molnar
  2008-10-23 18:15   ` Frank Ch. Eigler
  0 siblings, 2 replies; 12+ messages in thread
From: Frédéric Weisbecker @ 2008-10-21 12:28 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Steven Rostedt

2008/10/11 Frederic Weisbecker <fweisbec@gmail.com>:
> Introduce the Big Kernel Lock tracer.
> This new tracer lets one to observe the latencies caused
> by the lock_kernel() function.
> It captures the time when the task request the spinlock,
> the time when the spinlock is hold and the time it is released.
> This way we can measure the latency for a task that wait for this
> spinlock and the latency caused by the time this lock is hold.
>
> I will surely add some improvements like:
>
> _ Adding a stack trace to know which function locked or unlocked the kernel.
> _ Adding a script to get some statistics about this: which functions hold it
> often, which hold it for a long time and some other things....
>
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> Cc: Steven Rostedt <rostedt@goodmis.org>
> ---
>  include/trace/bkl.h      |   29 ++++++++
>  kernel/trace/Kconfig     |   11 +++
>  kernel/trace/Makefile    |    1 +
>  kernel/trace/trace.h     |   18 +++++
>  kernel/trace/trace_bkl.c |  170 ++++++++++++++++++++++++++++++++++++++++++++++
>  lib/kernel_lock.c        |   22 ++++++-
>  6 files changed, 249 insertions(+), 2 deletions(-)
>
> diff --git a/include/trace/bkl.h b/include/trace/bkl.h
> new file mode 100644
> index 0000000..bed2f1b
> --- /dev/null
> +++ b/include/trace/bkl.h
> @@ -0,0 +1,29 @@
> +#ifndef _TRACE_BKL_H
> +#define _TRACE_BKL_H
> +
> +
> +/* Entry which log the time when the task tried first to acquire the bkl
> + * and the time when it acquired it.
> + * That will let us know the latency to acquire the kernel lock.
> + */
> +struct bkl_trace_acquire {
> +       unsigned long long acquire_req_time;
> +       unsigned long long acquire_time;
> +};
> +
> +/* This will log the time when the task released the bkl.
> + * So we will know the whole duration of its kernel lock.
> + */
> +struct bkl_trace_release {
> +       unsigned long long release_time;
> +};
> +
> +#ifdef CONFIG_BKL_TRACER
> +extern void trace_bkl_acquire(struct bkl_trace_acquire *trace);
> +extern void trace_bkl_release(struct bkl_trace_release *trace);
> +#else
> +static void trace_bkl_acquire(struct bkl_trace_acquire *trace) { }
> +static void trace_bkl_release(struct bkl_trace_release *trace) { }
> +#endif
> +
> +#endif
> diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
> index 396aea1..fde7700 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -144,6 +144,17 @@ config STACK_TRACER
>          This tracer records the max stack of the kernel, and displays
>          it in debugfs/tracing/stack_trace
>
> +config BKL_TRACER
> +       bool "Trace Big Kernel Lock latencies"
> +       depends on HAVE_FTRACE
> +       depends on DEBUG_KERNEL
> +       select FTRACE
> +       select STACKTRACE
> +       help
> +         This tracer records the latencies issued by the Big Kernel Lock.
> +         It traces the time when a task request the bkl, the time when it
> +         acquires it and the time when it releases it.
> +
>  config DYNAMIC_FTRACE
>        bool "enable/disable ftrace tracepoints dynamically"
>        depends on FTRACE
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index a85dfba..2ba8904 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
> @@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
>  obj-$(CONFIG_STACK_TRACER) += trace_stack.o
>  obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
>  obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
> +obj-$(CONFIG_BKL_TRACER) += trace_bkl.o
>
>  libftrace-y := ftrace.o
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 707620c..b1df671 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -8,6 +8,7 @@
>  #include <linux/ring_buffer.h>
>  #include <linux/mmiotrace.h>
>  #include <linux/ftrace.h>
> +#include <trace/bkl.h>
>
>  enum trace_type {
>        __TRACE_FIRST_TYPE = 0,
> @@ -22,6 +23,8 @@ enum trace_type {
>        TRACE_MMIO_RW,
>        TRACE_MMIO_MAP,
>        TRACE_BOOT,
> +       TRACE_BKL_ACQ,
> +       TRACE_BKL_REL,
>
>        __TRACE_LAST_TYPE
>  };
> @@ -119,6 +122,17 @@ struct trace_boot {
>        struct boot_trace       initcall;
>  };
>
> +/* Trace bkl entries */
> +struct trace_bkl_acq {
> +       struct trace_entry              ent;
> +       struct bkl_trace_acquire        bkl_acq;
> +};
> +
> +struct trace_bkl_rel {
> +       struct trace_entry              ent;
> +       struct bkl_trace_release        bkl_rel;
> +};
> +
>  /*
>  * trace_flag_type is an enumeration that holds different
>  * states when a trace occurs. These are:
> @@ -219,6 +233,10 @@ extern void __ftrace_bad_type(void);
>                IF_ASSIGN(var, ent, struct trace_mmiotrace_map,         \
>                          TRACE_MMIO_MAP);                              \
>                IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);     \
> +               IF_ASSIGN(var, ent, struct trace_bkl_acq,               \
> +                         TRACE_BKL_ACQ);                               \
> +               IF_ASSIGN(var, ent, struct trace_bkl_rel,               \
> +                         TRACE_BKL_REL);                               \
>                __ftrace_bad_type();                                    \
>        } while (0)
>
> diff --git a/kernel/trace/trace_bkl.c b/kernel/trace/trace_bkl.c
> new file mode 100644
> index 0000000..169ee01
> --- /dev/null
> +++ b/kernel/trace/trace_bkl.c
> @@ -0,0 +1,170 @@
> +/*
> + * Ring buffer based Big Kernel Lock tracer
> + * for purpose of measuring the latencies issued
> + * by the BKL.
> + *
> + * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
> + */
> +
> +#include <trace/bkl.h>
> +#include "trace.h"
> +#include <linux/ftrace.h>
> +
> +
> +static struct trace_array *bkl_trace;
> +static bool bkl_tracer_enabled;
> +
> +
> +static void bkl_trace_init(struct trace_array *tr)
> +{
> +       int cpu;
> +       bkl_trace = tr;
> +
> +       for_each_cpu_mask(cpu, cpu_possible_map)
> +               tracing_reset(tr, cpu);
> +
> +       if (tr->ctrl)
> +               bkl_tracer_enabled = 1;
> +}
> +
> +static void reset_bkl_trace(struct trace_array *tr)
> +{
> +       bkl_tracer_enabled = 0;
> +}
> +
> +static void bkl_trace_ctrl_update(struct trace_array *tr)
> +{
> +       if (tr->ctrl)
> +               bkl_tracer_enabled = 1;
> +       else
> +               bkl_tracer_enabled = 0;
> +}
> +
> +static enum print_line_t bkl_acq_print_line(struct trace_iterator *iter)
> +{
> +       int ret;
> +       struct trace_entry *entry = iter->ent;
> +       struct trace_bkl_acq *field = (struct trace_bkl_acq *) entry;
> +       struct bkl_trace_acquire *bta = &field->bkl_acq;
> +       struct trace_seq *s = &iter->seq;
> +       unsigned long sec, nsec_rem;
> +       unsigned long long wait_for_spin;
> +
> +       wait_for_spin = bta->acquire_time - bta->acquire_req_time;
> +       nsec_rem = do_div(bta->acquire_req_time, 1000000000);
> +       sec = (unsigned long) bta->acquire_req_time;
> +       ret = trace_seq_printf(s, "Kernel lock requested at %5lu.%09lu ",
> +                                 sec, nsec_rem);
> +       if (!ret)
> +               return TRACE_TYPE_PARTIAL_LINE;
> +
> +       nsec_rem = do_div(bta->acquire_time, 1000000000);
> +       sec = (unsigned long) bta->acquire_time;
> +       ret = trace_seq_printf(s, "acquired at %5lu.%09lu "
> +                                 "(waited %llu nsecs)\n",
> +                                 sec, nsec_rem,
> +                                 wait_for_spin);
> +       if (!ret)
> +               return TRACE_TYPE_PARTIAL_LINE;
> +       return TRACE_TYPE_HANDLED;
> +}
> +
> +static enum print_line_t bkl_rel_print_line(struct trace_iterator *iter)
> +{
> +       int ret;
> +       struct trace_entry *entry = iter->ent;
> +       struct trace_bkl_rel *field = (struct trace_bkl_rel *) entry;
> +       struct bkl_trace_release *btr = &field->bkl_rel;
> +       struct trace_seq *s = &iter->seq;
> +       unsigned long sec, nsec_rem;
> +
> +       nsec_rem = do_div(btr->release_time, 1000000000);
> +       sec = (unsigned long) btr->release_time;
> +       ret = trace_seq_printf(s, "Kernel lock released at  %5lu.%09lu\n",
> +                                 sec, nsec_rem);
> +       return TRACE_TYPE_HANDLED;
> +}
> +
> +static enum print_line_t bkl_trace_print_line(struct trace_iterator *iter)
> +{
> +       struct trace_entry *entry = iter->ent;
> +
> +       if (entry->type == TRACE_BKL_ACQ)
> +               return bkl_acq_print_line(iter);
> +       if (entry->type == TRACE_BKL_REL)
> +               return bkl_rel_print_line(iter);
> +
> +       return TRACE_TYPE_UNHANDLED;
> +}
> +
> +static struct tracer bkl_tracer __read_mostly =
> +{
> +       .name           = "bkl",
> +       .init           = bkl_trace_init,
> +       .reset          = reset_bkl_trace,
> +       .ctrl_update    = bkl_trace_ctrl_update,
> +       .print_line     = bkl_trace_print_line,
> +};
> +
> +__init static int init_bkl_tracer(void)
> +{
> +       return register_tracer(&bkl_tracer);
> +}
> +device_initcall(init_bkl_tracer);
> +
> +/*
> + * These functions don't need us to disable
> + * the preemption since it has been done by lock_kernel()
> + */
> +
> +void trace_bkl_acquire(struct bkl_trace_acquire *trace)
> +{
> +       struct ring_buffer_event *event;
> +       struct trace_bkl_acq *entry;
> +       struct trace_array_cpu *data;
> +       unsigned long irq_flags;
> +       struct trace_array *tr = bkl_trace;
> +
> +       if (!bkl_tracer_enabled)
> +               return;
> +
> +       data = tr->data[smp_processor_id()];
> +
> +       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
> +                                        &irq_flags);
> +       if (!event)
> +               return;
> +       entry   = ring_buffer_event_data(event);
> +       tracing_generic_entry_update(&entry->ent, 0, 0);
> +       entry->ent.type = TRACE_BKL_ACQ;
> +       entry->bkl_acq = *trace;
> +       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
> +
> +       trace_wake_up();
> +}
> +
> +void trace_bkl_release(struct bkl_trace_release *trace)
> +{
> +       struct ring_buffer_event *event;
> +       struct trace_bkl_rel *entry;
> +       struct trace_array_cpu *data;
> +       unsigned long irq_flags;
> +       struct trace_array *tr = bkl_trace;
> +
> +       if (!bkl_tracer_enabled)
> +               return;
> +
> +       data = tr->data[smp_processor_id()];
> +
> +       event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
> +                                        &irq_flags);
> +       if (!event)
> +               return;
> +       entry   = ring_buffer_event_data(event);
> +       tracing_generic_entry_update(&entry->ent, 0, 0);
> +       entry->ent.type = TRACE_BKL_REL;
> +       entry->bkl_rel = *trace;
> +       ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
> +
> +       trace_wake_up();
> +}
> diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
> index 01a3c22..9987834 100644
> --- a/lib/kernel_lock.c
> +++ b/lib/kernel_lock.c
> @@ -9,6 +9,7 @@
>  #include <linux/module.h>
>  #include <linux/kallsyms.h>
>  #include <linux/semaphore.h>
> +#include <trace/bkl.h>
>
>  /*
>  * The 'big kernel lock'
> @@ -116,16 +117,33 @@ static inline void __unlock_kernel(void)
>  void __lockfunc lock_kernel(void)
>  {
>        int depth = current->lock_depth+1;
> -       if (likely(!depth))
> +       if (likely(!depth)) {
> +#ifdef CONFIG_BKL_TRACER
> +               struct bkl_trace_acquire trace;
> +               preempt_disable();
> +               trace.acquire_req_time = cpu_clock(raw_smp_processor_id());
> +               preempt_enable();
>                __lock_kernel();
> +               trace.acquire_time = cpu_clock(raw_smp_processor_id());
> +               trace_bkl_acquire(&trace);
> +#else
> +               __lock_kernel();
> +#endif
> +       }
>        current->lock_depth = depth;
>  }
>
>  void __lockfunc unlock_kernel(void)
>  {
>        BUG_ON(current->lock_depth < 0);
> -       if (likely(--current->lock_depth < 0))
> +       if (likely(--current->lock_depth < 0)) {
> +#ifdef CONFIG_BKL_TRACER
> +               struct bkl_trace_release trace;
> +               trace.release_time = cpu_clock(raw_smp_processor_id());
> +               trace_bkl_release(&trace);
> +#endif
>                __unlock_kernel();
> +       }
>  }
>
>  EXPORT_SYMBOL(lock_kernel);
>

Have the patches submitted to tip over the last 15 days been put in a
wait queue because of the merge window? I hope that the reason for the
lack of comments on them is not that they are totally irrelevant...

Sorry I just wonder.....

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-21 12:28 ` Frédéric Weisbecker
@ 2008-10-21 12:33   ` Ingo Molnar
  2008-10-21 12:42     ` Frédéric Weisbecker
  2008-10-23 18:15   ` Frank Ch. Eigler
  1 sibling, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2008-10-21 12:33 UTC (permalink / raw)
  To: Frédéric Weisbecker; +Cc: linux-kernel, Steven Rostedt


* Frédéric Weisbecker <fweisbec@gmail.com> wrote:

> Are these last 15 days patches submitted to tip put in a wait queue 
> because of the merge window? I hope that the reason for no comment on 
> them is not because they stand in a total irrelevance...

Correct, we froze down for the merge window. Thawing it now :)

	Ingo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-21 12:33   ` Ingo Molnar
@ 2008-10-21 12:42     ` Frédéric Weisbecker
  0 siblings, 0 replies; 12+ messages in thread
From: Frédéric Weisbecker @ 2008-10-21 12:42 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Steven Rostedt

2008/10/21 Ingo Molnar <mingo@elte.hu>:
> Correct, we froze down for the merge window. Thawing it now :)

Ok, it's an element of the process that I wasn't aware of :)

Thanks!

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-21 12:28 ` Frédéric Weisbecker
  2008-10-21 12:33   ` Ingo Molnar
@ 2008-10-23 18:15   ` Frank Ch. Eigler
  2008-10-24 13:43     ` Frédéric Weisbecker
  1 sibling, 1 reply; 12+ messages in thread
From: Frank Ch. Eigler @ 2008-10-23 18:15 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Ingo Molnar, linux-kernel, Steven Rostedt, systemtap


>> Introduce the Big Kernel Lock tracer.
>> This new tracer lets one to observe the latencies caused
>> by the lock_kernel() function.
>> It captures the time when the task request the spinlock,
>> the time when the spinlock is hold and the time it is released.
>> This way we can measure the latency for a task that wait for this
>> spinlock and the latency caused by the time this lock is hold.

For comparison, this is how this sort of analysis may be done with
systemtap, just for fun collecting locking latency on a
per-locker/unlocker (stack traceback) basis.  We don't have the stack
traceback extended right into userspace yet, but will before too long.


% cat bkl.stp
probe kernel.function("lock_kernel") {
        locker=backtrace()
        locktime=gettimeofday_us()
}

probe kernel.function("lock_kernel").return {
        unlocktime=gettimeofday_us()
        delay = unlocktime-locktime
        lockhistory[locker,unlocker] <<< delay
}

probe kernel.function("unlock_kernel") {
        unlocker=backtrace()
}

global locker, unlocker, locktime, lockhistory

probe end,timer.s(30) { // interrupt any time; automatic ending in 30s
        foreach ([l,u] in lockhistory+) { // in increasing order of @count()

                println("locker:") print_stack (l)
                println("unlocker:") print_stack (u)

                println(@hist_log (lockhistory[l,u]))

                if (@max (lockhistory[l,u]) > 100)
                   println("--- holy cow, that can take a long time ---")
        }
        exit()
}



% sudo stap bkl.stp
[...]
locker:
 0xffffffff812bdcec : lock_kernel+0x1/0x37 [kernel]
 0xffffffff812bf12b : kretprobe_trampoline_holder+0x4/0x50 [kernel]
 0xffffffff811b067b : write_chan+0x273/0x338 [kernel]
 0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
 0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
 0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
 0x00000ffffffff810
unlocker:
 0xffffffff812bdca9 : unlock_kernel+0x1/0x31 [kernel]
 0xffffffff811b064e : write_chan+0x246/0x338 [kernel]
 0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
 0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
 0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
 0xffffffff810b6f89 : vfs_write+0xae/0x157 [kernel]
 0x00000fffffffff7f
value |-------------------------------------------------- count
    0 |                                                     3
    1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    570
    2 |@@@@@@@@@@@@@                                      159
    4 |                                                     9
    8 |@                                                   16
   16 |                                                     3
   32 |@@                                                  35
   64 |                                                     5
  128 |                                                     0
  256 |                                                     0

locker:
 0xffffffff812bdcec : lock_kernel+0x1/0x37 [kernel]
 0xffffffff812bf12b : kretprobe_trampoline_holder+0x4/0x50 [kernel]
 0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
 0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
 0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
 0xffffffff810b6f89 : vfs_write+0xae/0x157 [kernel]
 0x00000fffffffff7f
unlocker:
 0xffffffff812bdca9 : unlock_kernel+0x1/0x31 [kernel]
 0xffffffff811b064e : write_chan+0x246/0x338 [kernel]
 0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
 0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
 0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
 0xffffffff810b6f89 : vfs_write+0xae/0x157 [kernel]
 0x00000fffffffff7f
value |-------------------------------------------------- count
    0 |                                                     0
    1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      454
    2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  497
    4 |@@                                                  21
    8 |@                                                   15
   16 |                                                     1
   32 |@@@                                                 33
   64 |                                                     2
  128 |                                                     0
  256 |                                                     0



NB: Observed lock times in the 1-2 us range indicate no contention, as
that's on the order of magnitude of the (un)lock_kernel kprobe
overheads.


- FChE

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-23 18:15   ` Frank Ch. Eigler
@ 2008-10-24 13:43     ` Frédéric Weisbecker
  2008-10-24 14:37       ` Frank Ch. Eigler
  0 siblings, 1 reply; 12+ messages in thread
From: Frédéric Weisbecker @ 2008-10-24 13:43 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: Ingo Molnar, linux-kernel, Steven Rostedt, systemtap

2008/10/23 Frank Ch. Eigler <fche@redhat.com>:
> For comparison, this is how this sort of analysis may be done with
> systemtap, just for fun collecting locking latency on a
> per-locker/unlocker (stack traceback) basis.  We don't have the stack
> traceback extended right into userspace yet, but will before too long.
>
>
> % cat bkl.stp
> probe kernel.function("lock_kernel") {
>        locker=backtrace()
>        locktime=gettimeofday_us()
> }
>
> probe kernel.function("lock_kernel").return {
>        unlocktime=gettimeofday_us()
>        delay = unlocktime-locktime
>        lockhistory[locker,unlocker] <<< delay
> }
>
> probe kernel.function("unlock_kernel") {
>        unlocker=backtrace()
> }
>
> global locker, unlocker, locktime, lockhistory
>
> probe end,timer.s(30) { // interrupt any time; automatic ending in 30s
>        foreach ([l,u] in lockhistory+) { // in increasing order of @count()
>
>                println("locker:") print_stack (l)
>                println("unlocker:") print_stack (u)
>
>                println(@hist_log (lockhistory[l,u]))
>
>                if (@max (lockhistory[l,u]) > 100)
>                   println("--- holy cow, that can take a long time ---")
>        }
>        exit()
> }
>
>
>
> % sudo stap bkl.stp
> [...]
> locker:
>  0xffffffff812bdcec : lock_kernel+0x1/0x37 [kernel]
>  0xffffffff812bf12b : kretprobe_trampoline_holder+0x4/0x50 [kernel]
>  0xffffffff811b067b : write_chan+0x273/0x338 [kernel]
>  0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
>  0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
>  0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
>  0x00000ffffffff810
> unlocker:
>  0xffffffff812bdca9 : unlock_kernel+0x1/0x31 [kernel]
>  0xffffffff811b064e : write_chan+0x246/0x338 [kernel]
>  0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
>  0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
>  0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
>  0xffffffff810b6f89 : vfs_write+0xae/0x157 [kernel]
>  0x00000fffffffff7f
> value |-------------------------------------------------- count
>    0 |                                                     3
>    1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    570
>    2 |@@@@@@@@@@@@@                                      159
>    4 |                                                     9
>    8 |@                                                   16
>   16 |                                                     3
>   32 |@@                                                  35
>   64 |                                                     5
>  128 |                                                     0
>  256 |                                                     0
>
> locker:
>  0xffffffff812bdcec : lock_kernel+0x1/0x37 [kernel]
>  0xffffffff812bf12b : kretprobe_trampoline_holder+0x4/0x50 [kernel]
>  0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
>  0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
>  0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
>  0xffffffff810b6f89 : vfs_write+0xae/0x157 [kernel]
>  0x00000fffffffff7f
> unlocker:
>  0xffffffff812bdca9 : unlock_kernel+0x1/0x31 [kernel]
>  0xffffffff811b064e : write_chan+0x246/0x338 [kernel]
>  0xffffffff810305e0 : default_wake_function+0x0/0xf [kernel]
>  0xffffffff811ade5f : tty_write+0x184/0x213 [kernel]
>  0xffffffff811b0408 : write_chan+0x0/0x338 [kernel]
>  0xffffffff810b6f89 : vfs_write+0xae/0x157 [kernel]
>  0x00000fffffffff7f
> value |-------------------------------------------------- count
>    0 |                                                     0
>    1 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      454
>    2 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  497
>    4 |@@                                                  21
>    8 |@                                                   15
>   16 |                                                     1
>   32 |@@@                                                 33
>   64 |                                                     2
>  128 |                                                     0
>  256 |                                                     0
>
>
>
> NB: Observed lock times in the 1-2 us range indicate no contention, as
> that's on the order of magnitude of the (un)lock_kernel kprobe
> overheads.

That's a great and powerful tool.

But just one sorrow:
I just think that's a pity that we have to use/learn a new scripting
language to use it.
I would rather prefer to use an API that provides functions/objects
for most common scripting languages.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-24 13:43     ` Frédéric Weisbecker
@ 2008-10-24 14:37       ` Frank Ch. Eigler
  2008-10-24 14:47         ` Steven Rostedt
  0 siblings, 1 reply; 12+ messages in thread
From: Frank Ch. Eigler @ 2008-10-24 14:37 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Ingo Molnar, linux-kernel, Steven Rostedt, systemtap

Hi -

On Fri, Oct 24, 2008 at 03:43:49PM +0200, Frédéric Weisbecker wrote:

> > [...]  For comparison, this is how this sort of analysis may be
> > done with systemtap [...]

> That's a great and powerful tool.
> 
> But just one sorrow:
> I just think that's a pity that we have to use/learn a new scripting
> language to use it.

I understand.

> I would rather prefer to use an API that provides functions/objects
> for most common scripting languages.

That is an interesting idea.  One possible problem is that the final
complete script "program" needs to be translated to something that can
run quickly and safely inside the kernel.  Full python or perl runtime
+ libraries would have been almost certainly unbearable.

- FChE

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-24 14:37       ` Frank Ch. Eigler
@ 2008-10-24 14:47         ` Steven Rostedt
  2008-10-24 15:02           ` Frank Ch. Eigler
  0 siblings, 1 reply; 12+ messages in thread
From: Steven Rostedt @ 2008-10-24 14:47 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Frédéric Weisbecker, Ingo Molnar, linux-kernel, systemtap


On Fri, 24 Oct 2008, Frank Ch. Eigler wrote:

> Hi -
> 
> On Fri, Oct 24, 2008 at 03:43:49PM +0200, Frédéric Weisbecker wrote:
> 
> > > [...]  For comparison, this is how this sort of analysis may be
> > > done with systemtap [...]
> 
> > That's a great and powerful tool.
> > 
> > But just one sorrow:
> > I just think that's a pity that we have to use/learn a new scripting
> > language to use it.
> 
> I understand.
> 
> > I would rather prefer to use an API that provides functions/objects
> > for most common scripting languages.
> 
> That is an interesting idea.  One possible problem is that the final
> complete script "program" needs to be translated to something that can
> run quickly and safely inside the kernel.  Full python or perl runtime
> + libraries would have been almost certainly unbearable.

Why can't the userspace application convert the script to something easy 
that the kernel can handle?  But have the user application interface be 
very simple, and perhaps even use perl or python.

-- Steve


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-24 14:47         ` Steven Rostedt
@ 2008-10-24 15:02           ` Frank Ch. Eigler
  2008-10-24 15:26             ` Frédéric Weisbecker
  0 siblings, 1 reply; 12+ messages in thread
From: Frank Ch. Eigler @ 2008-10-24 15:02 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Frédéric Weisbecker, Ingo Molnar, linux-kernel, systemtap

Hi -

On Fri, Oct 24, 2008 at 10:47:36AM -0400, Steven Rostedt wrote:
> [...]
> > > I would rather prefer to use an API that provides functions/objects
> > > for most common scripting languages.
> > 
> > That is an interesting idea.  One possible problem is that the final
> > complete script "program" needs to be translated to something that can
> > run quickly and safely inside the kernel.  Full python or perl runtime
> > + libraries would have been almost certainly unbearable.
>
> Why can't the userspace application convert the script to something
> easy that the kernel can handle?

That's what we do with the systemtap script, where kernel "handling"
consists of "running the machine code".

> But have the user application interface be very simple, and perhaps
> even use perl or python.

perl and python are pretty big procedural languages, and are not
easily compiled down to compact & quickly executed machine code.  (I
take it no one is suggesting including a perl or python VM in the
kernel.)  Plus, debugger-flavoured event-handling programming style
would not look nearly as compact in perl/python as in systemtap, which
is small and domain-specific.

- FChE

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-24 15:02           ` Frank Ch. Eigler
@ 2008-10-24 15:26             ` Frédéric Weisbecker
  2008-10-24 19:29               ` Frank Ch. Eigler
  2008-10-29  5:46               ` Tom Zanussi
  0 siblings, 2 replies; 12+ messages in thread
From: Frédéric Weisbecker @ 2008-10-24 15:26 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: Steven Rostedt, Ingo Molnar, linux-kernel, systemtap

2008/10/24 Frank Ch. Eigler <fche@redhat.com>:
> That's what we do with the systemtap script, where kernel "handling"
> consists of "running the machine code".
>
>> But have the user application interface be very simple, and perhaps
>> even use perl or python.
>
> perl and python are pretty big procedural languages, and are not
> easily compiled down to compact & quickly executed machine code.  (I
> take it no one is suggesting including a perl or python VM in the
> kernel.)  Plus, debugger-flavoured event-handling programming style
> would not look nearly as compact in perl/python as in systemtap, which
> is small and domain-specific.
>
> - FChE
>

Actually what I thought is a style like this (Python like):

probe = Systemtap.probeFunc("lock_kernel")
probe.captureUtime("utime")
probe.captureBacktrace("trace")
probe.trace()

For an obvious set of batch work like that, that could be possible,
perhaps even easy
to implement an Api...
When the object calls trace(), the userspace Systemtap analyse the list
of work to do and then translate into commands in kernel space.

And the script could wait for events and then do its own processing
with the captured events
(do some operations on delays, on output....).

for event in probe.getEvent(): #blocking
    print event["utime"]
    trace = event["trace"] #Systemtap.trace object with specific
fields and a default string repr
    print trace

It would be interpreted by Python itself, and you just have to capture
commands and works
sent through Api. Then, when the kernel has something to give, you
just have to place it in the
appropriate object and transmit it to the script which is waiting.
Processing and output with the data are done by the python script.
So actually, the python script only needs to ask you what data to
capture. It's its own responsibility to
do whatever it wants with.

What do you think?

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-24 15:26             ` Frédéric Weisbecker
@ 2008-10-24 19:29               ` Frank Ch. Eigler
  2008-10-29  5:46               ` Tom Zanussi
  1 sibling, 0 replies; 12+ messages in thread
From: Frank Ch. Eigler @ 2008-10-24 19:29 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Steven Rostedt, Ingo Molnar, linux-kernel, systemtap

Hi -

On Fri, Oct 24, 2008 at 05:26:53PM +0200, Frédéric Weisbecker wrote:
> [...]
> Actually what I thought is a style like this (Python like):
> 
> probe = Systemtap.probeFunc("lock_kernel")
> probe.captureUtime("utime"))
> probe.captureBacktrace("trace")
> probe.trace()
> 
> For an obvious set of batch work like that, that could be possible,
> perhaps even easy to implement an Api...
> When the object calls trace(), the userspace Systemtap analyse the list
> of work to do and then translate into commands in kernel space.

There comes a point not too far beyond this example where one may want
to capture functions of values; filter; perform loops/conditionals;
refer to variables/arrays.  Coding that logic this way would be so
clumsy as to take away any charm of working in python.


> And the script could wait for events and then do its own processing
> with the captured events (do some operations on delays, on output....).
> 
> for event in probe.getEvent(): #blocking
>     print event["utime"]
>     trace = event["trace"] #Systemtap.trace object with specific
> fields and a default string repr
>     print trace
> 
> It would be interpreted by Python itself [...]

This too gets impractical.  If the only event handler code is
general python, it can't practically be executed within the kernel.
So we're then talking about a python script consuming trace data
streamed in.  We lose:
- capability to take immediate action upon event occurrence
- any claim to good performance (~microsecond event handling)
- the natural coupling between events and their handlers
- the ability to statically analyze the entire instrumentation,
  to optimize locking, data encoding


Now, one may imagine using this sort of thing as a *wrapper* around
plain systemtap: where the python libraries generate stylized
systemtap code that emits data in a form that the later "getEvent"
routine can decode again and act upon.

Several people have written shell, perl, even awk scripts scripts to
generate systemtap code for particular special purposes.  So one can
hide the script language as an implementation detail of a higher level
tool.  It's not a substitute though.


- FChE

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer
  2008-10-24 15:26             ` Frédéric Weisbecker
  2008-10-24 19:29               ` Frank Ch. Eigler
@ 2008-10-29  5:46               ` Tom Zanussi
  1 sibling, 0 replies; 12+ messages in thread
From: Tom Zanussi @ 2008-10-29  5:46 UTC (permalink / raw)
  To: Frédéric Weisbecker
  Cc: Frank Ch. Eigler, Steven Rostedt, Ingo Molnar, linux-kernel, systemtap

Hi,

This is a great idea.  Some thoughts on how it could work below...

On Fri, 2008-10-24 at 17:26 +0200, Frédéric Weisbecker wrote: 
> 2008/10/24 Frank Ch. Eigler <fche@redhat.com>:
> > That's what we do with the systemtap script, where kernel "handling"
> > consists of "running the machine code".
> >
> >> But have the user application interface be very simple, and perhaps
> >> even use perl or python.
> >
> > perl and python are pretty big procedural languages, and are not
> > easily compiled down to compact & quickly executed machine code. (I
> > take it no one is suggesting including a perl or python VM in the
> > kernel.) Plus, debugger-flavoured event-handling programming style
> > would not look nearly as compact in perl/python as in systemtap,
> which
> > is small and domain-specific.
> >
> > - FChE
> >
> 
> Actually what I thought is a style like this (Python like):
> 
> probe = Systemtap.probeFunc("lock_kernel")
> probe.captureUtime("utime"))
> probe.captureBacktrace("trace")
> probe.trace()
> 
> For an obvious set of batch work like that, that could be possible,
> perhaps even easy
> to implement an Api...
> When the object calls trace(), the userspace Systemtap analyse the
> list
> of work to do and then translate into commands in kernel space.
> 

When you say 'translate into commands in kernel space', I'm assuming in
the simplest case that you're thinking of the trace() method on your
Python probe object as generating systemtap probe code which in this
case would insert a probe on "lock_kernel" and collect the specific data
you named in the captureXXX methods.  If so, then the generated
systemtap code might look something like this (my systemtap coding is a
bit rusty and I don't know Python at all, so, please excuse any coding
problems - think of it as pseudo-code):

global ID_LOCK_KERNEL = 1;

/* systemtap code - in turn generates kernel code/module */
probe kernel.function("lock_kernel")
{
        /* log captureXXX fields using systemtap binary printf */
        printf("%b2%b4%s", ID_LOCK_KERNEL, utime, backtrace());
}

Once the trace method generates the systemtap probe code, it would then
construct the appropriate stap command line, exec it (which compiles the
generated probe code, inserts the module, arms the probes, etc) and from
then on sits around waiting for output from the probe...

> And the script could wait for events and then do its own processing
> with the captured events
> (do some operations on delays, on output....).
> 
> for event in probe.getEvent(): #blocking
> print event["utime"]
> trace = event["trace"] #Systemtap.trace object with specific
> fields and a default string repr
> print trace
> 
> It would be interpreted by Python itself, and you just have to capture
> commands and works
> sent through Api. Then, when the kernel has something to give, you
> just have to place it in the
> appripriate object and transmit it to the script which is waiting.
> Processing and output with the data are done by the python script.
> So actually, the python script only needs to ask you what data to
> capture. It's its own responsability to
> do wathever it wants with.

...as it receives the probe output, it would then extract the individual
events from the stream and dispatch them to the event-handling Python
code, which would be able to do whatever it wants to with...

There are at least two different ways I can think of to do this part.

The most straightforward would be to do it all in pure Python in the
script receiving the probe output.  Since I don't know Python, I'll use
pseudo-Perl, but the idea would be the same for Python:

open EVENT_STREAM, "stap -g lock-kernel-probe.stp"

while (<EVENT_STREAM>) {
  /* userspace Perl code: get and dispatch the next event id */
  $id = unpack("C"); /* Perl function for pulling apart C structs */

  /* dispatch to matching on_xxx 'handler' function */
  switch ($id) {
          case ID_LOCK_KERNEL:
                  /* unpack event, call handler using param array */
                  on_lock_kernel(unpack("XCLZ"));
                  break;
          default:
                  break;
  }
}

The Perl script code above continually grabs the next event on the
output stream, uses the first byte to figure out which event it
corresponds to and once it knows that, grabs the rest of the event data
and finally dispatches it to the 'handler' function for that event:

/* userspace Perl code: lock_kernel event 'handler' */
sub on_lock_kernel
{
        /* get the params */
        my ($id, $utime, $stacktrace) = @_;
        /* add to hash tracking times we've seen this stacktrace */
        $stacktraces{$stacktrace}++;
}

The handler code gets the data as usual via the params and does what it
wants with it; in this case it just uses the stack trace as a hash key
to keep a running count of the number of times that particular path was
hit.

Finally, at the end of the trace, a special end-of-trace handler gets
called, which can be used e.g. to dump the results out: 

/* userspace Perl code: at the end of the trace, dump what we've got */
sub on_end_trace_session
{
        while (($stacktrace, $count) = each %stacktraces) {
                print "Stacktrace: $stacktrace\n";
                print "happened $stacktraces{$stacktrace} times\n";
        }
}

Another, presumably more efficient way to do the same thing, would be to
'embed' and instantiate an instance of the interpreter in the daemon
code.  The same (in this case) end-use Perl on_XXX event handlers would
be called, but the unpacking code and dispatch loop would be done in C
code as part of the daemon.  There, the fields of each event would be
translated into a form understandable by the interpreter and the handler
in the embedded interpreter invoked for each event:

/* userspace C code: get and dispatch the next event id */
unsigned char id = next_event_id(event_stream);

/* dispatch to matching on_xxx 'handler' function in Perl interpreter*/
switch (id) {
        case ID_LOCK_KERNEL:
                /* unpack event, call handler using param array */
                unsigned long utime = next_event_long(event_stream);
                char *stacktrace = next_event_string(event_stream);
                perl_invoke_on_lock_kernel(utime, stacktrace);
                break;
        default:
                break;
}

/* userspace C code: embedded Perl magic for invoking a Perl function */
void perl_invoke_on_lock_kernel(unsigned long utime, char *stacktrace)
{
        CALLBACK_SETUP(on_lock_kernel);
        XPUSHs(sv_2mortal(newSViv(utime)));
        XPUSHs(sv_2mortal(newSVpvn(stacktrace, strlen(stacktrace))));
        CALLBACK_CALL_AND_CLEANUP(on_lock_kernel);
}

The above dispatch loop, unpacking, etc is pretty much the same as in
the 'pure Perl' version, but done in C, and with the exception that it
does some interpreter-specific magic to invoke the interpreter methods,
which are exactly the same as in the 'pure' version.

I actually did this for every single event in the old LTT (not the
language binding, just the dispatching-to-to script-level handlers
part), so I know it works in practice and in fact it worked very well -
it was able to handle a pretty heavy trace stream while doing all the
nice Perl hashing/summing/etc in the event handlers it needed to do in
order to produce interesting and non-trivial results; IIRC it
comfortably handled full tracing (all events) during a kernel compile:

http://lkml.org/lkml/2004/9/1/197

And, not to knock the systemtap language, which is a fine and capable
language, but even the simple scripts I wrote for that demo did things
that exceeded the capabilities of the systemtap language (and the dtrace
language as well, I should add).

> What do you think?

I think what you want to do is quite doable, however you decide to do
it.  I know Python too has an API for embedding interpreters and
invoking script methods, as most non-trivial scripting languages do.  My
guess is that if you took the embedded interpreter approach for Python,
with a little generalization you could have a common layer to the
implementation that other languages could easily plug into.

Also, once you had the basic stuff working, you could extend it and in
the process make more use of the filtering and other capabilities
systemtap offers i.e. you needn't be limited to just using systemtap to
'scrape' data and do all the processing in the userspace Python script.

One example would again be stacktraces, which because of their size, are
things that you probably wouldn't want to send to userspace at a very
high frequency.  Here's an example of a combination systemtap kernel
script/Perl userspace script that continuously streams and converts
systemtap hashes into Perl hashes (because sytemtap kernel resources are
necessarily limited whereas userspace Perl interpreter resources
aren't):

http://sourceware.org/ml/systemtap/2005-q3/msg00550.html

It's a good example of a case where doing filtering in the kernel makes
a lot of sense.  With the hybrid systemtap/Perl/Python approach, you
make use of the strengths of systemtap while at the same time retaining
the full power of your language of choice. 

Of course, one of the challenges in using the more advanced features of
systemtap would be in making those capabilities available as natural
extensions to the supported scripting language(s).  But even without
them, I think the basic mode would be an extremely useful and powerful
complement to systemtap. 

Tom




^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2008-10-29  5:46 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-11 20:22 [PATCH 5/5] tracing/ftrace: Introduce the big kernel lock tracer Frederic Weisbecker
2008-10-21 12:28 ` Frédéric Weisbecker
2008-10-21 12:33   ` Ingo Molnar
2008-10-21 12:42     ` Frédéric Weisbecker
2008-10-23 18:15   ` Frank Ch. Eigler
2008-10-24 13:43     ` Frédéric Weisbecker
2008-10-24 14:37       ` Frank Ch. Eigler
2008-10-24 14:47         ` Steven Rostedt
2008-10-24 15:02           ` Frank Ch. Eigler
2008-10-24 15:26             ` Frédéric Weisbecker
2008-10-24 19:29               ` Frank Ch. Eigler
2008-10-29  5:46               ` Tom Zanussi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).