LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* PATCH] ftrace: Add a C/P state tracer to help power optimization
@ 2008-10-06 17:26 Arjan van de Ven
  2008-10-06 20:46 ` Andi Kleen
  2008-10-27 15:59 ` Ingo Molnar
  0 siblings, 2 replies; 20+ messages in thread
From: Arjan van de Ven @ 2008-10-06 17:26 UTC (permalink / raw)
  To: linux-kernel; +Cc: Steven Rostedt, mingo, Peter Zijlstra, lenb


From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 3 Oct 2008 10:18:21 -0700
Subject: [PATCH] ftrace: Add a C/P state tracer to help power optimization

This patch adds a C/P-state ftrace plugin that will generate
detailed statistics about the C/P-states that are being used,
so that we can look at detailed decisions that the C/P-state
code is making, rather than the too high level "average"
that we have today.

An example way of using this is:
mount -t debugfs none /sys/kernel/debug
echo cstate > /sys/kernel/debug/tracing/current_tracer
echo 1 > /sys/kernel/debug/tracing/tracing_enabled
sleep 1
echo 0 > /sys/kernel/debug/tracing/tracing_enabled
cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    4 +
 arch/x86/kernel/process.c                  |   15 +++
 include/linux/ftrace.h                     |   26 ++++
 kernel/trace/Kconfig                       |   11 ++
 kernel/trace/Makefile                      |    1 +
 kernel/trace/trace.h                       |    7 +
 kernel/trace/trace_power.c                 |  183 ++++++++++++++++++++++++++++
 scripts/trace/power.pl                     |  108 ++++++++++++++++
 8 files changed, 355 insertions(+), 0 deletions(-)
 create mode 100644 kernel/trace/trace_power.c
 create mode 100644 scripts/trace/power.pl

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 9943b4c..d0ac5a7 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -390,6 +391,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
+	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -426,6 +428,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3468131..4c06af2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
 
 unsigned long idle_halt;
@@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
+		struct power_trace it;
+
+		trace_power_start(&it, POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +116,7 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
+		trace_power_end(&it);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
+	trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
+	struct power_trace it;
 	if (!need_resched()) {
+		trace_power_start(&it, POWER_CSTATE, 1);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
+		trace_power_end(&it);
 	} else
 		local_irq_enable();
 }
@@ -183,9 +195,12 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
+	struct power_trace it;
+	trace_power_start(&it, POWER_CSTATE, 0);
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
+	trace_power_end(&it);
 }
 
 /*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5812dba..b2547e7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -232,6 +232,32 @@ static inline void start_boot_trace(void) { }
 static inline void stop_boot_trace(void) { }
 #endif
 
+#define POWER_NONE 0
+#define POWER_CSTATE 1
+#define POWER_PSTATE 2
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+	ktime_t			stamp;
+	ktime_t			end;
+	int			type;
+	int			state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
 
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 396aea1..1f797bd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -134,6 +134,17 @@ config BOOT_TRACER
 	  be enabled if this tracer is selected since only one tracer
 	  should touch the tracing buffer at a time.
 
+config POWER_TRACER
+	bool "Trace power consumption behavior"
+	depends on DEBUG_KERNEL
+	depends on X86
+	select TRACING
+	help
+	  This tracer helps developers to analyize and optimize the kernels
+	  power management decisions, specifically the C-state and P-state
+	  behavior.
+
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FTRACE
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index a85dfba..c7820fb 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f1f9957..d3568c2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@ enum trace_type {
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BOOT,
+	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
 };
@@ -117,6 +118,11 @@ struct trace_boot {
 	struct boot_trace	initcall;
 };
 
+struct trace_power {
+	struct trace_entry	ent;
+	struct power_trace	state_data;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -217,6 +223,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 0000000..2078f0d
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,183 @@
+/*
+ * ring buffer based C-state tracer
+ *
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * Much is borrowed from trace_boot.c which is
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include "trace.h"
+
+static struct trace_array *power_trace;
+static int trace_power_enabled;
+
+
+static void start_power_trace(void)
+{
+	trace_power_enabled = 1;
+}
+
+static void stop_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 0;
+}
+
+static void power_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	power_trace = tr;
+
+	trace_power_enabled = 1;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+}
+
+static void power_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_power_trace();
+	else
+		stop_power_trace(tr);
+}
+
+static enum print_line_t power_print_line(struct trace_iterator *iter)
+{
+	int ret = 0;
+	struct trace_entry *entry = iter->ent;
+	struct trace_power *field ;
+	struct power_trace *it;
+	struct trace_seq *s = &iter->seq;
+	struct timespec stamp;
+	struct timespec duration;
+
+	trace_assign_type(field, entry);
+	it = &field->state_data;
+	stamp = ktime_to_timespec(it->stamp);
+	duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
+
+	if (entry->type == TRACE_POWER) {
+		if (it->type == POWER_CSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu,
+					  duration.tv_sec,
+					  duration.tv_nsec);
+		if (it->type == POWER_PSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer power_tracer __read_mostly =
+{
+	.name		= "power",
+	.init		= power_trace_init,
+	.reset		= stop_power_trace,
+	.ctrl_update	= power_trace_ctrl_update,
+	.print_line	= power_print_line,
+};
+
+static int init_power_trace(void)
+{
+	return register_tracer(&power_tracer);
+}
+device_initcall(init_power_trace);
+
+void trace_power_start(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+EXPORT_SYMBOL_GPL(trace_power_start);
+
+
+void trace_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_end);
+
+void trace_power_mark(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_mark);
diff --git a/scripts/trace/power.pl b/scripts/trace/power.pl
new file mode 100644
index 0000000..8114e6a
--- /dev/null
+++ b/scripts/trace/power.pl
@@ -0,0 +1,108 @@
+#!/usr/bin/perl
+
+# Copyright 2008, Intel Corporation
+#
+# This file is part of the Linux kernel
+#
+# This program file is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program in a file named COPYING; if not, write to the
+# Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor,
+# Boston, MA 02110-1301 USA
+#
+# Authors:
+# 	Arjan van de Ven <arjan@linux.intel.com>
+
+
+#
+# This script turns a cstate ftrace output into a SVG graphic that shows
+# historic C-state information
+#
+#
+# 	cat /sys/kernel/debug/tracer/trace | perl power.pl > out.svg
+#
+
+my @styles;
+my $base = 0;
+
+my @pstate_last;
+my @pstate_level;
+
+$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
+
+
+print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
+print "<svg width=\"10000\" height=\"100%\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n";
+
+my $scale = 30000.0;
+while (<>) {
+	my $line = $_;
+	if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) {
+		if ($base == 0) {
+			$base = $1;
+		}
+		my $time = $1 - $base;
+		$time = $time * $scale;
+		my $C = $2;
+		my $cpu = $3;
+		my $y = 400 * $cpu;
+		my $duration = $4 * $scale;
+		my $msec = int($4 * 100000)/100.0;
+		my $height = $C * 20;
+		$style = $styles[$C];
+
+		$y = $y + 140 - $height;
+
+		$x2 = $time + 4;
+		$y2 = $y + 4;
+
+
+		print "<rect x=\"$time\" width=\"$duration\" y=\"$y\" height=\"$height\" style=\"$style\"/>\n";
+		print "<text transform=\"translate($x2,$y2) rotate(90)\">C$C $msec</text>\n";
+	}
+	if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) {
+		my $time = $1 - $base;
+		my $state = $2;
+		my $cpu = $3;
+
+		if (defined($pstate_last[$cpu])) {
+			my $from = $pstate_last[$cpu];
+			my $oldstate = $pstate_state[$cpu];
+			my $duration = ($time-$from) * $scale;
+
+			$from = $from * $scale;
+			my $to = $from + $duration;
+			my $height = 140 - ($oldstate * (140/8));
+
+			my $y = 400 * $cpu + 200 + $height;
+			my $y2 = $y+4;
+			my $style = $styles[8];
+
+			print "<rect x=\"$from\" y=\"$y\" width=\"$duration\" height=\"5\" style=\"$style\"/>\n";
+			print "<text transform=\"translate($from,$y2)\">P$oldstate (cpu $cpu)</text>\n";
+		};
+
+		$pstate_last[$cpu] = $time;
+		$pstate_state[$cpu] = $state;
+	}
+}
+
+
+print "</svg>\n";
-- 
1.5.5.1



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 17:26 PATCH] ftrace: Add a C/P state tracer to help power optimization Arjan van de Ven
@ 2008-10-06 20:46 ` Andi Kleen
  2008-10-06 20:57   ` Arjan van de Ven
  2008-10-07 10:39   ` Steven Rostedt
  2008-10-27 15:59 ` Ingo Molnar
  1 sibling, 2 replies; 20+ messages in thread
From: Andi Kleen @ 2008-10-06 20:46 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: linux-kernel, Steven Rostedt, mingo, Peter Zijlstra, lenb

Arjan van de Ven <arjan@infradead.org> writes:

This might be a subtle issue, but when someone starts synchronizing
their clocksource read with idle (which makes sense in some circumstances) 
then your timing will break because ktime_get() might be inaccurate directly 
coming out of idle before other code ran.

That's theoretical right now, but could be a real danger in the future.
  
> @@ -426,6 +428,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
>  		}
>  	}
>  
> +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);

Wouldn't that be better higher up in the cpufreq system?  It would
seem bad to duplicate that in all low level cpufreq modules.

Also I suspect some higher level format would be good here too.
Just put the frequency in? 

> +	bool "Trace power consumption behavior"
> +	depends on DEBUG_KERNEL
> +	depends on X86
> +	select TRACING
> +	help
> +	  This tracer helps developers to analyize and optimize the kernels
> +	  power management decisions, specifically the C-state and P-state
> +	  behavior.

Reference to the required userland?

> +	stamp = ktime_to_timespec(it->stamp);
> +	duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
> +
> +	if (entry->type == TRACE_POWER) {
> +		if (it->type == POWER_CSTATE)
> +			ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
> +					  stamp.tv_sec,
> +					  stamp.tv_nsec,
> +					  it->state, iter->cpu,
> +					  duration.tv_sec,
> +					  duration.tv_nsec);
> +		if (it->type == POWER_PSTATE)
> +			ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
> +					  stamp.tv_sec,
> +					  stamp.tv_nsec,
> +					  it->state, iter->cpu);

I suspect a less verbose output format would be better.

> +{
> +	if (!trace_power_enabled)
> +		return;
> +
> +	memset(it, 0, sizeof(struct power_trace));

The memset seems redundant.


> +void trace_power_end(struct power_trace *it)
> +{
> +	struct ring_buffer_event *event;
> +	struct trace_power *entry;
> +	struct trace_array_cpu *data;
> +	unsigned long irq_flags;
> +	struct trace_array *tr = power_trace;
> +
> +	if (!trace_power_enabled)
> +		return;
> +
> +	preempt_disable();
> +	it->end = ktime_get();
> +	data = tr->data[smp_processor_id()];
> +
> +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
> +					 &irq_flags);
> +	if (!event)
> +		goto out;
> +	entry	= ring_buffer_event_data(event);
> +	tracing_generic_entry_update(&entry->ent, 0, 0);
> +	entry->ent.type = TRACE_POWER;
> +	entry->state_data = *it;
> +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);

When ring_buffer_lock_reserve really disables interrupts (I haven't
checked since it's not in 2.6.27rc8) you could avoid the
preempt_disable by moving the data = tr->data ... one below.

Similar comments to trace_power_mark. Also it would be probably good 
to use a common function instead of duplicating so much code.


> +	it->state = level;
> +	it->type = type;
> +	it->stamp = ktime_get();
> +	preempt_disable();
> +	it->end = it->stamp;
> +	data = tr->data[smp_processor_id()];
> +
> +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
> +					 &irq_flags);
> +	if (!event)
> +		goto out;
> +	entry	= ring_buffer_event_data(event);
> +	tracing_generic_entry_update(&entry->ent, 0, 0);
> +	entry->ent.type = TRACE_POWER;
> +	entry->state_data = *it;
> +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
> +
> +	trace_wake_up();

Hmm, that does a unconditional wake_up() in idle. Doesn't this cause a loop
on UP?

idle -> wakeup -> idle -> wakeup -> ... etc.

Am I missing something?

> +#
> +# 	cat /sys/kernel/debug/tracer/trace | perl power.pl > out.svg

The cat is not needed

-Andi
-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 20:46 ` Andi Kleen
@ 2008-10-06 20:57   ` Arjan van de Ven
  2008-10-06 21:19     ` Andi Kleen
  2008-10-07 10:39   ` Steven Rostedt
  1 sibling, 1 reply; 20+ messages in thread
From: Arjan van de Ven @ 2008-10-06 20:57 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, Steven Rostedt, mingo, Peter Zijlstra, lenb

On Mon, 06 Oct 2008 22:46:09 +0200
Andi Kleen <andi@firstfloor.org> wrote:
> >  
> > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> 
> Wouldn't that be better higher up in the cpufreq system?  It would
> seem bad to duplicate that in all low level cpufreq modules.

the problem is that higher up the actual P state isn't known.

> 
> Also I suspect some higher level format would be good here too.
> Just put the frequency in? 

the link between P states and frequency is... rather lose.
Especially with Turbo Mode it no longer is really relevant to list
frequencies.

> > +			ret = trace_seq_printf(s, "[%5ld.%09ld]
> > CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
> > +					  stamp.tv_sec,
> > +					  stamp.tv_nsec,
> > +					  it->state, iter->cpu,
> > +					  duration.tv_sec,
> > +					  duration.tv_nsec);
> > +		if (it->type == POWER_PSTATE)
> > +			ret = trace_seq_printf(s, "[%5ld.%09ld]
> > PSTATE: Going to P%i on cpu %i\n",
> > +					  stamp.tv_sec,
> > +					  stamp.tv_nsec,
> > +					  it->state, iter->cpu);
> 
> I suspect a less verbose output format would be better.
why?
It's fine as it is, and it's actually human readable as well.

> 
> > +{
> > +	if (!trace_power_enabled)
> > +		return;
> > +
> > +	memset(it, 0, sizeof(struct power_trace));
> 
> The memset seems redundant.

it's free and it initializes the datastructure cleanly; and only when
the tracer is in use.

> 

> Hmm, that does a unconditional wake_up() in idle. Doesn't this cause
> a loop on UP?
> 
> idle -> wakeup -> idle -> wakeup -> ... etc.
> 
> Am I missing something?

yes you're missing something ;-)
this code is called when going out of idle, not when going into idle.


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 20:57   ` Arjan van de Ven
@ 2008-10-06 21:19     ` Andi Kleen
  2008-10-06 21:21       ` Arjan van de Ven
  0 siblings, 1 reply; 20+ messages in thread
From: Andi Kleen @ 2008-10-06 21:19 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, linux-kernel, Steven Rostedt, mingo, Peter Zijlstra, lenb

On Mon, Oct 06, 2008 at 01:57:15PM -0700, Arjan van de Ven wrote:
> 
> the problem is that higher up the actual P state isn't known.

My main worry is that every architecture is going to implement
their own version of this plugin if there isn't some abstraction.
 
> > Also I suspect some higher level format would be good here too.
> > Just put the frequency in? 
> 
> the link between P states and frequency is... rather lose.
> Especially with Turbo Mode it no longer is really relevant to list
> frequencies.

It would probably be less confusing for everyone if the higher level
cpufreq layers reported the correct frequency for turbo mode too.
I haven't checked how complicated this would be.

> 
> > > +			ret = trace_seq_printf(s, "[%5ld.%09ld]
> > > CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
> > > +					  stamp.tv_sec,
> > > +					  stamp.tv_nsec,
> > > +					  it->state, iter->cpu,
> > > +					  duration.tv_sec,
> > > +					  duration.tv_nsec);
> > > +		if (it->type == POWER_PSTATE)
> > > +			ret = trace_seq_printf(s, "[%5ld.%09ld]
> > > PSTATE: Going to P%i on cpu %i\n",
> > > +					  stamp.tv_sec,
> > > +					  stamp.tv_nsec,
> > > +					  it->state, iter->cpu);
> > 
> > I suspect a less verbose output format would be better.
> why?

to shuffle less data around

> It's fine as it is, and it's actually human readable as well.

I suspect humans could well do with something more compact too

e.g.

CPU %d -> P%d

> 
> > 
> > > +{
> > > +	if (!trace_power_enabled)
> > > +		return;
> > > +
> > > +	memset(it, 0, sizeof(struct power_trace));
> > 
> > The memset seems redundant.
> 
> it's free and it initializes the datastructure cleanly; and only when

AFAIK all data fields are init'ed anyways.

How is it free? I'm not sure gcc will optimize it away completely.

I thought the point of ftrace tracers was to be reasonably efficient
when they are enabled?  If efficiency doesn't matter you could
as well use printk.

> > Hmm, that does a unconditional wake_up() in idle. Doesn't this cause
> > a loop on UP?
> > 
> > idle -> wakeup -> idle -> wakeup -> ... etc.
> > 
> > Am I missing something?
> 
> yes you're missing something ;-)
> this code is called when going out of idle, not when going into idle.

Ok.

That means that when a CPU is idle forever there won't be any output?

-Andi

-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 21:19     ` Andi Kleen
@ 2008-10-06 21:21       ` Arjan van de Ven
  2008-10-06 21:34         ` Andi Kleen
  0 siblings, 1 reply; 20+ messages in thread
From: Arjan van de Ven @ 2008-10-06 21:21 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Andi Kleen, linux-kernel, Steven Rostedt, mingo, Peter Zijlstra, lenb

> > the link between P states and frequency is... rather lose.
> > Especially with Turbo Mode it no longer is really relevant to list
> > frequencies.
> 
> It would probably be less confusing for everyone if the higher level
> cpufreq layers reported the correct frequency for turbo mode too.
> I haven't checked how complicated this would be.

it's impossible until after the fact; you don't know which frequencies
you got until you check back later.


> Ok.
> 
> That means that when a CPU is idle forever there won't be any output?

correct; it'll wait until it stops being idle before telling  you how
long it was idle.

if it really bothers you we could do a dummy broadcast ipi on shutting
down the tracer.. in practice it's not a problem.
(we wake up all cpus all the time)



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 21:21       ` Arjan van de Ven
@ 2008-10-06 21:34         ` Andi Kleen
  0 siblings, 0 replies; 20+ messages in thread
From: Andi Kleen @ 2008-10-06 21:34 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andi Kleen, linux-kernel, Steven Rostedt, mingo, Peter Zijlstra, lenb

On Mon, Oct 06, 2008 at 02:21:31PM -0700, Arjan van de Ven wrote:
> > > the link between P states and frequency is... rather lose.
> > > Especially with Turbo Mode it no longer is really relevant to list
> > > frequencies.
> > 
> > It would probably be less confusing for everyone if the higher level
> > cpufreq layers reported the correct frequency for turbo mode too.
> > I haven't checked how complicated this would be.
> 
> it's impossible until after the fact; you don't know which frequencies
> you got until you check back later.

Well it could do that couldn't it? Ok not sure how big the cost
would be.

I can just imagine Turbo mode becoming a FAQ on this list and having
better reporting upfront might mitigate this a bit.

> 
> 
> > Ok.
> > 
> > That means that when a CPU is idle forever there won't be any output?
> 
> correct; it'll wait until it stops being idle before telling  you how
> long it was idle.
> 
> if it really bothers you we could do a dummy broadcast ipi on shutting
> down the tracer.. in practice it's not a problem.

Or just mark entry/exit, but you just need a wakeup threshold to avoid
the loop. A wakeup threshold seems like a good idea anyways though, just
to get better efficiency on larger systems.

> (we wake up all cpus all the time)

I assume that will change in the future and might even not true
anymore on some special stripped down configurations.

-Andi

-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 20:46 ` Andi Kleen
  2008-10-06 20:57   ` Arjan van de Ven
@ 2008-10-07 10:39   ` Steven Rostedt
  1 sibling, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2008-10-07 10:39 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Arjan van de Ven, linux-kernel, mingo, Peter Zijlstra, lenb


On Mon, 6 Oct 2008, Andi Kleen wrote:
> 
> > +void trace_power_end(struct power_trace *it)
> > +{
> > +	struct ring_buffer_event *event;
> > +	struct trace_power *entry;
> > +	struct trace_array_cpu *data;
> > +	unsigned long irq_flags;
> > +	struct trace_array *tr = power_trace;
> > +
> > +	if (!trace_power_enabled)
> > +		return;
> > +
> > +	preempt_disable();
> > +	it->end = ktime_get();
> > +	data = tr->data[smp_processor_id()];
> > +
> > +	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
> > +					 &irq_flags);
> > +	if (!event)
> > +		goto out;
> > +	entry	= ring_buffer_event_data(event);
> > +	tracing_generic_entry_update(&entry->ent, 0, 0);
> > +	entry->ent.type = TRACE_POWER;
> > +	entry->state_data = *it;
> > +	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
> 
> When ring_buffer_lock_reserve really disables interrupts (I haven't
> checked since it's not in 2.6.27rc8) you could avoid the
> preempt_disable by moving the data = tr->data ... one below.

The ring_buffer_lock_reserve use to disable interrupts (I'll be removing
the flags argument soon). Now it just does a minimum of preempt_disable.
So your suggestion about moving the smp_processor_id() calls below that
is still valid. The reserve will always disable preemption.

-- Steve

> 
> Similar comments to trace_power_mark. Also it would be probably good 
> to use a common function instead of duplicating so much code.
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-06 17:26 PATCH] ftrace: Add a C/P state tracer to help power optimization Arjan van de Ven
  2008-10-06 20:46 ` Andi Kleen
@ 2008-10-27 15:59 ` Ingo Molnar
  2008-10-27 16:05   ` Steven Rostedt
  2008-10-27 18:05   ` Arjan van de Ven
  1 sibling, 2 replies; 20+ messages in thread
From: Ingo Molnar @ 2008-10-27 15:59 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: linux-kernel, Steven Rostedt, Peter Zijlstra, lenb


* Arjan van de Ven <arjan@infradead.org> wrote:

> From: Arjan van de Ven <arjan@linux.intel.com>
> Date: Fri, 3 Oct 2008 10:18:21 -0700
> Subject: [PATCH] ftrace: Add a C/P state tracer to help power optimization
> 
> This patch adds a C/P-state ftrace plugin that will generate
> detailed statistics about the C/P-states that are being used,
> so that we can look at detailed decisions that the C/P-state
> code is making, rather than the too high level "average"
> that we have today.

nice! No fundamental objections - Len, do you concur? We could carry 
this in the ftrace tree, the impact on the rest of the kernel is 
minimal.

A few minor nits:

>  static void poll_idle(void)
>  {
> +	struct power_trace it;
> +	trace_power_start(&it, POWER_CSTATE, 0);

please put a newline after variable definitions.

> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -232,6 +232,32 @@ static inline void start_boot_trace(void) { }
>  static inline void stop_boot_trace(void) { }
>  #endif
>  
> +#define POWER_NONE 0
> +#define POWER_CSTATE 1
> +#define POWER_PSTATE 2
> +struct power_trace {

the defines should be an enum and please put a newline after the enum 
definition.

> +#ifdef CONFIG_POWER_TRACER
> +	ktime_t			stamp;
> +	ktime_t			end;
> +	int			type;
> +	int			state;
> +#endif
> +};

> index 396aea1..1f797bd 100644
> --- a/kernel/trace/Kconfig
> +++ b/kernel/trace/Kconfig
> @@ -134,6 +134,17 @@ config BOOT_TRACER
>  	  be enabled if this tracer is selected since only one tracer
>  	  should touch the tracing buffer at a time.
>  
> +config POWER_TRACER
> +	bool "Trace power consumption behavior"
> +	depends on DEBUG_KERNEL
> +	depends on X86
> +	select TRACING

this should be HAVE_POWER_TRACER defined in arch/x86/Kconfig, instead 
of a 'depends on X86'.

> +	help
> +	  This tracer helps developers to analyize and optimize the kernels
> +	  power management decisions, specifically the C-state and P-state
> +	  behavior.

s/analyize/analyse
s/kernels/kernel's

> +static struct trace_array *power_trace;
> +static int trace_power_enabled;

__read_mostly ?

	Ingo

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 15:59 ` Ingo Molnar
@ 2008-10-27 16:05   ` Steven Rostedt
  2008-10-27 16:21     ` Alan Cox
  2008-10-27 18:05   ` Arjan van de Ven
  1 sibling, 1 reply; 20+ messages in thread
From: Steven Rostedt @ 2008-10-27 16:05 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Arjan van de Ven, linux-kernel, Peter Zijlstra, lenb


On Mon, 27 Oct 2008, Ingo Molnar wrote:
> 
> > +	help
> > +	  This tracer helps developers to analyize and optimize the kernels
> > +	  power management decisions, specifically the C-state and P-state
> > +	  behavior.
> 
> s/analyize/analyse

Unless Britsh English is different, which I do not know.

s/analyse/analyze/

;-)

-- Steve

> s/kernels/kernel's
> 
> > +static struct trace_array *power_trace;
> > +static int trace_power_enabled;
> 
> __read_mostly ?
> 
> 	Ingo
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 16:05   ` Steven Rostedt
@ 2008-10-27 16:21     ` Alan Cox
  2008-10-27 17:16       ` Steven Rostedt
  0 siblings, 1 reply; 20+ messages in thread
From: Alan Cox @ 2008-10-27 16:21 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Ingo Molnar, Arjan van de Ven, linux-kernel, Peter Zijlstra, lenb

On Mon, 27 Oct 2008 12:05:57 -0400 (EDT)
Steven Rostedt <rostedt@goodmis.org> wrote:

> 
> On Mon, 27 Oct 2008, Ingo Molnar wrote:
> > 
> > > +	help
> > > +	  This tracer helps developers to analyize and optimize the kernels
> > > +	  power management decisions, specifically the C-state and P-state
> > > +	  behavior.
> > 
> > s/analyize/analyse
> 
> Unless Britsh English is different, which I do not know.
> 
> s/analyse/analyze/

American English is indeed "different" to English.

Alan
 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 16:21     ` Alan Cox
@ 2008-10-27 17:16       ` Steven Rostedt
  0 siblings, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2008-10-27 17:16 UTC (permalink / raw)
  To: Alan Cox
  Cc: Ingo Molnar, Arjan van de Ven, linux-kernel, Peter Zijlstra, lenb


On Mon, 27 Oct 2008, Alan Cox wrote:

> On Mon, 27 Oct 2008 12:05:57 -0400 (EDT)
> Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > 
> > On Mon, 27 Oct 2008, Ingo Molnar wrote:
> > > 
> > > > +	help
> > > > +	  This tracer helps developers to analyize and optimize the kernels
> > > > +	  power management decisions, specifically the C-state and P-state
> > > > +	  behavior.
> > > 
> > > s/analyize/analyse
> > 
> > Unless Britsh English is different, which I do not know.
> > 
> > s/analyse/analyze/
> 
> American English is indeed "different" to English.

By golly! Us Americans are more English that those on that Island over the 
pond. Dog gonnit! We still use the English system for measuring!

  Sarah Palin for Prez!

;-)

-- STeve


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 15:59 ` Ingo Molnar
  2008-10-27 16:05   ` Steven Rostedt
@ 2008-10-27 18:05   ` Arjan van de Ven
  2008-10-27 19:47     ` Frank Ch. Eigler
  1 sibling, 1 reply; 20+ messages in thread
From: Arjan van de Ven @ 2008-10-27 18:05 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Steven Rostedt, Peter Zijlstra, lenb

On Mon, 27 Oct 2008 16:59:21 +0100
Ingo Molnar <mingo@elte.hu> wrote:
> >  
> > +config POWER_TRACER
> > +	bool "Trace power consumption behavior"
> > +	depends on DEBUG_KERNEL
> > +	depends on X86
> > +	select TRACING
> 
> this should be HAVE_POWER_TRACER defined in arch/x86/Kconfig, instead 
> of a 'depends on X86'.

there's other tracers that have a similar depends

other than that, updated patch below:


From: Arjan van de Ven <arjan@linux.intel.com>
Subject: [PATCH] ftrace: Add a C/P state tracer to help power optimization

This patch adds a C/P-state ftrace plugin that will generate
detailed statistics about the C/P-states that are being used,
so that we can look at detailed decisions that the C/P-state
code is making, rather than the too high level "average"
that we have today.

An example way of using this is:
mount -t debugfs none /sys/kernel/debug
echo cstate > /sys/kernel/debug/tracing/current_tracer
echo 1 > /sys/kernel/debug/tracing/tracing_enabled
sleep 1
echo 0 > /sys/kernel/debug/tracing/tracing_enabled
cat /sys/kernel/debug/tracing/trace | perl scripts/trace/cstate.pl > out.svg

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    4 +
 arch/x86/kernel/process.c                  |   16 +++
 include/linux/ftrace.h                     |   29 +++++
 kernel/trace/Kconfig                       |   11 ++
 kernel/trace/Makefile                      |    1 +
 kernel/trace/trace.h                       |    7 +
 kernel/trace/trace_power.c                 |  183 ++++++++++++++++++++++++++++
 7 files changed, 251 insertions(+), 0 deletions(-)
 create mode 100644 kernel/trace/trace_power.c

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8e48c5d..88ea02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
+#include <linux/ftrace.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -391,6 +392,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
+	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
+	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
 		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3468131..d7baba9 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -7,6 +7,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/ftrace.h>
 #include <asm/system.h>
 
 unsigned long idle_halt;
@@ -100,6 +101,9 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
+		struct power_trace it;
+
+		trace_power_start(&it, POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -112,6 +116,7 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
+		trace_power_end(&it);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -154,24 +159,31 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
+	trace_power_end(&it);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
+	struct power_trace it;
 	if (!need_resched()) {
+		trace_power_start(&it, POWER_CSTATE, 1);
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
 		smp_mb();
 		if (!need_resched())
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
+		trace_power_end(&it);
 	} else
 		local_irq_enable();
 }
@@ -183,9 +195,13 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
+	struct power_trace it;
+
+	trace_power_start(&it, POWER_CSTATE, 0);
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
+	trace_power_end(&it);
 }
 
 /*
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 899ec4b..71c5c43 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -250,6 +250,35 @@ static inline void start_boot_trace(void) { }
 static inline void stop_boot_trace(void) { }
 #endif
 
+enum {
+	POWER_NONE = 0,
+	POWER_CSTATE = 1,
+	POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+	ktime_t			stamp;
+	ktime_t			end;
+	int			type;
+	int			state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
 
 
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e0cea28..e6386bc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -138,6 +138,17 @@ config BOOT_TRACER
 	    selected, because the self-tests are an initcall as well and that
 	    would invalidate the boot trace. )
 
+config POWER_TRACER
+	bool "Trace power consumption behavior"
+	depends on DEBUG_KERNEL
+	depends on X86
+	select TRACING
+	help
+	  This tracer helps developers to analyze and optimize the kernels
+	  power management decisions, specifically the C-state and P-state
+	  behavior.
+
+
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c8228b1..917b42e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -24,5 +24,6 @@ obj-$(CONFIG_NOP_TRACER) += trace_nop.o
 obj-$(CONFIG_STACK_TRACER) += trace_stack.o
 obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
 obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
+obj-$(CONFIG_POWER_TRACER) += trace_power.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6889ca4..e58d7a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -22,6 +22,7 @@ enum trace_type {
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
 	TRACE_BOOT,
+	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
 };
@@ -117,6 +118,11 @@ struct trace_boot {
 	struct boot_trace	initcall;
 };
 
+struct trace_power {
+	struct trace_entry	ent;
+	struct power_trace	state_data;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -217,6 +223,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_map,		\
 			  TRACE_MMIO_MAP);				\
 		IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT);	\
+		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
new file mode 100644
index 0000000..ac9ea42
--- /dev/null
+++ b/kernel/trace/trace_power.c
@@ -0,0 +1,183 @@
+/*
+ * ring buffer based C-state tracer
+ *
+ * Arjan van de Ven <arjan@linux.intel.com>
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * Much is borrowed from trace_boot.c which is
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/ftrace.h>
+#include <linux/kallsyms.h>
+
+#include "trace.h"
+
+static struct trace_array *power_trace;
+static int __read_mostly trace_power_enabled;
+
+
+static void start_power_trace(void)
+{
+	trace_power_enabled = 1;
+}
+
+static void stop_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 0;
+}
+
+static void power_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	power_trace = tr;
+
+	trace_power_enabled = 1;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+}
+
+static void power_trace_ctrl_update(struct trace_array *tr)
+{
+	if (tr->ctrl)
+		start_power_trace();
+	else
+		stop_power_trace(tr);
+}
+
+static enum print_line_t power_print_line(struct trace_iterator *iter)
+{
+	int ret = 0;
+	struct trace_entry *entry = iter->ent;
+	struct trace_power *field ;
+	struct power_trace *it;
+	struct trace_seq *s = &iter->seq;
+	struct timespec stamp;
+	struct timespec duration;
+
+	trace_assign_type(field, entry);
+	it = &field->state_data;
+	stamp = ktime_to_timespec(it->stamp);
+	duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
+
+	if (entry->type == TRACE_POWER) {
+		if (it->type == POWER_CSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu,
+					  duration.tv_sec,
+					  duration.tv_nsec);
+		if (it->type == POWER_PSTATE)
+			ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
+					  stamp.tv_sec,
+					  stamp.tv_nsec,
+					  it->state, iter->cpu);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_HANDLED;
+	}
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static struct tracer power_tracer __read_mostly =
+{
+	.name		= "power",
+	.init		= power_trace_init,
+	.reset		= stop_power_trace,
+	.ctrl_update	= power_trace_ctrl_update,
+	.print_line	= power_print_line,
+};
+
+static int init_power_trace(void)
+{
+	return register_tracer(&power_tracer);
+}
+device_initcall(init_power_trace);
+
+void trace_power_start(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+EXPORT_SYMBOL_GPL(trace_power_start);
+
+
+void trace_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_end);
+
+void trace_power_mark(struct power_trace *it, unsigned int type,
+			 unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	unsigned long irq_flags;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+	entry->ent.type = TRACE_POWER;
+	entry->state_data = *it;
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+
+ out:
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(trace_power_mark);
-- 
1.5.6.5



-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 18:05   ` Arjan van de Ven
@ 2008-10-27 19:47     ` Frank Ch. Eigler
  2008-10-27 20:13       ` Steven Rostedt
  2008-10-27 21:06       ` Arjan van de Ven
  0 siblings, 2 replies; 20+ messages in thread
From: Frank Ch. Eigler @ 2008-10-27 19:47 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, linux-kernel, Steven Rostedt, Peter Zijlstra, lenb


Arjan van de Ven <arjan@infradead.org> writes:

> [...]
> --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> [...]
> @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
>  		}
>  	}
>  
> +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> +
>  	switch (data->cpu_feature) {
>  	case SYSTEM_INTEL_MSR_CAPABLE:
>  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> [...]

Is there some reason that this doesn't use tracepoints instead
of such a single-backend hook?


- FChE

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 19:47     ` Frank Ch. Eigler
@ 2008-10-27 20:13       ` Steven Rostedt
  2008-10-27 21:06       ` Arjan van de Ven
  1 sibling, 0 replies; 20+ messages in thread
From: Steven Rostedt @ 2008-10-27 20:13 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Arjan van de Ven, Ingo Molnar, linux-kernel, Peter Zijlstra, lenb


On Mon, 27 Oct 2008, Frank Ch. Eigler wrote:

> 
> Arjan van de Ven <arjan@infradead.org> writes:
> 
> > [...]
> > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > [...]
> > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
> >  		}
> >  	}
> >  
> > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > +
> >  	switch (data->cpu_feature) {
> >  	case SYSTEM_INTEL_MSR_CAPABLE:
> >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > [...]
> 
> Is there some reason that this doesn't use tracepoints instead
> of such a single-backend hook?

Yeah, this should be converted to a trace_point. See include/trace/sched.h 
for examples.

The users are in kernel/sched.c.  Just add "trace_" in front of the names 
to find the users. Then you can look at kernel/trace/trace_sched_switch.c 
to see an example on how to hook into it.

-- Steve


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 19:47     ` Frank Ch. Eigler
  2008-10-27 20:13       ` Steven Rostedt
@ 2008-10-27 21:06       ` Arjan van de Ven
  2008-10-28 10:04         ` Ingo Molnar
  2009-02-10 20:57         ` Jason Baron
  1 sibling, 2 replies; 20+ messages in thread
From: Arjan van de Ven @ 2008-10-27 21:06 UTC (permalink / raw)
  To: Frank Ch. Eigler
  Cc: Ingo Molnar, linux-kernel, Steven Rostedt, Peter Zijlstra, lenb

On Mon, 27 Oct 2008 15:47:30 -0400
fche@redhat.com (Frank Ch. Eigler) wrote:

> 
> Arjan van de Ven <arjan@infradead.org> writes:
> 
> > [...]
> > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > [...]
> > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct
> > cpufreq_policy *policy, }
> >  	}
> >  
> > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > +
> >  	switch (data->cpu_feature) {
> >  	case SYSTEM_INTEL_MSR_CAPABLE:
> >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > [...]
> 
> Is there some reason that this doesn't use tracepoints instead
> of such a single-backend hook?

because it's a ton simpler this way? do simple things simpe and all
that....


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 21:06       ` Arjan van de Ven
@ 2008-10-28 10:04         ` Ingo Molnar
  2009-02-10 20:57         ` Jason Baron
  1 sibling, 0 replies; 20+ messages in thread
From: Ingo Molnar @ 2008-10-28 10:04 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Frank Ch. Eigler, linux-kernel, Steven Rostedt, Peter Zijlstra, lenb


* Arjan van de Ven <arjan@infradead.org> wrote:

> On Mon, 27 Oct 2008 15:47:30 -0400
> fche@redhat.com (Frank Ch. Eigler) wrote:
> 
> > 
> > Arjan van de Ven <arjan@infradead.org> writes:
> > 
> > > [...]
> > > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > [...]
> > > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct
> > > cpufreq_policy *policy, }
> > >  	}
> > >  
> > > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > > +
> > >  	switch (data->cpu_feature) {
> > >  	case SYSTEM_INTEL_MSR_CAPABLE:
> > >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > > [...]
> > 
> > Is there some reason that this doesn't use tracepoints instead
> > of such a single-backend hook?
> 
> because it's a ton simpler this way? do simple things simpe and all 
> that....

correct.

	Ingo

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2008-10-27 21:06       ` Arjan van de Ven
  2008-10-28 10:04         ` Ingo Molnar
@ 2009-02-10 20:57         ` Jason Baron
  2009-02-10 21:26           ` Frederic Weisbecker
  2009-02-11  9:30           ` Ingo Molnar
  1 sibling, 2 replies; 20+ messages in thread
From: Jason Baron @ 2009-02-10 20:57 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Frank Ch. Eigler, Ingo Molnar, linux-kernel, Steven Rostedt,
	Peter Zijlstra, lenb

On Mon, Oct 27, 2008 at 02:06:30PM -0700, Arjan van de Ven wrote:
> > Arjan van de Ven <arjan@infradead.org> writes:
> > 
> > > [...]
> > > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > [...]
> > > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct
> > > cpufreq_policy *policy, }
> > >  	}
> > >  
> > > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > > +
> > >  	switch (data->cpu_feature) {
> > >  	case SYSTEM_INTEL_MSR_CAPABLE:
> > >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > > [...]
> > 
> > Is there some reason that this doesn't use tracepoints instead
> > of such a single-backend hook?
> 
> because it's a ton simpler this way? do simple things simpe and all
> that....
> 

hi,

I wrote a patch to make c/p state tracer dependent on tracepoints and
then realized that the discussion had already been had. However, the
patch to use tracepoints is fairly simple, allows for other consumers,
and avoids a function call in the off case. please consider.

thanks,

-Jason

Signed-off-by: Jason Baron <jbaron@redhat.com>

---

 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    3 +++
 arch/x86/kernel/process.c                  |    4 ++++
 include/linux/ftrace.h                     |   15 ---------------
 include/trace/power.h                      |   18 ++++++++++++++++++
 kernel/trace/trace_power.c                 |   28 ++++++++++++++++++++--------
 5 files changed, 45 insertions(+), 23 deletions(-)
 create mode 100644 include/trace/power.h


diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319..4540ddc 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,6 +34,7 @@
 #include <linux/compiler.h>
 #include <linux/dmi.h>
 #include <linux/ftrace.h>
+#include <trace/power.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
@@ -70,6 +71,8 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
+DEFINE_TRACE(power_mark);
+
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e68bb9e..09cfd5d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,6 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/ftrace.h>
+#include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 
@@ -19,6 +20,9 @@ EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
 
+DEFINE_TRACE(power_start);
+DEFINE_TRACE(power_end);
+
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 677432b..044e0fd 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -342,21 +342,6 @@ struct power_trace {
 #endif
 };
 
-#ifdef CONFIG_POWER_TRACER
-extern void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_end(struct power_trace *it);
-#else
-static inline void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_end(struct power_trace *it) { }
-#endif
-
-
 /*
  * Structure that defines an entry function trace.
  */
diff --git a/include/trace/power.h b/include/trace/power.h
new file mode 100644
index 0000000..c3225ff
--- /dev/null
+++ b/include/trace/power.h
@@ -0,0 +1,18 @@
+#ifndef _TRACE_POWER_H
+#define _TRACE_POWER_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_TRACE(power_start,
+	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
+		TPARGS(it, type, state));
+
+DECLARE_TRACE(power_mark,
+	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
+		TPARGS(it, type, state));
+
+DECLARE_TRACE(power_end,
+	TPPROTO(struct power_trace *it),
+		TPARGS(it));
+
+#endif
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 7bda248..18e97c9 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -14,30 +14,45 @@
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
 #include <linux/module.h>
+#include <linux/module.h>
+#include <trace/power.h>
 
 #include "trace.h"
 
 static struct trace_array *power_trace;
 static int __read_mostly trace_power_enabled;
 
+static void trace_power_start_callback(struct power_trace *it,
+					unsigned int type, unsigned int state);
+static void trace_power_mark_callback(struct power_trace *it, unsigned int type,
+					unsigned int state);
+static void trace_power_end_callback(struct power_trace *it);
 
 static void start_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 1;
+	register_trace_power_start(trace_power_start_callback);
+	register_trace_power_end(trace_power_end_callback);
+	register_trace_power_mark(trace_power_mark_callback);
 }
 
 static void stop_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 0;
+	unregister_trace_power_start(trace_power_start_callback);
+	unregister_trace_power_end(trace_power_end_callback);
+	unregister_trace_power_mark(trace_power_mark_callback);
 }
 
-
 static int power_trace_init(struct trace_array *tr)
 {
 	int cpu;
 	power_trace = tr;
 
 	trace_power_enabled = 1;
+	register_trace_power_start(trace_power_start_callback);
+	register_trace_power_end(trace_power_end_callback);
+	register_trace_power_mark(trace_power_mark_callback);
 
 	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
@@ -95,8 +110,8 @@ static int init_power_trace(void)
 }
 device_initcall(init_power_trace);
 
-void trace_power_start(struct power_trace *it, unsigned int type,
-			 unsigned int level)
+static void trace_power_start_callback(struct power_trace *it,
+			unsigned int type, unsigned int level)
 {
 	if (!trace_power_enabled)
 		return;
@@ -106,10 +121,9 @@ void trace_power_start(struct power_trace *it, unsigned int type,
 	it->type = type;
 	it->stamp = ktime_get();
 }
-EXPORT_SYMBOL_GPL(trace_power_start);
 
 
-void trace_power_end(struct power_trace *it)
+static void trace_power_end_callback(struct power_trace *it)
 {
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
@@ -139,9 +153,8 @@ void trace_power_end(struct power_trace *it)
  out:
 	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(trace_power_end);
 
-void trace_power_mark(struct power_trace *it, unsigned int type,
+static void trace_power_mark_callback(struct power_trace *it, unsigned int type,
 			 unsigned int level)
 {
 	struct ring_buffer_event *event;
@@ -176,4 +189,3 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
  out:
 	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(trace_power_mark);

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2009-02-10 20:57         ` Jason Baron
@ 2009-02-10 21:26           ` Frederic Weisbecker
  2009-02-11  9:30           ` Ingo Molnar
  1 sibling, 0 replies; 20+ messages in thread
From: Frederic Weisbecker @ 2009-02-10 21:26 UTC (permalink / raw)
  To: Jason Baron
  Cc: Arjan van de Ven, Frank Ch. Eigler, Ingo Molnar, linux-kernel,
	Steven Rostedt, Peter Zijlstra, lenb

On Tue, Feb 10, 2009 at 03:57:47PM -0500, Jason Baron wrote:
> On Mon, Oct 27, 2008 at 02:06:30PM -0700, Arjan van de Ven wrote:
> > > Arjan van de Ven <arjan@infradead.org> writes:
> > > 
> > > > [...]
> > > > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > > [...]
> > > > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct
> > > > cpufreq_policy *policy, }
> > > >  	}
> > > >  
> > > > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > > > +
> > > >  	switch (data->cpu_feature) {
> > > >  	case SYSTEM_INTEL_MSR_CAPABLE:
> > > >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > > > [...]
> > > 
> > > Is there some reason that this doesn't use tracepoints instead
> > > of such a single-backend hook?
> > 
> > because it's a ton simpler this way? do simple things simpe and all
> > that....
> > 
> 
> hi,
> 
> I wrote a patch to make c/p state tracer dependent on tracepoints and
> then realized that the discussion had already been had. However, the
> patch to use tracepoints is fairly simple, allows for other consumers,
> and avoids a function call in the off case. please consider.
> 
> thanks,
> 
> -Jason


Hi Jason, yes that's a nice idea.

 
> Signed-off-by: Jason Baron <jbaron@redhat.com>
> 
> ---
> 
>  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    3 +++
>  arch/x86/kernel/process.c                  |    4 ++++
>  include/linux/ftrace.h                     |   15 ---------------
>  include/trace/power.h                      |   18 ++++++++++++++++++
>  kernel/trace/trace_power.c                 |   28 ++++++++++++++++++++--------
>  5 files changed, 45 insertions(+), 23 deletions(-)
>  create mode 100644 include/trace/power.h
> 
> 
> diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> index 4b1c319..4540ddc 100644
> --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> @@ -34,6 +34,7 @@
>  #include <linux/compiler.h>
>  #include <linux/dmi.h>
>  #include <linux/ftrace.h>
> +#include <trace/power.h>
>  
>  #include <linux/acpi.h>
>  #include <acpi/processor.h>
> @@ -70,6 +71,8 @@ struct acpi_cpufreq_data {
>  
>  static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
>  
> +DEFINE_TRACE(power_mark);
> +
>  /* acpi_perf_data is a pointer to percpu data. */
>  static struct acpi_processor_performance *acpi_perf_data;
>  
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index e68bb9e..09cfd5d 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -9,6 +9,7 @@
>  #include <linux/pm.h>
>  #include <linux/clockchips.h>
>  #include <linux/ftrace.h>
> +#include <trace/power.h>
>  #include <asm/system.h>
>  #include <asm/apic.h>
>  
> @@ -19,6 +20,9 @@ EXPORT_SYMBOL(idle_nomwait);
>  
>  struct kmem_cache *task_xstate_cachep;
>  
> +DEFINE_TRACE(power_start);
> +DEFINE_TRACE(power_end);
> +
>  int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
>  {
>  	*dst = *src;
> diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
> index 677432b..044e0fd 100644
> --- a/include/linux/ftrace.h
> +++ b/include/linux/ftrace.h
> @@ -342,21 +342,6 @@ struct power_trace {
>  #endif
>  };
>  
> -#ifdef CONFIG_POWER_TRACER
> -extern void trace_power_start(struct power_trace *it, unsigned int type,
> -					unsigned int state);
> -extern void trace_power_mark(struct power_trace *it, unsigned int type,
> -					unsigned int state);
> -extern void trace_power_end(struct power_trace *it);
> -#else
> -static inline void trace_power_start(struct power_trace *it, unsigned int type,
> -					unsigned int state) { }
> -static inline void trace_power_mark(struct power_trace *it, unsigned int type,
> -					unsigned int state) { }
> -static inline void trace_power_end(struct power_trace *it) { }
> -#endif
> -
> -
>  /*
>   * Structure that defines an entry function trace.
>   */
> diff --git a/include/trace/power.h b/include/trace/power.h
> new file mode 100644
> index 0000000..c3225ff
> --- /dev/null
> +++ b/include/trace/power.h
> @@ -0,0 +1,18 @@
> +#ifndef _TRACE_POWER_H
> +#define _TRACE_POWER_H
> +
> +#include <linux/tracepoint.h>
> +
> +DECLARE_TRACE(power_start,
> +	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
> +		TPARGS(it, type, state));
> +
> +DECLARE_TRACE(power_mark,
> +	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
> +		TPARGS(it, type, state));
> +
> +DECLARE_TRACE(power_end,
> +	TPPROTO(struct power_trace *it),
> +		TPARGS(it));
> +
> +#endif


A recent patch on -tip tree moved the power tracing related headers to trace/power.h
So you will probably need to rebase your patch against latest -tip (not a lot of changes
are needed though).


> diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
> index 7bda248..18e97c9 100644
> --- a/kernel/trace/trace_power.c
> +++ b/kernel/trace/trace_power.c
> @@ -14,30 +14,45 @@
>  #include <linux/ftrace.h>
>  #include <linux/kallsyms.h>
>  #include <linux/module.h>
> +#include <linux/module.h>
> +#include <trace/power.h>
>  
>  #include "trace.h"
>  
>  static struct trace_array *power_trace;
>  static int __read_mostly trace_power_enabled;
>  
> +static void trace_power_start_callback(struct power_trace *it,
> +					unsigned int type, unsigned int state);
> +static void trace_power_mark_callback(struct power_trace *it, unsigned int type,
> +					unsigned int state);
> +static void trace_power_end_callback(struct power_trace *it);
>  
>  static void start_power_trace(struct trace_array *tr)
>  {
>  	trace_power_enabled = 1;
> +	register_trace_power_start(trace_power_start_callback);
> +	register_trace_power_end(trace_power_end_callback);
> +	register_trace_power_mark(trace_power_mark_callback);
>  }
>  
>  static void stop_power_trace(struct trace_array *tr)
>  {
>  	trace_power_enabled = 0;
> +	unregister_trace_power_start(trace_power_start_callback);
> +	unregister_trace_power_end(trace_power_end_callback);
> +	unregister_trace_power_mark(trace_power_mark_callback);
>  }
>  
> -
>  static int power_trace_init(struct trace_array *tr)
>  {
>  	int cpu;
>  	power_trace = tr;
>  
>  	trace_power_enabled = 1;
> +	register_trace_power_start(trace_power_start_callback);
> +	register_trace_power_end(trace_power_end_callback);
> +	register_trace_power_mark(trace_power_mark_callback);


It would be probably better to put your tracepoints registering on a dedicated
function to call it on both init and start.
Also it should be better to check the return value and warn on fail case.

Thanks!


>  	for_each_cpu(cpu, cpu_possible_mask)
>  		tracing_reset(tr, cpu);
> @@ -95,8 +110,8 @@ static int init_power_trace(void)
>  }
>  device_initcall(init_power_trace);
>  
> -void trace_power_start(struct power_trace *it, unsigned int type,
> -			 unsigned int level)
> +static void trace_power_start_callback(struct power_trace *it,
> +			unsigned int type, unsigned int level)
>  {
>  	if (!trace_power_enabled)
>  		return;
> @@ -106,10 +121,9 @@ void trace_power_start(struct power_trace *it, unsigned int type,
>  	it->type = type;
>  	it->stamp = ktime_get();
>  }
> -EXPORT_SYMBOL_GPL(trace_power_start);
>  
>  
> -void trace_power_end(struct power_trace *it)
> +static void trace_power_end_callback(struct power_trace *it)
>  {
>  	struct ring_buffer_event *event;
>  	struct trace_power *entry;
> @@ -139,9 +153,8 @@ void trace_power_end(struct power_trace *it)
>   out:
>  	preempt_enable();
>  }
> -EXPORT_SYMBOL_GPL(trace_power_end);
>  
> -void trace_power_mark(struct power_trace *it, unsigned int type,
> +static void trace_power_mark_callback(struct power_trace *it, unsigned int type,
>  			 unsigned int level)
>  {
>  	struct ring_buffer_event *event;
> @@ -176,4 +189,3 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
>   out:
>  	preempt_enable();
>  }
> -EXPORT_SYMBOL_GPL(trace_power_mark);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2009-02-10 20:57         ` Jason Baron
  2009-02-10 21:26           ` Frederic Weisbecker
@ 2009-02-11  9:30           ` Ingo Molnar
  2009-02-11 18:57             ` Jason Baron
  1 sibling, 1 reply; 20+ messages in thread
From: Ingo Molnar @ 2009-02-11  9:30 UTC (permalink / raw)
  To: Jason Baron
  Cc: Arjan van de Ven, Frank Ch. Eigler, linux-kernel, Steven Rostedt,
	Peter Zijlstra, lenb, Frédéric Weisbecker


* Jason Baron <jbaron@redhat.com> wrote:

> On Mon, Oct 27, 2008 at 02:06:30PM -0700, Arjan van de Ven wrote:
> > > Arjan van de Ven <arjan@infradead.org> writes:
> > > 
> > > > [...]
> > > > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > > [...]
> > > > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct
> > > > cpufreq_policy *policy, }
> > > >  	}
> > > >  
> > > > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > > > +
> > > >  	switch (data->cpu_feature) {
> > > >  	case SYSTEM_INTEL_MSR_CAPABLE:
> > > >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > > > [...]
> > > 
> > > Is there some reason that this doesn't use tracepoints instead
> > > of such a single-backend hook?
> > 
> > because it's a ton simpler this way? do simple things simpe and all
> > that....
> > 
> 
> hi,
> 
> I wrote a patch to make c/p state tracer dependent on tracepoints and
> then realized that the discussion had already been had. However, the
> patch to use tracepoints is fairly simple, allows for other consumers,
> and avoids a function call in the off case. please consider.
> 
> thanks,
> 
> -Jason
> 
> Signed-off-by: Jason Baron <jbaron@redhat.com>
> 
> ---
> 
>  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    3 +++
>  arch/x86/kernel/process.c                  |    4 ++++
>  include/linux/ftrace.h                     |   15 ---------------
>  include/trace/power.h                      |   18 ++++++++++++++++++
>  kernel/trace/trace_power.c                 |   28 ++++++++++++++++++++--------
>  5 files changed, 45 insertions(+), 23 deletions(-)
>  create mode 100644 include/trace/power.h

Looks good, but could you please base this on latest -tip? There's a number of 
pending changes in the tracing tree that conflict:

patching file arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
Hunk #1 FAILED at 34.
1 out of 2 hunks FAILED -- rejects in file 
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
patching file arch/x86/kernel/process.c
Hunk #1 FAILED at 9.
1 out of 2 hunks FAILED -- rejects in file arch/x86/kernel/process.c
patching file include/linux/ftrace.h
Hunk #1 FAILED at 342.
1 out of 1 hunk FAILED -- rejects in file include/linux/ftrace.h
The next patch would create the file include/trace/power.h,
which already exists!  Applying it anyway.
patching file include/trace/power.h
Patch attempted to create file include/trace/power.h, which already exists.
Hunk #1 FAILED at 1.
1 out of 1 hunk FAILED -- rejects in file include/trace/power.h
patching file kernel/trace/trace_power.c
Hunk #1 FAILED at 14.
Hunk #2 succeeded at 111 (offset 1 line).
Hunk #4 succeeded at 148 (offset -5 lines).
Hunk #5 succeeded at 183 (offset -6 lines).
1 out of 5 hunks FAILED -- rejects in file kernel/trace/trace_power.c

See:

  http://people.redhat.com/mingo/tip.git/README

	Ingo

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: PATCH] ftrace: Add a C/P state tracer to help power optimization
  2009-02-11  9:30           ` Ingo Molnar
@ 2009-02-11 18:57             ` Jason Baron
  0 siblings, 0 replies; 20+ messages in thread
From: Jason Baron @ 2009-02-11 18:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Arjan van de Ven, Frank Ch. Eigler, linux-kernel, Steven Rostedt,
	Peter Zijlstra, lenb, Frédéric Weisbecker

On Wed, Feb 11, 2009 at 10:30:29AM +0100, Ingo Molnar wrote:
> * Jason Baron <jbaron@redhat.com> wrote:
> 
> > On Mon, Oct 27, 2008 at 02:06:30PM -0700, Arjan van de Ven wrote:
> > > > Arjan van de Ven <arjan@infradead.org> writes:
> > > > 
> > > > > [...]
> > > > > --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > > > +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
> > > > > [...]
> > > > > @@ -427,6 +429,8 @@ static int acpi_cpufreq_target(struct
> > > > > cpufreq_policy *policy, }
> > > > >  	}
> > > > >  
> > > > > +	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
> > > > > +
> > > > >  	switch (data->cpu_feature) {
> > > > >  	case SYSTEM_INTEL_MSR_CAPABLE:
> > > > >  		cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
> > > > > [...]
> > > > 
> > > > Is there some reason that this doesn't use tracepoints instead
> > > > of such a single-backend hook?
> > > 
> > > because it's a ton simpler this way? do simple things simpe and all
> > > that....
> > > 
> > 
> > hi,
> > 
> > I wrote a patch to make c/p state tracer dependent on tracepoints and
> > then realized that the discussion had already been had. However, the
> > patch to use tracepoints is fairly simple, allows for other consumers,
> > and avoids a function call in the off case. please consider.
> > 
> > thanks,
> > 
> > -Jason
> > 
> > Signed-off-by: Jason Baron <jbaron@redhat.com>
> > 
> > ---
> > 
> >  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    3 +++
> >  arch/x86/kernel/process.c                  |    4 ++++
> >  include/linux/ftrace.h                     |   15 ---------------
> >  include/trace/power.h                      |   18 ++++++++++++++++++
> >  kernel/trace/trace_power.c                 |   28 ++++++++++++++++++++--------
> >  5 files changed, 45 insertions(+), 23 deletions(-)
> >  create mode 100644 include/trace/power.h
> 
> Looks good, but could you please base this on latest -tip? There's a number of 
> pending changes in the tracing tree that conflict:
> 

ok, re-based patch against latest -tip. Addresses Frédéric's
concerns as well.

thanks,

-Jason

Convert the c/p state "power" tracer to use tracepoints. Avoids a
function call when the tracer is disabled.

Signed-off-by: Jason Baron <jbaron@redhat.com>
---

 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |    2 
 arch/x86/kernel/process.c                  |    3 
 include/trace/power.h                      |   25 ++--
 kernel/trace/trace_power.c                 |  173 +++++++++++++++++-----------
 4 files changed, 119 insertions(+), 84 deletions(-)


diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7ed925e..c5d737c 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -70,6 +70,8 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
+DEFINE_TRACE(power_mark);
+
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0f5d420..bf91350 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -19,6 +19,9 @@ EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
 
+DEFINE_TRACE(power_start);
+DEFINE_TRACE(power_end);
+
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
diff --git a/include/trace/power.h b/include/trace/power.h
index c7cefbc..2c733e5 100644
--- a/include/trace/power.h
+++ b/include/trace/power.h
@@ -2,6 +2,7 @@
 #define _TRACE_POWER_H
 
 #include <linux/ktime.h>
+#include <linux/tracepoint.h>
 
 enum {
 	POWER_NONE = 0,
@@ -18,18 +19,16 @@ struct power_trace {
 #endif
 };
 
-#ifdef CONFIG_POWER_TRACER
-extern void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_end(struct power_trace *it);
-#else
-static inline void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_end(struct power_trace *it) { }
-#endif
+DECLARE_TRACE(power_start,
+	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
+		TPARGS(it, type, state));
+
+DECLARE_TRACE(power_mark,
+	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
+		TPARGS(it, type, state));
+
+DECLARE_TRACE(power_end,
+	TPPROTO(struct power_trace *it),
+		TPARGS(it));
 
 #endif /* _TRACE_POWER_H */
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index b1d0d08..91ce672 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -21,15 +21,116 @@
 static struct trace_array *power_trace;
 static int __read_mostly trace_power_enabled;
 
+static void probe_power_start(struct power_trace *it, unsigned int type,
+				unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+
+
+static void probe_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->state_data = *it;
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+ out:
+	preempt_enable();
+}
+
+static void probe_power_mark(struct power_trace *it, unsigned int type,
+				unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->state_data = *it;
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+ out:
+	preempt_enable();
+}
+
+static int tracing_power_register(void)
+{
+	int ret;
+
+	ret = register_trace_power_start(probe_power_start);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_start\n");
+		return ret;
+	}
+	ret = register_trace_power_end(probe_power_end);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_end\n");
+		goto fail_start;
+	}
+	ret = register_trace_power_mark(probe_power_mark);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_mark\n");
+		goto fail_end;
+	}
+	return ret;
+fail_end:
+	unregister_trace_power_end(probe_power_end);
+fail_start:
+	unregister_trace_power_start(probe_power_start);
+	return ret;
+}
 
 static void start_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 1;
+	tracing_power_register();
 }
 
 static void stop_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 0;
+	unregister_trace_power_start(probe_power_start);
+	unregister_trace_power_end(probe_power_end);
+	unregister_trace_power_mark(probe_power_mark);
 }
 
 
@@ -39,6 +140,7 @@ static int power_trace_init(struct trace_array *tr)
 	power_trace = tr;
 
 	trace_power_enabled = 1;
+	tracing_power_register();
 
 	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
@@ -95,74 +197,3 @@ static int init_power_trace(void)
 	return register_tracer(&power_tracer);
 }
 device_initcall(init_power_trace);
-
-void trace_power_start(struct power_trace *it, unsigned int type,
-			 unsigned int level)
-{
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-}
-EXPORT_SYMBOL_GPL(trace_power_start);
-
-
-void trace_power_end(struct power_trace *it)
-{
-	struct ring_buffer_event *event;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	preempt_disable();
-	it->end = ktime_get();
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_end);
-
-void trace_power_mark(struct power_trace *it, unsigned int type,
-			 unsigned int level)
-{
-	struct ring_buffer_event *event;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-	preempt_disable();
-	it->end = it->stamp;
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_mark);

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2009-02-11 18:58 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-10-06 17:26 PATCH] ftrace: Add a C/P state tracer to help power optimization Arjan van de Ven
2008-10-06 20:46 ` Andi Kleen
2008-10-06 20:57   ` Arjan van de Ven
2008-10-06 21:19     ` Andi Kleen
2008-10-06 21:21       ` Arjan van de Ven
2008-10-06 21:34         ` Andi Kleen
2008-10-07 10:39   ` Steven Rostedt
2008-10-27 15:59 ` Ingo Molnar
2008-10-27 16:05   ` Steven Rostedt
2008-10-27 16:21     ` Alan Cox
2008-10-27 17:16       ` Steven Rostedt
2008-10-27 18:05   ` Arjan van de Ven
2008-10-27 19:47     ` Frank Ch. Eigler
2008-10-27 20:13       ` Steven Rostedt
2008-10-27 21:06       ` Arjan van de Ven
2008-10-28 10:04         ` Ingo Molnar
2009-02-10 20:57         ` Jason Baron
2009-02-10 21:26           ` Frederic Weisbecker
2009-02-11  9:30           ` Ingo Molnar
2009-02-11 18:57             ` Jason Baron

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).