LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch] sched: fix single-depth wchan output
@ 2008-11-06  5:58 Ken Chen
  2008-11-06  6:11 ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Ken Chen @ 2008-11-06  5:58 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linux Kernel Mailing List

To get a meaningful /proc/<pid>/wchan, one is required to turn on full
frame pointer when compile kernel/sched.c on x86 arch.  The enabling of
frame pointer applies to entire kernel/sched.c and affects lots of other
core scheduler functions that aren't related to wchan's call stack
unwind.  This causes unnecessary expansion of stack pointer push and pop
on the stack for scheduler functions.  To cut down the cost of frame
pointer push/pop, one can use compile time config option 'single-depth
wchan'.  However, the 'single-depth' option is broken on x86 due to lack
of stack frame marker and simple stack unwind doesn't work, i.e., wchan
always produces '0'.

This patch adds call site location explicitly in thread_struct for
schedule() function so that get_wchan() can reliably get the data and at
the same time not to overly burden the entire kernel/sched.c with frame
pointer generation.  The remove of frame pointer dependency allows
compiler to generate better and faster core scheduler code on x86_64.

Signed-off-by: Ken Chen <kenchen@google.com>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e60c59b..9951853 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
+	select SCHED_NO_NO_OMIT_FRAME_POINTER

 config ARCH_DEFCONFIG
 	string
@@ -367,7 +368,7 @@ config X86_RDC321X
 config SCHED_NO_NO_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
-	depends on X86_32
+	depends on X86
 	help
 	  Calculate simpler /proc/<PID>/wchan values. If this option
 	  is disabled then wchan values will recurse back to the
index 5ca01e3..1d2ff70 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -401,6 +401,7 @@ struct thread_struct {
 	unsigned long		ip;
 	unsigned long		fs;
 	unsigned long		gs;
+	unsigned long		wchan;
 	/* Hardware debugging registers: */
 	unsigned long		debugreg0;
 	unsigned long		debugreg1;
@@ -603,6 +604,12 @@ extern void release_thread(struct task_struct *);
 extern void prepare_to_copy(struct task_struct *tsk);

 unsigned long get_wchan(struct task_struct *p);
+#define set_wchan(task, ip) do { (task)->thread.wchan = (ip); } while (0)
+#define set_wchan_cond(task, ip) do {			\
+	unsigned long *__wchan = &(task)->thread.wchan;	\
+	if (!__wchan)					\
+		*__wchan = (ip);			\
+} while (0)

 /*
  * Generic CPUID function
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302f..ba02359 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -697,26 +697,10 @@ out:

 unsigned long get_wchan(struct task_struct *p)
 {
-	unsigned long bp, sp, ip;
-	unsigned long stack_page;
-	int count = 0;
 	if (!p || p == current || p->state == TASK_RUNNING)
 		return 0;
-	stack_page = (unsigned long)task_stack_page(p);
-	sp = p->thread.sp;
-	if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
-		return 0;
-	/* include/asm-i386/system.h:switch_to() pushes bp last. */
-	bp = *(unsigned long *) sp;
-	do {
-		if (bp < stack_page || bp > top_ebp+stack_page)
-			return 0;
-		ip = *(unsigned long *) (bp+4);
-		if (!in_sched_functions(ip))
-			return ip;
-		bp = *(unsigned long *) bp;
-	} while (count++ < 16);
-	return 0;
+
+	return p->thread.wchan;
 }

 unsigned long arch_align_stack(unsigned long sp)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index c958120..222029b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -739,26 +739,10 @@ asmlinkage long sys_vfork(struct pt_regs *regs)

 unsigned long get_wchan(struct task_struct *p)
 {
-	unsigned long stack;
-	u64 fp, ip;
-	int count = 0;
-
 	if (!p || p == current || p->state == TASK_RUNNING)
 		return 0;
-	stack = (unsigned long)task_stack_page(p);
-	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
-		return 0;
-	fp = *(u64 *)(p->thread.sp);
-	do {
-		if (fp < (unsigned long)stack ||
-		    fp >= (unsigned long)stack+THREAD_SIZE)
-			return 0;
-		ip = *(u64 *)(fp+8);
-		if (!in_sched_functions(ip))
-			return ip;
-		fp = *(u64 *)fp;
-	} while (count++ < 16);
-	return 0;
+
+	return p->thread.wchan;
 }

 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b483f39..82f0b11 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -324,6 +324,11 @@ extern char __sched_text_start[], __sched_text_end[];
 /* Is this address in the __sched functions? */
 extern int in_sched_functions(unsigned long addr);

+#ifndef set_wchan
+#define set_wchan(task, ip)
+#define set_wchan_cond(task, ip)
+#endif
+
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
 extern signed long schedule_timeout(signed long timeout);
 extern signed long schedule_timeout_interruptible(signed long timeout);
diff --git a/kernel/sched.c b/kernel/sched.c
index e8819bc..48b0965 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4477,6 +4477,7 @@ need_resched_nonpreemptible:
 		rq->curr = next;
 		++*switch_count;

+		set_wchan_cond(prev, _RET_IP_);
 		context_switch(rq, prev, next); /* unlocks the rq */
 		/*
 		 * the context switch might have flipped the stack from under
@@ -4487,6 +4488,7 @@ need_resched_nonpreemptible:
 	} else
 		spin_unlock_irq(&rq->lock);

+	set_wchan(current, 0);
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;

@@ -4514,6 +4516,7 @@ asmlinkage void __sched preempt_schedule(void)
 		return;

 	do {
+		set_wchan(current, _RET_IP_);
 		add_preempt_count(PREEMPT_ACTIVE);
 		schedule();
 		sub_preempt_count(PREEMPT_ACTIVE);
@@ -4541,6 +4544,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 	BUG_ON(ti->preempt_count || !irqs_disabled());

 	do {
+		set_wchan(current, _RET_IP_);
 		add_preempt_count(PREEMPT_ACTIVE);
 		local_irq_enable();
 		schedule();
@@ -5547,6 +5551,7 @@ asmlinkage long sys_sched_yield(void)
 	_raw_spin_unlock(&rq->lock);
 	preempt_enable_no_resched();

+	set_wchan(current, _RET_IP_);
 	schedule();

 	return 0;
@@ -5563,6 +5568,7 @@ static void __cond_resched(void)
 	 * cond_resched() call.
 	 */
 	do {
+		set_wchan(current, _RET_IP_);
 		add_preempt_count(PREEMPT_ACTIVE);
 		schedule();
 		sub_preempt_count(PREEMPT_ACTIVE);
@@ -5646,6 +5652,7 @@ void __sched io_schedule(void)

 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
+	set_wchan(current, _RET_IP_);
 	schedule();
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
diff --git a/kernel/timer.c b/kernel/timer.c
index 56becf3..72def2f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1182,6 +1182,7 @@ signed long __sched schedule_timeout
 	struct timer_list timer;
 	unsigned long expire;

+	set_wchan(current, _RET_IP_);
 	switch (timeout)
 	{
 	case MAX_SCHEDULE_TIMEOUT:

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  5:58 [patch] sched: fix single-depth wchan output Ken Chen
@ 2008-11-06  6:11 ` Ingo Molnar
  2008-11-06  6:16   ` Ken Chen
  2008-11-06 21:56   ` Matt Mackall
  0 siblings, 2 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-11-06  6:11 UTC (permalink / raw)
  To: Ken Chen; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt


* Ken Chen <kenchen@google.com> wrote:

> To get a meaningful /proc/<pid>/wchan, one is required to turn on 
> full frame pointer when compile kernel/sched.c on x86 arch.  The 
> enabling of frame pointer applies to entire kernel/sched.c and 
> affects lots of other core scheduler functions that aren't related 
> to wchan's call stack unwind.  This causes unnecessary expansion of 
> stack pointer push and pop on the stack for scheduler functions.  To 
> cut down the cost of frame pointer push/pop, one can use compile 
> time config option 'single-depth wchan'.  However, the 
> 'single-depth' option is broken on x86 due to lack of stack frame 
> marker and simple stack unwind doesn't work, i.e., wchan always 
> produces '0'.
> 
> This patch adds call site location explicitly in thread_struct for 
> schedule() function so that get_wchan() can reliably get the data 
> and at the same time not to overly burden the entire kernel/sched.c 
> with frame pointer generation.  The remove of frame pointer 
> dependency allows compiler to generate better and faster core 
> scheduler code on x86_64.

hm, this adds overhead - and the thing is that WCHAN is rather 
uninformative to begin with (because it's a single dimension), so we 
should phase it out, not expand it.

How about adding a /proc/<PID>/stacktrace file that gives us the stack 
trace of any task in the system? That would be useful for a number of 
other purposes as well, and about 100 times more useful than wchan. 
(often it would be more useful than sysrq-t dumps)

Hm?

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:11 ` Ingo Molnar
@ 2008-11-06  6:16   ` Ken Chen
  2008-11-06  6:30     ` Ingo Molnar
  2008-11-06 21:56   ` Matt Mackall
  1 sibling, 1 reply; 11+ messages in thread
From: Ken Chen @ 2008-11-06  6:16 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt

On Wed, Nov 5, 2008 at 10:11 PM, Ingo Molnar <mingo@elte.hu> wrote:
> hm, this adds overhead - and the thing is that WCHAN is rather
> uninformative to begin with (because it's a single dimension), so we
> should phase it out, not expand it.
>
> How about adding a /proc/<PID>/stacktrace file that gives us the stack
> trace of any task in the system? That would be useful for a number of
> other purposes as well, and about 100 times more useful than wchan.
> (often it would be more useful than sysrq-t dumps)

Sure, my main motivation is to remove frame pointer generation. x86_64
unconditionally adds fp for kernel/sched.c right now.  I'm all for
phasing out wchan if people don't think there is value in it.

- Ken

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:16   ` Ken Chen
@ 2008-11-06  6:30     ` Ingo Molnar
  2008-11-06  6:42       ` Ken Chen
  2008-11-06 22:33       ` Andi Kleen
  0 siblings, 2 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-11-06  6:30 UTC (permalink / raw)
  To: Ken Chen; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt


* Ken Chen <kenchen@google.com> wrote:

> On Wed, Nov 5, 2008 at 10:11 PM, Ingo Molnar <mingo@elte.hu> wrote:
>
> > hm, this adds overhead - and the thing is that WCHAN is rather 
> > uninformative to begin with (because it's a single dimension), so 
> > we should phase it out, not expand it.
> >
> > How about adding a /proc/<PID>/stacktrace file that gives us the 
> > stack trace of any task in the system? That would be useful for a 
> > number of other purposes as well, and about 100 times more useful 
> > than wchan. (often it would be more useful than sysrq-t dumps)
> 
> Sure, my main motivation is to remove frame pointer generation. 
> x86_64 unconditionally adds fp for kernel/sched.c right now.  I'm 
> all for phasing out wchan if people don't think there is value in 
> it.

are you interested in adding /proc/<PID>/stacktrace? If yes then we 
could remove fp generation for 64-bit right now and add your 
stacktrace patch when you are done with it.

Generally we want frame pointers for high quality backtraces and 
trouble-shooting. The small cost is almost always worth paying and 
most distros enable framepointers for that reason. On 32-bit a 
no-framepointers kernel image has less register pressure, but on 
64-bit there's little reason to not enable them.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:30     ` Ingo Molnar
@ 2008-11-06  6:42       ` Ken Chen
  2008-11-06  6:59         ` Ingo Molnar
  2008-11-06 22:33       ` Andi Kleen
  1 sibling, 1 reply; 11+ messages in thread
From: Ken Chen @ 2008-11-06  6:42 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt

On Wed, Nov 5, 2008 at 10:30 PM, Ingo Molnar <mingo@elte.hu> wrote:
> are you interested in adding /proc/<PID>/stacktrace? If yes then we
> could remove fp generation for 64-bit right now and add your
> stacktrace patch when you are done with it.

Yes, we had a patch internally at google to dump task stack trace via
procfs just like what you suggested.  I will get that patch out.

- Ken

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:42       ` Ken Chen
@ 2008-11-06  6:59         ` Ingo Molnar
  2008-11-06  7:28           ` Ken Chen
  0 siblings, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2008-11-06  6:59 UTC (permalink / raw)
  To: Ken Chen; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt


* Ken Chen <kenchen@google.com> wrote:

> On Wed, Nov 5, 2008 at 10:30 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > are you interested in adding /proc/<PID>/stacktrace? If yes then we
> > could remove fp generation for 64-bit right now and add your
> > stacktrace patch when you are done with it.
> 
> Yes, we had a patch internally at google to dump task stack trace 
> via procfs just like what you suggested.  I will get that patch out.

very nice!

If it does stack walking manually then please update it to use 
save_stack_trace() instead - that is the standard API that will 
utilize the best possible stack walking machinery on the architecture 
level.

For security reasons we want it to be admin-only readable, and we also 
want a .config for it for the extra paranoid and for CONFIG_EMBEDDED.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:59         ` Ingo Molnar
@ 2008-11-06  7:28           ` Ken Chen
  2008-11-06  7:44             ` Ingo Molnar
  0 siblings, 1 reply; 11+ messages in thread
From: Ken Chen @ 2008-11-06  7:28 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt

On Wed, Nov 5, 2008 at 10:59 PM, Ingo Molnar <mingo@elte.hu> wrote:
> If it does stack walking manually then please update it to use
> save_stack_trace() instead - that is the standard API that will
> utilize the best possible stack walking machinery on the architecture
> level.

OK, I pulled the patch out of our code that export stack trace via
/proc/pid/trace.  I didn't write this patch, but I think a better choice
would be to override struct stacktrace_ops print_trace_ops with a memory
buffer pointer to dump the stack into.  If you have any comments, please
let me know.  I will polish this patch up and rebase to git-head.

--- linux/arch/x86/Kconfig	2008-11-05 22:30:06.000000000 -0800
+++ linux/arch/x86/Kconfig	2008-11-05 22:30:06.000000000 -0800
@@ -213,6 +213,11 @@

 config KTIME_SCALAR
 	def_bool X86_32
+
+config ARCH_HAS_BUF_SHOW_STACK
+	bool
+	default y
+
 source "init/Kconfig"

 menu "Processor type and features"
--- linux/arch/x86/kernel/traps_64.c	2008-11-05 22:30:06.000000000 -0800
+++ linux/arch/x86/kernel/traps_64.c	2008-11-05 22:30:06.000000000 -0800
@@ -78,6 +78,11 @@
 asmlinkage void machine_check(void);
 asmlinkage void spurious_interrupt_bug(void);

+struct arch_unwind {
+	char **posp;
+	char *end;
+};
+
 static unsigned int code_bytes = 64;

 static inline void conditional_sti(struct pt_regs *regs)
@@ -104,22 +109,23 @@

 int kstack_depth_to_print = 12;

-void printk_address(unsigned long address, int reliable)
-{
 #ifdef CONFIG_KALLSYMS
+static void buf_printk_address(char **posp, char *end, unsigned long address,
+			int reliable)
+{
 	unsigned long offset = 0, symsize;
 	const char *symname;
 	char *modname;
 	char *delim = ":";
-	char namebuf[KSYM_NAME_LEN];
+	char namebuf[128];
 	char reliab[4] = "";

 	symname = kallsyms_lookup(address, &symsize, &offset,
-					&modname, namebuf);
+				  &modname, namebuf);
 	if (!offset)
-		return;	/* We don't want to print function ptrs */
+		return;         /* We don't want to print function ptrs */
 	if (!symname) {
-		printk(" [<%016lx>]\n", address);
+		buf_printk(posp, end, " [<%016lx>]\n", address);
 		return;
 	}
 	if (!reliable)
@@ -127,11 +133,21 @@

 	if (!modname)
 		modname = delim = "";
-	printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
-		address, reliab, delim, modname, delim, symname, offset, symsize);
+	buf_printk(posp, end, " [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
+		address, reliab, delim, modname, delim, symname, offset,
+		symsize);
+}
 #else
-	printk(" [<%016lx>]\n", address);
+static void buf_printk_address(char **posp, char *end, unsigned long address,
+			int reliable)
+{
+	buf_printk(posp, end, " [<%016lx>]\n", address);
+}
 #endif
+
+void printk_address(unsigned long address, int reliable)
+{
+	buf_printk_address(NULL, NULL, address, reliable);
 }

 static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
@@ -357,25 +373,31 @@
 static void
 print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
 {
-	print_symbol(msg, symbol);
-	printk("\n");
+	struct arch_unwind *unw = (struct arch_unwind *)data;
+
+	buf_print_symbol(unw->posp, unw->end, msg, symbol);
+	buf_printk(unw->posp, unw->end, "\n");
 }

 static void print_trace_warning(void *data, char *msg)
 {
-	printk("%s\n", msg);
+	struct arch_unwind *unw = (struct arch_unwind *)data;
+
+	buf_printk(unw->posp, unw->end, "%s\n", msg);
 }

 static int print_trace_stack(void *data, char *name)
 {
-	printk(" <%s> ", name);
+	struct arch_unwind *unw = (struct arch_unwind *)data;
+	buf_printk(unw->posp, unw->end, " <%s> ", name);
 	return 0;
 }

 static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
+	struct arch_unwind *unw = (struct arch_unwind *)data;
 	touch_nmi_watchdog();
-	printk_address(addr, reliable);
+	buf_printk_address(unw->posp, unw->end, addr, reliable);
 }

 static const struct stacktrace_ops print_trace_ops = {
@@ -385,18 +407,29 @@
 	.address = print_trace_address,
 };

+static void
+buf_show_trace(char **posp, char *end,
+		struct task_struct *tsk, struct pt_regs *regs,
+		unsigned long *stack, unsigned long bp) {
+	struct arch_unwind unw;
+	unw.posp = posp;
+	unw.end = end;
+
+	buf_printk(posp, end, "\nCall Trace:\n");
+	dump_trace(tsk, regs, stack, bp, &print_trace_ops, &unw);
+	buf_printk(posp, end, "\n");
+}
+
 void
 show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack,
 		unsigned long bp)
 {
-	printk("\nCall Trace:\n");
-	dump_trace(tsk, regs, stack, bp, &print_trace_ops, NULL);
-	printk("\n");
+	buf_show_trace(NULL, NULL, tsk, regs, stack, bp);
 }

 static void
-_show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long *sp,
-							unsigned long bp)
+_buf_show_stack(char **posp, char *end, struct task_struct *tsk,
+		struct pt_regs *regs, unsigned long *sp, unsigned long bp)
 {
 	unsigned long *stack;
 	int i;
@@ -419,23 +452,29 @@
 		if (stack >= irqstack && stack <= irqstack_end) {
 			if (stack == irqstack_end) {
 				stack = (unsigned long *) (irqstack_end[-1]);
-				printk(" <EOI> ");
+				buf_printk(posp, end, " <EOI> ");
 			}
 		} else {
 		if (((long) stack & (THREAD_SIZE-1)) == 0)
 			break;
 		}
 		if (i && ((i % 4) == 0))
-			printk("\n");
-		printk(" %016lx", *stack++);
+			buf_printk(posp, end, "\n");
+		buf_printk(posp, end, " %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace(tsk, regs, sp, bp);
+	buf_show_trace(posp, end, tsk, regs, sp, bp);
 }

 void show_stack(struct task_struct *tsk, unsigned long * sp)
 {
-	_show_stack(tsk, NULL, sp, 0);
+	_buf_show_stack(NULL, NULL, tsk, NULL, sp, 0);
+}
+
+void buf_show_stack(char **posp, char *end, struct task_struct *tsk,
+		    unsigned long * sp)
+{
+	_buf_show_stack(posp, end, tsk, NULL, sp, 0);
 }

 /*
@@ -485,7 +524,8 @@
 	if (!user_mode(regs)) {
 		unsigned char c;
 		printk("Stack: ");
-		_show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
+		_buf_show_stack(NULL, NULL,
+				NULL, regs, (unsigned long *)sp, regs->bp);
 		printk("\n");

 		printk(KERN_EMERG "Code: ");
diff -u linux/fs/proc/base.c 2.6.26/fs/proc/base.c
--- linux/fs/proc/base.c	2008-11-05 22:27:42.000000000 -0800
+++ linux/fs/proc/base.c	2008-11-05 22:27:42.000000000 -0800
@@ -2271,6 +2271,31 @@

 #endif

+static int proc_pid_trace(struct task_struct *task, char * buffer)
+{
+	char *pos, *end, **posp;
+
+	posp = &pos;
+	pos = buffer;
+	end = buffer + PAGE_SIZE;
+	memset(buffer, 0, PAGE_SIZE);
+
+	/*
+	 * Add some safety checking
+	 */
+
+	buffer[PAGE_SIZE - 1] = 1;
+	end--;
+
+	read_lock(&tasklist_lock);
+	buf_show_task(posp, end, task);
+	read_unlock(&tasklist_lock);
+
+	WARN_ON(buffer[PAGE_SIZE - 1] != 1);
+	WARN_ON(buffer[PAGE_SIZE - 2] != 0);
+	return (pos - buffer);
+}
+
 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
 					 size_t count, loff_t *ppos)
@@ -2574,6 +2599,7 @@
 	REG("smaps",      S_IRUGO, smaps),
 	REG("pagemap",    S_IRUSR, pagemap),
 #endif
+	INF("trace",  S_IFREG|S_IRUGO, pid_trace),
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
 #endif
--- linux/include/linux/kallsyms.h	2008-11-05 22:30:06.000000000 -0800
+++ linux/include/linux/kallsyms.h	2008-11-05 22:30:06.000000000 -0800
@@ -31,6 +31,8 @@

 /* Look up a kernel symbol and print it to the kernel messages. */
 extern void __print_symbol(const char *fmt, unsigned long address);
+extern void __buf_print_symbol(char **posp, char *end,
+			       const char *fmt, unsigned long address);

 int lookup_symbol_name(unsigned long addr, char *symname);
 int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
unsigned long *offset, char *modname, char *name);
@@ -75,6 +77,7 @@

 /* Stupid that this does nothing, but I didn't create this mess. */
 #define __print_symbol(fmt, addr)
+#define __buf_print_symbol(posp, end, fmt, addr)
 #endif /*CONFIG_KALLSYMS*/

 /* This macro allows us to keep printk typechecking */
@@ -105,6 +108,14 @@
 	print_symbol(fmt, (unsigned long)addr);
 }

+static inline void buf_print_symbol(char **posp, char *end,
+				    const char *fmt, unsigned long addr)
+{
+	__check_printsym_format(fmt, "");
+	__buf_print_symbol(posp, end, fmt, (unsigned long)
+			   __builtin_extract_return_addr((void *)addr));
+}
+
 #ifndef CONFIG_64BIT
 #define print_ip_sym(ip)		\
 do {					\
--- linux/include/linux/kernel.h	2008-11-05 22:30:06.000000000 -0800
+++ linux/include/linux/kernel.h	2008-11-05 22:30:06.000000000 -0800
@@ -224,6 +224,9 @@
 extern void __attribute__((format(printf, 1, 2)))
 	early_printk(const char *fmt, ...);

+extern void buf_printk(char **posp, char *end, const char *fmt, ...)
+	__attribute__ ((format (printf, 3, 4)));
+
 void redisplay_console_messages(void);

 unsigned long int_sqrt(unsigned long);
--- linux/include/linux/sched.h	2008-11-05 22:30:06.000000000 -0800
+++ linux/include/linux/sched.h	2008-11-05 22:30:06.000000000 -0800
@@ -267,6 +267,7 @@
  */
 extern void show_state_filter(unsigned long state_filter);

+extern void buf_show_task(char **posp, char *end, struct task_struct *p);
 static inline void show_state(void)
 {
 	show_state_filter(0);
@@ -280,7 +281,17 @@
  * trace (or NULL if the entire call-chain of the task should be shown).
  */
 extern void show_stack(struct task_struct *task, unsigned long *sp);
-
+#ifdef CONFIG_ARCH_HAS_BUF_SHOW_STACK
+extern void buf_show_stack(char **posp, char *end,
+			   struct task_struct *task, unsigned long *sp);
+#else
+static inline void buf_show_stack(char **posp, char *end,
+			     struct task_struct *task, unsigned long *sp)
+{
+	if (posp == NULL)
+		show_stack(task, sp);
+}
+#endif
 void io_schedule(void);
 long io_schedule_timeout(long timeout);

--- linux/kernel/kallsyms.c	2008-11-05 22:30:06.000000000 -0800
+++ linux/kernel/kallsyms.c	2008-11-05 22:30:06.000000000 -0800
@@ -319,13 +319,19 @@
 }

 /* Look up a kernel symbol and print it to the kernel messages. */
+void __buf_print_symbol(char **posp, char *end,
+			const char *fmt, unsigned long address)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+
+	sprint_symbol(buffer, address);
+
+	buf_printk(posp, end, fmt, buffer);
+}
+
 void __print_symbol(const char *fmt, unsigned long address)
 {
-	char buffer[KSYM_SYMBOL_LEN];
-
-	sprint_symbol(buffer, address);
-
-	printk(fmt, buffer);
+	__buf_print_symbol(NULL, NULL, fmt, address);
 }

 /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */
--- linux/kernel/printk.c	2008-11-05 22:30:06.000000000 -0800
+++ linux/kernel/printk.c	2008-11-05 22:30:06.000000000 -0800
@@ -655,6 +655,20 @@
 	return r;
 }

+void buf_printk(char **posp, char *end, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (posp != NULL)
+		*posp += vscnprintf(*posp, end - (*posp), fmt, args);
+	else
+		(void)vprintk(fmt, args);
+
+	va_end(args);
+}
+
 /* cpu currently holding logbuf_lock */
 static volatile unsigned int printk_cpu = UINT_MAX;

--- linux/kernel/sched.c	2008-11-05 22:30:06.000000000 -0800
+++ linux/kernel/sched.c	2008-11-05 22:30:06.000000000 -0800
@@ -5398,24 +5398,24 @@

 static const char stat_nam[] = "RSDTtZX";

-void sched_show_task(struct task_struct *p)
+void buf_show_task(char **posp, char *end, struct task_struct *p)
 {
 	unsigned long free = 0;
 	unsigned state;

 	state = p->state ? __ffs(p->state) + 1 : 0;
-	printk(KERN_INFO "%-13.13s %c", p->comm,
+	buf_printk(posp, end, KERN_INFO "%-13.13s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
 	if (state == TASK_RUNNING)
-		printk(KERN_CONT " running  ");
+		buf_printk(posp, end, KERN_CONT " running  ");
 	else
-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+		buf_printk(posp, end, KERN_CONT " %08lx ", thread_saved_pc(p));
 #else
 	if (state == TASK_RUNNING)
-		printk(KERN_CONT "  running task    ");
+		buf_printk(posp, end, KERN_CONT "  running task    ");
 	else
-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+		buf_printk(posp, end, KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	{
@@ -5425,10 +5425,15 @@
 		free = (unsigned long)n - (unsigned long)end_of_stack(p);
 	}
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
+	buf_printk(posp, end, KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));

-	show_stack(p, NULL);
+	buf_show_stack(posp, end, p, NULL);
+}
+
+void sched_show_task(struct task_struct *p)
+{
+	buf_show_task(NULL, NULL, p);
 }

 void show_state_filter(unsigned long state_filter)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  7:28           ` Ken Chen
@ 2008-11-06  7:44             ` Ingo Molnar
  0 siblings, 0 replies; 11+ messages in thread
From: Ingo Molnar @ 2008-11-06  7:44 UTC (permalink / raw)
  To: Ken Chen; +Cc: Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt


* Ken Chen <kenchen@google.com> wrote:

> On Wed, Nov 5, 2008 at 10:59 PM, Ingo Molnar <mingo@elte.hu> wrote:
> > If it does stack walking manually then please update it to use
> > save_stack_trace() instead - that is the standard API that will
> > utilize the best possible stack walking machinery on the architecture
> > level.
> 
> OK, I pulled the patch out of our code that export stack trace via 
> /proc/pid/trace.  I didn't write this patch, but I think a better 
> choice would be to override struct stacktrace_ops print_trace_ops 
> with a memory buffer pointer to dump the stack into.  If you have 
> any comments, please let me know.  I will polish this patch up and 
> rebase to git-head.

hm, instead of modifying the lowlevel arch dump code, why not just use 
the existing save_stack_trace(), and render the output yourself via a 
trivial sprintf, just like kernel/lockdep.c does?

See kernel/stacktrace.c's print_stack_trace() - that could be extended 
with a sprintf_stack_trace() method. Allocate a large enough buffer 
dynamically, with a max of 128 stacktrace entries or so. (the output 
buffer is limited to 4K-ish anyway, right?)

As a bonus this will work on every architecture, not just x86.

a few other details:

> -	char namebuf[KSYM_NAME_LEN];
> +	char namebuf[128];

... time machine back to old crappy code ;-)

>  	symname = kallsyms_lookup(address, &symsize, &offset,
> -					&modname, namebuf);
> +				  &modname, namebuf);

ditto. But none of this has to be modified so you can just drop these 
bits.

> +	read_lock(&tasklist_lock);
> +	buf_show_task(posp, end, task);
> +	read_unlock(&tasklist_lock);

to get a stable trace we'll need more locking than that (tasklist_lock 
does not exclude scheduling, etc.) - but it takes care of the most 
important detail: tasks exiting from under us. So this should be OK.

>  #endif
> +	INF("trace",  S_IFREG|S_IRUGO, pid_trace),

that needs to be r-------- instead of r--r--r--, for security reasons.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:11 ` Ingo Molnar
  2008-11-06  6:16   ` Ken Chen
@ 2008-11-06 21:56   ` Matt Mackall
  2008-11-06 22:30     ` Chris Friesen
  1 sibling, 1 reply; 11+ messages in thread
From: Matt Mackall @ 2008-11-06 21:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Ken Chen, Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt

On Thu, 2008-11-06 at 07:11 +0100, Ingo Molnar wrote:
> * Ken Chen <kenchen@google.com> wrote:
> 
> > To get a meaningful /proc/<pid>/wchan, one is required to turn on 
> > full frame pointer when compile kernel/sched.c on x86 arch.  The 
> > enabling of frame pointer applies to entire kernel/sched.c and 
> > affects lots of other core scheduler functions that aren't related 
> > to wchan's call stack unwind.  This causes unnecessary expansion of 
> > stack pointer push and pop on the stack for scheduler functions.  To 
> > cut down the cost of frame pointer push/pop, one can use compile 
> > time config option 'single-depth wchan'.  However, the 
> > 'single-depth' option is broken on x86 due to lack of stack frame 
> > marker and simple stack unwind doesn't work, i.e., wchan always 
> > produces '0'.
> > 
> > This patch adds call site location explicitly in thread_struct for 
> > schedule() function so that get_wchan() can reliably get the data 
> > and at the same time not to overly burden the entire kernel/sched.c 
> > with frame pointer generation.  The remove of frame pointer 
> > dependency allows compiler to generate better and faster core 
> > scheduler code on x86_64.
> 
> hm, this adds overhead - and the thing is that WCHAN is rather 
> uninformative to begin with (because it's a single dimension), so we 
> should phase it out, not expand it.

WCHAN is a long-standing public interface and isn't a Linuxism. I don't
think phasing it out is a good idea.

> How about adding a /proc/<PID>/stacktrace file that gives us the stack 
> trace of any task in the system? That would be useful for a number of 
> other purposes as well, and about 100 times more useful than wchan. 
> (often it would be more useful than sysrq-t dumps)

But this is a great idea, of course.

-- 
Mathematics is the supreme nostalgia of our time.


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06 21:56   ` Matt Mackall
@ 2008-11-06 22:30     ` Chris Friesen
  0 siblings, 0 replies; 11+ messages in thread
From: Chris Friesen @ 2008-11-06 22:30 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Ingo Molnar, Ken Chen, Linux Kernel Mailing List, Peter Zijlstra,
	Steven Rostedt

Matt Mackall wrote:
> On Thu, 2008-11-06 at 07:11 +0100, Ingo Molnar wrote:

>>How about adding a /proc/<PID>/stacktrace file that gives us the stack 
>>trace of any task in the system? That would be useful for a number of 
>>other purposes as well, and about 100 times more useful than wchan. 
>>(often it would be more useful than sysrq-t dumps)
> 
> 
> But this is a great idea, of course.

Agreed.  I know some apps designers that would love to get their hands 
on something like that...



Chris

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [patch] sched: fix single-depth wchan output
  2008-11-06  6:30     ` Ingo Molnar
  2008-11-06  6:42       ` Ken Chen
@ 2008-11-06 22:33       ` Andi Kleen
  1 sibling, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2008-11-06 22:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Ken Chen, Linux Kernel Mailing List, Peter Zijlstra, Steven Rostedt

Ingo Molnar <mingo@elte.hu> writes:
>
> are you interested in adding /proc/<PID>/stacktrace? If yes then we 
> could remove fp generation for 64-bit right now and add your 
> stacktrace patch when you are done with it.

FWIW there used to be a patch floating around doing that a couple of
years ago and even used in some distros. But the big problem was if
someone read all of proc it took a lot of CPU time to do all the stack
tracing. This also was triggerable from non root.

So yes it's useful, but it can be costly.

-Andi

-- 
ak@linux.intel.com

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2008-11-06 22:33 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-11-06  5:58 [patch] sched: fix single-depth wchan output Ken Chen
2008-11-06  6:11 ` Ingo Molnar
2008-11-06  6:16   ` Ken Chen
2008-11-06  6:30     ` Ingo Molnar
2008-11-06  6:42       ` Ken Chen
2008-11-06  6:59         ` Ingo Molnar
2008-11-06  7:28           ` Ken Chen
2008-11-06  7:44             ` Ingo Molnar
2008-11-06 22:33       ` Andi Kleen
2008-11-06 21:56   ` Matt Mackall
2008-11-06 22:30     ` Chris Friesen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).