LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH] Kwatch: kernel watchpoints using CPU debug registers
@ 2007-01-16 16:55 Alan Stern
  2007-01-16 23:35 ` Christoph Hellwig
  2007-01-17  9:44 ` Ingo Molnar
  0 siblings, 2 replies; 11+ messages in thread
From: Alan Stern @ 2007-01-16 16:55 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Prasanna S Panchamukhi, Kernel development list

From: Alan Stern <stern@rowland.harvard.edu>

This patch (as839) implements the Kwatch (kernel-space hardware-based
watchpoints) API for the i386 architecture.  The API is explained in
the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c.

The original version of the patch was written by Vamsi Krishna S and
Bharata Rao.  It was later updated by Prasanna S Panchamukhi for 2.6.13
and then again by me for 2.6.20.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>

---

Hardware-based watchpoints can sometimes be indispensable for finding the 
source of problems.  Although this patch is only for the x86 architecture, 
it should still be useful.  And there's no downside to adopting it, since 
it has virtually no overhead with CONFIG_KWATCH isn't selected.


Index: usb-2.6/arch/i386/kernel/debugreg.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/debugreg.c
@@ -0,0 +1,182 @@
+/*
+ *  Debug register
+ *  arch/i386/kernel/debugreg.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> and
+ *		Bharata Rao <bharata@in.ibm.com> to provide debug register
+ *		allocation mechanism.
+ * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
+ *		idr_allocations mechanism as suggested by Andi Kleen.
+ */
+
+/*
+ * These routines provide a debug register allocation mechanism.
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/debugreg.h>
+
+struct debugreg {
+	int flag;
+	int use_count;
+};
+
+struct debugreg dr_list[DR_MAX];
+static spinlock_t dr_lock = SPIN_LOCK_UNLOCKED;
+unsigned long dr7_global_mask = DR_CONTROL_RESERVED | DR_GLOBAL_SLOWDOWN |
+		DR_GLOBAL_ENABLE_MASK;
+
+static unsigned long dr7_global_reg_mask(unsigned int regnum)
+{
+	return (0xf << (16 + regnum * 4)) | (0x1 << (regnum * 2));
+}
+
+static int get_dr(int regnum, int flag)
+{
+	if (flag == DR_ALLOC_GLOBAL && !dr_list[regnum].flag) {
+		dr_list[regnum].flag = flag;
+		dr7_global_mask |= dr7_global_reg_mask(regnum);
+		return regnum;
+	}
+	if (flag == DR_ALLOC_LOCAL &&
+			dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
+		dr_list[regnum].flag = flag;
+		dr_list[regnum].use_count++;
+		return regnum;
+	}
+	return -1;
+}
+
+static void free_dr(int regnum)
+{
+	if (dr_list[regnum].flag == DR_ALLOC_LOCAL) {
+		if (!--dr_list[regnum].use_count)
+			dr_list[regnum].flag = 0;
+	} else {
+		dr_list[regnum].flag = 0;
+		dr_list[regnum].use_count = 0;
+		dr7_global_mask &= ~(dr7_global_reg_mask(regnum));
+	}
+}
+
+int dr_alloc(int regnum, int flag)
+{
+	int ret = -1;
+
+	spin_lock(&dr_lock);
+	if (regnum >= 0 && regnum < DR_MAX)
+		ret = get_dr(regnum, flag);
+	else if (regnum == DR_ANY) {
+
+		/* gdb allocates local debug registers starting from 0.
+		 * To help avoid conflicts, we'll start from the other end.
+		 */
+		for (regnum = DR_MAX - 1; regnum >= 0; --regnum) {
+			ret = get_dr(regnum, flag);
+			if (ret >= 0)
+				break;
+		}
+	} else
+		printk(KERN_ERR "dr_alloc: "
+				"Cannot allocate debug register %d\n", regnum);
+	spin_unlock(&dr_lock);
+	return ret;
+}
+
+void dr_free(int regnum)
+{
+	spin_lock(&dr_lock);
+	if (regnum < 0 || regnum >= DR_MAX || !dr_list[regnum].flag)
+		printk(KERN_ERR "dr_free: "
+				"Cannot free debug register %d\n", regnum);
+	else
+		free_dr(regnum);
+	spin_unlock(&dr_lock);
+}
+
+void dr_inc_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			dr_list[i].use_count++;
+	}
+	spin_unlock(&dr_lock);
+}
+
+void dr_dec_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			free_dr(i);
+	}
+	spin_unlock(&dr_lock);
+}
+
+int dr_is_global(int regnum)
+{
+	return (dr_list[regnum].flag == DR_ALLOC_GLOBAL);
+}
+
+/*
+ * This routine decides if a ptrace request is for enabling or disabling
+ * a debug reg, and accordingly calls dr_alloc() or dr_free().
+ *
+ * gdb uses ptrace to write to debug registers.  It assumes that writing to
+ * a debug register always succeds and it doesn't check the return value of
+ * ptrace.  Now with this new global debug register allocation/freeing,
+ * ptrace request for a local debug register will fail if the required debug
+ * register is already globally allocated.  Since gdb doesn't notice this
+ * failure, it sometimes tries to free a debug register which it does not
+ * own.
+ *
+ * Returns -1 if the ptrace request tries to locally allocate a debug register
+ * that is already globally allocated.  Otherwise returns >0 or 0 according
+ * as any debug registers are or are not locally allocated in the new setting.
+ */
+int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	if (new_dr7 & DR_LOCAL_ENABLE_MASK & dr7_global_mask)
+		return -1;
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if ((old_dr7 ^ new_dr7) & dr_local_enable) {
+			if (new_dr7 & dr_local_enable)
+				dr_alloc(i, DR_ALLOC_LOCAL);
+			else
+				dr_free(i);
+		}
+	}
+	return new_dr7 & DR_LOCAL_ENABLE_MASK;
+}
+
+EXPORT_SYMBOL(dr_alloc);
+EXPORT_SYMBOL(dr_free);
Index: usb-2.6/arch/i386/kernel/process.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/process.c
+++ usb-2.6/arch/i386/kernel/process.c
@@ -51,6 +51,7 @@
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
+#include <asm/debugreg.h>
 
 #include <linux/err.h>
 
@@ -356,9 +357,10 @@ EXPORT_SYMBOL(kernel_thread);
  */
 void exit_thread(void)
 {
+	struct task_struct *tsk = current;
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
-		struct task_struct *tsk = current;
 		struct thread_struct *t = &tsk->thread;
 		int cpu = get_cpu();
 		struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -376,12 +378,16 @@ void exit_thread(void)
 		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
+ 	if (unlikely(tsk->thread.debugreg[7]))
+ 		dr_dec_use_count(tsk->thread.debugreg[7]);
 }
 
 void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
+	if (unlikely(tsk->thread.debugreg[7]))
+		dr_dec_use_count(tsk->thread.debugreg[7]);
 	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
 	clear_tsk_thread_flag(tsk, TIF_DEBUG);
@@ -462,6 +468,9 @@ int copy_thread(int nr, unsigned long cl
 		desc->b = LDT_entry_b(&info);
 	}
 
+	if (unlikely(tsk->thread.debugreg[7]))
+		dr_inc_use_count(tsk->thread.debugreg[7]);
+
 	err = 0;
  out:
 	if (err && p->thread.io_bitmap_ptr) {
@@ -537,14 +546,22 @@ static noinline void __switch_to_xtra(st
 
 	next = &next_p->thread;
 
+	/*
+	 * Don't reload global debug registers. Don't touch the global debug
+	 * register settings in dr7.
+	 */
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg[0], 0);
-		set_debugreg(next->debugreg[1], 1);
-		set_debugreg(next->debugreg[2], 2);
-		set_debugreg(next->debugreg[3], 3);
+		if (!dr_is_global(0))
+			set_debugreg(next->debugreg[0], 0);
+		if (!dr_is_global(1))
+			set_debugreg(next->debugreg[1], 1);
+		if (!dr_is_global(2))
+			set_debugreg(next->debugreg[2], 2);
+		if (!dr_is_global(3))
+			set_debugreg(next->debugreg[3], 3);
 		/* no 4 and 5 */
 		set_debugreg(next->debugreg[6], 6);
-		set_debugreg(next->debugreg[7], 7);
+		set_process_dr7(next->debugreg[7]);
 	}
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
Index: usb-2.6/arch/i386/kernel/ptrace.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/ptrace.c
+++ usb-2.6/arch/i386/kernel/ptrace.c
@@ -412,6 +412,7 @@ long arch_ptrace(struct task_struct *chi
 			ret = putreg(child, addr, data);
 			break;
 		}
+
 		/* We need to be very careful here.  We implicitly
 		   want to modify a portion of the task_struct, and we
 		   have to be selective about what portions we allow someone
@@ -421,10 +422,18 @@ long arch_ptrace(struct task_struct *chi
 		if (addr >= (long) &dummy->u_debugreg[0] &&
 		    addr <= (long) &dummy->u_debugreg[7]) {
 
-			if (addr == (long) &dummy->u_debugreg[4]) break;
-			if (addr == (long) &dummy->u_debugreg[5]) break;
-			if (addr < (long) &dummy->u_debugreg[4] &&
-			    ((unsigned long) data) >= TASK_SIZE-3) break;
+			addr -= (long) &dummy->u_debugreg;
+			addr = addr >> 2;
+			if (addr < 4) {
+				if ((unsigned long) data >= TASK_SIZE-3)
+					break;
+				if (dr_is_global(addr)) {
+					ret = -EBUSY;
+					break;
+				}
+			}
+			else if (addr == 4 || addr == 5)
+				break;
 
 			/* Sanity-check data. Take one half-byte at once with
 			 * check = (val >> (16 + 4*i)) & 0xf. It contains the
@@ -456,18 +465,21 @@ long arch_ptrace(struct task_struct *chi
 			 * See the AMD manual no. 24593 (AMD64 System
 			 * Programming) */
 
-			if (addr == (long) &dummy->u_debugreg[7]) {
+			else if (addr == 7) {
 				data &= ~DR_CONTROL_RESERVED;
 				for (i = 0; i < 4; i++)
 					if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
 						goto out_tsk;
-				if (data)
+				i = enable_debugreg(child->thread.debugreg[7], data);
+				if (i < 0) {
+					ret = -EBUSY;
+					break;
+				}
+				if (i)
 					set_tsk_thread_flag(child, TIF_DEBUG);
 				else
 					clear_tsk_thread_flag(child, TIF_DEBUG);
 			}
-			addr -= (long) &dummy->u_debugreg;
-			addr = addr >> 2;
 			child->thread.debugreg[addr] = data;
 			ret = 0;
 		}
Index: usb-2.6/arch/i386/kernel/signal.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/signal.c
+++ usb-2.6/arch/i386/kernel/signal.c
@@ -25,6 +25,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
 #include "sigframe.h"
 
 #define DEBUG_SIG 0
@@ -594,7 +595,7 @@ static void fastcall do_signal(struct pt
 		 * inside the kernel.
 		 */
 		if (unlikely(current->thread.debugreg[7]))
-			set_debugreg(current->thread.debugreg[7], 7);
+			set_process_dr7(current->thread.debugreg[7]);
 
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
Index: usb-2.6/arch/i386/kernel/traps.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/traps.c
+++ usb-2.6/arch/i386/kernel/traps.c
@@ -808,6 +808,7 @@ fastcall void __kprobes do_debug(struct 
 	struct task_struct *tsk = current;
 
 	get_debugreg(condition, 6);
+	set_debugreg(0, 6);	/* DR6 is never cleared by the CPU */
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 					SIGTRAP) == NOTIFY_STOP)
@@ -849,7 +850,7 @@ fastcall void __kprobes do_debug(struct 
 	 * the signal is delivered.
 	 */
 clear_dr7:
-	set_debugreg(0, 7);
+	set_process_dr7(0);
 	return;
 
 debug_vm86:
Index: usb-2.6/include/asm-i386/debugreg.h
===================================================================
--- usb-2.6.orig/include/asm-i386/debugreg.h
+++ usb-2.6/include/asm-i386/debugreg.h
@@ -33,6 +33,7 @@
 
 #define DR_RW_EXECUTE (0x0)   /* Settings for the access types to trap on */
 #define DR_RW_WRITE (0x1)
+#define DR_RW_IO (0x2)
 #define DR_RW_READ (0x3)
 
 #define DR_LEN_1 (0x0) /* Settings for data length to trap on */
@@ -61,4 +62,63 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+#define DR_MAX	4
+#define DR_ANY	(DR_MAX + 1)
+
+/* global or local allocation requests */
+#define DR_ALLOC_GLOBAL		1
+#define DR_ALLOC_LOCAL		2
+
+#ifdef CONFIG_KWATCH
+
+/* Set the type, len and global flag in dr7 for a debug register */
+#define SET_DR7(dr, regnum, type, len)	do {		\
+		dr &= ~(0xf << (16 + (regnum) * 4));	\
+		dr |= (((((len) - 1) << 2) | (type)) <<	\
+				(16 + (regnum) * 4)) |	\
+			(0x2 << ((regnum) * 2));	\
+	} while (0)
+
+/* Disable a debug register by clearing the global/local flag in dr7 */
+#define RESET_DR7(dr, regnum)	dr &= ~(0x3 << ((regnum) * 2))
+
+extern int dr_alloc(int regnum, int flag);
+extern void dr_free(int regnum);
+extern void dr_inc_use_count(unsigned long mask);
+extern void dr_dec_use_count(unsigned long mask);
+extern int dr_is_global(int regnum);
+extern unsigned long dr7_global_mask;
+extern int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7);
+
+static inline void set_process_dr7(unsigned long new_dr7)
+{
+	unsigned long dr7;
+
+	get_debugreg(dr7, 7);
+	dr7 = (dr7 & dr7_global_mask) | (new_dr7 & ~dr7_global_mask);
+	set_debugreg(dr7, 7);
+}
+
+#else
+
+static inline void dr_inc_use_count(unsigned long mask)
+{
+}
+static inline void dr_dec_use_count(unsigned long mask)
+{
+}
+static inline int dr_is_global(int regnum)
+{
+	return 0;
+}
+static inline int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7)
+{
+	return (new_dr7 != 0);
+}
+static inline void set_process_dr7(unsigned long new_dr7)
+{
+	set_debugreg(new_dr7, 7);
+}
+
+#endif				/* CONFIG_KWATCH */
 #endif
Index: usb-2.6/arch/i386/kernel/kwatch.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/kwatch.c
@@ -0,0 +1,281 @@
+/*
+ *  Kernel Watchpoint interface.
+ *  arch/i386/kernel/kwatch.c
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
+ *		Kernel Watchpoint implementation.
+ * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> to
+ *		to make use of notifiers.
+ */
+#include <linux/kprobes.h>
+#include <linux/ptrace.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/kwatch.h>
+#include <asm/kdebug.h>
+#include <asm/debugreg.h>
+#include <asm/bitops.h>
+
+#define RF_MASK	0x00010000
+
+static struct kwatch kwatch_list[DR_MAX];
+static spinlock_t kwatch_lock = SPIN_LOCK_UNLOCKED;
+static unsigned long kwatch_in_progress;	/* currently being handled */
+
+struct dr_info {
+	int debugreg;
+	unsigned long addr;
+	int type;
+};
+
+static void write_dr(int debugreg, unsigned long addr)
+{
+	switch (debugreg) {
+		case 0:	set_debugreg(addr, 0);	break;
+		case 1:	set_debugreg(addr, 1);	break;
+		case 2:	set_debugreg(addr, 2);	break;
+		case 3:	set_debugreg(addr, 3);	break;
+		case 6:	set_debugreg(addr, 6);	break;
+		case 7:	set_debugreg(addr, 7);	break;
+	}
+}
+
+#define write_dr7(val)	set_debugreg((val), 7)
+#define read_dr7(val)	get_debugreg((val), 7)
+
+static void write_smp_dr(void *info)
+{
+	struct dr_info *dr = (struct dr_info *)info;
+
+	if (cpu_has_de && dr->type == DR_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_dr(dr->debugreg, dr->addr);
+}
+
+/* Update the debug register on all CPUs */
+static void sync_dr(int debugreg, unsigned long addr, int type)
+{
+	struct dr_info dr;
+	dr.debugreg = debugreg;
+	dr.addr = addr;
+	dr.type = type;
+	smp_call_function(write_smp_dr, &dr, 0, 0);
+}
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+int kwatch_handler(unsigned long condition, struct pt_regs *regs)
+{
+	unsigned int debugreg;
+	unsigned long addr;
+
+	/* Using the debug status register value, find the debug register
+	 * number and the address for which the trap occurred. */
+	if (condition & DR_TRAP0) {
+		debugreg = 0;
+		get_debugreg(addr, 0);
+	} else if (condition & DR_TRAP1) {
+		debugreg = 1;
+		get_debugreg(addr, 1);
+	} else if (condition & DR_TRAP2) {
+		debugreg = 2;
+		get_debugreg(addr, 2);
+	} else if (condition & DR_TRAP3) {
+		debugreg = 3;
+		get_debugreg(addr, 3);
+	} else
+		return 0;
+
+	/* We're in an interrupt, but this is clear and BUG()-safe. */
+	preempt_disable();
+
+	/* If we are recursing, we already hold the lock. */
+	if (kwatch_in_progress)
+		goto recursed;
+
+	set_bit(debugreg, &kwatch_in_progress);
+
+	spin_lock(&kwatch_lock);
+	if ((unsigned long) kwatch_list[debugreg].addr != addr)
+		goto out;
+
+	if (kwatch_list[debugreg].handler)
+		kwatch_list[debugreg].handler(&kwatch_list[debugreg], regs);
+
+	if (kwatch_list[debugreg].type == DR_TYPE_EXECUTE)
+		regs->eflags |= RF_MASK;
+      out:
+	clear_bit(debugreg, &kwatch_in_progress);
+	spin_unlock(&kwatch_lock);
+	preempt_enable_no_resched();
+	return 0;
+
+      recursed:
+	if (kwatch_list[debugreg].type == DR_TYPE_EXECUTE)
+		regs->eflags |= RF_MASK;
+	preempt_enable_no_resched();
+	return 1;
+}
+
+/**
+ * register_kwatch - register a hardware watchpoint
+ * @addr: address of the watchpoint
+ * @length: extent of the watchpoint (1, 2, or 4 bytes)
+ * @type: type of access to trap (read, write, I/O, or execute)
+ * @handler: callback routine to invoke when a trap occurs
+ *
+ * Allocates and returns a debug register and installs the requested
+ * watchpoint.
+ *
+ * @length must be 1, 2, or 4, and @type must be one of %DR_TYPE_RW
+ * (read or write), %DR_TYPE_WRITE (write only), %DR_TYPE_IO (I/O space
+ * access), or %DR_TYPE_EXECUTE.  Note that %DR_TYPE_IO is available only
+ * on processors with Debugging Extensions, and @length must be 1 for
+ * %DR_TYPE_EXECUTE.
+ *
+ * When a trap occurs, @handler is invoked in_interrupt with a pointer
+ * to a struct kwatch containing the watchpoint information and a pointer
+ * to the CPU register values at the time of the trap.  %DR_TYPE_EXECUTE
+ * traps occur before the watch-pointed instruction executes; all other
+ * types occur after the memory or I/O access has taken place.
+ *
+ * Returns a debug register number or a negative error code.
+ */
+int register_kwatch(void *addr, u8 length, u8 type, kwatch_handler_t handler)
+{
+	int debugreg;
+	unsigned long dr7, flags;
+
+	switch (length) {
+	case 1:
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+	switch (type) {
+	case DR_TYPE_WRITE:
+	case DR_TYPE_RW:
+		break;
+	case DR_TYPE_IO:
+		if (cpu_has_de)
+			break;
+		return -EINVAL;
+	case DR_TYPE_EXECUTE:
+		if (length == 1)
+			break;
+		/* FALL THROUGH */
+	default:
+		return -EINVAL;
+	}
+	if (!handler)
+		return -EINVAL;
+
+	debugreg = dr_alloc(DR_ANY, DR_ALLOC_GLOBAL);
+	if (debugreg < 0)
+		return -EBUSY;
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = addr;
+	kwatch_list[debugreg].length = length;
+	kwatch_list[debugreg].type = type;
+	kwatch_list[debugreg].handler = handler;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	if (type == DR_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_dr(debugreg, (unsigned long) addr);
+	sync_dr(debugreg, (unsigned long) addr, type);
+
+	read_dr7(dr7);
+	SET_DR7(dr7, debugreg, type, length);
+	write_dr7(dr7);
+	sync_dr(7, dr7, 0);
+	return debugreg;
+}
+
+/**
+ * unregister_kwatch - free a previously-allocated debugging watchpoint
+ * @debugreg: the debugging register to deallocate
+ *
+ * Removes a hardware watchpoint and deallocates the corresponding
+ * debugging register.  @debugreg must previously have been allocated
+ * by register_kwatch().
+ */
+void unregister_kwatch(int debugreg)
+{
+	unsigned long flags;
+	unsigned long dr7;
+
+	if (debugreg < 0 || debugreg >= DR_MAX ||
+			!kwatch_list[debugreg].handler)
+		return;
+
+	read_dr7(dr7);
+	RESET_DR7(dr7, debugreg);
+	write_dr7(dr7);
+	sync_dr(7, dr7, 0);
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = 0;
+	kwatch_list[debugreg].handler = NULL;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	dr_free(debugreg);
+}
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+int kwatch_exceptions_notify(struct notifier_block *self, unsigned long val,
+			     void *data)
+{
+	struct die_args *args = (struct die_args *)data;
+	switch (val) {
+	case DIE_DEBUG:
+		if (kwatch_handler(args->err, args->regs))
+			return NOTIFY_STOP;
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kwatch_exceptions_nb = {
+	.notifier_call = kwatch_exceptions_notify,
+	.priority = 0x7ffffffe	/* we need to notified second */
+};
+
+static int __init init_kwatch(void)
+{
+	int err = 0;
+
+	err = register_die_notifier(&kwatch_exceptions_nb);
+	return err;
+}
+
+__initcall(init_kwatch);
+
+EXPORT_SYMBOL_GPL(register_kwatch);
+EXPORT_SYMBOL_GPL(unregister_kwatch);
Index: usb-2.6/arch/i386/kernel/Makefile
===================================================================
--- usb-2.6.orig/arch/i386/kernel/Makefile
+++ usb-2.6/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_KWATCH)		+= debugreg.o kwatch.o
 
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
Index: usb-2.6/include/asm-i386/kwatch.h
===================================================================
--- /dev/null
+++ usb-2.6/include/asm-i386/kwatch.h
@@ -0,0 +1,60 @@
+#ifndef _ASM_KWATCH_H
+#define _ASM_KWATCH_H
+/*
+ *  Kernel Watchpoint interface.
+ *  include/asm-i386/kwatch.h
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ *
+ * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
+ *		Kernel Watchpoint implementation.
+ */
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kwatch;
+typedef void (*kwatch_handler_t) (struct kwatch *, struct pt_regs *);
+
+struct kwatch {
+	void *addr;		/* location of watchpoint */
+	u8 length;		/* range of address */
+	u8 type;		/* type of watchpoint */
+	kwatch_handler_t handler;
+};
+
+#define DR_TYPE_EXECUTE 	0x0	/* Watchpoint types */
+#define DR_TYPE_WRITE		0x1
+#define DR_TYPE_IO		0x2
+#define DR_TYPE_RW		0x3
+
+#ifdef CONFIG_KWATCH
+extern int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler);
+extern void unregister_kwatch(int debugreg);
+
+#else
+
+static inline int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kwatch(int debugreg)
+{
+}
+#endif
+#endif				/* _ASM_KWATCH_H */
Index: usb-2.6/arch/i386/Kconfig
===================================================================
--- usb-2.6.orig/arch/i386/Kconfig
+++ usb-2.6/arch/i386/Kconfig
@@ -1210,6 +1210,14 @@ config KPROBES
 	  a probepoint and specifies the callback.  Kprobes is useful
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
+
+config KWATCH
+	bool "Kwatch points (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Kwatch enables kernel-space data watchpoints using the processor's
+	  debug registers.  It can be very useful for kernel debugging.
+	  If in doubt, say "N".
 endmenu
 
 source "arch/i386/Kconfig.debug"


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-16 16:55 [PATCH] Kwatch: kernel watchpoints using CPU debug registers Alan Stern
@ 2007-01-16 23:35 ` Christoph Hellwig
  2007-01-17 16:33   ` Alan Stern
  2007-01-17  9:44 ` Ingo Molnar
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2007-01-16 23:35 UTC (permalink / raw)
  To: Alan Stern; +Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list

Fir4st I'd say thanks a lot for forward-porting this, it's really useful
feature for all kinds of nasty debugging.

I think you should split this into two patches, one for the debugreg
infrastructure, and one for the actual kwatch code.

Also I think you provide one (or even a few) example wathes for
trivial things, say updating i_ino for an inode given through debugfs.

Some comments on the code below:

> --- /dev/null
> +++ usb-2.6/arch/i386/kernel/debugreg.c
> @@ -0,0 +1,182 @@
> +/*
> + *  Debug register
> + *  arch/i386/kernel/debugreg.c

Please don't put in comments that mention the name of the containing
file.  Also the "Debug register" comments seems rather useless.

> + * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> and
> + *		Bharata Rao <bharata@in.ibm.com> to provide debug register
> + *		allocation mechanism.
> + * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> with
> + *		idr_allocations mechanism as suggested by Andi Kleen.

I think these kinds of comments aren't in fashion anymore either, all
changelogs should be in git commit messages and initial credits go
into the first commit message.

> +struct debugreg dr_list[DR_MAX];
> +static spinlock_t dr_lock = SPIN_LOCK_UNLOCKED;

I think you're supposed to use magic DEFINE_SPINLOCK macro these days.

> +unsigned long dr7_global_mask = DR_CONTROL_RESERVED | DR_GLOBAL_SLOWDOWN |
> +		DR_GLOBAL_ENABLE_MASK;

I'd rahter keep this static and make  set_process_dr7 a non-inline
function.

> +
> +static unsigned long dr7_global_reg_mask(unsigned int regnum)
> +{
> +	return (0xf << (16 + regnum * 4)) | (0x1 << (regnum * 2));
> +}
> +
> +static int get_dr(int regnum, int flag)
> +{
> +	if (flag == DR_ALLOC_GLOBAL && !dr_list[regnum].flag) {
> +		dr_list[regnum].flag = flag;
> +		dr7_global_mask |= dr7_global_reg_mask(regnum);
> +		return regnum;
> +	}
> +	if (flag == DR_ALLOC_LOCAL &&
> +			dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
> +		dr_list[regnum].flag = flag;
> +		dr_list[regnum].use_count++;
> +		return regnum;
> +	}
> +	return -1;

This looks rather poorly structured, as the function does compltely
different things depending on the flags passed in.

> +static void free_dr(int regnum)
> +{
> +	if (dr_list[regnum].flag == DR_ALLOC_LOCAL) {
> +		if (!--dr_list[regnum].use_count)
> +			dr_list[regnum].flag = 0;
> +	} else {
> +		dr_list[regnum].flag = 0;
> +		dr_list[regnum].use_count = 0;
> +		dr7_global_mask &= ~(dr7_global_reg_mask(regnum));
> +	}
> +}

Same here.

> +int dr_alloc(int regnum, int flag)
> +{
> +	int ret = -1;
> +
> +	spin_lock(&dr_lock);
> +	if (regnum >= 0 && regnum < DR_MAX)
> +		ret = get_dr(regnum, flag);
> +	else if (regnum == DR_ANY) {
> +
> +		/* gdb allocates local debug registers starting from 0.
> +		 * To help avoid conflicts, we'll start from the other end.
> +		 */
> +		for (regnum = DR_MAX - 1; regnum >= 0; --regnum) {
> +			ret = get_dr(regnum, flag);
> +			if (ret >= 0)
> +				break;
> +		}
> +	} else
> +		printk(KERN_ERR "dr_alloc: "
> +				"Cannot allocate debug register %d\n", regnum);
> +	spin_unlock(&dr_lock);
> +	return ret;

I suspect this should be replaced wit ha global and local variant
to fix the above mentioned issue.  It's a tiny bit duplicated code,
but seems much cleaner.

> +static int get_dr(int regnum, int flag)
> +{
> +	if (flag == DR_ALLOC_GLOBAL && !dr_list[regnum].flag) {
> +		dr_list[regnum].flag = flag;
> +		dr7_global_mask |= dr7_global_reg_mask(regnum);
> +		return regnum;
> +	}
> +	if (flag == DR_ALLOC_LOCAL &&
> +			dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
> +		dr_list[regnum].flag = flag;
> +		dr_list[regnum].use_count++;
> +		return regnum;
> +	}
> +	return -1;

Same comments about global vs local here.

> +
> +EXPORT_SYMBOL(dr_alloc);
> +EXPORT_SYMBOL(dr_free);

I don't think we want these exported at all, and if a proper modular
user shows up they should be _GPL as they're fairly lowlevel.

Btw, the naming in the whole debugregs code should be consolidated to
be debugreg_ instead of all kinds of different variants.

> +#ifdef CONFIG_KWATCH
> +
> +/* Set the type, len and global flag in dr7 for a debug register */
> +#define SET_DR7(dr, regnum, type, len)	do {		\
> +		dr &= ~(0xf << (16 + (regnum) * 4));	\
> +		dr |= (((((len) - 1) << 2) | (type)) <<	\
> +				(16 + (regnum) * 4)) |	\
> +			(0x2 << ((regnum) * 2));	\
> +	} while (0)
> +
> +/* Disable a debug register by clearing the global/local flag in dr7 */
> +#define RESET_DR7(dr, regnum)	dr &= ~(0x3 << ((regnum) * 2))

I don't think there's any point in making these macros conditional.
Then again is there a good reason to mke these macros?

> + *  Kernel Watchpoint interface.
> + *  arch/i386/kernel/kwatch.c
> + *
> + *
> + * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
> + *		Kernel Watchpoint implementation.
> + * 2004-Oct	Updated by Prasanna S Panchamukhi <prasanna@in.ibm.com> to
> + *		to make use of notifiers.
> + */

Same comments about these comments applies as in debugreg.c

> +#include <linux/kprobes.h>
> +#include <linux/ptrace.h>
> +#include <linux/spinlock.h>
> +#include <linux/module.h>
> +#include <linux/init.h>
> +#include <asm/kwatch.h>
> +#include <asm/kdebug.h>
> +#include <asm/debugreg.h>
> +#include <asm/bitops.h>

I think this should be linux/bitops.h these days.

> +
> +#define RF_MASK	0x00010000
> +
> +static struct kwatch kwatch_list[DR_MAX];
> +static spinlock_t kwatch_lock = SPIN_LOCK_UNLOCKED;



> +static unsigned long kwatch_in_progress;	/* currently being handled */

Give that this is a bitmap the comment is rather misleading, it should
probably be:

/*
 * Bitmap of registers beeing handled.
 */

> +static void write_dr(int debugreg, unsigned long addr)
> +{
> +	switch (debugreg) {
> +		case 0:	set_debugreg(addr, 0);	break;
> +		case 1:	set_debugreg(addr, 1);	break;
> +		case 2:	set_debugreg(addr, 2);	break;
> +		case 3:	set_debugreg(addr, 3);	break;
> +		case 6:	set_debugreg(addr, 6);	break;
> +		case 7:	set_debugreg(addr, 7);	break;
> +	}
> +}

What's the point of this wrapper?

> +
> +#define write_dr7(val)	set_debugreg((val), 7)
> +#define read_dr7(val)	get_debugreg((val), 7)

And these?

> +	if (kwatch_in_progress)
> +		goto recursed;
> +

I don't think there's any point in this goto, just handle it inside
the if block

> +	set_bit(debugreg, &kwatch_in_progress);
> +
> +	spin_lock(&kwatch_lock);
> +	if ((unsigned long) kwatch_list[debugreg].addr != addr)
> +		goto out;
> +
> +	if (kwatch_list[debugreg].handler)
> +		kwatch_list[debugreg].handler(&kwatch_list[debugreg], regs);
> +
> +	if (kwatch_list[debugreg].type == DR_TYPE_EXECUTE)
> +		regs->eflags |= RF_MASK;
> +      out:

Again, I think the goto here could be avoided and actually make the code
cleanere.  Also a local variable for kwatch_list[debugreg] with a short
would probably make this section of code a lot more readable.

> +
> +static int __init init_kwatch(void)
> +{
> +	int err = 0;
> +
> +	err = register_die_notifier(&kwatch_exceptions_nb);
> +	return err;
> +}

Just remove the err local variable here.

> +EXPORT_SYMBOL_GPL(register_kwatch);
> +EXPORT_SYMBOL_GPL(unregister_kwatch);

Please move these exports close to the actual function definition.

> --- /dev/null
> +++ usb-2.6/include/asm-i386/kwatch.h
> @@ -0,0 +1,60 @@
> +#ifndef _ASM_KWATCH_H
> +#define _ASM_KWATCH_H
> +/*
> + *  Kernel Watchpoint interface.
> + *  include/asm-i386/kwatch.h

> + * 2002-Oct	Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> for
> + *		Kernel Watchpoint implementation.
> + */

Same comments once again.

> +#include <linux/types.h>
> +#include <linux/ptrace.h>
> +
> +struct kwatch;
> +typedef void (*kwatch_handler_t) (struct kwatch *, struct pt_regs *);
> +
> +struct kwatch {
> +	void *addr;		/* location of watchpoint */
> +	u8 length;		/* range of address */
> +	u8 type;		/* type of watchpoint */
> +	kwatch_handler_t handler;
> +};
> +
> +#define DR_TYPE_EXECUTE 	0x0	/* Watchpoint types */
> +#define DR_TYPE_WRITE		0x1
> +#define DR_TYPE_IO		0x2
> +#define DR_TYPE_RW		0x3

I think large parts of this header should go into a new linux/kwatch.h
so that generic code can use kwatches.

> +config KWATCH
> +	bool "Kwatch points (EXPERIMENTAL)"
> +	depends on EXPERIMENTAL
> +	help
> +	  Kwatch enables kernel-space data watchpoints using the processor's
> +	  debug registers.  It can be very useful for kernel debugging.
> +	  If in doubt, say "N".

I think we want different options for debugregs and kwatch.  The debugreg
one probably doesn't have to be actually user-visible, though.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-16 16:55 [PATCH] Kwatch: kernel watchpoints using CPU debug registers Alan Stern
  2007-01-16 23:35 ` Christoph Hellwig
@ 2007-01-17  9:44 ` Ingo Molnar
  2007-01-17 16:17   ` Alan Stern
  1 sibling, 1 reply; 11+ messages in thread
From: Ingo Molnar @ 2007-01-17  9:44 UTC (permalink / raw)
  To: Alan Stern
  Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list,
	Roland McGrath


* Alan Stern <stern@rowland.harvard.edu> wrote:

> From: Alan Stern <stern@rowland.harvard.edu>
> 
> This patch (as839) implements the Kwatch (kernel-space hardware-based 
> watchpoints) API for the i386 architecture.  The API is explained in 
> the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c.

i think it would be nice to have this ontop of Roland's utrace 
infrastructure, which nicely modularizes all hardware debugging 
capabilities and detaches it from ptrace.

	Ingo

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-17  9:44 ` Ingo Molnar
@ 2007-01-17 16:17   ` Alan Stern
  2007-01-18  0:12     ` Christoph Hellwig
  2007-02-06  4:25     ` [PATCH] " Roland McGrath
  0 siblings, 2 replies; 11+ messages in thread
From: Alan Stern @ 2007-01-17 16:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list,
	Roland McGrath

On Wed, 17 Jan 2007, Ingo Molnar wrote:

> * Alan Stern <stern@rowland.harvard.edu> wrote:
> 
> > From: Alan Stern <stern@rowland.harvard.edu>
> > 
> > This patch (as839) implements the Kwatch (kernel-space hardware-based 
> > watchpoints) API for the i386 architecture.  The API is explained in 
> > the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c.
> 
> i think it would be nice to have this ontop of Roland's utrace 
> infrastructure, which nicely modularizes all hardware debugging 
> capabilities and detaches it from ptrace.

I'll be happy to move this over to the utrace setting, once it is merged.  
Do you think it would be better to include the current version of kwatch 
now or to wait for utrace?

Roland, is there a schedule for when you plan to get utrace into -mm?

Alan Stern


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-16 23:35 ` Christoph Hellwig
@ 2007-01-17 16:33   ` Alan Stern
  0 siblings, 0 replies; 11+ messages in thread
From: Alan Stern @ 2007-01-17 16:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Andrew Morton, Prasanna S Panchamukhi, Kernel development list

On Tue, 16 Jan 2007, Christoph Hellwig wrote:

> Fir4st I'd say thanks a lot for forward-porting this, it's really useful
> feature for all kinds of nasty debugging.
> 
> I think you should split this into two patches, one for the debugreg
> infrastructure, and one for the actual kwatch code.
> 
> Also I think you provide one (or even a few) example wathes for
> trivial things, say updating i_ino for an inode given through debugfs.
> 
> Some comments on the code below:

Many thanks for your detailed comments and suggestions.  It probably was
obvious that most of the things you picked up on were inherited from the
original Kwatch patch.  I'll update my patch in accordance with your
suggestions.

Responses to just a couple of the comments:

> I suspect this should be replaced wit ha global and local variant
> to fix the above mentioned issue.  It's a tiny bit duplicated code,
> but seems much cleaner.

It would indeed be cleaner.  And in fact the local variant would have a
large amount of dead code, which could be left out entirely (at least from
the initial version).  That's because the only current user of local debug 
register allocations is ptrace.

> > +static void write_dr(int debugreg, unsigned long addr)
> > +{
> > +	switch (debugreg) {
> > +		case 0:	set_debugreg(addr, 0);	break;
> > +		case 1:	set_debugreg(addr, 1);	break;
> > +		case 2:	set_debugreg(addr, 2);	break;
> > +		case 3:	set_debugreg(addr, 3);	break;
> > +		case 6:	set_debugreg(addr, 6);	break;
> > +		case 7:	set_debugreg(addr, 7);	break;
> > +	}
> > +}
> 
> What's the point of this wrapper?

It is called from two different places, and it's better than including
the "switch" in each place.

> I think large parts of this header should go into a new linux/kwatch.h
> so that generic code can use kwatches.

In the long run that may well be true.  For now, I'm a little hesitant to
put something which works only on i386 under include/linux.

> > +config KWATCH
> > +	bool "Kwatch points (EXPERIMENTAL)"
> > +	depends on EXPERIMENTAL
> > +	help
> > +	  Kwatch enables kernel-space data watchpoints using the processor's
> > +	  debug registers.  It can be very useful for kernel debugging.
> > +	  If in doubt, say "N".
> 
> I think we want different options for debugregs and kwatch.  The debugreg
> one probably doesn't have to be actually user-visible, though.

It's easier to start out like this and then change it later when someone
comes up with another use for debugregs.  Or perhaps by then the whole
thing will have been moved over to utrace, making the issue academic.

Alan Stern


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-17 16:17   ` Alan Stern
@ 2007-01-18  0:12     ` Christoph Hellwig
  2007-01-18  7:31       ` Ingo Molnar
  2007-02-06  4:25     ` [PATCH] " Roland McGrath
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2007-01-18  0:12 UTC (permalink / raw)
  To: Alan Stern
  Cc: Ingo Molnar, Andrew Morton, Prasanna S Panchamukhi,
	Kernel development list, Roland McGrath

On Wed, Jan 17, 2007 at 11:17:37AM -0500, Alan Stern wrote:
> I'll be happy to move this over to the utrace setting, once it is merged.  
> Do you think it would be better to include the current version of kwatch 
> now or to wait for utrace?
> 
> Roland, is there a schedule for when you plan to get utrace into -mm?

Even if it goes into mainline soon we'll need a lot of time for all
architectures to catch up, so I think kwatch should definitely comes first.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18  0:12     ` Christoph Hellwig
@ 2007-01-18  7:31       ` Ingo Molnar
  2007-01-18 15:37         ` Alan Stern
  2007-01-18 22:33         ` Christoph Hellwig
  0 siblings, 2 replies; 11+ messages in thread
From: Ingo Molnar @ 2007-01-18  7:31 UTC (permalink / raw)
  To: Christoph Hellwig, Alan Stern, Andrew Morton,
	Prasanna S Panchamukhi, Kernel development list, Roland McGrath


* Christoph Hellwig <hch@infradead.org> wrote:

> > I'll be happy to move this over to the utrace setting, once it is 
> > merged.  Do you think it would be better to include the current 
> > version of kwatch now or to wait for utrace?
> > 
> > Roland, is there a schedule for when you plan to get utrace into 
> > -mm?
> 
> Even if it goes into mainline soon we'll need a lot of time for all 
> architectures to catch up, so I think kwatch should definitely comes 
> first.

i disagree. Utrace is a once-in-a-lifetime opportunity to clean up the 
/huge/ ptrace mess. Ptrace has been a very large PITA, for many, many 
years, precisely because it was done in the 'oh, lets get this feature 
added first, think about it later' manner. Roland's work is a large 
logistical undertaking and we should not make it more complex than it 
is. Once it's in we can add debugging features ontop of that. To me work 
that cleans up existing mess takes precedence before work that adds to 
the mess.

	Ingo

ps. please fix your mailer to not emit those silly Mail-Followup-To 
headers! It collapses To: and Cc: lines into one huge unnecessary To: 
line.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18  7:31       ` Ingo Molnar
@ 2007-01-18 15:37         ` Alan Stern
  2007-01-18 22:33         ` Christoph Hellwig
  1 sibling, 0 replies; 11+ messages in thread
From: Alan Stern @ 2007-01-18 15:37 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Andrew Morton, Prasanna S Panchamukhi,
	Kernel development list, Roland McGrath

On Thu, 18 Jan 2007, Ingo Molnar wrote:

> 
> * Christoph Hellwig <hch@infradead.org> wrote:
> 
> > > I'll be happy to move this over to the utrace setting, once it is 
> > > merged.  Do you think it would be better to include the current 
> > > version of kwatch now or to wait for utrace?
> > > 
> > > Roland, is there a schedule for when you plan to get utrace into 
> > > -mm?
> > 
> > Even if it goes into mainline soon we'll need a lot of time for all 
> > architectures to catch up, so I think kwatch should definitely comes 
> > first.
> 
> i disagree. Utrace is a once-in-a-lifetime opportunity to clean up the 
> /huge/ ptrace mess. Ptrace has been a very large PITA, for many, many 
> years, precisely because it was done in the 'oh, lets get this feature 
> added first, think about it later' manner. Roland's work is a large 
> logistical undertaking and we should not make it more complex than it 
> is. Once it's in we can add debugging features ontop of that. To me work 
> that cleans up existing mess takes precedence before work that adds to 
> the mess.

Interestingly, the current version of utrace makes no special provision
for watchpoints, either in kernel or user space.  Instead it relies on the
legacy ptrace mechanism for setting debug registers in the target
process's user area.  Perhaps an explicit watchpoint implementation should
be added to utrace, but that's beyond the scope of this discussion.

Furthermore, utrace is explicitly intended for tracing user programs, not
for tracing the kernel.  Kwatch, however, is just the opposite: It is
intended for setting up watchpoints in kernel space.  In that sense it is
pretty much orthogonal to utrace.  Although it would affect the utrace
patches, the changes would be basically transparent (i.e., move the new
code from one ptrace handler to another instead of moving the old code).

If Kwatch is to be subsumed anywhere, I think it should be under the
Kprobes/Systemtap project.  Again, that's a separate question -- so far 
they have avoided data watchpoints.

Alan Stern


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18  7:31       ` Ingo Molnar
  2007-01-18 15:37         ` Alan Stern
@ 2007-01-18 22:33         ` Christoph Hellwig
  2007-01-22 16:54           ` [PATCH - revised] " Alan Stern
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2007-01-18 22:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Hellwig, Alan Stern, Andrew Morton,
	Prasanna S Panchamukhi, Kernel development list, Roland McGrath

On Thu, Jan 18, 2007 at 08:31:59AM +0100, Ingo Molnar wrote:
> 
> * Christoph Hellwig <hch@infradead.org> wrote:
> 
> > > I'll be happy to move this over to the utrace setting, once it is 
> > > merged.  Do you think it would be better to include the current 
> > > version of kwatch now or to wait for utrace?
> > > 
> > > Roland, is there a schedule for when you plan to get utrace into 
> > > -mm?
> > 
> > Even if it goes into mainline soon we'll need a lot of time for all 
> > architectures to catch up, so I think kwatch should definitely comes 
> > first.
> 
> i disagree. Utrace is a once-in-a-lifetime opportunity to clean up the 
> /huge/ ptrace mess. Ptrace has been a very large PITA, for many, many 
> years, precisely because it was done in the 'oh, lets get this feature 
> added first, think about it later' manner. Roland's work is a large 
> logistical undertaking and we should not make it more complex than it 
> is. Once it's in we can add debugging features ontop of that. To me work 
> that cleans up existing mess takes precedence before work that adds to 
> the mess.

Utrace doesn't provide any kind of watchpoint infrastructure now, and
utrace will take a lot of time to get ready for inclusion, mostly because
it really needs asll the arch maintainers to help out (and various not
so easy core fixes aswell).

I'm all for merging utrace, and I wish we'd be much further into the
merging process already, but blocking mostly unrelated functionality for
it is more than dumb.


> ps. please fix your mailer to not emit those silly Mail-Followup-To 
> headers! It collapses To: and Cc: lines into one huge unnecessary To: 
> line.

This header is absolutely intentation as far too many folks seem to randomly
drop to or cc lines on mailing lists.  and of course it's alsmost esential
on lists with braindead reply to list policies (e.g. Debian)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH - revised] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-18 22:33         ` Christoph Hellwig
@ 2007-01-22 16:54           ` Alan Stern
  0 siblings, 0 replies; 11+ messages in thread
From: Alan Stern @ 2007-01-22 16:54 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Hellwig, Ingo Molnar, Prasanna S Panchamukhi,
	Kernel development list, Roland McGrath

From: Alan Stern <stern@rowland.harvard.edu>

This patch (as839b) implements the Kwatch (kernel-space hardware-based
watchpoints) API for the i386 architecture.  The API is explained in
the kerneldoc for register_kwatch() in arch/i386/kernel/kwatch.c, and
there is demonstration code in include/asm-i386/kwatch.h.

The original version of the patch was written by Vamsi Krishna S and
Bharata B Rao in 2002.  It was updated in 2004 by Prasanna S Panchamukhi
for 2.6.13 and then again (here) by me for 2.6.20.

Signed-off-by: Prasanna S Panchamukhi <prasanna@in.ibm.com>
Signed-off-by: Alan Stern <stern@rowland.harvard.edu>

---

This version of the patch has been revised along the lines suggested by
Christoph.  Note that it is meant to apply on top of the i386-ptrace
whitespace cleanup patch I submitted last week.

The impact of kwatch upon utrace is minimal.  Kwatch is intended for 
adding hardware watchpoints in the kernel, whereas utrace is intended for 
general debugging (not including watchpoints at the moment, oddly enough) 
of user processes.  The two don't really overlap, and adding kwatch should 
not result in any significant delay in adding utrace.

Alan Stern


Index: usb-2.6/arch/i386/kernel/debugreg.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/debugreg.c
@@ -0,0 +1,269 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) 2007 Alan Stern
+ */
+
+/*
+ * These routines provide a debug register allocation mechanism.
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <asm/system.h>
+#include <asm/debugreg.h>
+
+struct debugreg {
+	int flag;
+	int use_count;
+};
+
+/* flag values */
+#define DR_ALLOC_NONE		0
+#define DR_ALLOC_GLOBAL		1
+#define DR_ALLOC_LOCAL		2
+
+static struct debugreg dr_list[DR_MAX];
+static DEFINE_SPINLOCK(dr_lock);
+static unsigned long dr7_global_mask = DR_CONTROL_RESERVED |
+		DR_GLOBAL_SLOWDOWN | DR_GLOBAL_ENABLE_MASK;
+
+/*
+ * Set the process's debug register 7 value: Keep all existing global
+ * bit settings and install the process's local bit settings.
+ */
+void set_process_debugreg7(unsigned long new_dr7)
+{
+	unsigned long dr7;
+
+	get_debugreg(dr7, 7);
+	dr7 = (dr7 & dr7_global_mask) | (new_dr7 & ~dr7_global_mask);
+	set_debugreg(dr7, 7);
+}
+
+/**
+ * debugreg7_clear_bits - clear the type, len, and global-enable flag bits
+ * @old_dr7: original value for debug register 7
+ * @regnum: number of the debug register to disable
+ *
+ * @regnum must lie in the range 0 - 3.
+ *
+ * Returns the new value for debug register 7, with the appropriate bits
+ * cleared to disable the specified debug register.
+ */
+unsigned long debugreg7_clear_bits(unsigned long old_dr7, int regnum)
+{
+	unsigned long new_dr7;
+
+	new_dr7 = old_dr7 & ~(0xf <<
+			(DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE));
+	new_dr7 &= ~(0x2 << (regnum * DR_ENABLE_SIZE));
+	return new_dr7;
+}
+
+/**
+ * debugreg7_set_bits - set the type, len, and global-enable flag bits
+ * @old_dr7: original value for debug register 7
+ * @regnum: number of the debug register to enable
+ * @type: type of debugging watchpoint
+ * @len: length of the debugging watchpoint
+ *
+ * @regnum must lie in the range DR_FIRST_ADDR - DR_LAST_ADDR (0 - 3).
+ * @type must lie in the range DR_WR_EXECUTE - DR_RW_READ (0 - 3).
+ * @len must be 1, 2, or 4.
+ *
+ * Returns the new value for debug register 7, with the appropriate bits
+ * set to enable a new watchpoint for the specified debug register.
+ */
+unsigned long debugreg7_set_bits(unsigned long old_dr7, int regnum,
+		u8 type, u8 len)
+{
+	unsigned long new_dr7;
+
+	--len;
+	new_dr7 = old_dr7 & ~(0xf <<
+			(DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE));
+	new_dr7 |= (((len << 2) | type) <<
+			(DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE));
+	new_dr7 |= (0x2 << (regnum * DR_ENABLE_SIZE));
+	return new_dr7;
+}
+
+static unsigned long dr7_global_reg_mask(unsigned int regnum)
+{
+	return (0xf << (DR_CONTROL_SHIFT + regnum * DR_CONTROL_SIZE)) |
+			(0x1 << (regnum * DR_ENABLE_SIZE));
+}
+
+static int get_global_dr(int regnum)
+{
+	if (dr_list[regnum].flag == DR_ALLOC_NONE) {
+		dr_list[regnum].flag = DR_ALLOC_GLOBAL;
+		dr7_global_mask |= dr7_global_reg_mask(regnum);
+		return regnum;
+	}
+	return -1;
+}
+
+static void free_global_dr(int regnum)
+{
+	dr_list[regnum].flag = DR_ALLOC_NONE;
+	dr_list[regnum].use_count = 0;
+	dr7_global_mask &= ~dr7_global_reg_mask(regnum);
+}
+
+static int get_local_dr(int regnum)
+{
+	if (dr_list[regnum].flag != DR_ALLOC_GLOBAL) {
+		dr_list[regnum].flag = DR_ALLOC_LOCAL;
+		dr_list[regnum].use_count++;
+		return regnum;
+	}
+	return -1;
+}
+
+static void free_local_dr(int regnum)
+{
+	if (dr_list[regnum].flag == DR_ALLOC_LOCAL) {
+		if (!--dr_list[regnum].use_count)
+			dr_list[regnum].flag = DR_ALLOC_NONE;
+	}
+}
+
+/**
+ * debugreg_global_alloc - allocate a debug register for global use
+ * @regnum: register number to allocate or %DR_ANY
+ *
+ * Returns -1 if @regnum is already allocated, otherwise returns
+ * @regnum and does a global allocation.
+ */
+int debugreg_global_alloc(int regnum)
+{
+	int ret = -1;
+
+	spin_lock(&dr_lock);
+	if (regnum >= 0 && regnum < DR_MAX)
+		ret = get_global_dr(regnum);
+	else if (regnum == DR_ANY) {
+
+		/*
+		 * gdb allocates local debug registers starting from 0.
+		 * To help avoid conflicts, we'll start from the other end.
+		 */
+		for (regnum = DR_MAX - 1; regnum >= 0; --regnum) {
+			ret = get_global_dr(regnum);
+			if (ret >= 0)
+				break;
+		}
+	} else
+		printk(KERN_ERR "%s: Cannot allocate debug register %d\n",
+				__FUNCTION__,  regnum);
+	spin_unlock(&dr_lock);
+	return ret;
+}
+
+/**
+ * debugreg_global_free - release a global debug register allocation
+ * @regnum: the number of the register to deallocate
+ *
+ * @regnum must previously have been allocated by debugreg_global_alloc().
+ */
+void debugreg_global_free(int regnum)
+{
+	spin_lock(&dr_lock);
+	if (regnum < 0 || regnum >= DR_MAX ||
+			dr_list[regnum].flag != DR_ALLOC_GLOBAL)
+		printk(KERN_ERR "%s: Cannot free debug register %d\n",
+				__FUNCTION__,  regnum);
+	else
+		free_global_dr(regnum);
+	spin_unlock(&dr_lock);
+}
+
+/*
+ * Increment the usage counts for locally-enabled debug registers.
+ * Must be called when a process using debug registers forks or is cloned.
+ */
+void debugreg_inc_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			dr_list[i].use_count++;
+	}
+	spin_unlock(&dr_lock);
+}
+
+/*
+ * Decrement the usage counts for locally-enabled debug registers.
+ * Must be called when a process using debug registers exits or execs.
+ */
+void debugreg_dec_use_count(unsigned long mask)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	spin_lock(&dr_lock);
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if (mask & dr_local_enable)
+			free_local_dr(i);
+	}
+	spin_unlock(&dr_lock);
+}
+
+/* Report whether a debug register is globally allocated. */
+int debugreg_is_global(int regnum)
+{
+	return (dr_list[regnum].flag == DR_ALLOC_GLOBAL);
+}
+
+/*
+ * This routine decides if a ptrace request is for enabling or disabling
+ * a debug reg, and accordingly calls dr_alloc() or dr_free().
+ *
+ * gdb uses ptrace to write to debug registers.  It assumes that writing to
+ * a debug register always succeds and it doesn't check the return value of
+ * ptrace.  Now with this new global debug register allocation/freeing,
+ * ptrace request for a local debug register will fail if the required debug
+ * register is already globally allocated.  Since gdb doesn't notice this
+ * failure, it sometimes tries to free a debug register which it does not
+ * own.
+ *
+ * Returns -1 if the ptrace request tries to locally allocate a debug register
+ * that is already globally allocated.  Otherwise returns >0 or 0 according
+ * as any debug registers are or are not locally allocated in the new setting.
+ */
+int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7)
+{
+	int i;
+	int dr_local_enable = 1 << DR_LOCAL_ENABLE_SHIFT;
+
+	if (new_dr7 & DR_LOCAL_ENABLE_MASK & dr7_global_mask)
+		return -1;
+	for (i = 0; i < DR_MAX; (++i, dr_local_enable <<= DR_ENABLE_SIZE)) {
+		if ((old_dr7 ^ new_dr7) & dr_local_enable) {
+			if (new_dr7 & dr_local_enable)
+				get_local_dr(i);
+			else
+				free_local_dr(i);
+		}
+	}
+	return new_dr7 & DR_LOCAL_ENABLE_MASK;
+}
Index: usb-2.6/arch/i386/kernel/process.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/process.c
+++ usb-2.6/arch/i386/kernel/process.c
@@ -51,6 +51,7 @@
 #ifdef CONFIG_MATH_EMULATION
 #include <asm/math_emu.h>
 #endif
+#include <asm/debugreg.h>
 
 #include <linux/err.h>
 
@@ -356,9 +357,10 @@ EXPORT_SYMBOL(kernel_thread);
  */
 void exit_thread(void)
 {
+	struct task_struct *tsk = current;
+
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(test_thread_flag(TIF_IO_BITMAP))) {
-		struct task_struct *tsk = current;
 		struct thread_struct *t = &tsk->thread;
 		int cpu = get_cpu();
 		struct tss_struct *tss = &per_cpu(init_tss, cpu);
@@ -376,15 +378,20 @@ void exit_thread(void)
 		tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
 		put_cpu();
 	}
+	if (unlikely(test_thread_flag(TIF_DEBUG)))
+ 		debugreg_dec_use_count(tsk->thread.debugreg[7]);
 }
 
 void flush_thread(void)
 {
 	struct task_struct *tsk = current;
 
+	if (unlikely(test_thread_flag(TIF_DEBUG))) {
+		debugreg_dec_use_count(tsk->thread.debugreg[7]);
+		clear_tsk_thread_flag(tsk, TIF_DEBUG);
+	}
 	memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));	
-	clear_tsk_thread_flag(tsk, TIF_DEBUG);
 	/*
 	 * Forget coprocessor state..
 	 */
@@ -462,6 +469,9 @@ int copy_thread(int nr, unsigned long cl
 		desc->b = LDT_entry_b(&info);
 	}
 
+	if (unlikely(test_thread_flag(TIF_DEBUG)))
+		debugreg_inc_use_count(tsk->thread.debugreg[7]);
+
 	err = 0;
  out:
 	if (err && p->thread.io_bitmap_ptr) {
@@ -537,14 +547,22 @@ static noinline void __switch_to_xtra(st
 
 	next = &next_p->thread;
 
+	/*
+	 * Don't reload global debug registers. Don't touch the global debug
+	 * register settings in dr7.
+	 */
 	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
-		set_debugreg(next->debugreg[0], 0);
-		set_debugreg(next->debugreg[1], 1);
-		set_debugreg(next->debugreg[2], 2);
-		set_debugreg(next->debugreg[3], 3);
+		if (!debugreg_is_global(0))
+			set_debugreg(next->debugreg[0], 0);
+		if (!debugreg_is_global(1))
+			set_debugreg(next->debugreg[1], 1);
+		if (!debugreg_is_global(2))
+			set_debugreg(next->debugreg[2], 2);
+		if (!debugreg_is_global(3))
+			set_debugreg(next->debugreg[3], 3);
 		/* no 4 and 5 */
 		set_debugreg(next->debugreg[6], 6);
-		set_debugreg(next->debugreg[7], 7);
+		set_process_debugreg7(next->debugreg[7]);
 	}
 
 	if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
Index: usb-2.6/arch/i386/kernel/ptrace.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/ptrace.c
+++ usb-2.6/arch/i386/kernel/ptrace.c
@@ -412,6 +412,7 @@ long arch_ptrace(struct task_struct *chi
 			ret = putreg(child, addr, data);
 			break;
 		}
+
 		/* We need to be very careful here.  We implicitly
 		   want to modify a portion of the task_struct, and we
 		   have to be selective about what portions we allow someone
@@ -421,10 +422,18 @@ long arch_ptrace(struct task_struct *chi
 		if (addr >= (long) &dummy->u_debugreg[0] &&
 		    addr <= (long) &dummy->u_debugreg[7]) {
 
-			if (addr == (long) &dummy->u_debugreg[4]) break;
-			if (addr == (long) &dummy->u_debugreg[5]) break;
-			if (addr < (long) &dummy->u_debugreg[4] &&
-			    ((unsigned long) data) >= TASK_SIZE-3) break;
+			addr -= (long) &dummy->u_debugreg;
+			addr = addr >> 2;
+			if (addr < 4) {
+				if ((unsigned long) data >= TASK_SIZE-3)
+					break;
+				if (debugreg_is_global(addr)) {
+					ret = -EBUSY;
+					break;
+				}
+			}
+			else if (addr == 4 || addr == 5)
+				break;
 
 			/* Sanity-check data. Take one half-byte at once with
 			 * check = (val >> (16 + 4*i)) & 0xf. It contains the
@@ -456,18 +465,21 @@ long arch_ptrace(struct task_struct *chi
 			 * See the AMD manual no. 24593 (AMD64 System
 			 * Programming) */
 
-			if (addr == (long) &dummy->u_debugreg[7]) {
+			else if (addr == 7) {
 				data &= ~DR_CONTROL_RESERVED;
 				for (i = 0; i < 4; i++)
 					if ((0x5f54 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
 						goto out_tsk;
-				if (data)
+				i = enable_debugreg(child->thread.debugreg[7], data);
+				if (i < 0) {
+					ret = -EBUSY;
+					break;
+				}
+				if (i)
 					set_tsk_thread_flag(child, TIF_DEBUG);
 				else
 					clear_tsk_thread_flag(child, TIF_DEBUG);
 			}
-			addr -= (long) &dummy->u_debugreg;
-			addr = addr >> 2;
 			child->thread.debugreg[addr] = data;
 			ret = 0;
 		}
Index: usb-2.6/arch/i386/kernel/signal.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/signal.c
+++ usb-2.6/arch/i386/kernel/signal.c
@@ -25,6 +25,7 @@
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/debugreg.h>
 #include "sigframe.h"
 
 #define DEBUG_SIG 0
@@ -594,7 +595,7 @@ static void fastcall do_signal(struct pt
 		 * inside the kernel.
 		 */
 		if (unlikely(current->thread.debugreg[7]))
-			set_debugreg(current->thread.debugreg[7], 7);
+			set_process_debugreg7(current->thread.debugreg[7]);
 
 		/* Whee!  Actually deliver the signal.  */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
Index: usb-2.6/arch/i386/kernel/traps.c
===================================================================
--- usb-2.6.orig/arch/i386/kernel/traps.c
+++ usb-2.6/arch/i386/kernel/traps.c
@@ -808,6 +808,7 @@ fastcall void __kprobes do_debug(struct 
 	struct task_struct *tsk = current;
 
 	get_debugreg(condition, 6);
+	set_debugreg(0, 6);	/* DR6 is never cleared by the CPU */
 
 	if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
 					SIGTRAP) == NOTIFY_STOP)
@@ -849,7 +850,7 @@ fastcall void __kprobes do_debug(struct 
 	 * the signal is delivered.
 	 */
 clear_dr7:
-	set_debugreg(0, 7);
+	set_process_debugreg7(0);
 	return;
 
 debug_vm86:
Index: usb-2.6/include/asm-i386/debugreg.h
===================================================================
--- usb-2.6.orig/include/asm-i386/debugreg.h
+++ usb-2.6/include/asm-i386/debugreg.h
@@ -33,6 +33,7 @@
 
 #define DR_RW_EXECUTE (0x0)   /* Settings for the access types to trap on */
 #define DR_RW_WRITE (0x1)
+#define DR_RW_IO (0x2)
 #define DR_RW_READ (0x3)
 
 #define DR_LEN_1 (0x0) /* Settings for data length to trap on */
@@ -61,4 +62,43 @@
 #define DR_LOCAL_SLOWDOWN (0x100)   /* Local slow the pipeline */
 #define DR_GLOBAL_SLOWDOWN (0x200)  /* Global slow the pipeline */
 
+#define DR_MAX	4
+#define DR_ANY	(DR_MAX + 1)
+
+#ifdef CONFIG_KWATCH
+
+extern void set_process_debugreg7(unsigned long new_dr7);
+extern unsigned long debugreg7_set_bits(unsigned long old_dr7, int regnum,
+		u8 type, u8 len);
+extern unsigned long debugreg7_clear_bits(unsigned long old_dr7, int regnum);
+extern int debugreg_global_alloc(int regnum);
+extern void debugreg_global_free(int regnum);
+extern void debugreg_inc_use_count(unsigned long mask);
+extern void debugreg_dec_use_count(unsigned long mask);
+extern int debugreg_is_global(int regnum);
+extern int enable_debugreg(unsigned long old_dr7, unsigned long new_dr7);
+
+#else
+
+static inline void set_process_debugreg7(unsigned long new_dr7);
+{
+	set_debugreg(new_dr7, 7);
+}
+static inline void debugreg_inc_use_count(unsigned long mask)
+{
+}
+static inline void debugreg_dec_use_count(unsigned long mask)
+{
+}
+static inline int debugreg_is_global(int regnum)
+{
+	return 0;
+}
+static inline int enable_debugreg(unsigned long old_dr7,
+		unsigned long new_dr7)
+{
+	return (new_dr7 != 0);
+}
+
+#endif				/* CONFIG_KWATCH */
 #endif
Index: usb-2.6/arch/i386/kernel/kwatch.c
===================================================================
--- /dev/null
+++ usb-2.6/arch/i386/kernel/kwatch.c
@@ -0,0 +1,274 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) 2007 Alan Stern
+ */
+
+/* Kwatch: a kernel watchpoint interface, using the CPU's debug registers. */
+
+#include <linux/bitops.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ptrace.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <asm/kwatch.h>
+#include <asm/kdebug.h>
+#include <asm/debugreg.h>
+
+#define RF_MASK	0x00010000
+
+static struct kwatch kwatch_list[DR_MAX];
+static DEFINE_SPINLOCK(kwatch_lock);
+static unsigned long kwatch_in_progress;	/* bitmap of registers
+							being handled */
+
+static void write_debugreg(unsigned long addr, int debugreg)
+{
+	switch (debugreg) {
+		case 0:	set_debugreg(addr, 0);	break;
+		case 1:	set_debugreg(addr, 1);	break;
+		case 2:	set_debugreg(addr, 2);	break;
+		case 3:	set_debugreg(addr, 3);	break;
+		case 6:	set_debugreg(addr, 6);	break;
+		case 7:	set_debugreg(addr, 7);	break;
+	}
+}
+
+#ifdef CONFIG_SMP
+
+struct dr_info {
+	int debugreg;
+	int type;
+	unsigned long addr;
+};
+
+static void write_smp_debugreg(void *info)
+{
+	struct dr_info *dr = (struct dr_info *) info;
+
+	if (cpu_has_de && dr->type == KWATCH_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_debugreg(dr->addr, dr->debugreg);
+}
+
+/* Update a debug register on all CPUs */
+static void sync_debugreg(unsigned long addr, int debugreg, int type)
+{
+	struct dr_info dr;
+
+	dr.debugreg = debugreg;
+	dr.type = type;
+	dr.addr = addr;
+	smp_call_function(write_smp_debugreg, &dr, 0, 0);
+}
+
+#else
+
+static inline void sync_debugreg(unsigned long addr, int debugreg, int type)
+{
+}
+#endif	/* CONFIG_SMP */
+
+/*
+ * Interrupts are disabled on entry as trap1 is an interrupt gate and they
+ * remain disabled thorough out this function.
+ */
+static int kwatch_handler(unsigned long condition, struct pt_regs *regs)
+{
+	unsigned int debugreg;
+	unsigned long addr;
+	struct kwatch *kw;
+	int recursed = 0;
+
+	/* The debug status register value is passed in "condition". */
+	if (condition & DR_TRAP0) {
+		debugreg = 0;
+		get_debugreg(addr, 0);
+	} else if (condition & DR_TRAP1) {
+		debugreg = 1;
+		get_debugreg(addr, 1);
+	} else if (condition & DR_TRAP2) {
+		debugreg = 2;
+		get_debugreg(addr, 2);
+	} else if (condition & DR_TRAP3) {
+		debugreg = 3;
+		get_debugreg(addr, 3);
+	} else
+		return recursed;
+	kw = &kwatch_list[debugreg];
+
+	/* We're in an interrupt, but this is clear and BUG()-safe. */
+	preempt_disable();
+
+	/* If we are recursing, we already hold the lock. */
+	if (kwatch_in_progress)
+		recursed = 1;
+	else
+		spin_lock(&kwatch_lock);
+	set_bit(debugreg, &kwatch_in_progress);
+
+	if ((unsigned long) kw->addr == addr) {
+		if (!recursed && kw->handler)
+			kw->handler(kw, regs);
+		if (kw->type == KWATCH_TYPE_EXECUTE)
+			regs->eflags |= RF_MASK;
+	}
+
+	clear_bit(debugreg, &kwatch_in_progress);
+	if (!recursed)
+		spin_unlock(&kwatch_lock);
+
+	preempt_enable_no_resched();
+	return recursed;
+}
+
+/**
+ * register_kwatch - register a hardware watchpoint
+ * @addr: address of the watchpoint
+ * @length: extent of the watchpoint (1, 2, or 4 bytes)
+ * @type: type of access to trap (read, write, I/O, or execute)
+ * @handler: callback routine to invoke when a trap occurs
+ *
+ * Allocates and returns a debug register and installs the requested
+ * watchpoint.
+ *
+ * @length must be 1, 2, or 4, and @type must be one of %KWATCH_TYPE_RW
+ * (read or write), %KWATCH_TYPE_WRITE (write only), %KWATCH_TYPE_IO
+ * (IO-space access), or %KWATCH_TYPE_EXECUTE.  Note that %KWATCH_TYPE_IO
+ * is available only on processors with Debugging Extensions, and @length
+ * must be 1 for %KWATCH_TYPE_EXECUTE.
+ *
+ * When a trap occurs, @handler is invoked in_interrupt with a pointer
+ * to a struct kwatch containing the watchpoint information and a pointer
+ * to the CPU register values at the time of the trap.  %KWATCH_TYPE_EXECUTE
+ * traps occur before the watch-pointed instruction executes; all other
+ * types occur after the memory or I/O access has taken place.
+ *
+ * Returns a debug register number or a negative error code.
+ */
+int register_kwatch(void *addr, u8 length, u8 type, kwatch_handler_t handler)
+{
+	int debugreg;
+	unsigned long dr7, flags;
+
+	switch (length) {
+	case 1:
+	case 2:
+	case 4:
+		break;
+	default:
+		return -EINVAL;
+	}
+	switch (type) {
+	case KWATCH_TYPE_WRITE:
+	case KWATCH_TYPE_RW:
+		break;
+	case KWATCH_TYPE_IO:
+		if (cpu_has_de)
+			break;
+		return -EINVAL;
+	case KWATCH_TYPE_EXECUTE:
+		if (length == 1)
+			break;
+		/* FALL THROUGH */
+	default:
+		return -EINVAL;
+	}
+	if (!handler)
+		return -EINVAL;
+
+	debugreg = debugreg_global_alloc(DR_ANY);
+	if (debugreg < 0)
+		return -EBUSY;
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = addr;
+	kwatch_list[debugreg].length = length;
+	kwatch_list[debugreg].type = type;
+	kwatch_list[debugreg].handler = handler;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	if (type == KWATCH_TYPE_IO)
+		set_in_cr4(X86_CR4_DE);
+	write_debugreg((unsigned long) addr, debugreg);
+	sync_debugreg((unsigned long) addr, debugreg, type);
+
+	get_debugreg(dr7, 7);
+	dr7 = debugreg7_set_bits(dr7, debugreg, type, length);
+	set_debugreg(dr7, 7);
+	sync_debugreg(dr7, 7, 0);
+	return debugreg;
+}
+EXPORT_SYMBOL_GPL(register_kwatch);
+
+/**
+ * unregister_kwatch - free a previously-allocated debugging watchpoint
+ * @debugreg: the debugging register to deallocate
+ *
+ * Removes a hardware watchpoint and deallocates the corresponding
+ * debugging register.  @debugreg must previously have been allocated
+ * by register_kwatch().
+ */
+void unregister_kwatch(int debugreg)
+{
+	unsigned long flags;
+	unsigned long dr7;
+
+	if (debugreg < 0 || debugreg >= DR_MAX ||
+			!kwatch_list[debugreg].handler)
+		return;
+
+	get_debugreg(dr7, 7);
+	dr7 = debugreg7_clear_bits(dr7, debugreg);
+	set_debugreg(dr7, 7);
+	sync_debugreg(dr7, 7, 0);
+
+	spin_lock_irqsave(&kwatch_lock, flags);
+	kwatch_list[debugreg].addr = 0;
+	kwatch_list[debugreg].handler = NULL;
+	spin_unlock_irqrestore(&kwatch_lock, flags);
+
+	debugreg_global_free(debugreg);
+}
+EXPORT_SYMBOL_GPL(unregister_kwatch);
+
+/*
+ * Wrapper routine to for handling exceptions.
+ */
+static int kwatch_exceptions_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	struct die_args *args = (struct die_args *) data;
+
+	if (val == DIE_DEBUG) {
+		if (kwatch_handler(args->err, args->regs))
+			return NOTIFY_STOP;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kwatch_exceptions_nb = {
+	.notifier_call = kwatch_exceptions_notify,
+	.priority = 0x7ffffffe		/* we need to notified second */
+};
+
+static int __init init_kwatch(void)
+{
+	return register_die_notifier(&kwatch_exceptions_nb);
+}
+
+__initcall(init_kwatch);
Index: usb-2.6/arch/i386/kernel/Makefile
===================================================================
--- usb-2.6.orig/arch/i386/kernel/Makefile
+++ usb-2.6/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86)		+= vm86.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
 obj-$(CONFIG_HPET_TIMER) 	+= hpet.o
 obj-$(CONFIG_K8_NB)		+= k8.o
+obj-$(CONFIG_KWATCH)		+= debugreg.o kwatch.o
 
 # Make sure this is linked after any other paravirt_ops structs: see head.S
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o
Index: usb-2.6/include/asm-i386/kwatch.h
===================================================================
--- /dev/null
+++ usb-2.6/include/asm-i386/kwatch.h
@@ -0,0 +1,99 @@
+#ifndef _ASM_KWATCH_H
+#define _ASM_KWATCH_H
+
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) 2007 Alan Stern
+ */
+
+/*
+ * Kernel watchpoint interface.  Use these routines to create and delete
+ * hardware watchpoints within the kernel, using the CPU's debug registers.
+ *
+ * This sample code sets a watchpoint on pid_max and registers a callback
+ * function for writes to that variable.
+ *
+ * ----------------------------------------------------------------------
+ * #include <asm/kwatch.h>
+ *
+ * static void kwatch_handler(struct kwatch *p, struct pt_regs *regs)
+ * {
+ * 	printk(KERN_DEBUG "Watchpoint triggered\n");
+ * 	dump_stack();
+ * 	.......<do anything>........
+ * }
+ *
+ * static int debugreg_num;
+ *
+ * static int init_module(void)
+ * {
+ * 	..........<do anything>............
+ * 	debugreg_num = register_kwatch(&pid_max,
+ * 			 4, KWATCH_TYPE_WRITE, kwatch_handler);
+ * 	..........<do anything>............
+ * }
+ *
+ * static void cleanup_module(void)
+ * {
+ * 	..........<do anything>............
+ * 	unregister_kwatch(debugreg_num);
+ * 	..........<do anything>............
+ * }
+ * ----------------------------------------------------------------------
+ *
+ * Test this by changing the value of pid_max in /proc/sys/kernel/pid_max:
+ *
+ * 	# echo 1000 > /proc/sys/kernel/pid_max
+ *
+ * The output from kwatch_handler() will show up in the system log.
+ */
+
+#include <linux/types.h>
+
+struct kwatch;
+struct pt_regs;
+typedef void (*kwatch_handler_t)(struct kwatch *, struct pt_regs *);
+
+struct kwatch {
+	void *addr;		/* location of watchpoint */
+	u8 length;		/* range of address */
+	u8 type;		/* type of watchpoint */
+	kwatch_handler_t handler;
+};
+
+#define KWATCH_TYPE_EXECUTE 	0x0	/* Watchpoint types */
+#define KWATCH_TYPE_WRITE	0x1
+#define KWATCH_TYPE_IO		0x2
+#define KWATCH_TYPE_RW		0x3
+
+#ifdef	CONFIG_KWATCH
+extern int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler);
+extern void unregister_kwatch(int debugreg);
+
+#else
+
+static inline int register_kwatch(void *addr, u8 length, u8 type,
+		kwatch_handler_t handler)
+{
+	return -ENOSYS;
+}
+static inline void unregister_kwatch(int debugreg)
+{
+}
+#endif
+#endif	/* _ASM_KWATCH_H */
Index: usb-2.6/arch/i386/Kconfig
===================================================================
--- usb-2.6.orig/arch/i386/Kconfig
+++ usb-2.6/arch/i386/Kconfig
@@ -1210,6 +1210,14 @@ config KPROBES
 	  a probepoint and specifies the callback.  Kprobes is useful
 	  for kernel debugging, non-intrusive instrumentation and testing.
 	  If in doubt, say "N".
+
+config KWATCH
+	bool "Kwatch points (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  Kwatch enables kernel-space data watchpoints using the processor's
+	  debug registers.  It can be very useful for kernel debugging.
+	  If in doubt, say "N".
 endmenu
 
 source "arch/i386/Kconfig.debug"


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] Kwatch: kernel watchpoints using CPU debug registers
  2007-01-17 16:17   ` Alan Stern
  2007-01-18  0:12     ` Christoph Hellwig
@ 2007-02-06  4:25     ` Roland McGrath
  1 sibling, 0 replies; 11+ messages in thread
From: Roland McGrath @ 2007-02-06  4:25 UTC (permalink / raw)
  To: Alan Stern; +Cc: Prasanna S Panchamukhi, Kernel development list

Sorry I've been slow in giving you feedback on kwatch.

> I'll be happy to move this over to the utrace setting, once it is merged.  
> Do you think it would be better to include the current version of kwatch 
> now or to wait for utrace?
> 
> Roland, is there a schedule for when you plan to get utrace into -mm?

Since you've asked, I'll mention that I've been discussing this with Andrew
lately and we plan to work on merging it into -mm as soon as we can manage.

The kwatch implementation is pretty much orthogonal to the utrace patch as
it is so far.  As you've noted, it doesn't change the nature of the setting
of the debug registers; it only moves around the existing code for setting
them in raw form.  Hence it doesn't much matter which order the work is
merged at this stage.  There's no reason to withhold kwatch waiting for utrace.

I do have a problem with kwatch, however.  The existing user ABI includes
setting all of the debug registers, and this interface has never before
expressed a situation where you can set some but not all of them.  Having
ptrace suddenly fail with EBUSY when it never did before is not OK.  No
well-behaved kernel-mode tracing/debugging facility should perturb the user
experience in this way.  It is certainly understandable that one will
sometimes want to do invasive kernel-mode debugging and on special
occasions choose to be ill-behaved in this way (you might know your
userland work load doesn't include running gdb with watchpoints).  
But kwatch as it stands does not even make it possible to write a
well-behaved facility.

I am all in favor of a facility to manage shared use of the debug
registers, such as your debugreg.h additions.  I just think it needs to be
a little more flexible.  An unobtrusive kernel facility has to get out of
the way when user-mode decides to use all its debug registers.  It's not
immediately important what it's going to about it when contention arises,
but there has to be a way for the user-mode facilities to say they need to
allocate debugregs with priority and evict other squatters.  So, something
like code allocating a debugreg can supply a callback that's made when its
allocation has to taken by something with higher priority.  

Even after utrace, there will always be the possibility of a traditional
uncoordinated user of the raw debug registers, if nothing else ptrace
compatibility will always be there for old users.  So anything new and
fancy needs to be prepared to back out of the way gracefully.  In the case
of kwatch, it can just have a handler function given by the caller to start
with.  It's OK if individual callers can specially declare "I am not
well-behaved" and eat debugregs so that well-behaved high-priority users
like ptrace just have to lose (breaking compatibility).  But no
well-behaved caller of kwatch will do that.  

As a later improvement, kwatch could try a thing or two to stave off giving
up and telling its caller the watchpoint couldn't stay for the current
task.  For example, if a watchpoint is in kernel memory, you could switch
in your debugreg settings on entering the kernel and restore the user
watchpoints before returning to user mode.  Then you'd need to make
get_user et al somehow observe the user-mode watchpoints.  But it could be
investigated if the need arises.  Note that you can already silently do
something simple like juggling your kwatch debugreg assignments around if
the higher-priority consumer evicting you has left some other debugregs unused.

I certainly intend for later features based on utrace to include
higher-level treatment of watchpoints so that user debugging facilities can
also become responsive to debugreg allocation pressure.  (Eventually, the
user facilities might have easier ways of falling back to other methods and
getting out of the way of kernel debugreg consumers, than can be done for
the kernel-mode-tracing facilities.)  To that end, I'd like to see a clear
and robust interface for debugreg sharing, below the level of kwatch.  I'd
also like to see a thin layer on that giving a machine-independent kernel
source API for talking about watchpoints, which you pretty much have rolled
into the kwatch interface now.  But these are further refinements, not
barriers to including kwatch.

Also, an unrelated minor point.  I think it's error-prone to have an
integer argument to unregister_kwatch.  I think it makes most sense to have
the caller provide the space and call register/unregister with a pointer,
in the style of kprobes.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2007-02-06  4:25 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-16 16:55 [PATCH] Kwatch: kernel watchpoints using CPU debug registers Alan Stern
2007-01-16 23:35 ` Christoph Hellwig
2007-01-17 16:33   ` Alan Stern
2007-01-17  9:44 ` Ingo Molnar
2007-01-17 16:17   ` Alan Stern
2007-01-18  0:12     ` Christoph Hellwig
2007-01-18  7:31       ` Ingo Molnar
2007-01-18 15:37         ` Alan Stern
2007-01-18 22:33         ` Christoph Hellwig
2007-01-22 16:54           ` [PATCH - revised] " Alan Stern
2007-02-06  4:25     ` [PATCH] " Roland McGrath

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).