LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [patch 0/4] Linux Kernel Markers
@ 2007-07-03 17:08 Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 1/4] Linux Kernel Markers, architecture independent code Mathieu Desnoyers
                   ` (5 more replies)
  0 siblings, 6 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-03 17:08 UTC (permalink / raw)
  To: akpm, Christoph Hellwig, linux-kernel

Hi,

This updated version of the Linux Kernel Markers mostly adds a unique 16 bits
per marker ID and a per-probe marker group.

Christoph, I think the only concern that I do not plan to address immediately is
to provide a complet in-kernel user of the markers (blktrace patch does not
actually use the markers full potential). I have external patches that provides
that, but I don't want to send too much patches at once. Between providing a
complete marker/tracer stack and sending small incremental patches, I think the
latter is the choice the better suited. This is however an uneasy problem, which
looks very much like the chicken and egg problem. :)

If you have concerns with what I recently added to the markers, or if you still
strongly feel that I must also send the following patches right away, please let
me know.

Mathieu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 1/4] Linux Kernel Markers, architecture independent code.
  2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
@ 2007-07-03 17:08 ` Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 2/4] Linux Kernel Markers - Add kconfig menus for the marker code Mathieu Desnoyers
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-03 17:08 UTC (permalink / raw)
  To: akpm, Christoph Hellwig, linux-kernel; +Cc: Mathieu Desnoyers

[-- Attachment #1: linux-kernel-markers-architecture-independent-code.patch --]
[-- Type: text/plain, Size: 31344 bytes --]

The marker activation functions sits in kernel/marker.c. A hash table is used
to keep track of the armed/disarmed markers, so they can be activated at module
load time.

The marker IDs will be used as unique identifiers for markers. They are
assigned by the marker infrastructure and are used to identify a marker in a
compact representation either in a probe handler and especially in a trace.

Marker group is a parameter given by the probe provider. It can be used, for
instance, to tell which set of buffers the information must be send to when
recording a trace.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
---

 include/asm-generic/vmlinux.lds.h |   11 
 include/linux/marker.h            |  159 +++++++
 include/linux/module.h            |    5 
 kernel/Makefile                   |    1 
 kernel/marker.c                   |  801 ++++++++++++++++++++++++++++++++++++++
 kernel/module.c                   |   17 
 6 files changed, 993 insertions(+), 1 deletion(-)

Index: linux-2.6-lttng/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/vmlinux.lds.h	2007-07-03 12:33:20.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/vmlinux.lds.h	2007-07-03 12:33:21.000000000 -0400
@@ -12,7 +12,11 @@
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
-	*(.data.init.refok)
+	*(.data.init.refok)						\
+	. = ALIGN(32);							\
+	VMLINUX_SYMBOL(__start___markers) = .;				\
+	*(__markers)							\
+	VMLINUX_SYMBOL(__stop___markers) = .;
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
@@ -129,6 +133,11 @@
 		VMLINUX_SYMBOL(__stop___immediate) = .;			\
 	}								\
 									\
+	/* Markers: strings */						\
+        __markers_strings : AT(ADDR(__markers_strings) - LOAD_OFFSET) {	\
+		*(__markers_strings)					\
+ 	}								\
+									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
 		*(__ksymtab_strings)					\
Index: linux-2.6-lttng/include/linux/marker.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/include/linux/marker.h	2007-07-03 13:03:14.000000000 -0400
@@ -0,0 +1,159 @@
+#ifndef _LINUX_MARKER_H
+#define _LINUX_MARKER_H
+
+/*
+ * Code markup for dynamic and static tracing.
+ *
+ * See Documentation/marker.txt.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#ifdef __KERNEL__
+
+#include <linux/immediate.h>
+#include <linux/types.h>
+
+struct module;
+struct __mark_marker;
+
+/*
+ * Unique ID assigned to each registered probe.
+ */
+enum marker_id {
+	MARKER_ID_LOAD_MARKER = 0,	/* Static IDs available (range 0-7) */
+	MARKER_ID_HEARTBEAT_32,
+	MARKER_ID_HEARTBEAT_64,
+	MARKER_ID_COMPACT,		/* Compact IDs (range: 8-127)	    */
+	MARKER_ID_DYNAMIC,		/* Dynamic IDs (range: 128-65535)   */
+};
+
+/* static ids 0-7 reserved for internal use. */
+#define MARKER_CORE_IDS		8
+/* dynamic ids 8-127 reserved for compact events. */
+#define MARKER_COMPACT_IDS	128
+static inline enum marker_id marker_id_type(uint16_t id)
+{
+	if (id < MARKER_CORE_IDS)
+		return (enum marker_id)id;
+	else if (id < MARKER_COMPACT_IDS)
+		return MARKER_ID_COMPACT;
+	else
+		return MARKER_ID_DYNAMIC;
+}
+
+typedef void marker_probe_func(const struct __mark_marker *mdata,
+	const char *fmt, ...);
+
+struct __mark_marker {
+	const char *name;	/* Marker name */
+	const char *format;	/* Marker format string, describing the
+				 * variable argument list.
+				 */
+	const char *args;	/* List of arguments litteraly transformed
+				 * into a string: "arg1, arg2, arg3".
+				 */
+	immediate_t state;	/* Immediate value state. */
+	int flags;		/* Flags controlling the markers flavor.
+				 * Passed to the contidional call declaration
+				 * and used to check that the probe matches the
+				 * markers restrictions at connexion time. */
+	marker_probe_func *call;/* Probe handler function pointer */
+	void *pdata;		/* Private probe data */
+	uint16_t id;		/* Unique marker numeric ID */
+	uint16_t group;		/*
+				 * Per probe information to select to which
+				 * group the marker belongs.
+				 */
+};
+
+#ifdef CONFIG_MARKERS
+
+/*
+ * Generic marker flavor always available.
+ * Note : the empty asm volatile with read constraint is used here instead of a
+ * "used" attribute to fix a gcc 4.1.x bug.
+ * Make sure the alignment of the structure in the __markers section will
+ * not add unwanted padding between the beginning of the section and the
+ * structure. Force alignment to the same alignment as the section start.
+ * Use the natual alignment for a 32 bytes structure (32 bytes).
+ */
+#define _trace_mark(flags, name, format, args...)			\
+	do {								\
+		static const char __mstrtab_name_##name[]		\
+		__attribute__((section("__markers_strings")))		\
+		= #name;						\
+		static const char __mstrtab_format_##name[]		\
+		__attribute__((section("__markers_strings")))		\
+		= format;						\
+		static const char __mstrtab_args_##name[]		\
+		__attribute__((section("__markers_strings")))		\
+		= #args;						\
+		static struct __mark_marker __mark_##name		\
+		__attribute__((section("__markers"), aligned(32))) = 	\
+		{ __mstrtab_name_##name, __mstrtab_format_##name,	\
+		__mstrtab_args_##name, { 0 }, (flags),			\
+		__mark_empty_function, NULL, 0 };			\
+		asm volatile ( "" : : "i" (&__mark_##name));		\
+		__mark_check_format(format, ## args);			\
+		if (unlikely(_immediate((flags), __mark_##name.state))) { \
+			preempt_disable();				\
+			(*__mark_##name.call)(&__mark_##name, format, ## args);\
+			preempt_enable();				\
+		}							\
+	} while (0)
+
+extern void module_marker_update(struct module *mod);
+#else /* !CONFIG_MARKERS */
+#define _trace_mark(flags, name, format, args...) \
+		__mark_check_format(format, ## args)
+static inline void module_marker_update(struct module *mod) { }
+#endif /* CONFIG_MARKERS */
+
+/* Marker with default behavior */
+#define trace_mark(name, format, args...) \
+	_trace_mark(IF_DEFAULT, name, format, ## args)
+
+#define MARK_MAX_FORMAT_LEN	1024
+/* Pass this as a format string for a marker with no argument */
+#define MARK_NOARGS " "
+
+/* To be used for string format validity checking with gcc (TODO) */
+static inline
+void __mark_check_format(const char *fmt, ...)
+#ifdef GCC_SUPPORTS_MARKER_CHECK
+	__attribute__((format (trace_mark, 1, 2)))
+#endif
+{ }
+
+extern marker_probe_func __mark_empty_function;
+
+extern int _marker_probe_register(int flags, const char *name,
+				const char *format, marker_probe_func *probe,
+				void *pdata, enum marker_id id,
+				uint16_t group);
+extern int marker_set_group(const char *name, uint16_t group);
+extern int marker_set_id(const char *name, enum marker_id id);
+
+/*
+ * Returns the pdata given to marker_probe_register.
+ */
+extern void *marker_probe_unregister(const char *name);
+
+#define marker_probe_register(name, format, probe, pdata, group)	\
+	_marker_probe_register(IF_DEFAULT, name, format, probe, pdata, \
+				MARKER_ID_DYNAMIC, group)
+
+extern int marker_arm(const char *name);
+extern int marker_disarm(const char *name);
+extern int marker_query_probe(const char *name, marker_probe_func **probe,
+				void **pdata, uint16_t *id, uint16_t *group);
+extern int marker_list_probe(marker_probe_func *probe);
+extern const struct __mark_marker *marker_query(const char *name, int instance);
+extern void marker_probe_clean_pdata(void *pdata);
+
+#endif /* __KERNEL__ */
+#endif
Index: linux-2.6-lttng/include/linux/module.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/module.h	2007-07-03 12:33:20.000000000 -0400
+++ linux-2.6-lttng/include/linux/module.h	2007-07-03 12:54:59.000000000 -0400
@@ -16,6 +16,7 @@
 #include <linux/kobject.h>
 #include <linux/moduleparam.h>
 #include <linux/immediate.h>
+#include <linux/marker.h>
 #include <asm/local.h>
 
 #include <asm/module.h>
@@ -380,6 +381,10 @@
 	const struct __immediate *immediates;
 	unsigned int num_immediates;
 #endif
+#ifdef CONFIG_MARKERS
+	struct __mark_marker *markers;
+	unsigned int num_markers;
+#endif
 };
 #ifndef MODULE_ARCH_INIT
 #define MODULE_ARCH_INIT {}
Index: linux-2.6-lttng/kernel/module.c
===================================================================
--- linux-2.6-lttng.orig/kernel/module.c	2007-07-03 12:33:20.000000000 -0400
+++ linux-2.6-lttng/kernel/module.c	2007-07-03 12:54:59.000000000 -0400
@@ -1726,6 +1726,10 @@
 #ifdef CONFIG_IMMEDIATE
 	immediateindex = find_sec(hdr, sechdrs, secstrings, "__immediate");
 #endif
+#ifdef CONFIG_MARKERS
+	markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
+ 	markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings");
+#endif
 
 	/* Don't keep modinfo section */
 	sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1885,6 +1889,10 @@
 			sechdrs[immediateindex].sh_size / sizeof(*mod->immediates);
 	}
 #endif
+	if (markersindex)
+		sechdrs[markersindex].sh_flags |= SHF_ALLOC;
+	if (markersstringsindex)
+		sechdrs[markersstringsindex].sh_flags |= SHF_ALLOC;
 
 	mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
 	if (unusedcrcindex)
@@ -1926,6 +1934,13 @@
 		if (err < 0)
 			goto cleanup;
 	}
+#ifdef CONFIG_MARKERS
+	if (markersindex) {
+		mod->markers = (void *)sechdrs[markersindex].sh_addr;
+		mod->num_markers =
+			sechdrs[markersindex].sh_size / sizeof(*mod->markers);
+	}
+#endif
 
         /* Find duplicate symbols */
 	err = verify_export_symbols(mod);
@@ -1950,6 +1965,8 @@
         }
 #endif
 
+	module_marker_update(mod);
+
 	module_immediate_setup(mod);
 
 	err = module_finalize(hdr, sechdrs, mod);
Index: linux-2.6-lttng/kernel/marker.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/kernel/marker.c	2007-07-03 13:03:03.000000000 -0400
@@ -0,0 +1,801 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/marker.h>
+#include <linux/err.h>
+
+static uint8_t mark_next_compact_id = MARKER_CORE_IDS;
+/* Next available ID. Dynamic range : 128-65535 */
+static uint16_t mark_next_id = MARKER_COMPACT_IDS;
+
+extern struct __mark_marker __start___markers[];
+extern struct __mark_marker __stop___markers[];
+
+/*
+ * module_mutex nests inside markers_mutex. Markers mutex protects the builtin
+ * and module markers, and the hash table.
+ */
+DEFINE_MUTEX(markers_mutex);
+
+/*
+ * Marker hash table, containing the active markers.
+ * Protected by module_mutex.
+ */
+#define MARKER_HASH_BITS 6
+#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+
+struct marker_entry {
+	struct hlist_node hlist;
+	int flags;
+	char *format;
+	marker_probe_func *probe;
+	void *pdata;
+	uint16_t id;	/* 16 bits unique numeric ID */
+	uint16_t group;
+	int state;	/* Armed or disarmed */
+	char name[0];	/* Contains name'\0'format'\0' */
+};
+
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
+
+/*
+ * Empty callback provided as a probe to the markers. By providing this to a
+ * disabled marker, we makes sure the  execution flow is always valid even
+ * though the function pointer change and the marker enabling are two distinct
+ * operations that modifies the execution flow of preemptible code.
+ */
+void __mark_empty_function(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+}
+EXPORT_SYMBOL_GPL(__mark_empty_function);
+
+/*
+ * Get marker if the marker is present in the marker hash table.
+ * Must be called with markers_mutex held.
+ * Returns NULL if not present.
+ */
+static struct marker_entry *_get_marker(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct marker_entry *e;
+	size_t len = strlen(name) + 1;
+	u32 hash = jhash(name, len-1, 0);
+
+	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+	hlist_for_each_entry(e, node, head, hlist)
+		if (!strcmp(name, e->name))
+			return e;
+	return NULL;
+}
+
+/*
+ * Add the marker to the marker hash table. Must be called with markers_mutex
+ * held.
+ */
+static int _add_marker(int flags, const char *name,
+	const char *format, marker_probe_func *probe, void *pdata,
+	enum marker_id id, uint16_t group)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct marker_entry *e;
+	size_t name_len = strlen(name) + 1;
+	size_t format_len = 0;
+	u32 hash = jhash(name, name_len-1, 0);
+
+	/* First check if there are still IDs available */
+	switch (id) {
+		case MARKER_ID_DYNAMIC:
+			if (mark_next_id == 0)
+				return -ENOSPC;
+			break;
+		case MARKER_ID_COMPACT:
+			if (mark_next_compact_id == 0)
+				return -ENOSPC;
+			break;
+		default:
+			/* Only allow 0-7 range for core IDs */
+			if ((uint16_t)id >= MARKER_CORE_IDS)
+				return -EPERM;
+	}
+
+	if (format)
+		format_len = strlen(format) + 1;
+	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name)) {
+			printk(KERN_NOTICE
+				"Marker %s busy, probe %p already installed\n",
+				name, e->probe);
+			return -EBUSY;	/* Already there */
+		}
+	}
+	/*
+	 * Using kmalloc here to allocate a variable length element. Could
+	 * cause some memory fragmentation if overused.
+	 */
+	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
+			GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+	memcpy(&e->name[0], name, name_len);
+	if (format) {
+		e->format = &e->name[name_len];
+		memcpy(e->format, format, format_len);
+	} else
+		e->format = NULL;
+	e->flags = flags;
+	e->probe = probe;
+	e->pdata = pdata;
+	e->group = group;
+	switch (id) {
+		case MARKER_ID_COMPACT:
+			e->id = mark_next_compact_id++;
+			break;
+		case MARKER_ID_DYNAMIC:
+			e->id = mark_next_id++;
+			break;
+		default:
+			e->id = (uint16_t)id;
+	}
+	e->state = 0;
+	hlist_add_head(&e->hlist, head);
+	return 0;
+}
+
+/*
+ * Remove the marker from the marker hash table. Must be called with mutex_lock
+ * held.
+ */
+static void *_remove_marker(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct marker_entry *e;
+	int found = 0;
+	size_t len = strlen(name) + 1;
+	void *pdata = NULL;
+	u32 hash = jhash(name, len-1, 0);
+
+	head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name)) {
+			found = 1;
+			break;
+		}
+	}
+	if (found) {
+		pdata = e->pdata;
+		hlist_del(&e->hlist);
+		kfree(e);
+	}
+	return pdata;
+}
+
+/*
+ * Set the mark_entry format to the format found in the element.
+ */
+static int _marker_set_format(struct marker_entry **entry, const char *format)
+{
+	struct marker_entry *e;
+	size_t name_len = strlen((*entry)->name) + 1;
+	size_t format_len = strlen(format) + 1;
+
+	e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
+			GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+	memcpy(&e->name[0], (*entry)->name, name_len);
+	e->format = &e->name[name_len];
+	memcpy(e->format, format, format_len);
+	e->flags = (*entry)->flags;
+	e->probe = (*entry)->probe;
+	e->pdata = (*entry)->pdata;
+	e->state = (*entry)->state;
+	e->id = (*entry)->id;
+	e->group = (*entry)->group;
+	hlist_add_before(&e->hlist, &(*entry)->hlist);
+	hlist_del(&(*entry)->hlist);
+	kfree(*entry);
+	*entry = e;
+	trace_mark(marker_load, "name %s format %s id %hu",
+			e->name, e->format, e->id);
+	return 0;
+}
+
+/* Sets the probe callback corresponding to one marker. */
+static int _set_marker(struct marker_entry **entry,
+			struct __mark_marker *elem)
+{
+	int ret;
+	BUG_ON(strcmp((*entry)->name, elem->name) != 0);
+
+	if ((*entry)->format) {
+		if (strcmp((*entry)->format, elem->format) != 0) {
+			printk(KERN_NOTICE
+				"Format mismatch for probe %s "
+				"(%s), marker (%s)\n",
+				(*entry)->name,
+				(*entry)->format,
+				elem->format);
+			return -EPERM;
+		}
+	} else {
+		ret = _marker_set_format(entry, elem->format);
+		if (ret)
+			return ret;
+	}
+	if ((*entry)->flags & IF_LOCKDEP
+		&& !(elem->flags & IF_LOCKDEP)) {
+		printk(KERN_NOTICE
+			"Incompatible lockdep flags for "
+			"probe %s\n",
+			(*entry)->name);
+		return -EPERM;
+	}
+	elem->call = (*entry)->probe;
+	elem->pdata = (*entry)->pdata;
+	elem->id = (*entry)->id;
+	elem->group = (*entry)->group;
+	__immediate_update(&elem->state, 1);
+	return 0;
+}
+
+static void _disable_marker(struct __mark_marker *elem)
+{
+	__immediate_update(&elem->state, 0);
+	elem->call = __mark_empty_function;
+	/*
+	 * Leave the pdata and id there, because removal is racy and should be
+	 * done only after a synchronize_sched(). There are never used until
+	 * the next initialization anyway.
+	 */
+}
+
+/*
+ * Updates the probe callback corresponding to a range of markers.
+ * Must be called with markers_mutex held.
+ */
+static void _marker_update_probe_range(
+	struct __mark_marker *begin,
+	struct __mark_marker *end,
+	struct module *probe_module,
+	int *refcount, int *nr_armed)
+{
+	struct __mark_marker *iter;
+	struct marker_entry *mark_entry;
+
+	for (iter = begin; iter < end; iter++) {
+		mark_entry = _get_marker(iter->name);
+		if (mark_entry && mark_entry->state) {
+			_set_marker(&mark_entry, iter);
+			/*
+			 * ignore error, continue
+			 */
+			if (probe_module)
+				if (probe_module ==
+			__module_text_address((unsigned long)mark_entry->probe))
+					(*refcount)++;
+			(*nr_armed)++;
+		} else {
+			_disable_marker(iter);
+		}
+	}
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * Update module probes.
+ * Must be called with markers_mutex held.
+ */
+static inline void __marker_update_probes_modules(struct module *probe_module,
+	int *refcount, int *nr_armed)
+{
+	struct module *mod;
+
+	list_for_each_entry(mod, &modules, list) {
+		if (!mod->taints) {
+			_marker_update_probe_range(mod->markers,
+				mod->markers+mod->num_markers,
+				probe_module, refcount, nr_armed);
+		}
+	}
+}
+#else
+static inline void __marker_update_probes_modules(struct module *probe_module,
+	int *refcount, *nr_armed)
+{
+}
+#endif
+
+/*
+ * Defragment the markers IDs. Should only be called when the IDs are not used
+ * by anyone, typically when all probes are disarmed. Clients of the markers
+ * rely on having their code markers armed during a "session" to make sure there
+ * will be no ID compaction. (for instance, a marker ID is never reused during a
+ * trace).
+ * There is no need to synchronize the hash table entries with the section
+ * elements because none is armed.
+ */
+void _marker_id_defrag(void)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct marker_entry *e;
+	unsigned int i;
+
+	mark_next_compact_id = MARKER_CORE_IDS;
+	mark_next_id = MARKER_COMPACT_IDS;
+
+	for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+		head = &marker_table[i];
+		hlist_for_each_entry(e, node, head, hlist) {
+			switch (marker_id_type(e->id)) {
+				case MARKER_ID_COMPACT:
+					e->id = mark_next_compact_id++;
+					break;
+				case MARKER_ID_DYNAMIC:
+					e->id = mark_next_id++;
+					break;
+				default:
+					/* default is to keep the static ID */
+					break;
+			}
+		}
+	}
+}
+
+/*
+ * Update probes, removing the faulty probes.
+ * Issues a synchronize_sched() when no reference to the module passed
+ * as parameter is found in the probes so the probe module can be
+ * safely unloaded from now on.
+ */
+static inline void __marker_update_probes(struct module *probe_module)
+{
+	int refcount = 0, nr_armed = 0;
+
+	/* Core kernel markers */
+	_marker_update_probe_range(__start___markers,
+			__stop___markers, probe_module, &refcount, &nr_armed);
+	/* Markers in modules. */
+	__marker_update_probes_modules(probe_module, &refcount,
+			&nr_armed);
+	if (probe_module && refcount == 0)
+		synchronize_sched();
+	if (!nr_armed)
+		_marker_id_defrag();
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * Setup the marker according to the data present in the marker hash table
+ * upon module load. If an error occur during the set probe range,
+ * refuse to load the module. Must be called with module_mutex held.
+ */
+void module_marker_update(struct module *mod)
+{
+	int nr_armed = 0;
+	if (!mod->taints)
+		_marker_update_probe_range(mod->markers,
+			mod->markers+mod->num_markers, NULL, NULL, &nr_armed);
+}
+
+/*
+ * Update the system wide probes, with modules. */
+static inline void _marker_update_probes(struct module *probe_module)
+{
+	mutex_lock(&module_mutex);
+	__marker_update_probes(probe_module);
+	mutex_unlock(&module_mutex);
+}
+#else
+/* Update the system wide probes, without modules. */
+static inline void _marker_update_probes(struct module *probe_module)
+{
+	__marker_update_probes(probe_module);
+}
+#endif
+
+/*
+ * Register a probe : set the callback for each marker.
+ * Markers must be disarmed to be registered.
+ */
+int _marker_probe_register(int flags, const char *name, const char *format,
+			marker_probe_func *probe, void *pdata,
+			enum marker_id id, uint16_t group)
+{
+	struct marker_entry *entry;
+	int ret = 0;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (entry && entry->state) {
+		ret = -EBUSY;
+		goto end;
+	}
+	ret = _add_marker(flags, name, format, probe, pdata, id, group);
+	if (ret)
+		goto end;
+	_marker_update_probes(NULL);
+end:
+	mutex_unlock(&markers_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(_marker_probe_register);
+
+/*
+ * Set the group of a marker.
+ */
+int marker_set_group(const char *name, uint16_t group)
+{
+	struct marker_entry *entry;
+	int ret = 0;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (!entry) {
+		ret = -ENOENT;
+		goto end;
+	} else if (entry->state) {
+		ret = -EBUSY;
+		goto end;
+	}
+
+	entry->group = group;
+	_marker_update_probes(NULL);
+end:
+	mutex_unlock(&markers_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(marker_set_group);
+
+/*
+ * Set the ID of a marker.
+ */
+int marker_set_id(const char *name, enum marker_id id)
+{
+	struct marker_entry *entry;
+	int ret = 0;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (!entry) {
+		ret = -ENOENT;
+		goto end;
+	} else if (entry->state) {
+		ret = -EBUSY;
+		goto end;
+	}
+
+	if (id == marker_id_type(entry->id)) {
+		ret = 0;
+		goto end;
+	}
+	switch (id) {
+		case MARKER_ID_COMPACT:
+			entry->id = mark_next_compact_id++;
+			break;
+		case MARKER_ID_DYNAMIC:
+			entry->id = mark_next_id++;
+			break;
+		default:
+			entry->id = (uint16_t)id;
+	}
+	_marker_update_probes(NULL);
+end:
+	mutex_unlock(&markers_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(marker_set_id);
+
+/*
+ * Unregister a probe : unset the callback for each marker.
+ * Markers must be disarmed to be unregistered.
+ * returns the pdata if ok.
+ * else, returns a ERR_PTR().
+ */
+void *marker_probe_unregister(const char *name)
+{
+	struct module *probe_module;
+	struct marker_entry *entry;
+	void *pdata;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (!entry) {
+		pdata = ERR_PTR(-ENOENT);
+		goto end;
+	} else if (entry->state) {
+		pdata = ERR_PTR(-EBUSY);
+		goto end;
+	}
+	/* In what module is the probe handler ? */
+	probe_module = __module_text_address((unsigned long)entry->probe);
+	pdata = _remove_marker(name);
+	_marker_update_probes(probe_module);
+end:
+	mutex_unlock(&markers_mutex);
+	return pdata;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister);
+
+/*
+ * Disarm and remove every marker referencing pdata.
+ */
+void marker_probe_clean_pdata(void *pdata)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct marker_entry *e;
+	unsigned int i;
+
+	mutex_lock(&markers_mutex);
+	for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+		head = &marker_table[i];
+		hlist_for_each_entry(e, node, head, hlist) {
+			if (e->pdata == pdata) {
+				e->state = 0;
+			}
+		}
+	}
+	_marker_update_probes(NULL);
+	synchronize_sched();
+
+	for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+		head = &marker_table[i];
+		hlist_for_each_entry(e, node, head, hlist) {
+			if (e->pdata == pdata) {
+				hlist_del(&e->hlist);
+				kfree(e);
+			}
+		}
+	}
+	_marker_update_probes(NULL);
+	mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(marker_probe_clean_pdata);
+
+/*
+ * Arm the probe : arm the immediate values.
+ * A probe must have been previously registered.
+ */
+int marker_arm(const char *name)
+{
+	struct marker_entry * entry;
+	int ret = 0;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (!entry) {
+		ret = -ENOENT;
+		goto end;
+	}
+	/*
+	 * The marker is known to the marker_load callback only once it provides
+	 * a format string, which can come later from the marker section,
+	 * through _marker_set_format().
+	 */
+	if (entry->format)
+		trace_mark(marker_load, "name %s format %s id %hu",
+			name, entry->format, entry->id);
+	entry->state = 1;
+	_marker_update_probes(NULL);
+end:
+	mutex_unlock(&markers_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(marker_arm);
+
+/*
+ * Disarm the probe : disarm the immediate and set the empty callback for each
+ * marker.
+ */
+int marker_disarm(const char *name)
+{
+	struct marker_entry * entry;
+	int ret = 0;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (!entry) {
+		ret = -ENOENT;
+		goto end;
+	}
+	entry->state = 0;
+	_marker_update_probes(NULL);
+end:
+	mutex_unlock(&markers_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(marker_disarm);
+
+/*
+ * Get the callback, pdata and ID assigned to a marker.
+ * Can be used to verify if the caller module is the owner of the probe. Data
+ * returned should not be touched if the module is not owner, because no locking
+ * prevents modification of these structure by the owner.
+ * returns -ENOENT if the probe is not registered.
+ * returns 0 if ok.
+ */
+int marker_query_probe(const char *name, marker_probe_func **probe,
+				void **pdata, uint16_t *id, uint16_t *group)
+{
+	struct marker_entry * entry;
+	int ret = 0;
+
+	mutex_lock(&markers_mutex);
+	entry = _get_marker(name);
+	if (!entry) {
+		ret = -ENOENT;
+		goto end;
+	}
+	*probe = entry->probe;
+	*pdata = entry->pdata;
+	*id = entry->id;
+	*group = entry->group;
+end:
+	mutex_unlock(&markers_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(marker_query_probe);
+
+/*
+ * Looks up a marker by its name and instance number within the specificed
+ * range and returns the associated data structure.
+ */
+static const struct __mark_marker *_marker_query_range(const char *name,
+	int instance,
+	const struct __mark_marker *begin,
+	const struct __mark_marker *end)
+{
+	const struct __mark_marker *iter;
+	int found = 0;
+
+	for (iter = begin; iter < end; iter++) {
+		if (strcmp(name, iter->name) != 0)
+			continue;
+		if (found++ == instance)
+			return iter;
+	}
+	return NULL;
+}
+
+/* Query the markers in the modules */
+#ifdef CONFIG_MODULES
+static inline const struct __mark_marker *marker_query_modules(const char *name,
+			int instance)
+{
+	struct module *mod;
+	const struct __mark_marker *mdata = NULL;
+
+	mutex_lock(&module_mutex);
+	/* Markers in modules. */
+	list_for_each_entry(mod, &modules, list) {
+		if (!mod->taints) {
+			mdata = _marker_query_range(name, instance,
+				mod->markers,
+				mod->markers+mod->num_markers);
+			if (mdata)
+				break;
+		}
+	}
+	mutex_unlock(&module_mutex);
+	return mdata;
+}
+#else
+static inline const struct __mark_marker *marker_query_modules(const char *name,
+			int instance)
+{
+	return NULL;
+}
+#endif
+
+/*
+ * Looks up a marker by its name and instance number and returns the
+ * associated data structure.
+ */
+const struct __mark_marker *marker_query(const char *name, int instance)
+{
+	const struct __mark_marker *mdata;
+
+	mutex_lock(&markers_mutex);
+	/* Core kernel markers */
+	mdata = _marker_query_range(name, instance,
+			__start___markers, __stop___markers);
+	if (!mdata)
+		mdata = marker_query_modules(name, instance);
+	mutex_unlock(&markers_mutex);
+	return mdata;
+}
+EXPORT_SYMBOL_GPL(marker_query);
+
+/*
+ * Provides a listing of the markers present in the kernel along with their
+ * callback and format string.
+ */
+static int _marker_list_probe_range(marker_probe_func *probe,
+	const struct __mark_marker *begin,
+	const struct __mark_marker *end)
+{
+	const struct __mark_marker *iter;
+	int found = 0;
+
+	for (iter = begin; iter < end; iter++) {
+		if (probe)
+			if (probe != iter->call) continue;
+		printk("name %s func 0x%p format \"%s\"\n",
+			iter->name,
+			iter->call, iter->format);
+		found++;
+	}
+	return found;
+}
+
+#ifdef CONFIG_MODULES
+static inline int marker_list_probe_modules(marker_probe_func *probe)
+{
+	int found = 0;
+	struct module *mod;
+
+	printk("Listing module markers\n");
+	mutex_lock(&module_mutex);
+	list_for_each_entry(mod, &modules, list) {
+		if (!mod->taints) {
+			printk("Listing markers for module %s\n", mod->name);
+			found += _marker_list_probe_range(probe,
+				mod->markers, mod->markers+mod->num_markers);
+		}
+	}
+	mutex_unlock(&module_mutex);
+	return found;
+}
+#else
+static inline int marker_list_probe_modules(marker_probe_func *probe)
+{
+	return 0;
+}
+#endif
+
+/*
+ * Calls _marker_list_probe_range for the core markers and modules markers.
+ * Marker listing uses the modlist_lock to synchronise.
+ * TODO : should output this listing to a procfs file.
+ */
+int marker_list_probe(marker_probe_func *probe)
+{
+	int found = 0;
+
+	mutex_lock(&markers_mutex);
+	/* Core kernel markers */
+	printk("Listing kernel markers\n");
+	found += _marker_list_probe_range(probe,
+			__start___markers, __stop___markers);
+	/* Markers in modules. */
+	found += marker_list_probe_modules(probe);
+	mutex_unlock(&markers_mutex);
+	return found;
+}
+EXPORT_SYMBOL_GPL(marker_list_probe);
Index: linux-2.6-lttng/kernel/Makefile
===================================================================
--- linux-2.6-lttng.orig/kernel/Makefile	2007-07-03 12:33:20.000000000 -0400
+++ linux-2.6-lttng/kernel/Makefile	2007-07-03 12:33:21.000000000 -0400
@@ -59,6 +59,7 @@
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_IMMEDIATE) += immediate.o
+obj-$(CONFIG_MARKERS) += marker.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 2/4] Linux Kernel Markers - Add kconfig menus for the marker code
  2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 1/4] Linux Kernel Markers, architecture independent code Mathieu Desnoyers
@ 2007-07-03 17:08 ` Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 3/4] Linux Kernel Markers - Documentation Mathieu Desnoyers
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-03 17:08 UTC (permalink / raw)
  To: akpm, Christoph Hellwig, linux-kernel; +Cc: Mathieu Desnoyers, Adrian Bunk

[-- Attachment #1: linux-kernel-markers-kconfig-menus.patch --]
[-- Type: text/plain, Size: 21484 bytes --]

With the increasing complexity of today's user-space application and the wide
deployment of SMP systems, the users need an increasing understanding of the
behavior and performance of a system across multiple processes/different
execution contexts/multiple CPUs.  In applications such as large clusters
(Google, IBM), video acquisition (Autodesk), embedded real-time systems (Wind
River, Monta Vista, Sony) or sysadmin/programmer-type tasks (SystemTAP from
Redhat), a tool that permits tracing of kernel-user space interaction becomes
necessary.

Usage of such tools have been made to successfully pinpoint problems such as:
latency issues in a user-space video acquisition application, slowdown
problems in large clusters due to a switch to a different filesystems with a
different cache size, abnormal Linux scheduler latency (just to name a few
that I have personally investigated).

The currently existing solutions does not give a system-wide overview of what
- and when - things are happening on the system.  Ptracing a program works
with few processes, but quickly becomes useless when it comes to keeping track
of many processes.

Bugs occuring because of bad interaction of such complex systems can be very
hard to find due to the fact that they occur rarely (sometimes once a week on
hundreds of machines).  One can therefore only hope at having the best
conditions to statistically reproduce the bug while extracting information
from the system.  Some bugs have been successfully found at Google using their
ktrace tracer only because they could enable it on production machines and
therefore recreate the same context where the bug happened.

Therefore, it makes sense to offer an instrumentation set of the most relevant
events occurring in the Linux that can have the smallest performance cost
possible when not active while not requiring a reboot of a production system
to activate.  This is essentially what the markers are providing.

Since we cannot limit the growth of the Linux kernel, nor can we pre-determine
each and every "interesting" instrumentation within each subsystem and driver,
it is sensible to let this task to the persons who knows the best their code. 
Adding instrumentation should therefore be as easy as adding and maintaining a
"printk" in the kernel code from the developer's point of view.

Towards a complete tracing mechanism in the Linux kernel, the markers are only
one step forward.  The following step is to connect probes to those markers
that will record the tracing information in buffers exported to user-space,
organized in timestamped "events".  Probe callbacks are responsible for
serializing the information passed as parameter to the markers (described by
the format string) into the events.  A control mechanism to activate/stop the
tracing is required, as well as a daemon that maps the buffers to write them
to disk or send them through the network.

Keeping track of the events also requires a centralized infrastructure : the
idea is to assign a unique ID to each event so they can be later recognized in
the trace.  Keeping in mind that recording the complete instrumentation site
name string for each event would be more that inefficient, assigning a numeric
unique identifier makes sense.

Finally, support for gathering events coming from user-space, with a minimal
performance impact, is very useful to see the interaction between the system's
execution contexts.

The last steps are currently implemented in Linux Trace Toolkit Next
Generation (LTTng).

The SystemTAP project could clearly benefit from such an infrastructure for
tracing.  In addition, they would be providing support for dynamic addition of
kernel probes through breakpoints/jumps when possible, with the associated
restrictions (accessing local variables, reentrancy, speed).




This marker infrastructure is a hook-callback mechanism.  It is meant to have
an impact as low as possible on the system performances when no callback
(probe) is connected so markers (hooks) can be compiled into a production
kernel without noticeable slowdown.

Why use the markers instead of kprobes?

The rationale behind this mechanism the following :

1 - It makes sense to have instrumentation (for tracing, profiling)
    within the kernel source tree so that it can follow its evolution.
    Other options, such as kprobes, imply maintaining an external set of
    instrumentation that must be adapted to each kernel version.
    Although it may make sense for distributions, it is not well suited
    for kernel developers, since they rarely work on a major
    distribution image.
2 - kprobes, although being a very good attempt at providing a dynamic
    hooking mechanism that has no impact when disabled, suffers from
    important limitations :
  a - It cannot access local variables of a function at a particular
      point within its body that will be consistent thorough the kernel
      versions without involving a lot of recurrent hair-pulling.
  b - Kprobes is slow, since it involves going though a trap each time
      a probe site is executed. Even though the djprobes project made a
      good effort to make things faster, it cannot currently instrument
      fully-preemptible kernels and does not solve (1), (2a) and (2c).
  c - On the reentrancy side, going though a trap (thus playing with
      interrupt enable/disable) and taking spinlocks are not suited to
      some code paths, i.e. :
      kernel/lockdep.c, printk (within the lockdep_on()/lockdep_off()).
      It must be understood that some code paths interesting for
      instrumentation often present a particular reentrancy challenge.

Some more details :

The probe callback connection to its markers is done dynamically.  A predicted
branch (see the immediate values infrastructure) is used to skip the hook stack
setup and function call when the marker is "disabled" (no probe is connected).
Further optimizations can be implemented for each architecture to make this
branch faster.

Instrumentation of a subsystem becomes therefore a straightforward task.  One
has to add instrumentation within the key locations of the kernel code in the
following form :

trace_mark(subsystem_event, "%d %p", myint, myptr);


Jim Keniston <jkenisto@us.ibm.com> adds:

kprobes remains a vital foundation for SystemTap.  But markers are attactive
as an alternate source of trace/debug info.  Here's why:

1. Markers will live in the kernel and presumably be kept up to date by
   the maintainers of the enclosing code.  We have a growing set of tapsets
   (probe libraries), each of which "knows" the source code for a certain area
   of the kernel.  Whenever the underlying kernel code changes (e.g., a
   function or one of its args disappears or is renamed), there's a chance
   that the tapset will become invalid until we bring it back in sync with the
   kernel.  As you can imagine, maintaining tapsets separate from the kernel
   source is a maintenance headache.  Markers could mitigate this.

2. Because the kernel code is highly optimized, the kernel's dwarf info
   doesn't always accurately reflect which variables have which values on
   which lines (sometimes even upon entry to a function).  A marker is a way
   to ensure that values of interest are available to SystemTap at marked
   points.

3. Sometimes the overhead of a kprobe probepoint is too much (either in
   terms of time or locking) for the particular hotspot we want to probe.


In OLS2006 proceedings, vol. 1
http://www.linuxsymposium.org/2006/linuxsymposium_procv1.pdf

Frank C. Eigler, from SystemTAP, presents its "static probing markers"
(pp. 261-268) in his paper "Problem Solving With Systemtap".

He explains the advantages :

"In exchange for this effort, systemtap marker-based probes are faster and
 more precise than kprobes.  The better precision comes from not having to
 covet the compiler's favours.  Such fickle favours include retaining
 clean boundaries in the instruction stream between interesting statements,
 and precisely describing positions of variables in the stack frame.  Since
 markers don't rely on debugging information, neither favour is required,
 and the compiler can channel its charms into unabated optimization.  The
 speed advantage comes from using direct call instructions rather than int 3
 breakpoints to dispatch to the systemtap handlers.  We will see below just
 how big a difference this makes."

He does a comparison of his "simple" marker solution with kprobes (his simple
solution looks like my generic markers, but with a major race condition).  I
also posted numbers about the markers performance impact a few months ago in
the initial thread.  I can dig into my emails to find them for you if you
consider it important for the Changelog.

He concludes with :

"To the extent that is true, we propose that these groups consider using a
 shared pool of static markers as the basic kernel-side instrumentation
 mechanism.  If they prove to have as low dormant cost and as high active
 performance as initial experience suggests, perhaps this could motivate the
 various tracing efforts and kernel subsystem developers to finally join
 forces.  Let's designate standard trace/probe points once and for all. 
 Tracing backends can attach to these markers the same way systemtap would. 
 There would be no need for them to maintain kernel patches any more. 
 Let's think about it."


This patch:

Add Kconfig menus for the marker code.

[bunk@stusta.de: Never ever select MODULES]
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Adrian Bunk <bunk@stusta.de>
---

 arch/alpha/Kconfig       |    2 ++
 arch/arm/Kconfig         |    2 ++
 arch/arm26/Kconfig       |    2 ++
 arch/avr32/Kconfig.debug |    7 +++++++
 arch/cris/Kconfig        |    2 ++
 arch/frv/Kconfig         |    2 ++
 arch/h8300/Kconfig       |    2 ++
 arch/i386/Kconfig        |    2 ++
 arch/ia64/Kconfig        |    2 ++
 arch/m32r/Kconfig        |    2 ++
 arch/m68k/Kconfig        |    2 ++
 arch/m68knommu/Kconfig   |    2 ++
 arch/mips/Kconfig        |    2 ++
 arch/parisc/Kconfig      |    2 ++
 arch/powerpc/Kconfig     |    2 ++
 arch/ppc/Kconfig         |    2 ++
 arch/s390/Kconfig        |    2 ++
 arch/sh/Kconfig          |    2 ++
 arch/sh64/Kconfig        |    2 ++
 arch/sparc/Kconfig       |    2 ++
 arch/sparc64/Kconfig     |    2 ++
 arch/um/Kconfig          |    2 ++
 arch/v850/Kconfig        |    2 ++
 arch/x86_64/Kconfig      |    2 ++
 arch/xtensa/Kconfig      |    2 ++
 kernel/Kconfig.marker    |    7 +++++++
 26 files changed, 62 insertions(+)

Index: linux-2.6-lttng/arch/alpha/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/alpha/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/alpha/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -657,6 +657,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/alpha/Kconfig.debug"
Index: linux-2.6-lttng/arch/arm/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/arm/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/arm/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -1050,6 +1050,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/arm/Kconfig.debug"
Index: linux-2.6-lttng/arch/arm26/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/arm26/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/arm26/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -248,6 +248,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/arm26/Kconfig.debug"
Index: linux-2.6-lttng/arch/cris/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/cris/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/cris/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -202,6 +202,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/cris/Kconfig.debug"
Index: linux-2.6-lttng/arch/frv/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/frv/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/frv/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -379,6 +379,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/frv/Kconfig.debug"
Index: linux-2.6-lttng/arch/h8300/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/h8300/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/h8300/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -227,6 +227,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/h8300/Kconfig.debug"
Index: linux-2.6-lttng/arch/i386/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/i386/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/i386/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -1237,6 +1237,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endif # INSTRUMENTATION
 
 source "arch/i386/Kconfig.debug"
Index: linux-2.6-lttng/arch/ia64/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/ia64/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/ia64/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -596,6 +596,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/ia64/Kconfig.debug"
Index: linux-2.6-lttng/arch/m32r/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/m32r/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/m32r/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -412,6 +412,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/m32r/Kconfig.debug"
Index: linux-2.6-lttng/arch/m68k/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/m68k/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/m68k/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -683,6 +683,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/m68k/Kconfig.debug"
Index: linux-2.6-lttng/arch/m68knommu/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/m68knommu/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/m68knommu/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -672,6 +672,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/m68knommu/Kconfig.debug"
Index: linux-2.6-lttng/arch/mips/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/mips/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/mips/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -1961,6 +1961,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/mips/Kconfig.debug"
Index: linux-2.6-lttng/arch/parisc/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/parisc/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/parisc/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -273,6 +273,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/parisc/Kconfig.debug"
Index: linux-2.6-lttng/arch/powerpc/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/powerpc/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/powerpc/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -908,6 +908,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/powerpc/Kconfig.debug"
Index: linux-2.6-lttng/arch/ppc/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/ppc/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/ppc/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -1457,6 +1457,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/ppc/Kconfig.debug"
Index: linux-2.6-lttng/arch/s390/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/s390/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/s390/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -549,6 +549,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/s390/Kconfig.debug"
Index: linux-2.6-lttng/arch/sh/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/sh/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/sh/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -725,6 +725,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/sh/Kconfig.debug"
Index: linux-2.6-lttng/arch/sh64/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/sh64/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/sh64/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -288,6 +288,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/sh64/Kconfig.debug"
Index: linux-2.6-lttng/arch/sparc/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/sparc/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/sparc/Kconfig	2007-06-15 16:14:10.000000000 -0400
@@ -315,6 +315,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/sparc/Kconfig.debug"
Index: linux-2.6-lttng/arch/sparc64/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/sparc64/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/sparc64/Kconfig	2007-06-15 16:14:11.000000000 -0400
@@ -441,6 +441,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/sparc64/Kconfig.debug"
Index: linux-2.6-lttng/arch/um/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/um/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/um/Kconfig	2007-06-15 16:14:11.000000000 -0400
@@ -339,6 +339,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/um/Kconfig.debug"
Index: linux-2.6-lttng/arch/v850/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/v850/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/v850/Kconfig	2007-06-15 16:14:11.000000000 -0400
@@ -335,6 +335,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/v850/Kconfig.debug"
Index: linux-2.6-lttng/arch/x86_64/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/x86_64/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/x86_64/Kconfig	2007-06-15 16:14:11.000000000 -0400
@@ -795,6 +795,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/x86_64/Kconfig.debug"
Index: linux-2.6-lttng/arch/xtensa/Kconfig
===================================================================
--- linux-2.6-lttng.orig/arch/xtensa/Kconfig	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/xtensa/Kconfig	2007-06-15 16:14:11.000000000 -0400
@@ -255,6 +255,8 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
 endmenu
 
 source "arch/xtensa/Kconfig.debug"
Index: linux-2.6-lttng/kernel/Kconfig.marker
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/kernel/Kconfig.marker	2007-06-15 16:14:11.000000000 -0400
@@ -0,0 +1,7 @@
+# Code markers configuration
+
+config MARKERS
+	bool "Activate markers"
+	help
+	  Place an empty function call at each marker site. Can be
+	  dynamically changed for a probe function.
Index: linux-2.6-lttng/arch/avr32/Kconfig.debug
===================================================================
--- linux-2.6-lttng.orig/arch/avr32/Kconfig.debug	2007-06-15 16:14:02.000000000 -0400
+++ linux-2.6-lttng/arch/avr32/Kconfig.debug	2007-06-15 16:14:11.000000000 -0400
@@ -9,6 +9,9 @@
 menu "Instrumentation Support"
 	depends on EXPERIMENTAL
 
+menu "Instrumentation Support"
+	depends on EXPERIMENTAL
+
 config KPROBES
 	bool "Kprobes"
 	depends on DEBUG_KERNEL
@@ -21,6 +24,10 @@
 
 source "kernel/Kconfig.immediate"
 
+source "kernel/Kconfig.marker"
+
+endmenu
+
 endmenu
 
 endmenu

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 3/4] Linux Kernel Markers - Documentation
  2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 1/4] Linux Kernel Markers, architecture independent code Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 2/4] Linux Kernel Markers - Add kconfig menus for the marker code Mathieu Desnoyers
@ 2007-07-03 17:08 ` Mathieu Desnoyers
  2007-07-03 17:08 ` [patch 4/4] Port of blktrace to the Linux Kernel Markers Mathieu Desnoyers
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-03 17:08 UTC (permalink / raw)
  To: akpm, Christoph Hellwig, linux-kernel; +Cc: Mathieu Desnoyers

[-- Attachment #1: linux-kernel-markers-documentation.patch --]
[-- Type: text/plain, Size: 7974 bytes --]

Here is some documentation explaining what is/how to use the Linux
Kernel Markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
---

 Documentation/marker.txt |  249 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 249 insertions(+)

Index: linux-2.6-lttng/Documentation/marker.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/Documentation/marker.txt	2007-06-15 16:14:12.000000000 -0400
@@ -0,0 +1,249 @@
+ 	             Using the Linux Kernel Markers
+
+			    Mathieu Desnoyers
+
+
+This document introduces Linux Kernel Markers and their use. It provides
+examples of how to insert markers in the kernel and connect probe functions to
+them and provides some examples of probe functions.
+
+
+* Purpose of markers
+
+A marker placed in your code provides a hook to call a function (probe) that
+you can provide at runtime. A marker can be "on" (a probe is connected to it)
+or "off" (no probe is attached). When a marker is "off" it has no
+effect. When a marker is "on", the function you provide is called each
+time the marker is executed, in the execution context of the
+caller. When the function provided ends its execution, it returns to the
+caller (continuing from the marker site).
+
+You can put markers at important locations in the code. Markers are
+lightweight hooks that can pass an arbitrary number of parameters,
+described in a printk-like format string, to the attached probe function.
+
+They can be used for tracing and performance accounting.
+
+
+* Usage
+
+In order to use the macro trace_mark, you should include linux/marker.h.
+
+#include <linux/marker.h>
+
+Add, in your code :
+
+trace_mark(subsystem_event, "%d %s", someint, somestring);
+Where :
+- subsystem_event is an identifier unique to your event
+    - subsystem is the name of your subsystem.
+    - event is the name of the event to mark.
+- "%d %s" is the formatted string for the serializer.
+- someint is an integer.
+- somestring is a char pointer.
+
+Connecting a function (probe) to a marker is done by providing a probe
+(function to call) for the specific marker through marker_arm_probe(). It will
+automatically connect the function and enable the marker site. Removing a probe
+is done through marker_disarm_probe(). Probe removal is preempt-safe because
+preemption is disabled around the probe call. See the "Probe example" section
+below for a sample probe module.
+
+The marker mechanism supports inserting multiple instances of the same marker.
+Markers can be put in inline functions, inlined static functions, and
+unrolled loops.
+
+Note: It is safe to put markers within preempt-safe code : preempt_enable()
+will not call the scheduler due to the tests in preempt_schedule().
+
+
+* Optimization for a given architecture
+
+One can implement optimized markers for a given architecture by replacing
+asm-$ARCH/marker.h.
+
+The IF_* flags can be used to control the type of marker. See the
+include/linux/immediate.h header for the list of flags. They can be specified as
+the first parameter of the _trace_mark() macro, as in the following example,
+which is safe with respect to lockdep.c (useful for marking lockdep.c and printk
+functions).
+
+_trace_mark(IF_DEFAULT | ~IF_LOCKDEP, subsystem_eventb, MARK_NOARGS);
+
+Flag compatibility is checked before connecting the probe to the marker: the
+right flags must be given to _marker_arm_probe().
+
+
+* Probe example
+
+You can build the kernel modules, probe-example.ko and marker-example.ko,
+using the following Makefile:
+------------------------------ CUT -------------------------------------
+obj-m := probe-example.o marker-example.o
+KDIR := /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+default:
+	$(MAKE) -C $(KDIR) SUBDIRS=$(PWD) modules
+clean:
+	rm -f *.mod.c *.ko *.o
+------------------------------ CUT -------------------------------------
+/* probe-example.c
+ *
+ * Connects two functions to marker call sites.
+ *
+ * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <asm/atomic.h>
+
+struct probe_data {
+	const char *name;
+	const char *format;
+	marker_probe_func *probe_func;
+};
+
+void probe_subsystem_event(const struct __mark_marker_c *mdata,
+		const char *format, ...)
+{
+	va_list ap;
+	/* Declare args */
+	unsigned int value;
+	const char *mystr;
+
+	/* Assign args */
+	va_start(ap, format);
+	value = va_arg(ap, typeof(value));
+	mystr = va_arg(ap, typeof(mystr));
+
+	/* Call printk */
+	printk("Value %u, string %s\n", value, mystr);
+
+	/* or count, check rights, serialize data in a buffer */
+
+	va_end(ap);
+}
+
+atomic_t eventb_count = ATOMIC_INIT(0);
+
+void probe_subsystem_eventb(const struct __mark_marker_c *mdata,
+	const char *format, ...)
+{
+	/* Increment counter */
+	atomic_inc(&eventb_count);
+}
+
+static struct probe_data probe_array[] =
+{
+	{	.name = "subsystem_event",
+		.format = "%d %s",
+		.probe_func = probe_subsystem_event },
+	{	.name = "subsystem_eventb",
+		.format = MARK_NOARGS,
+		.probe_func = probe_subsystem_eventb },
+};
+
+static int __init probe_init(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_arm_probe(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].probe_func, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO "Unable to register probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+static void __exit probe_fini(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		marker_disarm_probe(probe_array[i].name);
+	}
+	printk("Number of event b : %u\n", atomic_read(&eventb_count));
+}
+
+module_init(probe_init);
+module_exit(probe_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("SUBSYSTEM Probe");
+------------------------------ CUT -------------------------------------
+/* marker-example.c
+ *
+ * Executes a marker when /proc/marker-example is opened.
+ *
+ * (C) Copyright 2007 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+
+struct proc_dir_entry *pentry_example = NULL;
+
+static int my_open(struct inode *inode, struct file *file)
+{
+	int i;
+
+	trace_mark(subsystem_event, "%d %s", 123, "example string");
+	for (i=0; i<10; i++) {
+		trace_mark(subsystem_eventb, MARK_NOARGS);
+	}
+	return -EPERM;
+}
+
+static struct file_operations mark_ops = {
+	.open = my_open,
+};
+
+static int example_init(void)
+{
+	printk(KERN_ALERT "example init\n");
+	pentry_example = create_proc_entry("marker-example", 0444, NULL);
+	if (pentry_example)
+		pentry_example->proc_fops = &mark_ops;
+	else
+		return -EPERM;
+	return 0;
+}
+
+static void example_exit(void)
+{
+	printk(KERN_ALERT "example exit\n");
+	remove_proc_entry("marker-example", NULL);
+}
+
+module_init(example_init)
+module_exit(example_exit)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Mathieu Desnoyers");
+MODULE_DESCRIPTION("Linux Trace Toolkit example");
+------------------------------ CUT -------------------------------------
+Sequence of operations : (as root)
+make
+insmod marker-example.ko
+insmod probe-example.ko
+  (it is important to load the probe after the marked code)
+cat /proc/marker-example (returns an expected error)
+rmmod marker-example probe-example
+dmesg
+------------------------------ CUT -------------------------------------

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
                   ` (2 preceding siblings ...)
  2007-07-03 17:08 ` [patch 3/4] Linux Kernel Markers - Documentation Mathieu Desnoyers
@ 2007-07-03 17:08 ` Mathieu Desnoyers
  2007-07-03 18:01 ` [patch 0/4] " Mathieu Desnoyers
  2007-07-05  2:00 ` Frank Ch. Eigler
  5 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-03 17:08 UTC (permalink / raw)
  To: akpm, Christoph Hellwig, linux-kernel; +Cc: Mathieu Desnoyers, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 23858 bytes --]

Here is a proof of concept patch, for demonstration purpose, of moving
blktrace to the markers.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, every devices have to
fall into the tracing function call. This is slower than the previous
inline function which tested the condition quickly. If it becomes a
show stopper, it could be fixed by having the possibility to test a
supplementary condition, dependant of the marker context, at the marker
site, just after the enable/disable test.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it a proof a
concept.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  281 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   27 ++--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 +-
 fs/bio.c                     |    4 
 include/linux/blktrace_api.h |  146 +---------------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 322 insertions(+), 167 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-06-15 16:14:14.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -547,7 +547,7 @@
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -726,7 +726,7 @@
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-06-15 16:14:14.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -1551,7 +1552,7 @@
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1617,7 +1618,7 @@
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1628,7 +1629,7 @@
 {
 	request_queue_t *q = container_of(work, request_queue_t, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1638,7 +1639,7 @@
 {
 	request_queue_t *q = (request_queue_t *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2150,7 +2151,7 @@
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2180,7 +2181,7 @@
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2254,7 +2255,7 @@
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -2940,7 +2941,7 @@
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -2957,7 +2958,7 @@
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3187,10 +3188,10 @@
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %u %llu %llu", q, bio, old_dev,
+					(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3383,7 +3384,7 @@
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-06-15 16:14:14.000000000 -0400
@@ -32,6 +32,7 @@
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-06-15 16:14:14.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(request_queue_t *q)
@@ -386,6 +398,11 @@
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,269 @@
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	__be64 rpdu;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %d", BLK_TA_SPLIT,
+		blk_add_trace_pdu_int },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %u %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_arm_probe(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		marker_disarm_probe(probe_array[i].name);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-06-15 16:14:14.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,149 +143,24 @@
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(request_queue_t *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
-
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
+extern int blk_probe_connect(void);
+extern void blk_probe_disconnect(void);
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-06-15 16:14:14.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-06-15 16:14:14.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-06-15 16:14:14.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1081,7 +1081,7 @@
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+	trace_mark(blk_pdu_split, "%p %p %d", bdev_get_queue(bi->bi_bdev), bi,
 				bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-06-15 16:14:14.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2502,7 +2502,7 @@
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-06-15 16:13:49.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-06-15 16:14:14.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -483,8 +483,8 @@
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -580,10 +580,10 @@
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector,
-				    clone->bi_sector);
+		trace_mark(blk_remap, "%p %p %u %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/4] Linux Kernel Markers
  2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
                   ` (3 preceding siblings ...)
  2007-07-03 17:08 ` [patch 4/4] Port of blktrace to the Linux Kernel Markers Mathieu Desnoyers
@ 2007-07-03 18:01 ` Mathieu Desnoyers
  2007-07-05  2:00 ` Frank Ch. Eigler
  5 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-03 18:01 UTC (permalink / raw)
  To: akpm, Christoph Hellwig, linux-kernel

Please note that this release will apply on 2.6.22-rc6-mm1 and depends
on the immediate values patch.

* Mathieu Desnoyers (mathieu.desnoyers@polymtl.ca) wrote:
> Hi,
> 
> This updated version of the Linux Kernel Markers mostly adds a unique 16 bits
> per marker ID and a per-probe marker group.
> 
> Christoph, I think the only concern that I do not plan to address immediately is
> to provide a complet in-kernel user of the markers (blktrace patch does not
> actually use the markers full potential). I have external patches that provides
> that, but I don't want to send too much patches at once. Between providing a
> complete marker/tracer stack and sending small incremental patches, I think the
> latter is the choice the better suited. This is however an uneasy problem, which
> looks very much like the chicken and egg problem. :)
> 
> If you have concerns with what I recently added to the markers, or if you still
> strongly feel that I must also send the following patches right away, please let
> me know.
> 
> Mathieu
> 
> -- 
> Mathieu Desnoyers
> Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
> OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/4] Linux Kernel Markers
  2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
                   ` (4 preceding siblings ...)
  2007-07-03 18:01 ` [patch 0/4] " Mathieu Desnoyers
@ 2007-07-05  2:00 ` Frank Ch. Eigler
  2007-07-11 21:43   ` Mathieu Desnoyers
  5 siblings, 1 reply; 18+ messages in thread
From: Frank Ch. Eigler @ 2007-07-05  2:00 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: akpm, Christoph Hellwig, linux-kernel

Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:

> This updated version of the Linux Kernel Markers mostly adds a unique 16 bits
> per marker ID and a per-probe marker group. [...]

Could you motivate this part better?  It is not covered in the
documentation patch.

It seems to be a way of having a marker handling (callback) module
give alternate names/ids to markers.  If so, why, considering that
there is already a private void* callback parameter available to pass
data back to itself through?

Also, what if different marker handling modules want to set different
id/group numbers on the same set of markers?

- FChE

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 0/4] Linux Kernel Markers
  2007-07-05  2:00 ` Frank Ch. Eigler
@ 2007-07-11 21:43   ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-11 21:43 UTC (permalink / raw)
  To: Frank Ch. Eigler; +Cc: akpm, Christoph Hellwig, linux-kernel

* Frank Ch. Eigler (fche@redhat.com) wrote:
> Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca> writes:
> 
> > This updated version of the Linux Kernel Markers mostly adds a unique 16 bits
> > per marker ID and a per-probe marker group. [...]
> 

Hello,

> Could you motivate this part better?  It is not covered in the
> documentation patch.
> 
> It seems to be a way of having a marker handling (callback) module
> give alternate names/ids to markers.  If so, why, considering that
> there is already a private void* callback parameter available to pass
> data back to itself through?
> 

The original reason was to get rid of a supplementary kmalloc() for each
active marker. However, I just noticed that I could pack my private data
in a slab cache, which makes the problem go away. I am therefore
removing IDs and groups from the markers.. they don't really belong to
this low-level infrastructure anyway, so this is all better.

> Also, what if different marker handling modules want to set different
> id/group numbers on the same set of markers?
> 

The way I see things now is to provide the simplest way to do the job,
without over-design. Clearly, putting the IDs and groups there was not
the best idea. I also think it will be up to a "tee" callback module to
implement a list of handlers (notifiers). However, supporting such a
list of handlers should not be a requirement for the low-level markers,
since has a significant performance impact which can be unwanted in the
common case (only one probe connected to a marker).

Mathieu

> - FChE

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-09-21  1:03   ` Steven Rostedt
@ 2007-09-21 13:46     ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-09-21 13:46 UTC (permalink / raw)
  To: Steven Rostedt; +Cc: akpm, linux-kernel, Frank Ch. Eigler, Jens Axboe

* Steven Rostedt (rostedt@goodmis.org) wrote:
> On Tue, Sep 18, 2007 at 05:13:28PM -0400, Mathieu Desnoyers wrote:
> > +void blk_probe_disarm(void)
> > +{
> > +	int i, err;
> > +
> > +	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
> > +		err = marker_disarm(probe_array[i].name);
> > +		BUG_ON(err);
> > +		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
> > +		BUG_ON(err);
> > +	}
> > +}
> 
> As well as changing these to WARN_ON.
> 
Yep.

> -- Steve
> 

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-09-18 21:13 ` [patch 4/4] Port of blktrace to the Linux Kernel Markers Mathieu Desnoyers
@ 2007-09-21  1:03   ` Steven Rostedt
  2007-09-21 13:46     ` Mathieu Desnoyers
  0 siblings, 1 reply; 18+ messages in thread
From: Steven Rostedt @ 2007-09-21  1:03 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: akpm, linux-kernel, Frank Ch. Eigler, Jens Axboe

On Tue, Sep 18, 2007 at 05:13:28PM -0400, Mathieu Desnoyers wrote:
> +void blk_probe_disarm(void)
> +{
> +	int i, err;
> +
> +	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
> +		err = marker_disarm(probe_array[i].name);
> +		BUG_ON(err);
> +		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
> +		BUG_ON(err);
> +	}
> +}

As well as changing these to WARN_ON.

-- Steve


^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-09-18 21:13 [patch 0/4] Linux Kernel Markers for 2.6.23-rc6-mm1 Mathieu Desnoyers
@ 2007-09-18 21:13 ` Mathieu Desnoyers
  2007-09-21  1:03   ` Steven Rostedt
  0 siblings, 1 reply; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-09-18 21:13 UTC (permalink / raw)
  To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Frank Ch. Eigler, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 27084 bytes --]

Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running when
blktrace is not active.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function which tested the condition
quickly.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it as a step towards the
full improvements that could bring the markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: "Frank Ch. Eigler" <fche@redhat.com>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  343 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   35 ++--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 -
 fs/bio.c                     |    6 
 include/linux/blktrace_api.h |  145 +-----------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 388 insertions(+), 172 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-09-18 10:08:11.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-09-18 13:18:26.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -548,7 +548,7 @@ void elv_insert(struct request_queue *q,
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -735,7 +735,7 @@ struct request *elv_next_request(struct 
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-09-18 10:09:51.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-09-18 13:18:26.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 #include <linux/scatterlist.h>
@@ -1570,7 +1571,7 @@ void blk_plug_device(struct request_queu
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1636,7 +1637,7 @@ static void blk_backing_dev_unplug(struc
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1648,7 +1649,7 @@ static void blk_unplug_work(struct work_
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1658,7 +1659,7 @@ static void blk_unplug_timeout(unsigned 
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2178,7 +2179,7 @@ rq_starved:
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2208,7 +2209,7 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2282,7 +2283,7 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -3005,7 +3006,7 @@ static int __make_request(struct request
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -3022,7 +3023,7 @@ static int __make_request(struct request
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3105,9 +3106,10 @@ static inline void blk_partition_remap(s
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev, bio->bi_sector,
-				    bio->bi_sector - p->start_sect);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+				    bdev_get_queue(bio->bi_bdev), bio,
+				    (u64)bdev->bd_dev, (u64)bio->bi_sector,
+				    (u64)bio->bi_sector - p->start_sect);
 	}
 }
 
@@ -3272,10 +3274,11 @@ end_io:
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %llu %llu %llu",
+				q, bio, (u64)old_dev,
+				(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3464,7 +3467,7 @@ static int __end_that_request_first(stru
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-09-18 10:04:27.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-09-18 13:18:26.000000000 -0400
@@ -32,6 +32,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-09-18 10:08:31.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-09-18 13:18:26.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@ void __blk_add_trace(struct blk_trace *b
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@ static void blk_trace_cleanup(struct blk
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(struct request_queue *q)
@@ -386,6 +398,11 @@ static int blk_trace_setup(struct reques
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,331 @@ static void blk_trace_set_ht_offsets(voi
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", NULL, q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with any integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+					u32 what)
+{
+	struct blk_trace *bt;
+	__be64 rpdu;
+
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+					!bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with an long long integer
+ * payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ * Description:
+ *     Adds a trace with some long long integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given as the
+ *     payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned long long pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned long long);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+		blk_add_trace_pdu_ll },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_probe_register(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+		result = marker_arm(probe_array[i].name);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to arm probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_disarm(probe_array[i].name);
+		BUG_ON(err);
+		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+		BUG_ON(err);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-09-18 10:05:15.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-09-18 13:18:26.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,150 +143,22 @@ struct blk_user_trace_setup {
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-09-18 10:08:12.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-09-18 13:18:26.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@ static void __blk_queue_bounce(struct re
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-09-18 10:05:23.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-09-18 13:18:26.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-09-18 10:08:12.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-09-18 13:18:26.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1072,8 +1072,8 @@ struct bio_pair *bio_split(struct bio *b
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
-				bi->bi_sector + first_sectors);
+	trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+				(u64)bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-09-18 10:09:12.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-09-18 13:18:26.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2545,7 +2545,7 @@ after_error_processing:
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-09-18 10:08:11.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-09-18 13:18:26.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -481,8 +481,8 @@ static void dec_pending(struct dm_io *io
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -578,10 +578,10 @@ static void __map_bio(struct dm_target *
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-09-17 18:46 [patch 0/4] " Mathieu Desnoyers
@ 2007-09-17 18:46 ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-09-17 18:46 UTC (permalink / raw)
  To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Frank Ch. Eigler, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 27084 bytes --]

Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running when
blktrace is not active.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function which tested the condition
quickly.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it as a step towards the
full improvements that could bring the markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: "Frank Ch. Eigler" <fche@redhat.com>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  343 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   35 ++--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 -
 fs/bio.c                     |    6 
 include/linux/blktrace_api.h |  145 +-----------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 388 insertions(+), 172 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-09-17 14:03:12.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -548,7 +548,7 @@ void elv_insert(struct request_queue *q,
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -735,7 +735,7 @@ struct request *elv_next_request(struct 
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-09-17 14:03:12.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 #include <linux/scatterlist.h>
@@ -1559,7 +1560,7 @@ void blk_plug_device(struct request_queu
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1625,7 +1626,7 @@ static void blk_backing_dev_unplug(struc
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1637,7 +1638,7 @@ static void blk_unplug_work(struct work_
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1647,7 +1648,7 @@ static void blk_unplug_timeout(unsigned 
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2160,7 +2161,7 @@ rq_starved:
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2190,7 +2191,7 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2264,7 +2265,7 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -2987,7 +2988,7 @@ static int __make_request(struct request
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -3004,7 +3005,7 @@ static int __make_request(struct request
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3087,9 +3088,10 @@ static inline void blk_partition_remap(s
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev, bio->bi_sector,
-				    bio->bi_sector - p->start_sect);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+				    bdev_get_queue(bio->bi_bdev), bio,
+				    (u64)bdev->bd_dev, (u64)bio->bi_sector,
+				    (u64)bio->bi_sector - p->start_sect);
 	}
 }
 
@@ -3254,10 +3256,11 @@ end_io:
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %llu %llu %llu",
+				q, bio, (u64)old_dev,
+				(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3446,7 +3449,7 @@ static int __end_that_request_first(stru
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-09-17 14:03:12.000000000 -0400
@@ -32,6 +32,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-09-17 14:03:12.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@ void __blk_add_trace(struct blk_trace *b
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@ static void blk_trace_cleanup(struct blk
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(struct request_queue *q)
@@ -386,6 +398,11 @@ static int blk_trace_setup(struct reques
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,331 @@ static void blk_trace_set_ht_offsets(voi
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", NULL, q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with any integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+					u32 what)
+{
+	struct blk_trace *bt;
+	__be64 rpdu;
+
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+					!bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with an long long integer
+ * payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ * Description:
+ *     Adds a trace with some long long integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given as the
+ *     payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned long long pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned long long);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+		blk_add_trace_pdu_ll },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_probe_register(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+		result = marker_arm(probe_array[i].name);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to arm probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_disarm(probe_array[i].name);
+		BUG_ON(err);
+		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+		BUG_ON(err);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-09-17 14:03:12.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,150 +143,22 @@ struct blk_user_trace_setup {
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-09-17 14:03:12.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@ static void __blk_queue_bounce(struct re
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-09-17 14:03:12.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-09-17 14:03:12.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1072,8 +1072,8 @@ struct bio_pair *bio_split(struct bio *b
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
-				bi->bi_sector + first_sectors);
+	trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+				(u64)bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-09-17 14:03:12.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2545,7 +2545,7 @@ after_error_processing:
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-09-17 14:03:12.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -481,8 +481,8 @@ static void dec_pending(struct dm_io *io
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -578,10 +578,10 @@ static void __map_bio(struct dm_target *
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-08-30 17:21   ` Christoph Hellwig
@ 2007-08-30 18:37     ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-08-30 18:37 UTC (permalink / raw)
  To: Christoph Hellwig, akpm, linux-kernel, Jens Axboe

* Christoph Hellwig (hch@infradead.org) wrote:
> On Mon, Aug 27, 2007 at 12:05:44PM -0400, Mathieu Desnoyers wrote:
> > Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
> > advantage of this port is that it minimizes the impact on the running when
> > blktrace is not active.
> > 
> > A few remarks : this patch has the positive effect of removing some code
> > from the block io tracing hot paths, minimizing the i-cache impact in a
> > system where the io tracing is compiled in but inactive.
> > 
> > It also moves the blk tracing code from a header (and therefore from the
> > body of the instrumented functions) to a separate C file.
> > 
> > There, as soon as one device has to be traced, all devices have to
> > execute the tracing function call when they pass by the instrumentation site.
> > This is slower than the previous inline function which tested the condition
> > quickly.
> > 
> > It does not make the code smaller, since I left all the specialized
> > tracing functions for requests, bio, generic, remap, which would go away
> > once a generic infrastructure is in place to serialize the information
> > passed to the marker. This is mostly why I consider it as a step towards the
> > full improvements that could bring the markers.
> 
> I like this as it moves the whole tracing code out of line.  It would
> be nice if we could make blktrace a module with this, but we'd need
> to change the interface away from an ioctl on the block device for that.
> 
> Btw, something that really shows here and what I noticed in my sputrace
> aswell is that there is a lot of boilerplate code due to the varargs
> trace handlers.  We really need some way to auto-generate the boilerplate
> for the trace function to avoid coding this up everywhere.

Or we can use a vprintk-like function to parse the format string and
serialize the information into trace buffers. I prefer this latter
option because, overall, it will localize the probes in a few bytes of
functions instead of duplicating the memory and instruction cache
required by multiple serializing functions.

I have the code ready, but I do not want to flood LKML with patches
neither....

Mathieu


-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-08-27 16:05 ` [patch 4/4] Port of blktrace to the " Mathieu Desnoyers
@ 2007-08-30 17:21   ` Christoph Hellwig
  2007-08-30 18:37     ` Mathieu Desnoyers
  0 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2007-08-30 17:21 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: akpm, linux-kernel, Jens Axboe

On Mon, Aug 27, 2007 at 12:05:44PM -0400, Mathieu Desnoyers wrote:
> Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
> advantage of this port is that it minimizes the impact on the running when
> blktrace is not active.
> 
> A few remarks : this patch has the positive effect of removing some code
> from the block io tracing hot paths, minimizing the i-cache impact in a
> system where the io tracing is compiled in but inactive.
> 
> It also moves the blk tracing code from a header (and therefore from the
> body of the instrumented functions) to a separate C file.
> 
> There, as soon as one device has to be traced, all devices have to
> execute the tracing function call when they pass by the instrumentation site.
> This is slower than the previous inline function which tested the condition
> quickly.
> 
> It does not make the code smaller, since I left all the specialized
> tracing functions for requests, bio, generic, remap, which would go away
> once a generic infrastructure is in place to serialize the information
> passed to the marker. This is mostly why I consider it as a step towards the
> full improvements that could bring the markers.

I like this as it moves the whole tracing code out of line.  It would
be nice if we could make blktrace a module with this, but we'd need
to change the interface away from an ioctl on the block device for that.

Btw, something that really shows here and what I noticed in my sputrace
aswell is that there is a lot of boilerplate code due to the varargs
trace handlers.  We really need some way to auto-generate the boilerplate
for the trace function to avoid coding this up everywhere.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-08-27 16:05 [patch 0/4] " Mathieu Desnoyers
@ 2007-08-27 16:05 ` Mathieu Desnoyers
  2007-08-30 17:21   ` Christoph Hellwig
  0 siblings, 1 reply; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-08-27 16:05 UTC (permalink / raw)
  To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 27060 bytes --]

Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running when
blktrace is not active.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function which tested the condition
quickly.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it as a step towards the
full improvements that could bring the markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Reviewed-by: "Frank Ch. Eigler" <fche@redhat.com>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  343 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   35 ++--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 -
 fs/bio.c                     |    6 
 include/linux/blktrace_api.h |  145 +-----------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 388 insertions(+), 172 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-08-24 17:21:23.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-08-24 17:48:22.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -548,7 +548,7 @@ void elv_insert(struct request_queue *q,
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -727,7 +727,7 @@ struct request *elv_next_request(struct 
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-08-24 17:29:47.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-08-24 18:01:12.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -1555,7 +1556,7 @@ void blk_plug_device(struct request_queu
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1621,7 +1622,7 @@ static void blk_backing_dev_unplug(struc
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1633,7 +1634,7 @@ static void blk_unplug_work(struct work_
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1643,7 +1644,7 @@ static void blk_unplug_timeout(unsigned 
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2156,7 +2157,7 @@ rq_starved:
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2186,7 +2187,7 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2260,7 +2261,7 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -2948,7 +2949,7 @@ static int __make_request(struct request
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -2965,7 +2966,7 @@ static int __make_request(struct request
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3048,9 +3049,10 @@ static inline void blk_partition_remap(s
 		bio->bi_sector += p->start_sect;
 		bio->bi_bdev = bdev->bd_contains;
 
-		blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev, bio->bi_sector,
-				    bio->bi_sector - p->start_sect);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+				    bdev_get_queue(bio->bi_bdev), bio,
+				    (u64)bdev->bd_dev, (u64)bio->bi_sector,
+				    (u64)bio->bi_sector - p->start_sect);
 	}
 }
 
@@ -3199,10 +3201,11 @@ end_io:
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %llu %llu %llu",
+					q, bio, (u64)old_dev,
+					(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3395,7 +3398,7 @@ static int __end_that_request_first(stru
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-08-24 17:21:23.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-08-24 17:48:22.000000000 -0400
@@ -32,6 +32,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-08-24 17:21:23.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-08-24 17:52:25.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@ void __blk_add_trace(struct blk_trace *b
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@ static void blk_trace_cleanup(struct blk
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(struct request_queue *q)
@@ -386,6 +398,11 @@ static int blk_trace_setup(struct reques
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,331 @@ static void blk_trace_set_ht_offsets(voi
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", NULL, q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with any integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+					u32 what)
+{
+	struct blk_trace *bt;
+	__be64 rpdu;
+
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+					!bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with an long long integer
+ * payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ * Description:
+ *     Adds a trace with some long long integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given as the
+ *     payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned long long pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned long long);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+		blk_add_trace_pdu_ll },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_probe_register(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+		result = marker_arm(probe_array[i].name);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to arm probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_disarm(probe_array[i].name);
+		BUG_ON(err);
+		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+		BUG_ON(err);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-08-24 17:22:33.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-08-24 17:54:56.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,150 +143,22 @@ struct blk_user_trace_setup {
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-08-24 17:22:36.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-08-24 17:48:22.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@ static void __blk_queue_bounce(struct re
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-08-24 17:22:36.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-08-24 17:48:22.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-08-24 17:22:07.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-08-24 17:48:22.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1081,8 +1081,8 @@ struct bio_pair *bio_split(struct bio *b
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
-				bi->bi_sector + first_sectors);
+	trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+				(u64)bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-08-24 17:21:28.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-08-24 17:48:22.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2504,7 +2504,7 @@ after_error_processing:
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-08-24 17:28:48.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-08-24 17:49:50.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -481,8 +481,8 @@ static void dec_pending(struct dm_io *io
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -578,10 +578,10 @@ static void __map_bio(struct dm_target *
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-08-20 20:27 [patch 0/4] " Mathieu Desnoyers
@ 2007-08-20 20:27 ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-08-20 20:27 UTC (permalink / raw)
  To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 26427 bytes --]

Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running when
blktrace is not active.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function which tested the condition
quickly.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it as a step towards the
full improvements that could bring the markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  342 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   28 +--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 -
 fs/bio.c                     |    6 
 include/linux/blktrace_api.h |  144 +-----------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 383 insertions(+), 168 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-08-07 11:03:19.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-08-07 11:43:37.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -548,7 +548,7 @@ void elv_insert(struct request_queue *q,
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -727,7 +727,7 @@ struct request *elv_next_request(struct 
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-08-07 11:03:39.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-08-07 11:43:37.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -1555,7 +1556,7 @@ void blk_plug_device(struct request_queu
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1621,7 +1622,7 @@ static void blk_backing_dev_unplug(struc
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1633,7 +1634,7 @@ static void blk_unplug_work(struct work_
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1643,7 +1644,7 @@ static void blk_unplug_timeout(unsigned 
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2156,7 +2157,7 @@ rq_starved:
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2186,7 +2187,7 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2260,7 +2261,7 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -2948,7 +2949,7 @@ static int __make_request(struct request
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -2965,7 +2966,7 @@ static int __make_request(struct request
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3195,10 +3196,11 @@ end_io:
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %llu %llu %llu",
+					q, bio, (u64)old_dev,
+					(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3391,7 +3393,7 @@ static int __end_that_request_first(stru
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-08-07 11:03:19.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-08-07 11:43:37.000000000 -0400
@@ -32,6 +32,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-08-07 11:03:19.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-08-07 11:43:37.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@ void __blk_add_trace(struct blk_trace *b
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@ static void blk_trace_cleanup(struct blk
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(struct request_queue *q)
@@ -386,6 +398,11 @@ static int blk_trace_setup(struct reques
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,330 @@ static void blk_trace_set_ht_offsets(voi
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", NULL, q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with any integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+					u32 what)
+{
+	struct blk_trace *bt;
+	__be64 rpdu;
+
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+					!bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with an long long integer
+ * payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ * Description:
+ *     Adds a trace with some long long integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given as the
+ *     payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned long long pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned long long);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+		blk_add_trace_pdu_ll },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_probe_register(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+		result = marker_arm(probe_array[i].name);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to arm probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_disarm(probe_array[i].name);
+		BUG_ON(err);
+		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+		BUG_ON(err);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-08-07 11:03:21.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-08-07 11:59:41.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,149 +143,22 @@ struct blk_user_trace_setup {
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-08-07 11:03:21.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-08-07 11:43:37.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@ static void __blk_queue_bounce(struct re
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-08-07 11:01:24.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-08-07 11:43:37.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-08-07 11:03:21.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-08-07 11:43:37.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1081,8 +1081,8 @@ struct bio_pair *bio_split(struct bio *b
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
-				bi->bi_sector + first_sectors);
+	trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+				(u64)bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-08-07 11:03:33.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-08-07 11:43:37.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2504,7 +2504,7 @@ after_error_processing:
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-08-07 11:03:24.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-08-07 11:43:37.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -481,8 +481,8 @@ static void dec_pending(struct dm_io *io
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -578,10 +578,10 @@ static void __map_bio(struct dm_target *
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector,
-				    clone->bi_sector);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-08-12 15:10 [patch 0/4] " Mathieu Desnoyers
@ 2007-08-12 15:10 ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-08-12 15:10 UTC (permalink / raw)
  To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 26427 bytes --]

Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running when
blktrace is not active.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function which tested the condition
quickly.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it as a step towards the
full improvements that could bring the markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  342 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   28 +--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 -
 fs/bio.c                     |    6 
 include/linux/blktrace_api.h |  144 +-----------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 383 insertions(+), 168 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-08-07 11:03:19.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-08-07 11:43:37.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -548,7 +548,7 @@ void elv_insert(struct request_queue *q,
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -727,7 +727,7 @@ struct request *elv_next_request(struct 
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-08-07 11:03:39.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-08-07 11:43:37.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -1555,7 +1556,7 @@ void blk_plug_device(struct request_queu
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1621,7 +1622,7 @@ static void blk_backing_dev_unplug(struc
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1633,7 +1634,7 @@ static void blk_unplug_work(struct work_
 	struct request_queue *q =
 		container_of(work, struct request_queue, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1643,7 +1644,7 @@ static void blk_unplug_timeout(unsigned 
 {
 	struct request_queue *q = (struct request_queue *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2156,7 +2157,7 @@ rq_starved:
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2186,7 +2187,7 @@ static struct request *get_request_wait(
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2260,7 +2261,7 @@ EXPORT_SYMBOL(blk_start_queueing);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -2948,7 +2949,7 @@ static int __make_request(struct request
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -2965,7 +2966,7 @@ static int __make_request(struct request
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3195,10 +3196,11 @@ end_io:
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %llu %llu %llu",
+					q, bio, (u64)old_dev,
+					(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3391,7 +3393,7 @@ static int __end_that_request_first(stru
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-08-07 11:03:19.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-08-07 11:43:37.000000000 -0400
@@ -32,6 +32,7 @@ config BLK_DEV_IO_TRACE
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-08-07 11:03:19.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-08-07 11:43:37.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@ void __blk_add_trace(struct blk_trace *b
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@ static void blk_trace_cleanup(struct blk
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(struct request_queue *q)
@@ -386,6 +398,11 @@ static int blk_trace_setup(struct reques
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,330 @@ static void blk_trace_set_ht_offsets(voi
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", NULL, q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with any integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+					u32 what)
+{
+	struct blk_trace *bt;
+	__be64 rpdu;
+
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+					!bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with an long long integer
+ * payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ * Description:
+ *     Adds a trace with some long long integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given as the
+ *     payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned long long pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned long long);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	void *private_data, const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+		blk_add_trace_pdu_ll },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_probe_register(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+		result = marker_arm(probe_array[i].name);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to arm probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_disarm(probe_array[i].name);
+		BUG_ON(err);
+		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+		BUG_ON(err);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-08-07 11:03:21.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-08-07 11:59:41.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,149 +143,22 @@ struct blk_user_trace_setup {
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
 
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-08-07 11:03:21.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-08-07 11:43:37.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@ static void __blk_queue_bounce(struct re
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-08-07 11:01:24.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-08-07 11:43:37.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-08-07 11:03:21.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-08-07 11:43:37.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1081,8 +1081,8 @@ struct bio_pair *bio_split(struct bio *b
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
-				bi->bi_sector + first_sectors);
+	trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+				(u64)bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-08-07 11:03:33.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-08-07 11:43:37.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2504,7 +2504,7 @@ after_error_processing:
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-08-07 11:03:24.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-08-07 11:43:37.000000000 -0400
@@ -19,7 +19,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -481,8 +481,8 @@ static void dec_pending(struct dm_io *io
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -578,10 +578,10 @@ static void __map_bio(struct dm_target *
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector,
-				    clone->bi_sector);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [patch 4/4] Port of blktrace to the Linux Kernel Markers.
  2007-07-14  1:29 Mathieu Desnoyers
@ 2007-07-14  1:29 ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2007-07-14  1:29 UTC (permalink / raw)
  To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Jens Axboe

[-- Attachment #1: linux-kernel-markers-port-blktrace-to-markers.patch --]
[-- Type: text/plain, Size: 25473 bytes --]

Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running when
blktrace is not active.

A few remarks : this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.

It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.

There, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function which tested the condition
quickly.

It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it as a step towards the
full improvements that could bring the markers.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
CC: Jens Axboe <jens.axboe@oracle.com>
---

 block/Kconfig                |    1 
 block/blktrace.c             |  342 ++++++++++++++++++++++++++++++++++++++++++-
 block/elevator.c             |    6 
 block/ll_rw_blk.c            |   28 +--
 drivers/block/cciss.c        |    4 
 drivers/md/dm.c              |   14 -
 fs/bio.c                     |    6 
 include/linux/blktrace_api.h |  146 +-----------------
 mm/bounce.c                  |    4 
 mm/highmem.c                 |    2 
 10 files changed, 385 insertions(+), 168 deletions(-)

Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c	2007-07-13 17:34:05.000000000 -0400
@@ -32,7 +32,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/hash.h>
 
 #include <asm/uaccess.h>
@@ -547,7 +547,7 @@
 	unsigned ordseq;
 	int unplug_it = 1;
 
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+	trace_mark(blk_request_insert, "%p %p", q, rq);
 
 	rq->q = q;
 
@@ -726,7 +726,7 @@
 			 * not be passed by new incoming requests
 			 */
 			rq->cmd_flags |= REQ_STARTED;
-			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+			trace_mark(blk_request_issue, "%p %p", q, rq);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c	2007-07-13 17:54:03.000000000 -0400
@@ -28,6 +28,7 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/marker.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
@@ -1551,7 +1552,7 @@
 
 	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+		trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
 	}
 }
 
@@ -1617,7 +1618,7 @@
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
 	if (q->unplug_fn) {
-		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+		trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 					q->rq.count[READ] + q->rq.count[WRITE]);
 
 		q->unplug_fn(q);
@@ -1628,7 +1629,7 @@
 {
 	request_queue_t *q = container_of(work, request_queue_t, unplug_work);
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+	trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	q->unplug_fn(q);
@@ -1638,7 +1639,7 @@
 {
 	request_queue_t *q = (request_queue_t *)data;
 
-	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+	trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
 				q->rq.count[READ] + q->rq.count[WRITE]);
 
 	kblockd_schedule_work(&q->unplug_work);
@@ -2150,7 +2151,7 @@
 	
 	rq_init(q, rq);
 
-	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+	trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
 out:
 	return rq;
 }
@@ -2180,7 +2181,7 @@
 		if (!rq) {
 			struct io_context *ioc;
 
-			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+			trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
 
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
@@ -2254,7 +2255,7 @@
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+	trace_mark(blk_requeue, "%p %p", q, rq);
 
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
@@ -2940,7 +2941,7 @@
 			if (!ll_back_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+			trace_mark(blk_bio_backmerge, "%p %p", q, bio);
 
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
@@ -2957,7 +2958,7 @@
 			if (!ll_front_merge_fn(q, req, bio))
 				break;
 
-			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+			trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
 
 			bio->bi_next = req->bio;
 			req->bio = bio;
@@ -3187,10 +3188,11 @@
 		blk_partition_remap(bio);
 
 		if (old_sector != -1)
-			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
-					    old_sector);
+			trace_mark(blk_remap, "%p %p %llu %llu %llu",
+					q, bio, (u64)old_dev,
+					(u64)bio->bi_sector, (u64)old_sector);
 
-		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+		trace_mark(blk_bio_queue, "%p %p", q, bio);
 
 		old_sector = bio->bi_sector;
 		old_dev = bio->bi_bdev->bd_dev;
@@ -3383,7 +3385,7 @@
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
-	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", req->q, req);
 
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig	2007-07-13 17:34:05.000000000 -0400
@@ -32,6 +32,7 @@
 	depends on SYSFS
 	select RELAY
 	select DEBUG_FS
+	select MARKERS
 	help
 	  Say Y here, if you want to be able to trace the block layer actions
 	  on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c	2007-07-13 17:34:05.000000000 -0400
@@ -23,11 +23,19 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 
 static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
 static unsigned int blktrace_seq __read_mostly = 1;
 
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
 /*
  * Send out a notify message.
  */
@@ -179,7 +187,7 @@
 EXPORT_SYMBOL_GPL(__blk_add_trace);
 
 static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
 static unsigned int root_users;
 
 static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@
 	blk_remove_tree(bt->dir);
 	free_percpu(bt->sequence);
 	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (--blk_probes_ref == 0)
+		blk_probe_disarm();
+	mutex_unlock(&blk_probe_mutex);
 }
 
 static int blk_trace_remove(request_queue_t *q)
@@ -386,6 +398,11 @@
 		goto err;
 	}
 
+	mutex_lock(&blk_probe_mutex);
+	if (!blk_probes_ref++)
+		blk_probe_arm();
+	mutex_unlock(&blk_probe_mutex);
+
 	return 0;
 err:
 	if (dir)
@@ -549,9 +566,330 @@
 #endif
 }
 
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @rq:		the source request
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	int rw;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct request *rq;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	rq = va_arg(args, struct request *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+	rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	u32 what;
+	struct blk_trace *bt;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	int rw;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	rw = va_arg(args, int);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(mdata, "%p %p", q, bio);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with any integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+					u32 what)
+{
+	struct blk_trace *bt;
+	__be64 rpdu;
+
+	bt = q->blk_trace;
+	rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+					!bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with an long long integer
+ * payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the long long integer payload
+ *
+ * Description:
+ *     Adds a trace with some long long integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given as the
+ *     payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned long long pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned long long);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	unsigned int pdu;
+	u32 what;
+
+	what = pinfo->flags;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	pdu = va_arg(args, unsigned int);
+	va_end(args);
+
+	blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments :
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+	const char *fmt, ...)
+{
+	va_list args;
+	struct blk_trace *bt;
+	struct blk_io_trace_remap r;
+	u32 what;
+	struct blk_probe_data *pinfo = mdata->pdata;
+	struct request_queue *q;
+	struct bio *bio;
+	u64 dev, from, to;
+
+	va_start(args, fmt);
+	q = va_arg(args, struct request_queue *);
+	bio = va_arg(args, struct bio *);
+	dev = va_arg(args, u64);
+	from = va_arg(args, u64);
+	to = va_arg(args, u64);
+	va_end(args);
+
+	what = pinfo->flags;
+	bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+	{ "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+	{ "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+	{ "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+	{ "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+	{ "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+		blk_add_trace_generic },
+	{ "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+	{ "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+	{ "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+	{ "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+	{ "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+		blk_add_trace_pdu_int },
+	{ "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+		blk_add_trace_pdu_int },
+	{ "blk_request_insert", "%p %p", BLK_TA_INSERT,
+		blk_add_trace_rq },
+	{ "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+		blk_add_trace_pdu_ll },
+	{ "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+	{ "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+		blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+	int result;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		result = marker_probe_register(probe_array[i].name,
+				probe_array[i].format,
+				probe_array[i].callback, &probe_array[i]);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to register probe %s\n",
+				probe_array[i].name);
+		result = marker_arm(probe_array[i].name);
+		if (result)
+			printk(KERN_INFO
+				"blktrace unable to arm probe %s\n",
+				probe_array[i].name);
+	}
+	return 0;
+}
+
+void blk_probe_disarm(void)
+{
+	int i, err;
+
+	for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+		err = marker_disarm(probe_array[i].name);
+		BUG_ON(err);
+		err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+		BUG_ON(err);
+	}
+}
+
+
 static __init int blk_trace_init(void)
 {
-	mutex_init(&blk_tree_mutex);
 	on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
 	blk_trace_set_ht_offsets();
 
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h	2007-07-13 17:34:05.000000000 -0400
@@ -3,6 +3,7 @@
 
 #include <linux/blkdev.h>
 #include <linux/relay.h>
+#include <linux/marker.h>
 
 /*
  * Trace categories
@@ -142,149 +143,24 @@
 	u32 pid;
 };
 
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+	const char *name;
+	const char *format;
+	u32 flags;
+	marker_probe_func *callback;
+};
+
 #if defined(CONFIG_BLK_DEV_IO_TRACE)
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(request_queue_t *);
 extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
-
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
-	}
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @rw:		the data direction
- * @what:	the action
- *
- * Description:
- *     Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
-					 struct bio *bio, int rw, u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		blk_add_trace_bio(q, bio, what);
-	else
-		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q:		queue the io is for
- * @what:	the action
- * @bio:	the source bio
- * @pdu:	the integer payload
- *
- * Description:
- *     Adds a trace with some integer payload. This might be an unplug
- *     option given as the action, with the depth at unplug time given
- *     as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
-					 struct bio *bio, unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-	__be64 rpdu = cpu_to_be64(pdu);
-
-	if (likely(!bt))
-		return;
-
-	if (bio)
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
-	else
-		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
+extern int blk_probe_connect(void);
+extern void blk_probe_disconnect(void);
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
 #define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
 #define blk_trace_shutdown(q)			do { } while (0)
-#define blk_add_trace_rq(q, rq, what)		do { } while (0)
-#define blk_add_trace_bio(q, rq, what)		do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c	2007-07-13 17:34:05.000000000 -0400
@@ -13,7 +13,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 #define POOL_SIZE	64
@@ -237,7 +237,7 @@
 	if (!bio)
 		return;
 
-	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+	trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
 
 	/*
 	 * at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c	2007-07-13 17:34:05.000000000 -0400
@@ -26,7 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/tlbflush.h>
 
 /*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c	2007-07-13 17:34:05.000000000 -0400
@@ -25,7 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 2
@@ -1081,8 +1081,8 @@
 	if (!bp)
 		return bp;
 
-	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
-				bi->bi_sector + first_sectors);
+	trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+				(u64)bi->bi_sector + first_sectors);
 
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c	2007-07-13 17:33:58.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c	2007-07-13 17:34:05.000000000 -0400
@@ -37,7 +37,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2502,7 +2502,7 @@
 	}
 	cmd->rq->data_len = 0;
 	cmd->rq->completion_data = cmd;
-	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+	trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
 	blk_complete_request(cmd->rq);
 }
 
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c	2007-07-13 17:33:59.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c	2007-07-13 17:54:30.000000000 -0400
@@ -20,7 +20,7 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
 #include <linux/smp_lock.h>
 
 #define DM_MSG_PREFIX "core"
@@ -489,8 +489,8 @@
 			wake_up(&io->md->wait);
 
 		if (io->error != DM_ENDIO_REQUEUE) {
-			blk_add_trace_bio(io->md->queue, io->bio,
-					  BLK_TA_COMPLETE);
+			trace_mark(blk_request_complete, "%p %p",
+				io->md->queue, io->bio);
 
 			bio_endio(io->bio, io->bio->bi_size, io->error);
 		}
@@ -586,10 +586,10 @@
 	r = ti->type->map(ti, clone, &tio->info);
 	if (r == DM_MAPIO_REMAPPED) {
 		/* the bio has been remapped so dispatch it */
-
-		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev, sector,
-				    clone->bi_sector);
+		trace_mark(blk_remap, "%p %p %llu %llu %llu",
+			bdev_get_queue(clone->bi_bdev), clone,
+			(u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+			(u64)clone->bi_sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {

-- 
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F  BA06 3F25 A8FE 3BAE 9A68

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2007-09-21 13:51 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-07-03 17:08 [patch 0/4] Linux Kernel Markers Mathieu Desnoyers
2007-07-03 17:08 ` [patch 1/4] Linux Kernel Markers, architecture independent code Mathieu Desnoyers
2007-07-03 17:08 ` [patch 2/4] Linux Kernel Markers - Add kconfig menus for the marker code Mathieu Desnoyers
2007-07-03 17:08 ` [patch 3/4] Linux Kernel Markers - Documentation Mathieu Desnoyers
2007-07-03 17:08 ` [patch 4/4] Port of blktrace to the Linux Kernel Markers Mathieu Desnoyers
2007-07-03 18:01 ` [patch 0/4] " Mathieu Desnoyers
2007-07-05  2:00 ` Frank Ch. Eigler
2007-07-11 21:43   ` Mathieu Desnoyers
2007-07-14  1:29 Mathieu Desnoyers
2007-07-14  1:29 ` [patch 4/4] Port of blktrace to the " Mathieu Desnoyers
2007-08-12 15:10 [patch 0/4] " Mathieu Desnoyers
2007-08-12 15:10 ` [patch 4/4] Port of blktrace to the " Mathieu Desnoyers
2007-08-20 20:27 [patch 0/4] " Mathieu Desnoyers
2007-08-20 20:27 ` [patch 4/4] Port of blktrace to the " Mathieu Desnoyers
2007-08-27 16:05 [patch 0/4] " Mathieu Desnoyers
2007-08-27 16:05 ` [patch 4/4] Port of blktrace to the " Mathieu Desnoyers
2007-08-30 17:21   ` Christoph Hellwig
2007-08-30 18:37     ` Mathieu Desnoyers
2007-09-17 18:46 [patch 0/4] " Mathieu Desnoyers
2007-09-17 18:46 ` [patch 4/4] Port of blktrace to the " Mathieu Desnoyers
2007-09-18 21:13 [patch 0/4] Linux Kernel Markers for 2.6.23-rc6-mm1 Mathieu Desnoyers
2007-09-18 21:13 ` [patch 4/4] Port of blktrace to the Linux Kernel Markers Mathieu Desnoyers
2007-09-21  1:03   ` Steven Rostedt
2007-09-21 13:46     ` Mathieu Desnoyers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).