[patch 1/4] Linux Kernel Markers - Architecture Independent Code
From: Mathieu Desnoyers @ 2007-09-17 18:46 UTC (permalink / raw)
To: akpm, linux-kernel
Cc: Mathieu Desnoyers, Frank Ch. Eigler, Christoph Hellwig, Rusty Russell
The marker activation functions sit in kernel/marker.c. A hash table keeps
track of the registered probes and armed markers, so the markers within a
newly loaded module that should be active can be activated at module load
time.
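For example, a probe connects to one of these markers roughly as follows
(a minimal sketch against the API introduced by this patch; the marker name
"subsystem_event", the probe and the module are hypothetical):

#include <linux/marker.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <stdarg.h>

/* Somewhere in instrumented kernel code:
 *	trace_mark(subsystem_event, "%d %s", some_int, some_string);
 */

static void my_probe(const struct __mark_marker *mdata,
		void *private_data, const char *fmt, ...)
{
	va_list args;
	int value;
	const char *str;

	/* The probe parses the format string to recover the arguments. */
	va_start(args, fmt);
	value = va_arg(args, int);
	str = va_arg(args, const char *);
	va_end(args);
	printk(KERN_DEBUG "subsystem_event: %d %s\n", value, str);
}

static int __init my_tracer_init(void)
{
	int ret;

	ret = marker_probe_register("subsystem_event", "%d %s",
			my_probe, NULL);
	if (ret)
		return ret;
	return marker_arm("subsystem_event");
}

static void __exit my_tracer_exit(void)
{
	marker_disarm("subsystem_event");
	marker_probe_unregister("subsystem_event");
}

module_init(my_tracer_init);
module_exit(my_tracer_exit);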
marker_query has been removed. The marker_iter_start, marker_iter_next and
marker_iter_stop functions should be used as iterators on the markers, as in
the sketch below.
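A sketch of walking every marker (core kernel and modules) with this
iterator API, under the locking rules of this patch:

#include <linux/marker.h>
#include <linux/kernel.h>

static void list_markers(void)
{
	struct marker_iter iter;

	marker_iter_reset(&iter);
	marker_iter_start(&iter);	/* takes markers_mutex */
	while (iter.marker) {
		printk(KERN_INFO "marker: %s format: \"%s\" armed: %d\n",
			iter.marker->name, iter.marker->format,
			immediate_read(iter.marker->state));
		marker_iter_next(&iter);
	}
	marker_iter_stop(&iter);	/* releases markers_mutex */
}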
Changelog:
- module_mutex now nests inside markers_mutex rather than the opposite.
- Iteration on modules is now done in module.c.
- module_mutex is not exported anymore.
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: "Frank Ch. Eigler" <fche@redhat.com>
CC: Christoph Hellwig <hch@infradead.org>
CC: Rusty Russell <rusty@rustcorp.com.au>
---
include/asm-generic/vmlinux.lds.h | 11
include/linux/marker.h | 175 ++++++++++
include/linux/module.h | 18 +
kernel/marker.c | 608 ++++++++++++++++++++++++++++++++++++++
kernel/module.c | 66 ++++
5 files changed, 875 insertions(+), 3 deletions(-)
Index: linux-2.6-lttng/include/asm-generic/vmlinux.lds.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/vmlinux.lds.h 2007-09-14 10:11:18.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/vmlinux.lds.h 2007-09-14 10:11:31.000000000 -0400
@@ -12,7 +12,11 @@
/* .data section */
#define DATA_DATA \
*(.data) \
- *(.data.init.refok)
+ *(.data.init.refok) \
+ . = ALIGN(8); \
+ VMLINUX_SYMBOL(__start___markers) = .; \
+ *(__markers) \
+ VMLINUX_SYMBOL(__stop___markers) = .;
#define RO_DATA(align) \
. = ALIGN((align)); \
@@ -129,6 +133,11 @@
VMLINUX_SYMBOL(__stop___immediate) = .; \
} \
\
+ /* Markers: strings */ \
+ __markers_strings : AT(ADDR(__markers_strings) - LOAD_OFFSET) { \
+ *(__markers_strings) \
+ } \
+ \
/* Kernel symbol table: strings */ \
__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \
*(__ksymtab_strings) \
Index: linux-2.6-lttng/include/linux/marker.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/include/linux/marker.h 2007-09-17 12:43:54.000000000 -0400
@@ -0,0 +1,175 @@
+#ifndef _LINUX_MARKER_H
+#define _LINUX_MARKER_H
+
+/*
+ * Code markup for dynamic and static tracing.
+ *
+ * See Documentation/marker.txt.
+ *
+ * (C) Copyright 2006 Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
+ *
+ * This file is released under the GPLv2.
+ * See the file COPYING for more details.
+ */
+
+#include <linux/immediate.h>
+#include <linux/types.h>
+
+struct module;
+struct __mark_marker;
+
+/**
+ * marker_probe_func - Type of a marker probe function
+ * @mdata: pointer of type struct __mark_marker
+ * @private_data: caller site private data
+ * @fmt: format string
+ * @...: variable argument list
+ *
+ * Type of marker probe functions. They receive the mdata and need to parse the
+ * format string to recover the variable argument list.
+ */
+typedef void marker_probe_func(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...);
+
+struct __mark_marker {
+ const char *name; /* Marker name */
+ const char *format; /* Marker format string, describing the
+ * variable argument list.
+ */
+ const char *args; /* List of arguments literally transformed
+ * into a string: "arg1, arg2, arg3".
+ */
+ DEFINE_IMMEDIATE(char, state); /* Immediate value state. */
+ marker_probe_func *call;/* Probe handler function pointer */
+ void *pdata; /* Private probe data */
+} __attribute__((aligned(8)));
+
+#ifdef CONFIG_MARKERS
+
+/*
+ * Generic marker flavor always available.
+ * Note: the empty asm volatile with read constraint is used here instead of a
+ * "used" attribute to fix a gcc 4.1.x bug.
+ * Make sure the alignment of the structure in the __markers section will
+ * not add unwanted padding between the beginning of the section and the
+ * structure. Force alignment to the same alignment as the section start.
+ */
+#define __trace_mark(generic, name, call_data, format, args...) \
+ do { \
+ static const char __mstrtab_name_##name[] \
+ __attribute__((section("__markers_strings"))) \
+ = #name; \
+ static const char __mstrtab_format_##name[] \
+ __attribute__((section("__markers_strings"))) \
+ = format; \
+ static const char __mstrtab_args_##name[] \
+ __attribute__((section("__markers_strings"))) \
+ = #args; \
+ static struct __mark_marker __mark_##name \
+ __attribute__((section("__markers"))) = \
+ { __mstrtab_name_##name, __mstrtab_format_##name, \
+ __mstrtab_args_##name, 0, \
+ __mark_empty_function, NULL }; \
+ asm volatile ( "" : : "i" (&__mark_##name)); \
+ __mark_check_format(format, ## args); \
+ if (!generic) { \
+ if (unlikely(immediate_read(__mark_##name.state))) { \
+ preempt_disable(); \
+ (*__mark_##name.call) \
+ (&__mark_##name, call_data, \
+ format, ## args); \
+ preempt_enable(); \
+ } \
+ } else { \
+ if (unlikely(_immediate_read(__mark_##name.state))) { \
+ preempt_disable(); \
+ (*__mark_##name.call) \
+ (&__mark_##name, call_data, \
+ format, ## args); \
+ preempt_enable(); \
+ } \
+ } \
+ } while (0)
+
+extern void marker_update_probe_range(struct __mark_marker *begin,
+ struct __mark_marker *end, struct module *probe_module, int *refcount);
+#else /* !CONFIG_MARKERS */
+#define __trace_mark(generic, name, call_data, format, args...) \
+ __mark_check_format(format, ## args)
+static inline void marker_update_probe_range(struct __mark_marker *begin,
+ struct __mark_marker *end, struct module *probe_module, int *refcount)
+{ }
+#endif /* CONFIG_MARKERS */
+
+/**
+ * trace_mark - Marker using code patching
+ * @name: marker name, not quoted.
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker using optimized code patching technique (immediate_read())
+ * to be enabled.
+ */
+#define trace_mark(name, format, args...) \
+ __trace_mark(0, name, NULL, format, ## args)
+
+/**
+ * _trace_mark - Marker using variable read
+ * @name: marker name, not quoted.
+ * @format: format string
+ * @args...: variable argument list
+ *
+ * Places a marker using a standard memory read (_immediate_read()) to be
+ * enabled. Should be used for markers in __init and __exit functions and in
+ * lockdep code.
+ */
+#define _trace_mark(name, format, args...) \
+ __trace_mark(1, name, NULL, format, ## args)
+
+#define MARK_MAX_FORMAT_LEN 1024
+
+/**
+ * MARK_NOARGS - Format string for a marker with no argument.
+ */
+#define MARK_NOARGS " "
+
+/* To be used for string format validity checking with gcc */
+static inline void __attribute__ ((format (printf, 1, 2)))
+ __mark_check_format(const char *fmt, ...) { }
+
+extern marker_probe_func __mark_empty_function;
+
+/*
+ * Connect a probe to a marker.
+ * pdata must be a valid allocated memory address, or NULL.
+ */
+extern int marker_probe_register(const char *name, const char *format,
+ marker_probe_func *probe, void *pdata);
+
+/*
+ * Returns the pdata given to marker_probe_register.
+ */
+extern void *marker_probe_unregister(const char *name);
+/*
+ * Unregister a marker by providing the registered pdata.
+ */
+extern void *marker_probe_unregister_pdata(void *pdata);
+
+extern int marker_arm(const char *name);
+extern int marker_disarm(const char *name);
+
+struct marker_iter {
+ struct module *module;
+ struct __mark_marker *marker;
+};
+
+extern void marker_iter_start(struct marker_iter *iter);
+extern void marker_iter_next(struct marker_iter *iter);
+extern void marker_iter_stop(struct marker_iter *iter);
+extern void marker_iter_reset(struct marker_iter *iter);
+extern void *marker_get_pdata(const char *name);
+extern int marker_get_iter_range(struct __mark_marker **marker,
+ struct __mark_marker *begin,
+ struct __mark_marker *end);
+
+#endif
Index: linux-2.6-lttng/include/linux/module.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/module.h 2007-09-14 10:11:18.000000000 -0400
+++ linux-2.6-lttng/include/linux/module.h 2007-09-14 10:11:31.000000000 -0400
@@ -16,6 +16,7 @@
#include <linux/kobject.h>
#include <linux/moduleparam.h>
#include <linux/immediate.h>
+#include <linux/marker.h>
#include <asm/local.h>
#include <asm/module.h>
@@ -376,6 +377,10 @@ struct module
const struct __immediate *immediate;
unsigned int num_immediate;
#endif
+#ifdef CONFIG_MARKERS
+ struct __mark_marker *markers;
+ unsigned int num_markers;
+#endif
};
#ifndef MODULE_ARCH_INIT
#define MODULE_ARCH_INIT {}
@@ -482,6 +487,9 @@ extern void print_modules(void);
extern void _module_immediate_update(void);
extern void module_immediate_update(void);
+extern void module_update_markers(struct module *probe_module, int *refcount);
+extern int module_get_iter_markers(struct marker_iter *iter);
+
#else /* !CONFIG_MODULES... */
#define EXPORT_SYMBOL(sym)
#define EXPORT_SYMBOL_GPL(sym)
@@ -589,6 +597,16 @@ static inline void module_immediate_upda
{
}
+static inline void module_update_markers(struct module *probe_module,
+ int *refcount)
+{
+}
+
+static inline int module_get_iter_markers(struct marker_iter *iter)
+{
+ return 0;
+}
+
#endif /* CONFIG_MODULES */
struct device_driver;
Index: linux-2.6-lttng/kernel/module.c
===================================================================
--- linux-2.6-lttng.orig/kernel/module.c 2007-09-14 10:11:30.000000000 -0400
+++ linux-2.6-lttng/kernel/module.c 2007-09-14 10:11:31.000000000 -0400
@@ -1720,6 +1720,8 @@ static struct module *load_module(void _
unsigned int unusedgplindex;
unsigned int unusedgplcrcindex;
unsigned int immediateindex;
+ unsigned int markersindex;
+ unsigned int markersstringsindex;
struct module *mod;
long err = 0;
void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1972,6 +1974,8 @@ static struct module *load_module(void _
mod->num_immediate =
sechdrs[immediateindex].sh_size / sizeof(*mod->immediate);
#endif
+ markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
+ markersstringsindex = find_sec(hdr, sechdrs, secstrings, "__markers_strings");
mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
if (unusedcrcindex)
@@ -2013,6 +2017,11 @@ static struct module *load_module(void _
if (err < 0)
goto cleanup;
}
+#ifdef CONFIG_MARKERS
+ mod->markers = (void *)sechdrs[markersindex].sh_addr;
+ mod->num_markers =
+ sechdrs[markersindex].sh_size / sizeof(*mod->markers);
+#endif
/* Find duplicate symbols */
err = verify_export_symbols(mod);
@@ -2037,12 +2046,16 @@ static struct module *load_module(void _
goto nomodsectinfo;
#endif
+ if (!mod->taints) {
#ifdef CONFIG_IMMEDIATE
- if (!mod->taints)
immediate_update_range(mod->immediate,
mod->immediate + mod->num_immediate);
#endif
-
+#ifdef CONFIG_MARKERS
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers, NULL, NULL);
+#endif
+ }
err = module_finalize(hdr, sechdrs, mod);
if (err < 0)
goto cleanup;
@@ -2693,3 +2706,52 @@ void module_immediate_update(void)
}
EXPORT_SYMBOL_GPL(module_immediate_update);
#endif
+
+#ifdef CONFIG_MARKERS
+void module_update_markers(struct module *probe_module, int *refcount)
+{
+ struct module *mod;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(mod, &modules, list)
+ if (!mod->taints)
+ marker_update_probe_range(mod->markers,
+ mod->markers + mod->num_markers,
+ probe_module, refcount);
+ mutex_unlock(&module_mutex);
+}
+EXPORT_SYMBOL_GPL(module_update_markers);
+
+/*
+ * Returns 0 if current not found.
+ * Returns 1 if current found.
+ */
+int module_get_iter_markers(struct marker_iter *iter)
+{
+ struct module *iter_mod;
+ int found = 0;
+
+ mutex_lock(&module_mutex);
+ list_for_each_entry(iter_mod, &modules, list) {
+ if (!iter_mod->taints) {
+ /*
+ * Sorted module list
+ */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->marker = NULL;
+ found = marker_get_iter_range(&iter->marker,
+ iter_mod->markers,
+ iter_mod->markers + iter_mod->num_markers);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ }
+ mutex_unlock(&module_mutex);
+ return found;
+}
+EXPORT_SYMBOL_GPL(module_get_iter_markers);
+#endif
Index: linux-2.6-lttng/kernel/marker.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6-lttng/kernel/marker.c 2007-09-14 10:11:31.000000000 -0400
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2007 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/marker.h>
+#include <linux/err.h>
+#include <linux/immediate.h>
+
+extern struct __mark_marker __start___markers[];
+extern struct __mark_marker __stop___markers[];
+
+/*
+ * module_mutex nests inside markers_mutex. Markers mutex protects the builtin
+ * and module markers, the hash table and deferred_sync.
+ */
+DEFINE_MUTEX(markers_mutex);
+
+/*
+ * Marker deferred synchronization.
+ * Upon marker probe_unregister, we delay the call to synchronize_sched() to
+ * accelerate mass unregistration (only when there is no more reference to a
+ * given module do we call synchronize_sched()). However, we need to make sure
+ * every critical region has ended before we re-arm a marker that has been
+ * unregistered and then registered back with a different probe data.
+ */
+static int deferred_sync;
+
+/*
+ * Marker hash table, containing the active markers.
+ * Protected by markers_mutex.
+ */
+#define MARKER_HASH_BITS 6
+#define MARKER_TABLE_SIZE (1 << MARKER_HASH_BITS)
+
+struct marker_entry {
+ struct hlist_node hlist;
+ char *format;
+ marker_probe_func *probe;
+ void *pdata;
+ int refcount; /* Number of times armed. 0 if disarmed. */
+ char name[0]; /* Contains name'\0'format'\0' */
+};
+
+static struct hlist_head marker_table[MARKER_TABLE_SIZE];
+
+/**
+ * __mark_empty_function - Empty probe callback
+ * @mdata: pointer of type const struct __mark_marker
+ * @fmt: format string
+ * @...: variable argument list
+ *
+ * Empty callback provided as a probe to the markers. By providing this to a
+ * disabled marker, we make sure the execution flow is always valid even
+ * though the function pointer change and the marker enabling are two distinct
+ * operations that modify the execution flow of preemptible code.
+ */
+void __mark_empty_function(const struct __mark_marker *mdata,
+ void *private_data,
+ const char *fmt, ...)
+{ }
+EXPORT_SYMBOL_GPL(__mark_empty_function);
+
+/*
+ * Get marker if the marker is present in the marker hash table.
+ * Must be called with markers_mutex held.
+ * Returns NULL if not present.
+ */
+static struct marker_entry *get_marker(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ u32 hash = jhash(name, strlen(name), 0);
+
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name))
+ return e;
+ }
+ return NULL;
+}
+
+/*
+ * Add the marker to the marker hash table. Must be called with markers_mutex
+ * held.
+ */
+static int add_marker(const char *name,
+ const char *format, marker_probe_func *probe, void *pdata)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t name_len = strlen(name) + 1;
+ size_t format_len = 0;
+ u32 hash = jhash(name, name_len-1, 0);
+
+ if (format)
+ format_len = strlen(format) + 1;
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ printk(KERN_NOTICE
+ "Marker %s busy, probe %p already installed\n",
+ name, e->probe);
+ return -EBUSY; /* Already there */
+ }
+ }
+ /*
+ * Using kmalloc here to allocate a variable length element. Could
+ * cause some memory fragmentation if overused.
+ */
+ e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+ memcpy(&e->name[0], name, name_len);
+ if (format) {
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ trace_mark(core_marker_format, "name %s format %s",
+ e->name, e->format);
+ } else
+ e->format = NULL;
+ e->probe = probe;
+ e->pdata = pdata;
+ e->refcount = 0;
+ hlist_add_head(&e->hlist, head);
+ return 0;
+}
+
+/*
+ * Remove the marker from the marker hash table. Must be called with
+ * markers_mutex held.
+ */
+static void *remove_marker(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ int found = 0;
+ size_t len = strlen(name) + 1;
+ void *pdata = NULL;
+ u32 hash = jhash(name, len-1, 0);
+
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ found = 1;
+ break;
+ }
+ }
+ if (found) {
+ pdata = e->pdata;
+ hlist_del(&e->hlist);
+ kfree(e);
+ }
+ return pdata;
+}
+
+/*
+ * Set the mark_entry format to the format found in the element.
+ */
+static int marker_set_format(struct marker_entry **entry, const char *format)
+{
+ struct marker_entry *e;
+ size_t name_len = strlen((*entry)->name) + 1;
+ size_t format_len = strlen(format) + 1;
+
+ e = kmalloc(sizeof(struct marker_entry) + name_len + format_len,
+ GFP_KERNEL);
+ if (!e)
+ return -ENOMEM;
+ memcpy(&e->name[0], (*entry)->name, name_len);
+ e->format = &e->name[name_len];
+ memcpy(e->format, format, format_len);
+ e->probe = (*entry)->probe;
+ e->pdata = (*entry)->pdata;
+ e->refcount = (*entry)->refcount;
+ hlist_add_before(&e->hlist, &(*entry)->hlist);
+ hlist_del(&(*entry)->hlist);
+ kfree(*entry);
+ *entry = e;
+ trace_mark(core_marker_format, "name %s format %s",
+ e->name, e->format);
+ return 0;
+}
+
+/*
+ * Sets the probe callback corresponding to one marker.
+ */
+static int set_marker(struct marker_entry **entry,
+ struct __mark_marker *elem)
+{
+ int ret;
+ BUG_ON(strcmp((*entry)->name, elem->name) != 0);
+
+ if ((*entry)->format) {
+ if (strcmp((*entry)->format, elem->format) != 0) {
+ printk(KERN_NOTICE
+ "Format mismatch for probe %s "
+ "(%s), marker (%s)\n",
+ (*entry)->name,
+ (*entry)->format,
+ elem->format);
+ return -EPERM;
+ }
+ } else {
+ ret = marker_set_format(entry, elem->format);
+ if (ret)
+ return ret;
+ }
+ elem->call = (*entry)->probe;
+ elem->pdata = (*entry)->pdata;
+ _immediate_set(elem->state, 1);
+ return 0;
+}
+
+/*
+ * Disable a marker and its probe callback.
+ * Note: only a synchronize_sched() issued after setting elem->call to the
+ * empty function ensures that the original callback is not used anymore.
+ * This is guaranteed by the preemption disabling around the call site.
+ */
+static void disable_marker(struct __mark_marker *elem)
+{
+ _immediate_set(elem->state, 0);
+ elem->call = __mark_empty_function;
+ /*
+ * Leave the pdata there, because removal is racy and should be done
+ * only after a synchronize_sched(). It is never used until the next
+ * initialization anyway.
+ */
+}
+
+/**
+ * marker_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ * @probe_module: module address of the probe being updated
+ * @refcount: number of references left to the given probe_module (out)
+ *
+ * Updates the probe callback corresponding to a range of markers.
+ * Must be called with markers_mutex held.
+ */
+void marker_update_probe_range(
+ struct __mark_marker *begin,
+ struct __mark_marker *end,
+ struct module *probe_module,
+ int *refcount)
+{
+ struct __mark_marker *iter;
+ struct marker_entry *mark_entry;
+
+ for (iter = begin; iter < end; iter++) {
+ mark_entry = get_marker(iter->name);
+ if (mark_entry && mark_entry->refcount) {
+ set_marker(&mark_entry, iter);
+ /*
+ * ignore error, continue
+ */
+ if (probe_module)
+ if (probe_module ==
+ __module_text_address((unsigned long)mark_entry->probe))
+ (*refcount)++;
+ } else {
+ disable_marker(iter);
+ }
+ }
+}
+EXPORT_SYMBOL_GPL(marker_update_probe_range);
+
+/*
+ * Update probes, removing the faulty probes.
+ * Issues a synchronize_sched() when no reference to the module passed
+ * as parameter is found in the probes so the probe module can be
+ * safely unloaded from now on.
+ */
+static inline void marker_update_probes(struct module *probe_module)
+{
+ int refcount = 0;
+
+ mutex_lock(&markers_mutex);
+ /* Core kernel markers */
+ marker_update_probe_range(__start___markers,
+ __stop___markers, probe_module, &refcount);
+ /* Markers in modules. */
+ module_update_markers(probe_module, &refcount);
+ if (probe_module && refcount == 0) {
+ synchronize_sched();
+ deferred_sync = 0;
+ }
+ mutex_unlock(&markers_mutex);
+}
+
+/**
+ * marker_probe_register - Connect a probe to a marker
+ * @name: marker name
+ * @format: format string
+ * @probe: probe handler
+ * @pdata: probe private data
+ *
+ * pdata must be a valid allocated memory address, or NULL.
+ * Returns 0 if ok, error value on error.
+ */
+int marker_probe_register(const char *name, const char *format,
+ marker_probe_func *probe, void *pdata)
+{
+ struct marker_entry *entry;
+ int ret = 0, need_update = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (entry && entry->refcount) {
+ ret = -EBUSY;
+ goto end;
+ }
+ if (deferred_sync) {
+ synchronize_sched();
+ deferred_sync = 0;
+ }
+ ret = add_marker(name, format, probe, pdata);
+ if (ret)
+ goto end;
+ need_update = 1;
+end:
+ mutex_unlock(&markers_mutex);
+ if (need_update)
+ marker_update_probes(NULL);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_probe_register);
+
+/**
+ * marker_probe_unregister - Disconnect a probe from a marker
+ * @name: marker name
+ *
+ * Returns the pdata given to marker_probe_register, or an ERR_PTR().
+ */
+void *marker_probe_unregister(const char *name)
+{
+ struct module *probe_module;
+ struct marker_entry *entry;
+ void *pdata;
+ int need_update = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry) {
+ pdata = ERR_PTR(-ENOENT);
+ goto end;
+ }
+ entry->refcount = 0;
+ /* In what module is the probe handler? */
+ probe_module = __module_text_address((unsigned long)entry->probe);
+ pdata = remove_marker(name);
+ deferred_sync = 1;
+ need_update = 1;
+end:
+ mutex_unlock(&markers_mutex);
+ if (need_update)
+ marker_update_probes(probe_module);
+ return pdata;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister);
+
+/**
+ * marker_probe_unregister_pdata - Disconnect a probe from a marker
+ * @pdata: probe private data
+ *
+ * Unregister a marker by providing the registered pdata.
+ * Returns the pdata given to marker_probe_register, or an ERR_PTR().
+ */
+void *marker_probe_unregister_pdata(void *pdata)
+{
+ struct module *probe_module;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *entry;
+ int found = 0;
+ unsigned int i;
+ int need_update = 0;
+
+ mutex_lock(&markers_mutex);
+ for (i = 0; i < MARKER_TABLE_SIZE; i++) {
+ head = &marker_table[i];
+ hlist_for_each_entry(entry, node, head, hlist) {
+ if (entry->pdata == pdata) {
+ found = 1;
+ goto iter_end;
+ }
+ }
+ }
+iter_end:
+ if (!found) {
+ pdata = ERR_PTR(-ENOENT);
+ goto end;
+ }
+ entry->refcount = 0;
+ /* In what module is the probe handler? */
+ probe_module = __module_text_address((unsigned long)entry->probe);
+ pdata = remove_marker(entry->name);
+ deferred_sync = 1;
+ need_update = 1;
+end:
+ mutex_unlock(&markers_mutex);
+ if (need_update)
+ marker_update_probes(probe_module);
+ return pdata;
+}
+EXPORT_SYMBOL_GPL(marker_probe_unregister_pdata);
+
+/**
+ * marker_arm - Arm a marker
+ * @name: marker name
+ *
+ * Activate a marker. It keeps a reference count of the number of
+ * arming/disarming done.
+ * Returns 0 if ok, error value on error.
+ */
+int marker_arm(const char *name)
+{
+ struct marker_entry *entry;
+ int ret = 0, need_update = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry) {
+ ret = -ENOENT;
+ goto end;
+ }
+ /*
+ * Only need to update probes when refcount passes from 0 to 1.
+ */
+ if (entry->refcount++)
+ goto end;
+ need_update = 1;
+end:
+ mutex_unlock(&markers_mutex);
+ if (need_update)
+ marker_update_probes(NULL);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_arm);
+
+/**
+ * marker_disarm - Disarm a marker
+ * @name: marker name
+ *
+ * Disarm a marker. It keeps a reference count of the number of arming/disarming
+ * done.
+ * Returns 0 if ok, error value on error.
+ */
+int marker_disarm(const char *name)
+{
+ struct marker_entry *entry;
+ int ret = 0, need_update = 0;
+
+ mutex_lock(&markers_mutex);
+ entry = get_marker(name);
+ if (!entry) {
+ ret = -ENOENT;
+ goto end;
+ }
+ /*
+ * Only permit decrement refcount if higher than 0.
+ * Do probe update only on 1 -> 0 transition.
+ */
+ if (entry->refcount) {
+ if (--entry->refcount)
+ goto end;
+ } else {
+ ret = -EPERM;
+ goto end;
+ }
+ need_update = 1;
+end:
+ mutex_unlock(&markers_mutex);
+ if (need_update)
+ marker_update_probes(NULL);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(marker_disarm);
+
+/**
+ * marker_get_pdata - Get a marker's probe private data
+ * @name: marker name
+ *
+ * Returns the pdata pointer, or an ERR_PTR.
+ * The pdata pointer should _only_ be dereferenced if the caller is the owner of
+ * the data, or its content could vanish. This is mostly used to confirm that a
+ * caller is the owner of a registered probe.
+ */
+void *marker_get_pdata(const char *name)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct marker_entry *e;
+ size_t name_len = strlen(name) + 1;
+ u32 hash = jhash(name, name_len-1, 0);
+ int found = 0;
+
+ head = &marker_table[hash & ((1 << MARKER_HASH_BITS)-1)];
+ hlist_for_each_entry(e, node, head, hlist) {
+ if (!strcmp(name, e->name)) {
+ found = 1;
+ return e->pdata;
+ }
+ }
+ return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(marker_get_pdata);
+
+/**
+ * marker_get_iter_range - Get a next marker iterator given a range.
+ * @marker: current markers (in), next marker (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next marker has been found (1) or not (0).
+ * Will return the first marker in the range if the input marker is NULL.
+ */
+int marker_get_iter_range(struct __mark_marker **marker,
+ struct __mark_marker *begin,
+ struct __mark_marker *end)
+{
+ int found = 0;
+
+ if (!*marker && begin != end) {
+ found = 1;
+ *marker = begin;
+ } else if (*marker >= begin && *marker < end) {
+ found = 1;
+ /*
+ * *marker is known to be a valid marker from now on.
+ */
+ }
+ return found;
+}
+EXPORT_SYMBOL_GPL(marker_get_iter_range);
+
+static inline void marker_get_iter(struct marker_iter *iter)
+{
+ int found = 0;
+
+ /* Core kernel markers */
+ if (!iter->module) {
+ found = marker_get_iter_range(&iter->marker,
+ __start___markers, __stop___markers);
+ if (found)
+ goto end;
+ }
+ /* Markers in modules. */
+ found = module_get_iter_markers(iter);
+end:
+ if (!found)
+ marker_iter_reset(iter);
+}
+
+void marker_iter_start(struct marker_iter *iter)
+{
+ mutex_lock(&markers_mutex);
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_start);
+
+void marker_iter_next(struct marker_iter *iter)
+{
+ iter->marker++;
+ /*
+ * iter->marker may be invalid because we blindly incremented it.
+ * Make sure it is valid by marshalling on the markers, getting the
+ * markers from following modules if necessary.
+ */
+ marker_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(marker_iter_next);
+
+void marker_iter_stop(struct marker_iter *iter)
+{
+ mutex_unlock(&markers_mutex);
+}
+EXPORT_SYMBOL_GPL(marker_iter_stop);
+
+void marker_iter_reset(struct marker_iter *iter)
+{
+ iter->module = NULL;
+ iter->marker = NULL;
+}
+EXPORT_SYMBOL_GPL(marker_iter_reset);
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
[patch 4/4] Port of blktrace to the Linux Kernel Markers.
From: Mathieu Desnoyers @ 2007-09-17 18:46 UTC (permalink / raw)
To: akpm, linux-kernel; +Cc: Mathieu Desnoyers, Frank Ch. Eigler, Jens Axboe
Here is the first stage of a port of blktrace to the Linux Kernel Markers. The
advantage of this port is that it minimizes the impact on the running system
when blktrace is not active.
A few remarks: this patch has the positive effect of removing some code
from the block io tracing hot paths, minimizing the i-cache impact in a
system where the io tracing is compiled in but inactive.
It also moves the blk tracing code from a header (and therefore from the
body of the instrumented functions) to a separate C file.
On the downside, as soon as one device has to be traced, all devices have to
execute the tracing function call when they pass by the instrumentation site.
This is slower than the previous inline function, which tested the condition
quickly.
It does not make the code smaller, since I left all the specialized
tracing functions for requests, bio, generic, remap, which would go away
once a generic infrastructure is in place to serialize the information
passed to the marker. This is mostly why I consider it a step towards the
full improvements that the markers could bring.
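It also means any tracer can hook the block layer instrumentation without
touching the block code itself. A minimal sketch (my_rq_probe is
hypothetical; the "blk_request_issue" name and its "%p %p" format string
come from the probe_array in this patch):

#include <linux/marker.h>
#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <stdarg.h>

static void my_rq_probe(const struct __mark_marker *mdata,
		void *private_data, const char *fmt, ...)
{
	va_list args;
	struct request_queue *q;
	struct request *rq;

	/* Recover the arguments described by the "%p %p" format string. */
	va_start(args, fmt);
	q = va_arg(args, struct request_queue *);
	rq = va_arg(args, struct request *);
	va_end(args);
	printk(KERN_DEBUG "request issued on q %p: rq %p\n", q, rq);
}

static int hook_block_issue(void)
{
	int ret;

	ret = marker_probe_register("blk_request_issue", "%p %p",
			my_rq_probe, NULL);
	if (ret)
		return ret;
	return marker_arm("blk_request_issue");
}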
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: "Frank Ch. Eigler" <fche@redhat.com>
CC: Jens Axboe <jens.axboe@oracle.com>
---
block/Kconfig | 1
block/blktrace.c | 343 ++++++++++++++++++++++++++++++++++++++++++-
block/elevator.c | 6
block/ll_rw_blk.c | 35 ++--
drivers/block/cciss.c | 4
drivers/md/dm.c | 14 -
fs/bio.c | 6
include/linux/blktrace_api.h | 145 +-----------------
mm/bounce.c | 4
mm/highmem.c | 2
10 files changed, 388 insertions(+), 172 deletions(-)
Index: linux-2.6-lttng/block/elevator.c
===================================================================
--- linux-2.6-lttng.orig/block/elevator.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/elevator.c 2007-09-17 14:03:12.000000000 -0400
@@ -32,7 +32,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/delay.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
#include <linux/hash.h>
#include <asm/uaccess.h>
@@ -548,7 +548,7 @@ void elv_insert(struct request_queue *q,
unsigned ordseq;
int unplug_it = 1;
- blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+ trace_mark(blk_request_insert, "%p %p", q, rq);
rq->q = q;
@@ -735,7 +735,7 @@ struct request *elv_next_request(struct
* not be passed by new incoming requests
*/
rq->cmd_flags |= REQ_STARTED;
- blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+ trace_mark(blk_request_issue, "%p %p", q, rq);
}
if (!q->boundary_rq || q->boundary_rq == rq) {
Index: linux-2.6-lttng/block/ll_rw_blk.c
===================================================================
--- linux-2.6-lttng.orig/block/ll_rw_blk.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/ll_rw_blk.c 2007-09-17 14:03:12.000000000 -0400
@@ -28,6 +28,7 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
+#include <linux/marker.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
#include <linux/scatterlist.h>
@@ -1559,7 +1560,7 @@ void blk_plug_device(struct request_queu
if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
- blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+ trace_mark(blk_plug_device, "%p %p %d", q, NULL, 0);
}
}
@@ -1625,7 +1626,7 @@ static void blk_backing_dev_unplug(struc
* devices don't necessarily have an ->unplug_fn defined
*/
if (q->unplug_fn) {
- blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+ trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
q->unplug_fn(q);
@@ -1637,7 +1638,7 @@ static void blk_unplug_work(struct work_
struct request_queue *q =
container_of(work, struct request_queue, unplug_work);
- blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+ trace_mark(blk_pdu_unplug_io, "%p %p %d", q, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
q->unplug_fn(q);
@@ -1647,7 +1648,7 @@ static void blk_unplug_timeout(unsigned
{
struct request_queue *q = (struct request_queue *)data;
- blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+ trace_mark(blk_pdu_unplug_timer, "%p %p %d", q, NULL,
q->rq.count[READ] + q->rq.count[WRITE]);
kblockd_schedule_work(&q->unplug_work);
@@ -2160,7 +2161,7 @@ rq_starved:
rq_init(q, rq);
- blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
+ trace_mark(blk_get_request, "%p %p %d", q, bio, rw);
out:
return rq;
}
@@ -2190,7 +2191,7 @@ static struct request *get_request_wait(
if (!rq) {
struct io_context *ioc;
- blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+ trace_mark(blk_sleep_request, "%p %p %d", q, bio, rw);
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
@@ -2264,7 +2265,7 @@ EXPORT_SYMBOL(blk_start_queueing);
*/
void blk_requeue_request(struct request_queue *q, struct request *rq)
{
- blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+ trace_mark(blk_requeue, "%p %p", q, rq);
if (blk_rq_tagged(rq))
blk_queue_end_tag(q, rq);
@@ -2987,7 +2988,7 @@ static int __make_request(struct request
if (!ll_back_merge_fn(q, req, bio))
break;
- blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+ trace_mark(blk_bio_backmerge, "%p %p", q, bio);
req->biotail->bi_next = bio;
req->biotail = bio;
@@ -3004,7 +3005,7 @@ static int __make_request(struct request
if (!ll_front_merge_fn(q, req, bio))
break;
- blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+ trace_mark(blk_bio_frontmerge, "%p %p", q, bio);
bio->bi_next = req->bio;
req->bio = bio;
@@ -3087,9 +3088,10 @@ static inline void blk_partition_remap(s
bio->bi_sector += p->start_sect;
bio->bi_bdev = bdev->bd_contains;
- blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
- bdev->bd_dev, bio->bi_sector,
- bio->bi_sector - p->start_sect);
+ trace_mark(blk_remap, "%p %p %llu %llu %llu",
+ bdev_get_queue(bio->bi_bdev), bio,
+ (u64)bdev->bd_dev, (u64)bio->bi_sector,
+ (u64)bio->bi_sector - p->start_sect);
}
}
@@ -3254,10 +3256,11 @@ end_io:
blk_partition_remap(bio);
if (old_sector != -1)
- blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
- old_sector);
+ trace_mark(blk_remap, "%p %p %llu %llu %llu",
+ q, bio, (u64)old_dev,
+ (u64)bio->bi_sector, (u64)old_sector);
- blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+ trace_mark(blk_bio_queue, "%p %p", q, bio);
old_sector = bio->bi_sector;
old_dev = bio->bi_bdev->bd_dev;
@@ -3446,7 +3449,7 @@ static int __end_that_request_first(stru
int total_bytes, bio_nbytes, error, next_idx = 0;
struct bio *bio;
- blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+ trace_mark(blk_request_complete, "%p %p", req->q, req);
/*
* extend uptodate bool to allow < 0 value to be direct io error
Index: linux-2.6-lttng/block/Kconfig
===================================================================
--- linux-2.6-lttng.orig/block/Kconfig 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/Kconfig 2007-09-17 14:03:12.000000000 -0400
@@ -32,6 +32,7 @@ config BLK_DEV_IO_TRACE
depends on SYSFS
select RELAY
select DEBUG_FS
+ select MARKERS
help
Say Y here, if you want to be able to trace the block layer actions
on a given queue. Tracing allows you to see any traffic happening
Index: linux-2.6-lttng/block/blktrace.c
===================================================================
--- linux-2.6-lttng.orig/block/blktrace.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/block/blktrace.c 2007-09-17 14:03:12.000000000 -0400
@@ -23,11 +23,19 @@
#include <linux/mutex.h>
#include <linux/debugfs.h>
#include <linux/time.h>
+#include <linux/marker.h>
#include <asm/uaccess.h>
static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
static unsigned int blktrace_seq __read_mostly = 1;
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
+
+int blk_probe_arm(void);
+void blk_probe_disarm(void);
+
/*
* Send out a notify message.
*/
@@ -179,7 +187,7 @@ void __blk_add_trace(struct blk_trace *b
EXPORT_SYMBOL_GPL(__blk_add_trace);
static struct dentry *blk_tree_root;
-static struct mutex blk_tree_mutex;
+static DEFINE_MUTEX(blk_tree_mutex);
static unsigned int root_users;
static inline void blk_remove_root(void)
@@ -229,6 +237,10 @@ static void blk_trace_cleanup(struct blk
blk_remove_tree(bt->dir);
free_percpu(bt->sequence);
kfree(bt);
+ mutex_lock(&blk_probe_mutex);
+ if (--blk_probes_ref == 0)
+ blk_probe_disarm();
+ mutex_unlock(&blk_probe_mutex);
}
static int blk_trace_remove(struct request_queue *q)
@@ -386,6 +398,11 @@ static int blk_trace_setup(struct reques
goto err;
}
+ mutex_lock(&blk_probe_mutex);
+ if (!blk_probes_ref++)
+ blk_probe_arm();
+ mutex_unlock(&blk_probe_mutex);
+
return 0;
err:
if (dir)
@@ -549,9 +566,331 @@ static void blk_trace_set_ht_offsets(voi
#endif
}
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * Expected variable arguments:
+ * @q: queue the io is for
+ * @rq: the source request
+ *
+ * Description:
+ * Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...)
+{
+ va_list args;
+ u32 what;
+ struct blk_trace *bt;
+ int rw;
+ struct blk_probe_data *pinfo = mdata->pdata;
+ struct request_queue *q;
+ struct request *rq;
+
+ va_start(args, fmt);
+ q = va_arg(args, struct request_queue *);
+ rq = va_arg(args, struct request *);
+ va_end(args);
+
+ what = pinfo->flags;
+ bt = q->blk_trace;
+ rw = rq->cmd_flags & 0x03;
+
+ if (likely(!bt))
+ return;
+
+ if (blk_pc_request(rq)) {
+ what |= BLK_TC_ACT(BLK_TC_PC);
+ __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+ } else {
+ what |= BLK_TC_ACT(BLK_TC_FS);
+ __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+ }
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * Expected variable arguments:
+ * @q: queue the io is for
+ * @bio: the source bio
+ *
+ * Description:
+ * Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...)
+{
+ va_list args;
+ u32 what;
+ struct blk_trace *bt;
+ struct blk_probe_data *pinfo = mdata->pdata;
+ struct request_queue *q;
+ struct bio *bio;
+
+ va_start(args, fmt);
+ q = va_arg(args, struct request_queue *);
+ bio = va_arg(args, struct bio *);
+ va_end(args);
+
+ what = pinfo->flags;
+ bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * Expected variable arguments:
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @rw: the data direction
+ *
+ * Description:
+ * Records a simple trace
+ *
+ **/
+static void blk_add_trace_generic(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...)
+{
+ va_list args;
+ struct blk_trace *bt;
+ u32 what;
+ struct blk_probe_data *pinfo = mdata->pdata;
+ struct request_queue *q;
+ struct bio *bio;
+ int rw;
+
+ va_start(args, fmt);
+ q = va_arg(args, struct request_queue *);
+ bio = va_arg(args, struct bio *);
+ rw = va_arg(args, int);
+ va_end(args);
+
+ what = pinfo->flags;
+ bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ if (bio)
+ blk_add_trace_bio(mdata, NULL, "%p %p", q, bio);
+ else
+ __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_trace_integer - Common helper to add a trace with an integer payload
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @pdu: the long long integer payload
+ *
+ **/
+static inline void blk_trace_integer(struct request_queue *q, struct bio *bio, unsigned long long pdu,
+ u32 what)
+{
+ struct blk_trace *bt;
+ __be64 rpdu;
+
+ bt = q->blk_trace;
+ rpdu = cpu_to_be64(pdu);
+
+ if (likely(!bt))
+ return;
+
+ if (bio)
+ __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+ !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+ else
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_pdu_ll - Add a trace for a bio with a long long integer
+ * payload
+ * Expected variable arguments:
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @pdu: the long long integer payload
+ *
+ * Description:
+ * Adds a trace with some long long integer payload. This might be an unplug
+ * option given as the action, with the depth at unplug time given as the
+ * payload
+ *
+ **/
+static void blk_add_trace_pdu_ll(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...)
+{
+ va_list args;
+ struct blk_probe_data *pinfo = mdata->pdata;
+ struct request_queue *q;
+ struct bio *bio;
+ unsigned long long pdu;
+ u32 what;
+
+ what = pinfo->flags;
+
+ va_start(args, fmt);
+ q = va_arg(args, struct request_queue *);
+ bio = va_arg(args, struct bio *);
+ pdu = va_arg(args, unsigned long long);
+ va_end(args);
+
+ blk_trace_integer(q, bio, pdu, what);
+}
+
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * Expected variable arguments:
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @pdu: the integer payload
+ *
+ * Description:
+ * Adds a trace with some integer payload. This might be an unplug
+ * option given as the action, with the depth at unplug time given
+ * as the payload
+ *
+ **/
+static void blk_add_trace_pdu_int(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...)
+{
+ va_list args;
+ struct blk_probe_data *pinfo = mdata->pdata;
+ struct request_queue *q;
+ struct bio *bio;
+ unsigned int pdu;
+ u32 what;
+
+ what = pinfo->flags;
+
+ va_start(args, fmt);
+ q = va_arg(args, struct request_queue *);
+ bio = va_arg(args, struct bio *);
+ pdu = va_arg(args, unsigned int);
+ va_end(args);
+
+ blk_trace_integer(q, bio, pdu, what);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * Expected variable arguments:
+ * @q: queue the io is for
+ * @bio: the source bio
+ * @dev: target device
+ * @from: source sector
+ * @to: target sector
+ *
+ * Description:
+ * Device mapper or raid target sometimes need to split a bio because
+ * it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(const struct __mark_marker *mdata,
+ void *private_data, const char *fmt, ...)
+{
+ va_list args;
+ struct blk_trace *bt;
+ struct blk_io_trace_remap r;
+ u32 what;
+ struct blk_probe_data *pinfo = mdata->pdata;
+ struct request_queue *q;
+ struct bio *bio;
+ u64 dev, from, to;
+
+ va_start(args, fmt);
+ q = va_arg(args, struct request_queue *);
+ bio = va_arg(args, struct bio *);
+ dev = va_arg(args, u64);
+ from = va_arg(args, u64);
+ to = va_arg(args, u64);
+ va_end(args);
+
+ what = pinfo->flags;
+ bt = q->blk_trace;
+
+ if (likely(!bt))
+ return;
+
+ r.device = cpu_to_be32(dev);
+ r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+ r.sector = cpu_to_be64(to);
+
+ __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#define FACILITY_NAME "blk"
+
+static struct blk_probe_data probe_array[] =
+{
+ { "blk_bio_queue", "%p %p", BLK_TA_QUEUE, blk_add_trace_bio },
+ { "blk_bio_backmerge", "%p %p", BLK_TA_BACKMERGE, blk_add_trace_bio },
+ { "blk_bio_frontmerge", "%p %p", BLK_TA_FRONTMERGE, blk_add_trace_bio },
+ { "blk_get_request", "%p %p %d", BLK_TA_GETRQ, blk_add_trace_generic },
+ { "blk_sleep_request", "%p %p %d", BLK_TA_SLEEPRQ,
+ blk_add_trace_generic },
+ { "blk_requeue", "%p %p", BLK_TA_REQUEUE, blk_add_trace_rq },
+ { "blk_request_issue", "%p %p", BLK_TA_ISSUE, blk_add_trace_rq },
+ { "blk_request_complete", "%p %p", BLK_TA_COMPLETE, blk_add_trace_rq },
+ { "blk_plug_device", "%p %p %d", BLK_TA_PLUG, blk_add_trace_generic },
+ { "blk_pdu_unplug_io", "%p %p %d", BLK_TA_UNPLUG_IO,
+ blk_add_trace_pdu_int },
+ { "blk_pdu_unplug_timer", "%p %p %d", BLK_TA_UNPLUG_TIMER,
+ blk_add_trace_pdu_int },
+ { "blk_request_insert", "%p %p", BLK_TA_INSERT,
+ blk_add_trace_rq },
+ { "blk_pdu_split", "%p %p %llu", BLK_TA_SPLIT,
+ blk_add_trace_pdu_ll },
+ { "blk_bio_bounce", "%p %p", BLK_TA_BOUNCE, blk_add_trace_bio },
+ { "blk_remap", "%p %p %llu %llu %llu", BLK_TA_REMAP,
+ blk_add_trace_remap },
+};
+
+
+int blk_probe_arm(void)
+{
+ int result;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+ result = marker_probe_register(probe_array[i].name,
+ probe_array[i].format,
+ probe_array[i].callback, &probe_array[i]);
+ if (result)
+ printk(KERN_INFO
+ "blktrace unable to register probe %s\n",
+ probe_array[i].name);
+ result = marker_arm(probe_array[i].name);
+ if (result)
+ printk(KERN_INFO
+ "blktrace unable to arm probe %s\n",
+ probe_array[i].name);
+ }
+ return 0;
+}
+
+void blk_probe_disarm(void)
+{
+ int i, err;
+
+ for (i = 0; i < ARRAY_SIZE(probe_array); i++) {
+ err = marker_disarm(probe_array[i].name);
+ BUG_ON(err);
+ err = IS_ERR(marker_probe_unregister(probe_array[i].name));
+ BUG_ON(err);
+ }
+}
+
+
static __init int blk_trace_init(void)
{
- mutex_init(&blk_tree_mutex);
on_each_cpu(blk_trace_check_cpu_time, NULL, 1, 1);
blk_trace_set_ht_offsets();
Index: linux-2.6-lttng/include/linux/blktrace_api.h
===================================================================
--- linux-2.6-lttng.orig/include/linux/blktrace_api.h 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/include/linux/blktrace_api.h 2007-09-17 14:03:12.000000000 -0400
@@ -3,6 +3,7 @@
#include <linux/blkdev.h>
#include <linux/relay.h>
+#include <linux/marker.h>
/*
* Trace categories
@@ -142,150 +143,22 @@ struct blk_user_trace_setup {
u32 pid;
};
+/* Probe data used for probe-marker connection */
+struct blk_probe_data {
+ const char *name;
+ const char *format;
+ u32 flags;
+ marker_probe_func *callback;
+};
+
#if defined(CONFIG_BLK_DEV_IO_TRACE)
extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
extern void blk_trace_shutdown(struct request_queue *);
extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q: queue the io is for
- * @rq: the source request
- * @what: the action
- *
- * Description:
- * Records an action against a request. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
-{
- struct blk_trace *bt = q->blk_trace;
- int rw = rq->cmd_flags & 0x03;
-
- if (likely(!bt))
- return;
-
- if (blk_pc_request(rq)) {
- what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
- } else {
- what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
- }
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q: queue the io is for
- * @bio: the source bio
- * @what: the action
- *
- * Description:
- * Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
- u32 what)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (likely(!bt))
- return;
-
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-/**
- * blk_add_trace_generic - Add a trace for a generic action
- * @q: queue the io is for
- * @bio: the source bio
- * @rw: the data direction
- * @what: the action
- *
- * Description:
- * Records a simple trace
- *
- **/
-static inline void blk_add_trace_generic(struct request_queue *q,
- struct bio *bio, int rw, u32 what)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (likely(!bt))
- return;
-
- if (bio)
- blk_add_trace_bio(q, bio, what);
- else
- __blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
-}
-
-/**
- * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
- * @q: queue the io is for
- * @what: the action
- * @bio: the source bio
- * @pdu: the integer payload
- *
- * Description:
- * Adds a trace with some integer payload. This might be an unplug
- * option given as the action, with the depth at unplug time given
- * as the payload
- *
- **/
-static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
- struct bio *bio, unsigned int pdu)
-{
- struct blk_trace *bt = q->blk_trace;
- __be64 rpdu = cpu_to_be64(pdu);
-
- if (likely(!bt))
- return;
-
- if (bio)
- __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
- else
- __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q: queue the io is for
- * @bio: the source bio
- * @dev: target device
- * @from: source sector
- * @to: target sector
- *
- * Description:
- * Device mapper or raid target sometimes need to split a bio because
- * it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
- dev_t dev, sector_t from, sector_t to)
-{
- struct blk_trace *bt = q->blk_trace;
- struct blk_io_trace_remap r;
-
- if (likely(!bt))
- return;
-
- r.device = cpu_to_be32(dev);
- r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
- r.sector = cpu_to_be64(to);
-
- __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
#else /* !CONFIG_BLK_DEV_IO_TRACE */
#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY)
#define blk_trace_shutdown(q) do { } while (0)
-#define blk_add_trace_rq(q, rq, what) do { } while (0)
-#define blk_add_trace_bio(q, rq, what) do { } while (0)
-#define blk_add_trace_generic(q, rq, rw, what) do { } while (0)
-#define blk_add_trace_pdu_int(q, what, bio, pdu) do { } while (0)
-#define blk_add_trace_remap(q, bio, dev, f, t) do {} while (0)
#endif /* CONFIG_BLK_DEV_IO_TRACE */
#endif
Index: linux-2.6-lttng/mm/bounce.c
===================================================================
--- linux-2.6-lttng.orig/mm/bounce.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/mm/bounce.c 2007-09-17 14:03:12.000000000 -0400
@@ -13,7 +13,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
#include <asm/tlbflush.h>
#define POOL_SIZE 64
@@ -237,7 +237,7 @@ static void __blk_queue_bounce(struct re
if (!bio)
return;
- blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+ trace_mark(blk_bio_bounce, "%p %p", q, *bio_orig);
/*
* at least one page was bounced, fill in possible non-highmem
Index: linux-2.6-lttng/mm/highmem.c
===================================================================
--- linux-2.6-lttng.orig/mm/highmem.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/mm/highmem.c 2007-09-17 14:03:12.000000000 -0400
@@ -26,7 +26,7 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
#include <asm/tlbflush.h>
/*
Index: linux-2.6-lttng/fs/bio.c
===================================================================
--- linux-2.6-lttng.orig/fs/bio.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/fs/bio.c 2007-09-17 14:03:12.000000000 -0400
@@ -25,7 +25,7 @@
#include <linux/module.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
#include <scsi/sg.h> /* for struct sg_iovec */
#define BIO_POOL_SIZE 2
@@ -1072,8 +1072,8 @@ struct bio_pair *bio_split(struct bio *b
if (!bp)
return bp;
- blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
- bi->bi_sector + first_sectors);
+ trace_mark(blk_pdu_split, "%p %p %llu", bdev_get_queue(bi->bi_bdev), bi,
+ (u64)bi->bi_sector + first_sectors);
BUG_ON(bi->bi_vcnt != 1);
BUG_ON(bi->bi_idx != 0);
Index: linux-2.6-lttng/drivers/block/cciss.c
===================================================================
--- linux-2.6-lttng.orig/drivers/block/cciss.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/drivers/block/cciss.c 2007-09-17 14:03:12.000000000 -0400
@@ -37,7 +37,7 @@
#include <linux/hdreg.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
#include <asm/uaccess.h>
#include <asm/io.h>
@@ -2545,7 +2545,7 @@ after_error_processing:
}
cmd->rq->data_len = 0;
cmd->rq->completion_data = cmd;
- blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
+ trace_mark(blk_request_complete, "%p %p", cmd->rq->q, cmd->rq);
blk_complete_request(cmd->rq);
}
Index: linux-2.6-lttng/drivers/md/dm.c
===================================================================
--- linux-2.6-lttng.orig/drivers/md/dm.c 2007-09-17 14:02:48.000000000 -0400
+++ linux-2.6-lttng/drivers/md/dm.c 2007-09-17 14:03:12.000000000 -0400
@@ -19,7 +19,7 @@
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
+#include <linux/marker.h>
#include <linux/smp_lock.h>
#define DM_MSG_PREFIX "core"
@@ -481,8 +481,8 @@ static void dec_pending(struct dm_io *io
wake_up(&io->md->wait);
if (io->error != DM_ENDIO_REQUEUE) {
- blk_add_trace_bio(io->md->queue, io->bio,
- BLK_TA_COMPLETE);
+ trace_mark(blk_request_complete, "%p %p",
+ io->md->queue, io->bio);
bio_endio(io->bio, io->bio->bi_size, io->error);
}
@@ -578,10 +578,10 @@ static void __map_bio(struct dm_target *
r = ti->type->map(ti, clone, &tio->info);
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
-
- blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
- tio->io->bio->bi_bdev->bd_dev,
- clone->bi_sector, sector);
+ trace_mark(blk_remap, "%p %p %llu %llu %llu",
+ bdev_get_queue(clone->bi_bdev), clone,
+ (u64)tio->io->bio->bi_bdev->bd_dev, (u64)sector,
+ (u64)clone->bi_sector);
generic_make_request(clone);
} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
--
Mathieu Desnoyers
Computer Engineering Ph.D. Student, Ecole Polytechnique de Montreal
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68