LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH wq/for-3.19 1/3] workqueue: make the workqueues list RCU walkable
@ 2014-12-08 17:43 Tejun Heo
2014-12-08 17:44 ` [PATCH wq/for-3.19 2/3] workqueue: keep track of the flushing task and pool manager Tejun Heo
0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2014-12-08 17:43 UTC (permalink / raw)
To: linux-kernel; +Cc: Lai Jiangshan, linux-kernel
The workqueues list is protected by wq_pool_mutex and a workqueue and
its subordinate data structures are freed directly on destruction. We
want to add the ability to dump workqueues from a sysrq callback which
requires walking all workqueues without grabbing wq_pool_mutex. This
patch makes freeing of workqueues RCU protected and makes the
workqueues list walkable while holding RCU read lock.
Note that pool_workqueues and pools are already sched-RCU protected.
For consistency, workqueues are also protected with sched-RCU.
While at it, reverse the workqueues list so that a workqueue which is
created earlier comes before. The order of the list isn't significant
functionally but this makes the planned sysrq dump list system
workqueues first.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/workqueue.c | 47 +++++++++++++++++++++++++++++++----------------
1 file changed, 31 insertions(+), 16 deletions(-)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -230,7 +230,7 @@ struct wq_device;
*/
struct workqueue_struct {
struct list_head pwqs; /* WR: all pwqs of this wq */
- struct list_head list; /* PL: list of all workqueues */
+ struct list_head list; /* PR: list of all workqueues */
struct mutex mutex; /* protects this wq */
int work_color; /* WQ: current work color */
@@ -257,6 +257,13 @@ struct workqueue_struct {
#endif
char name[WQ_NAME_LEN]; /* I: workqueue name */
+ /*
+ * Destruction of workqueue_struct is sched-RCU protected to allow
+ * walking the workqueues list without grabbing wq_pool_mutex.
+ * This is used to dump all workqueues from sysrq.
+ */
+ struct rcu_head rcu;
+
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -288,7 +295,7 @@ static struct workqueue_attrs *wq_update
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
-static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* the per-cpu worker pools */
@@ -3386,6 +3393,20 @@ static int init_worker_pool(struct worke
return 0;
}
+static void rcu_free_wq(struct rcu_head *rcu)
+{
+ struct workqueue_struct *wq =
+ container_of(rcu, struct workqueue_struct, rcu);
+
+ if (!(wq->flags & WQ_UNBOUND))
+ free_percpu(wq->cpu_pwqs);
+ else
+ free_workqueue_attrs(wq->unbound_attrs);
+
+ kfree(wq->rescuer);
+ kfree(wq);
+}
+
static void rcu_free_pool(struct rcu_head *rcu)
{
struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3563,12 +3584,10 @@ static void pwq_unbound_release_workfn(s
/*
* If we're the last pwq going away, @wq is already dead and no one
- * is gonna access it anymore. Free it.
+ * is gonna access it anymore. Schedule RCU free.
*/
- if (is_last) {
- free_workqueue_attrs(wq->unbound_attrs);
- kfree(wq);
- }
+ if (is_last)
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
}
/**
@@ -4105,7 +4124,7 @@ struct workqueue_struct *__alloc_workque
pwq_adjust_max_active(pwq);
mutex_unlock(&wq->mutex);
- list_add(&wq->list, &workqueues);
+ list_add_tail_rcu(&wq->list, &workqueues);
mutex_unlock(&wq_pool_mutex);
@@ -4161,24 +4180,20 @@ void destroy_workqueue(struct workqueue_
* flushing is complete in case freeze races us.
*/
mutex_lock(&wq_pool_mutex);
- list_del_init(&wq->list);
+ list_del_rcu(&wq->list);
mutex_unlock(&wq_pool_mutex);
workqueue_sysfs_unregister(wq);
- if (wq->rescuer) {
+ if (wq->rescuer)
kthread_stop(wq->rescuer->task);
- kfree(wq->rescuer);
- wq->rescuer = NULL;
- }
if (!(wq->flags & WQ_UNBOUND)) {
/*
* The base ref is never dropped on per-cpu pwqs. Directly
- * free the pwqs and wq.
+ * schedule RCU free.
*/
- free_percpu(wq->cpu_pwqs);
- kfree(wq);
+ call_rcu_sched(&wq->rcu, rcu_free_wq);
} else {
/*
* We're the sole accessor of @wq at this point. Directly
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH wq/for-3.19 2/3] workqueue: keep track of the flushing task and pool manager
2014-12-08 17:43 [PATCH wq/for-3.19 1/3] workqueue: make the workqueues list RCU walkable Tejun Heo
@ 2014-12-08 17:44 ` Tejun Heo
2014-12-08 17:47 ` [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t Tejun Heo
0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2014-12-08 17:44 UTC (permalink / raw)
To: linux-kernel; +Cc: Lai Jiangshan
Add wq_barrier->task and worker_pool->manager to keep track of the
flushing task and pool manager respectively. These are purely
informational and will be used to implement sysrq dump of workqueues.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
kernel/workqueue.c | 5 +++++
1 file changed, 5 insertions(+)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {
/* see manage_workers() for details on the two manager mutexes */
struct mutex manager_arb; /* manager arbitration */
+ struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
struct completion *detach_completion; /* all workers detached */
@@ -1927,9 +1928,11 @@ static bool manage_workers(struct worker
*/
if (!mutex_trylock(&pool->manager_arb))
return ret;
+ pool->manager = worker;
ret |= maybe_create_worker(pool);
+ pool->manager = NULL;
mutex_unlock(&pool->manager_arb);
return ret;
}
@@ -2319,6 +2322,7 @@ repeat:
struct wq_barrier {
struct work_struct work;
struct completion done;
+ struct task_struct *task; /* purely informational */
};
static void wq_barrier_func(struct work_struct *work)
@@ -2367,6 +2371,7 @@ static void insert_wq_barrier(struct poo
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
+ barr->task = current;
/*
* If @target is currently being executed, schedule the
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 17:44 ` [PATCH wq/for-3.19 2/3] workqueue: keep track of the flushing task and pool manager Tejun Heo
@ 2014-12-08 17:47 ` Tejun Heo
2014-12-08 18:06 ` Andrew Morton
2015-03-09 13:28 ` [PATCH v3 " Tejun Heo
0 siblings, 2 replies; 11+ messages in thread
From: Tejun Heo @ 2014-12-08 17:47 UTC (permalink / raw)
To: linux-kernel; +Cc: Lai Jiangshan, Linus Torvalds, Andrew Morton, Ingo Molnar
Workqueues are used extensively throughout the kernel but sometimes
it's difficult to debug stalls involving work items because visibility
into its inner workings is fairly limited. Although sysrq-t task dump
annotates each active worker task with the information on the work
item being executed, it is challenging to find out which work items
are pending or delayed on which queues and how pools are being
managed.
This patch implements show_workqueue_state() which dumps all busy
workqueues and pools and is called from the sysrq-t handler. At the
end of sysrq-t dump, something like the following is printed.
Showing busy workqueues and worker pools:
...
workqueue filler_wq: flags=0x0
pwq 2: cpu=1 flags=0x0 nice=0 active=2/256
in-flight: 491:filler_workfn, 507:filler_workfn
pwq 0: cpu=0 flags=0x0 nice=0 active=2/256
in-flight: 501:filler_workfn
pending: filler_workfn
...
workqueue test_wq: flags=0x8
pwq 2: cpu=1 flags=0x0 nice=0 active=1/1
in-flight: 510(RESCUER):test_workfn BAR(69) BAR(500)
delayed: test_workfn1 BAR(492), test_workfn2
...
pool 0: cpu=0 flags=0x0 nice=0 workers=2 manager: 137
pool 2: cpu=1 flags=0x0 nice=0 workers=3 manager: 469
pool 3: cpu=1 flags=0x0 nice=-20 workers=2 idle: 16
pool 8: cpumask=0f flags=0x4 nice=0 workers=2 manager: 62
The above shows that test_wq is executing test_workfn() on pid 510
which is the rescuer and also that there are two tasks 69 and 500
waiting for the work item to finish in flush_work(). As test_wq has
max_active of 1, there are two work items for test_workfn1() and
test_workfn2() which are delayed till the current work item is
finished. In addition, pid 492 is flushing test_workfn1().
The work item for test_workfn() is being executed on pwq of pool 2
which is the normal priority per-cpu pool for CPU 1. The pool has
three workers, two of which are executing filler_workfn() for
filler_wq and the last one is assuming the manager role trying to
create more workers.
This extra workqueue state dump will hopefully help chasing down hangs
involving workqueues.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
CC: Ingo Molnar <mingo@redhat.com>
---
Hello,
If nobody objects, I'll push these three patches to print workqueue
information at the end of sysrq-t dump through wq/for-3.19.
Thanks.
drivers/tty/sysrq.c | 1
include/linux/workqueue.h | 1
kernel/workqueue.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 170 insertions(+)
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -275,6 +275,7 @@ static struct sysrq_key_op sysrq_showreg
static void sysrq_handle_showstate(int key)
{
show_state();
+ show_workqueue_state();
}
static struct sysrq_key_op sysrq_showstate_op = {
.handler = sysrq_handle_showstate,
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -457,6 +457,7 @@ extern bool workqueue_congested(int cpu,
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
+extern void show_workqueue_state(void);
/**
* queue_work - queue work on a workqueue
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4419,6 +4419,174 @@ void print_worker_info(const char *log_l
}
}
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ if (pool->cpu >= 0)
+ pr_cont(" cpu=%d", pool->cpu);
+ else if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+
+ if (pool->cpu < 0) {
+ static char cpus_buf[PAGE_SIZE];
+
+ cpumask_scnprintf(cpus_buf, sizeof(cpus_buf),
+ pool->attrs->cpumask);
+ pr_cont(" cpumask=%s", cpus_buf);
+ }
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr =
+ container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ printk(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ printk(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ printk(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ printk(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ printk("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ printk("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ printk("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
+
/*
* CPU hotplug.
*
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 17:47 ` [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t Tejun Heo
@ 2014-12-08 18:06 ` Andrew Morton
2014-12-08 18:40 ` Tejun Heo
2015-03-09 13:28 ` [PATCH v3 " Tejun Heo
1 sibling, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2014-12-08 18:06 UTC (permalink / raw)
To: Tejun Heo; +Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar
On Mon, 8 Dec 2014 12:47:33 -0500 Tejun Heo <tj@kernel.org> wrote:
>
> ...
>
> This patch implements show_workqueue_state() which dumps all busy
> workqueues and pools and is called from the sysrq-t handler. At the
> end of sysrq-t dump, something like the following is printed.
Seems sensible.
sysrq-t already produces thousands of lines of output. Maybe create a
new keycode for this?
>
> ...
>
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4419,6 +4419,174 @@ void print_worker_info(const char *log_l
> }
> }
>
> +static void pr_cont_pool_info(struct worker_pool *pool)
> +{
> + if (pool->cpu >= 0)
> + pr_cont(" cpu=%d", pool->cpu);
> + else if (pool->node != NUMA_NO_NODE)
> + pr_cont(" node=%d", pool->node);
> +
> + if (pool->cpu < 0) {
> + static char cpus_buf[PAGE_SIZE];
Ouch. This could be [NR_CPUS + epsilon]?
> + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf),
> + pool->attrs->cpumask);
> + pr_cont(" cpumask=%s", cpus_buf);
> + }
> + pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
> +}
> +
> +static void pr_cont_work(bool comma, struct work_struct *work)
> +{
> + if (work->func == wq_barrier_func) {
> + struct wq_barrier *barr =
> + container_of(work, struct wq_barrier, work);
Can avoid the 80-col contortions with
struct wq_barrier *barr;
barr = container_of(work, struct wq_barrier, work);
> + pr_cont("%s BAR(%d)", comma ? "," : "",
> + task_pid_nr(barr->task));
> + } else {
> + pr_cont("%s %pf", comma ? "," : "", work->func);
> + }
> +}
> +
> +static void show_pwq(struct pool_workqueue *pwq)
> +{
> + struct worker_pool *pool = pwq->pool;
> + struct work_struct *work;
> + struct worker *worker;
> + bool has_in_flight = false, has_pending = false;
> + int bkt;
> +
> + printk(" pwq %d:", pool->id);
> + pr_cont_pool_info(pool);
> +
> + pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
> + !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
> +
> + hash_for_each(pool->busy_hash, bkt, worker, hentry) {
> + if (worker->current_pwq == pwq) {
> + has_in_flight = true;
> + break;
> + }
> + }
> + if (has_in_flight) {
> + bool comma = false;
> +
> + printk(" in-flight:");
pr_something? show_state() uses KERN_INFO, which may or may not be
appropriate.
> + hash_for_each(pool->busy_hash, bkt, worker, hentry) {
> + if (worker->current_pwq != pwq)
> + continue;
> +
> + pr_cont("%s %d%s:%pf", comma ? "," : "",
> + task_pid_nr(worker->task),
> + worker == pwq->wq->rescuer ? "(RESCUER)" : "",
> + worker->current_func);
> + list_for_each_entry(work, &worker->scheduled, entry)
> + pr_cont_work(false, work);
> + comma = true;
> + }
> + pr_cont("\n");
> + }
> +
> + list_for_each_entry(work, &pool->worklist, entry) {
> + if (get_work_pwq(work) == pwq) {
> + has_pending = true;
> + break;
> + }
> + }
> + if (has_pending) {
> + bool comma = false;
> +
> + printk(" pending:");
ditto
> + list_for_each_entry(work, &pool->worklist, entry) {
> + if (get_work_pwq(work) != pwq)
> + continue;
> +
> + pr_cont_work(comma, work);
> + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
> + }
> + pr_cont("\n");
> + }
> +
> + if (!list_empty(&pwq->delayed_works)) {
> + bool comma = false;
> +
> + printk(" delayed:");
ditto
> + list_for_each_entry(work, &pwq->delayed_works, entry) {
> + pr_cont_work(comma, work);
> + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
> + }
> + pr_cont("\n");
> + }
> +}
> +
>
> ...
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 18:06 ` Andrew Morton
@ 2014-12-08 18:40 ` Tejun Heo
2014-12-08 19:05 ` Andrew Morton
0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2014-12-08 18:40 UTC (permalink / raw)
To: Andrew Morton; +Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar
Hello, Andrew.
On Mon, Dec 08, 2014 at 10:06:13AM -0800, Andrew Morton wrote:
> sysrq-t already produces thousands of lines of output. Maybe create a
> new keycode for this?
Believe it or not, we already used up all alphanumerics if we count in
the arch-specific ones. Given that the workqueue information would
primarily be useful in tracking down hangs and we'd want to see the
dump of tasks in that case anyway, sysrq-t isn't a bad fit for
appending workqueue dump. If anybody has a better idea, I'm all ears.
...
> > +static void pr_cont_pool_info(struct worker_pool *pool)
> > +{
> > + if (pool->cpu >= 0)
> > + pr_cont(" cpu=%d", pool->cpu);
> > + else if (pool->node != NUMA_NO_NODE)
> > + pr_cont(" node=%d", pool->node);
> > +
> > + if (pool->cpu < 0) {
> > + static char cpus_buf[PAGE_SIZE];
>
> Ouch. This could be [NR_CPUS + epsilon]?
It's bitmap mask printing so each char can show four cpus. PAGE_SIZE
should be enough for now but I think we need cpumask_prcont().
> > +static void pr_cont_work(bool comma, struct work_struct *work)
> > +{
> > + if (work->func == wq_barrier_func) {
> > + struct wq_barrier *barr =
> > + container_of(work, struct wq_barrier, work);
>
> Can avoid the 80-col contortions with
>
> struct wq_barrier *barr;
>
> barr = container_of(work, struct wq_barrier, work);
I'm not sure either is any better, but sure.
> > + if (has_in_flight) {
> > + bool comma = false;
> > +
> > + printk(" in-flight:");
>
> pr_something? show_state() uses KERN_INFO, which may or may not be
> appropriate.
Hmmm, best to match show_state(). I'll convert to pr_info().
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 18:40 ` Tejun Heo
@ 2014-12-08 19:05 ` Andrew Morton
2014-12-08 19:22 ` Tejun Heo
0 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2014-12-08 19:05 UTC (permalink / raw)
To: Tejun Heo; +Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar
On Mon, 8 Dec 2014 13:40:35 -0500 Tejun Heo <tj@kernel.org> wrote:
> Hello, Andrew.
>
> On Mon, Dec 08, 2014 at 10:06:13AM -0800, Andrew Morton wrote:
> > sysrq-t already produces thousands of lines of output. Maybe create a
> > new keycode for this?
>
> Believe it or not, we already used up all alphanumerics if we count in
> the arch-specific ones. Given that the workqueue information would
> primarily be useful in tracking down hangs and we'd want to see the
> dump of tasks in that case anyway, sysrq-t isn't a bad fit for
> appending workqueue dump. If anybody has a better idea, I'm all ears.
Really. Upper case?
> ...
> > > +static void pr_cont_pool_info(struct worker_pool *pool)
> > > +{
> > > + if (pool->cpu >= 0)
> > > + pr_cont(" cpu=%d", pool->cpu);
> > > + else if (pool->node != NUMA_NO_NODE)
> > > + pr_cont(" node=%d", pool->node);
> > > +
> > > + if (pool->cpu < 0) {
> > > + static char cpus_buf[PAGE_SIZE];
> >
> > Ouch. This could be [NR_CPUS + epsilon]?
>
> It's bitmap mask printing so each char can show four cpus. PAGE_SIZE
> should be enough for now but I think we need cpumask_prcont().
I'm not concerned about it being too small ;) Not many people have 16k
CPUs - can it be shrunk? It's particularly gross when CONFIG_SMP=n!
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 19:05 ` Andrew Morton
@ 2014-12-08 19:22 ` Tejun Heo
2014-12-10 4:50 ` Greg Kroah-Hartman
0 siblings, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2014-12-08 19:22 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar,
Greg Kroah-Hartman
(cc'ing Greg for tty)
On Mon, Dec 08, 2014 at 11:05:15AM -0800, Andrew Morton wrote:
> > Believe it or not, we already used up all alphanumerics if we count in
> > the arch-specific ones. Given that the workqueue information would
> > primarily be useful in tracking down hangs and we'd want to see the
> > dump of tasks in that case anyway, sysrq-t isn't a bad fit for
> > appending workqueue dump. If anybody has a better idea, I'm all ears.
>
> Really. Upper case?
Greg, would using uppercase chars for sysrq work over the different
types of ttys?
> > > > +static void pr_cont_pool_info(struct worker_pool *pool)
> > > > +{
> > > > + if (pool->cpu >= 0)
> > > > + pr_cont(" cpu=%d", pool->cpu);
> > > > + else if (pool->node != NUMA_NO_NODE)
> > > > + pr_cont(" node=%d", pool->node);
> > > > +
> > > > + if (pool->cpu < 0) {
> > > > + static char cpus_buf[PAGE_SIZE];
> > >
> > > Ouch. This could be [NR_CPUS + epsilon]?
> >
> > It's bitmap mask printing so each char can show four cpus. PAGE_SIZE
> > should be enough for now but I think we need cpumask_prcont().
>
> I'm not concerned about it being too small ;) Not many people have 16k
> CPUs - can it be shrunk? It's particularly gross when CONFIG_SMP=n!
Heh, lemme just go ahead and implement bitmap_pr_cont() and
cpumask_pr_cont().
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 19:22 ` Tejun Heo
@ 2014-12-10 4:50 ` Greg Kroah-Hartman
2014-12-10 18:34 ` Tejun Heo
0 siblings, 1 reply; 11+ messages in thread
From: Greg Kroah-Hartman @ 2014-12-10 4:50 UTC (permalink / raw)
To: Tejun Heo, linux-serial
Cc: Andrew Morton, linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar
On Mon, Dec 08, 2014 at 02:22:29PM -0500, Tejun Heo wrote:
> (cc'ing Greg for tty)
>
> On Mon, Dec 08, 2014 at 11:05:15AM -0800, Andrew Morton wrote:
> > > Believe it or not, we already used up all alphanumerics if we count in
> > > the arch-specific ones. Given that the workqueue information would
> > > primarily be useful in tracking down hangs and we'd want to see the
> > > dump of tasks in that case anyway, sysrq-t isn't a bad fit for
> > > appending workqueue dump. If anybody has a better idea, I'm all ears.
> >
> > Really. Upper case?
>
> Greg, would using uppercase chars for sysrq work over the different
> types of ttys?
I'm dragging in linux-serial@vger here also, but I think uppercase
characters will work from a tty standpoint. I don't know about keyboard
scancodes, if they will do "odd" things wanting a shift with the sysrq
key at the same time.
Oh wait, I think that might be it, shift is needed for the sysrq key on
the keyboard to start with, right? So there probably isn't a way to
test the difference of a lower/upper case key here.
I'm traveling this week, and don't have access to a "real" keyboard at
the moment, but this should be pretty easy for someone to test who has
one and cares about this type of thing {hint}.
thanks,
greg k-h
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-10 4:50 ` Greg Kroah-Hartman
@ 2014-12-10 18:34 ` Tejun Heo
0 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2014-12-10 18:34 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: linux-serial, Andrew Morton, linux-kernel, Lai Jiangshan,
Linus Torvalds, Ingo Molnar
Hello, Greg.
On Tue, Dec 09, 2014 at 11:50:57PM -0500, Greg Kroah-Hartman wrote:
> Oh wait, I think that might be it, shift is needed for the sysrq key on
> the keyboard to start with, right? So there probably isn't a way to
On x86, the combo is alt-sysrq, so shift can theoretically be added.
> test the difference of a lower/upper case key here.
>
> I'm traveling this week, and don't have access to a "real" keyboard at
> the moment, but this should be pretty easy for someone to test who has
> one and cares about this type of thing {hint}.
Just tested on a PS/2 port attached keyboard and it doesn't go
through. I'm not sure where this is getting lost but given that
people sometimes have to use ctrl too to invoke sysrq, throwing in
shift makes it a four key combo which is likely to be over the
rollover capabilities of most keyboards. It doesn't seem like a good
path to follow to me. :(
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v3 wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2014-12-08 17:47 ` [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t Tejun Heo
2014-12-08 18:06 ` Andrew Morton
@ 2015-03-09 13:28 ` Tejun Heo
2015-03-10 12:58 ` Tejun Heo
1 sibling, 1 reply; 11+ messages in thread
From: Tejun Heo @ 2015-03-09 13:28 UTC (permalink / raw)
To: linux-kernel; +Cc: Lai Jiangshan, Linus Torvalds, Andrew Morton, Ingo Molnar
Hello,
Now that %pb[l] formatting is now in mainline, this can go forward.
Here's the updated patch which uses printf instead of the fixed buffer
when printing cpus. I couldn't find a better option at the moment and
am still appending the output to sysrq-t. This only prints out the
workqueues and pools with busy work items so the amount of extra
output should be fairly low in general. If nobody objects, I'll route
these through wq/for-4.1.
Thanks.
------ 8< ------
Workqueues are used extensively throughout the kernel but sometimes
it's difficult to debug stalls involving work items because visibility
into its inner workings is fairly limited. Although sysrq-t task dump
annotates each active worker task with the information on the work
item being executed, it is challenging to find out which work items
are pending or delayed on which queues and how pools are being
managed.
This patch implements show_workqueue_state() which dumps all busy
workqueues and pools and is called from the sysrq-t handler. At the
end of sysrq-t dump, something like the following is printed.
Showing busy workqueues and worker pools:
...
workqueue filler_wq: flags=0x0
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=2/256
in-flight: 491:filler_workfn, 507:filler_workfn
pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256
in-flight: 501:filler_workfn
pending: filler_workfn
...
workqueue test_wq: flags=0x8
pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1
in-flight: 510(RESCUER):test_workfn BAR(69) BAR(500)
delayed: test_workfn1 BAR(492), test_workfn2
...
pool 0: cpus=0 node=0 flags=0x0 nice=0 workers=2 manager: 137
pool 2: cpus=1 node=0 flags=0x0 nice=0 workers=3 manager: 469
pool 3: cpus=1 node=0 flags=0x0 nice=-20 workers=2 idle: 16
pool 8: cpus=0-3 flags=0x4 nice=0 workers=2 manager: 62
The above shows that test_wq is executing test_workfn() on pid 510
which is the rescuer and also that there are two tasks 69 and 500
waiting for the work item to finish in flush_work(). As test_wq has
max_active of 1, there are two work items for test_workfn1() and
test_workfn2() which are delayed till the current work item is
finished. In addition, pid 492 is flushing test_workfn1().
The work item for test_workfn() is being executed on pwq of pool 2
which is the normal priority per-cpu pool for CPU 1. The pool has
three workers, two of which are executing filler_workfn() for
filler_wq and the last one is assuming the manager role trying to
create more workers.
This extra workqueue state dump will hopefully help chasing down hangs
involving workqueues.
v3: cpulist_pr_cont() replaced with "%*pbl" printf formatting.
v2: As suggested by Andrew, minor formatting change in pr_cont_work(),
printk()'s replaced with pr_info()'s, and cpumask printing now
uses cpulist_pr_cont().
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
CC: Ingo Molnar <mingo@redhat.com>
---
drivers/tty/sysrq.c | 1
include/linux/workqueue.h | 1
kernel/workqueue.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 162 insertions(+)
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -275,6 +275,7 @@ static struct sysrq_key_op sysrq_showreg
static void sysrq_handle_showstate(int key)
{
show_state();
+ show_workqueue_state();
}
static struct sysrq_key_op sysrq_showstate_op = {
.handler = sysrq_handle_showstate,
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -453,6 +453,7 @@ extern bool workqueue_congested(int cpu,
extern unsigned int work_busy(struct work_struct *work);
extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
extern void print_worker_info(const char *log_lvl, struct task_struct *task);
+extern void show_workqueue_state(void);
/**
* queue_work - queue work on a workqueue
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4409,6 +4409,166 @@ void print_worker_info(const char *log_l
}
}
+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+ pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+ if (pool->node != NUMA_NO_NODE)
+ pr_cont(" node=%d", pool->node);
+ pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+ if (work->func == wq_barrier_func) {
+ struct wq_barrier *barr;
+
+ barr = container_of(work, struct wq_barrier, work);
+
+ pr_cont("%s BAR(%d)", comma ? "," : "",
+ task_pid_nr(barr->task));
+ } else {
+ pr_cont("%s %pf", comma ? "," : "", work->func);
+ }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+ struct worker_pool *pool = pwq->pool;
+ struct work_struct *work;
+ struct worker *worker;
+ bool has_in_flight = false, has_pending = false;
+ int bkt;
+
+ pr_info(" pwq %d:", pool->id);
+ pr_cont_pool_info(pool);
+
+ pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+ !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq == pwq) {
+ has_in_flight = true;
+ break;
+ }
+ }
+ if (has_in_flight) {
+ bool comma = false;
+
+ pr_info(" in-flight:");
+ hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+ if (worker->current_pwq != pwq)
+ continue;
+
+ pr_cont("%s %d%s:%pf", comma ? "," : "",
+ task_pid_nr(worker->task),
+ worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+ worker->current_func);
+ list_for_each_entry(work, &worker->scheduled, entry)
+ pr_cont_work(false, work);
+ comma = true;
+ }
+ pr_cont("\n");
+ }
+
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) == pwq) {
+ has_pending = true;
+ break;
+ }
+ }
+ if (has_pending) {
+ bool comma = false;
+
+ pr_info(" pending:");
+ list_for_each_entry(work, &pool->worklist, entry) {
+ if (get_work_pwq(work) != pwq)
+ continue;
+
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+
+ if (!list_empty(&pwq->delayed_works)) {
+ bool comma = false;
+
+ pr_info(" delayed:");
+ list_for_each_entry(work, &pwq->delayed_works, entry) {
+ pr_cont_work(comma, work);
+ comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+ }
+ pr_cont("\n");
+ }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ unsigned long flags;
+ int pi;
+
+ rcu_read_lock_sched();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list) {
+ struct pool_workqueue *pwq;
+ bool idle = true;
+
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+ idle = false;
+ break;
+ }
+ }
+ if (idle)
+ continue;
+
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+ for_each_pwq(pwq, wq) {
+ spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+ show_pwq(pwq);
+ spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ }
+ }
+
+ for_each_pool(pool, pi) {
+ struct worker *worker;
+ bool first = true;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" workers=%d", pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ next_pool:
+ spin_unlock_irqrestore(&pool->lock, flags);
+ }
+
+ rcu_read_unlock_sched();
+}
+
/*
* CPU hotplug.
*
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v3 wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
2015-03-09 13:28 ` [PATCH v3 " Tejun Heo
@ 2015-03-10 12:58 ` Tejun Heo
0 siblings, 0 replies; 11+ messages in thread
From: Tejun Heo @ 2015-03-10 12:58 UTC (permalink / raw)
To: linux-kernel; +Cc: Lai Jiangshan, Linus Torvalds, Andrew Morton, Ingo Molnar
On Mon, Mar 09, 2015 at 09:28:28AM -0400, Tejun Heo wrote:
> Now that %pb[l] formatting is now in mainline, this can go forward.
> Here's the updated patch which uses printf instead of the fixed buffer
> when printing cpus. I couldn't find a better option at the moment and
> am still appending the output to sysrq-t. This only prints out the
> workqueues and pools with busy work items so the amount of extra
> output should be fairly low in general. If nobody objects, I'll route
> these through wq/for-4.1.
Applied to wq/for-4.1.
Thanks.
--
tejun
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2015-03-10 12:58 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-12-08 17:43 [PATCH wq/for-3.19 1/3] workqueue: make the workqueues list RCU walkable Tejun Heo
2014-12-08 17:44 ` [PATCH wq/for-3.19 2/3] workqueue: keep track of the flushing task and pool manager Tejun Heo
2014-12-08 17:47 ` [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t Tejun Heo
2014-12-08 18:06 ` Andrew Morton
2014-12-08 18:40 ` Tejun Heo
2014-12-08 19:05 ` Andrew Morton
2014-12-08 19:22 ` Tejun Heo
2014-12-10 4:50 ` Greg Kroah-Hartman
2014-12-10 18:34 ` Tejun Heo
2015-03-09 13:28 ` [PATCH v3 " Tejun Heo
2015-03-10 12:58 ` Tejun Heo
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).