* [PATCH wq/for-3.19 1/3] workqueue: make the workqueues list RCU walkable
From: Tejun Heo @ 2014-12-08 17:43 UTC
To: linux-kernel; +Cc: Lai Jiangshan, linux-kernel

The workqueues list is protected by wq_pool_mutex, and a workqueue and
its subordinate data structures are freed directly on destruction.  We
want to add the ability to dump workqueues from a sysrq callback, which
requires walking all workqueues without grabbing wq_pool_mutex.  This
patch makes freeing of workqueues RCU protected and makes the
workqueues list walkable while holding the RCU read lock.

Note that pool_workqueues and pools are already sched-RCU protected.
For consistency, workqueues are also protected with sched-RCU.

While at it, reverse the workqueues list so that a workqueue which is
created earlier comes earlier in the list.  The order of the list isn't
functionally significant, but it makes the planned sysrq dump list
system workqueues first.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c |   47 +++++++++++++++++++++++++++++++----------------
 1 file changed, 31 insertions(+), 16 deletions(-)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -230,7 +230,7 @@ struct wq_device;
  */
 struct workqueue_struct {
     struct list_head    pwqs;       /* WR: all pwqs of this wq */
-    struct list_head    list;       /* PL: list of all workqueues */
+    struct list_head    list;       /* PR: list of all workqueues */

     struct mutex        mutex;      /* protects this wq */
     int                 work_color; /* WQ: current work color */
@@ -257,6 +257,13 @@ struct workqueue_struct {
 #endif
     char                name[WQ_NAME_LEN]; /* I: workqueue name */

+    /*
+     * Destruction of workqueue_struct is sched-RCU protected to allow
+     * walking the workqueues list without grabbing wq_pool_mutex.
+     * This is used to dump all workqueues from sysrq.
+     */
+    struct rcu_head     rcu;
+
     /* hot fields used during command issue, aligned to cacheline */
     unsigned int        flags ____cacheline_aligned; /* WQ: WQ_* flags */
     struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
@@ -288,7 +295,7 @@ static struct workqueue_attrs *wq_update
 static DEFINE_MUTEX(wq_pool_mutex);    /* protects pools and workqueues list */
 static DEFINE_SPINLOCK(wq_mayday_lock);    /* protects wq->maydays list */

-static LIST_HEAD(workqueues);          /* PL: list of all workqueues */
+static LIST_HEAD(workqueues);          /* PR: list of all workqueues */
 static bool workqueue_freezing;        /* PL: have wqs started freezing? */

 /* the per-cpu worker pools */
@@ -3386,6 +3393,20 @@ static int init_worker_pool(struct worke
     return 0;
 }

+static void rcu_free_wq(struct rcu_head *rcu)
+{
+    struct workqueue_struct *wq =
+        container_of(rcu, struct workqueue_struct, rcu);
+
+    if (!(wq->flags & WQ_UNBOUND))
+        free_percpu(wq->cpu_pwqs);
+    else
+        free_workqueue_attrs(wq->unbound_attrs);
+
+    kfree(wq->rescuer);
+    kfree(wq);
+}
+
 static void rcu_free_pool(struct rcu_head *rcu)
 {
     struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
@@ -3563,12 +3584,10 @@ static void pwq_unbound_release_workfn(s

     /*
      * If we're the last pwq going away, @wq is already dead and no one
-     * is gonna access it anymore.  Free it.
+     * is gonna access it anymore.  Schedule RCU free.
      */
-    if (is_last) {
-        free_workqueue_attrs(wq->unbound_attrs);
-        kfree(wq);
-    }
+    if (is_last)
+        call_rcu_sched(&wq->rcu, rcu_free_wq);
 }

 /**
@@ -4105,7 +4124,7 @@ struct workqueue_struct *__alloc_workque
         pwq_adjust_max_active(pwq);
     mutex_unlock(&wq->mutex);

-    list_add(&wq->list, &workqueues);
+    list_add_tail_rcu(&wq->list, &workqueues);

     mutex_unlock(&wq_pool_mutex);

@@ -4161,24 +4180,20 @@ void destroy_workqueue(struct workqueue_
      * flushing is complete in case freeze races us.
      */
     mutex_lock(&wq_pool_mutex);
-    list_del_init(&wq->list);
+    list_del_rcu(&wq->list);
     mutex_unlock(&wq_pool_mutex);

     workqueue_sysfs_unregister(wq);

-    if (wq->rescuer) {
+    if (wq->rescuer)
         kthread_stop(wq->rescuer->task);
-        kfree(wq->rescuer);
-        wq->rescuer = NULL;
-    }

     if (!(wq->flags & WQ_UNBOUND)) {
         /*
          * The base ref is never dropped on per-cpu pwqs.  Directly
-         * free the pwqs and wq.
+         * schedule RCU free.
          */
-        free_percpu(wq->cpu_pwqs);
-        kfree(wq);
+        call_rcu_sched(&wq->rcu, rcu_free_wq);
     } else {
         /*
          * We're the sole accessor of @wq at this point.  Directly
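The walk pattern the above enables looks roughly like the following
sketch.  It is illustrative rather than an excerpt from the series:
the names workqueues and rcu_free_wq() follow kernel/workqueue.c as of
this patch, while walk_workqueues_sketch() itself is hypothetical.

    static void walk_workqueues_sketch(void)
    {
        struct workqueue_struct *wq;

        /* sched-RCU read side: holds off call_rcu_sched() callbacks */
        rcu_read_lock_sched();

        list_for_each_entry_rcu(wq, &workqueues, list) {
            /*
             * @wq may be mid-destruction, but rcu_free_wq() cannot
             * run until the read lock is dropped, so the structure
             * itself stays valid here.
             */
            pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
        }

        rcu_read_unlock_sched();
    }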
* [PATCH wq/for-3.19 2/3] workqueue: keep track of the flushing task and pool manager
From: Tejun Heo @ 2014-12-08 17:44 UTC
To: linux-kernel; +Cc: Lai Jiangshan

Add wq_barrier->task and worker_pool->manager to keep track of the
flushing task and pool manager respectively.  These are purely
informational and will be used to implement sysrq dump of workqueues.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c |    5 +++++
 1 file changed, 5 insertions(+)

--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -159,6 +159,7 @@ struct worker_pool {

     /* see manage_workers() for details on the two manager mutexes */
     struct mutex        manager_arb;    /* manager arbitration */
+    struct worker       *manager;       /* L: purely informational */
     struct mutex        attach_mutex;   /* attach/detach exclusion */
     struct list_head    workers;        /* A: attached workers */
     struct completion   *detach_completion; /* all workers detached */
@@ -1927,9 +1928,11 @@ static bool manage_workers(struct worker
      */
     if (!mutex_trylock(&pool->manager_arb))
         return ret;
+    pool->manager = worker;

     ret |= maybe_create_worker(pool);

+    pool->manager = NULL;
     mutex_unlock(&pool->manager_arb);
     return ret;
 }
@@ -2319,6 +2322,7 @@ repeat:
 struct wq_barrier {
     struct work_struct  work;
     struct completion   done;
+    struct task_struct  *task;  /* purely informational */
 };

 static void wq_barrier_func(struct work_struct *work)
@@ -2367,6 +2371,7 @@ static void insert_wq_barrier(struct poo
     INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
     __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
     init_completion(&barr->done);
+    barr->task = current;

     /*
      * If @target is currently being executed, schedule the
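For context on where barr->task comes from: flush_work() runs in the
flushing task's context, so insert_wq_barrier() can simply record
current.  A simplified sketch of that path follows; flush_work_sketch()
is hypothetical, and the real flush_work() also handles lockdep
annotations and the no-op cases.

    static bool flush_work_sketch(struct work_struct *work)
    {
        struct wq_barrier barr;     /* on-stack barrier work item */

        if (start_flush_work(work, &barr)) {
            /* insert_wq_barrier() ran and set barr.task = current */
            wait_for_completion(&barr.done);
            destroy_work_on_stack(&barr.work);
            return true;
        }
        return false;               /* @work was idle; nothing to flush */
    }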
* [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Tejun Heo @ 2014-12-08 17:47 UTC
To: linux-kernel; +Cc: Lai Jiangshan, Linus Torvalds, Andrew Morton, Ingo Molnar

Workqueues are used extensively throughout the kernel, but sometimes
it's difficult to debug stalls involving work items because visibility
into their inner workings is fairly limited.  Although the sysrq-t task
dump annotates each active worker task with the information on the work
item being executed, it is challenging to find out which work items are
pending or delayed on which queues and how pools are being managed.

This patch implements show_workqueue_state(), which dumps all busy
workqueues and pools and is called from the sysrq-t handler.  At the
end of the sysrq-t dump, something like the following is printed.

  Showing busy workqueues and worker pools:
  ...
  workqueue filler_wq: flags=0x0
    pwq 2: cpu=1 flags=0x0 nice=0 active=2/256
      in-flight: 491:filler_workfn, 507:filler_workfn
    pwq 0: cpu=0 flags=0x0 nice=0 active=2/256
      in-flight: 501:filler_workfn
      pending: filler_workfn
  ...
  workqueue test_wq: flags=0x8
    pwq 2: cpu=1 flags=0x0 nice=0 active=1/1
      in-flight: 510(RESCUER):test_workfn BAR(69) BAR(500)
      delayed: test_workfn1 BAR(492), test_workfn2
  ...
  pool 0: cpu=0 flags=0x0 nice=0 workers=2 manager: 137
  pool 2: cpu=1 flags=0x0 nice=0 workers=3 manager: 469
  pool 3: cpu=1 flags=0x0 nice=-20 workers=2 idle: 16
  pool 8: cpumask=0f flags=0x4 nice=0 workers=2 manager: 62

The above shows that test_wq is executing test_workfn() on pid 510,
which is the rescuer, and also that there are two tasks, 69 and 500,
waiting for the work item to finish in flush_work().  As test_wq has
max_active of 1, there are two work items for test_workfn1() and
test_workfn2() which are delayed till the current work item is
finished.  In addition, pid 492 is flushing test_workfn1().

The work item for test_workfn() is being executed on the pwq of pool 2,
which is the normal-priority per-cpu pool for CPU 1.  The pool has
three workers, two of which are executing filler_workfn() for filler_wq
and the last one of which is assuming the manager role trying to create
more workers.

This extra workqueue state dump will hopefully help in chasing down
hangs involving workqueues.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
CC: Ingo Molnar <mingo@redhat.com>
---
Hello,

If nobody objects, I'll push these three patches to print workqueue
information at the end of the sysrq-t dump through wq/for-3.19.

Thanks.

 drivers/tty/sysrq.c       |    1
 include/linux/workqueue.h |    1
 kernel/workqueue.c        |  168 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)

--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -275,6 +275,7 @@ static struct sysrq_key_op sysrq_showreg
 static void sysrq_handle_showstate(int key)
 {
     show_state();
+    show_workqueue_state();
 }

 static struct sysrq_key_op sysrq_showstate_op = {
     .handler    = sysrq_handle_showstate,
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -457,6 +457,7 @@ extern bool workqueue_congested(int cpu,
 extern unsigned int work_busy(struct work_struct *work);
 extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
 extern void print_worker_info(const char *log_lvl, struct task_struct *task);
+extern void show_workqueue_state(void);

 /**
  * queue_work - queue work on a workqueue
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4419,6 +4419,174 @@ void print_worker_info(const char *log_l
     }
 }

+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+    if (pool->cpu >= 0)
+        pr_cont(" cpu=%d", pool->cpu);
+    else if (pool->node != NUMA_NO_NODE)
+        pr_cont(" node=%d", pool->node);
+
+    if (pool->cpu < 0) {
+        static char cpus_buf[PAGE_SIZE];
+
+        cpumask_scnprintf(cpus_buf, sizeof(cpus_buf),
+                          pool->attrs->cpumask);
+        pr_cont(" cpumask=%s", cpus_buf);
+    }
+    pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+    if (work->func == wq_barrier_func) {
+        struct wq_barrier *barr =
+            container_of(work, struct wq_barrier, work);
+
+        pr_cont("%s BAR(%d)", comma ? "," : "",
+                task_pid_nr(barr->task));
+    } else {
+        pr_cont("%s %pf", comma ? "," : "", work->func);
+    }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+    struct worker_pool *pool = pwq->pool;
+    struct work_struct *work;
+    struct worker *worker;
+    bool has_in_flight = false, has_pending = false;
+    int bkt;
+
+    printk("  pwq %d:", pool->id);
+    pr_cont_pool_info(pool);
+
+    pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+            !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+    hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+        if (worker->current_pwq == pwq) {
+            has_in_flight = true;
+            break;
+        }
+    }
+    if (has_in_flight) {
+        bool comma = false;
+
+        printk("    in-flight:");
+        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+            if (worker->current_pwq != pwq)
+                continue;
+
+            pr_cont("%s %d%s:%pf", comma ? "," : "",
+                    task_pid_nr(worker->task),
+                    worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+                    worker->current_func);
+            list_for_each_entry(work, &worker->scheduled, entry)
+                pr_cont_work(false, work);
+            comma = true;
+        }
+        pr_cont("\n");
+    }
+
+    list_for_each_entry(work, &pool->worklist, entry) {
+        if (get_work_pwq(work) == pwq) {
+            has_pending = true;
+            break;
+        }
+    }
+    if (has_pending) {
+        bool comma = false;
+
+        printk("    pending:");
+        list_for_each_entry(work, &pool->worklist, entry) {
+            if (get_work_pwq(work) != pwq)
+                continue;
+
+            pr_cont_work(comma, work);
+            comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+        }
+        pr_cont("\n");
+    }
+
+    if (!list_empty(&pwq->delayed_works)) {
+        bool comma = false;
+
+        printk("    delayed:");
+        list_for_each_entry(work, &pwq->delayed_works, entry) {
+            pr_cont_work(comma, work);
+            comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+        }
+        pr_cont("\n");
+    }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+    struct workqueue_struct *wq;
+    struct worker_pool *pool;
+    unsigned long flags;
+    int pi;
+
+    rcu_read_lock_sched();
+
+    printk("Showing busy workqueues and worker pools:\n");
+
+    list_for_each_entry_rcu(wq, &workqueues, list) {
+        struct pool_workqueue *pwq;
+        bool idle = true;
+
+        for_each_pwq(pwq, wq) {
+            if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+                idle = false;
+                break;
+            }
+        }
+        if (idle)
+            continue;
+
+        printk("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+        for_each_pwq(pwq, wq) {
+            spin_lock_irqsave(&pwq->pool->lock, flags);
+            if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+                show_pwq(pwq);
+            spin_unlock_irqrestore(&pwq->pool->lock, flags);
+        }
+    }
+
+    for_each_pool(pool, pi) {
+        struct worker *worker;
+        bool first = true;
+
+        spin_lock_irqsave(&pool->lock, flags);
+        if (pool->nr_workers == pool->nr_idle)
+            goto next_pool;
+
+        printk("pool %d:", pool->id);
+        pr_cont_pool_info(pool);
+        pr_cont(" workers=%d", pool->nr_workers);
+        if (pool->manager)
+            pr_cont(" manager: %d",
+                    task_pid_nr(pool->manager->task));
+        list_for_each_entry(worker, &pool->idle_list, entry) {
+            pr_cont(" %s%d", first ? "idle: " : "",
+                    task_pid_nr(worker->task));
+            first = false;
+        }
+        pr_cont("\n");
+    next_pool:
+        spin_unlock_irqrestore(&pool->lock, flags);
+    }
+
+    rcu_read_unlock_sched();
+}
+
 /*
  * CPU hotplug.
  *
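Besides the keyboard chord, the handler is reachable from userspace via
"echo t > /proc/sysrq-trigger".  And since show_workqueue_state() is
declared in workqueue.h but not EXPORT_SYMBOL'd by this patch, built-in
code can also call it from its own debug paths; a hypothetical example
(mydrv_timeout_debug() is made up for illustration):

    /* Hypothetical built-in caller; not part of this series. */
    static void mydrv_timeout_debug(void)
    {
        pr_err("mydrv: request timed out, dumping workqueue state\n");
        show_workqueue_state();
    }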
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Andrew Morton @ 2014-12-08 18:06 UTC
To: Tejun Heo; +Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar

On Mon, 8 Dec 2014 12:47:33 -0500 Tejun Heo <tj@kernel.org> wrote:

>
> ...
>
> This patch implements show_workqueue_state(), which dumps all busy
> workqueues and pools and is called from the sysrq-t handler.  At the
> end of the sysrq-t dump, something like the following is printed.

Seems sensible.

sysrq-t already produces thousands of lines of output.  Maybe create a
new keycode for this?

>
> ...
>
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4419,6 +4419,174 @@ void print_worker_info(const char *log_l
>      }
>  }
>
> +static void pr_cont_pool_info(struct worker_pool *pool)
> +{
> +    if (pool->cpu >= 0)
> +        pr_cont(" cpu=%d", pool->cpu);
> +    else if (pool->node != NUMA_NO_NODE)
> +        pr_cont(" node=%d", pool->node);
> +
> +    if (pool->cpu < 0) {
> +        static char cpus_buf[PAGE_SIZE];

Ouch.  This could be [NR_CPUS + epsilon]?

> +        cpumask_scnprintf(cpus_buf, sizeof(cpus_buf),
> +                          pool->attrs->cpumask);
> +        pr_cont(" cpumask=%s", cpus_buf);
> +    }
> +    pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
> +}
> +
> +static void pr_cont_work(bool comma, struct work_struct *work)
> +{
> +    if (work->func == wq_barrier_func) {
> +        struct wq_barrier *barr =
> +            container_of(work, struct wq_barrier, work);

Can avoid the 80-col contortions with

    struct wq_barrier *barr;

    barr = container_of(work, struct wq_barrier, work);

> +        pr_cont("%s BAR(%d)", comma ? "," : "",
> +                task_pid_nr(barr->task));
> +    } else {
> +        pr_cont("%s %pf", comma ? "," : "", work->func);
> +    }
> +}
> +
> +static void show_pwq(struct pool_workqueue *pwq)
> +{
> +    struct worker_pool *pool = pwq->pool;
> +    struct work_struct *work;
> +    struct worker *worker;
> +    bool has_in_flight = false, has_pending = false;
> +    int bkt;
> +
> +    printk("  pwq %d:", pool->id);
> +    pr_cont_pool_info(pool);
> +
> +    pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
> +            !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
> +
> +    hash_for_each(pool->busy_hash, bkt, worker, hentry) {
> +        if (worker->current_pwq == pwq) {
> +            has_in_flight = true;
> +            break;
> +        }
> +    }
> +    if (has_in_flight) {
> +        bool comma = false;
> +
> +        printk("    in-flight:");

pr_something?  show_state() uses KERN_INFO, which may or may not be
appropriate.

> +        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
> +            if (worker->current_pwq != pwq)
> +                continue;
> +
> +            pr_cont("%s %d%s:%pf", comma ? "," : "",
> +                    task_pid_nr(worker->task),
> +                    worker == pwq->wq->rescuer ? "(RESCUER)" : "",
> +                    worker->current_func);
> +            list_for_each_entry(work, &worker->scheduled, entry)
> +                pr_cont_work(false, work);
> +            comma = true;
> +        }
> +        pr_cont("\n");
> +    }
> +
> +    list_for_each_entry(work, &pool->worklist, entry) {
> +        if (get_work_pwq(work) == pwq) {
> +            has_pending = true;
> +            break;
> +        }
> +    }
> +    if (has_pending) {
> +        bool comma = false;
> +
> +        printk("    pending:");

ditto

> +        list_for_each_entry(work, &pool->worklist, entry) {
> +            if (get_work_pwq(work) != pwq)
> +                continue;
> +
> +            pr_cont_work(comma, work);
> +            comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
> +        }
> +        pr_cont("\n");
> +    }
> +
> +    if (!list_empty(&pwq->delayed_works)) {
> +        bool comma = false;
> +
> +        printk("    delayed:");

ditto

> +        list_for_each_entry(work, &pwq->delayed_works, entry) {
> +            pr_cont_work(comma, work);
> +            comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
> +        }
> +        pr_cont("\n");
> +    }
> +}
> +
>
> ...
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Tejun Heo @ 2014-12-08 18:40 UTC
To: Andrew Morton; +Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar

Hello, Andrew.

On Mon, Dec 08, 2014 at 10:06:13AM -0800, Andrew Morton wrote:
> sysrq-t already produces thousands of lines of output.  Maybe create a
> new keycode for this?

Believe it or not, we already used up all the alphanumerics if we count
in the arch-specific ones.  Given that the workqueue information would
primarily be useful in tracking down hangs and we'd want to see the
dump of tasks in that case anyway, sysrq-t isn't a bad fit for
appending the workqueue dump.  If anybody has a better idea, I'm all
ears.

...

> > +static void pr_cont_pool_info(struct worker_pool *pool)
> > +{
> > +    if (pool->cpu >= 0)
> > +        pr_cont(" cpu=%d", pool->cpu);
> > +    else if (pool->node != NUMA_NO_NODE)
> > +        pr_cont(" node=%d", pool->node);
> > +
> > +    if (pool->cpu < 0) {
> > +        static char cpus_buf[PAGE_SIZE];
>
> Ouch.  This could be [NR_CPUS + epsilon]?

It's bitmap mask printing, so each char can show four cpus.  PAGE_SIZE
should be enough for now, but I think we need cpumask_pr_cont().

> > +static void pr_cont_work(bool comma, struct work_struct *work)
> > +{
> > +    if (work->func == wq_barrier_func) {
> > +        struct wq_barrier *barr =
> > +            container_of(work, struct wq_barrier, work);
>
> Can avoid the 80-col contortions with
>
>     struct wq_barrier *barr;
>
>     barr = container_of(work, struct wq_barrier, work);

I'm not sure either is any better, but sure.

> > +    if (has_in_flight) {
> > +        bool comma = false;
> > +
> > +        printk("    in-flight:");
>
> pr_something?  show_state() uses KERN_INFO, which may or may not be
> appropriate.

Hmmm, best to match show_state().  I'll convert to pr_info().

Thanks.

--
tejun
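For reference, a dedicated slot would only take another sysrq_key_op
table entry, roughly as sketched below; the key itself is left
hypothetical since, as noted above, the real ones are all taken.

    static void sysrq_handle_showwq(int key)
    {
        show_workqueue_state();
    }

    static struct sysrq_key_op sysrq_showwq_op = {
        .handler    = sysrq_handle_showwq,
        .help_msg   = "show-workqueue-state(?)",  /* '?' = no free key */
        .action_msg = "Show Workqueue State",
    };

    /* register_sysrq_key('?', &sysrq_showwq_op); -- if a key were free */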
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Andrew Morton @ 2014-12-08 19:05 UTC
To: Tejun Heo; +Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar

On Mon, 8 Dec 2014 13:40:35 -0500 Tejun Heo <tj@kernel.org> wrote:

> Hello, Andrew.
>
> On Mon, Dec 08, 2014 at 10:06:13AM -0800, Andrew Morton wrote:
> > sysrq-t already produces thousands of lines of output.  Maybe create a
> > new keycode for this?
>
> Believe it or not, we already used up all the alphanumerics if we count
> in the arch-specific ones.  Given that the workqueue information would
> primarily be useful in tracking down hangs and we'd want to see the
> dump of tasks in that case anyway, sysrq-t isn't a bad fit for
> appending the workqueue dump.  If anybody has a better idea, I'm all
> ears.

Really.  Upper case?

> ...
> > > +static void pr_cont_pool_info(struct worker_pool *pool)
> > > +{
> > > +    if (pool->cpu >= 0)
> > > +        pr_cont(" cpu=%d", pool->cpu);
> > > +    else if (pool->node != NUMA_NO_NODE)
> > > +        pr_cont(" node=%d", pool->node);
> > > +
> > > +    if (pool->cpu < 0) {
> > > +        static char cpus_buf[PAGE_SIZE];
> >
> > Ouch.  This could be [NR_CPUS + epsilon]?
>
> It's bitmap mask printing, so each char can show four cpus.  PAGE_SIZE
> should be enough for now, but I think we need cpumask_pr_cont().

I'm not concerned about it being too small ;) Not many people have 16k
CPUs - can it be shrunk?  It's particularly gross when CONFIG_SMP=n!
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Tejun Heo @ 2014-12-08 19:22 UTC
To: Andrew Morton
Cc: linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar, Greg Kroah-Hartman

(cc'ing Greg for tty)

On Mon, Dec 08, 2014 at 11:05:15AM -0800, Andrew Morton wrote:
> > Believe it or not, we already used up all the alphanumerics if we count
> > in the arch-specific ones.  Given that the workqueue information would
> > primarily be useful in tracking down hangs and we'd want to see the
> > dump of tasks in that case anyway, sysrq-t isn't a bad fit for
> > appending the workqueue dump.  If anybody has a better idea, I'm all
> > ears.
>
> Really.  Upper case?

Greg, would using uppercase chars for sysrq work over the different
types of ttys?

> > > > +static void pr_cont_pool_info(struct worker_pool *pool)
> > > > +{
> > > > +    if (pool->cpu >= 0)
> > > > +        pr_cont(" cpu=%d", pool->cpu);
> > > > +    else if (pool->node != NUMA_NO_NODE)
> > > > +        pr_cont(" node=%d", pool->node);
> > > > +
> > > > +    if (pool->cpu < 0) {
> > > > +        static char cpus_buf[PAGE_SIZE];
> > >
> > > Ouch.  This could be [NR_CPUS + epsilon]?
> >
> > It's bitmap mask printing, so each char can show four cpus.  PAGE_SIZE
> > should be enough for now, but I think we need cpumask_pr_cont().
>
> I'm not concerned about it being too small ;) Not many people have 16k
> CPUs - can it be shrunk?  It's particularly gross when CONFIG_SMP=n!

Heh, lemme just go ahead and implement bitmap_pr_cont() and
cpumask_pr_cont().

Thanks.

--
tejun
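The %*pb / %*pbl printf extensions that this work eventually turned
into need no intermediate buffer at all; a minimal sketch of the call
that ends up replacing cpumask_scnprintf() follows.  pr_cont_cpumask()
is a made-up wrapper, but the format usage matches the v3 patch below
(the extension is only available once %pb[l] support is in mainline).

    static void pr_cont_cpumask(const struct cpumask *mask)
    {
        /*
         * %*pbl consumes (nbits, bitmap) and prints a cpu list such
         * as "0-3,8"; %*pb would print the hex mask instead.  No
         * buffer, so nothing to size against NR_CPUS.
         */
        pr_cont(" cpus=%*pbl", nr_cpumask_bits, cpumask_bits(mask));
    }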
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Greg Kroah-Hartman @ 2014-12-10 4:50 UTC
To: Tejun Heo, linux-serial
Cc: Andrew Morton, linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar

On Mon, Dec 08, 2014 at 02:22:29PM -0500, Tejun Heo wrote:
> (cc'ing Greg for tty)
>
> On Mon, Dec 08, 2014 at 11:05:15AM -0800, Andrew Morton wrote:
> > > Believe it or not, we already used up all the alphanumerics if we count
> > > in the arch-specific ones.  Given that the workqueue information would
> > > primarily be useful in tracking down hangs and we'd want to see the
> > > dump of tasks in that case anyway, sysrq-t isn't a bad fit for
> > > appending the workqueue dump.  If anybody has a better idea, I'm all
> > > ears.
> >
> > Really.  Upper case?
>
> Greg, would using uppercase chars for sysrq work over the different
> types of ttys?

I'm dragging in linux-serial@vger here also, but I think uppercase
characters will work from a tty standpoint.  I don't know about
keyboard scancodes, whether they will do "odd" things when a shift is
wanted together with the sysrq key.

Oh wait, I think that might be it: shift is needed for the sysrq key on
the keyboard to start with, right?  So there probably isn't a way to
test the difference of a lower/upper case key here.

I'm traveling this week and don't have access to a "real" keyboard at
the moment, but this should be pretty easy for someone to test who has
one and cares about this type of thing {hint}.

thanks,

greg k-h
* Re: [PATCH wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Tejun Heo @ 2014-12-10 18:34 UTC
To: Greg Kroah-Hartman
Cc: linux-serial, Andrew Morton, linux-kernel, Lai Jiangshan, Linus Torvalds, Ingo Molnar

Hello, Greg.

On Tue, Dec 09, 2014 at 11:50:57PM -0500, Greg Kroah-Hartman wrote:
> Oh wait, I think that might be it: shift is needed for the sysrq key on
> the keyboard to start with, right?  So there probably isn't a way to

On x86, the combo is alt-sysrq, so shift can theoretically be added.

> test the difference of a lower/upper case key here.
>
> I'm traveling this week and don't have access to a "real" keyboard at
> the moment, but this should be pretty easy for someone to test who has
> one and cares about this type of thing {hint}.

Just tested on a PS/2 port attached keyboard and it doesn't go through.
I'm not sure where this is getting lost, but given that people
sometimes have to use ctrl too to invoke sysrq, throwing in shift makes
it a four-key combo, which is likely to be over the rollover
capabilities of most keyboards.  It doesn't seem like a good path to
follow to me. :(

Thanks.

--
tejun
* [PATCH v3 wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Tejun Heo @ 2015-03-09 13:28 UTC
To: linux-kernel; +Cc: Lai Jiangshan, Linus Torvalds, Andrew Morton, Ingo Molnar

Hello,

Now that %pb[l] formatting is in mainline, this can go forward.
Here's the updated patch, which uses printf formatting instead of the
fixed buffer when printing cpus.  I couldn't find a better option at
the moment and am still appending the output to sysrq-t.  This only
prints out the workqueues and pools with busy work items, so the
amount of extra output should be fairly low in general.  If nobody
objects, I'll route these through wq/for-4.1.

Thanks.

------ 8< ------
Workqueues are used extensively throughout the kernel, but sometimes
it's difficult to debug stalls involving work items because visibility
into their inner workings is fairly limited.  Although the sysrq-t task
dump annotates each active worker task with the information on the work
item being executed, it is challenging to find out which work items are
pending or delayed on which queues and how pools are being managed.

This patch implements show_workqueue_state(), which dumps all busy
workqueues and pools and is called from the sysrq-t handler.  At the
end of the sysrq-t dump, something like the following is printed.

  Showing busy workqueues and worker pools:
  ...
  workqueue filler_wq: flags=0x0
    pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=2/256
      in-flight: 491:filler_workfn, 507:filler_workfn
    pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256
      in-flight: 501:filler_workfn
      pending: filler_workfn
  ...
  workqueue test_wq: flags=0x8
    pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1
      in-flight: 510(RESCUER):test_workfn BAR(69) BAR(500)
      delayed: test_workfn1 BAR(492), test_workfn2
  ...
  pool 0: cpus=0 node=0 flags=0x0 nice=0 workers=2 manager: 137
  pool 2: cpus=1 node=0 flags=0x0 nice=0 workers=3 manager: 469
  pool 3: cpus=1 node=0 flags=0x0 nice=-20 workers=2 idle: 16
  pool 8: cpus=0-3 flags=0x4 nice=0 workers=2 manager: 62

The above shows that test_wq is executing test_workfn() on pid 510,
which is the rescuer, and also that there are two tasks, 69 and 500,
waiting for the work item to finish in flush_work().  As test_wq has
max_active of 1, there are two work items for test_workfn1() and
test_workfn2() which are delayed till the current work item is
finished.  In addition, pid 492 is flushing test_workfn1().

The work item for test_workfn() is being executed on the pwq of pool 2,
which is the normal-priority per-cpu pool for CPU 1.  The pool has
three workers, two of which are executing filler_workfn() for filler_wq
and the last one of which is assuming the manager role trying to create
more workers.

This extra workqueue state dump will hopefully help in chasing down
hangs involving workqueues.

v3: cpulist_pr_cont() replaced with "%*pbl" printf formatting.

v2: As suggested by Andrew, minor formatting change in pr_cont_work(),
    printk()'s replaced with pr_info()'s, and cpumask printing now
    uses cpulist_pr_cont().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
CC: Ingo Molnar <mingo@redhat.com>
---
 drivers/tty/sysrq.c       |    1
 include/linux/workqueue.h |    1
 kernel/workqueue.c        |  160 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 162 insertions(+)

--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -275,6 +275,7 @@ static struct sysrq_key_op sysrq_showreg
 static void sysrq_handle_showstate(int key)
 {
     show_state();
+    show_workqueue_state();
 }

 static struct sysrq_key_op sysrq_showstate_op = {
     .handler    = sysrq_handle_showstate,
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -453,6 +453,7 @@ extern bool workqueue_congested(int cpu,
 extern unsigned int work_busy(struct work_struct *work);
 extern __printf(1, 2) void set_worker_desc(const char *fmt, ...);
 extern void print_worker_info(const char *log_lvl, struct task_struct *task);
+extern void show_workqueue_state(void);

 /**
  * queue_work - queue work on a workqueue
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4409,6 +4409,166 @@ void print_worker_info(const char *log_l
     }
 }

+static void pr_cont_pool_info(struct worker_pool *pool)
+{
+    pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask);
+    if (pool->node != NUMA_NO_NODE)
+        pr_cont(" node=%d", pool->node);
+    pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice);
+}
+
+static void pr_cont_work(bool comma, struct work_struct *work)
+{
+    if (work->func == wq_barrier_func) {
+        struct wq_barrier *barr;
+
+        barr = container_of(work, struct wq_barrier, work);
+
+        pr_cont("%s BAR(%d)", comma ? "," : "",
+                task_pid_nr(barr->task));
+    } else {
+        pr_cont("%s %pf", comma ? "," : "", work->func);
+    }
+}
+
+static void show_pwq(struct pool_workqueue *pwq)
+{
+    struct worker_pool *pool = pwq->pool;
+    struct work_struct *work;
+    struct worker *worker;
+    bool has_in_flight = false, has_pending = false;
+    int bkt;
+
+    pr_info("  pwq %d:", pool->id);
+    pr_cont_pool_info(pool);
+
+    pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active,
+            !list_empty(&pwq->mayday_node) ? " MAYDAY" : "");
+
+    hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+        if (worker->current_pwq == pwq) {
+            has_in_flight = true;
+            break;
+        }
+    }
+    if (has_in_flight) {
+        bool comma = false;
+
+        pr_info("    in-flight:");
+        hash_for_each(pool->busy_hash, bkt, worker, hentry) {
+            if (worker->current_pwq != pwq)
+                continue;
+
+            pr_cont("%s %d%s:%pf", comma ? "," : "",
+                    task_pid_nr(worker->task),
+                    worker == pwq->wq->rescuer ? "(RESCUER)" : "",
+                    worker->current_func);
+            list_for_each_entry(work, &worker->scheduled, entry)
+                pr_cont_work(false, work);
+            comma = true;
+        }
+        pr_cont("\n");
+    }
+
+    list_for_each_entry(work, &pool->worklist, entry) {
+        if (get_work_pwq(work) == pwq) {
+            has_pending = true;
+            break;
+        }
+    }
+    if (has_pending) {
+        bool comma = false;
+
+        pr_info("    pending:");
+        list_for_each_entry(work, &pool->worklist, entry) {
+            if (get_work_pwq(work) != pwq)
+                continue;
+
+            pr_cont_work(comma, work);
+            comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+        }
+        pr_cont("\n");
+    }
+
+    if (!list_empty(&pwq->delayed_works)) {
+        bool comma = false;
+
+        pr_info("    delayed:");
+        list_for_each_entry(work, &pwq->delayed_works, entry) {
+            pr_cont_work(comma, work);
+            comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED);
+        }
+        pr_cont("\n");
+    }
+}
+
+/**
+ * show_workqueue_state - dump workqueue state
+ *
+ * Called from a sysrq handler and prints out all busy workqueues and
+ * pools.
+ */
+void show_workqueue_state(void)
+{
+    struct workqueue_struct *wq;
+    struct worker_pool *pool;
+    unsigned long flags;
+    int pi;
+
+    rcu_read_lock_sched();
+
+    pr_info("Showing busy workqueues and worker pools:\n");
+
+    list_for_each_entry_rcu(wq, &workqueues, list) {
+        struct pool_workqueue *pwq;
+        bool idle = true;
+
+        for_each_pwq(pwq, wq) {
+            if (pwq->nr_active || !list_empty(&pwq->delayed_works)) {
+                idle = false;
+                break;
+            }
+        }
+        if (idle)
+            continue;
+
+        pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+
+        for_each_pwq(pwq, wq) {
+            spin_lock_irqsave(&pwq->pool->lock, flags);
+            if (pwq->nr_active || !list_empty(&pwq->delayed_works))
+                show_pwq(pwq);
+            spin_unlock_irqrestore(&pwq->pool->lock, flags);
+        }
+    }
+
+    for_each_pool(pool, pi) {
+        struct worker *worker;
+        bool first = true;
+
+        spin_lock_irqsave(&pool->lock, flags);
+        if (pool->nr_workers == pool->nr_idle)
+            goto next_pool;
+
+        pr_info("pool %d:", pool->id);
+        pr_cont_pool_info(pool);
+        pr_cont(" workers=%d", pool->nr_workers);
+        if (pool->manager)
+            pr_cont(" manager: %d",
+                    task_pid_nr(pool->manager->task));
+        list_for_each_entry(worker, &pool->idle_list, entry) {
+            pr_cont(" %s%d", first ? "idle: " : "",
+                    task_pid_nr(worker->task));
+            first = false;
+        }
+        pr_cont("\n");
+    next_pool:
+        spin_unlock_irqrestore(&pool->lock, flags);
+    }
+
+    rcu_read_unlock_sched();
+}
+
 /*
  * CPU hotplug.
  *
* Re: [PATCH v3 wq/for-3.19 3/3] workqueue: dump workqueues on sysrq-t
From: Tejun Heo @ 2015-03-10 12:58 UTC
To: linux-kernel; +Cc: Lai Jiangshan, Linus Torvalds, Andrew Morton, Ingo Molnar

On Mon, Mar 09, 2015 at 09:28:28AM -0400, Tejun Heo wrote:
> Now that %pb[l] formatting is in mainline, this can go forward.
> Here's the updated patch, which uses printf formatting instead of the
> fixed buffer when printing cpus.  I couldn't find a better option at
> the moment and am still appending the output to sysrq-t.  This only
> prints out the workqueues and pools with busy work items, so the
> amount of extra output should be fairly low in general.  If nobody
> objects, I'll route these through wq/for-4.1.

Applied to wq/for-4.1.

Thanks.

--
tejun