LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4
@ 2015-03-12  5:00 Lai Jiangshan
  2015-03-12  5:00 ` [PATCH 1/4] workqueue: Reorder sysfs code Lai Jiangshan
                   ` (6 more replies)
  0 siblings, 7 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-12  5:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

This patchset mostly copies from Frederic and splits apply_workqueue_attrs()
as TJ suggested.

This patchset doesn't include the patch "workqueue: Allow changing attributions
of ordered workqueues"; I hope this reduces the review burden. The handling
for ordered workqueues will be reposted after this patchset is accepted.

Thanks,
Lai

Frederic Weisbecker (2):
  workqueue: Reorder sysfs code
  workqueue: Create low-level unbound workqueues cpumask

Lai Jiangshan (2):
  workqueue: split apply_workqueue_attrs() into 3 stages
  workqueue: Allow modifying low level unbound workqueue cpumask

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>

 kernel/workqueue.c | 942 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 539 insertions(+), 403 deletions(-)

-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/4] workqueue: Reorder sysfs code
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
@ 2015-03-12  5:00 ` Lai Jiangshan
  2015-03-12  5:00 ` [PATCH 2/4] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-12  5:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

The sysfs code usually belongs to the bottom of the file since it deals
with high level objects. In the workqueue code it's misplaced, such
that we need to work around function references to allow the sysfs
code to call APIs like apply_workqueue_attrs().

Let's move that block further down in the file, right above
alloc_workqueue_key() which references it.

Suggested-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 634 ++++++++++++++++++++++++++---------------------------
 1 file changed, 317 insertions(+), 317 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ca0b1d..c6845fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3001,323 +3001,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 }
 EXPORT_SYMBOL_GPL(execute_in_process_context);
 
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
- * following attributes.
- *
- *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
- *  max_active	RW int	: maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- *  id		RO int	: the associated pool ID
- *  nice	RW int	: nice value of the workers
- *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
- */
-struct wq_device {
-	struct workqueue_struct		*wq;
-	struct device			dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
-	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-	return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
-				struct device_attribute *attr, const char *buf,
-				size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int val;
-
-	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
-		return -EINVAL;
-
-	workqueue_set_max_active(wq, val);
-	return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
-	&dev_attr_per_cpu.attr,
-	&dev_attr_max_active.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	const char *delim = "";
-	int node, written = 0;
-
-	rcu_read_lock_sched();
-	for_each_node(node) {
-		written += scnprintf(buf + written, PAGE_SIZE - written,
-				     "%s%d:%d", delim, node,
-				     unbound_pwq_by_node(wq, node)->pool->id);
-		delim = " ";
-	}
-	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-	rcu_read_unlock_sched();
-
-	return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
-	mutex_unlock(&wq->mutex);
-
-	return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
-	struct workqueue_attrs *attrs;
-
-	attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!attrs)
-		return NULL;
-
-	mutex_lock(&wq->mutex);
-	copy_workqueue_attrs(attrs, wq->unbound_attrs);
-	mutex_unlock(&wq->mutex);
-	return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
-	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
-		ret = apply_workqueue_attrs(wq, attrs);
-	else
-		ret = -EINVAL;
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			    cpumask_pr_args(wq->unbound_attrs->cpumask));
-	mutex_unlock(&wq->mutex);
-	return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	ret = cpumask_parse(buf, attrs->cpumask);
-	if (!ret)
-		ret = apply_workqueue_attrs(wq, attrs);
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%d\n",
-			    !wq->unbound_attrs->no_numa);
-	mutex_unlock(&wq->mutex);
-
-	return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int v, ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	ret = -EINVAL;
-	if (sscanf(buf, "%d", &v) == 1) {
-		attrs->no_numa = !v;
-		ret = apply_workqueue_attrs(wq, attrs);
-	}
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
-	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
-	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
-	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
-	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
-	__ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
-	.name				= "workqueue",
-	.dev_groups			= wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
-	return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
-	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-	kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
-	struct wq_device *wq_dev;
-	int ret;
-
-	/*
-	 * Adjusting max_active or creating new pwqs by applyting
-	 * attributes breaks ordering guarantee.  Disallow exposing ordered
-	 * workqueues.
-	 */
-	if (WARN_ON(wq->flags & __WQ_ORDERED))
-		return -EINVAL;
-
-	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
-	if (!wq_dev)
-		return -ENOMEM;
-
-	wq_dev->wq = wq;
-	wq_dev->dev.bus = &wq_subsys;
-	wq_dev->dev.init_name = wq->name;
-	wq_dev->dev.release = wq_device_release;
-
-	/*
-	 * unbound_attrs are created separately.  Suppress uevent until
-	 * everything is ready.
-	 */
-	dev_set_uevent_suppress(&wq_dev->dev, true);
-
-	ret = device_register(&wq_dev->dev);
-	if (ret) {
-		kfree(wq_dev);
-		wq->wq_dev = NULL;
-		return ret;
-	}
-
-	if (wq->flags & WQ_UNBOUND) {
-		struct device_attribute *attr;
-
-		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
-			ret = device_create_file(&wq_dev->dev, attr);
-			if (ret) {
-				device_unregister(&wq_dev->dev);
-				wq->wq_dev = NULL;
-				return ret;
-			}
-		}
-	}
-
-	dev_set_uevent_suppress(&wq_dev->dev, false);
-	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
-	return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
-	struct wq_device *wq_dev = wq->wq_dev;
-
-	if (!wq->wq_dev)
-		return;
-
-	wq->wq_dev = NULL;
-	device_unregister(&wq_dev->dev);
-}
-#else	/* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
-#endif	/* CONFIG_SYSFS */
-
 /**
  * free_workqueue_attrs - free a workqueue_attrs
  * @attrs: workqueue_attrs to free
@@ -4029,6 +3712,323 @@ out_unlock:
 	put_pwq_unlocked(old_pwq);
 }
 
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
+ * following attributes.
+ *
+ *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
+ *  max_active	RW int	: maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ *  id		RO int	: the associated pool ID
+ *  nice	RW int	: nice value of the workers
+ *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+	struct workqueue_struct		*wq;
+	struct device			dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+	return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int val;
+
+	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+		return -EINVAL;
+
+	workqueue_set_max_active(wq, val);
+	return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+	&dev_attr_per_cpu.attr,
+	&dev_attr_max_active.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	const char *delim = "";
+	int node, written = 0;
+
+	rcu_read_lock_sched();
+	for_each_node(node) {
+		written += scnprintf(buf + written, PAGE_SIZE - written,
+				     "%s%d:%d", delim, node,
+				     unbound_pwq_by_node(wq, node)->pool->id);
+		delim = " ";
+	}
+	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+	rcu_read_unlock_sched();
+
+	return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+	struct workqueue_attrs *attrs;
+
+	attrs = alloc_workqueue_attrs(GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+
+	mutex_lock(&wq->mutex);
+	copy_workqueue_attrs(attrs, wq->unbound_attrs);
+	mutex_unlock(&wq->mutex);
+	return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+		ret = apply_workqueue_attrs(wq, attrs);
+	else
+		ret = -EINVAL;
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq->unbound_attrs->cpumask));
+	mutex_unlock(&wq->mutex);
+	return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, attrs->cpumask);
+	if (!ret)
+		ret = apply_workqueue_attrs(wq, attrs);
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n",
+			    !wq->unbound_attrs->no_numa);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int v, ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = -EINVAL;
+	if (sscanf(buf, "%d", &v) == 1) {
+		attrs->no_numa = !v;
+		ret = apply_workqueue_attrs(wq, attrs);
+	}
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+	__ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+	.name				= "workqueue",
+	.dev_groups			= wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+	return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+	kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev;
+	int ret;
+
+	/*
+	 * Adjusting max_active or creating new pwqs by applyting
+	 * attributes breaks ordering guarantee.  Disallow exposing ordered
+	 * workqueues.
+	 */
+	if (WARN_ON(wq->flags & __WQ_ORDERED))
+		return -EINVAL;
+
+	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+	if (!wq_dev)
+		return -ENOMEM;
+
+	wq_dev->wq = wq;
+	wq_dev->dev.bus = &wq_subsys;
+	wq_dev->dev.init_name = wq->name;
+	wq_dev->dev.release = wq_device_release;
+
+	/*
+	 * unbound_attrs are created separately.  Suppress uevent until
+	 * everything is ready.
+	 */
+	dev_set_uevent_suppress(&wq_dev->dev, true);
+
+	ret = device_register(&wq_dev->dev);
+	if (ret) {
+		kfree(wq_dev);
+		wq->wq_dev = NULL;
+		return ret;
+	}
+
+	if (wq->flags & WQ_UNBOUND) {
+		struct device_attribute *attr;
+
+		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+			ret = device_create_file(&wq_dev->dev, attr);
+			if (ret) {
+				device_unregister(&wq_dev->dev);
+				wq->wq_dev = NULL;
+				return ret;
+			}
+		}
+	}
+
+	dev_set_uevent_suppress(&wq_dev->dev, false);
+	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+	return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev = wq->wq_dev;
+
+	if (!wq->wq_dev)
+		return;
+
+	wq->wq_dev = NULL;
+	device_unregister(&wq_dev->dev);
+}
+#else	/* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
+#endif	/* CONFIG_SYSFS */
+
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
 	bool highpri = wq->flags & WQ_HIGHPRI;
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 2/4] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
  2015-03-12  5:00 ` [PATCH 1/4] workqueue: Reorder sysfs code Lai Jiangshan
@ 2015-03-12  5:00 ` Lai Jiangshan
  2015-03-12  5:00 ` [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-12  5:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Current apply_workqueue_attrs() includes pwqs-allocation and pwqs-installation,
so when we batch multiple apply_workqueue_attrs()s as a transaction, we can't
ensure the transaction must succeed or fail as a complete unit.

To solve this, we split apply_workqueue_attrs() into three stages.
The first stage does the preparation: allocating memory and pwqs.
The second stage does the attrs-installation and pwqs-installation.
The third stage frees the allocated memory and the (old or unused) pwqs.

As a result, batching multiple apply_workqueue_attrs() calls can
succeed or fail as a complete unit:
	1) batch do all the first stage for all the workqueues
	2) only commit all when all the above succeed.

This patch is a preparation for the next patch ("Allow modifying low level
unbound workqueue cpumask") which will do a multiple apply_workqueue_attrs().

The patch has no functional change except two minor adjustments:
	1) free_unbound_pwq() for the error path is removed; we use the
	   heavier version put_pwq_unlocked() instead since the error path
	   is rare. This adjustment simplifies the code.
	2) the memory allocation is also moved inside wq_pool_mutex.
	   This is needed to avoid further splitting.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 199 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 115 insertions(+), 84 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6845fa..957b244 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3424,17 +3424,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
 	return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-	lockdep_assert_held(&wq_pool_mutex);
-
-	if (pwq) {
-		put_unbound_pool(pwq->pool);
-		kmem_cache_free(pwq_cache, pwq);
-	}
-}
-
 /**
  * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of interest
@@ -3497,42 +3486,46 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 	return old_pwq;
 }
 
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-			  const struct workqueue_attrs *attrs)
+struct wq_unbound_install_ctx {
+	struct workqueue_struct	*wq;	/* target to be installed */
+	struct workqueue_attrs	*attrs;	/* attrs for installing */
+	struct pool_workqueue	*dfl_pwq;
+	struct pool_workqueue	*pwq_tbl[];
+};
+
+static void wq_unbound_install_ctx_free(struct wq_unbound_install_ctx *ctx)
 {
+	int node;
+
+	if (ctx) {
+		/* put the pwqs */
+		for_each_node(node)
+			put_pwq_unlocked(ctx->pwq_tbl[node]);
+		put_pwq_unlocked(ctx->dfl_pwq);
+
+		free_workqueue_attrs(ctx->attrs);
+	}
+
+	kfree(ctx);
+}
+
+static struct wq_unbound_install_ctx *
+wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
+			       const struct workqueue_attrs *attrs)
+{
+	struct wq_unbound_install_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
-	struct pool_workqueue **pwq_tbl, *dfl_pwq;
-	int node, ret;
+	int node;
 
-	/* only unbound workqueues can change attributes */
-	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-		return -EINVAL;
+	lockdep_assert_held(&wq_pool_mutex);
 
-	/* creating multiple pwqs breaks ordering guarantee */
-	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+		      GFP_KERNEL);
 
-	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!pwq_tbl || !new_attrs || !tmp_attrs)
-		goto enomem;
+	if (!ctx || !new_attrs || !tmp_attrs)
+		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
@@ -3546,75 +3539,113 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	copy_workqueue_attrs(tmp_attrs, new_attrs);
 
 	/*
-	 * CPUs should stay stable across pwq creations and installations.
-	 * Pin CPUs, determine the target cpumask for each node and create
-	 * pwqs accordingly.
-	 */
-	get_online_cpus();
-
-	mutex_lock(&wq_pool_mutex);
-
-	/*
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
 	 */
-	dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-	if (!dfl_pwq)
-		goto enomem_pwq;
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (!ctx->dfl_pwq)
+		goto out_free;
 
 	for_each_node(node) {
 		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-			pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-			if (!pwq_tbl[node])
-				goto enomem_pwq;
+			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+			if (!ctx->pwq_tbl[node])
+				goto out_free;
 		} else {
-			dfl_pwq->refcnt++;
-			pwq_tbl[node] = dfl_pwq;
+			ctx->dfl_pwq->refcnt++;
+			ctx->pwq_tbl[node] = ctx->dfl_pwq;
 		}
 	}
 
-	mutex_unlock(&wq_pool_mutex);
+	ctx->wq = wq;
+	ctx->attrs = new_attrs;
+
+out_free:
+	free_workqueue_attrs(tmp_attrs);
+
+	if (!ctx || !ctx->wq) {
+		kfree(new_attrs);
+		wq_unbound_install_ctx_free(ctx);
+		return NULL;
+	} else {
+		return ctx;
+	}
+}
+
+static void wq_unbound_install_ctx_commit(struct wq_unbound_install_ctx *ctx)
+{
+	int node;
 
 	/* all pwqs have been created successfully, let's install'em */
-	mutex_lock(&wq->mutex);
+	mutex_lock(&ctx->wq->mutex);
 
-	copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
 
 	/* save the previous pwq and install the new one */
 	for_each_node(node)
-		pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+							  ctx->pwq_tbl[node]);
 
 	/* @dfl_pwq might not have been used, ensure it's linked */
-	link_pwq(dfl_pwq);
-	swap(wq->dfl_pwq, dfl_pwq);
+	link_pwq(ctx->dfl_pwq);
+	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
-	mutex_unlock(&wq->mutex);
+	mutex_unlock(&ctx->wq->mutex);
+}
 
-	/* put the old pwqs */
-	for_each_node(node)
-		put_pwq_unlocked(pwq_tbl[node]);
-	put_pwq_unlocked(dfl_pwq);
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+			  const struct workqueue_attrs *attrs)
+{
+	struct wq_unbound_install_ctx *ctx;
+	int ret = -ENOMEM;
 
-	put_online_cpus();
-	ret = 0;
-	/* fall through */
-out_free:
-	free_workqueue_attrs(tmp_attrs);
-	free_workqueue_attrs(new_attrs);
-	kfree(pwq_tbl);
-	return ret;
+	/* only unbound workqueues can change attributes */
+	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+		return -EINVAL;
 
-enomem_pwq:
-	free_unbound_pwq(dfl_pwq);
-	for_each_node(node)
-		if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-			free_unbound_pwq(pwq_tbl[node]);
+	/* creating multiple pwqs breaks ordering guarantee */
+	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+		return -EINVAL;
+
+	/*
+	 * CPUs should stay stable across pwq creations and installations.
+	 * Pin CPUs, determine the target cpumask for each node and create
+	 * pwqs accordingly.
+	 */
+	get_online_cpus();
+
+	mutex_lock(&wq_pool_mutex);
+	ctx = wq_unbound_install_ctx_prepare(wq, attrs);
 	mutex_unlock(&wq_pool_mutex);
+
 	put_online_cpus();
-enomem:
-	ret = -ENOMEM;
-	goto out_free;
+
+	/* the ctx has been prepared successfully, let's commit it */
+	if (ctx) {
+		wq_unbound_install_ctx_commit(ctx);
+		ret = 0;
+	}
+
+	wq_unbound_install_ctx_free(ctx);
+
+	return ret;
 }
 
 /**
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
  2015-03-12  5:00 ` [PATCH 1/4] workqueue: Reorder sysfs code Lai Jiangshan
  2015-03-12  5:00 ` [PATCH 2/4] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
@ 2015-03-12  5:00 ` Lai Jiangshan
  2015-03-12 17:33   ` Christoph Lameter
  2015-03-13 23:49   ` Kevin Hilman
  2015-03-12  5:00 ` [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-12  5:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

Create a cpumask that limits the affinity of all unbound workqueues.
This cpumask is controlled through a file at the root of the workqueue
sysfs directory.

It works at a lower level than the per-workqueue WQ_SYSFS cpumask files,
such that the effective cpumask applied for a given unbound workqueue is
the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
the new /sys/devices/virtual/workqueue/cpumask_unbounds file.

This patch implements the basic infrastructure and the read interface.
cpumask_unbounds is initially set to cpu_possible_mask.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 957b244..61b5bfa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+static cpumask_var_t wq_unbound_cpumask;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -3529,7 +3531,7 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3959,9 +3961,29 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static ssize_t unbounds_cpumask_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	int written;
+
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq_unbound_cpumask));
+
+	return written;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+	__ATTR(cpumask, 0444, unbounds_cpumask_show, NULL);
+
 static int __init wq_sysfs_init(void)
 {
-	return subsys_virtual_register(&wq_subsys, NULL);
+	int err;
+
+	err = subsys_virtual_register(&wq_subsys, NULL);
+	if (err)
+		return err;
+
+	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
 }
 core_initcall(wq_sysfs_init);
 
@@ -5094,6 +5116,9 @@ static int __init init_workqueues(void)
 
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
+	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
                   ` (2 preceding siblings ...)
  2015-03-12  5:00 ` [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
@ 2015-03-12  5:00 ` Lai Jiangshan
  2015-03-12 17:42   ` Christoph Lameter
  2015-03-13  7:49   ` Lai Jiangshan
  2015-03-12 17:49 ` [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Frederic Weisbecker
                   ` (2 subsequent siblings)
  6 siblings, 2 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-12  5:00 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Allow modifying the low-level unbound workqueues cpumask through
sysfs. This is performed by traversing the entire workqueue list
and calling wq_unbound_install_ctx_prepare() on the unbound workqueues
with the low level mask passed in. Only after all the preparations are done,
we commit them all together.

The ordered-workqueue is excluded from the low level unbound workqueue cpumask;
it will be handled in the near future.

The per-nodes' pwqs are mandatorily controlled by the low level cpumask, while
the default pwq ignores the low level cpumask when (and ONLY when) the cpumask set
by the user doesn't overlap with the low level cpumask. In this case, we can't
apply the empty cpumask to the default pwq, so we use the user-set cpumask
directly.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Original-patch-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 88 insertions(+), 8 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61b5bfa..facaaae 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask;
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -3491,6 +3491,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 struct wq_unbound_install_ctx {
 	struct workqueue_struct	*wq;	/* target to be installed */
 	struct workqueue_attrs	*attrs;	/* attrs for installing */
+	struct list_head	list;	/* queued for batching commit */
 	struct pool_workqueue	*dfl_pwq;
 	struct pool_workqueue	*pwq_tbl[];
 };
@@ -3513,10 +3514,11 @@ static void wq_unbound_install_ctx_free(struct wq_unbound_install_ctx *ctx)
 
 static struct wq_unbound_install_ctx *
 wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
-			       const struct workqueue_attrs *attrs)
+			       const struct workqueue_attrs *attrs,
+			       cpumask_var_t unbound_cpumask)
 {
 	struct wq_unbound_install_ctx *ctx;
-	struct workqueue_attrs *new_attrs, *tmp_attrs;
+	struct workqueue_attrs *new_attrs, *pwq_attrs, *tmp_attrs;
 	int node;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -3525,13 +3527,16 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 		      GFP_KERNEL);
 
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+	pwq_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	if (!ctx || !new_attrs || !tmp_attrs)
 		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+	copy_workqueue_attrs(pwq_attrs, attrs);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3544,13 +3549,21 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
+	 *
+	 * If the cpumask set by the user doesn't overlap with the global
+	 * wq_unbound_cpumask, we ignore the wq_unbound_cpumask for this wq
+	 * which means all its nodes' pwqs are its default pwq and its default
+	 * pwq's workers' cpumask is totally equals to the user setting.
 	 */
-	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (cpumask_empty(pwq_attrs->cpumask))
+		ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	else
+		ctx->dfl_pwq = alloc_unbound_pwq(wq, pwq_attrs);
 	if (!ctx->dfl_pwq)
 		goto out_free;
 
 	for_each_node(node) {
-		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+		if (wq_calc_node_cpumask(pwq_attrs, node, -1, tmp_attrs->cpumask)) {
 			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
 			if (!ctx->pwq_tbl[node])
 				goto out_free;
@@ -3564,6 +3577,7 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 	ctx->attrs = new_attrs;
 
 out_free:
+	free_workqueue_attrs(pwq_attrs);
 	free_workqueue_attrs(tmp_attrs);
 
 	if (!ctx || !ctx->wq) {
@@ -3634,7 +3648,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	get_online_cpus();
 
 	mutex_lock(&wq_pool_mutex);
-	ctx = wq_unbound_install_ctx_prepare(wq, attrs);
+	ctx = wq_unbound_install_ctx_prepare(wq, attrs, wq_unbound_cpumask);
 	mutex_unlock(&wq_pool_mutex);
 
 	put_online_cpus();
@@ -3961,19 +3975,85 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static int unbounds_cpumask_apply(cpumask_var_t cpumask)
+{
+	LIST_HEAD(ctxs);
+	int ret = 0;
+	struct workqueue_struct *wq;
+	struct wq_unbound_install_ctx *ctx, *n;
+
+	lockdep_assert_held(&wq_pool_mutex);
+
+	list_for_each_entry(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_UNBOUND))
+			continue;
+		/* creating multiple pwqs breaks ordering guarantee */
+		if (wq->flags & __WQ_ORDERED)
+			continue;
+
+		ctx = wq_unbound_install_ctx_prepare(wq, wq->unbound_attrs,
+						     cpumask);
+		if (!ctx) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		list_add_tail(&ctx->list, &ctxs);
+	}
+
+	list_for_each_entry_safe(ctx, n, &ctxs, list) {
+		if (ret >= 0)
+			wq_unbound_install_ctx_commit(ctx);
+		wq_unbound_install_ctx_free(ctx);
+	}
+
+	return ret;
+}
+
+static ssize_t unbounds_cpumask_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret = -EINVAL;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (ret)
+		goto out;
+
+	get_online_cpus();
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (cpumask_intersects(cpumask, cpu_online_mask)) {
+		mutex_lock(&wq_pool_mutex);
+		ret = unbounds_cpumask_apply(cpumask);
+		if (ret >= 0)
+			cpumask_copy(wq_unbound_cpumask, cpumask);
+		mutex_unlock(&wq_pool_mutex);
+	}
+	put_online_cpus();
+out:
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
 static ssize_t unbounds_cpumask_show(struct device *dev,
 				     struct device_attribute *attr, char *buf)
 {
 	int written;
 
+	mutex_lock(&wq_pool_mutex);
 	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
 			    cpumask_pr_args(wq_unbound_cpumask));
+	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
 static struct device_attribute wq_sysfs_cpumask_attr =
-	__ATTR(cpumask, 0444, unbounds_cpumask_show, NULL);
+	__ATTR(cpumask, 0644, unbounds_cpumask_show, unbounds_cpumask_store);
 
 static int __init wq_sysfs_init(void)
 {
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-12  5:00 ` [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
@ 2015-03-12 17:33   ` Christoph Lameter
  2015-03-13 23:49   ` Kevin Hilman
  1 sibling, 0 replies; 48+ messages in thread
From: Christoph Lameter @ 2015-03-12 17:33 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Frederic Weisbecker, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar

On Thu, 12 Mar 2015, Lai Jiangshan wrote:

> This patch implements the basic infrastructure and the read interface.
> cpumask_unbounds is initially set to cpu_possible_mask.

Reviewed-by: Christoph Lameter <cl@linux.com>

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-12  5:00 ` [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-03-12 17:42   ` Christoph Lameter
  2015-03-13  1:38     ` Lai Jiangshan
  2015-03-13  7:49   ` Lai Jiangshan
  1 sibling, 1 reply; 48+ messages in thread
From: Christoph Lameter @ 2015-03-12 17:42 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Kevin Hilman, Mike Galbraith, Paul E. McKenney,
	Tejun Heo, Viresh Kumar, Frederic Weisbecker

On Thu, 12 Mar 2015, Lai Jiangshan wrote:

> The per-nodes' pwqs are mandatorily controlled by the low level cpumask, while
> the default pwq ignores the low level cpumask when (and ONLY when) the cpumask set
> by the user doesn't overlap with the low level cpumask. In this case, we can't
> apply the empty cpumask to the default pwq, so we use the user-set cpumask
> directly.

I am wondering now why we have two cpumasks? A script can just iterate
through the work queues if we want to set them all right? Then we do not
have to deal with the conflict between the settings in the kernel.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
                   ` (3 preceding siblings ...)
  2015-03-12  5:00 ` [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-03-12 17:49 ` Frederic Weisbecker
  2015-03-13  6:02 ` Mike Galbraith
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
  6 siblings, 0 replies; 48+ messages in thread
From: Frederic Weisbecker @ 2015-03-12 17:49 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar

On Thu, Mar 12, 2015 at 01:00:08PM +0800, Lai Jiangshan wrote:
> This patchset mostly copies from Frederic and split the apply_workqueue_attrs()
> as TJ's suggest.
> 
> This patchset doesn't include the patch "workqueue: Allow changing attributions
> of ordered workqueues", I hope to reduce the review processing. The handling
> for the ordered workqueue will be reposted after this patchset is accepted.

Ah thanks a lot for taking over this patchset. We strongly need this feature :-)

Thanks.

> Thanks,
> Lai
> 
> Frederic Weisbecker (2):
>   workqueue: Reorder sysfs code
>   workqueue: Create low-level unbound workqueues cpumask
> 
> Lai Jiangshan (2):
>   workqueue: split apply_workqueue_attrs() into 3 stages
>   workqueue: Allow modifying low level unbound workqueue cpumask
> 
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Kevin Hilman <khilman@linaro.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Cc: Mike Galbraith <bitbucket@online.de>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Cc: Frederic Weisbecker <fweisbec@gmail.com>
> 
>  kernel/workqueue.c | 942 ++++++++++++++++++++++++++++++-----------------------
>  1 file changed, 539 insertions(+), 403 deletions(-)
> 
> -- 
> 2.1.0
> 

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-12 17:42   ` Christoph Lameter
@ 2015-03-13  1:38     ` Lai Jiangshan
  0 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-13  1:38 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, Kevin Hilman, Mike Galbraith, Paul E. McKenney,
	Tejun Heo, Viresh Kumar, Frederic Weisbecker

On 03/13/2015 01:42 AM, Christoph Lameter wrote:
> On Thu, 12 Mar 2015, Lai Jiangshan wrote:
> 
>> The per-nodes' pwqs are mandatorily controlled by the low level cpumask, while
>> the default pwq ignores the low level cpumask when (and ONLY when) the cpumask set
>> by the user doesn't overlap with the low level cpumask. In this case, we can't
>> apply the empty cpumask to the default pwq, so we use the user-set cpumask
>> directly.
> 
> I am wondering now why we have two cpumasks? 

What's your meaning? which two cpumask?

The per-nodes' pwqs' cpumask and the default pwq's cpumask?
They refer to different pool, so they have different cpumask.

1)
If the per-nodes' pwqs exist, they were controlled by A
(A = user-set-cpumask & possible-cpus-of-the-node). Now after this patch,
they are controlled by B (B = A & the-low-level-cpumak).

if A is empty or B is empty, we used default pwq for the node.

2)
The default pwq is different, it was controlled by C (C = user-set-cpumask),
and after this patch, it will be controlled by D
(D = user-set-cpumask(C) & the-low-level-cpumask), But D may be empty,
we can't have a default pwq with empty cpumask, so we have
use C instead in this case.




> A script can just iterate
> through the work queues if we want to set them all right? Then we do not
> have to deal with the conflict between the settings in the kernel.

wq->unbound_attrs->cpumask and the-low-level-cpumask can be set by users.
they may be set different or even non-intersect, the non-intersect case
is not real conflict, but it is possible, we have to handle it.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
                   ` (4 preceding siblings ...)
  2015-03-12 17:49 ` [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Frederic Weisbecker
@ 2015-03-13  6:02 ` Mike Galbraith
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
  6 siblings, 0 replies; 48+ messages in thread
From: Mike Galbraith @ 2015-03-13  6:02 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Paul E. McKenney,
	Tejun Heo, Viresh Kumar, Frederic Weisbecker

On Thu, 2015-03-12 at 13:00 +0800, Lai Jiangshan wrote:
> This patchset mostly copies from Frederic and split the apply_workqueue_attrs()
> as TJ's suggest.

Cool.  I'll take these out for a (-rt) test-drive soonish.

	-Mike


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-12  5:00 ` [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
  2015-03-12 17:42   ` Christoph Lameter
@ 2015-03-13  7:49   ` Lai Jiangshan
  1 sibling, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-13  7:49 UTC (permalink / raw)
  To: linux-kernel
  Cc: Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

On 03/12/2015 01:00 PM, Lai Jiangshan wrote:
> Allow to modify the low-level unbound workqueues cpumask through
> sysfs. This is performed by traversing the entire workqueue list
> and calling wq_unbound_install_ctx_prepare() on the unbound workqueues
> with the low level mask passed in. Only after all the preparation are done,
> we commit them all together.
> 
> The ordered-workqueue is excluded from the low level unbound workqueue cpumask,
> it will be handled in the near future.
> 
> The per-nodes' pwqs are mandatorily controlled by the low level cpumask, while
> the default pwq ignores the low level cpumask when (and ONLY when) the cpumask set
> by the user doesn't overlap with the low level cpumask. In this case, we can't
> apply the empty cpumask to the default pwq, so we use the user-set cpumask
> directly.
> 
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Kevin Hilman <khilman@linaro.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Cc: Mike Galbraith <bitbucket@online.de>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Cc: Frederic Weisbecker <fweisbec@gmail.com>
> Original-patch-by: Frederic Weisbecker <fweisbec@gmail.com>
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>

miss a part in wq_update_unbound_numa()

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index facaaae..4027ec9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3722,6 +3722,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	 * wq's, the default pwq should be used.
 	 */
 	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+		cpumask_and(cpumask, cpumask, wq_unbound_cpumask);
+		if (cpumask_empty(cpumask))
+			goto use_dfl_pwq;
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
 			goto out_unlock;
 	} else {

> ---
>  kernel/workqueue.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 88 insertions(+), 8 deletions(-)
> 
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 61b5bfa..facaaae 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
>  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
>  static bool workqueue_freezing;		/* PL: have wqs started freezing? */
>  
> -static cpumask_var_t wq_unbound_cpumask;
> +static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
>  
>  /* the per-cpu worker pools */
>  static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
> @@ -3491,6 +3491,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
>  struct wq_unbound_install_ctx {
>  	struct workqueue_struct	*wq;	/* target to be installed */
>  	struct workqueue_attrs	*attrs;	/* attrs for installing */
> +	struct list_head	list;	/* queued for batching commit */
>  	struct pool_workqueue	*dfl_pwq;
>  	struct pool_workqueue	*pwq_tbl[];
>  };
> @@ -3513,10 +3514,11 @@ static void wq_unbound_install_ctx_free(struct wq_unbound_install_ctx *ctx)
>  
>  static struct wq_unbound_install_ctx *
>  wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
> -			       const struct workqueue_attrs *attrs)
> +			       const struct workqueue_attrs *attrs,
> +			       cpumask_var_t unbound_cpumask)
>  {
>  	struct wq_unbound_install_ctx *ctx;
> -	struct workqueue_attrs *new_attrs, *tmp_attrs;
> +	struct workqueue_attrs *new_attrs, *pwq_attrs, *tmp_attrs;
>  	int node;
>  
>  	lockdep_assert_held(&wq_pool_mutex);
> @@ -3525,13 +3527,16 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
>  		      GFP_KERNEL);
>  
>  	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
> +	pwq_attrs = alloc_workqueue_attrs(GFP_KERNEL);
>  	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
>  	if (!ctx || !new_attrs || !tmp_attrs)
>  		goto out_free;
>  
>  	/* make a copy of @attrs and sanitize it */
>  	copy_workqueue_attrs(new_attrs, attrs);
> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
> +	copy_workqueue_attrs(pwq_attrs, attrs);
> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
>  
>  	/*
>  	 * We may create multiple pwqs with differing cpumasks.  Make a
> @@ -3544,13 +3549,21 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
>  	 * If something goes wrong during CPU up/down, we'll fall back to
>  	 * the default pwq covering whole @attrs->cpumask.  Always create
>  	 * it even if we don't use it immediately.
> +	 *
> +	 * If the cpumask set by the user doesn't overlap with the global
> +	 * wq_unbound_cpumask, we ignore the wq_unbound_cpumask for this wq
> +	 * which means all its nodes' pwqs are its default pwq and its default
> +	 * pwq's workers' cpumask is totally equals to the user setting.
>  	 */
> -	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
> +	if (cpumask_empty(pwq_attrs->cpumask))
> +		ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
> +	else
> +		ctx->dfl_pwq = alloc_unbound_pwq(wq, pwq_attrs);
>  	if (!ctx->dfl_pwq)
>  		goto out_free;
>  
>  	for_each_node(node) {
> -		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
> +		if (wq_calc_node_cpumask(pwq_attrs, node, -1, tmp_attrs->cpumask)) {
>  			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
>  			if (!ctx->pwq_tbl[node])
>  				goto out_free;
> @@ -3564,6 +3577,7 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
>  	ctx->attrs = new_attrs;
>  
>  out_free:
> +	free_workqueue_attrs(pwq_attrs);
>  	free_workqueue_attrs(tmp_attrs);
>  
>  	if (!ctx || !ctx->wq) {
> @@ -3634,7 +3648,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
>  	get_online_cpus();
>  
>  	mutex_lock(&wq_pool_mutex);
> -	ctx = wq_unbound_install_ctx_prepare(wq, attrs);
> +	ctx = wq_unbound_install_ctx_prepare(wq, attrs, wq_unbound_cpumask);
>  	mutex_unlock(&wq_pool_mutex);
>  
>  	put_online_cpus();
> @@ -3961,19 +3975,85 @@ static struct bus_type wq_subsys = {
>  	.dev_groups			= wq_sysfs_groups,
>  };
>  
> +static int unbounds_cpumask_apply(cpumask_var_t cpumask)
> +{
> +	LIST_HEAD(ctxs);
> +	int ret = 0;
> +	struct workqueue_struct *wq;
> +	struct wq_unbound_install_ctx *ctx, *n;
> +
> +	lockdep_assert_held(&wq_pool_mutex);
> +
> +	list_for_each_entry(wq, &workqueues, list) {
> +		if (!(wq->flags & WQ_UNBOUND))
> +			continue;
> +		/* creating multiple pwqs breaks ordering guarantee */
> +		if (wq->flags & __WQ_ORDERED)
> +			continue;
> +
> +		ctx = wq_unbound_install_ctx_prepare(wq, wq->unbound_attrs,
> +						     cpumask);
> +		if (!ctx) {
> +			ret = -ENOMEM;
> +			break;
> +		}
> +
> +		list_add_tail(&ctx->list, &ctxs);
> +	}
> +
> +	list_for_each_entry_safe(ctx, n, &ctxs, list) {
> +		if (ret >= 0)
> +			wq_unbound_install_ctx_commit(ctx);
> +		wq_unbound_install_ctx_free(ctx);
> +	}
> +
> +	return ret;
> +}
> +
> +static ssize_t unbounds_cpumask_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t count)
> +{
> +	cpumask_var_t cpumask;
> +	int ret = -EINVAL;
> +
> +	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	ret = cpumask_parse(buf, cpumask);
> +	if (ret)
> +		goto out;
> +
> +	get_online_cpus();
> +	cpumask_and(cpumask, cpumask, cpu_possible_mask);
> +	if (cpumask_intersects(cpumask, cpu_online_mask)) {
> +		mutex_lock(&wq_pool_mutex);
> +		ret = unbounds_cpumask_apply(cpumask);
> +		if (ret >= 0)
> +			cpumask_copy(wq_unbound_cpumask, cpumask);
> +		mutex_unlock(&wq_pool_mutex);
> +	}
> +	put_online_cpus();
> +out:
> +	free_cpumask_var(cpumask);
> +	return ret ? ret : count;
> +}
> +
>  static ssize_t unbounds_cpumask_show(struct device *dev,
>  				     struct device_attribute *attr, char *buf)
>  {
>  	int written;
>  
> +	mutex_lock(&wq_pool_mutex);
>  	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
>  			    cpumask_pr_args(wq_unbound_cpumask));
> +	mutex_unlock(&wq_pool_mutex);
>  
>  	return written;
>  }
>  
>  static struct device_attribute wq_sysfs_cpumask_attr =
> -	__ATTR(cpumask, 0444, unbounds_cpumask_show, NULL);
> +	__ATTR(cpumask, 0644, unbounds_cpumask_show, unbounds_cpumask_store);
>  
>  static int __init wq_sysfs_init(void)
>  {
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-12  5:00 ` [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
  2015-03-12 17:33   ` Christoph Lameter
@ 2015-03-13 23:49   ` Kevin Hilman
  2015-03-14  0:52     ` Kevin Hilman
  2015-03-14  7:52     ` Lai Jiangshan
  1 sibling, 2 replies; 48+ messages in thread
From: Kevin Hilman @ 2015-03-13 23:49 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Frederic Weisbecker, Christoph Lameter,
	Mike Galbraith, Paul E. McKenney, Tejun Heo, Viresh Kumar

Lai Jiangshan <laijs@cn.fujitsu.com> writes:

> From: Frederic Weisbecker <fweisbec@gmail.com>
>
> Create a cpumask that limits the affinity of all unbound workqueues.
> This cpumask is controlled through a file at the root of the workqueue
> sysfs directory.
>
> It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
> such that the effective cpumask applied for a given unbound workqueue is
> the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
> the new /sys/devices/virtual/workqueue/cpumask_unbounds file.
>
> This patch implements the basic infrastructure and the read interface.
> cpumask_unbounds is initially set to cpu_possible_mask.
>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Kevin Hilman <khilman@linaro.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Cc: Mike Galbraith <bitbucket@online.de>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>

[...]

> @@ -5094,6 +5116,9 @@ static int __init init_workqueues(void)
>  
>  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
>  
> +	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
> +	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
> +

As I mentioned in an earlier discussion[1], I still think this could
default to the housekeeping CPUs in the NO_HZ_FULL case:

#ifdef CONFIG_NO_HZ_FULL
	cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);
#else
	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
#endif

But that could also be left to a future optimization as well.

Kevin

[1] https://lkml.org/lkml/2014/2/14/666

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-13 23:49   ` Kevin Hilman
@ 2015-03-14  0:52     ` Kevin Hilman
  2015-03-14  7:52     ` Lai Jiangshan
  1 sibling, 0 replies; 48+ messages in thread
From: Kevin Hilman @ 2015-03-14  0:52 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Frederic Weisbecker, Christoph Lameter,
	Mike Galbraith, Paul E. McKenney, Tejun Heo, Viresh Kumar

Kevin Hilman <khilman@kernel.org> writes:

> Lai Jiangshan <laijs@cn.fujitsu.com> writes:
>
>> From: Frederic Weisbecker <fweisbec@gmail.com>
>>
>> Create a cpumask that limits the affinity of all unbound workqueues.
>> This cpumask is controlled through a file at the root of the workqueue
>> sysfs directory.
>>
>> It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
>> such that the effective cpumask applied for a given unbound workqueue is
>> the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
>> the new /sys/devices/virtual/workqueue/cpumask_unbounds file.
>>
>> This patch implements the basic infrastructure and the read interface.
>> cpumask_unbounds is initially set to cpu_possible_mask.
>>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Kevin Hilman <khilman@linaro.org>
>> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
>> Cc: Mike Galbraith <bitbucket@online.de>
>> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: Viresh Kumar <viresh.kumar@linaro.org>
>> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>
> [...]
>
>> @@ -5094,6 +5116,9 @@ static int __init init_workqueues(void)
>>  
>>  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
>>  
>> +	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
>> +	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
>> +
>
> As I mentioned in an earlier discussion[1], I still think this could
> default to the housekeeping CPUs in the NO_HZ_FULL case:
>
> #ifdef CONFIG_NO_HZ_FULL
> 	cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);
> #else
> 	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
> #endif
>
> But that could also be left to a future optimization as well.

I meant to add:

Acked-by: Kevin Hilman <khilman@linaro.org>

as well, as the NO_HZ_FULL bit could be added as an add-on patch.

Kevin

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-13 23:49   ` Kevin Hilman
  2015-03-14  0:52     ` Kevin Hilman
@ 2015-03-14  7:52     ` Lai Jiangshan
  2015-03-16 17:12       ` Kevin Hilman
  1 sibling, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-14  7:52 UTC (permalink / raw)
  To: Kevin Hilman
  Cc: linux-kernel, Frederic Weisbecker, Christoph Lameter,
	Mike Galbraith, Paul E. McKenney, Tejun Heo, Viresh Kumar

On 03/14/2015 07:49 AM, Kevin Hilman wrote:
> Lai Jiangshan <laijs@cn.fujitsu.com> writes:
> 
>> From: Frederic Weisbecker <fweisbec@gmail.com>
>>
>> Create a cpumask that limit the affinity of all unbound workqueues.
>> This cpumask is controlled though a file at the root of the workqueue
>> sysfs directory.
>>
>> It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
>> such that the effective cpumask applied for a given unbound workqueue is
>> the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
>> the new /sys/devices/virtual/workqueue/cpumask_unbounds file.
>>
>> This patch implements the basic infrastructure and the read interface.
>> cpumask_unbounds is initially set to cpu_possible_mask.
>>
>> Cc: Christoph Lameter <cl@linux.com>
>> Cc: Kevin Hilman <khilman@linaro.org>
>> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
>> Cc: Mike Galbraith <bitbucket@online.de>
>> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>> Cc: Tejun Heo <tj@kernel.org>
>> Cc: Viresh Kumar <viresh.kumar@linaro.org>
>> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> 
> [...]
> 
>> @@ -5094,6 +5116,9 @@ static int __init init_workqueues(void)
>>  
>>  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
>>  
>> +	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
>> +	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
>> +
> 
> As I mentioned in an earlier discussion[1], I still think this could
> default too the housekeeping CPUs in the NO_HZ_FULL case:
> 
> #ifdef CONFIG_NO_HZ_FULL
> 	cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);


No, the default/booted wq_unbound_cpumask should be cpu_possible_mask.

> #else
> 	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
> #endif
> 
> But that could also be left to a future optimization as well.
> 
> Kevin
> 
> [1] https://lkml.org/lkml/2014/2/14/666
> .
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-14  7:52     ` Lai Jiangshan
@ 2015-03-16 17:12       ` Kevin Hilman
  2015-03-16 17:25         ` Frederic Weisbecker
  0 siblings, 1 reply; 48+ messages in thread
From: Kevin Hilman @ 2015-03-16 17:12 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Frederic Weisbecker, Christoph Lameter,
	Mike Galbraith, Paul E. McKenney, Tejun Heo, Viresh Kumar

Lai Jiangshan <laijs@cn.fujitsu.com> writes:

> On 03/14/2015 07:49 AM, Kevin Hilman wrote:
>> Lai Jiangshan <laijs@cn.fujitsu.com> writes:
>> 
>>> From: Frederic Weisbecker <fweisbec@gmail.com>
>>>
>>> Create a cpumask that limit the affinity of all unbound workqueues.
>>> This cpumask is controlled though a file at the root of the workqueue
>>> sysfs directory.
>>>
>>> It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
>>> such that the effective cpumask applied for a given unbound workqueue is
>>> the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
>>> the new /sys/devices/virtual/workqueue/cpumask_unbounds file.
>>>
>>> This patch implements the basic infrastructure and the read interface.
>>> cpumask_unbounds is initially set to cpu_possible_mask.
>>>
>>> Cc: Christoph Lameter <cl@linux.com>
>>> Cc: Kevin Hilman <khilman@linaro.org>
>>> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
>>> Cc: Mike Galbraith <bitbucket@online.de>
>>> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>>> Cc: Tejun Heo <tj@kernel.org>
>>> Cc: Viresh Kumar <viresh.kumar@linaro.org>
>>> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
>>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
>> 
>> [...]
>> 
>>> @@ -5094,6 +5116,9 @@ static int __init init_workqueues(void)
>>>  
>>>  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
>>>  
>>> +	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
>>> +	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
>>> +
>> 
>> As I mentioned in an earlier discussion[1], I still think this could
>> default too the housekeeping CPUs in the NO_HZ_FULL case:
>> 
>> #ifdef CONFIG_NO_HZ_FULL
>> 	cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);
>
>
> No, the default/booted wq_unbound_cpumask should be cpu_possible_mask.
>

Even for NO_HZ_FULL?  

IMO, for NO_HZ_FULL, we want the unbound workqueues to be on the
housekeeping CPU(s).

Kevin

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-16 17:12       ` Kevin Hilman
@ 2015-03-16 17:25         ` Frederic Weisbecker
  2015-03-16 19:38           ` Kevin Hilman
  0 siblings, 1 reply; 48+ messages in thread
From: Frederic Weisbecker @ 2015-03-16 17:25 UTC (permalink / raw)
  To: Kevin Hilman
  Cc: Lai Jiangshan, linux-kernel, Christoph Lameter, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar

On Mon, Mar 16, 2015 at 10:12:12AM -0700, Kevin Hilman wrote:
> Lai Jiangshan <laijs@cn.fujitsu.com> writes:
> 
> > On 03/14/2015 07:49 AM, Kevin Hilman wrote:
> >> Lai Jiangshan <laijs@cn.fujitsu.com> writes:
> >> 
> >>> From: Frederic Weisbecker <fweisbec@gmail.com>
> >>>
> >>> Create a cpumask that limit the affinity of all unbound workqueues.
> >>> This cpumask is controlled though a file at the root of the workqueue
> >>> sysfs directory.
> >>>
> >>> It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
> >>> such that the effective cpumask applied for a given unbound workqueue is
> >>> the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
> >>> the new /sys/devices/virtual/workqueue/cpumask_unbounds file.
> >>>
> >>> This patch implements the basic infrastructure and the read interface.
> >>> cpumask_unbounds is initially set to cpu_possible_mask.
> >>>
> >>> Cc: Christoph Lameter <cl@linux.com>
> >>> Cc: Kevin Hilman <khilman@linaro.org>
> >>> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> >>> Cc: Mike Galbraith <bitbucket@online.de>
> >>> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> >>> Cc: Tejun Heo <tj@kernel.org>
> >>> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> >>> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> >>> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
> >> 
> >> [...]
> >> 
> >>> @@ -5094,6 +5116,9 @@ static int __init init_workqueues(void)
> >>>  
> >>>  	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
> >>>  
> >>> +	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
> >>> +	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
> >>> +
> >> 
> >> As I mentioned in an earlier discussion[1], I still think this could
> >> default too the housekeeping CPUs in the NO_HZ_FULL case:
> >> 
> >> #ifdef CONFIG_NO_HZ_FULL
> >> 	cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);
> >
> >
> > No, the default/booted wq_unbound_cpumask should be cpu_possible_mask.
> >
> 
> Even for NO_HZ_FULL?  
> 
> IMO, for NO_HZ_FULL, we want the unbound workqueues to be on the
> housekeeping CPU(s).

If it should be the default on NO_HZ_FULL, maybe we should do this from the
tick nohz code. Some late or fs initcall that will do the workqueue affinity,
timer affinity, etc...

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask
  2015-03-16 17:25         ` Frederic Weisbecker
@ 2015-03-16 19:38           ` Kevin Hilman
  0 siblings, 0 replies; 48+ messages in thread
From: Kevin Hilman @ 2015-03-16 19:38 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Lai Jiangshan, linux-kernel, Christoph Lameter, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar

Frederic Weisbecker <fweisbec@gmail.com> writes:

> On Mon, Mar 16, 2015 at 10:12:12AM -0700, Kevin Hilman wrote:
>> Lai Jiangshan <laijs@cn.fujitsu.com> writes:
>> 
>> > On 03/14/2015 07:49 AM, Kevin Hilman wrote:

[...]

>> >> 
>> >> As I mentioned in an earlier discussion[1], I still think this could
>> >> default too the housekeeping CPUs in the NO_HZ_FULL case:
>> >> 
>> >> #ifdef CONFIG_NO_HZ_FULL
>> >> 	cpumask_complement(wq_unbound_cpumask, tick_nohz_full_mask);
>> >
>> >
>> > No, the default/booted wq_unbound_cpumask should be cpu_possible_mask.
>> >
>> 
>> Even for NO_HZ_FULL?  
>> 
>> IMO, for NO_HZ_FULL, we want the unbound workqueues to be on the
>> housekeeping CPU(s).
>
> If it should be the default on NO_HZ_FULL, maybe we should do this from the
> tick nohz code. Some late or fs initcall that will do the workqueue affinity,
> timer affinity, etc...

Sure, I'd be fine with that too.

Kevin

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5
  2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
                   ` (5 preceding siblings ...)
  2015-03-13  6:02 ` Mike Galbraith
@ 2015-03-18  4:40 ` Lai Jiangshan
  2015-03-18  4:40   ` [PATCH 1/4 V5] workqueue: Reorder sysfs code Lai Jiangshan
                     ` (5 more replies)
  6 siblings, 6 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-18  4:40 UTC (permalink / raw)
  To: linux-kernel; +Cc: Lai Jiangshan

This patchset mostly copies from Frederic and split the apply_workqueue_attrs()
as TJ's suggest.

This patchset still doesn't include the patch "workqueue: Allow changing attributions
of ordered workqueues"; I hope this reduces the review burden. The handling
for ordered workqueues will be re-proposed after this patchset is accepted.

Changed from V4
Add the workqueue_unbounds_cpumask_set() kernel API and minimally restructure patch 4.


Thanks,
Lai

Frederic Weisbecker (2):
  workqueue: Reorder sysfs code
  workqueue: Create low-level unbound workqueues cpumask

Lai Jiangshan (2):
  workqueue: split apply_workqueue_attrs() into 3 stages
  workqueue: Allow modifying low level unbound workqueue cpumask

 include/linux/workqueue.h |   1 +
 kernel/workqueue.c        | 971 +++++++++++++++++++++++++++-------------------
 2 files changed, 569 insertions(+), 403 deletions(-)

-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/4 V5] workqueue: Reorder sysfs code
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
@ 2015-03-18  4:40   ` Lai Jiangshan
  2015-03-24 15:41     ` Tejun Heo
  2015-03-18  4:40   ` [PATCH 2/4 V5] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
                     ` (4 subsequent siblings)
  5 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-18  4:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

The sysfs code usually belongs at the bottom of the file since it deals
with high-level objects. In the workqueue code it's misplaced, such
that we would need to work around forward function references to allow the
sysfs code to call APIs like apply_workqueue_attrs().

Let's move that block further down in the file, right above
alloc_workqueue_key() which references it.

Suggested-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 634 ++++++++++++++++++++++++++---------------------------
 1 file changed, 317 insertions(+), 317 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ca0b1d..c6845fa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3001,323 +3001,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 }
 EXPORT_SYMBOL_GPL(execute_in_process_context);
 
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
- * following attributes.
- *
- *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
- *  max_active	RW int	: maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- *  id		RO int	: the associated pool ID
- *  nice	RW int	: nice value of the workers
- *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
- */
-struct wq_device {
-	struct workqueue_struct		*wq;
-	struct device			dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
-	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-	return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
-				struct device_attribute *attr, const char *buf,
-				size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int val;
-
-	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
-		return -EINVAL;
-
-	workqueue_set_max_active(wq, val);
-	return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
-	&dev_attr_per_cpu.attr,
-	&dev_attr_max_active.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	const char *delim = "";
-	int node, written = 0;
-
-	rcu_read_lock_sched();
-	for_each_node(node) {
-		written += scnprintf(buf + written, PAGE_SIZE - written,
-				     "%s%d:%d", delim, node,
-				     unbound_pwq_by_node(wq, node)->pool->id);
-		delim = " ";
-	}
-	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-	rcu_read_unlock_sched();
-
-	return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
-	mutex_unlock(&wq->mutex);
-
-	return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
-	struct workqueue_attrs *attrs;
-
-	attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!attrs)
-		return NULL;
-
-	mutex_lock(&wq->mutex);
-	copy_workqueue_attrs(attrs, wq->unbound_attrs);
-	mutex_unlock(&wq->mutex);
-	return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
-	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
-		ret = apply_workqueue_attrs(wq, attrs);
-	else
-		ret = -EINVAL;
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			    cpumask_pr_args(wq->unbound_attrs->cpumask));
-	mutex_unlock(&wq->mutex);
-	return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	ret = cpumask_parse(buf, attrs->cpumask);
-	if (!ret)
-		ret = apply_workqueue_attrs(wq, attrs);
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%d\n",
-			    !wq->unbound_attrs->no_numa);
-	mutex_unlock(&wq->mutex);
-
-	return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int v, ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	ret = -EINVAL;
-	if (sscanf(buf, "%d", &v) == 1) {
-		attrs->no_numa = !v;
-		ret = apply_workqueue_attrs(wq, attrs);
-	}
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
-	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
-	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
-	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
-	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
-	__ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
-	.name				= "workqueue",
-	.dev_groups			= wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
-	return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
-	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-	kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
-	struct wq_device *wq_dev;
-	int ret;
-
-	/*
-	 * Adjusting max_active or creating new pwqs by applyting
-	 * attributes breaks ordering guarantee.  Disallow exposing ordered
-	 * workqueues.
-	 */
-	if (WARN_ON(wq->flags & __WQ_ORDERED))
-		return -EINVAL;
-
-	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
-	if (!wq_dev)
-		return -ENOMEM;
-
-	wq_dev->wq = wq;
-	wq_dev->dev.bus = &wq_subsys;
-	wq_dev->dev.init_name = wq->name;
-	wq_dev->dev.release = wq_device_release;
-
-	/*
-	 * unbound_attrs are created separately.  Suppress uevent until
-	 * everything is ready.
-	 */
-	dev_set_uevent_suppress(&wq_dev->dev, true);
-
-	ret = device_register(&wq_dev->dev);
-	if (ret) {
-		kfree(wq_dev);
-		wq->wq_dev = NULL;
-		return ret;
-	}
-
-	if (wq->flags & WQ_UNBOUND) {
-		struct device_attribute *attr;
-
-		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
-			ret = device_create_file(&wq_dev->dev, attr);
-			if (ret) {
-				device_unregister(&wq_dev->dev);
-				wq->wq_dev = NULL;
-				return ret;
-			}
-		}
-	}
-
-	dev_set_uevent_suppress(&wq_dev->dev, false);
-	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
-	return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
-	struct wq_device *wq_dev = wq->wq_dev;
-
-	if (!wq->wq_dev)
-		return;
-
-	wq->wq_dev = NULL;
-	device_unregister(&wq_dev->dev);
-}
-#else	/* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
-#endif	/* CONFIG_SYSFS */
-
 /**
  * free_workqueue_attrs - free a workqueue_attrs
  * @attrs: workqueue_attrs to free
@@ -4029,6 +3712,323 @@ out_unlock:
 	put_pwq_unlocked(old_pwq);
 }
 
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
+ * following attributes.
+ *
+ *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
+ *  max_active	RW int	: maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ *  id		RO int	: the associated pool ID
+ *  nice	RW int	: nice value of the workers
+ *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+	struct workqueue_struct		*wq;
+	struct device			dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+	return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int val;
+
+	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+		return -EINVAL;
+
+	workqueue_set_max_active(wq, val);
+	return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+	&dev_attr_per_cpu.attr,
+	&dev_attr_max_active.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	const char *delim = "";
+	int node, written = 0;
+
+	rcu_read_lock_sched();
+	for_each_node(node) {
+		written += scnprintf(buf + written, PAGE_SIZE - written,
+				     "%s%d:%d", delim, node,
+				     unbound_pwq_by_node(wq, node)->pool->id);
+		delim = " ";
+	}
+	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+	rcu_read_unlock_sched();
+
+	return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+	struct workqueue_attrs *attrs;
+
+	attrs = alloc_workqueue_attrs(GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+
+	mutex_lock(&wq->mutex);
+	copy_workqueue_attrs(attrs, wq->unbound_attrs);
+	mutex_unlock(&wq->mutex);
+	return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+		ret = apply_workqueue_attrs(wq, attrs);
+	else
+		ret = -EINVAL;
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq->unbound_attrs->cpumask));
+	mutex_unlock(&wq->mutex);
+	return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, attrs->cpumask);
+	if (!ret)
+		ret = apply_workqueue_attrs(wq, attrs);
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n",
+			    !wq->unbound_attrs->no_numa);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int v, ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = -EINVAL;
+	if (sscanf(buf, "%d", &v) == 1) {
+		attrs->no_numa = !v;
+		ret = apply_workqueue_attrs(wq, attrs);
+	}
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+	__ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+	.name				= "workqueue",
+	.dev_groups			= wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+	return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+	kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev;
+	int ret;
+
+	/*
+	 * Adjusting max_active or creating new pwqs by applyting
+	 * attributes breaks ordering guarantee.  Disallow exposing ordered
+	 * workqueues.
+	 */
+	if (WARN_ON(wq->flags & __WQ_ORDERED))
+		return -EINVAL;
+
+	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+	if (!wq_dev)
+		return -ENOMEM;
+
+	wq_dev->wq = wq;
+	wq_dev->dev.bus = &wq_subsys;
+	wq_dev->dev.init_name = wq->name;
+	wq_dev->dev.release = wq_device_release;
+
+	/*
+	 * unbound_attrs are created separately.  Suppress uevent until
+	 * everything is ready.
+	 */
+	dev_set_uevent_suppress(&wq_dev->dev, true);
+
+	ret = device_register(&wq_dev->dev);
+	if (ret) {
+		kfree(wq_dev);
+		wq->wq_dev = NULL;
+		return ret;
+	}
+
+	if (wq->flags & WQ_UNBOUND) {
+		struct device_attribute *attr;
+
+		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+			ret = device_create_file(&wq_dev->dev, attr);
+			if (ret) {
+				device_unregister(&wq_dev->dev);
+				wq->wq_dev = NULL;
+				return ret;
+			}
+		}
+	}
+
+	dev_set_uevent_suppress(&wq_dev->dev, false);
+	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+	return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev = wq->wq_dev;
+
+	if (!wq->wq_dev)
+		return;
+
+	wq->wq_dev = NULL;
+	device_unregister(&wq_dev->dev);
+}
+#else	/* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
+#endif	/* CONFIG_SYSFS */
+
 static int alloc_and_link_pwqs(struct workqueue_struct *wq)
 {
 	bool highpri = wq->flags & WQ_HIGHPRI;
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 2/4 V5] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
  2015-03-18  4:40   ` [PATCH 1/4 V5] workqueue: Reorder sysfs code Lai Jiangshan
@ 2015-03-18  4:40   ` Lai Jiangshan
  2015-03-24 15:55     ` Tejun Heo
  2015-03-18  4:40   ` [PATCH 3/4 V5] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
                     ` (3 subsequent siblings)
  5 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-18  4:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

The current apply_workqueue_attrs() includes both pwq allocation and pwq
installation, so when we batch multiple apply_workqueue_attrs() calls as a
transaction, we can't ensure that the transaction succeeds or fails as a
complete unit.

To solve this, we split apply_workqueue_attrs() into three stages.
The first stage does the preparation: allocating memory and pwqs.
The second stage does the attrs installation and pwqs installation.
The third stage frees the allocated memory and the (old or unused) pwqs.

As a result, batching multiple apply_workqueue_attrs() calls can
succeed or fail as a complete unit:
	1) do the first stage for all the workqueues in a batch
	2) only commit them all when all of the above succeed.

This patch is a preparation for the next patch ("Allow modifying low level
unbound workqueue cpumask") which will do a multiple apply_workqueue_attrs().

The patch doesn't change functionality except for two minor adjustments:
	1) free_unbound_pwq() for the error path is removed; we use the
	   heavier version put_pwq_unlocked() instead since the error path
	   is rare. This adjustment simplifies the code.
	2) the memory allocation is also moved under wq_pool_mutex.
	   This is needed to avoid further splitting.

Suggested-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 203 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 119 insertions(+), 84 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c6845fa..b150828 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3424,17 +3424,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
 	return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-	lockdep_assert_held(&wq_pool_mutex);
-
-	if (pwq) {
-		put_unbound_pool(pwq->pool);
-		kmem_cache_free(pwq_cache, pwq);
-	}
-}
-
 /**
  * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of interest
@@ -3497,42 +3486,49 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 	return old_pwq;
 }
 
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-			  const struct workqueue_attrs *attrs)
+/* Context to store the prepared attrs & pwqs before installed */
+struct wq_unbound_install_ctx {
+	struct workqueue_struct	*wq;	/* target to be installed */
+	struct workqueue_attrs	*attrs;	/* attrs for installing */
+	struct pool_workqueue	*dfl_pwq;
+	struct pool_workqueue	*pwq_tbl[];
+};
+
+/* Free the resources after success or abort */
+static void wq_unbound_install_ctx_free(struct wq_unbound_install_ctx *ctx)
 {
+	int node;
+
+	if (ctx) {
+		/* put the pwqs */
+		for_each_node(node)
+			put_pwq_unlocked(ctx->pwq_tbl[node]);
+		put_pwq_unlocked(ctx->dfl_pwq);
+
+		free_workqueue_attrs(ctx->attrs);
+	}
+
+	kfree(ctx);
+}
+
+/* Allocates the attrs and pwqs for later installment */
+static struct wq_unbound_install_ctx *
+wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
+			       const struct workqueue_attrs *attrs)
+{
+	struct wq_unbound_install_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
-	struct pool_workqueue **pwq_tbl, *dfl_pwq;
-	int node, ret;
+	int node;
 
-	/* only unbound workqueues can change attributes */
-	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-		return -EINVAL;
+	lockdep_assert_held(&wq_pool_mutex);
 
-	/* creating multiple pwqs breaks ordering guarantee */
-	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+		      GFP_KERNEL);
 
-	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!pwq_tbl || !new_attrs || !tmp_attrs)
-		goto enomem;
+	if (!ctx || !new_attrs || !tmp_attrs)
+		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
@@ -3546,75 +3542,114 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	copy_workqueue_attrs(tmp_attrs, new_attrs);
 
 	/*
-	 * CPUs should stay stable across pwq creations and installations.
-	 * Pin CPUs, determine the target cpumask for each node and create
-	 * pwqs accordingly.
-	 */
-	get_online_cpus();
-
-	mutex_lock(&wq_pool_mutex);
-
-	/*
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
 	 */
-	dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-	if (!dfl_pwq)
-		goto enomem_pwq;
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (!ctx->dfl_pwq)
+		goto out_free;
 
 	for_each_node(node) {
 		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-			pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-			if (!pwq_tbl[node])
-				goto enomem_pwq;
+			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+			if (!ctx->pwq_tbl[node])
+				goto out_free;
 		} else {
-			dfl_pwq->refcnt++;
-			pwq_tbl[node] = dfl_pwq;
+			ctx->dfl_pwq->refcnt++;
+			ctx->pwq_tbl[node] = ctx->dfl_pwq;
 		}
 	}
 
-	mutex_unlock(&wq_pool_mutex);
+	ctx->wq = wq;
+	ctx->attrs = new_attrs;
+
+out_free:
+	free_workqueue_attrs(tmp_attrs);
+
+	if (!ctx || !ctx->wq) {
+		kfree(new_attrs);
+		wq_unbound_install_ctx_free(ctx);
+		return NULL;
+	} else {
+		return ctx;
+	}
+}
+
+/* Set the unbound_attr and install the prepared pwqs. Should not fail */
+static void wq_unbound_install_ctx_commit(struct wq_unbound_install_ctx *ctx)
+{
+	int node;
 
 	/* all pwqs have been created successfully, let's install'em */
-	mutex_lock(&wq->mutex);
+	mutex_lock(&ctx->wq->mutex);
 
-	copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
 
 	/* save the previous pwq and install the new one */
 	for_each_node(node)
-		pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+							  ctx->pwq_tbl[node]);
 
 	/* @dfl_pwq might not have been used, ensure it's linked */
-	link_pwq(dfl_pwq);
-	swap(wq->dfl_pwq, dfl_pwq);
+	link_pwq(ctx->dfl_pwq);
+	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
-	mutex_unlock(&wq->mutex);
+	mutex_unlock(&ctx->wq->mutex);
+}
 
-	/* put the old pwqs */
-	for_each_node(node)
-		put_pwq_unlocked(pwq_tbl[node]);
-	put_pwq_unlocked(dfl_pwq);
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+			  const struct workqueue_attrs *attrs)
+{
+	struct wq_unbound_install_ctx *ctx;
+	int ret = -ENOMEM;
 
-	put_online_cpus();
-	ret = 0;
-	/* fall through */
-out_free:
-	free_workqueue_attrs(tmp_attrs);
-	free_workqueue_attrs(new_attrs);
-	kfree(pwq_tbl);
-	return ret;
+	/* only unbound workqueues can change attributes */
+	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+		return -EINVAL;
 
-enomem_pwq:
-	free_unbound_pwq(dfl_pwq);
-	for_each_node(node)
-		if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-			free_unbound_pwq(pwq_tbl[node]);
+	/* creating multiple pwqs breaks ordering guarantee */
+	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+		return -EINVAL;
+
+	/*
+	 * CPUs should stay stable across pwq creations and installations.
+	 * Pin CPUs, determine the target cpumask for each node and create
+	 * pwqs accordingly.
+	 */
+	get_online_cpus();
+
+	mutex_lock(&wq_pool_mutex);
+	ctx = wq_unbound_install_ctx_prepare(wq, attrs);
 	mutex_unlock(&wq_pool_mutex);
+
 	put_online_cpus();
-enomem:
-	ret = -ENOMEM;
-	goto out_free;
+
+	/* the ctx has been prepared successfully, let's commit it */
+	if (ctx) {
+		wq_unbound_install_ctx_commit(ctx);
+		ret = 0;
+	}
+
+	wq_unbound_install_ctx_free(ctx);
+
+	return ret;
 }
 
 /**
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 3/4 V5] workqueue: Create low-level unbound workqueues cpumask
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
  2015-03-18  4:40   ` [PATCH 1/4 V5] workqueue: Reorder sysfs code Lai Jiangshan
  2015-03-18  4:40   ` [PATCH 2/4 V5] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
@ 2015-03-18  4:40   ` Lai Jiangshan
  2015-03-18  4:40   ` [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-18  4:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

Create a cpumask that limits the affinity of all unbound workqueues.
This cpumask is controlled through a file at the root of the workqueue
sysfs directory.

It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
such that the effective cpumask applied for a given unbound workqueue is
the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
the new /sys/devices/virtual/workqueue/cpumask_unbounds file.

This patch implements the basic infrastructure and the read interface.
cpumask_unbounds is initially set to cpu_possible_mask.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b150828..d1197f0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+static cpumask_var_t wq_unbound_cpumask;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -3532,7 +3534,7 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3963,9 +3965,29 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static ssize_t unbounds_cpumask_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	int written;
+
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq_unbound_cpumask));
+
+	return written;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+	__ATTR(cpumask, 0444, unbounds_cpumask_show, NULL);
+
 static int __init wq_sysfs_init(void)
 {
-	return subsys_virtual_register(&wq_subsys, NULL);
+	int err;
+
+	err = subsys_virtual_register(&wq_subsys, NULL);
+	if (err)
+		return err;
+
+	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
 }
 core_initcall(wq_sysfs_init);
 
@@ -5098,6 +5120,9 @@ static int __init init_workqueues(void)
 
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
+	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
                     ` (2 preceding siblings ...)
  2015-03-18  4:40   ` [PATCH 3/4 V5] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
@ 2015-03-18  4:40   ` Lai Jiangshan
  2015-03-24 17:31     ` Tejun Heo
  2015-03-19  8:54   ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Mike Galbraith
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
  5 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-18  4:40 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Allow modifying the low-level unbound workqueue cpumask through
sysfs. This is performed by traversing the entire workqueue list
and calling wq_unbound_install_ctx_prepare() on the unbound workqueues
with the low level mask passed in. Only after all the preparations are done
do we commit them all together.

The ordered workqueue is excluded from the low level unbound workqueue cpumask;
it will be handled in the near future.

The per-nodes' pwqs are mandatorily controlled by the low level cpumask, while
the default pwq ignores the low level cpumask when (and ONLY when) the cpumask set
by the user doesn't overlap with the low level cpumask. In this case, we can't
apply the empty cpumask to the default pwq, so we use the user-set cpumask
directly.

The default wq_unbound_cpumask is still cpu_possible_mask because the workqueue
subsystem doesn't know what the best default value for the runtime is; the
system manager or another subsystem which has sufficient information should set
it when needed.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Original-patch-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 include/linux/workqueue.h |   1 +
 kernel/workqueue.c        | 121 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 114 insertions(+), 8 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index deee212..052d1bf 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
+int workqueue_unbounds_cpumask_set(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d1197f0..47f1a9d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_cpumask;
+static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -3492,6 +3492,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 struct wq_unbound_install_ctx {
 	struct workqueue_struct	*wq;	/* target to be installed */
 	struct workqueue_attrs	*attrs;	/* attrs for installing */
+	struct list_head	list;	/* queued for batching commit */
 	struct pool_workqueue	*dfl_pwq;
 	struct pool_workqueue	*pwq_tbl[];
 };
@@ -3516,10 +3517,11 @@ static void wq_unbound_install_ctx_free(struct wq_unbound_install_ctx *ctx)
 /* Allocates the attrs and pwqs for later installment */
 static struct wq_unbound_install_ctx *
 wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
-			       const struct workqueue_attrs *attrs)
+			       const struct workqueue_attrs *attrs,
+			       cpumask_var_t unbound_cpumask)
 {
 	struct wq_unbound_install_ctx *ctx;
-	struct workqueue_attrs *new_attrs, *tmp_attrs;
+	struct workqueue_attrs *new_attrs, *pwq_attrs, *tmp_attrs;
 	int node;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -3528,13 +3530,16 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 		      GFP_KERNEL);
 
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+	pwq_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	if (!ctx || !new_attrs || !tmp_attrs)
 		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
+	copy_workqueue_attrs(pwq_attrs, attrs);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3547,13 +3552,21 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
+	 *
+	 * If the cpumask set by the user doesn't overlap with the global
+	 * wq_unbound_cpumask, we ignore the wq_unbound_cpumask for this wq
+	 * which means all its nodes' pwqs are its default pwq and its default
+	 * pwq's workers' cpumask is totally equals to the user setting.
 	 */
-	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (cpumask_empty(pwq_attrs->cpumask))
+		ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	else
+		ctx->dfl_pwq = alloc_unbound_pwq(wq, pwq_attrs);
 	if (!ctx->dfl_pwq)
 		goto out_free;
 
 	for_each_node(node) {
-		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+		if (wq_calc_node_cpumask(pwq_attrs, node, -1, tmp_attrs->cpumask)) {
 			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
 			if (!ctx->pwq_tbl[node])
 				goto out_free;
@@ -3567,6 +3580,7 @@ wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
 	ctx->attrs = new_attrs;
 
 out_free:
+	free_workqueue_attrs(pwq_attrs);
 	free_workqueue_attrs(tmp_attrs);
 
 	if (!ctx || !ctx->wq) {
@@ -3638,7 +3652,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	get_online_cpus();
 
 	mutex_lock(&wq_pool_mutex);
-	ctx = wq_unbound_install_ctx_prepare(wq, attrs);
+	ctx = wq_unbound_install_ctx_prepare(wq, attrs, wq_unbound_cpumask);
 	mutex_unlock(&wq_pool_mutex);
 
 	put_online_cpus();
@@ -3712,6 +3726,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	 * wq's, the default pwq should be used.
 	 */
 	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+		cpumask_and(cpumask, cpumask, wq_unbound_cpumask);
+		if (cpumask_empty(cpumask))
+			goto use_dfl_pwq;
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
 			goto out_unlock;
 	} else {
@@ -3749,6 +3766,74 @@ out_unlock:
 	put_pwq_unlocked(old_pwq);
 }
 
+static int unbounds_cpumask_apply(cpumask_var_t cpumask)
+{
+	LIST_HEAD(ctxs);
+	int ret = 0;
+	struct workqueue_struct *wq;
+	struct wq_unbound_install_ctx *ctx, *n;
+
+	lockdep_assert_held(&wq_pool_mutex);
+
+	list_for_each_entry(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_UNBOUND))
+			continue;
+		/* creating multiple pwqs breaks ordering guarantee */
+		if (wq->flags & __WQ_ORDERED)
+			continue;
+
+		ctx = wq_unbound_install_ctx_prepare(wq, wq->unbound_attrs,
+						     cpumask);
+		if (!ctx) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		list_add_tail(&ctx->list, &ctxs);
+	}
+
+	list_for_each_entry_safe(ctx, n, &ctxs, list) {
+		if (ret >= 0)
+			wq_unbound_install_ctx_commit(ctx);
+		wq_unbound_install_ctx_free(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ *  workqueue_unbounds_cpumask_set - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function check the @cpumask
+ *  and apply it to all unbound workqueues and updates all pwqs of them.
+ *  When all succeed, it saves @cpumask to the global low-level unbound
+ *  cpumask.
+ *
+ *  Retun:	0	- Success
+ *  		-EINVAL	- No online cpu in the @cpumask
+ *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_unbounds_cpumask_set(cpumask_var_t cpumask)
+{
+	int ret = -EINVAL;
+
+	get_online_cpus();
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (cpumask_intersects(cpumask, cpu_online_mask)) {
+		mutex_lock(&wq_pool_mutex);
+		ret = unbounds_cpumask_apply(cpumask);
+		if (ret >= 0)
+			cpumask_copy(wq_unbound_cpumask, cpumask);
+		mutex_unlock(&wq_pool_mutex);
+	}
+	put_online_cpus();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(workqueue_unbounds_cpumask_set);
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -3965,19 +4050,39 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static ssize_t unbounds_cpumask_store(struct device *dev,
+				      struct device_attribute *attr,
+				      const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (!ret)
+		ret = workqueue_unbounds_cpumask_set(cpumask);
+
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
 static ssize_t unbounds_cpumask_show(struct device *dev,
 				     struct device_attribute *attr, char *buf)
 {
 	int written;
 
+	mutex_lock(&wq_pool_mutex);
 	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
 			    cpumask_pr_args(wq_unbound_cpumask));
+	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
 static struct device_attribute wq_sysfs_cpumask_attr =
-	__ATTR(cpumask, 0444, unbounds_cpumask_show, NULL);
+	__ATTR(cpumask, 0644, unbounds_cpumask_show, unbounds_cpumask_store);
 
 static int __init wq_sysfs_init(void)
 {
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
                     ` (3 preceding siblings ...)
  2015-03-18  4:40   ` [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-03-19  8:54   ` Mike Galbraith
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
  5 siblings, 0 replies; 48+ messages in thread
From: Mike Galbraith @ 2015-03-19  8:54 UTC (permalink / raw)
  To: Lai Jiangshan; +Cc: linux-kernel

On Wed, 2015-03-18 at 12:40 +0800, Lai Jiangshan wrote:
> This patchset mostly copies from Frederic and split the apply_workqueue_attrs()
> as TJ's suggest.

I plugged these into rt and gave them a go, had no trouble, and no
unbound workqueues perturbing 7 isolated sockets (56 cores).  Poking at
the mask heftily after dropping the shield and giving the box plenty of
generic work to do generated no excitement.  Nice boring testdrive.

> This patchset still doesn't include the patch "workqueue: Allow changing attributions
> of ordered workqueues".

After a brief stare, it looked ok to wedge that one in as well.  Box
didn't tell me I shouldn't have done that.

	-Mike   


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/4 V5] workqueue: Reorder sysfs code
  2015-03-18  4:40   ` [PATCH 1/4 V5] workqueue: Reorder sysfs code Lai Jiangshan
@ 2015-03-24 15:41     ` Tejun Heo
  0 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2015-03-24 15:41 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Frederic Weisbecker, Christoph Lameter,
	Kevin Hilman, Mike Galbraith, Paul E. McKenney, Viresh Kumar

Hello,

On Wed, Mar 18, 2015 at 12:40:14PM +0800, Lai Jiangshan wrote:
> @@ -4029,6 +3712,323 @@ out_unlock:
>  	put_pwq_unlocked(old_pwq);
>  }
>  
> +#ifdef CONFIG_SYSFS
...
> +#else	/* CONFIG_SYSFS */
> +static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
> +#endif	/* CONFIG_SYSFS */
> +
>  static int alloc_and_link_pwqs(struct workqueue_struct *wq)
>  {
>  	bool highpri = wq->flags & WQ_HIGHPRI;

Why here?  It's still in the middle of core logic implementation.  Why
not move it further down so that it's either before or after the
hotplug code?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/4 V5] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-03-18  4:40   ` [PATCH 2/4 V5] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
@ 2015-03-24 15:55     ` Tejun Heo
  0 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2015-03-24 15:55 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On Wed, Mar 18, 2015 at 12:40:15PM +0800, Lai Jiangshan wrote:
> +static void wq_unbound_install_ctx_free(struct wq_unbound_install_ctx *ctx)

Maybe naminig it more consistently with apply_workqueue_attrs() is
better?  apply_wqattrs_cleanup()?

>  {
> +	int node;
> +
> +	if (ctx) {
> +		/* put the pwqs */
> +		for_each_node(node)
> +			put_pwq_unlocked(ctx->pwq_tbl[node]);
> +		put_pwq_unlocked(ctx->dfl_pwq);
> +
> +		free_workqueue_attrs(ctx->attrs);
> +	}
> +
> +	kfree(ctx);
> +}

Wouldn't the following be better?  Or at least put kfree(ctx) together
with the rest?

	if (!ctx)
		return;
	the rest;

> +
> +/* Allocates the attrs and pwqs for later installment */
> +static struct wq_unbound_install_ctx *
> +wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
> +			       const struct workqueue_attrs *attrs)
> +{

apply_wqattrs_prepare()?

...
> +out_free:
> +	free_workqueue_attrs(tmp_attrs);
> +
> +	if (!ctx || !ctx->wq) {
> +		kfree(new_attrs);
> +		wq_unbound_install_ctx_free(ctx);
> +		return NULL;
> +	} else {
> +		return ctx;
> +	}
> +}

Let's separate out error return path even if that takes another goto
or a duplicate free_workqueue_attrs() call.

> +/* Set the unbound_attr and install the prepared pwqs. Should not fail */
> +static void wq_unbound_install_ctx_commit(struct wq_unbound_install_ctx *ctx)

apply_wqattrs_commit()?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-18  4:40   ` [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-03-24 17:31     ` Tejun Heo
  2015-03-31  7:46       ` Lai Jiangshan
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2015-03-24 17:31 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On Wed, Mar 18, 2015 at 12:40:17PM +0800, Lai Jiangshan wrote:
> The oreder-workquue is ignore from the low level unbound workqueue cpumask,
> it will be handled in near future.

Ugh, right, ordered workqueues are tricky.  Maybe we should change how
ordered workqueues are implemented.  Just gate work items at the
workqueue layer instead of fiddling with max_active and the number of
pwqs.

>  static struct wq_unbound_install_ctx *
>  wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
> -			       const struct workqueue_attrs *attrs)
> +			       const struct workqueue_attrs *attrs,
> +			       cpumask_var_t unbound_cpumask)
>  {
...
>  	/* make a copy of @attrs and sanitize it */
>  	copy_workqueue_attrs(new_attrs, attrs);
> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
> +	copy_workqueue_attrs(pwq_attrs, attrs);
> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);

Hmmm... we weren't checking whether the intersection becomes null
before.  Why are we doing it now?  Note that this doesn't really make
things water-tight as cpu on/offlining can still leave the mask w/o
any online cpus.  Shouldn't we just let the scheduler handle it as
before?

> @@ -3712,6 +3726,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
>  	 * wq's, the default pwq should be used.
>  	 */
>  	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
> +		cpumask_and(cpumask, cpumask, wq_unbound_cpumask);
> +		if (cpumask_empty(cpumask))
> +			goto use_dfl_pwq;

So, this special handling is necessary only because we did special in
the above for dfl_pwq.  Why do we need these?

> +static int unbounds_cpumask_apply(cpumask_var_t cpumask)
> +{
..
> +	list_for_each_entry_safe(ctx, n, &ctxs, list) {
> +		if (ret >= 0)

Let's do !ret.

> +			wq_unbound_install_ctx_commit(ctx);
> +		wq_unbound_install_ctx_free(ctx);
> +	}
...
> +/**
> + *  workqueue_unbounds_cpumask_set - Set the low-level unbound cpumask
> + *  @cpumask: the cpumask to set
> + *
> + *  The low-level workqueues cpumask is a global cpumask that limits
> + *  the affinity of all unbound workqueues.  This function check the @cpumask
> + *  and apply it to all unbound workqueues and updates all pwqs of them.
> + *  When all succeed, it saves @cpumask to the global low-level unbound
> + *  cpumask.
> + *
> + *  Retun:	0	- Success
> + *  		-EINVAL	- No online cpu in the @cpumask
> + *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
> + */
> +int workqueue_unbounds_cpumask_set(cpumask_var_t cpumask)
> +{
> +	int ret = -EINVAL;
> +
> +	get_online_cpus();
> +	cpumask_and(cpumask, cpumask, cpu_possible_mask);
> +	if (cpumask_intersects(cpumask, cpu_online_mask)) {

Does this make sense?  We can't prevent cpus going down right after
the mask is set.  What's the point of preventing empty config if we
can't prevent transitions into it and have to handle it anyway?

> +static ssize_t unbounds_cpumask_store(struct device *dev,
> +				      struct device_attribute *attr,
> +				      const char *buf, size_t count)

Naming is too confusing.  Please pick a name which clearly
distinguishes per-wq and global masking.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-24 17:31     ` Tejun Heo
@ 2015-03-31  7:46       ` Lai Jiangshan
  2015-04-01  8:33         ` Lai Jiangshan
  0 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-03-31  7:46 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On 03/25/2015 01:31 AM, Tejun Heo wrote:
> On Wed, Mar 18, 2015 at 12:40:17PM +0800, Lai Jiangshan wrote:
>> The oreder-workquue is ignore from the low level unbound workqueue cpumask,
>> it will be handled in near future.
> 
> Ugh, right, ordered workqueues are tricky.  Maybe we should change how
> ordered workqueues are implemented.  Just gate work items at the
> workqueue layer instead of fiddling with max_active and the number of
> pwqs.
> 
>>  static struct wq_unbound_install_ctx *
>>  wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
>> -			       const struct workqueue_attrs *attrs)
>> +			       const struct workqueue_attrs *attrs,
>> +			       cpumask_var_t unbound_cpumask)
>>  {
> ...
>>  	/* make a copy of @attrs and sanitize it */
>>  	copy_workqueue_attrs(new_attrs, attrs);
>> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
>> +	copy_workqueue_attrs(pwq_attrs, attrs);
>> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
>> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
> 
> Hmmm... we weren't checking whether the intersection becomes null
> before.

Did you refer to the following unquoted code "cpumask_empty(pwq_attrs->cpumask)"?

It is explained in the changelog and the comments.

>  Why are we doing it now?  Note that this doesn't really make
> things water-tight as cpu on/offlining can still leave the mask w/o
> any online cpus.  Shouldn't we just let the scheduler handle it as
> before?

Did you refer to "cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);"?

new_attrs will be copied to wq->unbound_attrs, so we hope it is sanitized.
The same code before this patchset did the same work.

And it may be used for the default pwq, and it can reduce pool creation:
	cpu_possible_mask = 0-7
	wq_unbound_cpumask = 0-3
	user1 try to set wq1:	attrs->cpumask = 4-9
	user2 try to set wq2:	attrs->cpumask = 4-11
thus both wq1 and wq2's default pwq's pool is the same pool. (pool's cpumask = 4-7)
	

> 
>> @@ -3712,6 +3726,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
>>  	 * wq's, the default pwq should be used.
>>  	 */
>>  	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
>> +		cpumask_and(cpumask, cpumask, wq_unbound_cpumask);
>> +		if (cpumask_empty(cpumask))
>> +			goto use_dfl_pwq;
> 
> So, this special handling is necessary only because we did special in
> the above for dfl_pwq.  Why do we need these?

wq->unbound_attrs is the user-set attrs; its cpumask is not controlled by
wq_unbound_cpumask, so we need these cpumask_and() calls.

Another question:
Why wq->unbound_attrs' cpumask is not controlled by wq_unbound_cpumask?

I hope wq->unbound_attrs always stays the same as the user's last setting,
regardless of how many times wq_unbound_cpumask is changed.

> 
>> +static int unbounds_cpumask_apply(cpumask_var_t cpumask)
>> +{
> ..
>> +	list_for_each_entry_safe(ctx, n, &ctxs, list) {
>> +		if (ret >= 0)
> 
> Let's do !ret.
> 
>> +			wq_unbound_install_ctx_commit(ctx);
>> +		wq_unbound_install_ctx_free(ctx);
>> +	}
> ...
>> +/**
>> + *  workqueue_unbounds_cpumask_set - Set the low-level unbound cpumask
>> + *  @cpumask: the cpumask to set
>> + *
>> + *  The low-level workqueues cpumask is a global cpumask that limits
>> + *  the affinity of all unbound workqueues.  This function check the @cpumask
>> + *  and apply it to all unbound workqueues and updates all pwqs of them.
>> + *  When all succeed, it saves @cpumask to the global low-level unbound
>> + *  cpumask.
>> + *
>> + *  Retun:	0	- Success
>> + *  		-EINVAL	- No online cpu in the @cpumask
>> + *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
>> + */
>> +int workqueue_unbounds_cpumask_set(cpumask_var_t cpumask)
>> +{
>> +	int ret = -EINVAL;
>> +
>> +	get_online_cpus();
>> +	cpumask_and(cpumask, cpumask, cpu_possible_mask);
>> +	if (cpumask_intersects(cpumask, cpu_online_mask)) {
> 
> Does this make sense?  We can't prevent cpus going down right after
> the mask is set.  What's the point of preventing empty config if we
> can't prevent transitions into it and have to handle it anyway?

Like set_cpus_allowed_ptr(). The cpumask must be valid when it is set,
although it can transition into non-intersection later.

This code is originated from Frederic.  Maybe he has some stronger reason.

> 
>> +static ssize_t unbounds_cpumask_store(struct device *dev,
>> +				      struct device_attribute *attr,
>> +				      const char *buf, size_t count)
> 
> Naming is too confusing.  Please pick a name which clearly
> distinguishes per-wq and global masking.

What about these names?
wq_unbound_cpumask ==> wq_unbound_global_cpumask
workqueue_unbounds_cpumask_set() ==> workqueue_set_unbound_global_cpumask(). (public API)
unbounds_cpumask_store() ==> wq_store_unbound_global_cpumask()   (static function for sysfs)

> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-03-31  7:46       ` Lai Jiangshan
@ 2015-04-01  8:33         ` Lai Jiangshan
  2015-04-01 15:52           ` Tejun Heo
  0 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-01  8:33 UTC (permalink / raw)
  To: Tejun Heo, Frederic Weisbecker
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar

Hi, Frederic, TJ

I considered a special case and forgot to consider another case.

====

Let @L = the low level unbound workqueue cpumask.
Let @U = the user setting cpumask (wq->unbound_attrs->cpumask).

Thus the pwqs in the specified wq are controlled by @L & @U (& = cpumask_and()).
But the per-node pwqs are mandatorily controlled by @L & @U; we don't discuss that here.
So this mail focuses only on the default pwq.

What happens for the default pwq when @L & @U == empty cpumask?
I considered this case in the patch.  In my patch, the dfl_pwq directly
use the @U. The reasons:
1) it is not a good idea to directly use cpu_possible_mask.
2) and it is not a good idea  either to use @L & @U ( = empty), in this case,
	the scheduler will use cpu_possible_mask.
3) so we have to choose one from @L and @U.
4) I chose @U instead of @L.
   @L: it is low level global cpumask and it controls *ALL* wqs.
   @U: it is set by the *USER*, it controls only one *SPECIFIC* wq.

Frederic, TJ, neither of you said anything about this early quick decision of mine.
Should this case be handled specially? And if yes, does this decision meet your requirements?

======

A comment from TJ reminded me that the final cpumask determined by the scheduler
is more important.

Let @O = cpu_online_mask.

The missing case:
(@L & @U) is not empty but (@L & @U & @O) is empty.

In my old code (V5 patchset), the dfl_pwq uses (@L & @U); the scheduler will
use cpu_possible_mask instead because there is no online cpu among the cpus in (@L & @U).
That is bad: the pwq is NOT controlled by @L nor @U now.

I think we may use @U for the dfl_pwq in this case.  But it will introduces
a problem:

When (@L & @U) has an online cpu, the dfl_pwq's cpumask is (@L & @U).
When (@L & @U) has no online cpu, the dfl_pwq's cpumask is @U.
It means dfl_pwq may need to be reallocated during the cpuhotplug-add/remove
and it means wq_update_unbound_numa() can fail.

Frederic, TJ, any comments about this case?
TJ, would you like to make wq_update_unbound_numa() be failure-able?

thanks
Lai

On 03/31/2015 03:46 PM, Lai Jiangshan wrote:
> On 03/25/2015 01:31 AM, Tejun Heo wrote:
>> On Wed, Mar 18, 2015 at 12:40:17PM +0800, Lai Jiangshan wrote:
>>> The oreder-workquue is ignore from the low level unbound workqueue cpumask,
>>> it will be handled in near future.
>>
>> Ugh, right, ordered workqueues are tricky.  Maybe we should change how
>> ordered workqueues are implemented.  Just gate work items at the
>> workqueue layer instead of fiddling with max_active and the number of
>> pwqs.
>>
>>>  static struct wq_unbound_install_ctx *
>>>  wq_unbound_install_ctx_prepare(struct workqueue_struct *wq,
>>> -			       const struct workqueue_attrs *attrs)
>>> +			       const struct workqueue_attrs *attrs,
>>> +			       cpumask_var_t unbound_cpumask)
>>>  {
>> ...
>>>  	/* make a copy of @attrs and sanitize it */
>>>  	copy_workqueue_attrs(new_attrs, attrs);
>>> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
>>> +	copy_workqueue_attrs(pwq_attrs, attrs);
>>> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
>>> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
>>
>> Hmmm... we weren't checking whether the intersection becomes null
>> before.
> 
> Di you refer to the unquoted following code "cpumask_empty(pwq_attrs->cpumask)"?
> 
> It is explained in the changelog and the comments.
> 
>>  Why are we doing it now?  Note that this doesn't really make
>> things water-tight as cpu on/offlining can still leave the mask w/o
>> any online cpus.  Shouldn't we just let the scheduler handle it as
>> before?
> 
> Did you refer to "cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);"?
> 
> new_attrs will be copied to wq->unbound_attrs, so we hope it is sanity.
> the same code before this patchset did the same work.
> 
> And it maybe be used for default pwq, and it can reduce the pool creation:
> 	cpu_possible_mask = 0-7
> 	wq_unbound_cpumask = 0-3
> 	user1 try to set wq1:	attrs->cpumask = 4-9
> 	user2 try to set wq2:	attrs->cpumask = 4-11
> thus both wq1 and wq2's default pwq's pool is the same pool. (pool's cpumask = 4-7)
> 	
> 
>>
>>> @@ -3712,6 +3726,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
>>>  	 * wq's, the default pwq should be used.
>>>  	 */
>>>  	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
>>> +		cpumask_and(cpumask, cpumask, wq_unbound_cpumask);
>>> +		if (cpumask_empty(cpumask))
>>> +			goto use_dfl_pwq;
>>
>> So, this special handling is necessary only because we did special in
>> the above for dfl_pwq.  Why do we need these?
> 
> wq->unbound_attrs is user setting attrs, its cpumask is not controlled by
> wq_unbound_cpumask. so we need these cpumask_and().
> 
> Another question:
> Why wq->unbound_attrs' cpumask is not controlled by wq_unbound_cpumask?
> 
> I hope the wq->unbound_attrs is always as the same as the user's last setting,
> regardless how much times the wq_unbound_cpumask is changed.
> 
>>
>>> +static int unbounds_cpumask_apply(cpumask_var_t cpumask)
>>> +{
>> ..
>>> +	list_for_each_entry_safe(ctx, n, &ctxs, list) {
>>> +		if (ret >= 0)
>>
>> Let's do !ret.
>>
>>> +			wq_unbound_install_ctx_commit(ctx);
>>> +		wq_unbound_install_ctx_free(ctx);
>>> +	}
>> ...
>>> +/**
>>> + *  workqueue_unbounds_cpumask_set - Set the low-level unbound cpumask
>>> + *  @cpumask: the cpumask to set
>>> + *
>>> + *  The low-level workqueues cpumask is a global cpumask that limits
>>> + *  the affinity of all unbound workqueues.  This function check the @cpumask
>>> + *  and apply it to all unbound workqueues and updates all pwqs of them.
>>> + *  When all succeed, it saves @cpumask to the global low-level unbound
>>> + *  cpumask.
>>> + *
>>> + *  Retun:	0	- Success
>>> + *  		-EINVAL	- No online cpu in the @cpumask
>>> + *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
>>> + */
>>> +int workqueue_unbounds_cpumask_set(cpumask_var_t cpumask)
>>> +{
>>> +	int ret = -EINVAL;
>>> +
>>> +	get_online_cpus();
>>> +	cpumask_and(cpumask, cpumask, cpu_possible_mask);
>>> +	if (cpumask_intersects(cpumask, cpu_online_mask)) {
>>
>> Does this make sense?  We can't prevent cpus going down right after
>> the mask is set.  What's the point of preventing empty config if we
>> can't prevent transitions into it and have to handle it anyway?
> 
> Like set_cpus_allowed_ptr(). The cpumask must be valid when setting,
> although it can be transited into non-intersection later.
> 
> This code is originated from Frederic.  Maybe he has some stronger reason.
> 
>>
>>> +static ssize_t unbounds_cpumask_store(struct device *dev,
>>> +				      struct device_attribute *attr,
>>> +				      const char *buf, size_t count)
>>
>> Naming is too confusing.  Please pick a name which clearly
>> distinguishes per-wq and global masking.
> 
> What about these names?
> wq_unbound_cpumask ==> wq_unbound_global_cpumask
> workqueue_unbounds_cpumask_set() ==> workqueue_set_unbound_global_cpumask(). (public API)
> unbounds_cpumask_store() ==> wq_store_unbound_global_cpumask()   (static function for sysfs)
> 
>>
>> Thanks.
>>
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> .
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-01  8:33         ` Lai Jiangshan
@ 2015-04-01 15:52           ` Tejun Heo
  0 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2015-04-01 15:52 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: Frederic Weisbecker, linux-kernel, Christoph Lameter,
	Kevin Hilman, Mike Galbraith, Paul E. McKenney, Viresh Kumar

On Wed, Apr 01, 2015 at 04:33:30PM +0800, Lai Jiangshan wrote:
> The missing case:
> (@L & @U) is not empty but (@L & @U @O) is empty.
> 
> In my old code (V5 patchset), the dfl_pwq uses (@L & @U), the scheduler will
> use cpu_possible_mask instead due to there is no cpu onlined among all cpu in (@L & @U).
> It is bad, the pwq is NOT controlled by @L nor @U now.
> 
> I think we may use @U for the dfl_pwq in this case.  But it will introduces
> a problem:
> 
> When (@L & @U) has online cpu, the dfl_pwq's cpumaks is (@L & @U).
> when (@L & @U) has no online cpu, the dfl_pwq's cpumask is @U.
> It means dfl_pwq may need to be reallocated during the cpuhotplug-add/remove
> and it means wq_update_unbound_numa() can fail.
> 
> Frederic, TJ, any comments about this case?
> TJ, would you like to make wq_update_unbound_numa() be failure-able?

The goal is making @L behave as cpu_possible_mask for workqueues,
right?  If @L & @U is empty, fallback to @L.  If @L & @O is empty,
leave it to the scheduler.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 0/4 V6] workqueue: Introduce low-level unbound wq sysfs cpumask v5
  2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
                     ` (4 preceding siblings ...)
  2015-03-19  8:54   ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Mike Galbraith
@ 2015-04-02 11:14   ` Lai Jiangshan
  2015-04-02 11:14     ` [PATCH 1/4 V6] workqueue: Reorder sysfs code Lai Jiangshan
                       ` (4 more replies)
  5 siblings, 5 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-02 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Tejun Heo, Christoph Lameter, Kevin Hilman,
	Mike Galbraith, Paul E. McKenney, Viresh Kumar,
	Frederic Weisbecker

This patchset mostly copies from Frederic and split the apply_workqueue_attrs()
as TJ's suggest.

This patchset still doesn't include the patch "workqueue: Allow changing attributions
of ordered workqueues"; I hope this reduces the review burden. The handling
for ordered workqueues will be reposted after this patchset is accepted.

changed from v5:
Apply TJ's comment, rename, error-path .etc.

The default pwq falls back to the low-level global cpumask when (and ONLY when) the
cpumask set by the user doesn't overlap with the low-level cpumask.

Changed from V4
Add workqueue_unbounds_cpumask_set() kernel API and minimally restruct the patch4.



Cc: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
---

Frederic Weisbecker (2):
  workqueue: Reorder sysfs code
  workqueue: Create low-level unbound workqueues cpumask

Lai Jiangshan (2):
  workqueue: split apply_workqueue_attrs() into 3 stages
  workqueue: Allow modifying low level unbound workqueue cpumask

 include/linux/workqueue.h |   1 +
 kernel/workqueue.c        | 971 +++++++++++++++++++++++++++-------------------
 2 files changed, 568 insertions(+), 404 deletions(-)

-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/4 V6] workqueue: Reorder sysfs code
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
@ 2015-04-02 11:14     ` Lai Jiangshan
  2015-04-06 15:22       ` Tejun Heo
  2015-04-02 11:14     ` [PATCH 2/4 V6] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
                       ` (3 subsequent siblings)
  4 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-02 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

The sysfs code usually belongs at the bottom of the file since it deals
with high level objects. In the workqueue code it is misplaced, such
that we would need to work around function references to allow the sysfs
code to call APIs like apply_workqueue_attrs().

Let's move that block further down in the file, almost to the bottom.

And declare workqueue_sysfs_unregister() just before destroy_workqueue(),
which references it.

Suggested-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 636 +++++++++++++++++++++++++++--------------------------
 1 file changed, 319 insertions(+), 317 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ca0b1d..25394f6 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3001,323 +3001,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
 }
 EXPORT_SYMBOL_GPL(execute_in_process_context);
 
-#ifdef CONFIG_SYSFS
-/*
- * Workqueues with WQ_SYSFS flag set is visible to userland via
- * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
- * following attributes.
- *
- *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
- *  max_active	RW int	: maximum number of in-flight work items
- *
- * Unbound workqueues have the following extra attributes.
- *
- *  id		RO int	: the associated pool ID
- *  nice	RW int	: nice value of the workers
- *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
- */
-struct wq_device {
-	struct workqueue_struct		*wq;
-	struct device			dev;
-};
-
-static struct workqueue_struct *dev_to_wq(struct device *dev)
-{
-	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-	return wq_dev->wq;
-}
-
-static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
-}
-static DEVICE_ATTR_RO(per_cpu);
-
-static ssize_t max_active_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-
-	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
-}
-
-static ssize_t max_active_store(struct device *dev,
-				struct device_attribute *attr, const char *buf,
-				size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int val;
-
-	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
-		return -EINVAL;
-
-	workqueue_set_max_active(wq, val);
-	return count;
-}
-static DEVICE_ATTR_RW(max_active);
-
-static struct attribute *wq_sysfs_attrs[] = {
-	&dev_attr_per_cpu.attr,
-	&dev_attr_max_active.attr,
-	NULL,
-};
-ATTRIBUTE_GROUPS(wq_sysfs);
-
-static ssize_t wq_pool_ids_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	const char *delim = "";
-	int node, written = 0;
-
-	rcu_read_lock_sched();
-	for_each_node(node) {
-		written += scnprintf(buf + written, PAGE_SIZE - written,
-				     "%s%d:%d", delim, node,
-				     unbound_pwq_by_node(wq, node)->pool->id);
-		delim = " ";
-	}
-	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
-	rcu_read_unlock_sched();
-
-	return written;
-}
-
-static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
-	mutex_unlock(&wq->mutex);
-
-	return written;
-}
-
-/* prepare workqueue_attrs for sysfs store operations */
-static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
-{
-	struct workqueue_attrs *attrs;
-
-	attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!attrs)
-		return NULL;
-
-	mutex_lock(&wq->mutex);
-	copy_workqueue_attrs(attrs, wq->unbound_attrs);
-	mutex_unlock(&wq->mutex);
-	return attrs;
-}
-
-static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
-	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
-		ret = apply_workqueue_attrs(wq, attrs);
-	else
-		ret = -EINVAL;
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static ssize_t wq_cpumask_show(struct device *dev,
-			       struct device_attribute *attr, char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
-			    cpumask_pr_args(wq->unbound_attrs->cpumask));
-	mutex_unlock(&wq->mutex);
-	return written;
-}
-
-static ssize_t wq_cpumask_store(struct device *dev,
-				struct device_attribute *attr,
-				const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	ret = cpumask_parse(buf, attrs->cpumask);
-	if (!ret)
-		ret = apply_workqueue_attrs(wq, attrs);
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
-			    char *buf)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	int written;
-
-	mutex_lock(&wq->mutex);
-	written = scnprintf(buf, PAGE_SIZE, "%d\n",
-			    !wq->unbound_attrs->no_numa);
-	mutex_unlock(&wq->mutex);
-
-	return written;
-}
-
-static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
-			     const char *buf, size_t count)
-{
-	struct workqueue_struct *wq = dev_to_wq(dev);
-	struct workqueue_attrs *attrs;
-	int v, ret;
-
-	attrs = wq_sysfs_prep_attrs(wq);
-	if (!attrs)
-		return -ENOMEM;
-
-	ret = -EINVAL;
-	if (sscanf(buf, "%d", &v) == 1) {
-		attrs->no_numa = !v;
-		ret = apply_workqueue_attrs(wq, attrs);
-	}
-
-	free_workqueue_attrs(attrs);
-	return ret ?: count;
-}
-
-static struct device_attribute wq_sysfs_unbound_attrs[] = {
-	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
-	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
-	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
-	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
-	__ATTR_NULL,
-};
-
-static struct bus_type wq_subsys = {
-	.name				= "workqueue",
-	.dev_groups			= wq_sysfs_groups,
-};
-
-static int __init wq_sysfs_init(void)
-{
-	return subsys_virtual_register(&wq_subsys, NULL);
-}
-core_initcall(wq_sysfs_init);
-
-static void wq_device_release(struct device *dev)
-{
-	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
-
-	kfree(wq_dev);
-}
-
-/**
- * workqueue_sysfs_register - make a workqueue visible in sysfs
- * @wq: the workqueue to register
- *
- * Expose @wq in sysfs under /sys/bus/workqueue/devices.
- * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
- * which is the preferred method.
- *
- * Workqueue user should use this function directly iff it wants to apply
- * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
- * apply_workqueue_attrs() may race against userland updating the
- * attributes.
- *
- * Return: 0 on success, -errno on failure.
- */
-int workqueue_sysfs_register(struct workqueue_struct *wq)
-{
-	struct wq_device *wq_dev;
-	int ret;
-
-	/*
-	 * Adjusting max_active or creating new pwqs by applyting
-	 * attributes breaks ordering guarantee.  Disallow exposing ordered
-	 * workqueues.
-	 */
-	if (WARN_ON(wq->flags & __WQ_ORDERED))
-		return -EINVAL;
-
-	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
-	if (!wq_dev)
-		return -ENOMEM;
-
-	wq_dev->wq = wq;
-	wq_dev->dev.bus = &wq_subsys;
-	wq_dev->dev.init_name = wq->name;
-	wq_dev->dev.release = wq_device_release;
-
-	/*
-	 * unbound_attrs are created separately.  Suppress uevent until
-	 * everything is ready.
-	 */
-	dev_set_uevent_suppress(&wq_dev->dev, true);
-
-	ret = device_register(&wq_dev->dev);
-	if (ret) {
-		kfree(wq_dev);
-		wq->wq_dev = NULL;
-		return ret;
-	}
-
-	if (wq->flags & WQ_UNBOUND) {
-		struct device_attribute *attr;
-
-		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
-			ret = device_create_file(&wq_dev->dev, attr);
-			if (ret) {
-				device_unregister(&wq_dev->dev);
-				wq->wq_dev = NULL;
-				return ret;
-			}
-		}
-	}
-
-	dev_set_uevent_suppress(&wq_dev->dev, false);
-	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
-	return 0;
-}
-
-/**
- * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
- * @wq: the workqueue to unregister
- *
- * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
- */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
-{
-	struct wq_device *wq_dev = wq->wq_dev;
-
-	if (!wq->wq_dev)
-		return;
-
-	wq->wq_dev = NULL;
-	device_unregister(&wq_dev->dev);
-}
-#else	/* CONFIG_SYSFS */
-static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
-#endif	/* CONFIG_SYSFS */
-
 /**
  * free_workqueue_attrs - free a workqueue_attrs
  * @attrs: workqueue_attrs to free
@@ -4183,6 +3866,8 @@ err_destroy:
 }
 EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
 
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
+
 /**
  * destroy_workqueue - safely terminate a workqueue
  * @wq: target workqueue
@@ -5014,6 +4699,323 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME.  All visible workqueues have the
+ * following attributes.
+ *
+ *  per_cpu	RO bool	: whether the workqueue is per-cpu or unbound
+ *  max_active	RW int	: maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ *  id		RO int	: the associated pool ID
+ *  nice	RW int	: nice value of the workers
+ *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+	struct workqueue_struct		*wq;
+	struct device			dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+	return wq_dev->wq;
+}
+
+static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+static DEVICE_ATTR_RO(per_cpu);
+
+static ssize_t max_active_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t max_active_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int val;
+
+	if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+		return -EINVAL;
+
+	workqueue_set_max_active(wq, val);
+	return count;
+}
+static DEVICE_ATTR_RW(max_active);
+
+static struct attribute *wq_sysfs_attrs[] = {
+	&dev_attr_per_cpu.attr,
+	&dev_attr_max_active.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(wq_sysfs);
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	const char *delim = "";
+	int node, written = 0;
+
+	rcu_read_lock_sched();
+	for_each_node(node) {
+		written += scnprintf(buf + written, PAGE_SIZE - written,
+				     "%s%d:%d", delim, node,
+				     unbound_pwq_by_node(wq, node)->pool->id);
+		delim = " ";
+	}
+	written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+	rcu_read_unlock_sched();
+
+	return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+	struct workqueue_attrs *attrs;
+
+	attrs = alloc_workqueue_attrs(GFP_KERNEL);
+	if (!attrs)
+		return NULL;
+
+	mutex_lock(&wq->mutex);
+	copy_workqueue_attrs(attrs, wq->unbound_attrs);
+	mutex_unlock(&wq->mutex);
+	return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+	    attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE)
+		ret = apply_workqueue_attrs(wq, attrs);
+	else
+		ret = -EINVAL;
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq->unbound_attrs->cpumask));
+	mutex_unlock(&wq->mutex);
+	return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, attrs->cpumask);
+	if (!ret)
+		ret = apply_workqueue_attrs(wq, attrs);
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+			    char *buf)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	int written;
+
+	mutex_lock(&wq->mutex);
+	written = scnprintf(buf, PAGE_SIZE, "%d\n",
+			    !wq->unbound_attrs->no_numa);
+	mutex_unlock(&wq->mutex);
+
+	return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+			     const char *buf, size_t count)
+{
+	struct workqueue_struct *wq = dev_to_wq(dev);
+	struct workqueue_attrs *attrs;
+	int v, ret;
+
+	attrs = wq_sysfs_prep_attrs(wq);
+	if (!attrs)
+		return -ENOMEM;
+
+	ret = -EINVAL;
+	if (sscanf(buf, "%d", &v) == 1) {
+		attrs->no_numa = !v;
+		ret = apply_workqueue_attrs(wq, attrs);
+	}
+
+	free_workqueue_attrs(attrs);
+	return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+	__ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+	__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+	__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+	__ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+	__ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+	.name				= "workqueue",
+	.dev_groups			= wq_sysfs_groups,
+};
+
+static int __init wq_sysfs_init(void)
+{
+	return subsys_virtual_register(&wq_subsys, NULL);
+}
+core_initcall(wq_sysfs_init);
+
+static void wq_device_release(struct device *dev)
+{
+	struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+	kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Return: 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev;
+	int ret;
+
+	/*
+	 * Adjusting max_active or creating new pwqs by applyting
+	 * attributes breaks ordering guarantee.  Disallow exposing ordered
+	 * workqueues.
+	 */
+	if (WARN_ON(wq->flags & __WQ_ORDERED))
+		return -EINVAL;
+
+	wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+	if (!wq_dev)
+		return -ENOMEM;
+
+	wq_dev->wq = wq;
+	wq_dev->dev.bus = &wq_subsys;
+	wq_dev->dev.init_name = wq->name;
+	wq_dev->dev.release = wq_device_release;
+
+	/*
+	 * unbound_attrs are created separately.  Suppress uevent until
+	 * everything is ready.
+	 */
+	dev_set_uevent_suppress(&wq_dev->dev, true);
+
+	ret = device_register(&wq_dev->dev);
+	if (ret) {
+		kfree(wq_dev);
+		wq->wq_dev = NULL;
+		return ret;
+	}
+
+	if (wq->flags & WQ_UNBOUND) {
+		struct device_attribute *attr;
+
+		for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+			ret = device_create_file(&wq_dev->dev, attr);
+			if (ret) {
+				device_unregister(&wq_dev->dev);
+				wq->wq_dev = NULL;
+				return ret;
+			}
+		}
+	}
+
+	dev_set_uevent_suppress(&wq_dev->dev, false);
+	kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+	return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+	struct wq_device *wq_dev = wq->wq_dev;
+
+	if (!wq->wq_dev)
+		return;
+
+	wq->wq_dev = NULL;
+	device_unregister(&wq_dev->dev);
+}
+#else	/* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)	{ }
+#endif	/* CONFIG_SYSFS */
+
 static void __init wq_numa_init(void)
 {
 	cpumask_var_t *tbl;
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 2/4 V6] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
  2015-04-02 11:14     ` [PATCH 1/4 V6] workqueue: Reorder sysfs code Lai Jiangshan
@ 2015-04-02 11:14     ` Lai Jiangshan
  2015-04-06 15:39       ` Tejun Heo
  2015-04-02 11:14     ` [PATCH 3/4 V6] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
                       ` (2 subsequent siblings)
  4 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-02 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Current apply_workqueue_attrs() includes pwqs-allocation and pwqs-installation,
so when we batch multiple apply_workqueue_attrs()s as a transaction, we can't
ensure that the transaction succeeds or fails as a complete unit.

To solve this, we split apply_workqueue_attrs() into three stages.
The first stage does the preparation: allocation memory, pwqs.
The second stage does the attrs-installation and pwqs-installation.
The third stage frees the allocated memory and (old or unused) pwqs.

As the result, batching multiple apply_workqueue_attrs()s can
succeed or fail as a complete unit:
	1) batch do all the first stage for all the workqueues
	2) only commit all when all the above succeed.

This patch is a preparation for the next patch ("Allow modifying low level
unbound workqueue cpumask") which will perform multiple apply_workqueue_attrs() calls.

The patch doesn't change functionality except for two minor adjustments:
	1) free_unbound_pwq() for the error path is removed; we use the
	   heavier version put_pwq_unlocked() instead since the error path
	   is rare. This adjustment simplifies the code.
	2) The memory allocation is also moved inside wq_pool_mutex.
	   This is needed to avoid further splitting.

Suggested-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 200 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 116 insertions(+), 84 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 25394f6..15531b8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3424,17 +3424,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
 	return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-	lockdep_assert_held(&wq_pool_mutex);
-
-	if (pwq) {
-		put_unbound_pool(pwq->pool);
-		kmem_cache_free(pwq_cache, pwq);
-	}
-}
-
 /**
  * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of interest
@@ -3497,42 +3486,49 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 	return old_pwq;
 }
 
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-			  const struct workqueue_attrs *attrs)
+/* Context to store the prepared attrs & pwqs before installed */
+struct apply_wqattrs_ctx {
+	struct workqueue_struct	*wq;	/* target to be installed */
+	struct workqueue_attrs	*attrs;	/* attrs for installing */
+	struct pool_workqueue	*dfl_pwq;
+	struct pool_workqueue	*pwq_tbl[];
+};
+
+/* Free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+	if (ctx) {
+		int node;
+
+		/* put the pwqs */
+		for_each_node(node)
+			put_pwq_unlocked(ctx->pwq_tbl[node]);
+		put_pwq_unlocked(ctx->dfl_pwq);
+
+		free_workqueue_attrs(ctx->attrs);
+
+		kfree(ctx);
+	}
+}
+
+/* Allocates the attrs and pwqs for later installment */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+		      const struct workqueue_attrs *attrs)
 {
+	struct apply_wqattrs_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
-	struct pool_workqueue **pwq_tbl, *dfl_pwq;
-	int node, ret;
+	int node;
 
-	/* only unbound workqueues can change attributes */
-	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-		return -EINVAL;
+	lockdep_assert_held(&wq_pool_mutex);
 
-	/* creating multiple pwqs breaks ordering guarantee */
-	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+		      GFP_KERNEL);
 
-	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!pwq_tbl || !new_attrs || !tmp_attrs)
-		goto enomem;
+	if (!ctx || !new_attrs || !tmp_attrs)
+		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
@@ -3546,75 +3542,111 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	copy_workqueue_attrs(tmp_attrs, new_attrs);
 
 	/*
-	 * CPUs should stay stable across pwq creations and installations.
-	 * Pin CPUs, determine the target cpumask for each node and create
-	 * pwqs accordingly.
-	 */
-	get_online_cpus();
-
-	mutex_lock(&wq_pool_mutex);
-
-	/*
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
 	 */
-	dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-	if (!dfl_pwq)
-		goto enomem_pwq;
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (!ctx->dfl_pwq)
+		goto out_free;
 
 	for_each_node(node) {
 		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-			pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-			if (!pwq_tbl[node])
-				goto enomem_pwq;
+			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+			if (!ctx->pwq_tbl[node])
+				goto out_free;
 		} else {
-			dfl_pwq->refcnt++;
-			pwq_tbl[node] = dfl_pwq;
+			ctx->dfl_pwq->refcnt++;
+			ctx->pwq_tbl[node] = ctx->dfl_pwq;
 		}
 	}
 
-	mutex_unlock(&wq_pool_mutex);
+	ctx->wq = wq;
+	ctx->attrs = new_attrs;
+	free_workqueue_attrs(tmp_attrs);
+	return ctx;
+
+out_free:
+	free_workqueue_attrs(tmp_attrs);
+	free_workqueue_attrs(new_attrs);
+	apply_wqattrs_cleanup(ctx);
+	return NULL;
+}
+
+/* Set the unbound_attr and install the prepared pwqs. Should not fail */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+	int node;
 
 	/* all pwqs have been created successfully, let's install'em */
-	mutex_lock(&wq->mutex);
+	mutex_lock(&ctx->wq->mutex);
 
-	copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
 
 	/* save the previous pwq and install the new one */
 	for_each_node(node)
-		pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+							  ctx->pwq_tbl[node]);
 
 	/* @dfl_pwq might not have been used, ensure it's linked */
-	link_pwq(dfl_pwq);
-	swap(wq->dfl_pwq, dfl_pwq);
+	link_pwq(ctx->dfl_pwq);
+	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
-	mutex_unlock(&wq->mutex);
+	mutex_unlock(&ctx->wq->mutex);
+}
 
-	/* put the old pwqs */
-	for_each_node(node)
-		put_pwq_unlocked(pwq_tbl[node]);
-	put_pwq_unlocked(dfl_pwq);
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+			  const struct workqueue_attrs *attrs)
+{
+	struct apply_wqattrs_ctx *ctx;
+	int ret = -ENOMEM;
 
-	put_online_cpus();
-	ret = 0;
-	/* fall through */
-out_free:
-	free_workqueue_attrs(tmp_attrs);
-	free_workqueue_attrs(new_attrs);
-	kfree(pwq_tbl);
-	return ret;
+	/* only unbound workqueues can change attributes */
+	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+		return -EINVAL;
 
-enomem_pwq:
-	free_unbound_pwq(dfl_pwq);
-	for_each_node(node)
-		if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-			free_unbound_pwq(pwq_tbl[node]);
+	/* creating multiple pwqs breaks ordering guarantee */
+	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+		return -EINVAL;
+
+	/*
+	 * CPUs should stay stable across pwq creations and installations.
+	 * Pin CPUs, determine the target cpumask for each node and create
+	 * pwqs accordingly.
+	 */
+	get_online_cpus();
+
+	mutex_lock(&wq_pool_mutex);
+	ctx = apply_wqattrs_prepare(wq, attrs);
 	mutex_unlock(&wq_pool_mutex);
+
 	put_online_cpus();
-enomem:
-	ret = -ENOMEM;
-	goto out_free;
+
+	/* the ctx has been prepared successfully, let's commit it */
+	if (ctx) {
+		apply_wqattrs_commit(ctx);
+		ret = 0;
+	}
+
+	apply_wqattrs_cleanup(ctx);
+
+	return ret;
 }
 
 /**
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 3/4 V6] workqueue: Create low-level unbound workqueues cpumask
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
  2015-04-02 11:14     ` [PATCH 1/4 V6] workqueue: Reorder sysfs code Lai Jiangshan
  2015-04-02 11:14     ` [PATCH 2/4 V6] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
@ 2015-04-02 11:14     ` Lai Jiangshan
  2015-04-02 11:14     ` [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
  2015-04-07 11:26     ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
  4 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-02 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

Create a cpumask that limit the affinity of all unbound workqueues.
This cpumask is controlled though a file at the root of the workqueue
sysfs directory.

It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
such that the effective cpumask applied for a given unbound workqueue is
the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
the new /sys/devices/virtual/workqueue/cpumask file.

This patch implements the basic infrastructure and the read interface.
wq_unbound_global_cpumask is initially set to cpu_possible_mask.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 15531b8..f58c549 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+static cpumask_var_t wq_unbound_global_cpumask;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -3532,7 +3534,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -4947,9 +4949,29 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static ssize_t wq_unbound_global_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int written;
+
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq_unbound_global_cpumask));
+
+	return written;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+	__ATTR(cpumask, 0444, wq_unbound_global_cpumask_show, NULL);
+
 static int __init wq_sysfs_init(void)
 {
-	return subsys_virtual_register(&wq_subsys, NULL);
+	int err;
+
+	err = subsys_virtual_register(&wq_subsys, NULL);
+	if (err)
+		return err;
+
+	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
 }
 core_initcall(wq_sysfs_init);
 
@@ -5097,6 +5119,9 @@ static int __init init_workqueues(void)
 
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
+	BUG_ON(!alloc_cpumask_var(&wq_unbound_global_cpumask, GFP_KERNEL));
+	cpumask_copy(wq_unbound_global_cpumask, cpu_possible_mask);
+
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
                       ` (2 preceding siblings ...)
  2015-04-02 11:14     ` [PATCH 3/4 V6] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
@ 2015-04-02 11:14     ` Lai Jiangshan
  2015-04-06 15:53       ` Tejun Heo
  2015-04-07 11:26     ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
  4 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-02 11:14 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Allow modifying the low-level unbound workqueues cpumask through
sysfs. This is performed by traversing the entire workqueue list
and calling apply_wqattrs_prepare() on the unbound workqueues
with the low level mask passed in. Only after all the preparations
are done do we commit them all together.

The ordered workqueue is excluded from the low level unbound workqueue cpumask;
it will be handled in the near future.

The per-node pwqs are mandatorily controlled by the low level cpumask, while the
default pwq falls back to the low level global cpumask when (and ONLY when) the
cpumask set by the user doesn't overlap with the low level cpumask.

The default wq_unbound_global_cpumask is still cpu_possible_mask because the
workqueue subsystem doesn't know the best default value for the runtime; the
system manager or another subsystem that has sufficient information should set
it when needed.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Original-patch-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 include/linux/workqueue.h |   1 +
 kernel/workqueue.c        | 124 ++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 115 insertions(+), 10 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index deee212..01483b3 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
+int workqueue_set_unbound_global_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f58c549..bc21fda 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_global_cpumask;
+static cpumask_var_t wq_unbound_global_cpumask; /* PL: low level cpumask for all unbound wqs */
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -3492,6 +3492,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 struct apply_wqattrs_ctx {
 	struct workqueue_struct	*wq;	/* target to be installed */
 	struct workqueue_attrs	*attrs;	/* attrs for installing */
+	struct list_head	list;	/* queued for batching commit */
 	struct pool_workqueue	*dfl_pwq;
 	struct pool_workqueue	*pwq_tbl[];
 };
@@ -3516,10 +3517,11 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
 /* Allocates the attrs and pwqs for later installment */
 static struct apply_wqattrs_ctx *
 apply_wqattrs_prepare(struct workqueue_struct *wq,
-		      const struct workqueue_attrs *attrs)
+		      const struct workqueue_attrs *attrs,
+		      cpumask_var_t unbound_cpumask)
 {
 	struct apply_wqattrs_ctx *ctx;
-	struct workqueue_attrs *new_attrs, *tmp_attrs;
+	struct workqueue_attrs *new_attrs, *pwq_attrs, *tmp_attrs;
 	int node;
 
 	lockdep_assert_held(&wq_pool_mutex);
@@ -3528,32 +3530,41 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		      GFP_KERNEL);
 
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+	pwq_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	if (!ctx || !new_attrs || !tmp_attrs)
 		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
+	copy_workqueue_attrs(pwq_attrs, attrs);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
-	 * copy of @new_attrs which will be modified and used to obtain
+	 * copy of @pwq_attrs which will be modified and used to obtain
 	 * pools.
 	 */
-	copy_workqueue_attrs(tmp_attrs, new_attrs);
+	copy_workqueue_attrs(tmp_attrs, pwq_attrs);
 
 	/*
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
+	 *
+	 * If the cpumask set by the user doesn't overlap with the global
+	 * wq_unbound_global_cpumask, we fallback to the global
+	 * wq_unbound_global_cpumask.
 	 */
-	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (unlikely(cpumask_empty(tmp_attrs->cpumask)))
+		cpumask_copy(tmp_attrs->cpumask, unbound_cpumask);
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, tmp_attrs);
 	if (!ctx->dfl_pwq)
 		goto out_free;
 
 	for_each_node(node) {
-		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+		if (wq_calc_node_cpumask(pwq_attrs, node, -1, tmp_attrs->cpumask)) {
 			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
 			if (!ctx->pwq_tbl[node])
 				goto out_free;
@@ -3566,10 +3577,12 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	ctx->wq = wq;
 	ctx->attrs = new_attrs;
 	free_workqueue_attrs(tmp_attrs);
+	free_workqueue_attrs(pwq_attrs);
 	return ctx;
 
 out_free:
 	free_workqueue_attrs(tmp_attrs);
+	free_workqueue_attrs(pwq_attrs);
 	free_workqueue_attrs(new_attrs);
 	apply_wqattrs_cleanup(ctx);
 	return NULL;
@@ -3635,7 +3648,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	get_online_cpus();
 
 	mutex_lock(&wq_pool_mutex);
-	ctx = apply_wqattrs_prepare(wq, attrs);
+	ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_global_cpumask);
 	mutex_unlock(&wq_pool_mutex);
 
 	put_online_cpus();
@@ -3709,6 +3722,9 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	 * wq's, the default pwq should be used.
 	 */
 	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+		cpumask_and(cpumask, cpumask, wq_unbound_global_cpumask);
+		if (cpumask_empty(cpumask))
+			goto use_dfl_pwq;
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
 			goto out_unlock;
 	} else {
@@ -4733,6 +4749,74 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
+static int workqueue_apply_unbound_global_cpumask(cpumask_var_t cpumask)
+{
+	LIST_HEAD(ctxs);
+	int ret = 0;
+	struct workqueue_struct *wq;
+	struct apply_wqattrs_ctx *ctx, *n;
+
+	lockdep_assert_held(&wq_pool_mutex);
+
+	list_for_each_entry(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_UNBOUND))
+			continue;
+		/* creating multiple pwqs breaks ordering guarantee */
+		if (wq->flags & __WQ_ORDERED)
+			continue;
+
+		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs,
+						     cpumask);
+		if (!ctx) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		list_add_tail(&ctx->list, &ctxs);
+	}
+
+	list_for_each_entry_safe(ctx, n, &ctxs, list) {
+		if (!ret)
+			apply_wqattrs_commit(ctx);
+		apply_wqattrs_cleanup(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ *  workqueue_set_unbound_global_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function check the @cpumask
+ *  and apply it to all unbound workqueues and updates all pwqs of them.
+ *  When all succeed, it saves @cpumask to the global low-level unbound
+ *  cpumask.
+ *
+ *  Retun:	0	- Success
+ *  		-EINVAL	- No online cpu in the @cpumask
+ *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_global_cpumask(cpumask_var_t cpumask)
+{
+	int ret = -EINVAL;
+
+	get_online_cpus();
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (!cpumask_empty(cpumask)) {
+		mutex_lock(&wq_pool_mutex);
+		ret = workqueue_apply_unbound_global_cpumask(cpumask);
+		if (ret >= 0)
+			cpumask_copy(wq_unbound_global_cpumask, cpumask);
+		mutex_unlock(&wq_pool_mutex);
+	}
+	put_online_cpus();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(workqueue_set_unbound_global_cpumask);
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -4954,14 +5038,34 @@ static ssize_t wq_unbound_global_cpumask_show(struct device *dev,
 {
 	int written;
 
+	mutex_lock(&wq_pool_mutex);
 	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
 			    cpumask_pr_args(wq_unbound_global_cpumask));
+	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
+static ssize_t wq_unbound_global_cpumask_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (!ret)
+		ret = workqueue_set_unbound_global_cpumask(cpumask);
+
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
 static struct device_attribute wq_sysfs_cpumask_attr =
-	__ATTR(cpumask, 0444, wq_unbound_global_cpumask_show, NULL);
+	__ATTR(cpumask, 0644, wq_unbound_global_cpumask_show,
+	       wq_unbound_global_cpumask_store);
 
 static int __init wq_sysfs_init(void)
 {
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/4 V6] workqueue: Reorder sysfs code
  2015-04-02 11:14     ` [PATCH 1/4 V6] workqueue: Reorder sysfs code Lai Jiangshan
@ 2015-04-06 15:22       ` Tejun Heo
  0 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2015-04-06 15:22 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Frederic Weisbecker, Christoph Lameter,
	Kevin Hilman, Mike Galbraith, Paul E. McKenney, Viresh Kumar

On Thu, Apr 02, 2015 at 07:14:39PM +0800, Lai Jiangshan wrote:
> From: Frederic Weisbecker <fweisbec@gmail.com>
> 
> The sysfs code usually belongs to the bottom of the file since it deals
> with high level objects. In the workqueue code it's misplaced, such
> that we'll need to work around function references to allow the sysfs
> code to call APIs like apply_workqueue_attrs().
> 
> Let's move that block further in the file, almost to the bottom.
> 
> And declare workqueue_sysfs_unregister() just before destroy_workqueue()
> which reference it.
> 
> Suggested-by: Tejun Heo <tj@kernel.org>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Kevin Hilman <khilman@linaro.org>
> Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
> Cc: Mike Galbraith <bitbucket@online.de>
> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Viresh Kumar <viresh.kumar@linaro.org>
> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
> Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>

Moved the forward declaration of workqueue_sysfs_unregister() where
other forward declarations are and applied to wq/for-4.1.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 2/4 V6] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-04-02 11:14     ` [PATCH 2/4 V6] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
@ 2015-04-06 15:39       ` Tejun Heo
  0 siblings, 0 replies; 48+ messages in thread
From: Tejun Heo @ 2015-04-06 15:39 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On Thu, Apr 02, 2015 at 07:14:40PM +0800, Lai Jiangshan wrote:
> The patch doesn't have functionality changed except two minor adjustment:
> 	1) free_unbound_pwq() for the error path is removed, we use the
> 	   heavier version put_pwq_unlocked() instead since the error path
> 	   is rare. this adjustment simplifies the code.
> 	2) the memory-allocation is also moved into wq_pool_mutex.
> 	   this is needed to avoid to do the further splitting.

And we're dropping online_cpus locking before applying the new pwq's.
Is that safe?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-02 11:14     ` [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-04-06 15:53       ` Tejun Heo
  2015-04-07  1:25         ` Lai Jiangshan
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2015-04-06 15:53 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On Thu, Apr 02, 2015 at 07:14:42PM +0800, Lai Jiangshan wrote:
>  	/* make a copy of @attrs and sanitize it */
>  	copy_workqueue_attrs(new_attrs, attrs);
> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
> +	copy_workqueue_attrs(pwq_attrs, attrs);
> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);

Hmmm... why do we need to keep track of both cpu_possible_mask and
unbound_cpumask?  Can't we just make unbound_cpumask replace
cpu_possible_mask for unbound workqueues?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-06 15:53       ` Tejun Heo
@ 2015-04-07  1:25         ` Lai Jiangshan
  2015-04-07  1:58           ` Tejun Heo
  0 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-07  1:25 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On 04/06/2015 11:53 PM, Tejun Heo wrote:
> On Thu, Apr 02, 2015 at 07:14:42PM +0800, Lai Jiangshan wrote:
>>  	/* make a copy of @attrs and sanitize it */
>>  	copy_workqueue_attrs(new_attrs, attrs);
>> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
>> +	copy_workqueue_attrs(pwq_attrs, attrs);
>> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
>> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
> 
> Hmmm... why do we need to keep track of both cpu_possible_mask and
> unbound_cpumask?  Can't we just make unbound_cpumask replace
> cpu_possible_mask for unbound workqueues?
> 

I want to save the original user-setting cpumask.

When any time the wq_unbound_global_cpumask is changed,
the new effective cpumask is
the-original-user-setting-cpumask & wq_unbound_global_cpumask
instead of
the-last-effective-cpumask & wq_unbound_global_cpumask.

thanks,
Lai

> Thanks.
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-07  1:25         ` Lai Jiangshan
@ 2015-04-07  1:58           ` Tejun Heo
  2015-04-07  2:33             ` Lai Jiangshan
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2015-04-07  1:58 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

Hello, Lai.

On Tue, Apr 07, 2015 at 09:25:59AM +0800, Lai Jiangshan wrote:
> On 04/06/2015 11:53 PM, Tejun Heo wrote:
> > On Thu, Apr 02, 2015 at 07:14:42PM +0800, Lai Jiangshan wrote:
> >>  	/* make a copy of @attrs and sanitize it */
> >>  	copy_workqueue_attrs(new_attrs, attrs);
> >> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
> >> +	copy_workqueue_attrs(pwq_attrs, attrs);
> >> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
> >> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
> > 
> > Hmmm... why do we need to keep track of both cpu_possible_mask and
> > unbound_cpumask?  Can't we just make unbound_cpumask replace
> > cpu_possible_mask for unbound workqueues?
> > 
> 
> I want to save the original user-setting cpumask.
> 
> When any time the wq_unbound_global_cpumask is changed,
> the new effective cpumask is
> the-original-user-setting-cpumask & wq_unbound_global_cpumask
> instead of
> the-last-effective-cpumask & wq_unbound_global_cpumask.

Yes, I get that, but that'd require just tracking the original
configured value and the unbound_cpumask masked value, no?  What am I
missing?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-07  1:58           ` Tejun Heo
@ 2015-04-07  2:33             ` Lai Jiangshan
  0 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-07  2:33 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On 04/07/2015 09:58 AM, Tejun Heo wrote:
> Hello, Lai.
> 
> On Tue, Apr 07, 2015 at 09:25:59AM +0800, Lai Jiangshan wrote:
>> On 04/06/2015 11:53 PM, Tejun Heo wrote:
>>> On Thu, Apr 02, 2015 at 07:14:42PM +0800, Lai Jiangshan wrote:
>>>>  	/* make a copy of @attrs and sanitize it */
>>>>  	copy_workqueue_attrs(new_attrs, attrs);
>>>> -	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
>>>> +	copy_workqueue_attrs(pwq_attrs, attrs);
>>>> +	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
>>>> +	cpumask_and(pwq_attrs->cpumask, pwq_attrs->cpumask, unbound_cpumask);
>>>
>>> Hmmm... why do we need to keep track of both cpu_possible_mask and
>>> unbound_cpumask?  Can't we just make unbound_cpumask replace
>>> cpu_possible_mask for unbound workqueues?
>>>
>>
>> I want to save the original user-setting cpumask.
>>
>> When any time the wq_unbound_global_cpumask is changed,
>> the new effective cpumask is
>> the-original-user-setting-cpumask & wq_unbound_global_cpumask
>> instead of
>> the-last-effective-cpumask & wq_unbound_global_cpumask.
> 
> Yes, I get that, but that'd require just tracking the original

wq->unbound_attrs (new_attrs) saves the original configured value
and is needed to be keep track of.
For sanity, it needs to be masked with cpu_possible_mask.

+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);

This code is changed back to the original code (before this patchset).

In the next iterate, I will reduce the number of the local vars to make
the code clearer.

> configured value and the unbound_cpumask masked value, no?  What am I
> missing?
> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
                       ` (3 preceding siblings ...)
  2015-04-02 11:14     ` [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-04-07 11:26     ` Lai Jiangshan
  2015-04-07 11:26       ` [PATCH 2/3 V7] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
                         ` (2 more replies)
  4 siblings, 3 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-07 11:26 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Current apply_workqueue_attrs() includes pwqs-allocation and pwqs-installation,
so when we batch multiple apply_workqueue_attrs()s as a transaction, we can't
ensure the transaction must succeed or fail as a complete unit.

To solve this, we split apply_workqueue_attrs() into three stages.
The first stage does the preparation: allocating memory and pwqs.
The second stage does the attrs-installation and pwqs-installation.
The third stage frees the allocated memory and (old or unused) pwqs.

As the result, batching multiple apply_workqueue_attrs()s can
succeed or fail as a complete unit:
	1) batch do all the first stage for all the workqueues
	2) only commit all when all the above succeed.

This patch is a preparation for the next patch ("Allow modifying low level
unbound workqueue cpumask") which will do a multiple apply_workqueue_attrs().

The patch doesn't change functionality except for two minor adjustments:
	1) free_unbound_pwq() for the error path is removed; we use the
	   heavier version put_pwq_unlocked() instead since the error path
	   is rare. This adjustment simplifies the code.
	2) the memory allocation is also moved under wq_pool_mutex.
	   This is needed to avoid further splitting.

Suggested-by: Tejun Heo <tj@kernel.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 200 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 116 insertions(+), 84 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 586ad91..b13753a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3425,17 +3425,6 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
 	return pwq;
 }
 
-/* undo alloc_unbound_pwq(), used only in the error path */
-static void free_unbound_pwq(struct pool_workqueue *pwq)
-{
-	lockdep_assert_held(&wq_pool_mutex);
-
-	if (pwq) {
-		put_unbound_pool(pwq->pool);
-		kmem_cache_free(pwq_cache, pwq);
-	}
-}
-
 /**
  * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of interest
@@ -3498,42 +3487,49 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 	return old_pwq;
 }
 
-/**
- * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
- * @wq: the target workqueue
- * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
- *
- * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
- * machines, this function maps a separate pwq to each NUMA node with
- * possibles CPUs in @attrs->cpumask so that work items are affine to the
- * NUMA node it was issued on.  Older pwqs are released as in-flight work
- * items finish.  Note that a work item which repeatedly requeues itself
- * back-to-back will stay on its current pwq.
- *
- * Performs GFP_KERNEL allocations.
- *
- * Return: 0 on success and -errno on failure.
- */
-int apply_workqueue_attrs(struct workqueue_struct *wq,
-			  const struct workqueue_attrs *attrs)
+/* Context to store the prepared attrs & pwqs before applied */
+struct apply_wqattrs_ctx {
+	struct workqueue_struct	*wq;	/* target to be applied */
+	struct workqueue_attrs	*attrs;	/* configured attrs */
+	struct pool_workqueue	*dfl_pwq;
+	struct pool_workqueue	*pwq_tbl[];
+};
+
+/* Free the resources after success or abort */
+static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
+{
+	if (ctx) {
+		int node;
+
+		/* put the pwqs */
+		for_each_node(node)
+			put_pwq_unlocked(ctx->pwq_tbl[node]);
+		put_pwq_unlocked(ctx->dfl_pwq);
+
+		free_workqueue_attrs(ctx->attrs);
+
+		kfree(ctx);
+	}
+}
+
+/* Allocates the attrs and pwqs for later installment */
+static struct apply_wqattrs_ctx *
+apply_wqattrs_prepare(struct workqueue_struct *wq,
+		      const struct workqueue_attrs *attrs)
 {
+	struct apply_wqattrs_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
-	struct pool_workqueue **pwq_tbl, *dfl_pwq;
-	int node, ret;
+	int node;
 
-	/* only unbound workqueues can change attributes */
-	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
-		return -EINVAL;
+	lockdep_assert_held(&wq_pool_mutex);
 
-	/* creating multiple pwqs breaks ordering guarantee */
-	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
-		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx) + nr_node_ids * sizeof(ctx->pwq_tbl[0]),
+		      GFP_KERNEL);
 
-	pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
 	new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
 	tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
-	if (!pwq_tbl || !new_attrs || !tmp_attrs)
-		goto enomem;
+	if (!ctx || !new_attrs || !tmp_attrs)
+		goto out_free;
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
@@ -3547,75 +3543,111 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	copy_workqueue_attrs(tmp_attrs, new_attrs);
 
 	/*
-	 * CPUs should stay stable across pwq creations and installations.
-	 * Pin CPUs, determine the target cpumask for each node and create
-	 * pwqs accordingly.
-	 */
-	get_online_cpus();
-
-	mutex_lock(&wq_pool_mutex);
-
-	/*
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
 	 */
-	dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
-	if (!dfl_pwq)
-		goto enomem_pwq;
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (!ctx->dfl_pwq)
+		goto out_free;
 
 	for_each_node(node) {
 		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
-			pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
-			if (!pwq_tbl[node])
-				goto enomem_pwq;
+			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+			if (!ctx->pwq_tbl[node])
+				goto out_free;
 		} else {
-			dfl_pwq->refcnt++;
-			pwq_tbl[node] = dfl_pwq;
+			ctx->dfl_pwq->refcnt++;
+			ctx->pwq_tbl[node] = ctx->dfl_pwq;
 		}
 	}
 
-	mutex_unlock(&wq_pool_mutex);
+	ctx->attrs = new_attrs;
+	ctx->wq = wq;
+	free_workqueue_attrs(tmp_attrs);
+	return ctx;
+
+out_free:
+	free_workqueue_attrs(tmp_attrs);
+	free_workqueue_attrs(new_attrs);
+	apply_wqattrs_cleanup(ctx);
+	return NULL;
+}
+
+/* Set the unbound_attr and install the prepared pwqs. Should not fail */
+static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
+{
+	int node;
 
 	/* all pwqs have been created successfully, let's install'em */
-	mutex_lock(&wq->mutex);
+	mutex_lock(&ctx->wq->mutex);
 
-	copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+	copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs);
 
 	/* save the previous pwq and install the new one */
 	for_each_node(node)
-		pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+		ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node,
+							  ctx->pwq_tbl[node]);
 
 	/* @dfl_pwq might not have been used, ensure it's linked */
-	link_pwq(dfl_pwq);
-	swap(wq->dfl_pwq, dfl_pwq);
+	link_pwq(ctx->dfl_pwq);
+	swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
-	mutex_unlock(&wq->mutex);
+	mutex_unlock(&ctx->wq->mutex);
+}
 
-	/* put the old pwqs */
-	for_each_node(node)
-		put_pwq_unlocked(pwq_tbl[node]);
-	put_pwq_unlocked(dfl_pwq);
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq.  Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on.  Older pwqs are released as in-flight work
+ * items finish.  Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations.
+ *
+ * Return: 0 on success and -errno on failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+			  const struct workqueue_attrs *attrs)
+{
+	struct apply_wqattrs_ctx *ctx;
+	int ret = -ENOMEM;
 
-	put_online_cpus();
-	ret = 0;
-	/* fall through */
-out_free:
-	free_workqueue_attrs(tmp_attrs);
-	free_workqueue_attrs(new_attrs);
-	kfree(pwq_tbl);
-	return ret;
+	/* only unbound workqueues can change attributes */
+	if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+		return -EINVAL;
 
-enomem_pwq:
-	free_unbound_pwq(dfl_pwq);
-	for_each_node(node)
-		if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
-			free_unbound_pwq(pwq_tbl[node]);
+	/* creating multiple pwqs breaks ordering guarantee */
+	if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+		return -EINVAL;
+
+	/*
+	 * CPUs should stay stable across pwq creations and installations.
+	 * Pin CPUs, determine the target cpumask for each node and create
+	 * pwqs accordingly.
+	 */
+	get_online_cpus();
+
+	mutex_lock(&wq_pool_mutex);
+	ctx = apply_wqattrs_prepare(wq, attrs);
 	mutex_unlock(&wq_pool_mutex);
+
+	/* the ctx has been prepared successfully, let's commit it */
+	if (ctx) {
+		apply_wqattrs_commit(ctx);
+		ret = 0;
+	}
+
 	put_online_cpus();
-enomem:
-	ret = -ENOMEM;
-	goto out_free;
+
+	apply_wqattrs_cleanup(ctx);
+
+	return ret;
 }
 
 /**
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 2/3 V7] workqueue: Create low-level unbound workqueues cpumask
  2015-04-07 11:26     ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
@ 2015-04-07 11:26       ` Lai Jiangshan
  2015-04-07 11:26       ` [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
  2015-04-17 14:57       ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Tejun Heo
  2 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-07 11:26 UTC (permalink / raw)
  To: linux-kernel
  Cc: Frederic Weisbecker, Christoph Lameter, Kevin Hilman,
	Lai Jiangshan, Mike Galbraith, Paul E. McKenney, Tejun Heo,
	Viresh Kumar

From: Frederic Weisbecker <fweisbec@gmail.com>

Create a cpumask that limit the affinity of all unbound workqueues.
This cpumask is controlled though a file at the root of the workqueue
sysfs directory.

It works on a lower-level than the per WQ_SYSFS workqueues cpumask files
such that the effective cpumask applied for a given unbound workqueue is
the intersection of /sys/devices/virtual/workqueue/$WORKQUEUE/cpumask and
the new /sys/devices/virtual/workqueue/cpumask file.

This patch implements the basic infrastructure and the read interface.
wq_unbound_global_cpumask is initially set to cpu_possible_mask.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 kernel/workqueue.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b13753a..cbccf5d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,6 +299,8 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
+static cpumask_var_t wq_unbound_global_cpumask;
+
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
 				     cpu_worker_pools);
@@ -3533,7 +3535,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -4946,9 +4948,29 @@ static struct bus_type wq_subsys = {
 	.dev_groups			= wq_sysfs_groups,
 };
 
+static ssize_t wq_unbound_global_cpumask_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int written;
+
+	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
+			    cpumask_pr_args(wq_unbound_global_cpumask));
+
+	return written;
+}
+
+static struct device_attribute wq_sysfs_cpumask_attr =
+	__ATTR(cpumask, 0444, wq_unbound_global_cpumask_show, NULL);
+
 static int __init wq_sysfs_init(void)
 {
-	return subsys_virtual_register(&wq_subsys, NULL);
+	int err;
+
+	err = subsys_virtual_register(&wq_subsys, NULL);
+	if (err)
+		return err;
+
+	return device_create_file(wq_subsys.dev_root, &wq_sysfs_cpumask_attr);
 }
 core_initcall(wq_sysfs_init);
 
@@ -5096,6 +5118,9 @@ static int __init init_workqueues(void)
 
 	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
+	BUG_ON(!alloc_cpumask_var(&wq_unbound_global_cpumask, GFP_KERNEL));
+	cpumask_copy(wq_unbound_global_cpumask, cpu_possible_mask);
+
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
 
 	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-07 11:26     ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
  2015-04-07 11:26       ` [PATCH 2/3 V7] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
@ 2015-04-07 11:26       ` Lai Jiangshan
  2015-04-22 19:39         ` Tejun Heo
  2015-04-17 14:57       ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Tejun Heo
  2 siblings, 1 reply; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-07 11:26 UTC (permalink / raw)
  To: linux-kernel
  Cc: Lai Jiangshan, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Tejun Heo, Viresh Kumar, Frederic Weisbecker

Allow to modify the low-level unbound workqueues cpumask through
sysfs. This is performed by traversing the entire workqueue list
and calling apply_wqattrs_prepare() on the unbound workqueues
with the low level mask passed in. Only after all the preparations are done
do we commit them all together.

The ordered workqueue is excluded from the low level unbound workqueue cpumask;
it will be handled in the near future.

The per-nodes' pwqs are mandatorily controlled by the low level cpumask, while
the default pwq fallback to the low level global cpumask when (and ONLY when) the
cpumask set by the user doesn't overlap with the low level cpumask.

The default wq_unbound_global_cpumask is still cpu_possible_mask because the
workqueue subsystem doesn't know the best default value for the runtime; the
system manager or another subsystem which has the sufficient information should
set it when needed.

Cc: Christoph Lameter <cl@linux.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Original-patch-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
---
 include/linux/workqueue.h |   1 +
 kernel/workqueue.c        | 122 +++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 116 insertions(+), 7 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index deee212..01483b3 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -424,6 +424,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask);
 void free_workqueue_attrs(struct workqueue_attrs *attrs);
 int apply_workqueue_attrs(struct workqueue_struct *wq,
 			  const struct workqueue_attrs *attrs);
+int workqueue_set_unbound_global_cpumask(cpumask_var_t cpumask);
 
 extern bool queue_work_on(int cpu, struct workqueue_struct *wq,
 			struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index cbccf5d..557612e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
 static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
 static bool workqueue_freezing;		/* PL: have wqs started freezing? */
 
-static cpumask_var_t wq_unbound_global_cpumask;
+static cpumask_var_t wq_unbound_global_cpumask; /* PL: low level cpumask for all unbound wqs */
 
 /* the per-cpu worker pools */
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -3493,6 +3493,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
 struct apply_wqattrs_ctx {
 	struct workqueue_struct	*wq;	/* target to be applied */
 	struct workqueue_attrs	*attrs;	/* configured attrs */
+	struct list_head	list;	/* queued for batching commit */
 	struct pool_workqueue	*dfl_pwq;
 	struct pool_workqueue	*pwq_tbl[];
 };
@@ -3517,7 +3518,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
 /* Allocates the attrs and pwqs for later installment */
 static struct apply_wqattrs_ctx *
 apply_wqattrs_prepare(struct workqueue_struct *wq,
-		      const struct workqueue_attrs *attrs)
+		      const struct workqueue_attrs *attrs,
+		      cpumask_var_t unbound_cpumask)
 {
 	struct apply_wqattrs_ctx *ctx;
 	struct workqueue_attrs *new_attrs, *tmp_attrs;
@@ -3535,7 +3537,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 
 	/* make a copy of @attrs and sanitize it */
 	copy_workqueue_attrs(new_attrs, attrs);
-	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_global_cpumask);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, unbound_cpumask);
 
 	/*
 	 * We may create multiple pwqs with differing cpumasks.  Make a
@@ -3548,13 +3550,18 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 	 * If something goes wrong during CPU up/down, we'll fall back to
 	 * the default pwq covering whole @attrs->cpumask.  Always create
 	 * it even if we don't use it immediately.
+	 *
+	 * If the cpumask set by the user doesn't overlap with the
+	 * unbound_cpumask, we fallback to the unbound_cpumask.
 	 */
-	ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+	if (unlikely(cpumask_empty(tmp_attrs->cpumask)))
+		cpumask_copy(tmp_attrs->cpumask, unbound_cpumask);
+	ctx->dfl_pwq = alloc_unbound_pwq(wq, tmp_attrs);
 	if (!ctx->dfl_pwq)
 		goto out_free;
 
 	for_each_node(node) {
-		if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+		if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
 			ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
 			if (!ctx->pwq_tbl[node])
 				goto out_free;
@@ -3564,7 +3571,11 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
 		}
 	}
 
+	/* save the user configured attrs */
+	copy_workqueue_attrs(new_attrs, attrs);
+	cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
 	ctx->attrs = new_attrs;
+
 	ctx->wq = wq;
 	free_workqueue_attrs(tmp_attrs);
 	return ctx;
@@ -3636,7 +3647,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
 	get_online_cpus();
 
 	mutex_lock(&wq_pool_mutex);
-	ctx = apply_wqattrs_prepare(wq, attrs);
+	ctx = apply_wqattrs_prepare(wq, attrs, wq_unbound_global_cpumask);
 	mutex_unlock(&wq_pool_mutex);
 
 	/* the ctx has been prepared successfully, let's commit it */
@@ -3710,6 +3721,14 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 	 * wq's, the default pwq should be used.
 	 */
 	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+		/*
+		 * wq->unbound_attrs is the user configured attrs whose
+		 * cpumask is not masked with wq_unbound_global_cpumask,
+		 * so we make complete it.
+		 */
+		cpumask_and(cpumask, cpumask, wq_unbound_global_cpumask);
+		if (cpumask_empty(cpumask))
+			goto use_dfl_pwq;
 		if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
 			goto out_unlock;
 	} else {
@@ -4732,6 +4751,75 @@ out_unlock:
 }
 #endif /* CONFIG_FREEZER */
 
+static int workqueue_apply_unbound_global_cpumask(cpumask_var_t cpumask)
+{
+	LIST_HEAD(ctxs);
+	int ret = 0;
+	struct workqueue_struct *wq;
+	struct apply_wqattrs_ctx *ctx, *n;
+
+	lockdep_assert_held(&wq_pool_mutex);
+
+	list_for_each_entry(wq, &workqueues, list) {
+		if (!(wq->flags & WQ_UNBOUND))
+			continue;
+		/* creating multiple pwqs breaks ordering guarantee */
+		if (wq->flags & __WQ_ORDERED)
+			continue;
+
+		ctx = apply_wqattrs_prepare(wq, wq->unbound_attrs,
+						     cpumask);
+		if (!ctx) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		list_add_tail(&ctx->list, &ctxs);
+	}
+
+	list_for_each_entry_safe(ctx, n, &ctxs, list) {
+		list_del(&ctx->list);
+		if (!ret)
+			apply_wqattrs_commit(ctx);
+		apply_wqattrs_cleanup(ctx);
+	}
+
+	return ret;
+}
+
+/**
+ *  workqueue_set_unbound_global_cpumask - Set the low-level unbound cpumask
+ *  @cpumask: the cpumask to set
+ *
+ *  The low-level workqueues cpumask is a global cpumask that limits
+ *  the affinity of all unbound workqueues.  This function check the @cpumask
+ *  and apply it to all unbound workqueues and updates all pwqs of them.
+ *  When all succeed, it saves @cpumask to the global low-level unbound
+ *  cpumask.
+ *
+ *  Retun:	0	- Success
+ *  		-EINVAL	- No online cpu in the @cpumask
+ *  		-ENOMEM	- Failed to allocate memory for attrs or pwqs.
+ */
+int workqueue_set_unbound_global_cpumask(cpumask_var_t cpumask)
+{
+	int ret = -EINVAL;
+
+	get_online_cpus();
+	cpumask_and(cpumask, cpumask, cpu_possible_mask);
+	if (!cpumask_empty(cpumask)) {
+		mutex_lock(&wq_pool_mutex);
+		ret = workqueue_apply_unbound_global_cpumask(cpumask);
+		if (ret >= 0)
+			cpumask_copy(wq_unbound_global_cpumask, cpumask);
+		mutex_unlock(&wq_pool_mutex);
+	}
+	put_online_cpus();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(workqueue_set_unbound_global_cpumask);
+
 #ifdef CONFIG_SYSFS
 /*
  * Workqueues with WQ_SYSFS flag set is visible to userland via
@@ -4953,14 +5041,34 @@ static ssize_t wq_unbound_global_cpumask_show(struct device *dev,
 {
 	int written;
 
+	mutex_lock(&wq_pool_mutex);
 	written = scnprintf(buf, PAGE_SIZE, "%*pb\n",
 			    cpumask_pr_args(wq_unbound_global_cpumask));
+	mutex_unlock(&wq_pool_mutex);
 
 	return written;
 }
 
+static ssize_t wq_unbound_global_cpumask_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	cpumask_var_t cpumask;
+	int ret;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = cpumask_parse(buf, cpumask);
+	if (!ret)
+		ret = workqueue_set_unbound_global_cpumask(cpumask);
+
+	free_cpumask_var(cpumask);
+	return ret ? ret : count;
+}
+
 static struct device_attribute wq_sysfs_cpumask_attr =
-	__ATTR(cpumask, 0444, wq_unbound_global_cpumask_show, NULL);
+	__ATTR(cpumask, 0644, wq_unbound_global_cpumask_show,
+	       wq_unbound_global_cpumask_store);
 
 static int __init wq_sysfs_init(void)
 {
-- 
2.1.0


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-04-07 11:26     ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
  2015-04-07 11:26       ` [PATCH 2/3 V7] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
  2015-04-07 11:26       ` [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-04-17 14:57       ` Tejun Heo
  2015-04-20  3:21         ` Lai Jiangshan
  2 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2015-04-17 14:57 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On Tue, Apr 07, 2015 at 07:26:35PM +0800, Lai Jiangshan wrote:
> Current apply_workqueue_attrs() includes pwqs-allocation and pwqs-installation,
> so when we batch multiple apply_workqueue_attrs()s as a transaction, we can't
> ensure the transaction must succeed or fail as a complete unit.

Lai, can you please

* Break out threads when posting new version.

* List the changes since the last version?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages
  2015-04-17 14:57       ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Tejun Heo
@ 2015-04-20  3:21         ` Lai Jiangshan
  0 siblings, 0 replies; 48+ messages in thread
From: Lai Jiangshan @ 2015-04-20  3:21 UTC (permalink / raw)
  To: Tejun Heo
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

On 04/17/2015 10:57 PM, Tejun Heo wrote:
> On Tue, Apr 07, 2015 at 07:26:35PM +0800, Lai Jiangshan wrote:
>> Current apply_workqueue_attrs() includes pwqs-allocation and pwqs-installation,
>> so when we batch multiple apply_workqueue_attrs()s as a transaction, we can't
>> ensure the transaction must succeed or fail as a complete unit.
> 
> Lai, can you please
> 
> * Break out threads when posting new version.
> 
> * List the changes since the last version?

My bad.

I thought you had only two comments, which were handled in v7.
With my laziness, I considered that too little to be worth restating.

---

In [2/4 V6]: 
```quote from TJ:
And we're dropping online_cpus locking before applying the new pwq's.
Is that safe?
```

It was my fault; I don't remember when I wrongly moved the code.
In that patch I had tried my best to keep the functionality unchanged.
It is fixed in [1/3 V7].

----

The changes from [4/4 V6] to [3/3 V7]:
ctx->attrs (the original configured value) is not calculated until
it is saved into ctx->attrs, and a corresponding local-var is killed.

Could you please consider it as [0/3 V7] this time?

Thx
Lai.

> 
> Thanks.
> 


^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-07 11:26       ` [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
@ 2015-04-22 19:39         ` Tejun Heo
  2015-04-22 23:02           ` Frederic Weisbecker
  0 siblings, 1 reply; 48+ messages in thread
From: Tejun Heo @ 2015-04-22 19:39 UTC (permalink / raw)
  To: Lai Jiangshan
  Cc: linux-kernel, Christoph Lameter, Kevin Hilman, Mike Galbraith,
	Paul E. McKenney, Viresh Kumar, Frederic Weisbecker

Hello,

Generally looks good to me.  Some minor things below.

On Tue, Apr 07, 2015 at 07:26:37PM +0800, Lai Jiangshan wrote:
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index cbccf5d..557612e 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
>  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
>  static bool workqueue_freezing;		/* PL: have wqs started freezing? */
>  
> -static cpumask_var_t wq_unbound_global_cpumask;
> +static cpumask_var_t wq_unbound_global_cpumask; /* PL: low level cpumask for all unbound wqs */

Are we set on this variable name?  What would we lose by naming it
wq_unbound_cpumask or wq_cpu_possible_mask?

> @@ -3493,6 +3493,7 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
>  struct apply_wqattrs_ctx {
>  	struct workqueue_struct	*wq;	/* target to be applied */
>  	struct workqueue_attrs	*attrs;	/* configured attrs */
> +	struct list_head	list;	/* queued for batching commit */
                                                      batch commit
>  	struct pool_workqueue	*dfl_pwq;
>  	struct pool_workqueue	*pwq_tbl[];
>  };
> @@ -3517,7 +3518,8 @@ static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx)
>  /* Allocates the attrs and pwqs for later installment */
>  static struct apply_wqattrs_ctx *
>  apply_wqattrs_prepare(struct workqueue_struct *wq,
> -		      const struct workqueue_attrs *attrs)
> +		      const struct workqueue_attrs *attrs,
> +		      cpumask_var_t unbound_cpumask)

Why do we need this tho?  The global mask is protected by pool mutex,
right?  The update function can set it to the new value and just call
update and revert on failure.

> @@ -3710,6 +3721,14 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
>  	 * wq's, the default pwq should be used.
>  	 */
>  	if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
> +		/*
> +		 * wq->unbound_attrs is the user configured attrs whose
> +		 * cpumask is not masked with wq_unbound_global_cpumask,
> +		 * so we need to apply that mask here.
> +		 */
> +		cpumask_and(cpumask, cpumask, wq_unbound_global_cpumask);
> +		if (cpumask_empty(cpumask))
> +			goto use_dfl_pwq;

Wouldn't it be better to apply the global cpumask before calling
wq_calc_node_cpumask()?  Or just move it inside wq_calc_node_cpumask?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-22 19:39         ` Tejun Heo
@ 2015-04-22 23:02           ` Frederic Weisbecker
  2015-04-23  6:29             ` Mike Galbraith
  0 siblings, 1 reply; 48+ messages in thread
From: Frederic Weisbecker @ 2015-04-22 23:02 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Lai Jiangshan, linux-kernel, Christoph Lameter, Kevin Hilman,
	Mike Galbraith, Paul E. McKenney, Viresh Kumar

On Wed, Apr 22, 2015 at 03:39:35PM -0400, Tejun Heo wrote:
> Hello,
> 
> Generally looks good to me.  Some minor things below.
> 
> On Tue, Apr 07, 2015 at 07:26:37PM +0800, Lai Jiangshan wrote:
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index cbccf5d..557612e 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -299,7 +299,7 @@ static DEFINE_SPINLOCK(wq_mayday_lock);	/* protects wq->maydays list */
> >  static LIST_HEAD(workqueues);		/* PR: list of all workqueues */
> >  static bool workqueue_freezing;		/* PL: have wqs started freezing? */
> >  
> > -static cpumask_var_t wq_unbound_global_cpumask;
> > +static cpumask_var_t wq_unbound_global_cpumask; /* PL: low level cpumask for all unbound wqs */
> 
> Are we set on this variable name?  What would we lose by naming it
> wq_unbound_cpumask or wq_cpu_possible_mask?

I like wq_unbound_cpumask personally. In fact I like to have "unbound"
inside to express what's concerned here. I like wq_cpu_possible_mask too
but unfortunately it suggests it's about all workqueues (including per cpu
ones) while it's not.

^ permalink raw reply	[flat|nested] 48+ messages in thread

* Re: [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask
  2015-04-22 23:02           ` Frederic Weisbecker
@ 2015-04-23  6:29             ` Mike Galbraith
  0 siblings, 0 replies; 48+ messages in thread
From: Mike Galbraith @ 2015-04-23  6:29 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Tejun Heo, Lai Jiangshan, linux-kernel, Christoph Lameter,
	Kevin Hilman, Mike Galbraith, Paul E. McKenney, Viresh Kumar


FWIW on the testing side, I'm running these in 3.12(ish), 4.0 and 4.1 
rt trees with NOHZ_FULL, and have yet to meet a problem.

        -Mike

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2015-04-23  6:29 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-03-12  5:00 [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Lai Jiangshan
2015-03-12  5:00 ` [PATCH 1/4] workqueue: Reorder sysfs code Lai Jiangshan
2015-03-12  5:00 ` [PATCH 2/4] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
2015-03-12  5:00 ` [PATCH 3/4] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
2015-03-12 17:33   ` Christoph Lameter
2015-03-13 23:49   ` Kevin Hilman
2015-03-14  0:52     ` Kevin Hilman
2015-03-14  7:52     ` Lai Jiangshan
2015-03-16 17:12       ` Kevin Hilman
2015-03-16 17:25         ` Frederic Weisbecker
2015-03-16 19:38           ` Kevin Hilman
2015-03-12  5:00 ` [PATCH 4/4] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
2015-03-12 17:42   ` Christoph Lameter
2015-03-13  1:38     ` Lai Jiangshan
2015-03-13  7:49   ` Lai Jiangshan
2015-03-12 17:49 ` [PATCH 0/4] workqueue: Introduce low-level unbound wq sysfs cpumask v4 Frederic Weisbecker
2015-03-13  6:02 ` Mike Galbraith
2015-03-18  4:40 ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Lai Jiangshan
2015-03-18  4:40   ` [PATCH 1/4 V5] workqueue: Reorder sysfs code Lai Jiangshan
2015-03-24 15:41     ` Tejun Heo
2015-03-18  4:40   ` [PATCH 2/4 V5] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
2015-03-24 15:55     ` Tejun Heo
2015-03-18  4:40   ` [PATCH 3/4 V5] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
2015-03-18  4:40   ` [PATCH 4/4 V5] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
2015-03-24 17:31     ` Tejun Heo
2015-03-31  7:46       ` Lai Jiangshan
2015-04-01  8:33         ` Lai Jiangshan
2015-04-01 15:52           ` Tejun Heo
2015-03-19  8:54   ` [PATCH 0/4 V5] workqueue: Introduce low-level unbound wq sysfs cpumask v5 Mike Galbraith
2015-04-02 11:14   ` [PATCH 0/4 V6] " Lai Jiangshan
2015-04-02 11:14     ` [PATCH 1/4 V6] workqueue: Reorder sysfs code Lai Jiangshan
2015-04-06 15:22       ` Tejun Heo
2015-04-02 11:14     ` [PATCH 2/4 V6] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
2015-04-06 15:39       ` Tejun Heo
2015-04-02 11:14     ` [PATCH 3/4 V6] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
2015-04-02 11:14     ` [PATCH 4/4 V6] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
2015-04-06 15:53       ` Tejun Heo
2015-04-07  1:25         ` Lai Jiangshan
2015-04-07  1:58           ` Tejun Heo
2015-04-07  2:33             ` Lai Jiangshan
2015-04-07 11:26     ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Lai Jiangshan
2015-04-07 11:26       ` [PATCH 2/3 V7] workqueue: Create low-level unbound workqueues cpumask Lai Jiangshan
2015-04-07 11:26       ` [PATCH 3/3 V7] workqueue: Allow modifying low level unbound workqueue cpumask Lai Jiangshan
2015-04-22 19:39         ` Tejun Heo
2015-04-22 23:02           ` Frederic Weisbecker
2015-04-23  6:29             ` Mike Galbraith
2015-04-17 14:57       ` [PATCH 1/3 V7] workqueue: split apply_workqueue_attrs() into 3 stages Tejun Heo
2015-04-20  3:21         ` Lai Jiangshan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).