LKML Archive on lore.kernel.org
* [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
@ 2007-03-20 19:34 Cliff Wickman
2007-03-20 22:12 ` Randy Dunlap
0 siblings, 1 reply; 4+ messages in thread
From: Cliff Wickman @ 2007-03-20 19:34 UTC (permalink / raw)
To: linux-kernel
From: Cliff Wickman <cpw@sgi.com>
This patch corrects a situation that occurs when one disables all the cpus
in a cpuset.
At that point, any tasks in that cpuset are incorrectly moved (as I recall,
they were moved to a sibling cpuset).
Such tasks should be moved to the parent of their current cpuset, or if the
parent cpuset has no cpus, to its parent, and so on.
And the empty cpuset should be removed (if it is flagged notify_on_release).
This patch contains the added complexity of taking care not to do memory
allocation while holding the cpuset's callback_mutex, and it makes use of the
"cpuset_release_agent" to do the cpuset removals.
It might be simpler to use a separate thread or workqueue. But such code
has not yet been written.
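For reference, the heart of the fix is a walk up the hierarchy to the nearest
ancestor that still has cpus; a rough sketch of that logic (the helper name
here is illustrative -- in the patch it is open-coded inside
remove_tasks_in_empty_cpusets_in_subtree()):

	static struct cpuset *nearest_nonempty_ancestor(struct cpuset *cs)
	{
		struct cpuset *parent = cs->parent;

		/* the top cpuset tracks cpu_online_map, so it cannot be empty */
		while (parent && cpus_empty(parent->cpus_allowed))
			parent = parent->parent;
		return parent;
	}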
Diffed against 2.6.20-rc6
Signed-off-by: Cliff Wickman <cpw@sgi.com>
---
kernel/cpuset.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 180 insertions(+), 20 deletions(-)
Index: morton.070205/kernel/cpuset.c
===================================================================
--- morton.070205.orig/kernel/cpuset.c
+++ morton.070205/kernel/cpuset.c
@@ -112,6 +112,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
+struct path_list_element {
+ struct list_head list;
+ struct cpuset *cs;
+ char *path;
+};
+
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
@@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus
* the time manage_mutex is held.
*/
-static void cpuset_release_agent(const char *pathbuf)
+static void cpuset_release_agent(const char *pathbuf, int releasepath)
{
char *argv[3], *envp[3];
int i;
@@ -518,7 +524,8 @@ static void cpuset_release_agent(const c
envp[i] = NULL;
call_usermodehelper(argv[0], argv, envp, 0);
- kfree(pathbuf);
+ if (releasepath)
+ kfree(pathbuf);
}
/*
@@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write(
retval = nbytes;
out2:
mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ cpuset_release_agent(pathbuf, 1);
out1:
kfree(buffer);
return retval;
@@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ cpuset_release_agent(pathbuf, 1);
return 0;
}
@@ -2053,13 +2060,33 @@ out:
}
/*
+ * move every task that is a member of cpuset "from" to cpuset "to"
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+ int moved=0;
+ struct task_struct *g, *tsk;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, tsk) {
+ if (tsk->cpuset == from) {
+ moved++;
+ task_lock(tsk);
+ tsk->cpuset = to;
+ task_unlock(tsk);
+ }
+ } while_each_thread(g, tsk);
+ read_unlock(&tasklist_lock);
+ atomic_add(moved, &to->count);
+ atomic_set(&from->count, 0);
+}
+
+/*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
+ * last CPU or node from a cpuset, then move the tasks in the empty cpuset
+ * to its next-highest non-empty parent. And remove the empty cpuset.
*
* This routine is intentionally inefficient in a couple of regards.
* It will check all cpusets in a subtree even if the top cpuset of
@@ -2070,20 +2097,100 @@ out:
*
* Call with both manage_mutex and callback_mutex held.
*
+ * Takes tasklist_lock, and task_lock() for cpuset members that are
+ * moved to another cpuset.
+ *
* Recursive, on depth of cpuset subtree.
*/
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+static void remove_tasks_in_empty_cpusets_in_subtree(const struct cpuset *cur, struct list_head *empty_list, struct path_list_element **ple_array, int *ple_availp, int ple_count)
+{
+ int npids, ple_used=0;
+ struct cpuset *c, *parent;
+ struct path_list_element *ple;
+
+ /* If a cpuset's mems or cpus are empty, move its tasks to its parent */
+ list_for_each_entry(c, &cur->children, sibling) {
+ remove_tasks_in_empty_cpusets_in_subtree(c, empty_list,
+ ple_array, ple_availp, ple_count);
+ /*
+ * If it has no online cpus or no online mems, move its tasks
+ * to its next-highest non-empty parent and remove it.
+ * Remove it even if it has children, as its children are a
+ * subset of cpus and nodes, so they are empty too.
+ * The removal is conditional on whether it is
+ * notify-on-release.
+ */
+ if (cpus_empty(c->cpus_allowed) ||
+ nodes_empty(c->mems_allowed)) {
+ char *path = NULL;
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = c->parent;
+ while (parent && cpus_empty(parent->cpus_allowed))
+ parent = parent->parent;
+ npids = atomic_read(&c->count);
+ /* c->count is the number of tasks using the cpuset */
+ if (npids)
+ /* move member tasks to the parent cpuset */
+ move_member_tasks_to_cpuset(c, parent);
+
+ /*
+ * sanity check that we're not over-running
+ * the array
+ */
+ if (++ple_used > ple_count)
+ return;
+ ple = ple_array[(*ple_availp)++];
+ path = (char *)ple + sizeof(struct path_list_element);
+ if (cpuset_path(c, path,
+ PAGE_SIZE-sizeof(struct path_list_element)) < 0)
+ path = NULL;
+ if (path != NULL) {
+ /*
+ * add path to list of cpusets to remove
+ * (list includes cpusets that are not
+ * notify-on-release)
+ */
+ ple->path = path;
+ ple->cs = c;
+ /*
+ * since we're walking "up" the tree, list
+ * any empty cpusets we find on the tail of
+ * the list (later==higher; start with lower)
+ */
+ list_add_tail(&ple->list, empty_list);
+ }
+ }
+ }
+}
+
+/*
+ * Walk the specified cpuset subtree and remove any offline cpus from
+ * each cpuset.
+ *
+ * Count the number of empty cpusets.
+ *
+ * Call with both manage_mutex and callback_mutex held so
+ * that this function can modify cpus_allowed and mems_allowed.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void remove_offlines_count_emptys(const struct cpuset *cur, int *count)
{
struct cpuset *c;
- /* Each of our child cpusets mems must be online */
list_for_each_entry(c, &cur->children, sibling) {
- guarantee_online_cpus_mems_in_subtree(c);
- if (!cpus_empty(c->cpus_allowed))
- guarantee_online_cpus(c, &c->cpus_allowed);
- if (!nodes_empty(c->mems_allowed))
- guarantee_online_mems(c, &c->mems_allowed);
+ remove_offlines_count_emptys(c, count);
+ /* Remove offline cpus and mems from this cpuset. */
+ cpus_and(c->cpus_allowed, c->cpus_allowed, cpu_online_map);
+ nodes_and(c->mems_allowed, c->mems_allowed, node_online_map);
+ if (cpus_empty(c->cpus_allowed) ||
+ nodes_empty(c->mems_allowed))
+ (*count)++;
}
}
@@ -2095,7 +2202,7 @@ static void guarantee_online_cpus_mems_i
* To ensure that we don't remove a CPU or node from the top cpuset
* that is currently in use by a child cpuset (which would violate
* the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * call the recursive routine remove_tasks_in_empty_cpusets_in_subtree().
*
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
@@ -2105,15 +2212,68 @@ static void guarantee_online_cpus_mems_i
static void common_cpu_mem_hotplug_unplug(void)
{
+ int i, empty_count=0, ple_avail=0;
+ struct list_head empty_cpuset_list;
+ struct path_list_element *ple, **ple_array=NULL;
+
mutex_lock(&manage_mutex);
- mutex_lock(&callback_mutex);
- guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+ mutex_lock(&callback_mutex);
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;
-
+ remove_offlines_count_emptys(&top_cpuset, &empty_count);
mutex_unlock(&callback_mutex);
+
+ if (empty_count) {
+ /*
+ * allocate the control structures needed for a list of
+ * cpuset paths to free. (allocation must be done without
+ * holding callback_mutex)
+ */
+ ple_array = (struct path_list_element **)kmalloc
+ (empty_count*sizeof(struct empty_cpuset_list *), GFP_KERNEL);
+ if (!ple_array)
+ return;
+ for (i=0; i<empty_count; i++) {
+ ple = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ /*
+ * the space for the path itself immediately follows
+ * the path_list_element structure (we have a full page)
+ */
+ if (!ple)
+ return;
+ ple_array[i]= ple;
+ }
+ INIT_LIST_HEAD(&empty_cpuset_list);
+
+ mutex_lock(&callback_mutex);
+ remove_tasks_in_empty_cpusets_in_subtree(&top_cpuset,
+ &empty_cpuset_list, ple_array, &ple_avail, empty_count);
+ mutex_unlock(&callback_mutex);
+ }
+
mutex_unlock(&manage_mutex);
+
+ if (empty_count) {
+ /*
+ * Free each cpuset on the list.
+ * (but only if it is notify-on-release)
+ */
+ list_for_each_entry(ple, &empty_cpuset_list, list) {
+ if (notify_on_release(ple->cs))
+ cpuset_release_agent(ple->path, 0);
+ /*
+ * 0: don't ask cpuset_release_agent to
+ * release the path
+ */
+ }
+ /* remove the control structures */
+ for (i=0; i<empty_count; i++) {
+ kfree(ple_array[i]);
+ }
+ kfree(ple_array);
+ }
+ return;
}
/*
@@ -2259,7 +2419,7 @@ void cpuset_exit(struct task_struct *tsk
if (atomic_dec_and_test(&cs->count))
check_for_release(cs, &pathbuf);
mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ cpuset_release_agent(pathbuf, 1);
} else {
atomic_dec(&cs->count);
}
* Re: [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
2007-03-20 19:34 [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent Cliff Wickman
@ 2007-03-20 22:12 ` Randy Dunlap
0 siblings, 0 replies; 4+ messages in thread
From: Randy Dunlap @ 2007-03-20 22:12 UTC (permalink / raw)
To: Cliff Wickman; +Cc: linux-kernel
On Tue, 20 Mar 2007 13:34:01 -0600 Cliff Wickman wrote:
>
> From: Cliff Wickman <cpw@sgi.com>
>
> This patch corrects a situation that occurs when one disables all the cpus
> in a cpuset.
>
> At that point, any tasks in that cpuset are incorrectly moved (as I recall,
> they were moved to a sibling cpuset).
> Such tasks should be moved to the parent of their current cpuset, or if the
> parent cpuset has no cpus, to its parent, and so on.
>
> And the empty cpuset should be removed (if it is flagged notify_on_release).
>
> This patch contains the added complexity of taking care not to do memory
> allocation while holding the cpuset's callback_mutex, and it makes use of the
> "cpuset_release_agent" to do the cpuset removals.
>
> It might be simpler to use a separate thread or workqueue. But such code
> has not yet been written.
>
> Diffed against 2.6.20-rc6
>
> Signed-off-by: Cliff Wickman <cpw@sgi.com>
>
> ---
> kernel/cpuset.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++------
> 1 file changed, 180 insertions(+), 20 deletions(-)
>
> Index: morton.070205/kernel/cpuset.c
> ===================================================================
> --- morton.070205.orig/kernel/cpuset.c
> +++ morton.070205/kernel/cpuset.c
> @@ -2070,20 +2097,100 @@ out:
> *
> * Call with both manage_mutex and callback_mutex held.
> *
> + * Takes tasklist_lock, and task_lock() for cpuset members that are
> + * moved to another cpuset.
> + *
> * Recursive, on depth of cpuset subtree.
> */
>
> -static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
> +static void remove_tasks_in_empty_cpusets_in_subtree(const struct cpuset *cur, struct list_head *empty_list, struct path_list_element **ple_array, int *ple_availp, int ple_count)
That line is way too long. Source lines should fit in 80 columns unless
they contain (maybe) a printk string that would be ugly if split (e.g.).
This one should be like so (or some other readable variant):
static void remove_tasks_in_empty_cpusets_in_subtree(
const struct cpuset *cur,
struct list_head *empty_list,
struct path_list_element **ple_array,
int *ple_availp, int ple_count)
> +{
> + int npids, ple_used=0;
> + struct cpuset *c, *parent;
> + struct path_list_element *ple;
> +
> + /* If a cpuset's mems or cpus are empty, move its tasks to its parent */
> + list_for_each_entry(c, &cur->children, sibling) {
> + remove_tasks_in_empty_cpusets_in_subtree(c, empty_list,
> + ple_array, ple_availp, ple_count);
> + /*
> + * If it has no online cpus or no online mems, move its tasks
> + * to its next-highest non-empty parent and remove it.
> + * Remove it even if it has children, as its children are a
> + * subset of cpus and nodes, so they are empty too.
> + * The removal is conditional on whether it is
> + * notify-on-release.
> + */
> + if (cpus_empty(c->cpus_allowed) ||
> + nodes_empty(c->mems_allowed)) {
> + char *path = NULL;
> + /*
> + * Find its next-highest non-empty parent, (top cpuset
> + * has online cpus, so can't be empty).
> + */
> + parent = c->parent;
> + while (parent && cpus_empty(parent->cpus_allowed))
> + parent = parent->parent;
> + npids = atomic_read(&c->count);
> + /* c->count is the number of tasks using the cpuset */
> + if (npids)
> + /* move member tasks to the parent cpuset */
> + move_member_tasks_to_cpuset(c, parent);
> +
> + /*
> + * sanity check that we're not over-running
> + * the array
> + */
> + if (++ple_used > ple_count)
> + return;
> + ple = ple_array[(*ple_availp)++];
> + path = (char *)ple + sizeof(struct path_list_element);
> + if (cpuset_path(c, path,
> + PAGE_SIZE-sizeof(struct path_list_element)) < 0)
> + path = NULL;
> + if (path != NULL) {
> + /*
> + * add path to list of cpusets to remove
> + * (list includes cpusets that are not
> + * notify-on-release)
> + */
> + ple->path = path;
> + ple->cs = c;
> + /*
> + * since we're walking "up" the tree, list
> + * any empty cpusets we find on the tail of
> + * the list (later==higher; start with lower)
> + */
> + list_add_tail(&ple->list, empty_list);
> + }
> + }
> + }
> +}
> +
> +/*
> + * Walk the specified cpuset subtree and remove any offline cpus from
> + * each cpuset.
> + *
> + * Count the number of empty cpusets.
> + *
> + * Call with both manage_mutex and callback_mutex held so
> + * that this function can modify cpus_allowed and mems_allowed.
> + *
> + * Recursive, on depth of cpuset subtree.
Recursive with what limit/bound?
at least realistically if not in source code.
and will that be reasonable 5 years from now,
without causing stack overflow?
Not that this function uses lots of stack, but I didn't track
down the call tree to get here.
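One way to bound it would be an iterative, queue-based walk instead of
recursion; a rough sketch using the kfifo helpers (the function name and the
per-node processing are illustrative only):

	static void walk_cpuset_subtree(struct cpuset *root)
	{
		struct cpuset *cp, *child;
		struct kfifo *queue;

		queue = kfifo_alloc(number_of_cpusets * sizeof(cp),
					GFP_KERNEL, NULL);
		if (queue == ERR_PTR(-ENOMEM))
			return;
		__kfifo_put(queue, (unsigned char *)&root, sizeof(root));
		while (__kfifo_get(queue, (unsigned char *)&cp, sizeof(cp))) {
			list_for_each_entry(child, &cp->children, sibling)
				__kfifo_put(queue, (unsigned char *)&child,
						sizeof(child));
			/* examine cp here (e.g. check for empty cpus/mems) */
		}
		kfifo_free(queue);
	}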
> + */
> +
> +static void remove_offlines_count_emptys(const struct cpuset *cur, int *count)
> {
> struct cpuset *c;
>
> - /* Each of our child cpusets mems must be online */
> list_for_each_entry(c, &cur->children, sibling) {
> - guarantee_online_cpus_mems_in_subtree(c);
> - if (!cpus_empty(c->cpus_allowed))
> - guarantee_online_cpus(c, &c->cpus_allowed);
> - if (!nodes_empty(c->mems_allowed))
> - guarantee_online_mems(c, &c->mems_allowed);
> + remove_offlines_count_emptys(c, count);
> + /* Remove offline cpus and mems from this cpuset. */
> + cpus_and(c->cpus_allowed, c->cpus_allowed, cpu_online_map);
> + nodes_and(c->mems_allowed, c->mems_allowed, node_online_map);
> + if (cpus_empty(c->cpus_allowed) ||
> + nodes_empty(c->mems_allowed))
> + (*count)++;
> }
> }
>
> @@ -2095,7 +2202,7 @@ static void guarantee_online_cpus_mems_i
> * To ensure that we don't remove a CPU or node from the top cpuset
> * that is currently in use by a child cpuset (which would violate
> * the rule that cpusets must be subsets of their parent), we first
> - * call the recursive routine guarantee_online_cpus_mems_in_subtree().
> + * call the recursive routine remove_tasks_in_empty_cpusets_in_subtree().
> *
> * Since there are two callers of this routine, one for CPU hotplug
> * events and one for memory node hotplug events, we could have coded
> @@ -2105,15 +2212,68 @@ static void guarantee_online_cpus_mems_i
>
> static void common_cpu_mem_hotplug_unplug(void)
> {
> + int i, empty_count=0, ple_avail=0;
> + struct list_head empty_cpuset_list;
> + struct path_list_element *ple, **ple_array=NULL;
> +
> mutex_lock(&manage_mutex);
> - mutex_lock(&callback_mutex);
>
> - guarantee_online_cpus_mems_in_subtree(&top_cpuset);
> + mutex_lock(&callback_mutex);
> top_cpuset.cpus_allowed = cpu_online_map;
> top_cpuset.mems_allowed = node_online_map;
> -
> + remove_offlines_count_emptys(&top_cpuset, &empty_count);
> mutex_unlock(&callback_mutex);
> +
> + if (empty_count) {
> + /*
> + * allocate the control structures needed for a list of
> + * cpuset paths to free. (allocation must be done without
> + * holding callback_mutex)
> + */
> + ple_array = (struct path_list_element **)kmalloc
cast not needed?
> + (empty_count*sizeof(struct empty_cpuset_list *), GFP_KERNEL);
> + if (!ple_array)
> + return;
> + for (i=0; i<empty_count; i++) {
Spaces: for (i = 0; i < empty_count; i++) {
> + ple = kmalloc(PAGE_SIZE, GFP_KERNEL);
> + /*
> + * the space for the path itself immediately follows
> + * the path_list_element structure (we have a full page)
> + */
> + if (!ple)
> + return;
> + ple_array[i]= ple;
> + }
> + INIT_LIST_HEAD(&empty_cpuset_list);
> +
> + mutex_lock(&callback_mutex);
> + remove_tasks_in_empty_cpusets_in_subtree(&top_cpuset,
> + &empty_cpuset_list, ple_array, &ple_avail, empty_count);
> + mutex_unlock(&callback_mutex);
> + }
> +
> mutex_unlock(&manage_mutex);
> +
> + if (empty_count) {
> + /*
> + * Free each cpuset on the list.
> + * (but only if it is notify-on-release)
> + */
> + list_for_each_entry(ple, &empty_cpuset_list, list) {
> + if (notify_on_release(ple->cs))
> + cpuset_release_agent(ple->path, 0);
> + /*
> + * 0: don't ask cpuset_release_agent to
> + * release the path
> + */
Don't explain function parameters unless they are tricky.
> + }
> + /* remove the control structures */
> + for (i=0; i<empty_count; i++) {
spacing
> + kfree(ple_array[i]);
> + }
> + kfree(ple_array);
> + }
> + return;
> }
>
> /*
> @@ -2259,7 +2419,7 @@ void cpuset_exit(struct task_struct *tsk
> if (atomic_dec_and_test(&cs->count))
> check_for_release(cs, &pathbuf);
> mutex_unlock(&manage_mutex);
> - cpuset_release_agent(pathbuf);
> + cpuset_release_agent(pathbuf, 1);
> } else {
> atomic_dec(&cs->count);
> }
---
~Randy
*** Remember to use Documentation/SubmitChecklist when testing your code ***
* [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
@ 2007-05-21 20:08 Cliff Wickman
0 siblings, 0 replies; 4+ messages in thread
From: Cliff Wickman @ 2007-05-21 20:08 UTC (permalink / raw)
To: linux-kernel
This patch corrects a situation that occurs when one disables all the cpus
in a cpuset.
At that point, any tasks in that cpuset were incorrectly moved.
(Disabling all cpus in a cpuset caused it to inherit the cpus
of its parent, which may overlap its exclusive sibling.)
Such tasks should be moved to the parent of their current cpuset, or if the
parent cpuset has no cpus, to its parent, and so on.
And the empty cpuset should be removed (if it is flagged notify_on_release).
This patch uses a workqueue thread to call the function that deletes the cpuset.
That way we avoid the complexity of the cpuset locks.
(I've been working with Paul Jackson on this patch, and there is still a
little functional subtlety to work out. Can be tweaked later.)
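The deferral boils down to this pattern (the callback and work item names are
the ones added by the patch below; the hotplug-side caller is shown only as a
fragment):

	static void remove_empty_cpusets(struct work_struct *unused)
	{
		/* runs in process context, so it can take manage_mutex itself */
		release_empty_cpusets(&top_cpuset);
	}
	static DECLARE_WORK(remove_empties_block, remove_empty_cpusets);

	/* from the hotplug path, while the cpuset mutexes are still held: */
	schedule_work(&remove_empties_block);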
Diffed against 2.6.21
Signed-off-by: Cliff Wickman <cpw@sgi.com>
---
kernel/cpuset.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 191 insertions(+), 30 deletions(-)
Index: linus.070504/kernel/cpuset.c
===================================================================
--- linus.070504.orig/kernel/cpuset.c
+++ linus.070504/kernel/cpuset.c
@@ -54,6 +54,7 @@
#include <asm/atomic.h>
#include <linux/mutex.h>
#include <linux/kfifo.h>
+#include <linux/workqueue.h>
#define CPUSET_SUPER_MAGIC 0x27e0eb
@@ -111,6 +112,7 @@ typedef enum {
CS_NOTIFY_ON_RELEASE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
+ CS_RELEASED_RESOURCE,
} cpuset_flagbits_t;
/* convenient tests for these bits */
@@ -149,6 +151,11 @@ static inline int is_spread_slab(const s
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
+static inline int has_released_a_resource(const struct cpuset *cs)
+{
+ return test_bit(CS_RELEASED_RESOURCE, &cs->flags);
+}
+
/*
* Increment this integer everytime any cpuset changes its
* mems_allowed value. Users of cpusets can track this generation
@@ -543,7 +550,7 @@ static void cpuset_release_agent(const c
static void check_for_release(struct cpuset *cs, char **ppathbuf)
{
if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
- list_empty(&cs->children)) {
+ list_empty(&cs->children)) {
char *buf;
buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
@@ -835,7 +842,8 @@ update_cpu_domains_tree(struct cpuset *r
while (__kfifo_get(queue, (unsigned char *)&cp, sizeof(cp))) {
list_for_each_entry(child, &cp->children, sibling)
- __kfifo_put(queue,(unsigned char *)&child,sizeof(child));
+ __kfifo_put(queue, (unsigned char *)&child,
+ sizeof(child));
update_cpu_domains(cp);
}
@@ -1101,7 +1109,7 @@ static int update_flag(cpuset_flagbits_t
mutex_unlock(&callback_mutex);
if (cpu_exclusive_changed)
- update_cpu_domains_tree(cs);
+ update_cpu_domains_tree(cs);
return 0;
}
@@ -1279,6 +1287,7 @@ static int attach_task(struct cpuset *cs
from = oldcs->mems_allowed;
to = cs->mems_allowed;
+ set_bit(CS_RELEASED_RESOURCE, &oldcs->flags);
mutex_unlock(&callback_mutex);
@@ -1361,6 +1370,10 @@ static ssize_t cpuset_common_file_write(
retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
break;
case FILE_NOTIFY_ON_RELEASE:
+ /* Even if the cpuset had been emptied in the past
+ it must not be considered for release until it has
+ become non-empty again. */
+ clear_bit(CS_RELEASED_RESOURCE, &cs->flags);
retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
break;
case FILE_MEMORY_MIGRATE:
@@ -2014,6 +2027,7 @@ static int cpuset_rmdir(struct inode *un
cpuset_d_remove_dir(d);
dput(d);
number_of_cpusets--;
+ set_bit(CS_RELEASED_RESOURCE, &parent->flags);
mutex_unlock(&callback_mutex);
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
@@ -2081,50 +2095,188 @@ out:
}
/*
+ * Move every task that is a member of cpuset "from" to cpuset "to".
+ *
+ * Called with both manage_mutex and callback_mutex held
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+ int moved=0;
+ struct task_struct *g, *tsk;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, tsk) {
+ if (tsk->cpuset == from) {
+ moved++;
+ task_lock(tsk);
+ tsk->cpuset = to;
+ task_unlock(tsk);
+ }
+ } while_each_thread(g, tsk);
+ read_unlock(&tasklist_lock);
+ atomic_add(moved, &to->count);
+ atomic_set(&from->count, 0);
+}
+
+/*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
+ * last CPU or node from a cpuset, then move the tasks in the empty
+ * cpuset to its next-highest non-empty parent.
+ *
+ * Called with both manage_mutex and callback_mutex held
+ */
+static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
+{
+ int npids;
+ struct cpuset *parent;
+
+ /* cs->count is the number of tasks using the cpuset */
+ npids = atomic_read(&cs->count);
+ if (!npids)
+ return;
+
+ /* this cpuset has had member tasks */
+ set_bit(CS_RELEASED_RESOURCE, &cs->flags);
+
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = cs->parent;
+ while (parent && cpus_empty(parent->cpus_allowed)) {
+ /*
+ * all these empty cpusets should now be considered to
+ * have been used, and therefore eligible for
+ * release when empty (if they are notify_on_release)
+ */
+ set_bit(CS_RELEASED_RESOURCE, &parent->flags);
+ parent = parent->parent;
+ }
+ /*
+ * the one that we find non-empty will be flagged releasable when
+ * the tasks exit.
+ */
+
+ /* move member tasks to the non-empty parent cpuset */
+ move_member_tasks_to_cpuset(cs, parent);
+}
+
+/*
+ * Walk the specified cpuset subtree and move any tasks found in
+ * empty cpusets. Also count the number of empty notify_on_release cpusets.
*
- * This routine is intentionally inefficient in a couple of regards.
- * It will check all cpusets in a subtree even if the top cpuset of
- * the subtree has no offline CPUs or nodes. It checks both CPUs and
- * nodes, even though the caller could have been coded to know that
- * only one of CPUs or nodes needed to be checked on a given call.
- * This was done to minimize text size rather than cpu cycles.
+ * Note that such a notify_on_release cpuset must have had, at some time,
+ * member tasks or cpuset descendants and cpus and memory, before it can
+ * be a candidate for release.
*
- * Call with both manage_mutex and callback_mutex held.
+ * Call with both manage_mutex and callback_mutex held so
+ * that this function can modify cpus_allowed and mems_allowed.
*
- * Recursive, on depth of cpuset subtree.
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
*/
+static void remove_tasks_from_empty_cpusets(const struct cpuset *root, int *count)
+{
+ struct cpuset *cp; /* scans cpusets being updated */
+ struct cpuset *child; /* scans child cpusets of cp */
+ struct kfifo *queue; /* fifo queue of cpusets to be updated */
+
+ queue = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
+ if (queue == ERR_PTR(-ENOMEM))
+ return;
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+ __kfifo_put(queue, (unsigned char *)&root, sizeof(root));
+
+ while (__kfifo_get(queue, (unsigned char *)&cp, sizeof(cp))) {
+ list_for_each_entry(child, &cp->children, sibling)
+ __kfifo_put(queue, (unsigned char *)&child,
+ sizeof(child));
+ /* Remove offline cpus and mems from this cpuset. */
+ cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
+ nodes_and(cp->mems_allowed, cp->mems_allowed, node_online_map);
+ if ((cpus_empty(cp->cpus_allowed) ||
+ nodes_empty(cp->mems_allowed))) {
+ /* Move tasks from the empty cpuset to a parent */
+ remove_tasks_in_empty_cpuset(cp);
+ if (notify_on_release(cp) &&
+ has_released_a_resource(cp))
+ /* count the cpuset to be released */
+ (*count)++;
+ }
+ }
+
+ kfifo_free(queue);
+ return;
+}
+
+/*
+ * Walk the specified cpuset subtree and release the empty
+ * notify_on_release cpusets.
+ *
+ * This walk processes the tree from top to bottom, completing one layer
+ * before dropping down to the next. It always processes a node before
+ * any of its children.
+ *
+ * We hold manage_mutex so that the hierarchy cannot change while
+ * we are scanning it.
+ */
+static void release_empty_cpusets(const struct cpuset *root)
{
- struct cpuset *c;
+ struct cpuset *cp; /* scans cpusets being updated */
+ struct cpuset *child; /* scans child cpusets of cp */
+ struct kfifo *queue; /* fifo queue of cpusets to be updated */
+ char *pathbuf = NULL;
- /* Each of our child cpusets mems must be online */
- list_for_each_entry(c, &cur->children, sibling) {
- guarantee_online_cpus_mems_in_subtree(c);
- if (!cpus_empty(c->cpus_allowed))
- guarantee_online_cpus(c, &c->cpus_allowed);
- if (!nodes_empty(c->mems_allowed))
- guarantee_online_mems(c, &c->mems_allowed);
+ queue = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
+ if (queue == ERR_PTR(-ENOMEM))
+ return;
+
+
+ __kfifo_put(queue, (unsigned char *)&root, sizeof(root));
+
+ mutex_lock(&manage_mutex);
+ while (__kfifo_get(queue, (unsigned char *)&cp, sizeof(cp))) {
+ list_for_each_entry(child, &cp->children, sibling)
+ __kfifo_put(queue, (unsigned char *)&child,
+ sizeof(child));
+ if ((notify_on_release(cp)) && has_released_a_resource(cp) &&
+ (cpus_empty(cp->cpus_allowed) ||
+ nodes_empty(cp->mems_allowed))) {
+ check_for_release(cp, &pathbuf);
+ cpuset_release_agent(pathbuf);
+ }
}
+ mutex_unlock(&manage_mutex);
+
+ kfifo_free(queue);
+
+ return;
+}
+
+/*
+ * This runs from a workqueue.
+ *
+ * Its job is to remove any notify_on_release cpusets that have no
+ * online cpus.
+ *
+ * The argument is not used.
+ */
+static void remove_empty_cpusets(struct work_struct *p)
+{
+ release_empty_cpusets(&top_cpuset);
+ return;
}
+static DECLARE_WORK(remove_empties_block, remove_empty_cpusets);
+
/*
* The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
* cpu_online_map and node_online_map. Force the top cpuset to track
* whats online after any CPU or memory node hotplug or unplug event.
*
- * To ensure that we don't remove a CPU or node from the top cpuset
- * that is currently in use by a child cpuset (which would violate
- * the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
- *
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
* two separate routines here. We code it as a single common routine
@@ -2133,12 +2285,18 @@ static void guarantee_online_cpus_mems_i
static void common_cpu_mem_hotplug_unplug(void)
{
+ int empty_count=0;
+
mutex_lock(&manage_mutex);
mutex_lock(&callback_mutex);
- guarantee_online_cpus_mems_in_subtree(&top_cpuset);
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;
+ remove_tasks_from_empty_cpusets(&top_cpuset, &empty_count);
+ if (empty_count)
+ schedule_work(&remove_empties_block);
+ /* release_empty_cpusets() will lock manage_mutex, but that
+ will be done in the context of the work queue */
mutex_unlock(&callback_mutex);
mutex_unlock(&manage_mutex);
@@ -2286,6 +2444,9 @@ void cpuset_exit(struct task_struct *tsk
mutex_lock(&manage_mutex);
if (atomic_dec_and_test(&cs->count))
check_for_release(cs, &pathbuf);
+ mutex_lock(&callback_mutex);
+ set_bit(CS_RELEASED_RESOURCE, &cs->flags);
+ mutex_unlock(&callback_mutex);
mutex_unlock(&manage_mutex);
cpuset_release_agent(pathbuf);
} else {
* [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent
@ 2007-03-21 15:43 Cliff Wickman
0 siblings, 0 replies; 4+ messages in thread
From: Cliff Wickman @ 2007-03-21 15:43 UTC (permalink / raw)
To: linux-kernel
Submission #2: after style changes recommended by Randy Dunlap
This patch corrects a situation that occurs when one disables all the cpus
in a cpuset.
At that point, any tasks in that cpuset are incorrectly moved.
(Disabling all cpus in a cpuset caused it to inherit the cpus
of its parent, which may overlap its exclusive sibling.)
Such tasks should be moved to the parent of their current cpuset, or if the
parent cpuset has no cpus, to its parent, and so on.
And the empty cpuset should be removed (if it is flagged notify_on_release).
This patch contains the added complexity of taking care not to do memory
allocation while holding the cpuset's callback_mutex, and it makes use of the
"cpuset_release_agent" to do the cpuset removals.
It might be simpler to use a separate thread or workqueue. But such code
has not yet been written.
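The "no allocation under callback_mutex" part is handled by preallocating one
page per empty cpuset before the lock is taken, with the path buffer carved
out of the same page right after the bookkeeping struct; in rough outline
(fragment only):

	struct path_list_element *ple;
	char *path;

	/* one page per empty cpuset, allocated before callback_mutex is taken */
	ple = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (ple) {
		/* the path string shares the page, right after the struct */
		path = (char *)ple + sizeof(struct path_list_element);
	}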
Diffed against 2.6.20-rc6
Signed-off-by: Cliff Wickman <cpw@sgi.com>
---
kernel/cpuset.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 180 insertions(+), 20 deletions(-)
Index: morton.070205/kernel/cpuset.c
===================================================================
--- morton.070205.orig/kernel/cpuset.c
+++ morton.070205/kernel/cpuset.c
@@ -112,6 +112,12 @@ typedef enum {
CS_SPREAD_SLAB,
} cpuset_flagbits_t;
+struct path_list_element {
+ struct list_head list;
+ struct cpuset *cs;
+ char *path;
+};
+
/* convenient tests for these bits */
static inline int is_cpu_exclusive(const struct cpuset *cs)
{
@@ -498,7 +504,7 @@ static int cpuset_path(const struct cpus
* the time manage_mutex is held.
*/
-static void cpuset_release_agent(const char *pathbuf)
+static void cpuset_release_agent(const char *pathbuf, int releasepath)
{
char *argv[3], *envp[3];
int i;
@@ -518,7 +524,8 @@ static void cpuset_release_agent(const c
envp[i] = NULL;
call_usermodehelper(argv[0], argv, envp, 0);
- kfree(pathbuf);
+ if (releasepath)
+ kfree(pathbuf);
}
/*
@@ -1364,7 +1371,7 @@ static ssize_t cpuset_common_file_write(
retval = nbytes;
out2:
mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ cpuset_release_agent(pathbuf, 1);
out1:
kfree(buffer);
return retval;
@@ -1990,7 +1997,7 @@ static int cpuset_rmdir(struct inode *un
if (list_empty(&parent->children))
check_for_release(parent, &pathbuf);
mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ cpuset_release_agent(pathbuf, 1);
return 0;
}
@@ -2053,13 +2060,33 @@ out:
}
/*
+ * move every task that is a member of cpuset "from" to cpuset "to"
+ */
+static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
+{
+ int moved=0;
+ struct task_struct *g, *tsk;
+
+ read_lock(&tasklist_lock);
+ do_each_thread(g, tsk) {
+ if (tsk->cpuset == from) {
+ moved++;
+ task_lock(tsk);
+ tsk->cpuset = to;
+ task_unlock(tsk);
+ }
+ } while_each_thread(g, tsk);
+ read_unlock(&tasklist_lock);
+ atomic_add(moved, &to->count);
+ atomic_set(&from->count, 0);
+}
+
+/*
* If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
* or memory nodes, we need to walk over the cpuset hierarchy,
* removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then the guarantee_online_cpus()
- * or guarantee_online_mems() code will use that emptied cpusets
- * parent online CPUs or nodes. Cpusets that were already empty of
- * CPUs or nodes are left empty.
+ * last CPU or node from a cpuset, then move the tasks in the empty cpuset
+ * to its next-highest non-empty parent. And remove the empty cpuset.
*
* This routine is intentionally inefficient in a couple of regards.
* It will check all cpusets in a subtree even if the top cpuset of
@@ -2070,20 +2097,104 @@ out:
*
* Call with both manage_mutex and callback_mutex held.
*
+ * Takes tasklist_lock, and task_lock() for cpuset members that are
+ * moved to another cpuset.
+ *
+ * Recursive, on depth of cpuset subtree.
+ */
+
+static void remove_tasks_in_empty_cpusets_in_subtree(
+ const struct cpuset *cur,
+ struct list_head *empty_list,
+ struct path_list_element **ple_array,
+ int *ple_availp, int ple_count)
+{
+ int npids, ple_used=0;
+ struct cpuset *c, *parent;
+ struct path_list_element *ple;
+
+ /* If a cpuset's mems or cpus are empty, move its tasks to its parent */
+ list_for_each_entry(c, &cur->children, sibling) {
+ remove_tasks_in_empty_cpusets_in_subtree(c, empty_list,
+ ple_array, ple_availp, ple_count);
+ /*
+ * If it has no online cpus or no online mems, move its tasks
+ * to its next-highest non-empty parent and remove it.
+ * Remove it even if it has children, as its children are a
+ * subset of cpus and nodes, so they are empty too.
+ * The removal is conditional on whether it is
+ * notify-on-release.
+ */
+ if (cpus_empty(c->cpus_allowed) ||
+ nodes_empty(c->mems_allowed)) {
+ char *path = NULL;
+ /*
+ * Find its next-highest non-empty parent, (top cpuset
+ * has online cpus, so can't be empty).
+ */
+ parent = c->parent;
+ while (parent && cpus_empty(parent->cpus_allowed))
+ parent = parent->parent;
+ npids = atomic_read(&c->count);
+ /* c->count is the number of tasks using the cpuset */
+ if (npids)
+ /* move member tasks to the parent cpuset */
+ move_member_tasks_to_cpuset(c, parent);
+
+ /*
+ * sanity check that we're not over-running
+ * the array
+ */
+ if (++ple_used > ple_count)
+ return;
+ ple = ple_array[(*ple_availp)++];
+ path = (char *)ple + sizeof(struct path_list_element);
+ if (cpuset_path(c, path,
+ PAGE_SIZE-sizeof(struct path_list_element)) < 0)
+ path = NULL;
+ if (path != NULL) {
+ /*
+ * add path to list of cpusets to remove
+ * (list includes cpusets that are not
+ * notify-on-release)
+ */
+ ple->path = path;
+ ple->cs = c;
+ /*
+ * since we're walking "up" the tree, list
+ * any empty cpusets we find on the tail of
+ * the list (later==higher; start with lower)
+ */
+ list_add_tail(&ple->list, empty_list);
+ }
+ }
+ }
+}
+
+/*
+ * Walk the specified cpuset subtree and remove any offline cpus from
+ * each cpuset.
+ *
+ * Count the number of empty cpusets.
+ *
+ * Call with both manage_mutex and callback_mutex held so
+ * that this function can modify cpus_allowed and mems_allowed.
+ *
* Recursive, on depth of cpuset subtree.
*/
-static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
+static void remove_offlines_count_emptys(const struct cpuset *cur, int *count)
{
struct cpuset *c;
- /* Each of our child cpusets mems must be online */
list_for_each_entry(c, &cur->children, sibling) {
- guarantee_online_cpus_mems_in_subtree(c);
- if (!cpus_empty(c->cpus_allowed))
- guarantee_online_cpus(c, &c->cpus_allowed);
- if (!nodes_empty(c->mems_allowed))
- guarantee_online_mems(c, &c->mems_allowed);
+ remove_offlines_count_emptys(c, count);
+ /* Remove offline cpus and mems from this cpuset. */
+ cpus_and(c->cpus_allowed, c->cpus_allowed, cpu_online_map);
+ nodes_and(c->mems_allowed, c->mems_allowed, node_online_map);
+ if (cpus_empty(c->cpus_allowed) ||
+ nodes_empty(c->mems_allowed))
+ (*count)++;
}
}
@@ -2095,7 +2206,7 @@ static void guarantee_online_cpus_mems_i
* To ensure that we don't remove a CPU or node from the top cpuset
* that is currently in use by a child cpuset (which would violate
* the rule that cpusets must be subsets of their parent), we first
- * call the recursive routine guarantee_online_cpus_mems_in_subtree().
+ * call the recursive routine remove_tasks_in_empty_cpusets_in_subtree().
*
* Since there are two callers of this routine, one for CPU hotplug
* events and one for memory node hotplug events, we could have coded
@@ -2105,15 +2216,64 @@ static void guarantee_online_cpus_mems_i
static void common_cpu_mem_hotplug_unplug(void)
{
+ int i, empty_count=0, ple_avail=0;
+ struct list_head empty_cpuset_list;
+ struct path_list_element *ple, **ple_array=NULL;
+
mutex_lock(&manage_mutex);
- mutex_lock(&callback_mutex);
- guarantee_online_cpus_mems_in_subtree(&top_cpuset);
+ mutex_lock(&callback_mutex);
top_cpuset.cpus_allowed = cpu_online_map;
top_cpuset.mems_allowed = node_online_map;
-
+ remove_offlines_count_emptys(&top_cpuset, &empty_count);
mutex_unlock(&callback_mutex);
+
+ if (empty_count) {
+ /*
+ * allocate the control structures needed for a list of
+ * cpuset paths to free. (allocation must be done without
+ * holding callback_mutex)
+ */
+ ple_array = kmalloc
+ (empty_count*sizeof(struct empty_cpuset_list *), GFP_KERNEL);
+ if (!ple_array)
+ return;
+ for (i = 0; i < empty_count; i++) {
+ ple = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ /*
+ * the space for the path itself immediately follows
+ * the path_list_element structure (we have a full page)
+ */
+ if (!ple)
+ return;
+ ple_array[i]= ple;
+ }
+ INIT_LIST_HEAD(&empty_cpuset_list);
+
+ mutex_lock(&callback_mutex);
+ remove_tasks_in_empty_cpusets_in_subtree(&top_cpuset,
+ &empty_cpuset_list, ple_array, &ple_avail, empty_count);
+ mutex_unlock(&callback_mutex);
+ }
+
mutex_unlock(&manage_mutex);
+
+ if (empty_count) {
+ /*
+ * Free each cpuset on the list.
+ * (but only if it is notify-on-release)
+ */
+ list_for_each_entry(ple, &empty_cpuset_list, list) {
+ if (notify_on_release(ple->cs))
+ cpuset_release_agent(ple->path, 0);
+ }
+ /* remove the control structures */
+ for (i = 0; i < empty_count; i++) {
+ kfree(ple_array[i]);
+ }
+ kfree(ple_array);
+ }
+ return;
}
/*
@@ -2259,7 +2419,7 @@ void cpuset_exit(struct task_struct *tsk
if (atomic_dec_and_test(&cs->count))
check_for_release(cs, &pathbuf);
mutex_unlock(&manage_mutex);
- cpuset_release_agent(pathbuf);
+ cpuset_release_agent(pathbuf, 1);
} else {
atomic_dec(&cs->count);
}
Thread overview: 4+ messages
2007-03-20 19:34 [PATCH 1/1] hotplug cpu: move tasks in empty cpusets to parent Cliff Wickman
2007-03-20 22:12 ` Randy Dunlap
2007-03-21 15:43 Cliff Wickman
2007-05-21 20:08 Cliff Wickman