LKML Archive on lore.kernel.org
help / color / mirror / Atom feed
* [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()
@ 2011-02-17  1:49 Li Zefan
  2011-02-17  1:49 ` [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch() Li Zefan
                   ` (4 more replies)
  0 siblings, 5 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-17  1:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, Paul Menage, David Rientjes, 缪 勰, linux-mm

It's not necessary to copy cpuset->mems_allowed to a buffer
allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c |   10 +---------
 1 files changed, 1 insertions(+), 9 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10f1835..f13ff2e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1620,20 +1620,12 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
-	NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
 	int retval;
 
-	if (mask == NULL)
-		return -ENOMEM;
-
 	mutex_lock(&callback_mutex);
-	*mask = cs->mems_allowed;
+	retval = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
 	mutex_unlock(&callback_mutex);
 
-	retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
-	NODEMASK_FREE(mask);
-
 	return retval;
 }
 
-- 
1.7.3.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()
  2011-02-17  1:49 [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Li Zefan
@ 2011-02-17  1:49 ` Li Zefan
  2011-02-17 23:46   ` Paul Menage
  2011-02-20  1:51   ` David Rientjes
  2011-02-17  1:50 ` [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC() Li Zefan
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-17  1:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, Paul Menage, David Rientjes, 缪 勰, linux-mm

oldcs->mems_allowed is not modified during cpuset_attch(), so
we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
Just pass it to cpuset_migrate_mm().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c |    7 ++-----
 1 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f13ff2e..70c9ca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
 	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
 
-	if (from == NULL || to == NULL)
+	if (to == NULL)
 		goto alloc_fail;
 
 	if (cs == &top_cpuset) {
@@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	}
 
 	/* change mm; only needs to be done once even if threadgroup */
-	*from = oldcs->mems_allowed;
 	*to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
 		mpol_rebind_mm(mm, to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, from, to);
+			cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
 		mmput(mm);
 	}
 
 alloc_fail:
-	NODEMASK_FREE(from);
 	NODEMASK_FREE(to);
 }
 
-- 
1.7.3.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-17  1:49 [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Li Zefan
  2011-02-17  1:49 ` [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch() Li Zefan
@ 2011-02-17  1:50 ` Li Zefan
  2011-02-17 22:46   ` Andrew Morton
  2011-02-20  1:51   ` David Rientjes
  2011-02-17  1:50 ` [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone() Li Zefan
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-17  1:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, Paul Menage, David Rientjes, 缪 勰, linux-mm

Those functions that use NODEMASK_ALLOC() can't propogate errno
to users, but will fail silently.

Since all of them are called with cgroup_mutex held, here we use
a global nodemask_t variable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c |   45 +++++++++++++++------------------------------
 1 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 70c9ca2..cc414ac 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -79,6 +79,15 @@ int number_of_cpusets __read_mostly;
 struct cgroup_subsys cpuset_subsys;
 struct cpuset;
 
+/*
+ * In functions that can't propogate errno to users, to avoid declaring a
+ * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
+ * -ENOMEM, we use this global cpuset_mems.
+ *
+ * It should be used with cgroup_lock held.
+ */
+static nodemask_t cpuset_mems;
+
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
@@ -1015,17 +1024,11 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-	if (!newmems)
-		return;
 
 	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, newmems);
-
-	cpuset_change_task_nodemask(p, newmems);
+	guarantee_online_mems(cs, &cpuset_mems);
 
-	NODEMASK_FREE(newmems);
+	cpuset_change_task_nodemask(p, &cpuset_mems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1096,13 +1099,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 			   const char *buf)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
+	nodemask_t *oldmem = &cpuset_mems;
 	int retval;
 	struct ptr_heap heap;
 
-	if (!oldmem)
-		return -ENOMEM;
-
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
 	 * it's read-only
@@ -1152,7 +1152,6 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	heap_free(&heap);
 done:
-	NODEMASK_FREE(oldmem);
 	return retval;
 }
 
@@ -1438,10 +1437,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-	if (to == NULL)
-		goto alloc_fail;
+	nodemask_t *to = &cpuset_mems;
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1470,9 +1466,6 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 			cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
 		mmput(mm);
 	}
-
-alloc_fail:
-	NODEMASK_FREE(to);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2051,10 +2044,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct cgroup *cont;
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return;
+	nodemask_t *oldmems = &cpuset_mems;
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2090,7 +2080,6 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			update_tasks_nodemask(cp, oldmems, NULL);
 		}
 	}
-	NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2132,10 +2121,7 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return NOTIFY_DONE;
+	nodemask_t *oldmems = &cpuset_mems;
 
 	cgroup_lock();
 	switch (action) {
@@ -2158,7 +2144,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	}
 	cgroup_unlock();
 
-	NODEMASK_FREE(oldmems);
 	return NOTIFY_OK;
 }
 #endif
-- 
1.7.3.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone()
  2011-02-17  1:49 [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Li Zefan
  2011-02-17  1:49 ` [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch() Li Zefan
  2011-02-17  1:50 ` [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC() Li Zefan
@ 2011-02-17  1:50 ` Li Zefan
  2011-02-17 23:51   ` Paul Menage
  2011-02-20  1:51   ` David Rientjes
  2011-02-17 23:35 ` [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Paul Menage
  2011-02-20  1:51 ` David Rientjes
  4 siblings, 2 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-17  1:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LKML, Paul Menage, David Rientjes, 缪 勰, linux-mm

Chaning cpuset->mems/cpuset->cpus should be protected under
callback_mutex.

cpuset_clone() doesn't follow this rule. It's ok because it's
called when creating and initializing a cgroup, but we'd better
hold the lock to avoid subtil break in the future.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1e18d26..445573b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
 	cs = cgroup_cs(cgroup);
 	parent_cs = cgroup_cs(parent);
 
+	mutex_lock(&callback_mutex);
 	cs->mems_allowed = parent_cs->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+	mutex_unlock(&callback_mutex);
 	return;
 }
 
-- 
1.7.3.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-17  1:50 ` [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC() Li Zefan
@ 2011-02-17 22:46   ` Andrew Morton
  2011-02-17 23:50     ` Paul Menage
  2011-02-20  1:51   ` David Rientjes
  1 sibling, 1 reply; 22+ messages in thread
From: Andrew Morton @ 2011-02-17 22:46 UTC (permalink / raw)
  To: Li Zefan; +Cc: LKML, Paul Menage, David Rientjes, 缪 勰, linux-mm

On Thu, 17 Feb 2011 09:50:09 +0800
Li Zefan <lizf@cn.fujitsu.com> wrote:

> +/*
> + * In functions that can't propogate errno to users, to avoid declaring a
> + * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
> + * -ENOMEM, we use this global cpuset_mems.
> + *
> + * It should be used with cgroup_lock held.

I'll do s/should/must/ - that would be a nasty bug.

I'd be more comfortable about the maintainability of this optimisation
if we had

	WARN_ON(!cgroup_is_locked());

at each site.

> + */
> +static nodemask_t cpuset_mems;

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()
  2011-02-17  1:49 [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Li Zefan
                   ` (2 preceding siblings ...)
  2011-02-17  1:50 ` [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone() Li Zefan
@ 2011-02-17 23:35 ` Paul Menage
  2011-02-18  2:22   ` Li Zefan
  2011-02-20  1:51 ` David Rientjes
  4 siblings, 1 reply; 22+ messages in thread
From: Paul Menage @ 2011-02-17 23:35 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
> It's not necessary to copy cpuset->mems_allowed to a buffer
> allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().
>
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

Acked-by: Paul Menage <menage@google.com>

The only downside is that we're now doing more work (and more complex
work) inside callback_mutex, but I guess that's OK compared to having
to do a memory allocation. (I poked around in lib/vsprintf.c and I
couldn't see any cases where it might allocate memory, but it would be
particularly bad if there was any way to trigger an Oops.)

> ---
>  kernel/cpuset.c |   10 +---------
>  1 files changed, 1 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 10f1835..f13ff2e 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1620,20 +1620,12 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
>
>  static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
>  {
> -       NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
>        int retval;
>
> -       if (mask == NULL)
> -               return -ENOMEM;
> -

And this was particularly broken since the only caller of
cpuset_sprintf_memlist() doesn't handle a negative error response
anyway and would then overwrite byte 4083 on the preceding page with a
'\n'. And then since the (size_t)(s-page) that's passed to
simple_read_from_buffer() would be a very large number, it would write
arbitrary (user-controlled) amounts of kernel data to the userspace
buffer.

Maybe we could also rename 'retval' to 'count' in this function (and
cpuset_sprintf_cpulist()) to make it clearer that callers don't expect
negative error values?

>        mutex_lock(&callback_mutex);
> -       *mask = cs->mems_allowed;
> +       retval = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
>        mutex_unlock(&callback_mutex);
>
> -       retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
> -
> -       NODEMASK_FREE(mask);
> -
>        return retval;
>  }
>
> --
> 1.7.3.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()
  2011-02-17  1:49 ` [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch() Li Zefan
@ 2011-02-17 23:46   ` Paul Menage
  2011-02-18  2:37     ` Li Zefan
  2011-02-20  1:51   ` David Rientjes
  1 sibling, 1 reply; 22+ messages in thread
From: Paul Menage @ 2011-02-17 23:46 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
> oldcs->mems_allowed is not modified during cpuset_attch(), so
> we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
> Just pass it to cpuset_migrate_mm().
>
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

I'd be inclined to skip this one - we're already allocating one
nodemask, so one more isn't really any extra complexity, and we're
doing horrendously complicated stuff in cpuset_migrate_mm() that's
much more likely to fail in low-memory situations.

It's true that mems_allowed can't change during the call to
cpuset_attach(), but that's due to the fact that both cgroup_attach()
and the cpuset.mems write paths take cgroup_mutex. I might prefer to
leave the allocated nodemask here and wrap callback_mutex around the
places in cpuset_attach() where we're reading from a cpuset's
mems_allowed - that would remove the implicit synchronization via
cgroup_mutex and leave the code a little more understandable.

> ---
>  kernel/cpuset.c |    7 ++-----
>  1 files changed, 2 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index f13ff2e..70c9ca2 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>        struct mm_struct *mm;
>        struct cpuset *cs = cgroup_cs(cont);
>        struct cpuset *oldcs = cgroup_cs(oldcont);
> -       NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
>        NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
>
> -       if (from == NULL || to == NULL)
> +       if (to == NULL)
>                goto alloc_fail;
>
>        if (cs == &top_cpuset) {
> @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>        }
>
>        /* change mm; only needs to be done once even if threadgroup */
> -       *from = oldcs->mems_allowed;
>        *to = cs->mems_allowed;
>        mm = get_task_mm(tsk);
>        if (mm) {
>                mpol_rebind_mm(mm, to);
>                if (is_memory_migrate(cs))
> -                       cpuset_migrate_mm(mm, from, to);
> +                       cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
>                mmput(mm);
>        }
>
>  alloc_fail:
> -       NODEMASK_FREE(from);
>        NODEMASK_FREE(to);
>  }
>
> --
> 1.7.3.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-17 22:46   ` Andrew Morton
@ 2011-02-17 23:50     ` Paul Menage
  2011-02-18  2:47       ` Li Zefan
  0 siblings, 1 reply; 22+ messages in thread
From: Paul Menage @ 2011-02-17 23:50 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Li Zefan, LKML, David Rientjes, 缪 勰, linux-mm

On Thu, Feb 17, 2011 at 2:46 PM, Andrew Morton
<akpm@linux-foundation.org> wrote:
> On Thu, 17 Feb 2011 09:50:09 +0800
> Li Zefan <lizf@cn.fujitsu.com> wrote:
>
>> +/*
>> + * In functions that can't propogate errno to users, to avoid declaring a
>> + * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
>> + * -ENOMEM, we use this global cpuset_mems.
>> + *
>> + * It should be used with cgroup_lock held.
>
> I'll do s/should/must/ - that would be a nasty bug.
>
> I'd be more comfortable about the maintainability of this optimisation
> if we had
>
>        WARN_ON(!cgroup_is_locked());
>
> at each site.
>

Agreed - that was my first thought on reading the patch. How about:

static nodemask_t *cpuset_static_nodemask() {
  static nodemask_t nodemask;
  WARN_ON(!cgroup_is_locked());
  return &nodemask;
}

and then just call cpuset_static_nodemask() in the various locations
being patched?

Paul

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone()
  2011-02-17  1:50 ` [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone() Li Zefan
@ 2011-02-17 23:51   ` Paul Menage
  2011-02-20  1:51   ` David Rientjes
  1 sibling, 0 replies; 22+ messages in thread
From: Paul Menage @ 2011-02-17 23:51 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

On Wed, Feb 16, 2011 at 5:50 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
> Chaning cpuset->mems/cpuset->cpus should be protected under
> callback_mutex.
>
> cpuset_clone() doesn't follow this rule. It's ok because it's
> called when creating and initializing a cgroup, but we'd better
> hold the lock to avoid subtil break in the future.
>
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

Acked-by: Paul Menage <menage@google.com>

Patch title should be s/cpuset_clone/cpuset_post_clone/

> ---
>  kernel/cpuset.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
>
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 1e18d26..445573b 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1840,8 +1840,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
>        cs = cgroup_cs(cgroup);
>        parent_cs = cgroup_cs(parent);
>
> +       mutex_lock(&callback_mutex);
>        cs->mems_allowed = parent_cs->mems_allowed;
>        cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
> +       mutex_unlock(&callback_mutex);
>        return;
>  }
>
> --
> 1.7.3.1
>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()
  2011-02-17 23:35 ` [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Paul Menage
@ 2011-02-18  2:22   ` Li Zefan
  0 siblings, 0 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-18  2:22 UTC (permalink / raw)
  To: Paul Menage
  Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

Paul Menage wrote:
> On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>> It's not necessary to copy cpuset->mems_allowed to a buffer
>> allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().
>>
>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> 
> Acked-by: Paul Menage <menage@google.com>
> 
> The only downside is that we're now doing more work (and more complex
> work) inside callback_mutex, but I guess that's OK compared to having
> to do a memory allocation. (I poked around in lib/vsprintf.c and I
> couldn't see any cases where it might allocate memory, but it would be
> particularly bad if there was any way to trigger an Oops.)
> 
>> ---
>>  kernel/cpuset.c |   10 +---------
>>  1 files changed, 1 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
>> index 10f1835..f13ff2e 100644
>> --- a/kernel/cpuset.c
>> +++ b/kernel/cpuset.c
>> @@ -1620,20 +1620,12 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
>>
>>  static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
>>  {
>> -       NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
>>        int retval;
>>
>> -       if (mask == NULL)
>> -               return -ENOMEM;
>> -
> 
> And this was particularly broken since the only caller of
> cpuset_sprintf_memlist() doesn't handle a negative error response
> anyway and would then overwrite byte 4083 on the preceding page with a
> '\n'. And then since the (size_t)(s-page) that's passed to
> simple_read_from_buffer() would be a very large number, it would write
> arbitrary (user-controlled) amounts of kernel data to the userspace
> buffer.
> 
> Maybe we could also rename 'retval' to 'count' in this function (and
> cpuset_sprintf_cpulist()) to make it clearer that callers don't expect
> negative error values?
> 

Good spot!

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()
  2011-02-17 23:46   ` Paul Menage
@ 2011-02-18  2:37     ` Li Zefan
  0 siblings, 0 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-18  2:37 UTC (permalink / raw)
  To: Paul Menage
  Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

Paul Menage wrote:
> On Wed, Feb 16, 2011 at 5:49 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>> oldcs->mems_allowed is not modified during cpuset_attch(), so
>> we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
>> Just pass it to cpuset_migrate_mm().
>>
>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> 
> I'd be inclined to skip this one - we're already allocating one
> nodemask, so one more isn't really any extra complexity, and we're
> doing horrendously complicated stuff in cpuset_migrate_mm() that's
> much more likely to fail in low-memory situations.

That's true, but it's not a reason to add more cases that can fail.

> 
> It's true that mems_allowed can't change during the call to

Sorry to lead you to mistake what I meant. I meant 'from' is not modified
after it's copied from oldcs->mems_allowed, so the two are exactly the
same and thus we only need one.

> cpuset_attach(), but that's due to the fact that both cgroup_attach()
> and the cpuset.mems write paths take cgroup_mutex. I might prefer to
> leave the allocated nodemask here and wrap callback_mutex around the
> places in cpuset_attach() where we're reading from a cpuset's
> mems_allowed - that would remove the implicit synchronization via
> cgroup_mutex and leave the code a little more understandable.

It's not an implicit synchronization, but instead the lock rule for
reading/writing a cpuset's mems/cpus is described in the comment.

> 
>> ---
>>  kernel/cpuset.c |    7 ++-----
>>  1 files changed, 2 insertions(+), 5 deletions(-)
>>
>> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
>> index f13ff2e..70c9ca2 100644
>> --- a/kernel/cpuset.c
>> +++ b/kernel/cpuset.c
>> @@ -1438,10 +1438,9 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>>        struct mm_struct *mm;
>>        struct cpuset *cs = cgroup_cs(cont);
>>        struct cpuset *oldcs = cgroup_cs(oldcont);
>> -       NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
>>        NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
>>
>> -       if (from == NULL || to == NULL)
>> +       if (to == NULL)
>>                goto alloc_fail;
>>
>>        if (cs == &top_cpuset) {
>> @@ -1463,18 +1462,16 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>>        }
>>
>>        /* change mm; only needs to be done once even if threadgroup */
>> -       *from = oldcs->mems_allowed;
>>        *to = cs->mems_allowed;
>>        mm = get_task_mm(tsk);
>>        if (mm) {
>>                mpol_rebind_mm(mm, to);
>>                if (is_memory_migrate(cs))
>> -                       cpuset_migrate_mm(mm, from, to);
>> +                       cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
>>                mmput(mm);
>>        }
>>
>>  alloc_fail:
>> -       NODEMASK_FREE(from);
>>        NODEMASK_FREE(to);
>>  }
>>
>> --
>> 1.7.3.1
>>
> 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-17 23:50     ` Paul Menage
@ 2011-02-18  2:47       ` Li Zefan
  2011-02-19  2:28         ` Paul Menage
  0 siblings, 1 reply; 22+ messages in thread
From: Li Zefan @ 2011-02-18  2:47 UTC (permalink / raw)
  To: Paul Menage
  Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

Paul Menage wrote:
> On Thu, Feb 17, 2011 at 2:46 PM, Andrew Morton
> <akpm@linux-foundation.org> wrote:
>> On Thu, 17 Feb 2011 09:50:09 +0800
>> Li Zefan <lizf@cn.fujitsu.com> wrote:
>>
>>> +/*
>>> + * In functions that can't propogate errno to users, to avoid declaring a
>>> + * nodemask_t variable, and avoid using NODEMASK_ALLOC that can return
>>> + * -ENOMEM, we use this global cpuset_mems.
>>> + *
>>> + * It should be used with cgroup_lock held.
>>
>> I'll do s/should/must/ - that would be a nasty bug.
>>
>> I'd be more comfortable about the maintainability of this optimisation
>> if we had
>>
>>        WARN_ON(!cgroup_is_locked());
>>
>> at each site.
>>
> 
> Agreed - that was my first thought on reading the patch. How about:
> 
> static nodemask_t *cpuset_static_nodemask() {

Then this should be 'noinline', otherwise we'll have one copy for each
function that calls it.

>   static nodemask_t nodemask;
>   WARN_ON(!cgroup_is_locked());
>   return &nodemask;
> }
> 
> and then just call cpuset_static_nodemask() in the various locations
> being patched?
> 

I think a defect of this is people might call it twice in one function
but don't know it returns the same variable?

For example in cpuset_attach():

void cpuset_attach(...)
{
	nodemask_t *from = cpuset_static_nodemask();
	nodemask_t *to = cpuset_static_nodemask();
	...
}

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-18  2:47       ` Li Zefan
@ 2011-02-19  2:28         ` Paul Menage
  0 siblings, 0 replies; 22+ messages in thread
From: Paul Menage @ 2011-02-19  2:28 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, David Rientjes, 缪 勰, linux-mm

On Thu, Feb 17, 2011 at 6:47 PM, Li Zefan <lizf@cn.fujitsu.com> wrote:
>
> I think a defect of this is people might call it twice in one function
> but don't know it returns the same variable?

Hopefully they'd read the comments...

But it's not a big issue either way - having the WARN_ON() statements
in front of each use works OK too, given that there are only a few of
them.

Paul

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist()
  2011-02-17  1:49 [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Li Zefan
                   ` (3 preceding siblings ...)
  2011-02-17 23:35 ` [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Paul Menage
@ 2011-02-20  1:51 ` David Rientjes
  4 siblings, 0 replies; 22+ messages in thread
From: David Rientjes @ 2011-02-20  1:51 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

On Thu, 17 Feb 2011, Li Zefan wrote:

> It's not necessary to copy cpuset->mems_allowed to a buffer
> allocated by NODEMASK_ALLOC(). Just pass it to nodelist_scnprintf().
> 
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch()
  2011-02-17  1:49 ` [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch() Li Zefan
  2011-02-17 23:46   ` Paul Menage
@ 2011-02-20  1:51   ` David Rientjes
  1 sibling, 0 replies; 22+ messages in thread
From: David Rientjes @ 2011-02-20  1:51 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

On Thu, 17 Feb 2011, Li Zefan wrote:

> oldcs->mems_allowed is not modified during cpuset_attch(), so
> we don't have to copy it to a buffer allocated by NODEMASK_ALLOC().
> Just pass it to cpuset_migrate_mm().
> 
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-17  1:50 ` [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC() Li Zefan
  2011-02-17 22:46   ` Andrew Morton
@ 2011-02-20  1:51   ` David Rientjes
  2011-02-21  3:20     ` Li Zefan
  1 sibling, 1 reply; 22+ messages in thread
From: David Rientjes @ 2011-02-20  1:51 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

On Thu, 17 Feb 2011, Li Zefan wrote:

> Those functions that use NODEMASK_ALLOC() can't propogate errno
> to users, but will fail silently.
> 
> Since all of them are called with cgroup_mutex held, here we use
> a global nodemask_t variable.
> 
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

I like the idea and the comment is explicit enough that we don't need any 
refcounting to ensure double usage under cgroup_lock.  I think each 
function should be modified to use cpuset_mems directly, though, instead 
of defining local variables that indirectly access it which only serves to 
make this patch smaller.  Then we can ensure that all occurrences of 
cpuset_mems appear within the lock without being concerned about other 
references.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone()
  2011-02-17  1:50 ` [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone() Li Zefan
  2011-02-17 23:51   ` Paul Menage
@ 2011-02-20  1:51   ` David Rientjes
  1 sibling, 0 replies; 22+ messages in thread
From: David Rientjes @ 2011-02-20  1:51 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

On Thu, 17 Feb 2011, Li Zefan wrote:

> Chaning cpuset->mems/cpuset->cpus should be protected under
> callback_mutex.
> 
> cpuset_clone() doesn't follow this rule. It's ok because it's
> called when creating and initializing a cgroup, but we'd better
> hold the lock to avoid subtil break in the future.
> 
> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-20  1:51   ` David Rientjes
@ 2011-02-21  3:20     ` Li Zefan
  2011-02-21  5:30       ` Li Zefan
  2011-02-22  0:25       ` David Rientjes
  0 siblings, 2 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-21  3:20 UTC (permalink / raw)
  To: David Rientjes; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

David Rientjes wrote:
> On Thu, 17 Feb 2011, Li Zefan wrote:
> 
>> Those functions that use NODEMASK_ALLOC() can't propogate errno
>> to users, but will fail silently.
>>
>> Since all of them are called with cgroup_mutex held, here we use
>> a global nodemask_t variable.
>>
>> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> 
> I like the idea and the comment is explicit enough that we don't need any 
> refcounting to ensure double usage under cgroup_lock.  I think each 
> function should be modified to use cpuset_mems directly, though, instead 
> of defining local variables that indirectly access it which only serves to 
> make this patch smaller.  Then we can ensure that all occurrences of 
> cpuset_mems appear within the lock without being concerned about other 
> references.
> 

Unfortunately, as I looked into the code again I found cpuset_change_nodemask()
is called by other functions that use the global cpuset_mems, so I
think we'd better check the refcnt of cpuset_mems.

How about this:

[PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

Those functions that use NODEMASK_ALLOC() can't propogate errno
to users, so might fail silently.

Based on the fact that all of them are called with cgroup_mutex
held, we fix this by using a global nodemask.

cpuset_change_nodemask() is an exception, because it's called
by other functions. Here we declare a static nodemask in the
function for its own use.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c |   82 +++++++++++++++++++++++++++++++++----------------------
 1 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 70c9ca2..da620d2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -79,6 +79,38 @@ int number_of_cpusets __read_mostly;
 struct cgroup_subsys cpuset_subsys;
 struct cpuset;
 
+static nodemask_t cpuset_mems;
+static int cpuset_mems_ref;
+
+/*
+ * In functions that can't propagate errno to users, to avoid declaring a
+ * nodemask_t variable in stack, and avoid using NODEMASK_ALLOC that can
+ * return -ENOMEM, we use a global cpuset_mems.
+ *
+ * It must be used with cgroup_lock held.
+ */
+static nodemask_t *cpuset_static_nodemask(void)
+{
+	WARN_ON(!cgroup_lock_is_held());
+	WARN_ON(cpuset_mems_ref);
+
+	cpuset_mems_ref++;
+	return &cpuset_mems;
+}
+
+/*
+ * Calling cpuset_static_nodemask() should be paired with this function,
+ * so we insure the global nodemask won't be used by more than one user
+ * at the one time.
+ */
+static void cpuset_release_static_nodemask(void)
+{
+	WARN_ON(!cgroup_lock_is_held());
+
+	cpuset_mems_ref--;
+	WARN_ON(!cpuset_mems_ref);
+}
+
 /* See "Frequency meter" comments, below. */
 
 struct fmeter {
@@ -1015,17 +1047,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-	if (!newmems)
-		return;
+	static nodemask_t newmems;	/* protected by cgroup_mutex */
 
 	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, newmems);
-
-	cpuset_change_task_nodemask(p, newmems);
+	guarantee_online_mems(cs, &newmems);
 
-	NODEMASK_FREE(newmems);
+	cpuset_change_task_nodemask(p, &newmems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1096,13 +1123,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
 static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 			   const char *buf)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
+	nodemask_t *oldmem = cpuset_static_nodemask();
 	int retval;
 	struct ptr_heap heap;
 
-	if (!oldmem)
-		return -ENOMEM;
-
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
 	 * it's read-only
@@ -1152,7 +1176,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	heap_free(&heap);
 done:
-	NODEMASK_FREE(oldmem);
+	cpuset_release_static_nodemask();
 	return retval;
 }
 
@@ -1438,10 +1462,7 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-	if (to == NULL)
-		goto alloc_fail;
+	nodemask_t *to = cpuset_static_nodemask();
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
@@ -1461,18 +1482,17 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 		rcu_read_unlock();
 	}
 
+	cpuset_release_static_nodemask();
+
 	/* change mm; only needs to be done once even if threadgroup */
-	*to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, to);
+		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
+			cpuset_migrate_mm(mm, &oldcs->mems_allowed,
+					  &cs->mems_allowed);
 		mmput(mm);
 	}
-
-alloc_fail:
-	NODEMASK_FREE(to);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2051,10 +2071,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct cgroup *cont;
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return;
+	nodemask_t *oldmems = cpuset_static_nodemask();
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2090,7 +2107,8 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			update_tasks_nodemask(cp, oldmems, NULL);
 		}
 	}
-	NODEMASK_FREE(oldmems);
+
+	cpuset_release_static_nodemask();
 }
 
 /*
@@ -2132,19 +2150,18 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return NOTIFY_DONE;
+	nodemask_t *oldmems;
 
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
+		oldmems = cpuset_static_nodemask();
 		*oldmems = top_cpuset.mems_allowed;
 		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 		mutex_unlock(&callback_mutex);
 		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+		cpuset_release_static_nodemask();
 		break;
 	case MEM_OFFLINE:
 		/*
@@ -2158,7 +2175,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	}
 	cgroup_unlock();
 
-	NODEMASK_FREE(oldmems);
 	return NOTIFY_OK;
 }
 #endif
-- 
1.7.3.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-21  3:20     ` Li Zefan
@ 2011-02-21  5:30       ` Li Zefan
  2011-02-22  0:25       ` David Rientjes
  1 sibling, 0 replies; 22+ messages in thread
From: Li Zefan @ 2011-02-21  5:30 UTC (permalink / raw)
  To: David Rientjes; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

> +/*
> + * Calling cpuset_static_nodemask() should be paired with this function,
> + * so we insure the global nodemask won't be used by more than one user
> + * at the one time.
> + */
> +static void cpuset_release_static_nodemask(void)
> +{
> +	WARN_ON(!cgroup_lock_is_held());
> +
> +	cpuset_mems_ref--;
> +	WARN_ON(!cpuset_mems_ref);

WARN_ON(cpuset_mems_ref);

> +}

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-21  3:20     ` Li Zefan
  2011-02-21  5:30       ` Li Zefan
@ 2011-02-22  0:25       ` David Rientjes
  2011-02-22  2:15         ` Li Zefan
  1 sibling, 1 reply; 22+ messages in thread
From: David Rientjes @ 2011-02-22  0:25 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

On Mon, 21 Feb 2011, Li Zefan wrote:

> Unfortunately, as I looked into the code again I found cpuset_change_nodemask()
> is called by other functions that use the global cpuset_mems, so I
> think we'd better check the refcnt of cpuset_mems.
> 
> How about this:
> 
> [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
> 
> Those functions that use NODEMASK_ALLOC() can't propogate errno
> to users, so might fail silently.
> 
> Based on the fact that all of them are called with cgroup_mutex
> held, we fix this by using a global nodemask.
> 

If all of the functions that require a nodemask are protected by 
cgroup_mutex, then I think it would be much better to just statically 
allocate them within the function and avoid any nodemask in file scope.  
cpuset_mems cannot be shared so introducing it with a refcount would 
probably just be confusing.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-22  0:25       ` David Rientjes
@ 2011-02-22  2:15         ` Li Zefan
  2011-02-22 20:30           ` David Rientjes
  0 siblings, 1 reply; 22+ messages in thread
From: Li Zefan @ 2011-02-22  2:15 UTC (permalink / raw)
  To: David Rientjes; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

David Rientjes wrote:
> On Mon, 21 Feb 2011, Li Zefan wrote:
> 
>> Unfortunately, as I looked into the code again I found cpuset_change_nodemask()
>> is called by other functions that use the global cpuset_mems, so I
>> think we'd better check the refcnt of cpuset_mems.
>>
>> How about this:
>>
>> [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
>>
>> Those functions that use NODEMASK_ALLOC() can't propogate errno
>> to users, so might fail silently.
>>
>> Based on the fact that all of them are called with cgroup_mutex
>> held, we fix this by using a global nodemask.
>>
> 
> If all of the functions that require a nodemask are protected by 
> cgroup_mutex, then I think it would be much better to just statically 
> allocate them within the function and avoid any nodemask in file scope.  
> cpuset_mems cannot be shared so introducing it with a refcount would 
> probably just be confusing.
> 

I'll repost the patchset after you ack this patch.

[PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()

Those functions that use NODEMASK_ALLOC() can't propogate errno
to users, so might fail silently.

Fix it by using one static nodemask_t variable for each function, and
those variables are protected by cgroup_mutex.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cpuset.c |   50 ++++++++++++++++----------------------------------
 1 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 8fef8c6..073ce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
-	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
-	if (!newmems)
-		return;
+	static nodemask_t newmems;	/* protected by cgroup_mutex */
 
 	cs = cgroup_cs(scan->cg);
-	guarantee_online_mems(cs, newmems);
-
-	cpuset_change_task_nodemask(p, newmems);
+	guarantee_online_mems(cs, &newmems);
 
-	NODEMASK_FREE(newmems);
+	cpuset_change_task_nodemask(p, &newmems);
 
 	mm = get_task_mm(p);
 	if (!mm)
@@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
-	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
-
-	if (to == NULL)
-		goto alloc_fail;
+	static nodemask_t to;		/* protected by cgroup_mutex */
 
 	if (cs == &top_cpuset) {
 		cpumask_copy(cpus_attach, cpu_possible_mask);
 	} else {
 		guarantee_online_cpus(cs, cpus_attach);
 	}
-	guarantee_online_mems(cs, to);
+	guarantee_online_mems(cs, &to);
 
 	/* do per-task migration stuff possibly for each in the threadgroup */
-	cpuset_attach_task(tsk, to, cs);
+	cpuset_attach_task(tsk, &to, cs);
 	if (threadgroup) {
 		struct task_struct *c;
 		rcu_read_lock();
 		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
-			cpuset_attach_task(c, to, cs);
+			cpuset_attach_task(c, &to, cs);
 		}
 		rcu_read_unlock();
 	}
 
 	/* change mm; only needs to be done once even if threadgroup */
-	*to = cs->mems_allowed;
+	to = cs->mems_allowed;
 	mm = get_task_mm(tsk);
 	if (mm) {
-		mpol_rebind_mm(mm, to);
+		mpol_rebind_mm(mm, &to);
 		if (is_memory_migrate(cs))
-			cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
+			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
 		mmput(mm);
 	}
-
-alloc_fail:
-	NODEMASK_FREE(to);
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2051,10 +2040,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
 	struct cgroup *cont;
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
@@ -2071,7 +2057,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
-		*oldmems = cp->mems_allowed;
+		oldmems = cp->mems_allowed;
 
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
@@ -2087,10 +2073,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			remove_tasks_in_empty_cpuset(cp);
 		else {
 			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, oldmems, NULL);
+			update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
-	NODEMASK_FREE(oldmems);
 }
 
 /*
@@ -2132,19 +2117,16 @@ void cpuset_update_active_cpus(void)
 static int cpuset_track_online_nodes(struct notifier_block *self,
 				unsigned long action, void *arg)
 {
-	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
-	if (oldmems == NULL)
-		return NOTIFY_DONE;
+	static nodemask_t oldmems;	/* protected by cgroup_mutex */
 
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
-		*oldmems = top_cpuset.mems_allowed;
+		oldmems = top_cpuset.mems_allowed;
 		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 		mutex_unlock(&callback_mutex);
-		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
 		break;
 	case MEM_OFFLINE:
 		/*
-- 
1.7.3.1

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
  2011-02-22  2:15         ` Li Zefan
@ 2011-02-22 20:30           ` David Rientjes
  0 siblings, 0 replies; 22+ messages in thread
From: David Rientjes @ 2011-02-22 20:30 UTC (permalink / raw)
  To: Li Zefan; +Cc: Andrew Morton, LKML, Paul Menage, miaox, linux-mm

On Tue, 22 Feb 2011, Li Zefan wrote:

> [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC()
> 
> Those functions that use NODEMASK_ALLOC() can't propogate errno
> to users, so might fail silently.
> 
> Fix it by using one static nodemask_t variable for each function, and
> those variables are protected by cgroup_mutex.
> 

I think there would also be incentive to do the same thing for 
update_nodemask() even though its caller can handle -ENOMEM.  Imagine 
current being out of memory and the NODEMASK_ALLOC() subsequently failing 
because it is oom yet it may be trying to give itself more memory.  It's 
also protected by cgroup_mutex so the only thing we're sacrificing is 8 
bytes on the defconfig and 256 bytes even with CONFIG_NODES_SHIFT == 10.  
On machines that large, this seems like an acceptable cost to ensure we 
can give ourselves more memory :)

> Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
> ---
>  kernel/cpuset.c |   50 ++++++++++++++++----------------------------------
>  1 files changed, 16 insertions(+), 34 deletions(-)
> 
> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
> index 8fef8c6..073ce91 100644
> --- a/kernel/cpuset.c
> +++ b/kernel/cpuset.c
> @@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
>  	struct cpuset *cs;
>  	int migrate;
>  	const nodemask_t *oldmem = scan->data;
> -	NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
> -
> -	if (!newmems)
> -		return;
> +	static nodemask_t newmems;	/* protected by cgroup_mutex */
>  
>  	cs = cgroup_cs(scan->cg);
> -	guarantee_online_mems(cs, newmems);
> -
> -	cpuset_change_task_nodemask(p, newmems);
> +	guarantee_online_mems(cs, &newmems);

The newmems nodemask is going to be persistant across calls since it is 
static, so we have to be careful that nothing depends on it being 
NODE_MASK_NONE.  Indeed, NODEMASK_ALLOC() with just GFP_KERNEL doesn't 
guarantee anything different.  guarantee_online_mems() sets the nodemask, 
so this looks good.

>  
> -	NODEMASK_FREE(newmems);
> +	cpuset_change_task_nodemask(p, &newmems);
>  
>  	mm = get_task_mm(p);
>  	if (!mm)
> @@ -1438,41 +1433,35 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
>  	struct mm_struct *mm;
>  	struct cpuset *cs = cgroup_cs(cont);
>  	struct cpuset *oldcs = cgroup_cs(oldcont);
> -	NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
> -
> -	if (to == NULL)
> -		goto alloc_fail;
> +	static nodemask_t to;		/* protected by cgroup_mutex */
>  
>  	if (cs == &top_cpuset) {
>  		cpumask_copy(cpus_attach, cpu_possible_mask);
>  	} else {
>  		guarantee_online_cpus(cs, cpus_attach);
>  	}
> -	guarantee_online_mems(cs, to);
> +	guarantee_online_mems(cs, &to);
>  
>  	/* do per-task migration stuff possibly for each in the threadgroup */
> -	cpuset_attach_task(tsk, to, cs);
> +	cpuset_attach_task(tsk, &to, cs);
>  	if (threadgroup) {
>  		struct task_struct *c;
>  		rcu_read_lock();
>  		list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
> -			cpuset_attach_task(c, to, cs);
> +			cpuset_attach_task(c, &to, cs);
>  		}
>  		rcu_read_unlock();
>  	}
>  
>  	/* change mm; only needs to be done once even if threadgroup */
> -	*to = cs->mems_allowed;
> +	to = cs->mems_allowed;
>  	mm = get_task_mm(tsk);
>  	if (mm) {
> -		mpol_rebind_mm(mm, to);
> +		mpol_rebind_mm(mm, &to);
>  		if (is_memory_migrate(cs))
> -			cpuset_migrate_mm(mm, &oldcs->mems_allowed, to);
> +			cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to);
>  		mmput(mm);
>  	}
> -
> -alloc_fail:
> -	NODEMASK_FREE(to);
>  }
>  
>  /* The various types of files and directories in a cpuset file system */
> @@ -2051,10 +2040,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
>  	struct cpuset *cp;	/* scans cpusets being updated */
>  	struct cpuset *child;	/* scans child cpusets of cp */
>  	struct cgroup *cont;
> -	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
> -
> -	if (oldmems == NULL)
> -		return;
> +	static nodemask_t oldmems;	/* protected by cgroup_mutex */
>  
>  	list_add_tail((struct list_head *)&root->stack_list, &queue);
>  
> @@ -2071,7 +2057,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
>  		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
>  			continue;
>  
> -		*oldmems = cp->mems_allowed;
> +		oldmems = cp->mems_allowed;
>  
>  		/* Remove offline cpus and mems from this cpuset. */
>  		mutex_lock(&callback_mutex);
> @@ -2087,10 +2073,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
>  			remove_tasks_in_empty_cpuset(cp);
>  		else {
>  			update_tasks_cpumask(cp, NULL);
> -			update_tasks_nodemask(cp, oldmems, NULL);
> +			update_tasks_nodemask(cp, &oldmems, NULL);
>  		}
>  	}
> -	NODEMASK_FREE(oldmems);
>  }
>  
>  /*
> @@ -2132,19 +2117,16 @@ void cpuset_update_active_cpus(void)
>  static int cpuset_track_online_nodes(struct notifier_block *self,
>  				unsigned long action, void *arg)
>  {
> -	NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
> -
> -	if (oldmems == NULL)
> -		return NOTIFY_DONE;
> +	static nodemask_t oldmems;	/* protected by cgroup_mutex */
>  
>  	cgroup_lock();
>  	switch (action) {
>  	case MEM_ONLINE:
> -		*oldmems = top_cpuset.mems_allowed;
> +		oldmems = top_cpuset.mems_allowed;
>  		mutex_lock(&callback_mutex);
>  		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
>  		mutex_unlock(&callback_mutex);
> -		update_tasks_nodemask(&top_cpuset, oldmems, NULL);
> +		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
>  		break;
>  	case MEM_OFFLINE:
>  		/*

The NODEMASK_FREE() wasn't removed from cpuset_track_online_nodes().  
After that's fixed:

Acked-by: David Rientjes <rientjes@google.com>

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2011-02-22 20:30 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-02-17  1:49 [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Li Zefan
2011-02-17  1:49 ` [PATCH 2/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_attch() Li Zefan
2011-02-17 23:46   ` Paul Menage
2011-02-18  2:37     ` Li Zefan
2011-02-20  1:51   ` David Rientjes
2011-02-17  1:50 ` [PATCH 3/4] cpuset: Fix unchecked calls to NODEMASK_ALLOC() Li Zefan
2011-02-17 22:46   ` Andrew Morton
2011-02-17 23:50     ` Paul Menage
2011-02-18  2:47       ` Li Zefan
2011-02-19  2:28         ` Paul Menage
2011-02-20  1:51   ` David Rientjes
2011-02-21  3:20     ` Li Zefan
2011-02-21  5:30       ` Li Zefan
2011-02-22  0:25       ` David Rientjes
2011-02-22  2:15         ` Li Zefan
2011-02-22 20:30           ` David Rientjes
2011-02-17  1:50 ` [PATCH 4/4] cpuset: Hold callback_mutex in cpuset_clone() Li Zefan
2011-02-17 23:51   ` Paul Menage
2011-02-20  1:51   ` David Rientjes
2011-02-17 23:35 ` [PATCH 1/4] cpuset: Remove unneeded NODEMASK_ALLOC() in cpuset_sprintf_memlist() Paul Menage
2011-02-18  2:22   ` Li Zefan
2011-02-20  1:51 ` David Rientjes

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).